1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * An implementation of the IPoIB standard based on PSARC 2001/289. 29 */ 30 31 #include <sys/types.h> 32 #include <sys/conf.h> 33 #include <sys/ddi.h> 34 #include <sys/sunddi.h> 35 #include <sys/modctl.h> 36 #include <sys/stropts.h> 37 #include <sys/stream.h> 38 #include <sys/strsun.h> 39 #include <sys/strsubr.h> 40 #include <sys/dlpi.h> 41 #include <sys/mac_provider.h> 42 43 #include <sys/pattr.h> /* for HCK_FULLCKSUM */ 44 #include <sys/sysmacros.h> /* for offsetof */ 45 #include <sys/disp.h> /* for async thread pri */ 46 #include <sys/atomic.h> /* for atomic_add*() */ 47 #include <sys/ethernet.h> /* for ETHERTYPE_IPV6 */ 48 #include <netinet/in.h> /* for netinet/ip.h below */ 49 #include <netinet/ip.h> /* for struct ip */ 50 #include <netinet/udp.h> /* for struct udphdr */ 51 #include <inet/common.h> /* for inet/ip.h below */ 52 #include <inet/ip.h> /* for ipha_t */ 53 #include <inet/ip6.h> /* for ip6_t */ 54 #include <inet/tcp.h> /* for tcph_t */ 55 #include <netinet/icmp6.h> /* for icmp6_t */ 56 #include <sys/callb.h> 57 #include <sys/modhash.h> 58 59 #include <sys/ib/clients/ibd/ibd.h> 60 #include <sys/ib/mgt/sm_attr.h> /* for SM_INIT_TYPE_* */ 61 #include <sys/note.h> 62 #include <sys/multidata.h> 63 64 #include <sys/ib/mgt/ibmf/ibmf.h> /* for ibd_get_portspeed */ 65 66 /* 67 * Per-interface tunables (for developers) 68 * 69 * ibd_tx_copy_thresh 70 * This sets the threshold at which ibd will attempt to do a bcopy of the 71 * outgoing data into a pre-mapped buffer. The IPoIB driver's send behavior 72 * is restricted by various parameters, so setting of this value must be 73 * made after careful considerations only. For instance, IB HCAs currently 74 * impose a relatively small limit (when compared to ethernet NICs) on the 75 * length of the SGL for transmit. On the other hand, the ip stack could 76 * send down mp chains that are quite long when LSO is enabled. 77 * 78 * ibd_num_swqe 79 * Number of "send WQE" elements that will be allocated and used by ibd. 80 * When tuning this parameter, the size of pre-allocated, pre-mapped copy 81 * buffer in each of these send wqes must be taken into account. This 82 * copy buffer size is determined by the value of IBD_TX_BUF_SZ (this is 83 * currently set to the same value of ibd_tx_copy_thresh, but may be 84 * changed independently if needed). 85 * 86 * ibd_num_rwqe 87 * Number of "receive WQE" elements that will be allocated and used by 88 * ibd. This parameter is limited by the maximum channel size of the HCA. 89 * Each buffer in the receive wqe will be of MTU size. 
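 *
 *	For a rough sense of scale: with the default of 4000 RWQEs and a
 *	typical 2048-byte IPoIB UD MTU (plus the 40-byte GRH), the
 *	pre-allocated receive copy buffers come to roughly 8 MB per
 *	interface. Since these tunables are plain module globals, a
 *	developer can override a default from /etc/system and reboot,
 *	e.g. (illustrative value only, not a recommendation):
 *
 *		set ibd:ibd_num_rwqe = 2000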
90 * 91 * ibd_num_lso_bufs 92 * Number of "larger-than-MTU" copy buffers to use for cases when the 93 * outgoing mblk chain is too fragmented to be used with ibt_map_mem_iov() 94 * and too large to be used with regular MTU-sized copy buffers. It is 95 * not recommended to tune this variable without understanding the 96 * application environment and/or memory resources. The size of each of 97 * these lso buffers is determined by the value of IBD_LSO_BUFSZ. 98 * 99 * ibd_num_ah 100 * Number of AH cache entries to allocate 101 * 102 * ibd_hash_size 103 * Hash table size for the active AH list 104 * 105 * ibd_tx_softintr 106 * ibd_rx_softintr 107 * The softintr mechanism allows ibd to avoid event queue overflows if 108 * the receive/completion handlers are to be expensive. These are enabled 109 * by default. 110 * 111 * ibd_log_sz 112 * This specifies the size of the ibd log buffer in bytes. The buffer is 113 * allocated and logging is enabled only when IBD_LOGGING is defined. 114 * 115 */ 116 uint_t ibd_tx_copy_thresh = 0x1000; 117 uint_t ibd_num_swqe = 4000; 118 uint_t ibd_num_rwqe = 4000; 119 uint_t ibd_num_lso_bufs = 0x400; 120 uint_t ibd_num_ah = 256; 121 uint_t ibd_hash_size = 32; 122 uint_t ibd_rx_softintr = 1; 123 uint_t ibd_tx_softintr = 1; 124 uint_t ibd_create_broadcast_group = 1; 125 #ifdef IBD_LOGGING 126 uint_t ibd_log_sz = 0x20000; 127 #endif 128 129 #define IBD_TX_COPY_THRESH ibd_tx_copy_thresh 130 #define IBD_TX_BUF_SZ ibd_tx_copy_thresh 131 #define IBD_NUM_SWQE ibd_num_swqe 132 #define IBD_NUM_RWQE ibd_num_rwqe 133 #define IBD_NUM_LSO_BUFS ibd_num_lso_bufs 134 #define IBD_NUM_AH ibd_num_ah 135 #define IBD_HASH_SIZE ibd_hash_size 136 #ifdef IBD_LOGGING 137 #define IBD_LOG_SZ ibd_log_sz 138 #endif 139 140 /* 141 * ibd_rc_tx_copy_thresh 142 * This sets the threshold upto which ibd will attempt to do a bcopy of the 143 * outgoing data into a pre-mapped buffer. 144 */ 145 uint_t ibd_rc_tx_copy_thresh = 0x1000; 146 147 /* 148 * Receive CQ moderation parameters: tunable (for developers) 149 */ 150 uint_t ibd_rxcomp_count = 4; 151 uint_t ibd_rxcomp_usec = 10; 152 153 /* 154 * Send CQ moderation parameters: tunable (for developers) 155 */ 156 uint_t ibd_txcomp_count = 16; 157 uint_t ibd_txcomp_usec = 300; 158 159 /* Post IBD_RX_POST_CNT receive work requests at a time. 
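 *
 * That is, completed receives are not replenished one at a time; each
 * rx post queue accumulates freed RWQEs and hands them back to the UD
 * channel in a batch. A minimal sketch of the idea (the channel handle
 * name id_chnl_hdl is an assumption here, and the real code must also
 * deal with partial posts and failures):
 *
 *	ibt_recv_wr_t wrs[IBD_RX_POST_CNT];
 *	uint_t posted;
 *
 *	... fill wrs[] from the queue's free RWQEs ...
 *	(void) ibt_post_recv(state->id_chnl_hdl, wrs, IBD_RX_POST_CNT,
 *	    &posted);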
*/ 160 #define IBD_RX_POST_CNT 8 161 162 /* Hash into 1 << IBD_LOG_RX_POST number of rx post queues */ 163 #define IBD_LOG_RX_POST 4 164 165 /* Minimum number of receive work requests driver needs to always have */ 166 #define IBD_RWQE_MIN ((IBD_RX_POST_CNT << IBD_LOG_RX_POST) * 4) 167 168 /* 169 * LSO parameters 170 */ 171 #define IBD_LSO_MAXLEN 65536 172 #define IBD_LSO_BUFSZ 8192 173 #define IBD_PROP_LSO_POLICY "lso-policy" 174 175 /* 176 * Async operation states 177 */ 178 #define IBD_OP_NOTSTARTED 0 179 #define IBD_OP_ONGOING 1 180 #define IBD_OP_COMPLETED 2 181 #define IBD_OP_ERRORED 3 182 #define IBD_OP_ROUTERED 4 183 184 /* 185 * State of IBD driver initialization during attach/m_start 186 */ 187 #define IBD_DRV_STATE_INITIALIZED 0x00001 188 #define IBD_DRV_RXINTR_ADDED 0x00002 189 #define IBD_DRV_TXINTR_ADDED 0x00004 190 #define IBD_DRV_IBTL_ATTACH_DONE 0x00008 191 #define IBD_DRV_HCA_OPENED 0x00010 192 #define IBD_DRV_PD_ALLOCD 0x00020 193 #define IBD_DRV_MAC_REGISTERED 0x00040 194 #define IBD_DRV_PORT_DETAILS_OBTAINED 0x00080 195 #define IBD_DRV_BCAST_GROUP_FOUND 0x00100 196 #define IBD_DRV_ACACHE_INITIALIZED 0x00200 197 #define IBD_DRV_CQS_ALLOCD 0x00400 198 #define IBD_DRV_UD_CHANNEL_SETUP 0x00800 199 #define IBD_DRV_TXLIST_ALLOCD 0x01000 200 #define IBD_DRV_SCQ_NOTIFY_ENABLED 0x02000 201 #define IBD_DRV_RXLIST_ALLOCD 0x04000 202 #define IBD_DRV_BCAST_GROUP_JOINED 0x08000 203 #define IBD_DRV_ASYNC_THR_CREATED 0x10000 204 #define IBD_DRV_RCQ_NOTIFY_ENABLED 0x20000 205 #define IBD_DRV_SM_NOTICES_REGISTERED 0x40000 206 #define IBD_DRV_STARTED 0x80000 207 #define IBD_DRV_RC_SRQ_ALLOCD 0x100000 208 #define IBD_DRV_RC_LARGEBUF_ALLOCD 0x200000 209 #define IBD_DRV_RC_LISTEN 0x400000 210 #ifdef DEBUG 211 #define IBD_DRV_RC_PRIVATE_STATE 0x800000 212 #endif 213 214 /* 215 * Start/stop in-progress flags; note that restart must always remain 216 * the OR of start and stop flag values. 
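 *
 * With the values below, 0x10000000 | 0x20000000 == 0x30000000, so a
 * single mask test covers a start, a stop or a restart in flight; a
 * hypothetical caller that must not race with any of them would check:
 *
 *	if (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS)
 *		return (EAGAIN);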
217 */ 218 #define IBD_DRV_START_IN_PROGRESS 0x10000000 219 #define IBD_DRV_STOP_IN_PROGRESS 0x20000000 220 #define IBD_DRV_RESTART_IN_PROGRESS 0x30000000 221 222 /* 223 * Miscellaneous constants 224 */ 225 #define IB_MGID_IPV4_LOWGRP_MASK 0xFFFFFFFF 226 #define IBD_DEF_MAX_SDU 2044 227 #define IBD_DEFAULT_QKEY 0xB1B 228 #ifdef IBD_LOGGING 229 #define IBD_DMAX_LINE 100 230 #endif 231 232 /* 233 * Enumerations for link states 234 */ 235 typedef enum { 236 IBD_LINK_DOWN, 237 IBD_LINK_UP, 238 IBD_LINK_UP_ABSENT 239 } ibd_link_op_t; 240 241 /* 242 * Driver State Pointer 243 */ 244 void *ibd_list; 245 246 /* 247 * Logging 248 */ 249 #ifdef IBD_LOGGING 250 kmutex_t ibd_lbuf_lock; 251 uint8_t *ibd_lbuf; 252 uint32_t ibd_lbuf_ndx; 253 #endif 254 255 /* 256 * Required system entry points 257 */ 258 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); 259 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); 260 261 /* 262 * Required driver entry points for GLDv3 263 */ 264 static int ibd_m_stat(void *, uint_t, uint64_t *); 265 static int ibd_m_start(void *); 266 static void ibd_m_stop(void *); 267 static int ibd_m_promisc(void *, boolean_t); 268 static int ibd_m_multicst(void *, boolean_t, const uint8_t *); 269 static int ibd_m_unicst(void *, const uint8_t *); 270 static mblk_t *ibd_m_tx(void *, mblk_t *); 271 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *); 272 273 /* 274 * Private driver entry points for GLDv3 275 */ 276 277 /* 278 * Initialization 279 */ 280 static int ibd_state_init(ibd_state_t *, dev_info_t *); 281 static int ibd_init_txlist(ibd_state_t *); 282 static int ibd_init_rxlist(ibd_state_t *); 283 static int ibd_acache_init(ibd_state_t *); 284 #ifdef IBD_LOGGING 285 static void ibd_log_init(void); 286 #endif 287 288 /* 289 * Termination/cleanup 290 */ 291 static void ibd_state_fini(ibd_state_t *); 292 static void ibd_fini_txlist(ibd_state_t *); 293 static void ibd_fini_rxlist(ibd_state_t *); 294 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *); 295 static void ibd_tx_cleanup_list(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *); 296 static void ibd_acache_fini(ibd_state_t *); 297 #ifdef IBD_LOGGING 298 static void ibd_log_fini(void); 299 #endif 300 301 /* 302 * Allocation/acquire/map routines 303 */ 304 static int ibd_alloc_tx_copybufs(ibd_state_t *); 305 static int ibd_alloc_rx_copybufs(ibd_state_t *); 306 static int ibd_alloc_tx_lsobufs(ibd_state_t *); 307 static ibd_swqe_t *ibd_acquire_swqe(ibd_state_t *); 308 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *, 309 uint32_t *); 310 311 /* 312 * Free/release/unmap routines 313 */ 314 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *); 315 static void ibd_free_tx_copybufs(ibd_state_t *); 316 static void ibd_free_rx_copybufs(ibd_state_t *); 317 static void ibd_free_rx_rsrcs(ibd_state_t *); 318 static void ibd_free_tx_lsobufs(ibd_state_t *); 319 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *, int); 320 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t); 321 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *); 322 323 /* 324 * Handlers/callback routines 325 */ 326 static uint_t ibd_intr(caddr_t); 327 static uint_t ibd_tx_recycle(caddr_t); 328 static void ibd_rcq_handler(ibt_cq_hdl_t, void *); 329 static void ibd_scq_handler(ibt_cq_hdl_t, void *); 330 static void ibd_poll_rcq(ibd_state_t *, ibt_cq_hdl_t); 331 static void ibd_poll_scq(ibd_state_t *, ibt_cq_hdl_t); 332 static void ibd_drain_rcq(ibd_state_t *, ibt_cq_hdl_t); 333 static void 
ibd_drain_scq(ibd_state_t *, ibt_cq_hdl_t); 334 static void ibd_freemsg_cb(char *); 335 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t, 336 ibt_async_event_t *); 337 static void ibd_snet_notices_handler(void *, ib_gid_t, 338 ibt_subnet_event_code_t, ibt_subnet_event_t *); 339 340 /* 341 * Send/receive routines 342 */ 343 static boolean_t ibd_send(ibd_state_t *, mblk_t *); 344 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *); 345 static void ibd_post_recv(ibd_state_t *, ibd_rwqe_t *); 346 static mblk_t *ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *); 347 348 /* 349 * Threads 350 */ 351 static void ibd_async_work(ibd_state_t *); 352 353 /* 354 * Async tasks 355 */ 356 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *); 357 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int); 358 static void ibd_async_setprom(ibd_state_t *); 359 static void ibd_async_unsetprom(ibd_state_t *); 360 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t); 361 static void ibd_async_trap(ibd_state_t *, ibd_req_t *); 362 static void ibd_async_txsched(ibd_state_t *); 363 static void ibd_async_link(ibd_state_t *, ibd_req_t *); 364 365 /* 366 * Async task helpers 367 */ 368 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *); 369 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t); 370 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *); 371 static boolean_t ibd_get_allroutergroup(ibd_state_t *, 372 ipoib_mac_t *, ipoib_mac_t *); 373 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t); 374 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *); 375 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *); 376 static ibt_status_t ibd_find_bgroup(ibd_state_t *); 377 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *); 378 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t); 379 static uint64_t ibd_get_portspeed(ibd_state_t *); 380 static boolean_t ibd_async_safe(ibd_state_t *); 381 static void ibd_async_done(ibd_state_t *); 382 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int); 383 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *); 384 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t); 385 static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *); 386 387 /* 388 * Helpers for attach/start routines 389 */ 390 static int ibd_register_mac(ibd_state_t *, dev_info_t *); 391 static int ibd_record_capab(ibd_state_t *, dev_info_t *); 392 static int ibd_unattach(ibd_state_t *, dev_info_t *); 393 static int ibd_get_port_details(ibd_state_t *); 394 static int ibd_alloc_cqs(ibd_state_t *); 395 static int ibd_setup_ud_channel(ibd_state_t *); 396 static int ibd_start(ibd_state_t *); 397 static int ibd_undo_start(ibd_state_t *, link_state_t); 398 static void ibd_set_mac_progress(ibd_state_t *, uint_t); 399 static void ibd_clr_mac_progress(ibd_state_t *, uint_t); 400 401 402 /* 403 * Miscellaneous helpers 404 */ 405 static int ibd_sched_poll(ibd_state_t *, int, int); 406 static void ibd_resume_transmission(ibd_state_t *); 407 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t); 408 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t); 409 static void *list_get_head(list_t *); 410 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t); 411 static uint_t ibd_hash_by_id(void *, mod_hash_key_t); 412 #ifdef IBD_LOGGING 413 static void ibd_log(const char *, 
...); 414 #endif 415 416 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach, 417 nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed); 418 419 /* Module Driver Info */ 420 static struct modldrv ibd_modldrv = { 421 &mod_driverops, /* This one is a driver */ 422 "InfiniBand GLDv3 Driver", /* short description */ 423 &ibd_dev_ops /* driver specific ops */ 424 }; 425 426 /* Module Linkage */ 427 static struct modlinkage ibd_modlinkage = { 428 MODREV_1, (void *)&ibd_modldrv, NULL 429 }; 430 431 /* 432 * Module (static) info passed to IBTL during ibt_attach 433 */ 434 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = { 435 IBTI_V_CURR, 436 IBT_NETWORK, 437 ibd_async_handler, 438 NULL, 439 "IPIB" 440 }; 441 442 /* 443 * GLDv3 entry points 444 */ 445 #define IBD_M_CALLBACK_FLAGS (MC_GETCAPAB) 446 static mac_callbacks_t ibd_m_callbacks = { 447 IBD_M_CALLBACK_FLAGS, 448 ibd_m_stat, 449 ibd_m_start, 450 ibd_m_stop, 451 ibd_m_promisc, 452 ibd_m_multicst, 453 ibd_m_unicst, 454 ibd_m_tx, 455 NULL, 456 ibd_m_getcapab 457 }; 458 459 /* 460 * Fill/clear <scope> and <p_key> in multicast/broadcast address 461 */ 462 #define IBD_FILL_SCOPE_PKEY(maddr, scope, pkey) \ 463 { \ 464 *(uint32_t *)((char *)(maddr) + 4) |= \ 465 htonl((uint32_t)(scope) << 16); \ 466 *(uint32_t *)((char *)(maddr) + 8) |= \ 467 htonl((uint32_t)(pkey) << 16); \ 468 } 469 470 #define IBD_CLEAR_SCOPE_PKEY(maddr) \ 471 { \ 472 *(uint32_t *)((char *)(maddr) + 4) &= \ 473 htonl(~((uint32_t)0xF << 16)); \ 474 *(uint32_t *)((char *)(maddr) + 8) &= \ 475 htonl(~((uint32_t)0xFFFF << 16)); \ 476 } 477 478 /* 479 * Rudimentary debugging support 480 */ 481 #ifdef DEBUG 482 int ibd_debuglevel = 100; 483 void 484 debug_print(int l, char *fmt, ...) 485 { 486 va_list ap; 487 488 if (l < ibd_debuglevel) 489 return; 490 va_start(ap, fmt); 491 vcmn_err(CE_CONT, fmt, ap); 492 va_end(ap); 493 } 494 #endif 495 496 /* 497 * Common routine to print warning messages; adds in hca guid, port number 498 * and pkey to be able to identify the IBA interface. 499 */ 500 void 501 ibd_print_warn(ibd_state_t *state, char *fmt, ...) 502 { 503 ib_guid_t hca_guid; 504 char ibd_print_buf[256]; 505 int len; 506 va_list ap; 507 508 hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip, 509 0, "hca-guid", 0); 510 len = snprintf(ibd_print_buf, sizeof (ibd_print_buf), 511 "%s%d: HCA GUID %016llx port %d PKEY %02x ", 512 ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip), 513 (u_longlong_t)hca_guid, state->id_port, state->id_pkey); 514 va_start(ap, fmt); 515 (void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len, 516 fmt, ap); 517 cmn_err(CE_NOTE, "!%s", ibd_print_buf); 518 va_end(ap); 519 } 520 521 /* 522 * Warlock directives 523 */ 524 525 /* 526 * id_lso_lock 527 * 528 * state->id_lso->bkt_nfree may be accessed without a lock to 529 * determine the threshold at which we have to ask the nw layer 530 * to resume transmission (see ibd_resume_transmission()). 
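 *
 * In other words the lock-free read is only a cheap hint of the form
 * (sketch; the threshold name is illustrative):
 *
 *	if (state->id_lso->bkt_nfree > lso_resume_threshold)
 *		ibd_resume_transmission(state);
 *
 * and anything that actually hands LSO buffers out or back does so
 * under id_lso_lock.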
531 */ 532 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock, 533 ibd_state_t::id_lso)) 534 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso)) 535 _NOTE(SCHEME_PROTECTS_DATA("init", ibd_state_t::id_lso_policy)) 536 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree)) 537 538 /* 539 * id_scq_poll_lock 540 */ 541 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_scq_poll_lock, 542 ibd_state_t::id_scq_poll_busy)) 543 544 /* 545 * id_txpost_lock 546 */ 547 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 548 ibd_state_t::id_tx_head)) 549 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 550 ibd_state_t::id_tx_busy)) 551 552 /* 553 * id_acache_req_lock 554 */ 555 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 556 ibd_state_t::id_acache_req_cv)) 557 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 558 ibd_state_t::id_req_list)) 559 _NOTE(SCHEME_PROTECTS_DATA("atomic", 560 ibd_acache_s::ac_ref)) 561 562 /* 563 * id_ac_mutex 564 * 565 * This mutex is actually supposed to protect id_ah_op as well, 566 * but this path of the code isn't clean (see update of id_ah_op 567 * in ibd_async_acache(), immediately after the call to 568 * ibd_async_mcache()). For now, we'll skip this check by 569 * declaring that id_ah_op is protected by some internal scheme 570 * that warlock isn't aware of. 571 */ 572 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 573 ibd_state_t::id_ah_active)) 574 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 575 ibd_state_t::id_ah_free)) 576 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 577 ibd_state_t::id_ah_addr)) 578 _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this", 579 ibd_state_t::id_ah_op)) 580 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 581 ibd_state_t::id_ah_error)) 582 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 583 ibd_state_t::id_ac_hot_ace)) 584 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error)) 585 586 /* 587 * id_mc_mutex 588 */ 589 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 590 ibd_state_t::id_mc_full)) 591 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 592 ibd_state_t::id_mc_non)) 593 594 /* 595 * id_trap_lock 596 */ 597 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 598 ibd_state_t::id_trap_cv)) 599 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 600 ibd_state_t::id_trap_stop)) 601 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 602 ibd_state_t::id_trap_inprog)) 603 604 /* 605 * id_prom_op 606 */ 607 _NOTE(SCHEME_PROTECTS_DATA("only by async thread", 608 ibd_state_t::id_prom_op)) 609 610 /* 611 * id_sched_lock 612 */ 613 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock, 614 ibd_state_t::id_sched_needed)) 615 616 /* 617 * id_link_mutex 618 */ 619 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex, 620 ibd_state_t::id_link_state)) 621 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state)) 622 _NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start", 623 ibd_state_t::id_link_speed)) 624 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_sgid)) 625 626 /* 627 * id_tx_list.dl_mutex 628 */ 629 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 630 ibd_state_t::id_tx_list.dl_head)) 631 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 632 ibd_state_t::id_tx_list.dl_pending_sends)) 633 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 634 ibd_state_t::id_tx_list.dl_cnt)) 635 636 /* 637 * id_rx_list.dl_mutex 638 */ 639 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 640 
ibd_state_t::id_rx_list.dl_bufs_outstanding)) 641 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 642 ibd_state_t::id_rx_list.dl_cnt)) 643 644 645 /* 646 * Items protected by atomic updates 647 */ 648 _NOTE(SCHEME_PROTECTS_DATA("atomic update only", 649 ibd_state_s::id_brd_rcv 650 ibd_state_s::id_brd_xmt 651 ibd_state_s::id_multi_rcv 652 ibd_state_s::id_multi_xmt 653 ibd_state_s::id_num_intrs 654 ibd_state_s::id_rcv_bytes 655 ibd_state_s::id_rcv_pkt 656 ibd_state_s::id_rx_post_queue_index 657 ibd_state_s::id_tx_short 658 ibd_state_s::id_xmt_bytes 659 ibd_state_s::id_xmt_pkt 660 ibd_state_s::rc_rcv_trans_byte 661 ibd_state_s::rc_rcv_trans_pkt 662 ibd_state_s::rc_rcv_copy_byte 663 ibd_state_s::rc_rcv_copy_pkt 664 ibd_state_s::rc_xmt_bytes 665 ibd_state_s::rc_xmt_small_pkt 666 ibd_state_s::rc_xmt_fragmented_pkt 667 ibd_state_s::rc_xmt_map_fail_pkt 668 ibd_state_s::rc_xmt_map_succ_pkt)) 669 670 /* 671 * Non-mutex protection schemes for data elements. Almost all of 672 * these are non-shared items. 673 */ 674 _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded", 675 callb_cpr 676 ib_gid_s 677 ib_header_info 678 ibd_acache_rq 679 ibd_acache_s::ac_mce 680 ibd_acache_s::ac_chan 681 ibd_mcache::mc_fullreap 682 ibd_mcache::mc_jstate 683 ibd_mcache::mc_req 684 ibd_rwqe_s 685 ibd_swqe_s 686 ibd_wqe_s 687 ibt_wr_ds_s::ds_va 688 ibt_wr_lso_s 689 ipoib_mac::ipoib_qpn 690 mac_capab_lso_s 691 msgb::b_next 692 msgb::b_cont 693 msgb::b_rptr 694 msgb::b_wptr 695 ibd_state_s::id_bgroup_created 696 ibd_state_s::id_mac_state 697 ibd_state_s::id_mtu 698 ibd_state_s::id_num_rwqe 699 ibd_state_s::id_num_swqe 700 ibd_state_s::id_qpnum 701 ibd_state_s::id_rcq_hdl 702 ibd_state_s::id_rx_buf_sz 703 ibd_state_s::id_rx_bufs 704 ibd_state_s::id_rx_mr_hdl 705 ibd_state_s::id_rx_wqes 706 ibd_state_s::id_rxwcs 707 ibd_state_s::id_rxwcs_size 708 ibd_state_s::id_rx_nqueues 709 ibd_state_s::id_rx_queues 710 ibd_state_s::id_scope 711 ibd_state_s::id_scq_hdl 712 ibd_state_s::id_tx_buf_sz 713 ibd_state_s::id_tx_bufs 714 ibd_state_s::id_tx_mr_hdl 715 ibd_state_s::id_tx_rel_list.dl_cnt 716 ibd_state_s::id_tx_wqes 717 ibd_state_s::id_txwcs 718 ibd_state_s::id_txwcs_size 719 ibd_state_s::rc_listen_hdl 720 ibd_state_s::rc_listen_hdl_OFED_interop 721 ibd_state_s::rc_srq_size 722 ibd_state_s::rc_srq_rwqes 723 ibd_state_s::rc_srq_rx_bufs 724 ibd_state_s::rc_srq_rx_mr_hdl 725 ibd_state_s::rc_tx_largebuf_desc_base 726 ibd_state_s::rc_tx_mr_bufs 727 ibd_state_s::rc_tx_mr_hdl 728 ipha_s 729 icmph_s 730 ibt_path_info_s::pi_sid 731 ibd_rc_chan_s::ace 732 ibd_rc_chan_s::chan_hdl 733 ibd_rc_chan_s::state 734 ibd_rc_chan_s::chan_state 735 ibd_rc_chan_s::is_tx_chan 736 ibd_rc_chan_s::rcq_hdl 737 ibd_rc_chan_s::rcq_size 738 ibd_rc_chan_s::scq_hdl 739 ibd_rc_chan_s::scq_size 740 ibd_rc_chan_s::requester_gid 741 ibd_rc_chan_s::requester_pkey 742 ibd_rc_chan_s::rx_bufs 743 ibd_rc_chan_s::rx_mr_hdl 744 ibd_rc_chan_s::rx_rwqes 745 ibd_rc_chan_s::tx_wqes 746 ibd_rc_chan_s::tx_mr_bufs 747 ibd_rc_chan_s::tx_mr_hdl 748 ibd_rc_chan_s::tx_rel_list.dl_cnt 749 ibd_rc_chan_s::tx_trans_error_cnt 750 ibd_rc_tx_largebuf_s::lb_buf 751 ibd_rc_msg_hello_s 752 ibt_cm_return_args_s)) 753 754 /* 755 * ibd_rc_chan_s::next is protected by two mutexes: 756 * 1) ibd_state_s::rc_pass_chan_list.chan_list_mutex 757 * 2) ibd_state_s::rc_obs_act_chan_list.chan_list_mutex. 
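 *
 * i.e. whichever of the two channel lists an entry currently sits on,
 * its ::next linkage may only be followed or changed while holding
 * that list's own chan_list_mutex, e.g. (sketch; the list head member
 * name is illustrative):
 *
 *	mutex_enter(&state->rc_pass_chan_list.chan_list_mutex);
 *	for (chan = first_chan(&state->rc_pass_chan_list); chan != NULL;
 *	    chan = chan->next)
 *		...
 *	mutex_exit(&state->rc_pass_chan_list.chan_list_mutex);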
758 */ 759 _NOTE(SCHEME_PROTECTS_DATA("protected by two mutexes", 760 ibd_rc_chan_s::next)) 761 762 /* 763 * ibd_state_s.rc_tx_large_bufs_lock 764 */ 765 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock, 766 ibd_state_s::rc_tx_largebuf_free_head)) 767 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock, 768 ibd_state_s::rc_tx_largebuf_nfree)) 769 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock, 770 ibd_rc_tx_largebuf_s::lb_next)) 771 772 /* 773 * ibd_acache_s.tx_too_big_mutex 774 */ 775 _NOTE(MUTEX_PROTECTS_DATA(ibd_acache_s::tx_too_big_mutex, 776 ibd_acache_s::tx_too_big_ongoing)) 777 778 /* 779 * tx_wqe_list.dl_mutex 780 */ 781 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex, 782 ibd_rc_chan_s::tx_wqe_list.dl_head)) 783 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex, 784 ibd_rc_chan_s::tx_wqe_list.dl_pending_sends)) 785 _NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex, 786 ibd_rc_chan_s::tx_wqe_list.dl_cnt)) 787 788 /* 789 * ibd_state_s.rc_ace_recycle_lock 790 */ 791 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_ace_recycle_lock, 792 ibd_state_s::rc_ace_recycle)) 793 794 /* 795 * rc_srq_rwqe_list.dl_mutex 796 */ 797 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 798 ibd_state_t::rc_srq_rwqe_list.dl_bufs_outstanding)) 799 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 800 ibd_state_t::rc_srq_rwqe_list.dl_cnt)) 801 802 /* 803 * Non-mutex protection schemes for data elements. They are counters 804 * for problem diagnosis. Don't need be protected. 805 */ 806 _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis", 807 ibd_state_s::rc_rcv_alloc_fail 808 ibd_state_s::rc_rcq_invoke 809 ibd_state_s::rc_rcq_err 810 ibd_state_s::rc_ace_not_found 811 ibd_state_s::rc_xmt_drop_too_long_pkt 812 ibd_state_s::rc_xmt_icmp_too_long_pkt 813 ibd_state_s::rc_xmt_reenter_too_long_pkt 814 ibd_state_s::rc_swqe_short 815 ibd_state_s::rc_swqe_mac_update 816 ibd_state_s::rc_xmt_buf_short 817 ibd_state_s::rc_xmt_buf_mac_update 818 ibd_state_s::rc_scq_no_swqe 819 ibd_state_s::rc_scq_no_largebuf 820 ibd_state_s::rc_scq_invoke 821 ibd_state_s::rc_conn_succ 822 ibd_state_s::rc_conn_fail 823 ibd_state_s::rc_null_conn 824 ibd_state_s::rc_no_estab_conn 825 ibd_state_s::rc_act_close 826 ibd_state_s::rc_pas_close 827 ibd_state_s::rc_delay_ace_recycle 828 ibd_state_s::rc_act_close_simultaneous 829 ibd_state_s::rc_reset_cnt)) 830 831 #ifdef DEBUG 832 /* 833 * Non-mutex protection schemes for data elements. They are counters 834 * for problem diagnosis. Don't need be protected. 
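 * Like the ibd_state_s counters above, these are bumped with plain
 * increments from whatever context observes the event (e.g. the
 * state->rc_conn_fail++ in the RC connect path); a rare lost update
 * only skews a diagnostic count, so neither locks nor atomics are
 * warranted.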
835 */ 836 _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis", 837 ibd_state_s::rc_rwqe_short 838 ibd_rc_stat_s::rc_rcv_trans_byte 839 ibd_rc_stat_s::rc_rcv_trans_pkt 840 ibd_rc_stat_s::rc_rcv_copy_byte 841 ibd_rc_stat_s::rc_rcv_copy_pkt 842 ibd_rc_stat_s::rc_rcv_alloc_fail 843 ibd_rc_stat_s::rc_rcq_invoke 844 ibd_rc_stat_s::rc_rcq_err 845 ibd_rc_stat_s::rc_scq_invoke 846 ibd_rc_stat_s::rc_rwqe_short 847 ibd_rc_stat_s::rc_xmt_bytes 848 ibd_rc_stat_s::rc_xmt_small_pkt 849 ibd_rc_stat_s::rc_xmt_fragmented_pkt 850 ibd_rc_stat_s::rc_xmt_map_fail_pkt 851 ibd_rc_stat_s::rc_xmt_map_succ_pkt 852 ibd_rc_stat_s::rc_ace_not_found 853 ibd_rc_stat_s::rc_scq_no_swqe 854 ibd_rc_stat_s::rc_scq_no_largebuf 855 ibd_rc_stat_s::rc_swqe_short 856 ibd_rc_stat_s::rc_swqe_mac_update 857 ibd_rc_stat_s::rc_xmt_buf_short 858 ibd_rc_stat_s::rc_xmt_buf_mac_update 859 ibd_rc_stat_s::rc_conn_succ 860 ibd_rc_stat_s::rc_conn_fail 861 ibd_rc_stat_s::rc_null_conn 862 ibd_rc_stat_s::rc_no_estab_conn 863 ibd_rc_stat_s::rc_act_close 864 ibd_rc_stat_s::rc_pas_close 865 ibd_rc_stat_s::rc_delay_ace_recycle 866 ibd_rc_stat_s::rc_act_close_simultaneous 867 ibd_rc_stat_s::rc_reset_cnt)) 868 #endif 869 870 int 871 _init() 872 { 873 int status; 874 875 status = ddi_soft_state_init(&ibd_list, max(sizeof (ibd_state_t), 876 PAGESIZE), 0); 877 if (status != 0) { 878 DPRINT(10, "_init:failed in ddi_soft_state_init()"); 879 return (status); 880 } 881 882 mac_init_ops(&ibd_dev_ops, "ibd"); 883 status = mod_install(&ibd_modlinkage); 884 if (status != 0) { 885 DPRINT(10, "_init:failed in mod_install()"); 886 ddi_soft_state_fini(&ibd_list); 887 mac_fini_ops(&ibd_dev_ops); 888 return (status); 889 } 890 891 #ifdef IBD_LOGGING 892 ibd_log_init(); 893 #endif 894 return (0); 895 } 896 897 int 898 _info(struct modinfo *modinfop) 899 { 900 return (mod_info(&ibd_modlinkage, modinfop)); 901 } 902 903 int 904 _fini() 905 { 906 int status; 907 908 status = mod_remove(&ibd_modlinkage); 909 if (status != 0) 910 return (status); 911 912 mac_fini_ops(&ibd_dev_ops); 913 ddi_soft_state_fini(&ibd_list); 914 #ifdef IBD_LOGGING 915 ibd_log_fini(); 916 #endif 917 return (0); 918 } 919 920 /* 921 * Convert the GID part of the mac address from network byte order 922 * to host order. 923 */ 924 static void 925 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid) 926 { 927 ib_sn_prefix_t nbopref; 928 ib_guid_t nboguid; 929 930 bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t)); 931 bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t)); 932 dgid->gid_prefix = b2h64(nbopref); 933 dgid->gid_guid = b2h64(nboguid); 934 } 935 936 /* 937 * Create the IPoIB address in network byte order from host order inputs. 938 */ 939 static void 940 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix, 941 ib_guid_t guid) 942 { 943 ib_sn_prefix_t nbopref; 944 ib_guid_t nboguid; 945 946 mac->ipoib_qpn = htonl(qpn); 947 nbopref = h2b64(prefix); 948 nboguid = h2b64(guid); 949 bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t)); 950 bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t)); 951 } 952 953 /* 954 * Send to the appropriate all-routers group when the IBA multicast group 955 * does not exist, based on whether the target group is v4 or v6. 
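 *
 * For example, with the IPv4 prefix and link-local scope the redirect
 * target built below is the MGID
 * ff12:401b:<pkey>:0000:0000:0000:0000:0002: the prefix keeps the
 * original IPoIB signature/scope word plus the P_Key, and the 64-bit
 * suffix is simply (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP), i.e.
 * 224.0.0.2 - 224.0.0.0 == 2; the IPv6 (ff1x:601b...) case uses the
 * same suffix.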
956 */ 957 static boolean_t 958 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac, 959 ipoib_mac_t *rmac) 960 { 961 boolean_t retval = B_TRUE; 962 uint32_t adjscope = state->id_scope << 16; 963 uint32_t topword; 964 965 /* 966 * Copy the first 4 bytes in without assuming any alignment of 967 * input mac address; this will have IPoIB signature, flags and 968 * scope bits. 969 */ 970 bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t)); 971 topword = ntohl(topword); 972 973 /* 974 * Generate proper address for IPv4/v6, adding in the Pkey properly. 975 */ 976 if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) || 977 (topword == (IB_MCGID_IPV6_PREFIX | adjscope))) 978 ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) | 979 ((uint32_t)(state->id_pkey << 16))), 980 (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP)); 981 else 982 /* 983 * Does not have proper bits in the mgid address. 984 */ 985 retval = B_FALSE; 986 987 return (retval); 988 } 989 990 /* 991 * Membership states for different mcg's are tracked by two lists: 992 * the "non" list is used for promiscuous mode, when all mcg traffic 993 * needs to be inspected. This type of membership is never used for 994 * transmission, so there can not be an AH in the active list 995 * corresponding to a member in this list. This list does not need 996 * any protection, since all operations are performed by the async 997 * thread. 998 * 999 * "Full" and "SendOnly" membership is tracked using a single list, 1000 * the "full" list. This is because this single list can then be 1001 * searched during transmit to a multicast group (if an AH for the 1002 * mcg is not found in the active list), since at least one type 1003 * of membership must be present before initiating the transmit. 1004 * This list is also emptied during driver detach, since sendonly 1005 * membership acquired during transmit is dropped at detach time 1006 * along with ipv4 broadcast full membership. Insert/deletes to 1007 * this list are done only by the async thread, but it is also 1008 * searched in program context (see multicast disable case), thus 1009 * the id_mc_mutex protects the list. The driver detach path also 1010 * deconstructs the "full" list, but it ensures that the async 1011 * thread will not be accessing the list (by blocking out mcg 1012 * trap handling and making sure no more Tx reaping will happen). 1013 * 1014 * Currently, an IBA attach is done in the SendOnly case too, 1015 * although this is not required. 1016 */ 1017 #define IBD_MCACHE_INSERT_FULL(state, mce) \ 1018 list_insert_head(&state->id_mc_full, mce) 1019 #define IBD_MCACHE_INSERT_NON(state, mce) \ 1020 list_insert_head(&state->id_mc_non, mce) 1021 #define IBD_MCACHE_FIND_FULL(state, mgid) \ 1022 ibd_mcache_find(mgid, &state->id_mc_full) 1023 #define IBD_MCACHE_FIND_NON(state, mgid) \ 1024 ibd_mcache_find(mgid, &state->id_mc_non) 1025 #define IBD_MCACHE_PULLOUT_FULL(state, mce) \ 1026 list_remove(&state->id_mc_full, mce) 1027 #define IBD_MCACHE_PULLOUT_NON(state, mce) \ 1028 list_remove(&state->id_mc_non, mce) 1029 1030 static void * 1031 list_get_head(list_t *list) 1032 { 1033 list_node_t *lhead = list_head(list); 1034 1035 if (lhead != NULL) 1036 list_remove(list, lhead); 1037 return (lhead); 1038 } 1039 1040 /* 1041 * This is always guaranteed to be able to queue the work. 
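 *
 * A typical producer allocates the request from the id_req_kmc cache
 * and hands it off; e.g. the address-resolution path below does,
 * in essence:
 *
 *	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
 *	if (req != NULL) {
 *		bcopy(mac, &req->rq_mac, IPOIB_ADDRL);
 *		ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);
 *	}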
1042 */ 1043 void 1044 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op) 1045 { 1046 /* Initialize request */ 1047 DPRINT(1, "ibd_queue_work_slot : op: %d \n", op); 1048 ptr->rq_op = op; 1049 1050 /* 1051 * Queue provided slot onto request pool. 1052 */ 1053 mutex_enter(&state->id_acache_req_lock); 1054 list_insert_tail(&state->id_req_list, ptr); 1055 1056 /* Go, fetch, async thread */ 1057 cv_signal(&state->id_acache_req_cv); 1058 mutex_exit(&state->id_acache_req_lock); 1059 } 1060 1061 /* 1062 * Main body of the per interface async thread. 1063 */ 1064 static void 1065 ibd_async_work(ibd_state_t *state) 1066 { 1067 ibd_req_t *ptr; 1068 callb_cpr_t cprinfo; 1069 1070 mutex_enter(&state->id_acache_req_lock); 1071 CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock, 1072 callb_generic_cpr, "ibd_async_work"); 1073 1074 for (;;) { 1075 ptr = list_get_head(&state->id_req_list); 1076 if (ptr != NULL) { 1077 mutex_exit(&state->id_acache_req_lock); 1078 1079 /* 1080 * Once we have done the operation, there is no 1081 * guarantee the request slot is going to be valid, 1082 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP, 1083 * TRAP). 1084 * 1085 * Perform the request. 1086 */ 1087 switch (ptr->rq_op) { 1088 case IBD_ASYNC_GETAH: 1089 ibd_async_acache(state, &ptr->rq_mac); 1090 break; 1091 case IBD_ASYNC_JOIN: 1092 case IBD_ASYNC_LEAVE: 1093 ibd_async_multicast(state, 1094 ptr->rq_gid, ptr->rq_op); 1095 break; 1096 case IBD_ASYNC_PROMON: 1097 ibd_async_setprom(state); 1098 break; 1099 case IBD_ASYNC_PROMOFF: 1100 ibd_async_unsetprom(state); 1101 break; 1102 case IBD_ASYNC_REAP: 1103 ibd_async_reap_group(state, 1104 ptr->rq_ptr, ptr->rq_gid, 1105 IB_MC_JSTATE_FULL); 1106 /* 1107 * the req buf contains in mce 1108 * structure, so we do not need 1109 * to free it here. 1110 */ 1111 ptr = NULL; 1112 break; 1113 case IBD_ASYNC_TRAP: 1114 ibd_async_trap(state, ptr); 1115 break; 1116 case IBD_ASYNC_SCHED: 1117 ibd_async_txsched(state); 1118 break; 1119 case IBD_ASYNC_LINK: 1120 ibd_async_link(state, ptr); 1121 break; 1122 case IBD_ASYNC_EXIT: 1123 mutex_enter(&state->id_acache_req_lock); 1124 #ifndef __lock_lint 1125 CALLB_CPR_EXIT(&cprinfo); 1126 #else 1127 mutex_exit(&state->id_acache_req_lock); 1128 #endif 1129 return; 1130 case IBD_ASYNC_RC_TOO_BIG: 1131 ibd_async_rc_process_too_big(state, 1132 ptr); 1133 break; 1134 case IBD_ASYNC_RC_CLOSE_ACT_CHAN: 1135 ibd_async_rc_close_act_chan(state, ptr); 1136 break; 1137 case IBD_ASYNC_RC_RECYCLE_ACE: 1138 ibd_async_rc_recycle_ace(state, ptr); 1139 break; 1140 } 1141 if (ptr != NULL) 1142 kmem_cache_free(state->id_req_kmc, ptr); 1143 1144 mutex_enter(&state->id_acache_req_lock); 1145 } else { 1146 #ifndef __lock_lint 1147 /* 1148 * Nothing to do: wait till new request arrives. 1149 */ 1150 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1151 cv_wait(&state->id_acache_req_cv, 1152 &state->id_acache_req_lock); 1153 CALLB_CPR_SAFE_END(&cprinfo, 1154 &state->id_acache_req_lock); 1155 #endif 1156 } 1157 } 1158 1159 /*NOTREACHED*/ 1160 _NOTE(NOT_REACHED) 1161 } 1162 1163 /* 1164 * Return when it is safe to queue requests to the async daemon; primarily 1165 * for subnet trap and async event handling. Disallow requests before the 1166 * daemon is created, and when interface deinitilization starts. 
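 *
 * The intended bracketing: the event producer calls ibd_async_safe()
 * before queueing the request and simply drops the event if it returns
 * B_FALSE, while the matching ibd_async_done() is issued by the task
 * handler itself once it finishes (as ibd_async_link() does below).
 * Sketch:
 *
 *	if (!ibd_async_safe(state))
 *		return;
 *	... allocate/fill an ibd_req_t, then
 *	ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP) ...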
1167 */ 1168 static boolean_t 1169 ibd_async_safe(ibd_state_t *state) 1170 { 1171 mutex_enter(&state->id_trap_lock); 1172 if (state->id_trap_stop) { 1173 mutex_exit(&state->id_trap_lock); 1174 return (B_FALSE); 1175 } 1176 state->id_trap_inprog++; 1177 mutex_exit(&state->id_trap_lock); 1178 return (B_TRUE); 1179 } 1180 1181 /* 1182 * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet 1183 * trap or event handling to complete to kill the async thread and deconstruct 1184 * the mcg/ace list. 1185 */ 1186 static void 1187 ibd_async_done(ibd_state_t *state) 1188 { 1189 mutex_enter(&state->id_trap_lock); 1190 if (--state->id_trap_inprog == 0) 1191 cv_signal(&state->id_trap_cv); 1192 mutex_exit(&state->id_trap_lock); 1193 } 1194 1195 /* 1196 * Hash functions: 1197 * ibd_hash_by_id: Returns the qpn as the hash entry into bucket. 1198 * ibd_hash_key_cmp: Compares two keys, return 0 on success or else 1. 1199 * These operate on mac addresses input into ibd_send, but there is no 1200 * guarantee on the alignment of the ipoib_mac_t structure. 1201 */ 1202 /*ARGSUSED*/ 1203 static uint_t 1204 ibd_hash_by_id(void *hash_data, mod_hash_key_t key) 1205 { 1206 ulong_t ptraddr = (ulong_t)key; 1207 uint_t hval; 1208 1209 /* 1210 * If the input address is 4 byte aligned, we can just dereference 1211 * it. This is most common, since IP will send in a 4 byte aligned 1212 * IP header, which implies the 24 byte IPoIB psuedo header will be 1213 * 4 byte aligned too. 1214 */ 1215 if ((ptraddr & 3) == 0) 1216 return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn); 1217 1218 bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t)); 1219 return (hval); 1220 } 1221 1222 static int 1223 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2) 1224 { 1225 if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0) 1226 return (0); 1227 else 1228 return (1); 1229 } 1230 1231 /* 1232 * Initialize all the per interface caches and lists; AH cache, 1233 * MCG list etc. 
1234 */ 1235 static int 1236 ibd_acache_init(ibd_state_t *state) 1237 { 1238 ibd_ace_t *ce; 1239 int i; 1240 1241 mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL); 1242 cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL); 1243 1244 mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL); 1245 mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL); 1246 mutex_enter(&state->id_ac_mutex); 1247 list_create(&state->id_ah_free, sizeof (ibd_ace_t), 1248 offsetof(ibd_ace_t, ac_list)); 1249 list_create(&state->id_ah_active, sizeof (ibd_ace_t), 1250 offsetof(ibd_ace_t, ac_list)); 1251 state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash", 1252 IBD_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor, 1253 ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP); 1254 list_create(&state->id_mc_full, sizeof (ibd_mce_t), 1255 offsetof(ibd_mce_t, mc_list)); 1256 list_create(&state->id_mc_non, sizeof (ibd_mce_t), 1257 offsetof(ibd_mce_t, mc_list)); 1258 list_create(&state->id_req_list, sizeof (ibd_req_t), 1259 offsetof(ibd_req_t, rq_list)); 1260 state->id_ac_hot_ace = NULL; 1261 1262 state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) * 1263 IBD_NUM_AH, KM_SLEEP); 1264 for (i = 0; i < IBD_NUM_AH; i++, ce++) { 1265 if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS, 1266 state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) { 1267 mutex_exit(&state->id_ac_mutex); 1268 ibd_acache_fini(state); 1269 return (DDI_FAILURE); 1270 } else { 1271 CLEAR_REFCYCLE(ce); 1272 ce->ac_mce = NULL; 1273 mutex_init(&ce->tx_too_big_mutex, NULL, 1274 MUTEX_DRIVER, NULL); 1275 IBD_ACACHE_INSERT_FREE(state, ce); 1276 } 1277 } 1278 mutex_exit(&state->id_ac_mutex); 1279 return (DDI_SUCCESS); 1280 } 1281 1282 static void 1283 ibd_acache_fini(ibd_state_t *state) 1284 { 1285 ibd_ace_t *ptr; 1286 1287 mutex_enter(&state->id_ac_mutex); 1288 1289 while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) { 1290 ASSERT(GET_REF(ptr) == 0); 1291 mutex_destroy(&ptr->tx_too_big_mutex); 1292 (void) ibt_free_ud_dest(ptr->ac_dest); 1293 } 1294 1295 while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) { 1296 ASSERT(GET_REF(ptr) == 0); 1297 mutex_destroy(&ptr->tx_too_big_mutex); 1298 (void) ibt_free_ud_dest(ptr->ac_dest); 1299 } 1300 1301 list_destroy(&state->id_ah_free); 1302 list_destroy(&state->id_ah_active); 1303 list_destroy(&state->id_mc_full); 1304 list_destroy(&state->id_mc_non); 1305 list_destroy(&state->id_req_list); 1306 kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * IBD_NUM_AH); 1307 mutex_exit(&state->id_ac_mutex); 1308 mutex_destroy(&state->id_ac_mutex); 1309 mutex_destroy(&state->id_mc_mutex); 1310 mutex_destroy(&state->id_acache_req_lock); 1311 cv_destroy(&state->id_acache_req_cv); 1312 } 1313 1314 /* 1315 * Search AH active hash list for a cached path to input destination. 1316 * If we are "just looking", hold == F. When we are in the Tx path, 1317 * we set hold == T to grab a reference on the AH so that it can not 1318 * be recycled to a new destination while the Tx request is posted. 1319 */ 1320 ibd_ace_t * 1321 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num) 1322 { 1323 ibd_ace_t *ptr; 1324 1325 ASSERT(mutex_owned(&state->id_ac_mutex)); 1326 1327 /* 1328 * Do hash search. 
1329 */ 1330 if (mod_hash_find(state->id_ah_active_hash, 1331 (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) { 1332 if (hold) 1333 INC_REF(ptr, num); 1334 return (ptr); 1335 } 1336 return (NULL); 1337 } 1338 1339 /* 1340 * This is called by the tx side; if an initialized AH is found in 1341 * the active list, it is locked down and can be used; if no entry 1342 * is found, an async request is queued to do path resolution. 1343 */ 1344 static ibd_ace_t * 1345 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe) 1346 { 1347 ibd_ace_t *ptr; 1348 ibd_req_t *req; 1349 1350 /* 1351 * Only attempt to print when we can; in the mdt pattr case, the 1352 * address is not aligned properly. 1353 */ 1354 if (((ulong_t)mac & 3) == 0) { 1355 DPRINT(4, 1356 "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X", 1357 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1358 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1359 htonl(mac->ipoib_gidsuff[1])); 1360 } 1361 1362 mutex_enter(&state->id_ac_mutex); 1363 1364 if (((ptr = state->id_ac_hot_ace) != NULL) && 1365 (memcmp(&ptr->ac_mac, mac, sizeof (*mac)) == 0)) { 1366 INC_REF(ptr, numwqe); 1367 mutex_exit(&state->id_ac_mutex); 1368 return (ptr); 1369 } 1370 if (((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL)) { 1371 state->id_ac_hot_ace = ptr; 1372 mutex_exit(&state->id_ac_mutex); 1373 return (ptr); 1374 } 1375 1376 /* 1377 * Implementation of a single outstanding async request; if 1378 * the operation is not started yet, queue a request and move 1379 * to ongoing state. Remember in id_ah_addr for which address 1380 * we are queueing the request, in case we need to flag an error; 1381 * Any further requests, for the same or different address, until 1382 * the operation completes, is sent back to GLDv3 to be retried. 1383 * The async thread will update id_ah_op with an error indication 1384 * or will set it to indicate the next look up can start; either 1385 * way, it will mac_tx_update() so that all blocked requests come 1386 * back here. 1387 */ 1388 *err = EAGAIN; 1389 if (state->id_ah_op == IBD_OP_NOTSTARTED) { 1390 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 1391 if (req != NULL) { 1392 /* 1393 * We did not even find the entry; queue a request 1394 * for it. 1395 */ 1396 bcopy(mac, &(req->rq_mac), IPOIB_ADDRL); 1397 ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH); 1398 state->id_ah_op = IBD_OP_ONGOING; 1399 bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL); 1400 } 1401 } else if ((state->id_ah_op != IBD_OP_ONGOING) && 1402 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) { 1403 /* 1404 * Check the status of the pathrecord lookup request 1405 * we had queued before. 1406 */ 1407 if (state->id_ah_op == IBD_OP_ERRORED) { 1408 *err = EFAULT; 1409 state->id_ah_error++; 1410 } else { 1411 /* 1412 * IBD_OP_ROUTERED case: We need to send to the 1413 * all-router MCG. If we can find the AH for 1414 * the mcg, the Tx will be attempted. If we 1415 * do not find the AH, we return NORESOURCES 1416 * to retry. 1417 */ 1418 ipoib_mac_t routermac; 1419 1420 (void) ibd_get_allroutergroup(state, mac, &routermac); 1421 ptr = ibd_acache_find(state, &routermac, B_TRUE, 1422 numwqe); 1423 } 1424 state->id_ah_op = IBD_OP_NOTSTARTED; 1425 } else if ((state->id_ah_op != IBD_OP_ONGOING) && 1426 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) { 1427 /* 1428 * This case can happen when we get a higher band 1429 * packet. 
The easiest way is to reset the state machine 1430 * to accommodate the higher priority packet. 1431 */ 1432 state->id_ah_op = IBD_OP_NOTSTARTED; 1433 } 1434 mutex_exit(&state->id_ac_mutex); 1435 1436 return (ptr); 1437 } 1438 1439 /* 1440 * Grab a not-currently-in-use AH/PathRecord from the active 1441 * list to recycle to a new destination. Only the async thread 1442 * executes this code. 1443 */ 1444 static ibd_ace_t * 1445 ibd_acache_get_unref(ibd_state_t *state) 1446 { 1447 ibd_ace_t *ptr = list_tail(&state->id_ah_active); 1448 boolean_t try_rc_chan_recycle = B_FALSE; 1449 1450 ASSERT(mutex_owned(&state->id_ac_mutex)); 1451 1452 /* 1453 * Do plain linear search. 1454 */ 1455 while (ptr != NULL) { 1456 /* 1457 * Note that it is possible that the "cycle" bit 1458 * is set on the AH w/o any reference count. The 1459 * mcg must have been deleted, and the tx cleanup 1460 * just decremented the reference count to 0, but 1461 * hasn't gotten around to grabbing the id_ac_mutex 1462 * to move the AH into the free list. 1463 */ 1464 if (GET_REF(ptr) == 0) { 1465 if (ptr->ac_chan != NULL) { 1466 ASSERT(state->id_enable_rc == B_TRUE); 1467 if (!try_rc_chan_recycle) { 1468 try_rc_chan_recycle = B_TRUE; 1469 ibd_rc_signal_ace_recycle(state, ptr); 1470 } 1471 } else { 1472 IBD_ACACHE_PULLOUT_ACTIVE(state, ptr); 1473 break; 1474 } 1475 } 1476 ptr = list_prev(&state->id_ah_active, ptr); 1477 } 1478 return (ptr); 1479 } 1480 1481 /* 1482 * Invoked to clean up AH from active list in case of multicast 1483 * disable and to handle sendonly memberships during mcg traps. 1484 * And for port up processing for multicast and unicast AHs. 1485 * Normally, the AH is taken off the active list, and put into 1486 * the free list to be recycled for a new destination. In case 1487 * Tx requests on the AH have not completed yet, the AH is marked 1488 * for reaping (which will put the AH on the free list) once the Tx's 1489 * complete; in this case, depending on the "force" input, we take 1490 * out the AH from the active list right now, or leave it also for 1491 * the reap operation. Returns TRUE if the AH is taken off the active 1492 * list (and either put into the free list right now, or arranged for 1493 * later), FALSE otherwise. 1494 */ 1495 boolean_t 1496 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force) 1497 { 1498 ibd_ace_t *acactive; 1499 boolean_t ret = B_TRUE; 1500 1501 ASSERT(mutex_owned(&state->id_ac_mutex)); 1502 1503 if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) { 1504 1505 /* 1506 * Note that the AH might already have the cycle bit set 1507 * on it; this might happen if sequences of multicast 1508 * enables and disables are coming so fast, that posted 1509 * Tx's to the mcg have not completed yet, and the cycle 1510 * bit is set successively by each multicast disable. 1511 */ 1512 if (SET_CYCLE_IF_REF(acactive)) { 1513 if (!force) { 1514 /* 1515 * The ace is kept on the active list, further 1516 * Tx's can still grab a reference on it; the 1517 * ace is reaped when all pending Tx's 1518 * referencing the AH complete. 1519 */ 1520 ret = B_FALSE; 1521 } else { 1522 /* 1523 * In the mcg trap case, we always pull the 1524 * AH from the active list. And also the port 1525 * up multi/unicast case. 
1526 */ 1527 ASSERT(acactive->ac_chan == NULL); 1528 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1529 acactive->ac_mce = NULL; 1530 } 1531 } else { 1532 /* 1533 * Determined the ref count is 0, thus reclaim 1534 * immediately after pulling out the ace from 1535 * the active list. 1536 */ 1537 ASSERT(acactive->ac_chan == NULL); 1538 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1539 acactive->ac_mce = NULL; 1540 IBD_ACACHE_INSERT_FREE(state, acactive); 1541 } 1542 1543 } 1544 return (ret); 1545 } 1546 1547 /* 1548 * Helper function for async path record lookup. If we are trying to 1549 * Tx to a MCG, check our membership, possibly trying to join the 1550 * group if required. If that fails, try to send the packet to the 1551 * all router group (indicated by the redirect output), pointing 1552 * the input mac address to the router mcg address. 1553 */ 1554 static ibd_mce_t * 1555 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect) 1556 { 1557 ib_gid_t mgid; 1558 ibd_mce_t *mce; 1559 ipoib_mac_t routermac; 1560 1561 *redirect = B_FALSE; 1562 ibd_n2h_gid(mac, &mgid); 1563 1564 /* 1565 * Check the FullMember+SendOnlyNonMember list. 1566 * Since we are the only one who manipulates the 1567 * id_mc_full list, no locks are needed. 1568 */ 1569 mce = IBD_MCACHE_FIND_FULL(state, mgid); 1570 if (mce != NULL) { 1571 DPRINT(4, "ibd_async_mcache : already joined to group"); 1572 return (mce); 1573 } 1574 1575 /* 1576 * Not found; try to join(SendOnlyNonMember) and attach. 1577 */ 1578 DPRINT(4, "ibd_async_mcache : not joined to group"); 1579 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1580 NULL) { 1581 DPRINT(4, "ibd_async_mcache : nonmem joined to group"); 1582 return (mce); 1583 } 1584 1585 /* 1586 * MCGroup not present; try to join the all-router group. If 1587 * any of the following steps succeed, we will be redirecting 1588 * to the all router group. 1589 */ 1590 DPRINT(4, "ibd_async_mcache : nonmem join failed"); 1591 if (!ibd_get_allroutergroup(state, mac, &routermac)) 1592 return (NULL); 1593 *redirect = B_TRUE; 1594 ibd_n2h_gid(&routermac, &mgid); 1595 bcopy(&routermac, mac, IPOIB_ADDRL); 1596 DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n", 1597 mgid.gid_prefix, mgid.gid_guid); 1598 1599 /* 1600 * Are we already joined to the router group? 1601 */ 1602 if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) { 1603 DPRINT(4, "ibd_async_mcache : using already joined router" 1604 "group\n"); 1605 return (mce); 1606 } 1607 1608 /* 1609 * Can we join(SendOnlyNonMember) the router group? 1610 */ 1611 DPRINT(4, "ibd_async_mcache : attempting join to router grp"); 1612 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1613 NULL) { 1614 DPRINT(4, "ibd_async_mcache : joined to router grp"); 1615 return (mce); 1616 } 1617 1618 return (NULL); 1619 } 1620 1621 /* 1622 * Async path record lookup code. 1623 */ 1624 static void 1625 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac) 1626 { 1627 ibd_ace_t *ce; 1628 ibd_mce_t *mce = NULL; 1629 ibt_path_attr_t path_attr; 1630 ibt_path_info_t path_info; 1631 ib_gid_t destgid; 1632 char ret = IBD_OP_NOTSTARTED; 1633 1634 DPRINT(4, "ibd_async_acache : %08X:%08X:%08X:%08X:%08X", 1635 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1636 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1637 htonl(mac->ipoib_gidsuff[1])); 1638 1639 /* 1640 * Check whether we are trying to transmit to a MCG. 1641 * In that case, we need to make sure we are a member of 1642 * the MCG. 
1643 */ 1644 if (mac->ipoib_qpn == htonl(IB_MC_QPN)) { 1645 boolean_t redirected; 1646 1647 /* 1648 * If we can not find or join the group or even 1649 * redirect, error out. 1650 */ 1651 if ((mce = ibd_async_mcache(state, mac, &redirected)) == 1652 NULL) { 1653 state->id_ah_op = IBD_OP_ERRORED; 1654 return; 1655 } 1656 1657 /* 1658 * If we got redirected, we need to determine whether 1659 * the AH for the new mcg is in the cache already, and 1660 * not pull it in then; otherwise proceed to get the 1661 * path for the new mcg. There is no guarantee that 1662 * if the AH is currently in the cache, it will still be 1663 * there when we look in ibd_acache_lookup(), but that's 1664 * okay, we will come back here. 1665 */ 1666 if (redirected) { 1667 ret = IBD_OP_ROUTERED; 1668 DPRINT(4, "ibd_async_acache : redirected to " 1669 "%08X:%08X:%08X:%08X:%08X", 1670 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1671 htonl(mac->ipoib_gidpref[1]), 1672 htonl(mac->ipoib_gidsuff[0]), 1673 htonl(mac->ipoib_gidsuff[1])); 1674 1675 mutex_enter(&state->id_ac_mutex); 1676 if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) { 1677 state->id_ah_op = IBD_OP_ROUTERED; 1678 mutex_exit(&state->id_ac_mutex); 1679 DPRINT(4, "ibd_async_acache : router AH found"); 1680 return; 1681 } 1682 mutex_exit(&state->id_ac_mutex); 1683 } 1684 } 1685 1686 /* 1687 * Get an AH from the free list. 1688 */ 1689 mutex_enter(&state->id_ac_mutex); 1690 if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) { 1691 /* 1692 * No free ones; try to grab an unreferenced active 1693 * one. Maybe we need to make the active list LRU, 1694 * but that will create more work for Tx callbacks. 1695 * Is there a way of not having to pull out the 1696 * entry from the active list, but just indicate it 1697 * is being recycled? Yes, but that creates one more 1698 * check in the fast lookup path. 1699 */ 1700 if ((ce = ibd_acache_get_unref(state)) == NULL) { 1701 /* 1702 * Pretty serious shortage now. 1703 */ 1704 state->id_ah_op = IBD_OP_NOTSTARTED; 1705 mutex_exit(&state->id_ac_mutex); 1706 DPRINT(10, "ibd_async_acache : failed to find AH " 1707 "slot\n"); 1708 return; 1709 } 1710 /* 1711 * We could check whether ac_mce points to a SendOnly 1712 * member and drop that membership now. Or do it lazily 1713 * at detach time. 1714 */ 1715 ce->ac_mce = NULL; 1716 } 1717 mutex_exit(&state->id_ac_mutex); 1718 ASSERT(ce->ac_mce == NULL); 1719 1720 /* 1721 * Update the entry. 1722 */ 1723 bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL); 1724 1725 bzero(&path_info, sizeof (path_info)); 1726 bzero(&path_attr, sizeof (ibt_path_attr_t)); 1727 path_attr.pa_sgid = state->id_sgid; 1728 path_attr.pa_num_dgids = 1; 1729 ibd_n2h_gid(&ce->ac_mac, &destgid); 1730 path_attr.pa_dgids = &destgid; 1731 path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 1732 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 1733 &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) { 1734 DPRINT(10, "ibd_async_acache : failed in ibt_get_paths"); 1735 goto error; 1736 } 1737 if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey, 1738 ntohl(ce->ac_mac.ipoib_qpn), 1739 &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) { 1740 DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest"); 1741 goto error; 1742 } 1743 1744 /* 1745 * mce is set whenever an AH is being associated with a 1746 * MCG; this will come in handy when we leave the MCG. The 1747 * lock protects Tx fastpath from scanning the active list. 
1748 */ 1749 if (mce != NULL) 1750 ce->ac_mce = mce; 1751 1752 /* 1753 * initiate a RC mode connection for unicast address 1754 */ 1755 if (state->id_enable_rc && (mac->ipoib_qpn != htonl(IB_MC_QPN)) && 1756 (htonl(mac->ipoib_qpn) & IBD_MAC_ADDR_RC)) { 1757 ASSERT(ce->ac_chan == NULL); 1758 DPRINT(10, "ibd_async_acache: call " 1759 "ibd_rc_try_connect(ace=%p)", ce); 1760 ibd_rc_try_connect(state, ce, &path_info); 1761 if (ce->ac_chan == NULL) { 1762 DPRINT(10, "ibd_async_acache: fail to setup RC" 1763 " channel"); 1764 state->rc_conn_fail++; 1765 goto error; 1766 } 1767 } 1768 1769 mutex_enter(&state->id_ac_mutex); 1770 IBD_ACACHE_INSERT_ACTIVE(state, ce); 1771 state->id_ah_op = ret; 1772 mutex_exit(&state->id_ac_mutex); 1773 return; 1774 error: 1775 /* 1776 * We might want to drop SendOnly membership here if we 1777 * joined above. The lock protects Tx callbacks inserting 1778 * into the free list. 1779 */ 1780 mutex_enter(&state->id_ac_mutex); 1781 state->id_ah_op = IBD_OP_ERRORED; 1782 IBD_ACACHE_INSERT_FREE(state, ce); 1783 mutex_exit(&state->id_ac_mutex); 1784 } 1785 1786 /* 1787 * While restoring port's presence on the subnet on a port up, it is possible 1788 * that the port goes down again. 1789 */ 1790 static void 1791 ibd_async_link(ibd_state_t *state, ibd_req_t *req) 1792 { 1793 ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr; 1794 link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN : 1795 LINK_STATE_UP; 1796 ibd_mce_t *mce, *pmce; 1797 ibd_ace_t *ace, *pace; 1798 1799 DPRINT(10, "ibd_async_link(): %d", opcode); 1800 1801 /* 1802 * On a link up, revalidate the link speed/width. No point doing 1803 * this on a link down, since we will be unable to do SA operations, 1804 * defaulting to the lowest speed. Also notice that we update our 1805 * notion of speed before calling mac_link_update(), which will do 1806 * necessary higher level notifications for speed changes. 1807 */ 1808 if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) { 1809 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) 1810 state->id_link_speed = ibd_get_portspeed(state); 1811 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 1812 } 1813 1814 /* 1815 * Do all the work required to establish our presence on 1816 * the subnet. 1817 */ 1818 if (opcode == IBD_LINK_UP_ABSENT) { 1819 /* 1820 * If in promiscuous mode ... 1821 */ 1822 if (state->id_prom_op == IBD_OP_COMPLETED) { 1823 /* 1824 * Drop all nonmembership. 1825 */ 1826 ibd_async_unsetprom(state); 1827 1828 /* 1829 * Then, try to regain nonmembership to all mcg's. 1830 */ 1831 ibd_async_setprom(state); 1832 1833 } 1834 1835 /* 1836 * Drop all sendonly membership (which also gets rid of the 1837 * AHs); try to reacquire all full membership. 1838 */ 1839 mce = list_head(&state->id_mc_full); 1840 while ((pmce = mce) != NULL) { 1841 mce = list_next(&state->id_mc_full, mce); 1842 if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON) 1843 ibd_leave_group(state, 1844 pmce->mc_info.mc_adds_vect.av_dgid, 1845 IB_MC_JSTATE_SEND_ONLY_NON); 1846 else 1847 ibd_reacquire_group(state, pmce); 1848 } 1849 1850 /* 1851 * Recycle all active AHs to free list (and if there are 1852 * pending posts, make sure they will go into the free list 1853 * once the Tx's complete). Grab the lock to prevent 1854 * concurrent Tx's as well as Tx cleanups. 
1855 */ 1856 mutex_enter(&state->id_ac_mutex); 1857 ace = list_head(&state->id_ah_active); 1858 while ((pace = ace) != NULL) { 1859 boolean_t cycled; 1860 1861 ace = list_next(&state->id_ah_active, ace); 1862 mce = pace->ac_mce; 1863 if (pace->ac_chan != NULL) { 1864 ASSERT(mce == NULL); 1865 ASSERT(state->id_enable_rc == B_TRUE); 1866 if (pace->ac_chan->chan_state == 1867 IBD_RC_STATE_ACT_ESTAB) { 1868 INC_REF(pace, 1); 1869 IBD_ACACHE_PULLOUT_ACTIVE(state, pace); 1870 pace->ac_chan->chan_state = 1871 IBD_RC_STATE_ACT_CLOSING; 1872 ibd_rc_signal_act_close(state, pace); 1873 } else { 1874 state->rc_act_close_simultaneous++; 1875 DPRINT(40, "ibd_async_link: other " 1876 "thread is closing it, ace=%p, " 1877 "ac_chan=%p, chan_state=%d", 1878 pace, pace->ac_chan, 1879 pace->ac_chan->chan_state); 1880 } 1881 } else { 1882 cycled = ibd_acache_recycle(state, 1883 &pace->ac_mac, B_TRUE); 1884 } 1885 /* 1886 * If this is for an mcg, it must be for a fullmember, 1887 * since we got rid of send-only members above when 1888 * processing the mce list. 1889 */ 1890 ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate == 1891 IB_MC_JSTATE_FULL))); 1892 1893 /* 1894 * Check if the fullmember mce needs to be torn down, 1895 * ie whether the DLPI disable has already been done. 1896 * If so, do some of the work of tx_cleanup, namely 1897 * causing leave (which will fail), detach and 1898 * mce-freeing. tx_cleanup will put the AH into free 1899 * list. The reason to duplicate some of this 1900 * tx_cleanup work is because we want to delete the 1901 * AH right now instead of waiting for tx_cleanup, to 1902 * force subsequent Tx's to reacquire an AH. 1903 */ 1904 if ((mce != NULL) && (mce->mc_fullreap)) 1905 ibd_async_reap_group(state, mce, 1906 mce->mc_info.mc_adds_vect.av_dgid, 1907 mce->mc_jstate); 1908 } 1909 mutex_exit(&state->id_ac_mutex); 1910 } 1911 1912 /* 1913 * mac handle is guaranteed to exist since driver does ibt_close_hca() 1914 * (which stops further events from being delivered) before 1915 * mac_unregister(). At this point, it is guaranteed that mac_register 1916 * has already been done. 1917 */ 1918 mutex_enter(&state->id_link_mutex); 1919 state->id_link_state = lstate; 1920 mac_link_update(state->id_mh, lstate); 1921 mutex_exit(&state->id_link_mutex); 1922 1923 ibd_async_done(state); 1924 } 1925 1926 /* 1927 * Check the pkey table to see if we can find the pkey we're looking for. 1928 * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on 1929 * failure. 1930 */ 1931 static int 1932 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey, 1933 uint16_t *pkix) 1934 { 1935 uint16_t ndx; 1936 1937 ASSERT(pkix != NULL); 1938 1939 for (ndx = 0; ndx < pkey_tbl_sz; ndx++) { 1940 if (pkey_tbl[ndx] == pkey) { 1941 *pkix = ndx; 1942 return (0); 1943 } 1944 } 1945 return (-1); 1946 } 1947 1948 /* 1949 * When the link is notified up, we need to do a few things, based 1950 * on the port's current p_init_type_reply claiming a reinit has been 1951 * done or not. The reinit steps are: 1952 * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify 1953 * the old Pkey and GID0 are correct. 1954 * 2. Register for mcg traps (already done by ibmf). 1955 * 3. If PreservePresenceReply indicates the SM has restored port's presence 1956 * in subnet, nothing more to do. Else go to next steps (on async daemon). 1957 * 4. Give up all sendonly memberships. 1958 * 5. Acquire all full memberships. 1959 * 6. In promiscuous mode, acquire all non memberships. 1960 * 7. 
Recycle all AHs to free list. 1961 */ 1962 static void 1963 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code) 1964 { 1965 ibt_hca_portinfo_t *port_infop = NULL; 1966 ibt_status_t ibt_status; 1967 uint_t psize, port_infosz; 1968 ibd_link_op_t opcode; 1969 ibd_req_t *req; 1970 link_state_t new_link_state = LINK_STATE_UP; 1971 uint8_t itreply; 1972 uint16_t pkix; 1973 int ret; 1974 1975 /* 1976 * Let's not race with a plumb or an unplumb; if we detect a 1977 * pkey relocation event later on here, we may have to restart. 1978 */ 1979 ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 1980 1981 mutex_enter(&state->id_link_mutex); 1982 1983 /* 1984 * If the init code in ibd_m_start hasn't yet set up the 1985 * pkey/gid, nothing to do; that code will set the link state. 1986 */ 1987 if (state->id_link_state == LINK_STATE_UNKNOWN) { 1988 mutex_exit(&state->id_link_mutex); 1989 goto link_mod_return; 1990 } 1991 1992 /* 1993 * If this routine was called in response to a port down event, 1994 * we just need to see if this should be informed. 1995 */ 1996 if (code == IBT_ERROR_PORT_DOWN) { 1997 new_link_state = LINK_STATE_DOWN; 1998 goto update_link_state; 1999 } 2000 2001 /* 2002 * If it's not a port down event we've received, try to get the port 2003 * attributes first. If we fail here, the port is as good as down. 2004 * Otherwise, if the link went down by the time the handler gets 2005 * here, give up - we cannot even validate the pkey/gid since those 2006 * are not valid and this is as bad as a port down anyway. 2007 */ 2008 ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 2009 &port_infop, &psize, &port_infosz); 2010 if ((ibt_status != IBT_SUCCESS) || (psize != 1) || 2011 (port_infop->p_linkstate != IBT_PORT_ACTIVE)) { 2012 new_link_state = LINK_STATE_DOWN; 2013 goto update_link_state; 2014 } 2015 2016 /* 2017 * Check the SM InitTypeReply flags. If both NoLoadReply and 2018 * PreserveContentReply are 0, we don't know anything about the 2019 * data loaded into the port attributes, so we need to verify 2020 * if gid0 and pkey are still valid. 2021 */ 2022 itreply = port_infop->p_init_type_reply; 2023 if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) && 2024 ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) { 2025 /* 2026 * Check to see if the subnet part of GID0 has changed. If 2027 * not, check the simple case first to see if the pkey 2028 * index is the same as before; finally check to see if the 2029 * pkey has been relocated to a different index in the table. 2030 */ 2031 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) 2032 if (bcmp(port_infop->p_sgid_tbl, 2033 &state->id_sgid, sizeof (ib_gid_t)) != 0) { 2034 2035 new_link_state = LINK_STATE_DOWN; 2036 2037 } else if (port_infop->p_pkey_tbl[state->id_pkix] == 2038 state->id_pkey) { 2039 2040 new_link_state = LINK_STATE_UP; 2041 2042 } else if (ibd_locate_pkey(port_infop->p_pkey_tbl, 2043 port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) { 2044 2045 ibt_free_portinfo(port_infop, port_infosz); 2046 mutex_exit(&state->id_link_mutex); 2047 2048 /* 2049 * Currently a restart is required if our pkey has moved 2050 * in the pkey table. If we get the ibt_recycle_ud() to 2051 * work as documented (expected), we may be able to 2052 * avoid a complete restart. Note that we've already 2053 * marked both the start and stop 'in-progress' flags, 2054 * so it is ok to go ahead and do this restart. 
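 *
 * By this point port_infop has already been freed and id_link_mutex
 * dropped, so after the restart we skip the normal link state update
 * and jump straight to link_mod_return.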
2055 */ 2056 (void) ibd_undo_start(state, LINK_STATE_DOWN); 2057 if ((ret = ibd_start(state)) != 0) { 2058 DPRINT(10, "ibd_restart: cannot restart, " 2059 "ret=%d", ret); 2060 } 2061 2062 goto link_mod_return; 2063 } else { 2064 new_link_state = LINK_STATE_DOWN; 2065 } 2066 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) 2067 } 2068 2069 update_link_state: 2070 if (port_infop) { 2071 ibt_free_portinfo(port_infop, port_infosz); 2072 } 2073 2074 /* 2075 * If the old state is the same as the new state, nothing to do 2076 */ 2077 if (state->id_link_state == new_link_state) { 2078 mutex_exit(&state->id_link_mutex); 2079 goto link_mod_return; 2080 } 2081 2082 /* 2083 * Ok, so there was a link state change; see if it's safe to ask 2084 * the async thread to do the work 2085 */ 2086 if (!ibd_async_safe(state)) { 2087 state->id_link_state = new_link_state; 2088 mutex_exit(&state->id_link_mutex); 2089 goto link_mod_return; 2090 } 2091 2092 mutex_exit(&state->id_link_mutex); 2093 2094 /* 2095 * If we're reporting a link up, check InitTypeReply to see if 2096 * the SM has ensured that the port's presence in mcg, traps, 2097 * etc. is intact. 2098 */ 2099 if (new_link_state == LINK_STATE_DOWN) { 2100 opcode = IBD_LINK_DOWN; 2101 } else { 2102 if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == 2103 SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) { 2104 opcode = IBD_LINK_UP; 2105 } else { 2106 opcode = IBD_LINK_UP_ABSENT; 2107 } 2108 } 2109 2110 /* 2111 * Queue up a request for ibd_async_link() to handle this link 2112 * state change event 2113 */ 2114 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 2115 req->rq_ptr = (void *)opcode; 2116 ibd_queue_work_slot(state, req, IBD_ASYNC_LINK); 2117 2118 link_mod_return: 2119 ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 2120 } 2121 2122 /* 2123 * For the port up/down events, IBTL guarantees there will not be concurrent 2124 * invocations of the handler. IBTL might coalesce link transition events, 2125 * and not invoke the handler for _each_ up/down transition, but it will 2126 * invoke the handler with last known state 2127 */ 2128 static void 2129 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 2130 ibt_async_code_t code, ibt_async_event_t *event) 2131 { 2132 ibd_state_t *state = (ibd_state_t *)clnt_private; 2133 2134 switch (code) { 2135 case IBT_ERROR_CATASTROPHIC_CHAN: 2136 ibd_print_warn(state, "catastrophic channel error"); 2137 break; 2138 case IBT_ERROR_CQ: 2139 ibd_print_warn(state, "completion queue error"); 2140 break; 2141 case IBT_PORT_CHANGE_EVENT: 2142 /* 2143 * Events will be delivered to all instances that have 2144 * done ibt_open_hca() but not yet done ibt_close_hca(). 2145 * Only need to do work for our port; IBTF will deliver 2146 * events for other ports on the hca we have ibt_open_hca'ed 2147 * too. Note that id_port is initialized in ibd_attach() 2148 * before we do an ibt_open_hca() in ibd_attach(). 2149 */ 2150 ASSERT(state->id_hca_hdl == hca_hdl); 2151 if (state->id_port != event->ev_port) 2152 break; 2153 2154 if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) == 2155 IBT_PORT_CHANGE_PKEY) { 2156 ibd_link_mod(state, code); 2157 } 2158 break; 2159 case IBT_ERROR_PORT_DOWN: 2160 case IBT_CLNT_REREG_EVENT: 2161 case IBT_EVENT_PORT_UP: 2162 /* 2163 * Events will be delivered to all instances that have 2164 * done ibt_open_hca() but not yet done ibt_close_hca(). 2165 * Only need to do work for our port; IBTF will deliver 2166 * events for other ports on the hca we have ibt_open_hca'ed 2167 * too. 
Note that id_port is initialized in ibd_attach() 2168 * before we do an ibt_open_hca() in ibd_attach(). 2169 */ 2170 ASSERT(state->id_hca_hdl == hca_hdl); 2171 if (state->id_port != event->ev_port) 2172 break; 2173 2174 ibd_link_mod(state, code); 2175 break; 2176 2177 case IBT_HCA_ATTACH_EVENT: 2178 case IBT_HCA_DETACH_EVENT: 2179 /* 2180 * When a new card is plugged to the system, attach_event is 2181 * invoked. Additionally, a cfgadm needs to be run to make the 2182 * card known to the system, and an ifconfig needs to be run to 2183 * plumb up any ibd interfaces on the card. In the case of card 2184 * unplug, a cfgadm is run that will trigger any RCM scripts to 2185 * unplumb the ibd interfaces on the card; when the card is 2186 * actually unplugged, the detach_event is invoked; 2187 * additionally, if any ibd instances are still active on the 2188 * card (eg there were no associated RCM scripts), driver's 2189 * detach routine is invoked. 2190 */ 2191 break; 2192 default: 2193 break; 2194 } 2195 } 2196 2197 static int 2198 ibd_register_mac(ibd_state_t *state, dev_info_t *dip) 2199 { 2200 mac_register_t *macp; 2201 int ret; 2202 2203 if ((macp = mac_alloc(MAC_VERSION)) == NULL) { 2204 DPRINT(10, "ibd_register_mac: mac_alloc() failed"); 2205 return (DDI_FAILURE); 2206 } 2207 2208 /* 2209 * Note that when we register with mac during attach, we don't 2210 * have the id_macaddr yet, so we'll simply be registering a 2211 * zero macaddr that we'll overwrite later during plumb (in 2212 * ibd_m_start()). Similar is the case with id_mtu - we'll 2213 * update the mac layer with the correct mtu during plumb. 2214 */ 2215 macp->m_type_ident = MAC_PLUGIN_IDENT_IB; 2216 macp->m_driver = state; 2217 macp->m_dip = dip; 2218 macp->m_src_addr = (uint8_t *)&state->id_macaddr; 2219 macp->m_callbacks = &ibd_m_callbacks; 2220 macp->m_min_sdu = 0; 2221 if (state->id_enable_rc) { 2222 macp->m_max_sdu = state->rc_mtu - IPOIB_HDRSIZE; 2223 } else { 2224 macp->m_max_sdu = IBD_DEF_MAX_SDU; 2225 } 2226 2227 /* 2228 * Register ourselves with the GLDv3 interface 2229 */ 2230 if ((ret = mac_register(macp, &state->id_mh)) != 0) { 2231 mac_free(macp); 2232 DPRINT(10, 2233 "ibd_register_mac: mac_register() failed, ret=%d", ret); 2234 return (DDI_FAILURE); 2235 } 2236 2237 mac_free(macp); 2238 return (DDI_SUCCESS); 2239 } 2240 2241 static int 2242 ibd_record_capab(ibd_state_t *state, dev_info_t *dip) 2243 { 2244 ibt_hca_attr_t hca_attrs; 2245 ibt_status_t ibt_status; 2246 2247 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) 2248 2249 /* 2250 * Query the HCA and fetch its attributes 2251 */ 2252 ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 2253 ASSERT(ibt_status == IBT_SUCCESS); 2254 2255 /* 2256 * 1. Set the Hardware Checksum capability. Currently we only consider 2257 * full checksum offload. 2258 */ 2259 if (state->id_enable_rc) { 2260 state->id_hwcksum_capab = 0; 2261 } else { 2262 if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL) 2263 == IBT_HCA_CKSUM_FULL) { 2264 state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL; 2265 } 2266 } 2267 2268 /* 2269 * 2. 
Set LSO policy, capability and maximum length 2270 */ 2271 if (state->id_enable_rc) { 2272 state->id_lso_policy = B_FALSE; 2273 state->id_lso_capable = B_FALSE; 2274 state->id_lso_maxlen = 0; 2275 } else { 2276 if (ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS 2277 |DDI_PROP_NOTPROM, IBD_PROP_LSO_POLICY, 1)) { 2278 state->id_lso_policy = B_TRUE; 2279 } else { 2280 state->id_lso_policy = B_FALSE; 2281 } 2282 2283 if (hca_attrs.hca_max_lso_size > 0) { 2284 state->id_lso_capable = B_TRUE; 2285 if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN) 2286 state->id_lso_maxlen = IBD_LSO_MAXLEN; 2287 else 2288 state->id_lso_maxlen = 2289 hca_attrs.hca_max_lso_size; 2290 } else { 2291 state->id_lso_capable = B_FALSE; 2292 state->id_lso_maxlen = 0; 2293 } 2294 } 2295 2296 /* 2297 * 3. Set Reserved L_Key capability 2298 */ 2299 if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) { 2300 state->id_hca_res_lkey_capab = 1; 2301 state->id_res_lkey = hca_attrs.hca_reserved_lkey; 2302 state->rc_enable_iov_map = B_TRUE; 2303 } else { 2304 /* If no reserved lkey, we will not use ibt_map_mem_iov */ 2305 state->rc_enable_iov_map = B_FALSE; 2306 } 2307 2308 /* 2309 * 4. Set maximum sqseg value after checking to see if extended sgl 2310 * size information is provided by the hca 2311 */ 2312 if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) { 2313 state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz; 2314 state->rc_tx_max_sqseg = hca_attrs.hca_conn_send_sgl_sz; 2315 } else { 2316 state->id_max_sqseg = hca_attrs.hca_max_sgl; 2317 state->rc_tx_max_sqseg = hca_attrs.hca_max_sgl; 2318 } 2319 if (state->id_max_sqseg > IBD_MAX_SQSEG) { 2320 state->id_max_sqseg = IBD_MAX_SQSEG; 2321 } else if (state->id_max_sqseg < IBD_MAX_SQSEG) { 2322 ibd_print_warn(state, "Set #sgl = %d instead of default %d", 2323 state->id_max_sqseg, IBD_MAX_SQSEG); 2324 } 2325 if (state->rc_tx_max_sqseg > IBD_MAX_SQSEG) { 2326 state->rc_tx_max_sqseg = IBD_MAX_SQSEG; 2327 } else if (state->rc_tx_max_sqseg < IBD_MAX_SQSEG) { 2328 ibd_print_warn(state, "RC mode: Set #sgl = %d instead of " 2329 "default %d", state->rc_tx_max_sqseg, IBD_MAX_SQSEG); 2330 } 2331 2332 /* 2333 * Translating the virtual address regions into physical regions 2334 * for using the Reserved LKey feature results in a wr sgl that 2335 * is a little longer. Since failing ibt_map_mem_iov() is costly, 2336 * we'll fix a high-water mark (65%) for when we should stop. 2337 */ 2338 state->id_max_sqseg_hiwm = (state->id_max_sqseg * 65) / 100; 2339 state->rc_max_sqseg_hiwm = (state->rc_tx_max_sqseg * 65) / 100; 2340 2341 /* 2342 * 5. 
Set number of recv and send wqes after checking hca maximum 2343 * channel size 2344 */ 2345 if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE) { 2346 state->id_num_rwqe = hca_attrs.hca_max_chan_sz; 2347 } else { 2348 state->id_num_rwqe = IBD_NUM_RWQE; 2349 } 2350 state->id_rx_bufs_outstanding_limit = state->id_num_rwqe - IBD_RWQE_MIN; 2351 if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE) { 2352 state->id_num_swqe = hca_attrs.hca_max_chan_sz; 2353 } else { 2354 state->id_num_swqe = IBD_NUM_SWQE; 2355 } 2356 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 2357 2358 return (DDI_SUCCESS); 2359 } 2360 2361 static int 2362 ibd_unattach(ibd_state_t *state, dev_info_t *dip) 2363 { 2364 int instance; 2365 uint32_t progress = state->id_mac_state; 2366 ibt_status_t ret; 2367 2368 if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) != 0) { 2369 cmn_err(CE_CONT, "ibd_detach: failed: rx bufs outstanding\n"); 2370 return (DDI_FAILURE); 2371 } 2372 2373 /* make sure rx resources are freed */ 2374 ibd_free_rx_rsrcs(state); 2375 2376 if (progress & IBD_DRV_MAC_REGISTERED) { 2377 (void) mac_unregister(state->id_mh); 2378 state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED); 2379 } 2380 2381 if (progress & IBD_DRV_PD_ALLOCD) { 2382 if ((ret = ibt_free_pd(state->id_hca_hdl, 2383 state->id_pd_hdl)) != IBT_SUCCESS) { 2384 ibd_print_warn(state, "failed to free " 2385 "protection domain, ret=%d", ret); 2386 } 2387 state->id_pd_hdl = NULL; 2388 state->id_mac_state &= (~IBD_DRV_PD_ALLOCD); 2389 } 2390 2391 if (progress & IBD_DRV_HCA_OPENED) { 2392 if ((ret = ibt_close_hca(state->id_hca_hdl)) != 2393 IBT_SUCCESS) { 2394 ibd_print_warn(state, "failed to close " 2395 "HCA device, ret=%d", ret); 2396 } 2397 state->id_hca_hdl = NULL; 2398 state->id_mac_state &= (~IBD_DRV_HCA_OPENED); 2399 } 2400 2401 if (progress & IBD_DRV_IBTL_ATTACH_DONE) { 2402 if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) { 2403 ibd_print_warn(state, 2404 "ibt_detach() failed, ret=%d", ret); 2405 } 2406 state->id_ibt_hdl = NULL; 2407 state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE); 2408 } 2409 2410 if (progress & IBD_DRV_TXINTR_ADDED) { 2411 ddi_remove_softintr(state->id_tx); 2412 state->id_tx = NULL; 2413 state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED); 2414 } 2415 2416 if (progress & IBD_DRV_RXINTR_ADDED) { 2417 ddi_remove_softintr(state->id_rx); 2418 state->id_rx = NULL; 2419 state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED); 2420 } 2421 2422 #ifdef DEBUG 2423 if (progress & IBD_DRV_RC_PRIVATE_STATE) { 2424 kstat_delete(state->rc_ksp); 2425 state->id_mac_state &= (~IBD_DRV_RC_PRIVATE_STATE); 2426 } 2427 #endif 2428 2429 if (progress & IBD_DRV_STATE_INITIALIZED) { 2430 ibd_state_fini(state); 2431 state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED); 2432 } 2433 2434 instance = ddi_get_instance(dip); 2435 ddi_soft_state_free(ibd_list, instance); 2436 2437 return (DDI_SUCCESS); 2438 } 2439 2440 /* 2441 * Attach device to the IO framework. 
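 *
 * Each step that completes successfully records an IBD_DRV_* flag in
 * id_mac_state, so that ibd_unattach() can undo exactly what was done
 * if a later step fails (or at detach time).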
2442 */ 2443 static int 2444 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2445 { 2446 ibd_state_t *state = NULL; 2447 ib_guid_t hca_guid; 2448 int instance; 2449 ibt_status_t ret; 2450 int rv; 2451 2452 /* 2453 * IBD doesn't support suspend/resume 2454 */ 2455 if (cmd != DDI_ATTACH) 2456 return (DDI_FAILURE); 2457 2458 /* 2459 * Allocate softstate structure 2460 */ 2461 instance = ddi_get_instance(dip); 2462 if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) 2463 return (DDI_FAILURE); 2464 state = ddi_get_soft_state(ibd_list, instance); 2465 2466 /* 2467 * Initialize mutexes and condition variables 2468 */ 2469 if (ibd_state_init(state, dip) != DDI_SUCCESS) { 2470 DPRINT(10, "ibd_attach: failed in ibd_state_init()"); 2471 goto attach_fail; 2472 } 2473 state->id_mac_state |= IBD_DRV_STATE_INITIALIZED; 2474 2475 /* 2476 * Allocate rx,tx softintr 2477 */ 2478 if (ibd_rx_softintr == 1) { 2479 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx, 2480 NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) { 2481 DPRINT(10, "ibd_attach: failed in " 2482 "ddi_add_softintr(id_rx), ret=%d", rv); 2483 goto attach_fail; 2484 } 2485 state->id_mac_state |= IBD_DRV_RXINTR_ADDED; 2486 } 2487 if (ibd_tx_softintr == 1) { 2488 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx, 2489 NULL, NULL, ibd_tx_recycle, 2490 (caddr_t)state)) != DDI_SUCCESS) { 2491 DPRINT(10, "ibd_attach: failed in " 2492 "ddi_add_softintr(id_tx), ret=%d", rv); 2493 goto attach_fail; 2494 } 2495 state->id_mac_state |= IBD_DRV_TXINTR_ADDED; 2496 } 2497 2498 /* 2499 * Obtain IBA P_Key, port number and HCA guid and validate 2500 * them (for P_Key, only full members are allowed as per 2501 * IPoIB specification; neither port number nor HCA guid 2502 * can be zero) 2503 */ 2504 if ((state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, 2505 "port-pkey", IB_PKEY_INVALID_LIMITED)) <= IB_PKEY_INVALID_FULL) { 2506 DPRINT(10, "ibd_attach: port device has wrong partition (0x%x)", 2507 state->id_pkey); 2508 goto attach_fail; 2509 } 2510 if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, 2511 "port-number", 0)) == 0) { 2512 DPRINT(10, "ibd_attach: invalid port number (%d)", 2513 state->id_port); 2514 goto attach_fail; 2515 } 2516 if ((hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0, 2517 "hca-guid", 0)) == 0) { 2518 DPRINT(10, "ibd_attach: port hca has invalid guid (0x%llx)", 2519 hca_guid); 2520 goto attach_fail; 2521 } 2522 2523 /* 2524 * Attach to IBTL 2525 */ 2526 if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state, 2527 &state->id_ibt_hdl)) != IBT_SUCCESS) { 2528 DPRINT(10, "ibd_attach: failed in ibt_attach(), ret=%d", ret); 2529 goto attach_fail; 2530 } 2531 state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE; 2532 2533 /* 2534 * Open the HCA 2535 */ 2536 if ((ret = ibt_open_hca(state->id_ibt_hdl, hca_guid, 2537 &state->id_hca_hdl)) != IBT_SUCCESS) { 2538 DPRINT(10, "ibd_attach: ibt_open_hca() failed, ret=%d", ret); 2539 goto attach_fail; 2540 } 2541 state->id_mac_state |= IBD_DRV_HCA_OPENED; 2542 2543 /* Get RC config before ibd_record_capab */ 2544 ibd_rc_get_conf(state); 2545 2546 #ifdef DEBUG 2547 /* Initialize Driver Counters for Reliable Connected Mode */ 2548 if (state->id_enable_rc) { 2549 if (ibd_rc_init_stats(state) != DDI_SUCCESS) { 2550 DPRINT(10, "ibd_attach: failed in ibd_rc_init_stats"); 2551 goto attach_fail; 2552 } 2553 state->id_mac_state |= IBD_DRV_RC_PRIVATE_STATE; 2554 } 2555 #endif 2556 2557 /* 2558 * Record capabilities 2559 */ 2560 (void) ibd_record_capab(state, 
dip); 2561 2562 /* 2563 * Allocate a protection domain on the HCA 2564 */ 2565 if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS, 2566 &state->id_pd_hdl)) != IBT_SUCCESS) { 2567 DPRINT(10, "ibd_attach: ibt_alloc_pd() failed, ret=%d", ret); 2568 goto attach_fail; 2569 } 2570 state->id_mac_state |= IBD_DRV_PD_ALLOCD; 2571 2572 2573 /* 2574 * Register ibd interfaces with the Nemo framework 2575 */ 2576 if (ibd_register_mac(state, dip) != IBT_SUCCESS) { 2577 DPRINT(10, "ibd_attach: failed in ibd_register_mac()"); 2578 goto attach_fail; 2579 } 2580 state->id_mac_state |= IBD_DRV_MAC_REGISTERED; 2581 2582 /* 2583 * We're done with everything we could to make the attach 2584 * succeed. All the buffer allocations and IPoIB broadcast 2585 * group joins are deferred to when the interface instance 2586 * is actually plumbed to avoid wasting memory. 2587 */ 2588 return (DDI_SUCCESS); 2589 2590 attach_fail: 2591 (void) ibd_unattach(state, dip); 2592 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 2593 return (DDI_FAILURE); 2594 } 2595 2596 /* 2597 * Detach device from the IO framework. 2598 */ 2599 static int 2600 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2601 { 2602 ibd_state_t *state; 2603 int instance; 2604 2605 /* 2606 * IBD doesn't support suspend/resume 2607 */ 2608 if (cmd != DDI_DETACH) 2609 return (DDI_FAILURE); 2610 2611 /* 2612 * Get the instance softstate 2613 */ 2614 instance = ddi_get_instance(dip); 2615 state = ddi_get_soft_state(ibd_list, instance); 2616 2617 /* 2618 * Release all resources we're holding still. Note that if we'd 2619 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly 2620 * so far, we should find all the flags we need in id_mac_state. 2621 */ 2622 return (ibd_unattach(state, dip)); 2623 } 2624 2625 /* 2626 * Pre ibt_attach() driver initialization 2627 */ 2628 static int 2629 ibd_state_init(ibd_state_t *state, dev_info_t *dip) 2630 { 2631 char buf[64]; 2632 2633 mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL); 2634 state->id_link_state = LINK_STATE_UNKNOWN; 2635 2636 mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL); 2637 cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL); 2638 state->id_trap_stop = B_TRUE; 2639 state->id_trap_inprog = 0; 2640 2641 mutex_init(&state->id_scq_poll_lock, NULL, MUTEX_DRIVER, NULL); 2642 mutex_init(&state->id_rcq_poll_lock, NULL, MUTEX_DRIVER, NULL); 2643 state->id_dip = dip; 2644 2645 mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL); 2646 2647 mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2648 mutex_init(&state->id_tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2649 mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL); 2650 state->id_tx_busy = 0; 2651 mutex_init(&state->id_lso_lock, NULL, MUTEX_DRIVER, NULL); 2652 2653 state->id_rx_list.dl_bufs_outstanding = 0; 2654 state->id_rx_list.dl_cnt = 0; 2655 mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2656 mutex_init(&state->id_rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2657 (void) sprintf(buf, "ibd_req%d", ddi_get_instance(dip)); 2658 state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t), 2659 0, NULL, NULL, NULL, NULL, NULL, 0); 2660 2661 mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL); 2662 cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL); 2663 2664 /* For Reliable Connected Mode */ 2665 mutex_init(&state->rc_rx_lock, NULL, MUTEX_DRIVER, NULL); 2666 mutex_init(&state->rc_tx_large_bufs_lock, NULL, MUTEX_DRIVER, NULL); 2667 
mutex_init(&state->rc_srq_rwqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2668 mutex_init(&state->rc_srq_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2669 mutex_init(&state->rc_pass_chan_list.chan_list_mutex, NULL, 2670 MUTEX_DRIVER, NULL); 2671 2672 return (DDI_SUCCESS); 2673 } 2674 2675 /* 2676 * Post ibt_detach() driver deconstruction 2677 */ 2678 static void 2679 ibd_state_fini(ibd_state_t *state) 2680 { 2681 cv_destroy(&state->id_macst_cv); 2682 mutex_destroy(&state->id_macst_lock); 2683 2684 kmem_cache_destroy(state->id_req_kmc); 2685 2686 mutex_destroy(&state->id_rx_list.dl_mutex); 2687 mutex_destroy(&state->id_rx_free_list.dl_mutex); 2688 2689 mutex_destroy(&state->id_txpost_lock); 2690 mutex_destroy(&state->id_tx_list.dl_mutex); 2691 mutex_destroy(&state->id_tx_rel_list.dl_mutex); 2692 mutex_destroy(&state->id_lso_lock); 2693 2694 mutex_destroy(&state->id_sched_lock); 2695 mutex_destroy(&state->id_scq_poll_lock); 2696 mutex_destroy(&state->id_rcq_poll_lock); 2697 2698 cv_destroy(&state->id_trap_cv); 2699 mutex_destroy(&state->id_trap_lock); 2700 mutex_destroy(&state->id_link_mutex); 2701 2702 /* For Reliable Connected Mode */ 2703 mutex_destroy(&state->rc_srq_free_list.dl_mutex); 2704 mutex_destroy(&state->rc_srq_rwqe_list.dl_mutex); 2705 mutex_destroy(&state->rc_pass_chan_list.chan_list_mutex); 2706 mutex_destroy(&state->rc_tx_large_bufs_lock); 2707 mutex_destroy(&state->rc_rx_lock); 2708 } 2709 2710 /* 2711 * Fetch link speed from SA for snmp ifspeed reporting. 2712 */ 2713 static uint64_t 2714 ibd_get_portspeed(ibd_state_t *state) 2715 { 2716 int ret; 2717 ibt_path_info_t path; 2718 ibt_path_attr_t path_attr; 2719 uint8_t num_paths; 2720 uint64_t ifspeed; 2721 2722 /* 2723 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire 2724 * translates to 2 Gbps data rate. Thus, 1X single data rate is 2725 * 2000000000. Start with that as default. 2726 */ 2727 ifspeed = 2000000000; 2728 2729 bzero(&path_attr, sizeof (path_attr)); 2730 2731 /* 2732 * Get the port speed from Loopback path information. 2733 */ 2734 path_attr.pa_dgids = &state->id_sgid; 2735 path_attr.pa_num_dgids = 1; 2736 path_attr.pa_sgid = state->id_sgid; 2737 2738 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 2739 &path_attr, 1, &path, &num_paths) != IBT_SUCCESS) 2740 goto earlydone; 2741 2742 if (num_paths < 1) 2743 goto earlydone; 2744 2745 /* 2746 * In case SA does not return an expected value, report the default 2747 * speed as 1X. 2748 */ 2749 ret = 1; 2750 switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) { 2751 case IBT_SRATE_2: /* 1X SDR i.e 2.5 Gbps */ 2752 ret = 1; 2753 break; 2754 case IBT_SRATE_10: /* 4X SDR or 1X QDR i.e 10 Gbps */ 2755 ret = 4; 2756 break; 2757 case IBT_SRATE_30: /* 12X SDR i.e 30 Gbps */ 2758 ret = 12; 2759 break; 2760 case IBT_SRATE_5: /* 1X DDR i.e 5 Gbps */ 2761 ret = 2; 2762 break; 2763 case IBT_SRATE_20: /* 4X DDR or 8X SDR i.e 20 Gbps */ 2764 ret = 8; 2765 break; 2766 case IBT_SRATE_40: /* 8X DDR or 4X QDR i.e 40 Gbps */ 2767 ret = 16; 2768 break; 2769 case IBT_SRATE_60: /* 12X DDR i.e 60 Gbps */ 2770 ret = 24; 2771 break; 2772 case IBT_SRATE_80: /* 8X QDR i.e 80 Gbps */ 2773 ret = 32; 2774 break; 2775 case IBT_SRATE_120: /* 12X QDR i.e 120 Gbps */ 2776 ret = 48; 2777 break; 2778 } 2779 2780 ifspeed *= ret; 2781 2782 earlydone: 2783 return (ifspeed); 2784 } 2785 2786 /* 2787 * Search input mcg list (id_mc_full or id_mc_non) for an entry 2788 * representing the input mcg mgid. 
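 *
 * The match is on the full 128-bit MGID (prefix and guid), so groups
 * that differ only in their scope bits resolve to different entries.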
2789 */ 2790 static ibd_mce_t * 2791 ibd_mcache_find(ib_gid_t mgid, struct list *mlist) 2792 { 2793 ibd_mce_t *ptr = list_head(mlist); 2794 2795 /* 2796 * Do plain linear search. 2797 */ 2798 while (ptr != NULL) { 2799 if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid, 2800 sizeof (ib_gid_t)) == 0) 2801 return (ptr); 2802 ptr = list_next(mlist, ptr); 2803 } 2804 return (NULL); 2805 } 2806 2807 /* 2808 * Execute IBA JOIN. 2809 */ 2810 static ibt_status_t 2811 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce) 2812 { 2813 ibt_mcg_attr_t mcg_attr; 2814 2815 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 2816 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 2817 mcg_attr.mc_mgid = mgid; 2818 mcg_attr.mc_join_state = mce->mc_jstate; 2819 mcg_attr.mc_scope = state->id_scope; 2820 mcg_attr.mc_pkey = state->id_pkey; 2821 mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow; 2822 mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 2823 mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass; 2824 return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info, 2825 NULL, NULL)); 2826 } 2827 2828 /* 2829 * This code JOINs the port in the proper way (depending on the join 2830 * state) so that IBA fabric will forward mcg packets to/from the port. 2831 * It also attaches the QPN to the mcg so it can receive those mcg 2832 * packets. This code makes sure not to attach the mcg to the QP if 2833 * that has been previously done due to the mcg being joined with a 2834 * different join state, even though this is not required by SWG_0216, 2835 * refid 3610. 2836 */ 2837 static ibd_mce_t * 2838 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 2839 { 2840 ibt_status_t ibt_status; 2841 ibd_mce_t *mce, *tmce, *omce = NULL; 2842 boolean_t do_attach = B_TRUE; 2843 2844 DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n", 2845 jstate, mgid.gid_prefix, mgid.gid_guid); 2846 2847 /* 2848 * For enable_multicast Full member joins, we need to do some 2849 * extra work. If there is already an mce on the list that 2850 * indicates full membership, that means the membership has 2851 * not yet been dropped (since the disable_multicast was issued) 2852 * because there are pending Tx's to the mcg; in that case, just 2853 * mark the mce not to be reaped when the Tx completion queues 2854 * an async reap operation. 2855 * 2856 * If there is already an mce on the list indicating sendonly 2857 * membership, try to promote to full membership. Be careful 2858 * not to deallocate the old mce, since there might be an AH 2859 * pointing to it; instead, update the old mce with new data 2860 * that tracks the full membership. 2861 */ 2862 if ((jstate == IB_MC_JSTATE_FULL) && ((omce = 2863 IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) { 2864 if (omce->mc_jstate == IB_MC_JSTATE_FULL) { 2865 ASSERT(omce->mc_fullreap); 2866 omce->mc_fullreap = B_FALSE; 2867 return (omce); 2868 } else { 2869 ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON); 2870 } 2871 } 2872 2873 /* 2874 * Allocate the ibd_mce_t to track this JOIN. 2875 */ 2876 mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP); 2877 mce->mc_fullreap = B_FALSE; 2878 mce->mc_jstate = jstate; 2879 2880 if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) { 2881 DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d", 2882 ibt_status); 2883 kmem_free(mce, sizeof (ibd_mce_t)); 2884 return (NULL); 2885 } 2886 2887 /* 2888 * Is an IBA attach required? 
Not if the interface is already joined 2889 * to the mcg in a different appropriate join state. 2890 */ 2891 if (jstate == IB_MC_JSTATE_NON) { 2892 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 2893 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 2894 do_attach = B_FALSE; 2895 } else if (jstate == IB_MC_JSTATE_FULL) { 2896 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 2897 do_attach = B_FALSE; 2898 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 2899 do_attach = B_FALSE; 2900 } 2901 2902 if (do_attach) { 2903 /* 2904 * Do the IBA attach. 2905 */ 2906 DPRINT(10, "ibd_join_group: ibt_attach_mcg \n"); 2907 if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl, 2908 &mce->mc_info)) != IBT_SUCCESS) { 2909 DPRINT(10, "ibd_join_group : failed qp attachment " 2910 "%d\n", ibt_status); 2911 /* 2912 * NOTE that we should probably preserve the join info 2913 * in the list and later try to leave again at detach 2914 * time. 2915 */ 2916 (void) ibt_leave_mcg(state->id_sgid, mgid, 2917 state->id_sgid, jstate); 2918 kmem_free(mce, sizeof (ibd_mce_t)); 2919 return (NULL); 2920 } 2921 } 2922 2923 /* 2924 * Insert the ibd_mce_t in the proper list. 2925 */ 2926 if (jstate == IB_MC_JSTATE_NON) { 2927 IBD_MCACHE_INSERT_NON(state, mce); 2928 } else { 2929 /* 2930 * Set up the mc_req fields used for reaping the 2931 * mcg in case of delayed tx completion (see 2932 * ibd_tx_cleanup()). Also done for sendonly join in 2933 * case we are promoted to fullmembership later and 2934 * keep using the same mce. 2935 */ 2936 mce->mc_req.rq_gid = mgid; 2937 mce->mc_req.rq_ptr = mce; 2938 /* 2939 * Check whether this is the case of trying to join 2940 * full member, and we were already joined send only. 2941 * We try to drop our SendOnly membership, but it is 2942 * possible that the mcg does not exist anymore (and 2943 * the subnet trap never reached us), so the leave 2944 * operation might fail. 2945 */ 2946 if (omce != NULL) { 2947 (void) ibt_leave_mcg(state->id_sgid, mgid, 2948 state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON); 2949 omce->mc_jstate = IB_MC_JSTATE_FULL; 2950 bcopy(&mce->mc_info, &omce->mc_info, 2951 sizeof (ibt_mcg_info_t)); 2952 kmem_free(mce, sizeof (ibd_mce_t)); 2953 return (omce); 2954 } 2955 mutex_enter(&state->id_mc_mutex); 2956 IBD_MCACHE_INSERT_FULL(state, mce); 2957 mutex_exit(&state->id_mc_mutex); 2958 } 2959 2960 return (mce); 2961 } 2962 2963 /* 2964 * Called during port up event handling to attempt to reacquire full 2965 * membership to an mcg. Stripped down version of ibd_join_group(). 2966 * Note that it is possible that the mcg might have gone away, and 2967 * gets recreated at this point. 2968 */ 2969 static void 2970 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce) 2971 { 2972 ib_gid_t mgid; 2973 2974 /* 2975 * If the mc_fullreap flag is set, or this join fails, a subsequent 2976 * reap/leave is going to try to leave the group. We could prevent 2977 * that by adding a boolean flag into ibd_mce_t, if required. 
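 *
 * The existing mce (and any AH that still points at it) is reused as
 * is; ibd_iba_join() writes the fresh join results into mce->mc_info
 * in place.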
2978 */ 2979 if (mce->mc_fullreap) 2980 return; 2981 2982 mgid = mce->mc_info.mc_adds_vect.av_dgid; 2983 2984 DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix, 2985 mgid.gid_guid); 2986 2987 if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS) 2988 ibd_print_warn(state, "Failure on port up to rejoin " 2989 "multicast gid %016llx:%016llx", 2990 (u_longlong_t)mgid.gid_prefix, 2991 (u_longlong_t)mgid.gid_guid); 2992 } 2993 2994 /* 2995 * This code handles delayed Tx completion cleanups for mcg's to which 2996 * disable_multicast has been issued, regular mcg related cleanups during 2997 * disable_multicast, disable_promiscuous and mcg traps, as well as 2998 * cleanups during driver detach time. Depending on the join state, 2999 * it deletes the mce from the appropriate list and issues the IBA 3000 * leave/detach; except in the disable_multicast case when the mce 3001 * is left on the active list for a subsequent Tx completion cleanup. 3002 */ 3003 static void 3004 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid, 3005 uint8_t jstate) 3006 { 3007 ibd_mce_t *tmce; 3008 boolean_t do_detach = B_TRUE; 3009 3010 /* 3011 * Before detaching, we must check whether the other list 3012 * contains the mcg; if we detach blindly, the consumer 3013 * who set up the other list will also stop receiving 3014 * traffic. 3015 */ 3016 if (jstate == IB_MC_JSTATE_FULL) { 3017 /* 3018 * The following check is only relevant while coming 3019 * from the Tx completion path in the reap case. 3020 */ 3021 if (!mce->mc_fullreap) 3022 return; 3023 mutex_enter(&state->id_mc_mutex); 3024 IBD_MCACHE_PULLOUT_FULL(state, mce); 3025 mutex_exit(&state->id_mc_mutex); 3026 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 3027 do_detach = B_FALSE; 3028 } else if (jstate == IB_MC_JSTATE_NON) { 3029 IBD_MCACHE_PULLOUT_NON(state, mce); 3030 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 3031 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 3032 do_detach = B_FALSE; 3033 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 3034 mutex_enter(&state->id_mc_mutex); 3035 IBD_MCACHE_PULLOUT_FULL(state, mce); 3036 mutex_exit(&state->id_mc_mutex); 3037 do_detach = B_FALSE; 3038 } 3039 3040 /* 3041 * If we are reacting to a mcg trap and leaving our sendonly or 3042 * non membership, the mcg is possibly already gone, so attempting 3043 * to leave might fail. On the other hand, we must try to leave 3044 * anyway, since this might be a trap from long ago, and we could 3045 * have potentially sendonly joined to a recent incarnation of 3046 * the mcg and are about to lose track of this information. 3047 */ 3048 if (do_detach) { 3049 DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : " 3050 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3051 (void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info); 3052 } 3053 3054 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate); 3055 kmem_free(mce, sizeof (ibd_mce_t)); 3056 } 3057 3058 /* 3059 * Async code executed due to multicast and promiscuous disable requests 3060 * and mcg trap handling; also executed during driver detach. Mostly, a 3061 * leave and detach is done; except for the fullmember case when Tx 3062 * requests are pending, whence arrangements are made for subsequent 3063 * cleanup on Tx completion.
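 *
 * Whether the reap happens right away or is deferred is decided by
 * ibd_acache_recycle() below: in the fullmember case the AH is only
 * recycled if no pending Tx still references it, otherwise the reap
 * is left to the Tx completion path (see ibd_tx_cleanup()).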
3064 */ 3065 static void 3066 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 3067 { 3068 ipoib_mac_t mcmac; 3069 boolean_t recycled; 3070 ibd_mce_t *mce; 3071 3072 DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n", 3073 jstate, mgid.gid_prefix, mgid.gid_guid); 3074 3075 if (jstate == IB_MC_JSTATE_NON) { 3076 recycled = B_TRUE; 3077 mce = IBD_MCACHE_FIND_NON(state, mgid); 3078 /* 3079 * In case we are handling a mcg trap, we might not find 3080 * the mcg in the non list. 3081 */ 3082 if (mce == NULL) { 3083 return; 3084 } 3085 } else { 3086 mce = IBD_MCACHE_FIND_FULL(state, mgid); 3087 3088 /* 3089 * In case we are handling a mcg trap, make sure the trap 3090 * is not arriving late; if we have an mce that indicates 3091 * that we are already a fullmember, that would be a clear 3092 * indication that the trap arrived late (ie, is for a 3093 * previous incarnation of the mcg). 3094 */ 3095 if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) { 3096 if ((mce == NULL) || (mce->mc_jstate == 3097 IB_MC_JSTATE_FULL)) { 3098 return; 3099 } 3100 } else { 3101 ASSERT(jstate == IB_MC_JSTATE_FULL); 3102 3103 /* 3104 * If join group failed, mce will be NULL here. 3105 * This is because in GLDv3 driver, set multicast 3106 * will always return success. 3107 */ 3108 if (mce == NULL) { 3109 return; 3110 } 3111 3112 mce->mc_fullreap = B_TRUE; 3113 } 3114 3115 /* 3116 * If no pending Tx's remain that reference the AH 3117 * for the mcg, recycle it from active to free list. 3118 * Else in the IB_MC_JSTATE_FULL case, just mark the AH, 3119 * so the last completing Tx will cause an async reap 3120 * operation to be invoked, at which time we will drop our 3121 * membership to the mcg so that the pending Tx's complete 3122 * successfully. Refer to comments on "AH and MCE active 3123 * list manipulation" at top of this file. The lock protects 3124 * against Tx fast path and Tx cleanup code. 3125 */ 3126 mutex_enter(&state->id_ac_mutex); 3127 ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid); 3128 recycled = ibd_acache_recycle(state, &mcmac, (jstate == 3129 IB_MC_JSTATE_SEND_ONLY_NON)); 3130 mutex_exit(&state->id_ac_mutex); 3131 } 3132 3133 if (recycled) { 3134 DPRINT(2, "ibd_leave_group : leave_group reaping : " 3135 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3136 ibd_async_reap_group(state, mce, mgid, jstate); 3137 } 3138 } 3139 3140 /* 3141 * Find the broadcast address as defined by IPoIB; implicitly 3142 * determines the IBA scope, mtu, tclass etc of the link the 3143 * interface is going to be a member of. 3144 */ 3145 static ibt_status_t 3146 ibd_find_bgroup(ibd_state_t *state) 3147 { 3148 ibt_mcg_attr_t mcg_attr; 3149 uint_t numg; 3150 uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL, 3151 IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL, 3152 IB_MC_SCOPE_GLOBAL }; 3153 int i, mcgmtu; 3154 boolean_t found = B_FALSE; 3155 int ret; 3156 ibt_mcg_info_t mcg_info; 3157 3158 state->id_bgroup_created = B_FALSE; 3159 3160 query_bcast_grp: 3161 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3162 mcg_attr.mc_pkey = state->id_pkey; 3163 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3164 state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK; 3165 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3166 3167 for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) { 3168 state->id_scope = mcg_attr.mc_scope = scopes[i]; 3169 3170 /* 3171 * Look for the IPoIB broadcast group. 
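 *
 * The broadcast MGID being built is ff1S:401b:PPPP::ffff:ffff, where
 * S is the scope currently being tried and PPPP is our partition;
 * e.g. pkey 0xffff with subnet-local scope works out to
 * ff12:401b:ffff:0000:0000:0000:ffff:ffff.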
3172 */ 3173 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3174 state->id_mgid.gid_prefix = 3175 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3176 ((uint64_t)state->id_scope << 48) | 3177 ((uint32_t)(state->id_pkey << 16))); 3178 mcg_attr.mc_mgid = state->id_mgid; 3179 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3180 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1, 3181 &state->id_mcinfo, &numg) == IBT_SUCCESS) { 3182 found = B_TRUE; 3183 break; 3184 } 3185 } 3186 3187 if (!found) { 3188 if (ibd_create_broadcast_group) { 3189 /* 3190 * If we created the broadcast group, but failed to 3191 * find it, we can't do anything except leave the 3192 * one we created and return failure. 3193 */ 3194 if (state->id_bgroup_created) { 3195 ibd_print_warn(state, "IPoIB broadcast group " 3196 "absent. Unable to query after create."); 3197 goto find_bgroup_fail; 3198 } 3199 3200 /* 3201 * Create the ipoib broadcast group if it didn't exist 3202 */ 3203 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3204 mcg_attr.mc_qkey = IBD_DEFAULT_QKEY; 3205 mcg_attr.mc_join_state = IB_MC_JSTATE_FULL; 3206 mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL; 3207 mcg_attr.mc_pkey = state->id_pkey; 3208 mcg_attr.mc_flow = 0; 3209 mcg_attr.mc_sl = 0; 3210 mcg_attr.mc_tclass = 0; 3211 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3212 state->id_mgid.gid_prefix = 3213 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3214 ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) | 3215 ((uint32_t)(state->id_pkey << 16))); 3216 mcg_attr.mc_mgid = state->id_mgid; 3217 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3218 3219 if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr, 3220 &mcg_info, NULL, NULL)) != IBT_SUCCESS) { 3221 ibd_print_warn(state, "IPoIB broadcast group " 3222 "absent, create failed: ret = %d\n", ret); 3223 state->id_bgroup_created = B_FALSE; 3224 return (IBT_FAILURE); 3225 } 3226 state->id_bgroup_created = B_TRUE; 3227 goto query_bcast_grp; 3228 } else { 3229 ibd_print_warn(state, "IPoIB broadcast group absent"); 3230 return (IBT_FAILURE); 3231 } 3232 } 3233 3234 /* 3235 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu. 
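 *
 * mc_mtu is the IB-encoded MTU (1 = 256 bytes up through 5 = 4096
 * bytes), so (128 << mc_mtu) converts it to bytes; an encoded value
 * of 4, for instance, corresponds to a 2048 byte MTU.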
3236 */ 3237 mcgmtu = (128 << state->id_mcinfo->mc_mtu); 3238 if (state->id_mtu < mcgmtu) { 3239 ibd_print_warn(state, "IPoIB broadcast group MTU %d " 3240 "greater than port's maximum MTU %d", mcgmtu, 3241 state->id_mtu); 3242 ibt_free_mcg_info(state->id_mcinfo, 1); 3243 goto find_bgroup_fail; 3244 } 3245 state->id_mtu = mcgmtu; 3246 3247 return (IBT_SUCCESS); 3248 3249 find_bgroup_fail: 3250 if (state->id_bgroup_created) { 3251 (void) ibt_leave_mcg(state->id_sgid, 3252 mcg_info.mc_adds_vect.av_dgid, state->id_sgid, 3253 IB_MC_JSTATE_FULL); 3254 } 3255 3256 return (IBT_FAILURE); 3257 } 3258 3259 static int 3260 ibd_alloc_tx_copybufs(ibd_state_t *state) 3261 { 3262 ibt_mr_attr_t mem_attr; 3263 3264 /* 3265 * Allocate one big chunk for all regular tx copy bufs 3266 */ 3267 state->id_tx_buf_sz = state->id_mtu; 3268 if (state->id_lso_policy && state->id_lso_capable && 3269 (IBD_TX_BUF_SZ > state->id_mtu)) { 3270 state->id_tx_buf_sz = IBD_TX_BUF_SZ; 3271 } 3272 3273 state->id_tx_bufs = kmem_zalloc(state->id_num_swqe * 3274 state->id_tx_buf_sz, KM_SLEEP); 3275 3276 state->id_tx_wqes = kmem_zalloc(state->id_num_swqe * 3277 sizeof (ibd_swqe_t), KM_SLEEP); 3278 3279 /* 3280 * Do one memory registration on the entire txbuf area 3281 */ 3282 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs; 3283 mem_attr.mr_len = state->id_num_swqe * state->id_tx_buf_sz; 3284 mem_attr.mr_as = NULL; 3285 mem_attr.mr_flags = IBT_MR_SLEEP; 3286 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3287 &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) { 3288 DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed"); 3289 kmem_free(state->id_tx_wqes, 3290 state->id_num_swqe * sizeof (ibd_swqe_t)); 3291 kmem_free(state->id_tx_bufs, 3292 state->id_num_swqe * state->id_tx_buf_sz); 3293 state->id_tx_bufs = NULL; 3294 return (DDI_FAILURE); 3295 } 3296 3297 return (DDI_SUCCESS); 3298 } 3299 3300 static int 3301 ibd_alloc_tx_lsobufs(ibd_state_t *state) 3302 { 3303 ibt_mr_attr_t mem_attr; 3304 ibd_lsobuf_t *buflist; 3305 ibd_lsobuf_t *lbufp; 3306 ibd_lsobuf_t *tail; 3307 ibd_lsobkt_t *bktp; 3308 uint8_t *membase; 3309 uint8_t *memp; 3310 uint_t memsz; 3311 int i; 3312 3313 /* 3314 * Allocate the lso bucket 3315 */ 3316 bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP); 3317 3318 /* 3319 * Allocate the entire lso memory and register it 3320 */ 3321 memsz = IBD_NUM_LSO_BUFS * IBD_LSO_BUFSZ; 3322 membase = kmem_zalloc(memsz, KM_SLEEP); 3323 3324 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase; 3325 mem_attr.mr_len = memsz; 3326 mem_attr.mr_as = NULL; 3327 mem_attr.mr_flags = IBT_MR_SLEEP; 3328 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, 3329 &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) { 3330 DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed"); 3331 kmem_free(membase, memsz); 3332 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3333 return (DDI_FAILURE); 3334 } 3335 3336 mutex_enter(&state->id_lso_lock); 3337 3338 /* 3339 * Now allocate the buflist. Note that the elements in the buflist and 3340 * the buffers in the lso memory have a permanent 1-1 relation, so we 3341 * can always derive the address of a buflist entry from the address of 3342 * an lso buffer. 
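 * (At release time the reverse mapping is simply
 * ndx = (va - bkt_mem) / IBD_LSO_BUFSZ; see ibd_release_lsobufs().)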
3343 */ 3344 buflist = kmem_zalloc(IBD_NUM_LSO_BUFS * sizeof (ibd_lsobuf_t), 3345 KM_SLEEP); 3346 3347 /* 3348 * Set up the lso buf chain 3349 */ 3350 memp = membase; 3351 lbufp = buflist; 3352 for (i = 0; i < IBD_NUM_LSO_BUFS; i++) { 3353 lbufp->lb_isfree = 1; 3354 lbufp->lb_buf = memp; 3355 lbufp->lb_next = lbufp + 1; 3356 3357 tail = lbufp; 3358 3359 memp += IBD_LSO_BUFSZ; 3360 lbufp++; 3361 } 3362 tail->lb_next = NULL; 3363 3364 /* 3365 * Set up the LSO buffer information in ibd state 3366 */ 3367 bktp->bkt_bufl = buflist; 3368 bktp->bkt_free_head = buflist; 3369 bktp->bkt_mem = membase; 3370 bktp->bkt_nelem = IBD_NUM_LSO_BUFS; 3371 bktp->bkt_nfree = bktp->bkt_nelem; 3372 3373 state->id_lso = bktp; 3374 mutex_exit(&state->id_lso_lock); 3375 3376 return (DDI_SUCCESS); 3377 } 3378 3379 /* 3380 * Statically allocate Tx buffer list(s). 3381 */ 3382 static int 3383 ibd_init_txlist(ibd_state_t *state) 3384 { 3385 ibd_swqe_t *swqe; 3386 ibt_lkey_t lkey; 3387 int i; 3388 uint_t len; 3389 uint8_t *bufaddr; 3390 3391 if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS) 3392 return (DDI_FAILURE); 3393 3394 if (state->id_lso_policy && state->id_lso_capable) { 3395 if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS) 3396 state->id_lso_policy = B_FALSE; 3397 } 3398 3399 mutex_enter(&state->id_tx_list.dl_mutex); 3400 state->id_tx_list.dl_head = NULL; 3401 state->id_tx_list.dl_pending_sends = B_FALSE; 3402 state->id_tx_list.dl_cnt = 0; 3403 mutex_exit(&state->id_tx_list.dl_mutex); 3404 mutex_enter(&state->id_tx_rel_list.dl_mutex); 3405 state->id_tx_rel_list.dl_head = NULL; 3406 state->id_tx_rel_list.dl_pending_sends = B_FALSE; 3407 state->id_tx_rel_list.dl_cnt = 0; 3408 mutex_exit(&state->id_tx_rel_list.dl_mutex); 3409 3410 /* 3411 * Allocate and setup the swqe list 3412 */ 3413 lkey = state->id_tx_mr_desc.md_lkey; 3414 bufaddr = state->id_tx_bufs; 3415 len = state->id_tx_buf_sz; 3416 swqe = state->id_tx_wqes; 3417 mutex_enter(&state->id_tx_list.dl_mutex); 3418 for (i = 0; i < state->id_num_swqe; i++, swqe++, bufaddr += len) { 3419 swqe->swqe_next = NULL; 3420 swqe->swqe_im_mblk = NULL; 3421 3422 swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t) 3423 bufaddr; 3424 swqe->swqe_copybuf.ic_sgl.ds_key = lkey; 3425 swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */ 3426 3427 swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe; 3428 swqe->w_swr.wr_flags = IBT_WR_NO_FLAGS; 3429 swqe->w_swr.wr_trans = IBT_UD_SRV; 3430 3431 /* These are set in send */ 3432 swqe->w_swr.wr_nds = 0; 3433 swqe->w_swr.wr_sgl = NULL; 3434 swqe->w_swr.wr_opcode = IBT_WRC_SEND; 3435 3436 /* add to list */ 3437 state->id_tx_list.dl_cnt++; 3438 swqe->swqe_next = state->id_tx_list.dl_head; 3439 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); 3440 } 3441 mutex_exit(&state->id_tx_list.dl_mutex); 3442 3443 return (DDI_SUCCESS); 3444 } 3445 3446 static int 3447 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p, 3448 uint32_t *nds_p) 3449 { 3450 ibd_lsobkt_t *bktp; 3451 ibd_lsobuf_t *lbufp; 3452 ibd_lsobuf_t *nextp; 3453 ibt_lkey_t lso_lkey; 3454 uint_t frag_sz; 3455 uint_t num_needed; 3456 int i; 3457 3458 ASSERT(sgl_p != NULL); 3459 ASSERT(nds_p != NULL); 3460 ASSERT(req_sz != 0); 3461 3462 /* 3463 * Determine how many bufs we'd need for the size requested 3464 */ 3465 num_needed = req_sz / IBD_LSO_BUFSZ; 3466 if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0) 3467 num_needed++; 3468 3469 mutex_enter(&state->id_lso_lock); 3470 3471 /* 3472 * If we don't have enough lso bufs, return failure 3473 */ 3474 ASSERT(state->id_lso 
!= NULL); 3475 bktp = state->id_lso; 3476 if (bktp->bkt_nfree < num_needed) { 3477 mutex_exit(&state->id_lso_lock); 3478 return (-1); 3479 } 3480 3481 /* 3482 * Pick the first 'num_needed' bufs from the free list 3483 */ 3484 lso_lkey = bktp->bkt_mr_desc.md_lkey; 3485 lbufp = bktp->bkt_free_head; 3486 for (i = 0; i < num_needed; i++) { 3487 ASSERT(lbufp->lb_isfree != 0); 3488 ASSERT(lbufp->lb_buf != NULL); 3489 3490 nextp = lbufp->lb_next; 3491 3492 sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf; 3493 sgl_p[i].ds_key = lso_lkey; 3494 sgl_p[i].ds_len = IBD_LSO_BUFSZ; 3495 3496 lbufp->lb_isfree = 0; 3497 lbufp->lb_next = NULL; 3498 3499 lbufp = nextp; 3500 } 3501 bktp->bkt_free_head = lbufp; 3502 3503 /* 3504 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need 3505 * to adjust the last sgl entry's length. Since we know we need atleast 3506 * one, the i-1 use below is ok. 3507 */ 3508 if (frag_sz) { 3509 sgl_p[i-1].ds_len = frag_sz; 3510 } 3511 3512 /* 3513 * Update nfree count and return 3514 */ 3515 bktp->bkt_nfree -= num_needed; 3516 3517 mutex_exit(&state->id_lso_lock); 3518 3519 *nds_p = num_needed; 3520 3521 return (0); 3522 } 3523 3524 static void 3525 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds) 3526 { 3527 ibd_lsobkt_t *bktp; 3528 ibd_lsobuf_t *lbufp; 3529 uint8_t *lso_mem_end; 3530 uint_t ndx; 3531 int i; 3532 3533 mutex_enter(&state->id_lso_lock); 3534 3535 bktp = state->id_lso; 3536 ASSERT(bktp != NULL); 3537 3538 lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ; 3539 for (i = 0; i < nds; i++) { 3540 uint8_t *va; 3541 3542 va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va; 3543 ASSERT(va >= bktp->bkt_mem && va < lso_mem_end); 3544 3545 /* 3546 * Figure out the buflist element this sgl buffer corresponds 3547 * to and put it back at the head 3548 */ 3549 ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ; 3550 lbufp = bktp->bkt_bufl + ndx; 3551 3552 ASSERT(lbufp->lb_isfree == 0); 3553 ASSERT(lbufp->lb_buf == va); 3554 3555 lbufp->lb_isfree = 1; 3556 lbufp->lb_next = bktp->bkt_free_head; 3557 bktp->bkt_free_head = lbufp; 3558 } 3559 bktp->bkt_nfree += nds; 3560 3561 mutex_exit(&state->id_lso_lock); 3562 } 3563 3564 static void 3565 ibd_free_tx_copybufs(ibd_state_t *state) 3566 { 3567 /* 3568 * Unregister txbuf mr 3569 */ 3570 if (ibt_deregister_mr(state->id_hca_hdl, 3571 state->id_tx_mr_hdl) != IBT_SUCCESS) { 3572 DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed"); 3573 } 3574 state->id_tx_mr_hdl = NULL; 3575 3576 /* 3577 * Free txbuf memory 3578 */ 3579 kmem_free(state->id_tx_wqes, state->id_num_swqe * sizeof (ibd_swqe_t)); 3580 kmem_free(state->id_tx_bufs, state->id_num_swqe * state->id_tx_buf_sz); 3581 state->id_tx_wqes = NULL; 3582 state->id_tx_bufs = NULL; 3583 } 3584 3585 static void 3586 ibd_free_tx_lsobufs(ibd_state_t *state) 3587 { 3588 ibd_lsobkt_t *bktp; 3589 3590 mutex_enter(&state->id_lso_lock); 3591 3592 if ((bktp = state->id_lso) == NULL) { 3593 mutex_exit(&state->id_lso_lock); 3594 return; 3595 } 3596 3597 /* 3598 * First, free the buflist 3599 */ 3600 ASSERT(bktp->bkt_bufl != NULL); 3601 kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t)); 3602 3603 /* 3604 * Unregister the LSO memory and free it 3605 */ 3606 ASSERT(bktp->bkt_mr_hdl != NULL); 3607 if (ibt_deregister_mr(state->id_hca_hdl, 3608 bktp->bkt_mr_hdl) != IBT_SUCCESS) { 3609 DPRINT(10, 3610 "ibd_free_lsobufs: ibt_deregister_mr failed"); 3611 } 3612 ASSERT(bktp->bkt_mem); 3613 kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ); 
3614 3615 /* 3616 * Finally free the bucket 3617 */ 3618 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3619 state->id_lso = NULL; 3620 3621 mutex_exit(&state->id_lso_lock); 3622 } 3623 3624 /* 3625 * Free the statically allocated Tx buffer list. 3626 */ 3627 static void 3628 ibd_fini_txlist(ibd_state_t *state) 3629 { 3630 /* 3631 * Free the allocated swqes 3632 */ 3633 mutex_enter(&state->id_tx_list.dl_mutex); 3634 mutex_enter(&state->id_tx_rel_list.dl_mutex); 3635 state->id_tx_list.dl_head = NULL; 3636 state->id_tx_list.dl_pending_sends = B_FALSE; 3637 state->id_tx_list.dl_cnt = 0; 3638 state->id_tx_rel_list.dl_head = NULL; 3639 state->id_tx_rel_list.dl_pending_sends = B_FALSE; 3640 state->id_tx_rel_list.dl_cnt = 0; 3641 mutex_exit(&state->id_tx_rel_list.dl_mutex); 3642 mutex_exit(&state->id_tx_list.dl_mutex); 3643 3644 ibd_free_tx_lsobufs(state); 3645 ibd_free_tx_copybufs(state); 3646 } 3647 3648 /* 3649 * post a list of rwqes, NULL terminated. 3650 */ 3651 static void 3652 ibd_post_recv_list(ibd_state_t *state, ibd_rwqe_t *rwqe) 3653 { 3654 uint_t i; 3655 uint_t num_posted; 3656 ibt_status_t ibt_status; 3657 ibt_recv_wr_t wrs[IBD_RX_POST_CNT]; 3658 3659 while (rwqe) { 3660 /* Post up to IBD_RX_POST_CNT receive work requests */ 3661 for (i = 0; i < IBD_RX_POST_CNT; i++) { 3662 wrs[i] = rwqe->w_rwr; 3663 rwqe = WQE_TO_RWQE(rwqe->rwqe_next); 3664 if (rwqe == NULL) { 3665 i++; 3666 break; 3667 } 3668 } 3669 3670 /* 3671 * If posting fails for some reason, we'll never receive 3672 * completion intimation, so we'll need to cleanup. But 3673 * we need to make sure we don't clean up nodes whose 3674 * wrs have been successfully posted. We assume that the 3675 * hca driver returns on the first failure to post and 3676 * therefore the first 'num_posted' entries don't need 3677 * cleanup here. 3678 */ 3679 atomic_add_32(&state->id_rx_list.dl_cnt, i); 3680 3681 num_posted = 0; 3682 ibt_status = ibt_post_recv(state->id_chnl_hdl, wrs, i, 3683 &num_posted); 3684 if (ibt_status != IBT_SUCCESS) { 3685 /* This cannot happen unless the device has an error. */ 3686 ibd_print_warn(state, "ibd_post_recv: FATAL: " 3687 "posting multiple wrs failed: " 3688 "requested=%d, done=%d, ret=%d", 3689 IBD_RX_POST_CNT, num_posted, ibt_status); 3690 atomic_add_32(&state->id_rx_list.dl_cnt, 3691 num_posted - i); 3692 } 3693 } 3694 } 3695 3696 /* 3697 * Grab a list of rwqes from the array of lists, and post the list. 3698 */ 3699 static void 3700 ibd_post_recv_intr(ibd_state_t *state) 3701 { 3702 ibd_rx_queue_t *rxp; 3703 ibd_rwqe_t *list; 3704 3705 /* rotate through the rx_queue array, expecting an adequate number */ 3706 state->id_rx_post_queue_index = 3707 (state->id_rx_post_queue_index + 1) & 3708 (state->id_rx_nqueues - 1); 3709 3710 rxp = state->id_rx_queues + state->id_rx_post_queue_index; 3711 mutex_enter(&rxp->rx_post_lock); 3712 list = WQE_TO_RWQE(rxp->rx_head); 3713 rxp->rx_head = NULL; 3714 rxp->rx_cnt = 0; 3715 mutex_exit(&rxp->rx_post_lock); 3716 ibd_post_recv_list(state, list); 3717 } 3718 3719 /* macro explained below */ 3720 #define RX_QUEUE_HASH(rwqe) \ 3721 (((uintptr_t)(rwqe) >> 8) & (state->id_rx_nqueues - 1)) 3722 3723 /* 3724 * Add a rwqe to one of the the Rx lists. If the list is large enough 3725 * (exactly IBD_RX_POST_CNT), post the list to the hardware. 3726 * 3727 * Note: one of 2^N lists is chosen via a hash. This is done 3728 * because using one list is contentious. If the first list is busy 3729 * (mutex_tryenter fails), use a second list (just call mutex_enter). 
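 *
 * In practice the list is handed to ibd_post_recv_list() once it
 * reaches IBD_RX_POST_CNT - 2 entries, and only on every
 * id_rx_nqueues-th time that threshold is reached overall (tracked by
 * id_rx_post_active), so ibt_post_recv() calls are batched rather
 * than issued on every fill.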
3730 * 3731 * The number 8 in RX_QUEUE_HASH is a random choice that provides 3732 * even distribution of mapping rwqes to the 2^N queues. 3733 */ 3734 static void 3735 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe) 3736 { 3737 ibd_rx_queue_t *rxp; 3738 3739 rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe); 3740 3741 if (!mutex_tryenter(&rxp->rx_post_lock)) { 3742 /* Failed. Try a different queue ("ptr + 16" ensures that). */ 3743 rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe + 16); 3744 mutex_enter(&rxp->rx_post_lock); 3745 } 3746 rwqe->rwqe_next = rxp->rx_head; 3747 if (++rxp->rx_cnt >= IBD_RX_POST_CNT - 2) { 3748 uint_t active = atomic_inc_32_nv(&state->id_rx_post_active); 3749 3750 /* only call ibt_post_recv() every Nth time through here */ 3751 if ((active & (state->id_rx_nqueues - 1)) == 0) { 3752 rxp->rx_head = NULL; 3753 rxp->rx_cnt = 0; 3754 mutex_exit(&rxp->rx_post_lock); 3755 ibd_post_recv_list(state, rwqe); 3756 return; 3757 } 3758 } 3759 rxp->rx_head = RWQE_TO_WQE(rwqe); 3760 mutex_exit(&rxp->rx_post_lock); 3761 } 3762 3763 static int 3764 ibd_alloc_rx_copybufs(ibd_state_t *state) 3765 { 3766 ibt_mr_attr_t mem_attr; 3767 int i; 3768 3769 /* 3770 * Allocate one big chunk for all regular rx copy bufs 3771 */ 3772 state->id_rx_buf_sz = state->id_mtu + IPOIB_GRH_SIZE; 3773 3774 state->id_rx_bufs = kmem_zalloc(state->id_num_rwqe * 3775 state->id_rx_buf_sz, KM_SLEEP); 3776 3777 state->id_rx_wqes = kmem_zalloc(state->id_num_rwqe * 3778 sizeof (ibd_rwqe_t), KM_SLEEP); 3779 3780 state->id_rx_nqueues = 1 << IBD_LOG_RX_POST; 3781 state->id_rx_queues = kmem_zalloc(state->id_rx_nqueues * 3782 sizeof (ibd_rx_queue_t), KM_SLEEP); 3783 for (i = 0; i < state->id_rx_nqueues; i++) { 3784 ibd_rx_queue_t *rxp = state->id_rx_queues + i; 3785 mutex_init(&rxp->rx_post_lock, NULL, MUTEX_DRIVER, NULL); 3786 } 3787 3788 /* 3789 * Do one memory registration on the entire rxbuf area 3790 */ 3791 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_rx_bufs; 3792 mem_attr.mr_len = state->id_num_rwqe * state->id_rx_buf_sz; 3793 mem_attr.mr_as = NULL; 3794 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3795 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3796 &state->id_rx_mr_hdl, &state->id_rx_mr_desc) != IBT_SUCCESS) { 3797 DPRINT(10, "ibd_alloc_rx_copybufs: ibt_register_mr failed"); 3798 kmem_free(state->id_rx_wqes, 3799 state->id_num_rwqe * sizeof (ibd_rwqe_t)); 3800 kmem_free(state->id_rx_bufs, 3801 state->id_num_rwqe * state->id_rx_buf_sz); 3802 state->id_rx_bufs = NULL; 3803 state->id_rx_wqes = NULL; 3804 return (DDI_FAILURE); 3805 } 3806 3807 return (DDI_SUCCESS); 3808 } 3809 3810 /* 3811 * Allocate the statically allocated Rx buffer list. 3812 */ 3813 static int 3814 ibd_init_rxlist(ibd_state_t *state) 3815 { 3816 ibd_rwqe_t *rwqe, *next; 3817 ibd_wqe_t *list; 3818 ibt_lkey_t lkey; 3819 int i; 3820 uint_t len; 3821 uint8_t *bufaddr; 3822 3823 mutex_enter(&state->id_rx_free_list.dl_mutex); 3824 if (state->id_rx_free_list.dl_head != NULL) { 3825 /* rx rsrcs were never freed. 
Just repost them */ 3826 len = state->id_rx_buf_sz; 3827 list = state->id_rx_free_list.dl_head; 3828 state->id_rx_free_list.dl_head = NULL; 3829 state->id_rx_free_list.dl_cnt = 0; 3830 mutex_exit(&state->id_rx_free_list.dl_mutex); 3831 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL; 3832 rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) { 3833 if ((rwqe->rwqe_im_mblk = desballoc( 3834 rwqe->rwqe_copybuf.ic_bufaddr, len, 0, 3835 &rwqe->w_freemsg_cb)) == NULL) { 3836 /* allow freemsg_cb to free the rwqes */ 3837 if (atomic_dec_32_nv(&state->id_running) != 0) { 3838 cmn_err(CE_WARN, "ibd_init_rxlist: " 3839 "id_running was not 1\n"); 3840 } 3841 DPRINT(10, "ibd_init_rxlist : " 3842 "failed in desballoc()"); 3843 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL; 3844 rwqe = next) { 3845 next = WQE_TO_RWQE(rwqe->rwqe_next); 3846 freemsg(rwqe->rwqe_im_mblk); 3847 } 3848 atomic_inc_32(&state->id_running); 3849 return (DDI_FAILURE); 3850 } 3851 } 3852 ibd_post_recv_list(state, WQE_TO_RWQE(list)); 3853 return (DDI_SUCCESS); 3854 } 3855 mutex_exit(&state->id_rx_free_list.dl_mutex); 3856 3857 if (ibd_alloc_rx_copybufs(state) != DDI_SUCCESS) 3858 return (DDI_FAILURE); 3859 3860 /* 3861 * Allocate and setup the rwqe list 3862 */ 3863 len = state->id_rx_buf_sz; 3864 lkey = state->id_rx_mr_desc.md_lkey; 3865 rwqe = state->id_rx_wqes; 3866 bufaddr = state->id_rx_bufs; 3867 list = NULL; 3868 for (i = 0; i < state->id_num_rwqe; i++, rwqe++, bufaddr += len) { 3869 rwqe->w_state = state; 3870 rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb; 3871 rwqe->w_freemsg_cb.free_arg = (char *)rwqe; 3872 3873 rwqe->rwqe_copybuf.ic_bufaddr = bufaddr; 3874 3875 if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0, 3876 &rwqe->w_freemsg_cb)) == NULL) { 3877 DPRINT(10, "ibd_init_rxlist : failed in desballoc()"); 3878 /* allow freemsg_cb to free the rwqes */ 3879 if (atomic_dec_32_nv(&state->id_running) != 0) { 3880 cmn_err(CE_WARN, "ibd_init_rxlist: " 3881 "id_running was not 1\n"); 3882 } 3883 DPRINT(10, "ibd_init_rxlist : " 3884 "failed in desballoc()"); 3885 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL; 3886 rwqe = next) { 3887 next = WQE_TO_RWQE(rwqe->rwqe_next); 3888 freemsg(rwqe->rwqe_im_mblk); 3889 } 3890 atomic_inc_32(&state->id_running); 3891 return (DDI_FAILURE); 3892 } 3893 3894 rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey; 3895 rwqe->rwqe_copybuf.ic_sgl.ds_va = 3896 (ib_vaddr_t)(uintptr_t)bufaddr; 3897 rwqe->rwqe_copybuf.ic_sgl.ds_len = len; 3898 rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe; 3899 rwqe->w_rwr.wr_nds = 1; 3900 rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl; 3901 3902 rwqe->rwqe_next = list; 3903 list = RWQE_TO_WQE(rwqe); 3904 } 3905 ibd_post_recv_list(state, WQE_TO_RWQE(list)); 3906 3907 return (DDI_SUCCESS); 3908 } 3909 3910 static void 3911 ibd_free_rx_copybufs(ibd_state_t *state) 3912 { 3913 int i; 3914 3915 /* 3916 * Unregister rxbuf mr 3917 */ 3918 if (ibt_deregister_mr(state->id_hca_hdl, 3919 state->id_rx_mr_hdl) != IBT_SUCCESS) { 3920 DPRINT(10, "ibd_free_rx_copybufs: ibt_deregister_mr failed"); 3921 } 3922 state->id_rx_mr_hdl = NULL; 3923 3924 /* 3925 * Free rxbuf memory 3926 */ 3927 for (i = 0; i < state->id_rx_nqueues; i++) { 3928 ibd_rx_queue_t *rxp = state->id_rx_queues + i; 3929 mutex_destroy(&rxp->rx_post_lock); 3930 } 3931 kmem_free(state->id_rx_queues, state->id_rx_nqueues * 3932 sizeof (ibd_rx_queue_t)); 3933 kmem_free(state->id_rx_wqes, state->id_num_rwqe * sizeof (ibd_rwqe_t)); 3934 kmem_free(state->id_rx_bufs, state->id_num_rwqe * state->id_rx_buf_sz); 3935 state->id_rx_queues = NULL; 3936 
state->id_rx_wqes = NULL; 3937 state->id_rx_bufs = NULL; 3938 } 3939 3940 static void 3941 ibd_free_rx_rsrcs(ibd_state_t *state) 3942 { 3943 mutex_enter(&state->id_rx_free_list.dl_mutex); 3944 if (state->id_rx_free_list.dl_head == NULL) { 3945 /* already freed */ 3946 mutex_exit(&state->id_rx_free_list.dl_mutex); 3947 return; 3948 } 3949 ASSERT(state->id_rx_free_list.dl_cnt == state->id_num_rwqe); 3950 ibd_free_rx_copybufs(state); 3951 state->id_rx_free_list.dl_cnt = 0; 3952 state->id_rx_free_list.dl_head = NULL; 3953 mutex_exit(&state->id_rx_free_list.dl_mutex); 3954 } 3955 3956 /* 3957 * Free the statically allocated Rx buffer list. 3958 */ 3959 static void 3960 ibd_fini_rxlist(ibd_state_t *state) 3961 { 3962 ibd_rwqe_t *rwqe; 3963 int i; 3964 3965 /* run through the rx_queue's, calling freemsg() */ 3966 for (i = 0; i < state->id_rx_nqueues; i++) { 3967 ibd_rx_queue_t *rxp = state->id_rx_queues + i; 3968 mutex_enter(&rxp->rx_post_lock); 3969 for (rwqe = WQE_TO_RWQE(rxp->rx_head); rwqe; 3970 rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) { 3971 freemsg(rwqe->rwqe_im_mblk); 3972 rxp->rx_cnt--; 3973 } 3974 rxp->rx_head = NULL; 3975 mutex_exit(&rxp->rx_post_lock); 3976 } 3977 3978 /* cannot free rx resources unless gld returned everything */ 3979 if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) == 0) 3980 ibd_free_rx_rsrcs(state); 3981 } 3982 3983 /* 3984 * Free an allocated recv wqe. 3985 */ 3986 /* ARGSUSED */ 3987 static void 3988 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) 3989 { 3990 /* 3991 * desballoc() failed (no memory). 3992 * 3993 * This rwqe is placed on a free list so that it 3994 * can be reinstated when memory is available. 3995 * 3996 * NOTE: no code currently exists to reinstate 3997 * these "lost" rwqes. 3998 */ 3999 mutex_enter(&state->id_rx_free_list.dl_mutex); 4000 state->id_rx_free_list.dl_cnt++; 4001 rwqe->rwqe_next = state->id_rx_free_list.dl_head; 4002 state->id_rx_free_list.dl_head = RWQE_TO_WQE(rwqe); 4003 mutex_exit(&state->id_rx_free_list.dl_mutex); 4004 } 4005 4006 /* 4007 * IBA Rx completion queue handler. Guaranteed to be single 4008 * threaded and nonreentrant for this CQ. 4009 */ 4010 /* ARGSUSED */ 4011 static void 4012 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 4013 { 4014 ibd_state_t *state = (ibd_state_t *)arg; 4015 4016 atomic_inc_64(&state->id_num_intrs); 4017 4018 if (ibd_rx_softintr == 1) { 4019 mutex_enter(&state->id_rcq_poll_lock); 4020 if (state->id_rcq_poll_busy & IBD_CQ_POLLING) { 4021 state->id_rcq_poll_busy |= IBD_REDO_CQ_POLLING; 4022 mutex_exit(&state->id_rcq_poll_lock); 4023 return; 4024 } else { 4025 mutex_exit(&state->id_rcq_poll_lock); 4026 ddi_trigger_softintr(state->id_rx); 4027 } 4028 } else 4029 (void) ibd_intr((caddr_t)state); 4030 } 4031 4032 /* 4033 * CQ handler for Tx completions, when the Tx CQ is in 4034 * interrupt driven mode. 4035 */ 4036 /* ARGSUSED */ 4037 static void 4038 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 4039 { 4040 ibd_state_t *state = (ibd_state_t *)arg; 4041 4042 atomic_inc_64(&state->id_num_intrs); 4043 4044 if (ibd_tx_softintr == 1) { 4045 mutex_enter(&state->id_scq_poll_lock); 4046 if (state->id_scq_poll_busy & IBD_CQ_POLLING) { 4047 state->id_scq_poll_busy |= IBD_REDO_CQ_POLLING; 4048 mutex_exit(&state->id_scq_poll_lock); 4049 return; 4050 } else { 4051 mutex_exit(&state->id_scq_poll_lock); 4052 ddi_trigger_softintr(state->id_tx); 4053 } 4054 } else 4055 (void) ibd_tx_recycle((caddr_t)state); 4056 } 4057 4058 /* 4059 * Multicast group create/delete trap handler. 
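 * (The handler is registered via ibt_register_subnet_notices() in
 * ibd_start() and unregistered again in ibd_undo_start().)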
These will be delivered 4060 * on a kernel thread (handling can thus block) and can be invoked 4061 * concurrently. The handler can be invoked anytime after it is 4062 * registered and before ibt_detach(). 4063 */ 4064 /* ARGSUSED */ 4065 static void 4066 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code, 4067 ibt_subnet_event_t *event) 4068 { 4069 ibd_state_t *state = (ibd_state_t *)arg; 4070 ibd_req_t *req; 4071 4072 /* 4073 * The trap handler will get invoked once for every event for 4074 * every port. The input "gid" is the GID0 of the port the 4075 * trap came in on; we just need to act on traps that came 4076 * to our port, meaning the port on which the ipoib interface 4077 * resides. Since ipoib uses GID0 of the port, we just match 4078 * the gids to check whether we need to handle the trap. 4079 */ 4080 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) 4081 if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0) 4082 return; 4083 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) 4084 4085 DPRINT(10, "ibd_notices_handler : %d\n", code); 4086 4087 switch (code) { 4088 case IBT_SM_EVENT_UNAVAILABLE: 4089 /* 4090 * If we are in promiscuous mode or have 4091 * sendnonmembers, we need to print a warning 4092 * message right now. Else, just store the 4093 * information, print when we enter promiscuous 4094 * mode or attempt nonmember send. We might 4095 * also want to stop caching sendnonmember. 4096 */ 4097 ibd_print_warn(state, "IBA multicast support " 4098 "degraded due to unavailability of multicast " 4099 "traps"); 4100 break; 4101 case IBT_SM_EVENT_AVAILABLE: 4102 /* 4103 * If we printed a warning message above or 4104 * while trying to nonmember send or get into 4105 * promiscuous mode, print an okay message. 4106 */ 4107 ibd_print_warn(state, "IBA multicast support " 4108 "restored due to availability of multicast " 4109 "traps"); 4110 break; 4111 case IBT_SM_EVENT_MCG_CREATED: 4112 case IBT_SM_EVENT_MCG_DELETED: 4113 /* 4114 * Common processing of creation/deletion traps. 4115 * First check if the instance is being 4116 * [de]initialized; back off then, without doing 4117 * anything more, since we are not sure if the 4118 * async thread is around, or whether we might 4119 * be racing with the detach code in ibd_m_stop() 4120 * that scans the mcg list. 4121 */ 4122 if (!ibd_async_safe(state)) 4123 return; 4124 4125 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 4126 req->rq_gid = event->sm_notice_gid; 4127 req->rq_ptr = (void *)code; 4128 ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP); 4129 break; 4130 } 4131 } 4132 4133 static void 4134 ibd_async_trap(ibd_state_t *state, ibd_req_t *req) 4135 { 4136 ib_gid_t mgid = req->rq_gid; 4137 ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr; 4138 4139 DPRINT(10, "ibd_async_trap : %d\n", code); 4140 4141 /* 4142 * Atomically search the nonmember and sendonlymember lists and 4143 * delete. 4144 */ 4145 ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON); 4146 4147 if (state->id_prom_op == IBD_OP_COMPLETED) { 4148 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 4149 4150 /* 4151 * If in promiscuous mode, try to join/attach to the new 4152 * mcg. Given the unreliable out-of-order mode of trap 4153 * delivery, we can never be sure whether it is a problem 4154 * if the join fails. Thus, we warn the admin of a failure 4155 * if this was a creation trap. 
Note that the trap might 4156 * actually be reporting a long past event, and the mcg 4157 * might already have been deleted, thus we might be warning 4158 * in vain. 4159 */ 4160 if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == 4161 NULL) && (code == IBT_SM_EVENT_MCG_CREATED)) 4162 ibd_print_warn(state, "IBA promiscuous mode missed " 4163 "new multicast gid %016llx:%016llx", 4164 (u_longlong_t)mgid.gid_prefix, 4165 (u_longlong_t)mgid.gid_guid); 4166 } 4167 4168 /* 4169 * Free the request slot allocated by the subnet event thread. 4170 */ 4171 ibd_async_done(state); 4172 } 4173 4174 /* 4175 * GLDv3 entry point to get capabilities. 4176 */ 4177 static boolean_t 4178 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) 4179 { 4180 ibd_state_t *state = arg; 4181 4182 switch (cap) { 4183 case MAC_CAPAB_HCKSUM: { 4184 uint32_t *txflags = cap_data; 4185 4186 /* 4187 * We either do full checksum or not do it at all 4188 */ 4189 if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) 4190 *txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4; 4191 else 4192 return (B_FALSE); 4193 break; 4194 } 4195 4196 case MAC_CAPAB_LSO: { 4197 mac_capab_lso_t *cap_lso = cap_data; 4198 4199 /* 4200 * In addition to the capability and policy, since LSO 4201 * relies on hw checksum, we'll not enable LSO if we 4202 * don't have hw checksum. Of course, if the HCA doesn't 4203 * provide the reserved lkey capability, enabling LSO will 4204 * actually affect performance adversely, so we'll disable 4205 * LSO even for that case. 4206 */ 4207 if (!state->id_lso_policy || !state->id_lso_capable) 4208 return (B_FALSE); 4209 4210 if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0) 4211 return (B_FALSE); 4212 4213 if (state->id_hca_res_lkey_capab == 0) { 4214 ibd_print_warn(state, "no reserved-lkey capability, " 4215 "disabling LSO"); 4216 return (B_FALSE); 4217 } 4218 4219 cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; 4220 cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1; 4221 break; 4222 } 4223 4224 default: 4225 return (B_FALSE); 4226 } 4227 4228 return (B_TRUE); 4229 } 4230 4231 static int 4232 ibd_get_port_details(ibd_state_t *state) 4233 { 4234 ibt_hca_portinfo_t *port_infop; 4235 ibt_status_t ret; 4236 uint_t psize, port_infosz; 4237 4238 mutex_enter(&state->id_link_mutex); 4239 4240 /* 4241 * Query for port information 4242 */ 4243 ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 4244 &port_infop, &psize, &port_infosz); 4245 if ((ret != IBT_SUCCESS) || (psize != 1)) { 4246 mutex_exit(&state->id_link_mutex); 4247 DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() " 4248 "failed, ret=%d", ret); 4249 return (ENETDOWN); 4250 } 4251 4252 /* 4253 * If the link already went down by the time we get here, 4254 * give up 4255 */ 4256 if (port_infop->p_linkstate != IBT_PORT_ACTIVE) { 4257 mutex_exit(&state->id_link_mutex); 4258 ibt_free_portinfo(port_infop, port_infosz); 4259 DPRINT(10, "ibd_get_port_details: port is not active"); 4260 return (ENETDOWN); 4261 } 4262 4263 /* 4264 * If the link is active, verify the pkey 4265 */ 4266 if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port, 4267 state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) { 4268 mutex_exit(&state->id_link_mutex); 4269 ibt_free_portinfo(port_infop, port_infosz); 4270 DPRINT(10, "ibd_get_port_details: ibt_pkey2index " 4271 "failed, ret=%d", ret); 4272 return (ENONET); 4273 } 4274 4275 state->id_mtu = (128 << port_infop->p_mtu); 4276 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) 4277 state->id_sgid = 
*port_infop->p_sgid_tbl; 4278 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) 4279 state->id_link_state = LINK_STATE_UP; 4280 4281 mutex_exit(&state->id_link_mutex); 4282 ibt_free_portinfo(port_infop, port_infosz); 4283 4284 /* 4285 * Now that the port is active, record the port speed 4286 */ 4287 state->id_link_speed = ibd_get_portspeed(state); 4288 4289 return (0); 4290 } 4291 4292 static int 4293 ibd_alloc_cqs(ibd_state_t *state) 4294 { 4295 ibt_hca_attr_t hca_attrs; 4296 ibt_cq_attr_t cq_attr; 4297 ibt_status_t ret; 4298 uint32_t real_size; 4299 4300 ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 4301 ASSERT(ret == IBT_SUCCESS); 4302 4303 /* 4304 * Allocate Rx/combined CQ: 4305 * Theoretically, there is no point in having more than #rwqe 4306 * plus #swqe cqe's, except that the CQ will be signaled for 4307 * overflow when the last wqe completes, if none of the previous 4308 * cqe's have been polled. Thus, we allocate just a few less wqe's 4309 * to make sure such overflow does not occur. 4310 */ 4311 cq_attr.cq_sched = NULL; 4312 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 4313 4314 /* 4315 * Allocate Receive CQ. 4316 */ 4317 if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) { 4318 cq_attr.cq_size = state->id_num_rwqe + 1; 4319 } else { 4320 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 4321 state->id_num_rwqe = cq_attr.cq_size - 1; 4322 } 4323 4324 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 4325 &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) { 4326 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) " 4327 "failed, ret=%d\n", ret); 4328 return (DDI_FAILURE); 4329 } 4330 4331 if ((ret = ibt_modify_cq(state->id_rcq_hdl, 4332 ibd_rxcomp_count, ibd_rxcomp_usec, 0)) != IBT_SUCCESS) { 4333 DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt " 4334 "moderation failed, ret=%d\n", ret); 4335 } 4336 4337 /* make the #rx wc's the same as max rx chain size */ 4338 state->id_rxwcs_size = IBD_MAX_RX_MP_LEN; 4339 state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * 4340 state->id_rxwcs_size, KM_SLEEP); 4341 4342 /* 4343 * Allocate Send CQ. 4344 */ 4345 if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) { 4346 cq_attr.cq_size = state->id_num_swqe + 1; 4347 } else { 4348 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 4349 state->id_num_swqe = cq_attr.cq_size - 1; 4350 } 4351 4352 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 4353 &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) { 4354 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) " 4355 "failed, ret=%d\n", ret); 4356 kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * 4357 state->id_rxwcs_size); 4358 (void) ibt_free_cq(state->id_rcq_hdl); 4359 return (DDI_FAILURE); 4360 } 4361 if ((ret = ibt_modify_cq(state->id_scq_hdl, 4362 ibd_txcomp_count, ibd_txcomp_usec, 0)) != IBT_SUCCESS) { 4363 DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt " 4364 "moderation failed, ret=%d\n", ret); 4365 } 4366 4367 state->id_txwcs_size = IBD_TX_POLL_THRESH; 4368 state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) * 4369 state->id_txwcs_size, KM_SLEEP); 4370 4371 /* 4372 * Print message in case we could not allocate as many wqe's 4373 * as was requested. 
4374 */ 4375 if (state->id_num_rwqe != IBD_NUM_RWQE) { 4376 ibd_print_warn(state, "Setting #rwqe = %d instead of default " 4377 "%d", state->id_num_rwqe, IBD_NUM_RWQE); 4378 } 4379 if (state->id_num_swqe != IBD_NUM_SWQE) { 4380 ibd_print_warn(state, "Setting #swqe = %d instead of default " 4381 "%d", state->id_num_swqe, IBD_NUM_SWQE); 4382 } 4383 4384 return (DDI_SUCCESS); 4385 } 4386 4387 static int 4388 ibd_setup_ud_channel(ibd_state_t *state) 4389 { 4390 ibt_ud_chan_alloc_args_t ud_alloc_attr; 4391 ibt_ud_chan_query_attr_t ud_chan_attr; 4392 ibt_status_t ret; 4393 4394 ud_alloc_attr.ud_flags = IBT_ALL_SIGNALED; 4395 if (state->id_hca_res_lkey_capab) 4396 ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY; 4397 if (state->id_lso_policy && state->id_lso_capable) 4398 ud_alloc_attr.ud_flags |= IBT_USES_LSO; 4399 4400 ud_alloc_attr.ud_hca_port_num = state->id_port; 4401 ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg; 4402 ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG; 4403 ud_alloc_attr.ud_sizes.cs_sq = state->id_num_swqe; 4404 ud_alloc_attr.ud_sizes.cs_rq = state->id_num_rwqe; 4405 ud_alloc_attr.ud_qkey = state->id_mcinfo->mc_qkey; 4406 ud_alloc_attr.ud_scq = state->id_scq_hdl; 4407 ud_alloc_attr.ud_rcq = state->id_rcq_hdl; 4408 ud_alloc_attr.ud_pd = state->id_pd_hdl; 4409 ud_alloc_attr.ud_pkey_ix = state->id_pkix; 4410 ud_alloc_attr.ud_clone_chan = NULL; 4411 4412 if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS, 4413 &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) { 4414 DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() " 4415 "failed, ret=%d\n", ret); 4416 return (DDI_FAILURE); 4417 } 4418 4419 if ((ret = ibt_query_ud_channel(state->id_chnl_hdl, 4420 &ud_chan_attr)) != IBT_SUCCESS) { 4421 DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() " 4422 "failed, ret=%d\n", ret); 4423 (void) ibt_free_channel(state->id_chnl_hdl); 4424 return (DDI_FAILURE); 4425 } 4426 4427 state->id_qpnum = ud_chan_attr.ud_qpn; 4428 4429 return (DDI_SUCCESS); 4430 } 4431 4432 static int 4433 ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state) 4434 { 4435 uint32_t progress = state->id_mac_state; 4436 uint_t attempts; 4437 ibt_status_t ret; 4438 ib_gid_t mgid; 4439 ibd_mce_t *mce; 4440 uint8_t jstate; 4441 4442 if (atomic_dec_32_nv(&state->id_running) != 0) 4443 cmn_err(CE_WARN, "ibd_undo_start: id_running was not 1\n"); 4444 4445 /* 4446 * Before we try to stop/undo whatever we did in ibd_start(), 4447 * we need to mark the link state appropriately to prevent the 4448 * ip layer from using this instance for any new transfers. Note 4449 * that if the original state of the link was "up" when we're 4450 * here, we'll set the final link state to "unknown", to behave 4451 * in the same fashion as other ethernet drivers. 
4452 */ 4453 mutex_enter(&state->id_link_mutex); 4454 if (cur_link_state == LINK_STATE_DOWN) { 4455 state->id_link_state = cur_link_state; 4456 } else { 4457 state->id_link_state = LINK_STATE_UNKNOWN; 4458 } 4459 mutex_exit(&state->id_link_mutex); 4460 mac_link_update(state->id_mh, state->id_link_state); 4461 4462 state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED); 4463 if (progress & IBD_DRV_STARTED) { 4464 state->id_mac_state &= (~IBD_DRV_STARTED); 4465 } 4466 4467 /* Stop listen under Reliable Connected Mode */ 4468 if (progress & IBD_DRV_RC_LISTEN) { 4469 ASSERT(state->id_enable_rc); 4470 if (state->rc_listen_hdl != NULL) { 4471 ibd_rc_stop_listen(state); 4472 } 4473 state->id_mac_state &= (~IBD_DRV_RC_LISTEN); 4474 } 4475 4476 if (state->id_enable_rc) { 4477 if (ibd_rc_close_all_chan(state) != DDI_SUCCESS) { 4478 (void) ibd_rc_listen(state); 4479 state->id_mac_state |= IBD_DRV_RC_LISTEN; 4480 return (DDI_FAILURE); 4481 } 4482 } 4483 4484 /* 4485 * First, stop receive interrupts; this stops the driver from 4486 * handing up buffers to higher layers. Wait for receive buffers 4487 * to be returned and give up after 1 second. 4488 */ 4489 if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) { 4490 attempts = 10; 4491 while (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 4492 0) > 0) { 4493 delay(drv_usectohz(100000)); 4494 if (--attempts == 0) { 4495 /* 4496 * There are pending bufs with the network 4497 * layer and we have no choice but to wait 4498 * for them to be done with. Reap all the 4499 * Tx/Rx completions that were posted since 4500 * we turned off the notification and 4501 * return failure. 4502 */ 4503 cmn_err(CE_CONT, "!ibd: bufs outstanding\n"); 4504 DPRINT(2, "ibd_undo_start: " 4505 "reclaiming failed"); 4506 break; 4507 } 4508 } 4509 state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED); 4510 } 4511 4512 if (progress & IBD_DRV_RC_LARGEBUF_ALLOCD) { 4513 ibd_rc_fini_tx_largebuf_list(state); 4514 state->id_mac_state &= (~IBD_DRV_RC_LARGEBUF_ALLOCD); 4515 } 4516 4517 if (progress & IBD_DRV_RC_SRQ_ALLOCD) { 4518 ASSERT(state->id_enable_rc); 4519 ibd_rc_fini_srq_list(state); 4520 state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD); 4521 } 4522 4523 if (progress & IBD_DRV_SM_NOTICES_REGISTERED) { 4524 ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL); 4525 4526 mutex_enter(&state->id_trap_lock); 4527 state->id_trap_stop = B_TRUE; 4528 while (state->id_trap_inprog > 0) 4529 cv_wait(&state->id_trap_cv, &state->id_trap_lock); 4530 mutex_exit(&state->id_trap_lock); 4531 4532 state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED); 4533 } 4534 4535 if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) { 4536 /* 4537 * Flushing the channel ensures that all pending WQE's 4538 * are marked with flush_error and handed to the CQ. It 4539 * does not guarantee the invocation of the CQ handler. 4540 * This call is guaranteed to return successfully for 4541 * UD QPNs. 4542 */ 4543 if ((ret = ibt_flush_channel(state->id_chnl_hdl)) != 4544 IBT_SUCCESS) { 4545 DPRINT(10, "ibd_undo_start: flush_channel " 4546 "failed, ret=%d", ret); 4547 } 4548 4549 /* 4550 * Give some time for the TX CQ handler to process the 4551 * completions. 
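 * (The loop below retries up to ten times with 100ms delays, i.e. it
 * waits about one second, the same bound used for the Rx drain above.)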
4552 */ 4553 mutex_enter(&state->id_tx_list.dl_mutex); 4554 mutex_enter(&state->id_tx_rel_list.dl_mutex); 4555 attempts = 10; 4556 while (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt 4557 != state->id_num_swqe) { 4558 if (--attempts == 0) 4559 break; 4560 mutex_exit(&state->id_tx_rel_list.dl_mutex); 4561 mutex_exit(&state->id_tx_list.dl_mutex); 4562 delay(drv_usectohz(100000)); 4563 mutex_enter(&state->id_tx_list.dl_mutex); 4564 mutex_enter(&state->id_tx_rel_list.dl_mutex); 4565 } 4566 ibt_set_cq_handler(state->id_scq_hdl, 0, 0); 4567 if (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt != 4568 state->id_num_swqe) { 4569 cmn_err(CE_WARN, "tx resources not freed\n"); 4570 } 4571 mutex_exit(&state->id_tx_rel_list.dl_mutex); 4572 mutex_exit(&state->id_tx_list.dl_mutex); 4573 4574 attempts = 10; 4575 while (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) { 4576 if (--attempts == 0) 4577 break; 4578 delay(drv_usectohz(100000)); 4579 } 4580 ibt_set_cq_handler(state->id_rcq_hdl, 0, 0); 4581 if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) { 4582 cmn_err(CE_WARN, "rx resources not freed\n"); 4583 } 4584 4585 state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED); 4586 } 4587 4588 if (progress & IBD_DRV_ASYNC_THR_CREATED) { 4589 /* 4590 * No new async requests will be posted since the device 4591 * link state has been marked as unknown; completion handlers 4592 * have been turned off, so the Tx handler will not cause any 4593 * more IBD_ASYNC_REAP requests. 4594 * 4595 * Queue a request for the async thread to exit, which will 4596 * be serviced after any pending ones. This can take a while, 4597 * especially if the SM is unreachable, since IBMF will slowly 4598 * time out each SM request issued by the async thread. Reap 4599 * the thread before continuing on; we do not want it to be 4600 * lingering in modunloaded code (or we could move the reap 4601 * to ibd_detach(), provided we keep track of the current 4602 * id_async_thrid somewhere safe). 4603 */ 4604 ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT); 4605 thread_join(state->id_async_thrid); 4606 4607 state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED); 4608 } 4609 4610 if (progress & IBD_DRV_BCAST_GROUP_JOINED) { 4611 /* 4612 * Drop all residual full/non membership. This includes full 4613 * membership in the broadcast group, and any nonmembership 4614 * acquired during transmits. We do this after the Tx completion 4615 * handlers are done, since those might result in some late 4616 * leaves; this also eliminates a potential race with that 4617 * path wrt the mc full list insert/delete. Trap handling 4618 * has also been suppressed at this point. Thus, no locks 4619 * are required while traversing the mc full list.
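 * Note that the walk below fetches the next element before calling
 * ibd_leave_group(), since leaving the group can remove the current
 * entry from the id_mc_full list.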
4620 */ 4621 DPRINT(2, "ibd_undo_start: clear full cache entries"); 4622 mce = list_head(&state->id_mc_full); 4623 while (mce != NULL) { 4624 mgid = mce->mc_info.mc_adds_vect.av_dgid; 4625 jstate = mce->mc_jstate; 4626 mce = list_next(&state->id_mc_full, mce); 4627 ibd_leave_group(state, mgid, jstate); 4628 } 4629 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED); 4630 } 4631 4632 if (progress & IBD_DRV_RXLIST_ALLOCD) { 4633 ibd_fini_rxlist(state); 4634 state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD); 4635 } 4636 4637 if (progress & IBD_DRV_TXLIST_ALLOCD) { 4638 ibd_fini_txlist(state); 4639 state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD); 4640 } 4641 4642 if (progress & IBD_DRV_UD_CHANNEL_SETUP) { 4643 if ((ret = ibt_free_channel(state->id_chnl_hdl)) != 4644 IBT_SUCCESS) { 4645 DPRINT(10, "ibd_undo_start: free_channel " 4646 "failed, ret=%d", ret); 4647 } 4648 4649 state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP); 4650 } 4651 4652 if (progress & IBD_DRV_CQS_ALLOCD) { 4653 kmem_free(state->id_txwcs, 4654 sizeof (ibt_wc_t) * state->id_txwcs_size); 4655 if ((ret = ibt_free_cq(state->id_scq_hdl)) != 4656 IBT_SUCCESS) { 4657 DPRINT(10, "ibd_undo_start: free_cq(scq) " 4658 "failed, ret=%d", ret); 4659 } 4660 4661 kmem_free(state->id_rxwcs, 4662 sizeof (ibt_wc_t) * state->id_rxwcs_size); 4663 if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) { 4664 DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, " 4665 "ret=%d", ret); 4666 } 4667 4668 state->id_txwcs = NULL; 4669 state->id_rxwcs = NULL; 4670 state->id_scq_hdl = NULL; 4671 state->id_rcq_hdl = NULL; 4672 4673 state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD); 4674 } 4675 4676 if (progress & IBD_DRV_ACACHE_INITIALIZED) { 4677 mutex_enter(&state->id_ac_mutex); 4678 mod_hash_destroy_hash(state->id_ah_active_hash); 4679 mutex_exit(&state->id_ac_mutex); 4680 ibd_acache_fini(state); 4681 4682 state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED); 4683 } 4684 4685 if (progress & IBD_DRV_BCAST_GROUP_FOUND) { 4686 /* 4687 * If we'd created the ipoib broadcast group and had 4688 * successfully joined it, leave it now 4689 */ 4690 if (state->id_bgroup_created) { 4691 mgid = state->id_mcinfo->mc_adds_vect.av_dgid; 4692 jstate = IB_MC_JSTATE_FULL; 4693 (void) ibt_leave_mcg(state->id_sgid, mgid, 4694 state->id_sgid, jstate); 4695 } 4696 ibt_free_mcg_info(state->id_mcinfo, 1); 4697 4698 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND); 4699 } 4700 4701 return (DDI_SUCCESS); 4702 } 4703 4704 /* 4705 * This pair of routines is used to set/clear the condition that 4706 * the caller is likely to do something to change the id_mac_state. 4707 * If there's already someone doing either a start or a stop (possibly 4708 * due to the async handler detecting a pkey relocation event, a plumb 4709 * or dlpi_open, or an unplumb or dlpi_close coming in), we wait until 4710 * that's done. 4711 */ 4712 static void 4713 ibd_set_mac_progress(ibd_state_t *state, uint_t flag) 4714 { 4715 mutex_enter(&state->id_macst_lock); 4716 while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS) 4717 cv_wait(&state->id_macst_cv, &state->id_macst_lock); 4718 4719 state->id_mac_state |= flag; 4720 mutex_exit(&state->id_macst_lock); 4721 } 4722 4723 static void 4724 ibd_clr_mac_progress(ibd_state_t *state, uint_t flag) 4725 { 4726 mutex_enter(&state->id_macst_lock); 4727 state->id_mac_state &= (~flag); 4728 cv_signal(&state->id_macst_cv); 4729 mutex_exit(&state->id_macst_lock); 4730 } 4731 4732 /* 4733 * GLDv3 entry point to start hardware.
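 * (Runs ibd_start() with concurrent start/stop serialized via
 * ibd_set_mac_progress(), as described above.)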
4734 */ 4735 /*ARGSUSED*/ 4736 static int 4737 ibd_m_start(void *arg) 4738 { 4739 ibd_state_t *state = arg; 4740 int ret; 4741 4742 ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 4743 4744 ret = ibd_start(state); 4745 4746 ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 4747 4748 return (ret); 4749 } 4750 4751 static int 4752 ibd_start(ibd_state_t *state) 4753 { 4754 kthread_t *kht; 4755 int err; 4756 ibt_status_t ret; 4757 4758 if (state->id_mac_state & IBD_DRV_STARTED) 4759 return (DDI_SUCCESS); 4760 4761 if (atomic_inc_32_nv(&state->id_running) != 1) { 4762 DPRINT(10, "ibd_start: id_running is non-zero"); 4763 cmn_err(CE_WARN, "ibd_start: id_running was not 0\n"); 4764 atomic_dec_32(&state->id_running); 4765 return (EINVAL); 4766 } 4767 4768 /* 4769 * Get port details; if we fail here, very likely the port 4770 * state is inactive or the pkey can't be verified. 4771 */ 4772 if ((err = ibd_get_port_details(state)) != 0) { 4773 DPRINT(10, "ibd_start: ibd_get_port_details() failed"); 4774 goto start_fail; 4775 } 4776 state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED; 4777 4778 /* 4779 * Find the IPoIB broadcast group 4780 */ 4781 if (ibd_find_bgroup(state) != IBT_SUCCESS) { 4782 DPRINT(10, "ibd_start: ibd_find_bgroup() failed"); 4783 err = ENOTACTIVE; 4784 goto start_fail; 4785 } 4786 state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND; 4787 4788 /* 4789 * Initialize per-interface caches and lists; if we fail here, 4790 * it is most likely due to a lack of resources 4791 */ 4792 if (ibd_acache_init(state) != DDI_SUCCESS) { 4793 DPRINT(10, "ibd_start: ibd_acache_init() failed"); 4794 err = ENOMEM; 4795 goto start_fail; 4796 } 4797 state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED; 4798 4799 /* 4800 * Allocate send and receive completion queues 4801 */ 4802 if (ibd_alloc_cqs(state) != DDI_SUCCESS) { 4803 DPRINT(10, "ibd_start: ibd_alloc_cqs() failed"); 4804 err = ENOMEM; 4805 goto start_fail; 4806 } 4807 state->id_mac_state |= IBD_DRV_CQS_ALLOCD; 4808 4809 /* 4810 * Setup a UD channel 4811 */ 4812 if (ibd_setup_ud_channel(state) != DDI_SUCCESS) { 4813 err = ENOMEM; 4814 DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed"); 4815 goto start_fail; 4816 } 4817 state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP; 4818 4819 /* 4820 * Allocate and initialize the tx buffer list 4821 */ 4822 if (ibd_init_txlist(state) != DDI_SUCCESS) { 4823 DPRINT(10, "ibd_start: ibd_init_txlist() failed"); 4824 err = ENOMEM; 4825 goto start_fail; 4826 } 4827 state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD; 4828 4829 /* 4830 * Create the send cq handler here 4831 */ 4832 ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state); 4833 if ((ret = ibt_enable_cq_notify(state->id_scq_hdl, 4834 IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { 4835 DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) " 4836 "failed, ret=%d", ret); 4837 err = EINVAL; 4838 goto start_fail; 4839 } 4840 state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED; 4841 4842 /* 4843 * Allocate and initialize the rx buffer list 4844 */ 4845 if (ibd_init_rxlist(state) != DDI_SUCCESS) { 4846 DPRINT(10, "ibd_start: ibd_init_rxlist() failed"); 4847 err = ENOMEM; 4848 goto start_fail; 4849 } 4850 state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD; 4851 4852 /* 4853 * Join IPoIB broadcast group 4854 */ 4855 if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) { 4856 DPRINT(10, "ibd_start: ibd_join_group() failed"); 4857 err = ENOTACTIVE; 4858 goto start_fail; 4859 } 4860 state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED; 4861 4862 /* 4863 * Create 
the async thread; thread_create never fails. 4864 */ 4865 kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0, 4866 TS_RUN, minclsyspri); 4867 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_async_thrid)) 4868 state->id_async_thrid = kht->t_did; 4869 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_async_thrid)) 4870 state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED; 4871 4872 /* 4873 * When we did mac_register() in ibd_attach(), we didn't register 4874 * the real macaddr and we didn't have the true port mtu. Now that 4875 * we're almost ready, set the local mac address and broadcast 4876 * addresses and update gldv3 about the real values of these 4877 * parameters. 4878 */ 4879 if (state->id_enable_rc) { 4880 ibd_h2n_mac(&state->id_macaddr, 4881 IBD_MAC_ADDR_RC + state->id_qpnum, 4882 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 4883 } else { 4884 ibd_h2n_mac(&state->id_macaddr, state->id_qpnum, 4885 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 4886 } 4887 ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK, 4888 state->id_mgid.gid_prefix, state->id_mgid.gid_guid); 4889 4890 if (!state->id_enable_rc) { 4891 (void) mac_maxsdu_update(state->id_mh, state->id_mtu 4892 - IPOIB_HDRSIZE); 4893 } 4894 mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr); 4895 4896 /* 4897 * Setup the receive cq handler 4898 */ 4899 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); 4900 if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl, 4901 IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { 4902 DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) " 4903 "failed, ret=%d", ret); 4904 err = EINVAL; 4905 goto start_fail; 4906 } 4907 state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED; 4908 4909 /* 4910 * Setup the subnet notices handler after we've initialized the acache/ 4911 * mcache and started the async thread, both of which are required for 4912 * the trap handler to function properly. 4913 * 4914 * Now that the async thread has been started (and we've already done 4915 * a mac_register() during attach so mac_tx_update() can be called 4916 * if necessary without any problem), we can enable the trap handler 4917 * to queue requests to the async thread. 4918 */ 4919 ibt_register_subnet_notices(state->id_ibt_hdl, 4920 ibd_snet_notices_handler, state); 4921 mutex_enter(&state->id_trap_lock); 4922 state->id_trap_stop = B_FALSE; 4923 mutex_exit(&state->id_trap_lock); 4924 state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED; 4925 4926 if (state->id_enable_rc) { 4927 if (state->rc_enable_srq) { 4928 /* Allocate SRQ resource */ 4929 if (ibd_rc_init_srq_list(state) != IBT_SUCCESS) 4930 goto start_fail; 4931 state->id_mac_state |= IBD_DRV_RC_SRQ_ALLOCD; 4932 } 4933 4934 if (ibd_rc_init_tx_largebuf_list(state) != IBT_SUCCESS) { 4935 DPRINT(10, "ibd_start: ibd_rc_init_tx_largebuf_list() " 4936 "failed"); 4937 goto start_fail; 4938 } 4939 state->id_mac_state |= IBD_DRV_RC_LARGEBUF_ALLOCD; 4940 4941 /* RC: begin to listen only after everything is available */ 4942 if (ibd_rc_listen(state) != IBT_SUCCESS) { 4943 DPRINT(10, "ibd_start: ibd_rc_listen() failed"); 4944 goto start_fail; 4945 } 4946 state->id_mac_state |= IBD_DRV_RC_LISTEN; 4947 } 4948 4949 /* 4950 * Indicate link status to GLDv3 and higher layers. By default, 4951 * we assume we are in up state (which must have been true at 4952 * least at the time the broadcast mcg's were probed); if there 4953 * were any up/down transitions till the time we come here, the 4954 * async handler will have updated last known state, which we 4955 * use to tell GLDv3. 
The async handler will not send any 4956 * notifications to GLDv3 till we reach here in the initialization 4957 * sequence. 4958 */ 4959 state->id_mac_state |= IBD_DRV_STARTED; 4960 mac_link_update(state->id_mh, state->id_link_state); 4961 4962 return (DDI_SUCCESS); 4963 4964 start_fail: 4965 /* 4966 * If we ran into a problem during ibd_start() and ran into 4967 * some other problem during undoing our partial work, we can't 4968 * do anything about it. Ignore any errors we might get from 4969 * ibd_undo_start() and just return the original error we got. 4970 */ 4971 (void) ibd_undo_start(state, LINK_STATE_DOWN); 4972 return (err); 4973 } 4974 4975 /* 4976 * GLDv3 entry point to stop hardware from receiving packets. 4977 */ 4978 /*ARGSUSED*/ 4979 static void 4980 ibd_m_stop(void *arg) 4981 { 4982 ibd_state_t *state = (ibd_state_t *)arg; 4983 4984 ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS); 4985 4986 (void) ibd_undo_start(state, state->id_link_state); 4987 4988 ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS); 4989 } 4990 4991 /* 4992 * GLDv3 entry point to modify device's mac address. We do not 4993 * allow address modifications. 4994 */ 4995 static int 4996 ibd_m_unicst(void *arg, const uint8_t *macaddr) 4997 { 4998 ibd_state_t *state = arg; 4999 5000 /* 5001 * Don't bother even comparing the macaddr if we haven't 5002 * completed ibd_m_start(). 5003 */ 5004 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 5005 return (0); 5006 5007 if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0) 5008 return (0); 5009 else 5010 return (EINVAL); 5011 } 5012 5013 /* 5014 * The blocking part of the IBA join/leave operations is done out 5015 * of here on the async thread. 5016 */ 5017 static void 5018 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op) 5019 { 5020 DPRINT(3, "ibd_async_multicast : async_setmc op %d :" 5021 "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid); 5022 5023 if (op == IBD_ASYNC_JOIN) { 5024 if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) { 5025 ibd_print_warn(state, "Join multicast group failed :" 5026 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); 5027 } 5028 } else { 5029 /* 5030 * Here, we must search for the proper mcg_info and 5031 * use that to leave the group. 5032 */ 5033 ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL); 5034 } 5035 } 5036 5037 /* 5038 * GLDv3 entry point for multicast enable/disable requests. 5039 * This function queues the operation to the async thread and 5040 * returns success for a valid multicast address. 5041 */ 5042 static int 5043 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac) 5044 { 5045 ibd_state_t *state = (ibd_state_t *)arg; 5046 ipoib_mac_t maddr, *mcast; 5047 ib_gid_t mgid; 5048 ibd_req_t *req; 5049 5050 /* 5051 * If we haven't completed ibd_m_start(), the async thread wouldn't 5052 * have been started and id_bcaddr wouldn't be set, so there's 5053 * no point in continuing. 5054 */ 5055 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 5056 return (0); 5057 5058 /* 5059 * The incoming multicast address might not be aligned properly 5060 * on a 4 byte boundary to be considered an ipoib_mac_t. We force 5061 * it to look like one though, to get the offsets of the mc gid, 5062 * since we know we are not going to dereference any values with 5063 * the ipoib_mac_t pointer. 5064 */ 5065 bcopy(mcmac, &maddr, sizeof (ipoib_mac_t)); 5066 mcast = &maddr; 5067 5068 /* 5069 * Check validity of MCG address.
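 * A valid IPoIB multicast address carries IB_MC_QPN in its QPN field,
 * which is what the test below verifies.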
We could additionally check 5070 * that an enable/disable is not being issued on the "broadcast" 5071 * mcg, but since this operation is only invokable by privileged 5072 * programs anyway, we allow the flexibility to those dlpi apps. 5073 * Note that we do not validate the "scope" of the IBA mcg. 5074 */ 5075 if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN) 5076 return (EINVAL); 5077 5078 /* 5079 * fill in multicast pkey and scope 5080 */ 5081 IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey); 5082 5083 /* 5084 * If someone is trying to JOIN/LEAVE the broadcast group, we do 5085 * nothing (i.e. we stay JOINed to the broadcast group, as done in 5086 * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically 5087 * requires being joined to broadcast groups at all times. 5088 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also 5089 * depends on this. 5090 */ 5091 if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0) 5092 return (0); 5093 5094 ibd_n2h_gid(mcast, &mgid); 5095 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 5096 if (req == NULL) 5097 return (ENOMEM); 5098 5099 req->rq_gid = mgid; 5100 5101 if (add) { 5102 DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n", 5103 mgid.gid_prefix, mgid.gid_guid); 5104 ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN); 5105 } else { 5106 DPRINT(1, "ibd_m_multicst : unset_multicast : " 5107 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); 5108 ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE); 5109 } 5110 return (0); 5111 } 5112 5113 /* 5114 * The blocking part of the IBA promiscuous operations is done 5115 * out of here on the async thread. This routine may be invoked due to 5116 * a dlpi request or due to 5117 * a port up/down event. 5118 */ 5119 static void 5120 ibd_async_unsetprom(ibd_state_t *state) 5121 { 5122 ibd_mce_t *mce = list_head(&state->id_mc_non); 5123 ib_gid_t mgid; 5124 5125 DPRINT(2, "ibd_async_unsetprom : async_unset_promisc"); 5126 5127 while (mce != NULL) { 5128 mgid = mce->mc_info.mc_adds_vect.av_dgid; 5129 mce = list_next(&state->id_mc_non, mce); 5130 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 5131 } 5132 state->id_prom_op = IBD_OP_NOTSTARTED; 5133 } 5134 5135 /* 5136 * The blocking part of the IBA promiscuous operations is done 5137 * out of here on the async thread. This routine may be invoked due to 5138 * a dlpi request or due to 5139 * a port up/down event. 5140 */ 5141 static void 5142 ibd_async_setprom(ibd_state_t *state) 5143 { 5144 ibt_mcg_attr_t mcg_attr; 5145 ibt_mcg_info_t *mcg_info; 5146 ib_gid_t mgid; 5147 uint_t numg; 5148 int i; 5149 char ret = IBD_OP_COMPLETED; 5150 5151 DPRINT(2, "ibd_async_setprom : async_set_promisc"); 5152 5153 /* 5154 * Obtain all active MC groups on the IB fabric with 5155 * specified criteria (scope + Pkey + Qkey + mtu). 5156 */ 5157 bzero(&mcg_attr, sizeof (mcg_attr)); 5158 mcg_attr.mc_pkey = state->id_pkey; 5159 mcg_attr.mc_scope = state->id_scope; 5160 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 5161 mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu; 5162 mcg_attr.mc_mtu_req.r_selector = IBT_EQU; 5163 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) != 5164 IBT_SUCCESS) { 5165 ibd_print_warn(state, "Could not get list of IBA multicast " 5166 "groups"); 5167 ret = IBD_OP_ERRORED; 5168 goto done; 5169 } 5170 5171 /* 5172 * Iterate over the returned mcg's and join as NonMember 5173 * to the IP mcg's.
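 * A failed nonmember join below is only logged; the operation still
 * completes as IBD_OP_COMPLETED with whatever groups did join.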
5174 */ 5175 for (i = 0; i < numg; i++) { 5176 /* 5177 * Do a NonMember JOIN on the MC group. 5178 */ 5179 mgid = mcg_info[i].mc_adds_vect.av_dgid; 5180 if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL) 5181 ibd_print_warn(state, "IBA promiscuous mode missed " 5182 "multicast gid %016llx:%016llx", 5183 (u_longlong_t)mgid.gid_prefix, 5184 (u_longlong_t)mgid.gid_guid); 5185 } 5186 5187 ibt_free_mcg_info(mcg_info, numg); 5188 DPRINT(4, "ibd_async_setprom : async_set_promisc completes"); 5189 done: 5190 state->id_prom_op = ret; 5191 } 5192 5193 /* 5194 * GLDv3 entry point for multicast promiscuous enable/disable requests. 5195 * GLDv3 assumes phys state receives more packets than multi state, 5196 * which is not true for IPoIB. Thus, treat the multi and phys 5197 * promiscuous states the same way to work with GLDv3's assumption. 5198 */ 5199 static int 5200 ibd_m_promisc(void *arg, boolean_t on) 5201 { 5202 ibd_state_t *state = (ibd_state_t *)arg; 5203 ibd_req_t *req; 5204 5205 /* 5206 * Async thread wouldn't have been started if we haven't 5207 * passed ibd_m_start() 5208 */ 5209 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 5210 return (0); 5211 5212 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 5213 if (req == NULL) 5214 return (ENOMEM); 5215 if (on) { 5216 DPRINT(1, "ibd_m_promisc : set_promisc : %d", on); 5217 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON); 5218 } else { 5219 DPRINT(1, "ibd_m_promisc : unset_promisc"); 5220 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF); 5221 } 5222 5223 return (0); 5224 } 5225 5226 /* 5227 * GLDv3 entry point for gathering statistics. 5228 */ 5229 static int 5230 ibd_m_stat(void *arg, uint_t stat, uint64_t *val) 5231 { 5232 ibd_state_t *state = (ibd_state_t *)arg; 5233 5234 switch (stat) { 5235 case MAC_STAT_IFSPEED: 5236 *val = state->id_link_speed; 5237 break; 5238 case MAC_STAT_MULTIRCV: 5239 *val = state->id_multi_rcv; 5240 break; 5241 case MAC_STAT_BRDCSTRCV: 5242 *val = state->id_brd_rcv; 5243 break; 5244 case MAC_STAT_MULTIXMT: 5245 *val = state->id_multi_xmt; 5246 break; 5247 case MAC_STAT_BRDCSTXMT: 5248 *val = state->id_brd_xmt; 5249 break; 5250 case MAC_STAT_RBYTES: 5251 *val = state->id_rcv_bytes + state->rc_rcv_trans_byte 5252 + state->rc_rcv_copy_byte; 5253 break; 5254 case MAC_STAT_IPACKETS: 5255 *val = state->id_rcv_pkt + state->rc_rcv_trans_pkt 5256 + state->rc_rcv_copy_pkt; 5257 break; 5258 case MAC_STAT_OBYTES: 5259 *val = state->id_xmt_bytes + state->rc_xmt_bytes; 5260 break; 5261 case MAC_STAT_OPACKETS: 5262 *val = state->id_xmt_pkt + state->rc_xmt_small_pkt + 5263 state->rc_xmt_fragmented_pkt + 5264 state->rc_xmt_map_fail_pkt + state->rc_xmt_map_succ_pkt; 5265 break; 5266 case MAC_STAT_OERRORS: 5267 *val = state->id_ah_error; /* failed AH translation */ 5268 break; 5269 case MAC_STAT_IERRORS: 5270 *val = 0; 5271 break; 5272 case MAC_STAT_NOXMTBUF: 5273 *val = state->id_tx_short + state->rc_swqe_short + 5274 state->rc_xmt_buf_short; 5275 break; 5276 case MAC_STAT_NORCVBUF: 5277 default: 5278 return (ENOTSUP); 5279 } 5280 5281 return (0); 5282 } 5283 5284 static void 5285 ibd_async_txsched(ibd_state_t *state) 5286 { 5287 ibd_resume_transmission(state); 5288 } 5289 5290 static void 5291 ibd_resume_transmission(ibd_state_t *state) 5292 { 5293 int flag; 5294 int met_thresh = 0; 5295 int thresh = 0; 5296 int ret = -1; 5297 5298 mutex_enter(&state->id_sched_lock); 5299 if (state->id_sched_needed & IBD_RSRC_SWQE) { 5300 mutex_enter(&state->id_tx_list.dl_mutex); 5301 mutex_enter(&state->id_tx_rel_list.dl_mutex); 
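/* count the free swqes on both lists under their locks for a consistent total */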
5302 met_thresh = state->id_tx_list.dl_cnt + 5303 state->id_tx_rel_list.dl_cnt; 5304 mutex_exit(&state->id_tx_rel_list.dl_mutex); 5305 mutex_exit(&state->id_tx_list.dl_mutex); 5306 thresh = IBD_FREE_SWQES_THRESH; 5307 flag = IBD_RSRC_SWQE; 5308 } else if (state->id_sched_needed & IBD_RSRC_LSOBUF) { 5309 ASSERT(state->id_lso != NULL); 5310 mutex_enter(&state->id_lso_lock); 5311 met_thresh = state->id_lso->bkt_nfree; 5312 thresh = IBD_FREE_LSOS_THRESH; 5313 mutex_exit(&state->id_lso_lock); 5314 flag = IBD_RSRC_LSOBUF; 5315 if (met_thresh > thresh) 5316 state->id_sched_lso_cnt++; 5317 } 5318 if (met_thresh > thresh) { 5319 state->id_sched_needed &= ~flag; 5320 state->id_sched_cnt++; 5321 ret = 0; 5322 } 5323 mutex_exit(&state->id_sched_lock); 5324 5325 if (ret == 0) 5326 mac_tx_update(state->id_mh); 5327 } 5328 5329 /* 5330 * Release the send wqe back into the free list. 5331 */ 5332 static void 5333 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail, int n) 5334 { 5335 /* 5336 * Add back on Tx list for reuse. 5337 */ 5338 ASSERT(tail->swqe_next == NULL); 5339 mutex_enter(&state->id_tx_rel_list.dl_mutex); 5340 state->id_tx_rel_list.dl_pending_sends = B_FALSE; 5341 tail->swqe_next = state->id_tx_rel_list.dl_head; 5342 state->id_tx_rel_list.dl_head = SWQE_TO_WQE(head); 5343 state->id_tx_rel_list.dl_cnt += n; 5344 mutex_exit(&state->id_tx_rel_list.dl_mutex); 5345 } 5346 5347 /* 5348 * Acquire a send wqe from the free list. 5349 * Returns the send wqe pointer, or NULL if no send wqe is available. 5350 */ 5351 static ibd_swqe_t * 5352 ibd_acquire_swqe(ibd_state_t *state) 5353 { 5354 ibd_swqe_t *wqe; 5355 5356 mutex_enter(&state->id_tx_rel_list.dl_mutex); 5357 if (state->id_tx_rel_list.dl_head != NULL) { 5358 /* transfer id_tx_rel_list to id_tx_list */ 5359 state->id_tx_list.dl_head = 5360 state->id_tx_rel_list.dl_head; 5361 state->id_tx_list.dl_cnt = 5362 state->id_tx_rel_list.dl_cnt; 5363 state->id_tx_list.dl_pending_sends = B_FALSE; 5364 5365 /* clear id_tx_rel_list */ 5366 state->id_tx_rel_list.dl_head = NULL; 5367 state->id_tx_rel_list.dl_cnt = 0; 5368 mutex_exit(&state->id_tx_rel_list.dl_mutex); 5369 5370 wqe = WQE_TO_SWQE(state->id_tx_list.dl_head); 5371 state->id_tx_list.dl_cnt -= 1; 5372 state->id_tx_list.dl_head = wqe->swqe_next; 5373 } else { /* no free swqe */ 5374 mutex_exit(&state->id_tx_rel_list.dl_mutex); 5375 state->id_tx_list.dl_pending_sends = B_TRUE; 5376 DPRINT(5, "ibd_acquire_swqe: out of Tx wqe"); 5377 state->id_tx_short++; 5378 wqe = NULL; 5379 } 5380 return (wqe); 5381 } 5382 5383 static int 5384 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss, 5385 ibt_ud_dest_hdl_t ud_dest) 5386 { 5387 mblk_t *nmp; 5388 int iph_len, tcph_len; 5389 ibt_wr_lso_t *lso; 5390 uintptr_t ip_start, tcp_start; 5391 uint8_t *dst; 5392 uint_t pending, mblen; 5393 5394 /* 5395 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default; 5396 * we need to adjust it here for lso. 5397 */ 5398 lso = &(node->w_swr.wr.ud_lso); 5399 lso->lso_ud_dest = ud_dest; 5400 lso->lso_mss = mss; 5401 5402 /* 5403 * Calculate the LSO header size and set it in the UD LSO structure. 5404 * Note that the only assumption we make is that each of the IPoIB, 5405 * IP and TCP headers will be contained in a single mblk fragment; 5406 * together, the headers may span multiple mblk fragments.
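 * For a typical IPv4/TCP packet without options, for example, this works
 * out to lso_hdr_sz = IPOIB_HDRSIZE + 20 + 20.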
5407 */ 5408 nmp = mp; 5409 ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE; 5410 if (ip_start >= (uintptr_t)(nmp->b_wptr)) { 5411 ip_start = (uintptr_t)nmp->b_cont->b_rptr 5412 + (ip_start - (uintptr_t)(nmp->b_wptr)); 5413 nmp = nmp->b_cont; 5414 5415 } 5416 iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start); 5417 5418 tcp_start = ip_start + iph_len; 5419 if (tcp_start >= (uintptr_t)(nmp->b_wptr)) { 5420 tcp_start = (uintptr_t)nmp->b_cont->b_rptr 5421 + (tcp_start - (uintptr_t)(nmp->b_wptr)); 5422 nmp = nmp->b_cont; 5423 } 5424 tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start); 5425 lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len; 5426 5427 /* 5428 * If the lso header fits entirely within a single mblk fragment, 5429 * we'll avoid an additional copy of the lso header here and just 5430 * pass the b_rptr of the mblk directly. 5431 * 5432 * If this isn't true, we'd have to allocate for it explicitly. 5433 */ 5434 if (lso->lso_hdr_sz <= MBLKL(mp)) { 5435 lso->lso_hdr = mp->b_rptr; 5436 } else { 5437 /* On work completion, remember to free this allocated hdr */ 5438 lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP); 5439 if (lso->lso_hdr == NULL) { 5440 DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, " 5441 "sz = %d", lso->lso_hdr_sz); 5442 lso->lso_hdr_sz = 0; 5443 lso->lso_mss = 0; 5444 return (-1); 5445 } 5446 } 5447 5448 /* 5449 * Copy in the lso header only if we need to 5450 */ 5451 if (lso->lso_hdr != mp->b_rptr) { 5452 dst = lso->lso_hdr; 5453 pending = lso->lso_hdr_sz; 5454 5455 for (nmp = mp; nmp && pending; nmp = nmp->b_cont) { 5456 mblen = MBLKL(nmp); 5457 if (pending > mblen) { 5458 bcopy(nmp->b_rptr, dst, mblen); 5459 dst += mblen; 5460 pending -= mblen; 5461 } else { 5462 bcopy(nmp->b_rptr, dst, pending); 5463 break; 5464 } 5465 } 5466 } 5467 5468 return (0); 5469 } 5470 5471 static void 5472 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp) 5473 { 5474 ibt_wr_lso_t *lso; 5475 5476 if ((!node) || (!mp)) 5477 return; 5478 5479 /* 5480 * Free any header space that we might've allocated if we 5481 * did an LSO 5482 */ 5483 if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) { 5484 lso = &(node->w_swr.wr.ud_lso); 5485 if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) { 5486 kmem_free(lso->lso_hdr, lso->lso_hdr_sz); 5487 lso->lso_hdr = NULL; 5488 lso->lso_hdr_sz = 0; 5489 } 5490 } 5491 } 5492 5493 static void 5494 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node) 5495 { 5496 uint_t i; 5497 uint_t num_posted; 5498 uint_t n_wrs; 5499 ibt_status_t ibt_status; 5500 ibt_send_wr_t wrs[IBD_MAX_TX_POST_MULTIPLE]; 5501 ibd_swqe_t *tx_head, *elem; 5502 ibd_swqe_t *nodes[IBD_MAX_TX_POST_MULTIPLE]; 5503 5504 /* post the one request, then check for more */ 5505 ibt_status = ibt_post_send(state->id_chnl_hdl, 5506 &node->w_swr, 1, NULL); 5507 if (ibt_status != IBT_SUCCESS) { 5508 ibd_print_warn(state, "ibd_post_send: " 5509 "posting one wr failed: ret=%d", ibt_status); 5510 ibd_tx_cleanup(state, node); 5511 } 5512 5513 tx_head = NULL; 5514 for (;;) { 5515 if (tx_head == NULL) { 5516 mutex_enter(&state->id_txpost_lock); 5517 tx_head = state->id_tx_head; 5518 if (tx_head == NULL) { 5519 state->id_tx_busy = 0; 5520 mutex_exit(&state->id_txpost_lock); 5521 return; 5522 } 5523 state->id_tx_head = NULL; 5524 mutex_exit(&state->id_txpost_lock); 5525 } 5526 5527 /* 5528 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs 5529 * at a time if possible, and keep posting them. 
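 * The loop exits only after id_tx_head is found to be empty while
 * id_txpost_lock is held, at which point id_tx_busy is cleared so that
 * the next sender can take over posting.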
5530 */ 5531 for (n_wrs = 0, elem = tx_head; 5532 (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE); 5533 elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) { 5534 nodes[n_wrs] = elem; 5535 wrs[n_wrs] = elem->w_swr; 5536 } 5537 tx_head = elem; 5538 5539 ASSERT(n_wrs != 0); 5540 5541 /* 5542 * If posting fails for some reason, we'll never receive 5543 * completion intimation, so we'll need to cleanup. But 5544 * we need to make sure we don't clean up nodes whose 5545 * wrs have been successfully posted. We assume that the 5546 * hca driver returns on the first failure to post and 5547 * therefore the first 'num_posted' entries don't need 5548 * cleanup here. 5549 */ 5550 num_posted = 0; 5551 ibt_status = ibt_post_send(state->id_chnl_hdl, 5552 wrs, n_wrs, &num_posted); 5553 if (ibt_status != IBT_SUCCESS) { 5554 ibd_print_warn(state, "ibd_post_send: " 5555 "posting multiple wrs failed: " 5556 "requested=%d, done=%d, ret=%d", 5557 n_wrs, num_posted, ibt_status); 5558 5559 for (i = num_posted; i < n_wrs; i++) 5560 ibd_tx_cleanup(state, nodes[i]); 5561 } 5562 } 5563 } 5564 5565 static int 5566 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node, 5567 uint_t lsohdr_sz) 5568 { 5569 ibt_wr_ds_t *sgl; 5570 ibt_status_t ibt_status; 5571 mblk_t *nmp; 5572 mblk_t *data_mp; 5573 uchar_t *bufp; 5574 size_t blksize; 5575 size_t skip; 5576 size_t avail; 5577 uint_t pktsize; 5578 uint_t frag_len; 5579 uint_t pending_hdr; 5580 int nmblks; 5581 int i; 5582 5583 /* 5584 * Let's skip ahead to the data if this is LSO 5585 */ 5586 data_mp = mp; 5587 pending_hdr = 0; 5588 if (lsohdr_sz) { 5589 pending_hdr = lsohdr_sz; 5590 for (nmp = mp; nmp; nmp = nmp->b_cont) { 5591 frag_len = nmp->b_wptr - nmp->b_rptr; 5592 if (frag_len > pending_hdr) 5593 break; 5594 pending_hdr -= frag_len; 5595 } 5596 data_mp = nmp; /* start of data past lso header */ 5597 ASSERT(data_mp != NULL); 5598 } 5599 5600 /* 5601 * Calculate the size of message data and number of msg blocks 5602 */ 5603 pktsize = 0; 5604 for (nmblks = 0, nmp = data_mp; nmp != NULL; 5605 nmp = nmp->b_cont, nmblks++) { 5606 pktsize += MBLKL(nmp); 5607 } 5608 pktsize -= pending_hdr; 5609 5610 /* 5611 * We only do ibt_map_mem_iov() if the pktsize is above the 5612 * "copy-threshold", and if the number of mp fragments is less than 5613 * the maximum acceptable. 
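 * Otherwise we fall through to one of the two copy paths below: the
 * swqe's pre-mapped copybuf for packets up to id_tx_buf_sz, or a chain
 * of pre-mapped LSO buffers for anything larger.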
5614 */ 5615 if ((state->id_hca_res_lkey_capab) && 5616 (pktsize > IBD_TX_COPY_THRESH) && 5617 (nmblks < state->id_max_sqseg_hiwm)) { 5618 ibt_iov_t iov_arr[IBD_MAX_SQSEG]; 5619 ibt_iov_attr_t iov_attr; 5620 5621 iov_attr.iov_as = NULL; 5622 iov_attr.iov = iov_arr; 5623 iov_attr.iov_buf = NULL; 5624 iov_attr.iov_list_len = nmblks; 5625 iov_attr.iov_wr_nds = state->id_max_sqseg; 5626 iov_attr.iov_lso_hdr_sz = lsohdr_sz; 5627 iov_attr.iov_flags = IBT_IOV_SLEEP; 5628 5629 for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) { 5630 iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr; 5631 iov_arr[i].iov_len = MBLKL(nmp); 5632 if (i == 0) { 5633 iov_arr[i].iov_addr += pending_hdr; 5634 iov_arr[i].iov_len -= pending_hdr; 5635 } 5636 } 5637 5638 node->w_buftype = IBD_WQE_MAPPED; 5639 node->w_swr.wr_sgl = node->w_sgl; 5640 5641 ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr, 5642 (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl); 5643 if (ibt_status != IBT_SUCCESS) { 5644 ibd_print_warn(state, "ibd_send: ibt_map_mem_iov " 5645 "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status); 5646 goto ibd_copy_path; 5647 } 5648 5649 return (0); 5650 } 5651 5652 ibd_copy_path: 5653 if (pktsize <= state->id_tx_buf_sz) { 5654 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 5655 node->w_swr.wr_nds = 1; 5656 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 5657 node->w_buftype = IBD_WQE_TXBUF; 5658 5659 /* 5660 * Even though this is the copy path for transfers less than 5661 * id_tx_buf_sz, it could still be an LSO packet. If so, it 5662 * is possible the first data mblk fragment (data_mp) still 5663 * contains part of the LSO header that we need to skip. 5664 */ 5665 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 5666 for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) { 5667 blksize = MBLKL(nmp) - pending_hdr; 5668 bcopy(nmp->b_rptr + pending_hdr, bufp, blksize); 5669 bufp += blksize; 5670 pending_hdr = 0; 5671 } 5672 5673 return (0); 5674 } 5675 5676 /* 5677 * Copy path for transfers greater than id_tx_buf_sz 5678 */ 5679 node->w_swr.wr_sgl = node->w_sgl; 5680 if (ibd_acquire_lsobufs(state, pktsize, 5681 node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) { 5682 DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed"); 5683 return (-1); 5684 } 5685 node->w_buftype = IBD_WQE_LSOBUF; 5686 5687 /* 5688 * Copy the larger-than-id_tx_buf_sz packet into a set of 5689 * fixed-sized, pre-mapped LSO buffers. Note that we might 5690 * need to skip part of the LSO header in the first fragment 5691 * as before. 5692 */ 5693 nmp = data_mp; 5694 skip = pending_hdr; 5695 for (i = 0; i < node->w_swr.wr_nds; i++) { 5696 sgl = node->w_swr.wr_sgl + i; 5697 bufp = (uchar_t *)(uintptr_t)sgl->ds_va; 5698 avail = IBD_LSO_BUFSZ; 5699 while (nmp && avail) { 5700 blksize = MBLKL(nmp) - skip; 5701 if (blksize > avail) { 5702 bcopy(nmp->b_rptr + skip, bufp, avail); 5703 skip += avail; 5704 avail = 0; 5705 } else { 5706 bcopy(nmp->b_rptr + skip, bufp, blksize); 5707 skip = 0; 5708 avail -= blksize; 5709 bufp += blksize; 5710 nmp = nmp->b_cont; 5711 } 5712 } 5713 } 5714 5715 return (0); 5716 } 5717 5718 /* 5719 * Schedule a completion queue polling to reap the resource we're 5720 * short on. If we implement the change to reap tx completions 5721 * in a separate thread, we'll need to wake up that thread here. 
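 *
 * Typical caller pattern (editorial sketch): when ibd_send() runs out of
 * send wqes it records the shortage and either relies on a later send
 * completion to reap,
 *
 *	if (ibd_sched_poll(state, IBD_RSRC_SWQE, 0) == 0)
 *		return (B_FALSE);
 *
 * or, on paths where no completion can be counted on to fire (the
 * failed-AH-lookup and LSO-buffer-shortage paths in ibd_send()), passes
 * a non-zero q_flag so the async thread performs the poll instead.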
5722 */ 5723 static int 5724 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag) 5725 { 5726 ibd_req_t *req; 5727 5728 mutex_enter(&state->id_sched_lock); 5729 state->id_sched_needed |= resource_type; 5730 mutex_exit(&state->id_sched_lock); 5731 5732 /* 5733 * If we are asked to queue a work entry, we need to do it 5734 */ 5735 if (q_flag) { 5736 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 5737 if (req == NULL) 5738 return (-1); 5739 5740 ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED); 5741 } 5742 5743 return (0); 5744 } 5745 5746 /* 5747 * The passed in packet has this format: 5748 * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data 5749 */ 5750 static boolean_t 5751 ibd_send(ibd_state_t *state, mblk_t *mp) 5752 { 5753 ibd_ace_t *ace; 5754 ibd_swqe_t *node; 5755 ipoib_mac_t *dest; 5756 ib_header_info_t *ipibp; 5757 ip6_t *ip6h; 5758 uint_t pktsize; 5759 uint32_t mss; 5760 uint32_t hckflags; 5761 uint32_t lsoflags = 0; 5762 uint_t lsohdr_sz = 0; 5763 int ret, len; 5764 boolean_t dofree = B_FALSE; 5765 boolean_t rc; 5766 /* if (rc_chan == NULL) send by UD; else send by RC; */ 5767 ibd_rc_chan_t *rc_chan; 5768 int nmblks; 5769 mblk_t *nmp; 5770 5771 /* 5772 * If we aren't done with the device initialization and start, 5773 * we shouldn't be here. 5774 */ 5775 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 5776 return (B_FALSE); 5777 5778 /* 5779 * Obtain an address handle for the destination. 5780 */ 5781 ipibp = (ib_header_info_t *)mp->b_rptr; 5782 dest = (ipoib_mac_t *)&ipibp->ib_dst; 5783 if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 5784 IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey); 5785 5786 rc_chan = NULL; 5787 ace = ibd_acache_lookup(state, dest, &ret, 1); 5788 if (state->id_enable_rc && (ace != NULL) && 5789 (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN))) { 5790 if (ace->ac_chan == NULL) { 5791 state->rc_null_conn++; 5792 } else { 5793 if (ace->ac_chan->chan_state == 5794 IBD_RC_STATE_ACT_ESTAB) { 5795 rc_chan = ace->ac_chan; 5796 mutex_enter(&rc_chan->tx_wqe_list.dl_mutex); 5797 node = WQE_TO_SWQE( 5798 rc_chan->tx_wqe_list.dl_head); 5799 if (node != NULL) { 5800 rc_chan->tx_wqe_list.dl_cnt -= 1; 5801 rc_chan->tx_wqe_list.dl_head = 5802 node->swqe_next; 5803 } else { 5804 node = ibd_rc_acquire_swqes(rc_chan); 5805 } 5806 mutex_exit(&rc_chan->tx_wqe_list.dl_mutex); 5807 5808 if (node == NULL) { 5809 state->rc_swqe_short++; 5810 mutex_enter(&state->id_sched_lock); 5811 state->id_sched_needed |= 5812 IBD_RSRC_RC_SWQE; 5813 mutex_exit(&state->id_sched_lock); 5814 ibd_dec_ref_ace(state, ace); 5815 return (B_FALSE); 5816 } 5817 } else { 5818 state->rc_no_estab_conn++; 5819 } 5820 } 5821 } 5822 5823 if (rc_chan == NULL) { 5824 mutex_enter(&state->id_tx_list.dl_mutex); 5825 node = WQE_TO_SWQE(state->id_tx_list.dl_head); 5826 if (node != NULL) { 5827 state->id_tx_list.dl_cnt -= 1; 5828 state->id_tx_list.dl_head = node->swqe_next; 5829 } else { 5830 node = ibd_acquire_swqe(state); 5831 } 5832 mutex_exit(&state->id_tx_list.dl_mutex); 5833 if (node == NULL) { 5834 /* 5835 * If we don't have an swqe available, schedule a 5836 * transmit completion queue cleanup and hold off on 5837 * sending more packets until we have some free swqes 5838 */ 5839 if (ibd_sched_poll(state, IBD_RSRC_SWQE, 0) == 0) { 5840 if (ace != NULL) { 5841 ibd_dec_ref_ace(state, ace); 5842 } 5843 return (B_FALSE); 5844 } 5845 5846 /* 5847 * If a poll cannot be scheduled, we have no choice but 5848 * to drop this packet 5849 */ 5850 ibd_print_warn(state, "ibd_send: no swqe, 
pkt drop"); 5851 if (ace != NULL) { 5852 ibd_dec_ref_ace(state, ace); 5853 } 5854 return (B_TRUE); 5855 } 5856 } 5857 5858 /* 5859 * Initialize the commonly used fields in swqe to NULL to protect 5860 * against ibd_tx_cleanup accidentally misinterpreting these on a 5861 * failure. 5862 */ 5863 node->swqe_im_mblk = NULL; 5864 node->w_swr.wr_nds = 0; 5865 node->w_swr.wr_sgl = NULL; 5866 node->w_swr.wr_opcode = IBT_WRC_SEND; 5867 5868 /* 5869 * Calculate the size of message data and number of msg blocks 5870 */ 5871 pktsize = 0; 5872 for (nmblks = 0, nmp = mp; nmp != NULL; 5873 nmp = nmp->b_cont, nmblks++) { 5874 pktsize += MBLKL(nmp); 5875 } 5876 5877 if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) 5878 atomic_inc_64(&state->id_brd_xmt); 5879 else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 5880 atomic_inc_64(&state->id_multi_xmt); 5881 5882 if (ace != NULL) { 5883 node->w_ahandle = ace; 5884 node->w_swr.wr.ud.udwr_dest = ace->ac_dest; 5885 } else { 5886 DPRINT(5, 5887 "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X", 5888 ((ret == EFAULT) ? "failed" : "queued"), 5889 htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]), 5890 htonl(dest->ipoib_gidpref[1]), 5891 htonl(dest->ipoib_gidsuff[0]), 5892 htonl(dest->ipoib_gidsuff[1])); 5893 state->rc_ace_not_found++; 5894 node->w_ahandle = NULL; 5895 5896 /* 5897 * Here if ibd_acache_lookup() returns EFAULT, it means ibd 5898 * can not find a path for the specific dest address. We 5899 * should get rid of this kind of packet. We also should get 5900 * rid of the packet if we cannot schedule a poll via the 5901 * async thread. For the normal case, ibd will return the 5902 * packet to upper layer and wait for AH creating. 5903 * 5904 * Note that we always queue a work slot entry for the async 5905 * thread when we fail AH lookup (even in intr mode); this is 5906 * due to the convoluted way the code currently looks for AH. 5907 */ 5908 if (ret == EFAULT) { 5909 dofree = B_TRUE; 5910 rc = B_TRUE; 5911 } else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) { 5912 dofree = B_TRUE; 5913 rc = B_TRUE; 5914 } else { 5915 dofree = B_FALSE; 5916 rc = B_FALSE; 5917 } 5918 goto ibd_send_fail; 5919 } 5920 5921 /* 5922 * For ND6 packets, padding is at the front of the source lladdr. 5923 * Insert the padding at front. 
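     *
     * Editorial note: an IPoIB link-layer address is 20 bytes, so the
     * NS/NA source/target lladdr option occupies 3 units of 8 bytes:
     *
     *	| type | len | 2 pad bytes | 20-byte IPoIB lladdr |
     *
     * The inet6 stack is unaware of the 2 pad bytes, so on the send side
     * IBD_PAD_NSNA() inserts them (the small pad mblk linked below
     * supplies the extra room), and ibd_process_rx() strips them again
     * on receive.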
     */
    if (ntohs(ipibp->ipib_rhdr.ipoib_type) == ETHERTYPE_IPV6) {
        if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) {
            if (!pullupmsg(mp, IPV6_HDR_LEN +
                sizeof (ib_header_info_t))) {
                DPRINT(10, "ibd_send: pullupmsg failure ");
                dofree = B_TRUE;
                rc = B_TRUE;
                goto ibd_send_fail;
            }
            ipibp = (ib_header_info_t *)mp->b_rptr;
        }
        ip6h = (ip6_t *)((uchar_t *)ipibp +
            sizeof (ib_header_info_t));
        len = ntohs(ip6h->ip6_plen);
        if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
            mblk_t *pad;

            /* allocb() may fail; bail out like the pullupmsg() cases */
            if ((pad = allocb(4, 0)) == NULL) {
                DPRINT(10, "ibd_send: allocb failure ");
                dofree = B_TRUE;
                rc = B_TRUE;
                goto ibd_send_fail;
            }
            pad->b_wptr = (uchar_t *)pad->b_rptr + 4;
            linkb(mp, pad);
            if (MBLKL(mp) < sizeof (ib_header_info_t) +
                IPV6_HDR_LEN + len + 4) {
                if (!pullupmsg(mp, sizeof (ib_header_info_t) +
                    IPV6_HDR_LEN + len + 4)) {
                    DPRINT(10, "ibd_send: pullupmsg failure ");
                    dofree = B_TRUE;
                    rc = B_TRUE;
                    goto ibd_send_fail;
                }
                ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
                    sizeof (ib_header_info_t));
            }

            /* LINTED: E_CONSTANT_CONDITION */
            IBD_PAD_NSNA(ip6h, len, IBD_SEND);
        }
    }

    ASSERT(mp->b_wptr - mp->b_rptr >= sizeof (ib_addrs_t));
    mp->b_rptr += sizeof (ib_addrs_t);
    pktsize -= sizeof (ib_addrs_t);

    if (rc_chan) {	/* send in RC mode */
        ibt_iov_t iov_arr[IBD_MAX_SQSEG];
        ibt_iov_attr_t iov_attr;
        uint_t i;
        size_t blksize;
        uchar_t *bufp;
        ibd_rc_tx_largebuf_t *lbufp;

        atomic_add_64(&state->rc_xmt_bytes, pktsize);

        /*
         * The upper layer has already done the Tx checksum; no checksum
         * work is needed here.
         */
        ASSERT(node->w_swr.wr_trans == IBT_RC_SRV);

        /*
         * We only do ibt_map_mem_iov() if the pktsize is above
         * the "copy-threshold", and if the number of mp
         * fragments is less than the maximum acceptable.
         */
        if (pktsize <= ibd_rc_tx_copy_thresh) {
            atomic_inc_64(&state->rc_xmt_small_pkt);
            /*
             * Only process unicast packets in Reliable Connected
             * mode.
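             *
             * Editorial sketch of the three RC buffer strategies that
             * follow; the code is authoritative:
             *
             *	pktsize <= ibd_rc_tx_copy_thresh
             *		-> bcopy into the swqe copybuf (IBD_WQE_TXBUF)
             *	else if rc_enable_iov_map && nmblks < rc_max_sqseg_hiwm
             *		-> ibt_map_mem_iov() (IBD_WQE_MAPPED)
             *	else
             *		-> bcopy into one pre-registered buffer from
             *		   rc_tx_largebuf_free_head (IBD_WQE_RC_COPYBUF)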
             */
            node->swqe_copybuf.ic_sgl.ds_len = pktsize;
            node->w_swr.wr_nds = 1;
            node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
            node->w_buftype = IBD_WQE_TXBUF;

            bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
            for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
                blksize = MBLKL(nmp);
                bcopy(nmp->b_rptr, bufp, blksize);
                bufp += blksize;
            }
            freemsg(mp);
            ASSERT(node->swqe_im_mblk == NULL);
        } else {
            if ((state->rc_enable_iov_map) &&
                (nmblks < state->rc_max_sqseg_hiwm)) {

                /* do ibt_map_mem_iov() */
                iov_attr.iov_as = NULL;
                iov_attr.iov = iov_arr;
                iov_attr.iov_buf = NULL;
                iov_attr.iov_wr_nds = state->rc_tx_max_sqseg;
                iov_attr.iov_lso_hdr_sz = 0;
                iov_attr.iov_flags = IBT_IOV_SLEEP;

                i = 0;
                for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
                    iov_arr[i].iov_len = MBLKL(nmp);
                    if (iov_arr[i].iov_len != 0) {
                        iov_arr[i].iov_addr =
                            (caddr_t)(void *)nmp->b_rptr;
                        i++;
                    }
                }
                iov_attr.iov_list_len = i;
                node->w_swr.wr_sgl = node->w_sgl;

                ret = ibt_map_mem_iov(state->id_hca_hdl,
                    &iov_attr, (ibt_all_wr_t *)&node->w_swr,
                    &node->w_mi_hdl);
                if (ret != IBT_SUCCESS) {
                    atomic_inc_64(&state->rc_xmt_map_fail_pkt);
                    DPRINT(30, "ibd_send: ibt_map_mem_iov() failed, "
                        "nmblks=%d, real_nmblks=%d, ret=0x%x",
                        nmblks, i, ret);
                    goto ibd_rc_large_copy;
                }

                atomic_inc_64(&state->rc_xmt_map_succ_pkt);
                node->w_buftype = IBD_WQE_MAPPED;
                node->swqe_im_mblk = mp;
            } else {
                atomic_inc_64(&state->rc_xmt_fragmented_pkt);
ibd_rc_large_copy:
                mutex_enter(&state->rc_tx_large_bufs_lock);
                if (state->rc_tx_largebuf_nfree == 0) {
                    state->rc_xmt_buf_short++;
                    mutex_exit(&state->rc_tx_large_bufs_lock);
                    mutex_enter(&state->id_sched_lock);
                    state->id_sched_needed |=
                        IBD_RSRC_RC_TX_LARGEBUF;
                    mutex_exit(&state->id_sched_lock);
                    dofree = B_FALSE;
                    rc = B_FALSE;
                    /*
                     * If we don't have Tx large bufs, return
                     * failure; node->w_buftype must
                     * not be left as IBD_WQE_RC_COPYBUF, otherwise
                     * ibd_rc_tx_cleanup() would try to return a large
                     * buffer that was never acquired.
                     */
                    node->w_buftype = IBD_WQE_TXBUF;
                    goto ibd_send_fail;
                }

                lbufp = state->rc_tx_largebuf_free_head;
                ASSERT(lbufp->lb_buf != NULL);
                state->rc_tx_largebuf_free_head = lbufp->lb_next;
                lbufp->lb_next = NULL;
                /* Update nfree count */
                state->rc_tx_largebuf_nfree--;
                mutex_exit(&state->rc_tx_large_bufs_lock);
                bufp = lbufp->lb_buf;
                node->w_sgl[0].ds_va = (ib_vaddr_t)(uintptr_t)bufp;
                node->w_sgl[0].ds_key = state->rc_tx_mr_desc.md_lkey;
                node->w_sgl[0].ds_len = pktsize;
                node->w_swr.wr_sgl = node->w_sgl;
                node->w_swr.wr_nds = 1;
                node->w_buftype = IBD_WQE_RC_COPYBUF;
                node->w_rc_tx_largebuf = lbufp;

                for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
                    blksize = MBLKL(nmp);
                    if (blksize != 0) {
                        bcopy(nmp->b_rptr, bufp, blksize);
                        bufp += blksize;
                    }
                }
                freemsg(mp);
                ASSERT(node->swqe_im_mblk == NULL);
            }
        }

        node->swqe_next = NULL;
        mutex_enter(&rc_chan->tx_post_lock);
        if (rc_chan->tx_busy) {
            if (rc_chan->tx_head) {
                rc_chan->tx_tail->swqe_next = SWQE_TO_WQE(node);
            } else {
                rc_chan->tx_head = node;
            }
            rc_chan->tx_tail = node;
            mutex_exit(&rc_chan->tx_post_lock);
        } else {
            rc_chan->tx_busy = 1;
            mutex_exit(&rc_chan->tx_post_lock);
            ibd_rc_post_send(rc_chan, node);
        }

        return (B_TRUE);
    }	/* send by RC */

    if ((state->id_enable_rc) && (pktsize > state->id_mtu)) {
        /*
         * The packet is too long.  GLD should never pass down a packet
         * larger than state->id_mtu + sizeof (ib_addrs_t).
         */
        if (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN)) {
            ibd_req_t *req;

            mutex_enter(&ace->tx_too_big_mutex);
            if (ace->tx_too_big_ongoing) {
                mutex_exit(&ace->tx_too_big_mutex);
                state->rc_xmt_reenter_too_long_pkt++;
                dofree = B_TRUE;
            } else {
                ace->tx_too_big_ongoing = B_TRUE;
                mutex_exit(&ace->tx_too_big_mutex);
                state->rc_xmt_icmp_too_long_pkt++;

                req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
                if (req == NULL) {
                    ibd_print_warn(state, "ibd_send: ibd_req_t "
                        "allocation failed");
                    /* Drop it. */
                    dofree = B_TRUE;
                } else {
                    req->rq_ptr = mp;
                    req->rq_ptr2 = ace;
                    ibd_queue_work_slot(state, req,
                        IBD_ASYNC_RC_TOO_BIG);
                    dofree = B_FALSE;
                }
            }
        } else {
            ibd_print_warn(state, "Reliable Connected mode is on. "
                "Multicast packet length %d exceeds the %d byte limit, "
                "drop it", pktsize, state->id_mtu);
            state->rc_xmt_drop_too_long_pkt++;
            /* Drop it. */
            dofree = B_TRUE;
        }
        rc = B_TRUE;
        goto ibd_send_fail;
    }

    atomic_add_64(&state->id_xmt_bytes, pktsize);
    atomic_inc_64(&state->id_xmt_pkt);

    /*
     * Do LSO and checksum related work here.  For an LSO send, set the
     * ud destination, the opcode and the LSO header information in the
     * work request.
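     *
     * Editorial sketch of what the LSO branch below ends up putting in
     * the work request; ibd_setup_lso() does the actual work:
     *
     *	lso_info_get(mp, &mss, &lsoflags);
     *	if (lsoflags & HW_LSO) {
     *		wr.ud_lso.lso_mss    = mss
     *		wr.ud_lso.lso_hdr    = IPoIB + IP + TCP headers
     *		wr.ud_lso.lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len
     *		wr_opcode            = IBT_WRC_SEND_LSO
     *		(ud destination taken from ace->ac_dest)
     *	}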
6177 */ 6178 lso_info_get(mp, &mss, &lsoflags); 6179 if ((lsoflags & HW_LSO) != HW_LSO) { 6180 node->w_swr.wr_opcode = IBT_WRC_SEND; 6181 lsohdr_sz = 0; 6182 } else { 6183 if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) { 6184 /* 6185 * The routine can only fail if there's no memory; we 6186 * can only drop the packet if this happens 6187 */ 6188 ibd_print_warn(state, 6189 "ibd_send: no memory, lso posting failed"); 6190 dofree = B_TRUE; 6191 rc = B_TRUE; 6192 goto ibd_send_fail; 6193 } 6194 6195 node->w_swr.wr_opcode = IBT_WRC_SEND_LSO; 6196 lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz; 6197 } 6198 6199 hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &hckflags); 6200 if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM) 6201 node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM; 6202 else 6203 node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM; 6204 6205 /* 6206 * Prepare the sgl for posting; the routine can only fail if there's 6207 * no lso buf available for posting. If this is the case, we should 6208 * probably resched for lso bufs to become available and then try again. 6209 */ 6210 if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) { 6211 if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) { 6212 dofree = B_TRUE; 6213 rc = B_TRUE; 6214 } else { 6215 dofree = B_FALSE; 6216 rc = B_FALSE; 6217 } 6218 goto ibd_send_fail; 6219 } 6220 node->swqe_im_mblk = mp; 6221 6222 /* 6223 * Queue the wqe to hardware; since we can now simply queue a 6224 * post instead of doing it serially, we cannot assume anything 6225 * about the 'node' after ibd_post_send() returns. 6226 */ 6227 node->swqe_next = NULL; 6228 6229 mutex_enter(&state->id_txpost_lock); 6230 if (state->id_tx_busy) { 6231 if (state->id_tx_head) { 6232 state->id_tx_tail->swqe_next = 6233 SWQE_TO_WQE(node); 6234 } else { 6235 state->id_tx_head = node; 6236 } 6237 state->id_tx_tail = node; 6238 mutex_exit(&state->id_txpost_lock); 6239 } else { 6240 state->id_tx_busy = 1; 6241 mutex_exit(&state->id_txpost_lock); 6242 ibd_post_send(state, node); 6243 } 6244 6245 return (B_TRUE); 6246 6247 ibd_send_fail: 6248 if (node && mp) 6249 ibd_free_lsohdr(node, mp); 6250 6251 if (dofree) 6252 freemsg(mp); 6253 6254 if (node != NULL) { 6255 if (rc_chan) { 6256 ibd_rc_tx_cleanup(node); 6257 } else { 6258 ibd_tx_cleanup(state, node); 6259 } 6260 } 6261 6262 return (rc); 6263 } 6264 6265 /* 6266 * GLDv3 entry point for transmitting datagram. 6267 */ 6268 static mblk_t * 6269 ibd_m_tx(void *arg, mblk_t *mp) 6270 { 6271 ibd_state_t *state = (ibd_state_t *)arg; 6272 mblk_t *next; 6273 6274 if (state->id_link_state != LINK_STATE_UP) { 6275 freemsgchain(mp); 6276 mp = NULL; 6277 } 6278 6279 while (mp != NULL) { 6280 next = mp->b_next; 6281 mp->b_next = NULL; 6282 if (ibd_send(state, mp) == B_FALSE) { 6283 /* Send fail */ 6284 mp->b_next = next; 6285 break; 6286 } 6287 mp = next; 6288 } 6289 6290 return (mp); 6291 } 6292 6293 /* 6294 * this handles Tx and Rx completions. With separate CQs, this handles 6295 * only Rx completions. 
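 *
 * Editorial sketch of the receive completion path, end to end:
 *
 *	RCQ event / rx softintr
 *	    -> ibd_intr()
 *		-> ibd_poll_rcq(state, state->id_rcq_hdl)
 *		    -> ibd_drain_rcq()        until the CQ is empty
 *			-> ibd_process_rx()   one mblk per work completion
 *			-> mac_rx()           chain handed to GLDv3
 *
 * The transmit analogue is ibd_tx_recycle() -> ibd_poll_scq() ->
 * ibd_drain_scq(), further below.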
6296 */ 6297 static uint_t 6298 ibd_intr(caddr_t arg) 6299 { 6300 ibd_state_t *state = (ibd_state_t *)arg; 6301 6302 ibd_poll_rcq(state, state->id_rcq_hdl); 6303 6304 return (DDI_INTR_CLAIMED); 6305 } 6306 6307 /* 6308 * Poll and fully drain the send cq 6309 */ 6310 static void 6311 ibd_drain_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 6312 { 6313 ibt_wc_t *wcs = state->id_txwcs; 6314 uint_t numwcs = state->id_txwcs_size; 6315 ibd_wqe_t *wqe; 6316 ibd_swqe_t *head, *tail; 6317 ibt_wc_t *wc; 6318 uint_t num_polled; 6319 int i; 6320 6321 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) { 6322 head = tail = NULL; 6323 for (i = 0, wc = wcs; i < num_polled; i++, wc++) { 6324 wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id; 6325 if (wc->wc_status != IBT_WC_SUCCESS) { 6326 /* 6327 * Channel being torn down. 6328 */ 6329 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { 6330 DPRINT(5, "ibd_drain_scq: flush error"); 6331 DPRINT(10, "ibd_drain_scq: Bad " 6332 "status %d", wc->wc_status); 6333 } else { 6334 DPRINT(10, "ibd_drain_scq: " 6335 "unexpected wc_status %d", 6336 wc->wc_status); 6337 } 6338 /* 6339 * Fallthrough to invoke the Tx handler to 6340 * release held resources, e.g., AH refcount. 6341 */ 6342 } 6343 /* 6344 * Add this swqe to the list to be cleaned up. 6345 */ 6346 if (head) 6347 tail->swqe_next = wqe; 6348 else 6349 head = WQE_TO_SWQE(wqe); 6350 tail = WQE_TO_SWQE(wqe); 6351 } 6352 tail->swqe_next = NULL; 6353 ibd_tx_cleanup_list(state, head, tail); 6354 6355 /* 6356 * Resume any blocked transmissions if possible 6357 */ 6358 ibd_resume_transmission(state); 6359 } 6360 } 6361 6362 /* 6363 * Poll and fully drain the receive cq 6364 */ 6365 static void 6366 ibd_drain_rcq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 6367 { 6368 ibt_wc_t *wcs = state->id_rxwcs; 6369 uint_t numwcs = state->id_rxwcs_size; 6370 ibd_rwqe_t *rwqe; 6371 ibt_wc_t *wc; 6372 uint_t num_polled; 6373 int i; 6374 mblk_t *head, *tail, *mp; 6375 6376 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) { 6377 head = tail = NULL; 6378 for (i = 0, wc = wcs; i < num_polled; i++, wc++) { 6379 rwqe = (ibd_rwqe_t *)(uintptr_t)wc->wc_id; 6380 if (wc->wc_status != IBT_WC_SUCCESS) { 6381 /* 6382 * Channel being torn down. 6383 */ 6384 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { 6385 DPRINT(5, "ibd_drain_rcq: " 6386 "expected flushed rwqe"); 6387 } else { 6388 DPRINT(5, "ibd_drain_rcq: " 6389 "unexpected wc_status %d", 6390 wc->wc_status); 6391 } 6392 atomic_inc_32( 6393 &state->id_rx_list.dl_bufs_outstanding); 6394 freemsg(rwqe->rwqe_im_mblk); 6395 continue; 6396 } 6397 mp = ibd_process_rx(state, rwqe, wc); 6398 if (mp == NULL) 6399 continue; 6400 6401 /* 6402 * Add this mp to the list to send to the nw layer. 6403 */ 6404 if (head) 6405 tail->b_next = mp; 6406 else 6407 head = mp; 6408 tail = mp; 6409 } 6410 if (head) 6411 mac_rx(state->id_mh, state->id_rh, head); 6412 6413 /* 6414 * Account for #rwqes polled. 6415 * Post more here, if less than one fourth full. 6416 */ 6417 if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, -num_polled) < 6418 (state->id_num_rwqe / 4)) 6419 ibd_post_recv_intr(state); 6420 } 6421 } 6422 6423 /* 6424 * Common code for interrupt handling as well as for polling 6425 * for all completed wqe's while detaching. 
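 *
 * Editorial note on the drain / enable-notify / drain-again loop below:
 * a completion arriving after the final ibt_poll_cq() but before
 * ibt_enable_cq_notify() would generate no event and could sit in the
 * CQ indefinitely, so the CQ is drained once more after re-arming:
 *
 *	do {
 *		(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
 *		ibd_drain_scq(state, cq_hdl);
 *		...clear redo_flag, or clear flag and stop, under
 *		   id_scq_poll_lock...
 *	} while (redo);
 *
 * The redo_flag also folds a polling request that raced with the current
 * poller into one extra pass instead of spinning up a second poller.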
6426 */ 6427 static void 6428 ibd_poll_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 6429 { 6430 int flag, redo_flag; 6431 int redo = 1; 6432 6433 flag = IBD_CQ_POLLING; 6434 redo_flag = IBD_REDO_CQ_POLLING; 6435 6436 mutex_enter(&state->id_scq_poll_lock); 6437 if (state->id_scq_poll_busy & flag) { 6438 ibd_print_warn(state, "ibd_poll_scq: multiple polling threads"); 6439 state->id_scq_poll_busy |= redo_flag; 6440 mutex_exit(&state->id_scq_poll_lock); 6441 return; 6442 } 6443 state->id_scq_poll_busy |= flag; 6444 mutex_exit(&state->id_scq_poll_lock); 6445 6446 /* 6447 * In some cases (eg detaching), this code can be invoked on 6448 * any cpu after disabling cq notification (thus no concurrency 6449 * exists). Apart from that, the following applies normally: 6450 * Transmit completion handling could be from any cpu if 6451 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ 6452 * is interrupt driven. 6453 */ 6454 6455 /* 6456 * Poll and drain the CQ 6457 */ 6458 ibd_drain_scq(state, cq_hdl); 6459 6460 /* 6461 * Enable CQ notifications and redrain the cq to catch any 6462 * completions we might have missed after the ibd_drain_scq() 6463 * above and before the ibt_enable_cq_notify() that follows. 6464 * Finally, service any new requests to poll the cq that 6465 * could've come in after the ibt_enable_cq_notify(). 6466 */ 6467 do { 6468 if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) != 6469 IBT_SUCCESS) { 6470 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); 6471 } 6472 6473 ibd_drain_scq(state, cq_hdl); 6474 6475 mutex_enter(&state->id_scq_poll_lock); 6476 if (state->id_scq_poll_busy & redo_flag) 6477 state->id_scq_poll_busy &= ~redo_flag; 6478 else { 6479 state->id_scq_poll_busy &= ~flag; 6480 redo = 0; 6481 } 6482 mutex_exit(&state->id_scq_poll_lock); 6483 6484 } while (redo); 6485 } 6486 6487 /* 6488 * Common code for interrupt handling as well as for polling 6489 * for all completed wqe's while detaching. 6490 */ 6491 static void 6492 ibd_poll_rcq(ibd_state_t *state, ibt_cq_hdl_t rcq) 6493 { 6494 int flag, redo_flag; 6495 int redo = 1; 6496 6497 flag = IBD_CQ_POLLING; 6498 redo_flag = IBD_REDO_CQ_POLLING; 6499 6500 mutex_enter(&state->id_rcq_poll_lock); 6501 if (state->id_rcq_poll_busy & flag) { 6502 ibd_print_warn(state, "ibd_poll_rcq: multiple polling threads"); 6503 state->id_rcq_poll_busy |= redo_flag; 6504 mutex_exit(&state->id_rcq_poll_lock); 6505 return; 6506 } 6507 state->id_rcq_poll_busy |= flag; 6508 mutex_exit(&state->id_rcq_poll_lock); 6509 6510 /* 6511 * Poll and drain the CQ 6512 */ 6513 ibd_drain_rcq(state, rcq); 6514 6515 /* 6516 * Enable CQ notifications and redrain the cq to catch any 6517 * completions we might have missed after the ibd_drain_cq() 6518 * above and before the ibt_enable_cq_notify() that follows. 6519 * Finally, service any new requests to poll the cq that 6520 * could've come in after the ibt_enable_cq_notify(). 6521 */ 6522 do { 6523 if (ibt_enable_cq_notify(rcq, IBT_NEXT_COMPLETION) != 6524 IBT_SUCCESS) { 6525 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); 6526 } 6527 6528 ibd_drain_rcq(state, rcq); 6529 6530 mutex_enter(&state->id_rcq_poll_lock); 6531 if (state->id_rcq_poll_busy & redo_flag) 6532 state->id_rcq_poll_busy &= ~redo_flag; 6533 else { 6534 state->id_rcq_poll_busy &= ~flag; 6535 redo = 0; 6536 } 6537 mutex_exit(&state->id_rcq_poll_lock); 6538 6539 } while (redo); 6540 } 6541 6542 /* 6543 * Unmap the memory area associated with a given swqe. 
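 *
 * Editorial note: this is only meaningful for swqes whose sgl was set up
 * with ibt_map_mem_iov(), i.e. w_buftype == IBD_WQE_MAPPED; the cleanup
 * paths check that before calling here:
 *
 *	if (swqe->w_buftype == IBD_WQE_MAPPED)
 *		ibd_unmap_mem(state, swqe);
 *
 * Copy-buffer sends keep their pre-registered buffers and need no unmap.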
6544 */ 6545 void 6546 ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe) 6547 { 6548 ibt_status_t stat; 6549 6550 DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds); 6551 6552 if (swqe->w_mi_hdl) { 6553 if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl, 6554 swqe->w_mi_hdl)) != IBT_SUCCESS) { 6555 DPRINT(10, 6556 "failed in ibt_unmap_mem_iov, ret=%d\n", stat); 6557 } 6558 swqe->w_mi_hdl = NULL; 6559 } 6560 swqe->w_swr.wr_nds = 0; 6561 } 6562 6563 void 6564 ibd_dec_ref_ace(ibd_state_t *state, ibd_ace_t *ace) 6565 { 6566 /* 6567 * The recycling logic can be eliminated from here 6568 * and put into the async thread if we create another 6569 * list to hold ACE's for unjoined mcg's. 6570 */ 6571 if (DEC_REF_DO_CYCLE(ace)) { 6572 ibd_mce_t *mce; 6573 6574 /* 6575 * Check with the lock taken: we decremented 6576 * reference count without the lock, and some 6577 * transmitter might already have bumped the 6578 * reference count (possible in case of multicast 6579 * disable when we leave the AH on the active 6580 * list). If not still 0, get out, leaving the 6581 * recycle bit intact. 6582 * 6583 * Atomically transition the AH from active 6584 * to free list, and queue a work request to 6585 * leave the group and destroy the mce. No 6586 * transmitter can be looking at the AH or 6587 * the MCE in between, since we have the 6588 * ac_mutex lock. In the SendOnly reap case, 6589 * it is not necessary to hold the ac_mutex 6590 * and recheck the ref count (since the AH was 6591 * taken off the active list), we just do it 6592 * to have uniform processing with the Full 6593 * reap case. 6594 */ 6595 mutex_enter(&state->id_ac_mutex); 6596 mce = ace->ac_mce; 6597 if (GET_REF_CYCLE(ace) == 0) { 6598 CLEAR_REFCYCLE(ace); 6599 /* 6600 * Identify the case of fullmember reap as 6601 * opposed to mcg trap reap. Also, port up 6602 * might set ac_mce to NULL to indicate Tx 6603 * cleanup should do no more than put the 6604 * AH in the free list (see ibd_async_link). 6605 */ 6606 if (mce != NULL) { 6607 ace->ac_mce = NULL; 6608 IBD_ACACHE_PULLOUT_ACTIVE(state, ace); 6609 /* 6610 * mc_req was initialized at mce 6611 * creation time. 6612 */ 6613 ibd_queue_work_slot(state, 6614 &mce->mc_req, IBD_ASYNC_REAP); 6615 } 6616 IBD_ACACHE_INSERT_FREE(state, ace); 6617 } 6618 mutex_exit(&state->id_ac_mutex); 6619 } 6620 } 6621 6622 /* 6623 * Common code that deals with clean ups after a successful or 6624 * erroneous transmission attempt. 6625 */ 6626 static void 6627 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe) 6628 { 6629 ibd_ace_t *ace = swqe->w_ahandle; 6630 6631 DPRINT(20, "ibd_tx_cleanup %p\n", swqe); 6632 6633 /* 6634 * If this was a dynamic mapping in ibd_send(), we need to 6635 * unmap here. If this was an lso buffer we'd used for sending, 6636 * we need to release the lso buf to the pool, since the resource 6637 * is scarce. However, if this was simply a normal send using 6638 * the copybuf (present in each swqe), we don't need to release it. 6639 */ 6640 if (swqe->swqe_im_mblk != NULL) { 6641 if (swqe->w_buftype == IBD_WQE_MAPPED) { 6642 ibd_unmap_mem(state, swqe); 6643 } else if (swqe->w_buftype == IBD_WQE_LSOBUF) { 6644 ibd_release_lsobufs(state, 6645 swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds); 6646 } 6647 ibd_free_lsohdr(swqe, swqe->swqe_im_mblk); 6648 freemsg(swqe->swqe_im_mblk); 6649 swqe->swqe_im_mblk = NULL; 6650 } 6651 6652 /* 6653 * Drop the reference count on the AH; it can be reused 6654 * now for a different destination if there are no more 6655 * posted sends that will use it. 
This can be eliminated 6656 * if we can always associate each Tx buffer with an AH. 6657 * The ace can be null if we are cleaning up from the 6658 * ibd_send() error path. 6659 */ 6660 if (ace != NULL) { 6661 ibd_dec_ref_ace(state, ace); 6662 } 6663 6664 /* 6665 * Release the send wqe for reuse. 6666 */ 6667 swqe->swqe_next = NULL; 6668 ibd_release_swqe(state, swqe, swqe, 1); 6669 } 6670 6671 static void 6672 ibd_tx_cleanup_list(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail) 6673 { 6674 ibd_ace_t *ace; 6675 ibd_swqe_t *swqe; 6676 int n = 0; 6677 6678 DPRINT(20, "ibd_tx_cleanup_list %p %p\n", head, tail); 6679 6680 for (swqe = head; swqe != NULL; swqe = WQE_TO_SWQE(swqe->swqe_next)) { 6681 6682 /* 6683 * If this was a dynamic mapping in ibd_send(), we need to 6684 * unmap here. If this was an lso buffer we'd used for sending, 6685 * we need to release the lso buf to the pool, since the 6686 * resource is scarce. However, if this was simply a normal 6687 * send using the copybuf (present in each swqe), we don't need 6688 * to release it. 6689 */ 6690 if (swqe->swqe_im_mblk != NULL) { 6691 if (swqe->w_buftype == IBD_WQE_MAPPED) { 6692 ibd_unmap_mem(state, swqe); 6693 } else if (swqe->w_buftype == IBD_WQE_LSOBUF) { 6694 ibd_release_lsobufs(state, 6695 swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds); 6696 } 6697 ibd_free_lsohdr(swqe, swqe->swqe_im_mblk); 6698 freemsg(swqe->swqe_im_mblk); 6699 swqe->swqe_im_mblk = NULL; 6700 } 6701 6702 /* 6703 * Drop the reference count on the AH; it can be reused 6704 * now for a different destination if there are no more 6705 * posted sends that will use it. This can be eliminated 6706 * if we can always associate each Tx buffer with an AH. 6707 * The ace can be null if we are cleaning up from the 6708 * ibd_send() error path. 6709 */ 6710 ace = swqe->w_ahandle; 6711 if (ace != NULL) { 6712 ibd_dec_ref_ace(state, ace); 6713 } 6714 n++; 6715 } 6716 6717 /* 6718 * Release the send wqes for reuse. 6719 */ 6720 ibd_release_swqe(state, head, tail, n); 6721 } 6722 6723 /* 6724 * Processing to be done after receipt of a packet; hand off to GLD 6725 * in the format expected by GLD. The received packet has this 6726 * format: 2b sap :: 00 :: data. 6727 */ 6728 static mblk_t * 6729 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc) 6730 { 6731 ib_header_info_t *phdr; 6732 mblk_t *mp; 6733 ipoib_hdr_t *ipibp; 6734 ipha_t *iphap; 6735 ip6_t *ip6h; 6736 int len; 6737 ib_msglen_t pkt_len = wc->wc_bytes_xfer; 6738 uint32_t bufs; 6739 6740 /* 6741 * Track number handed to upper layer that need to be returned. 6742 */ 6743 bufs = atomic_inc_32_nv(&state->id_rx_list.dl_bufs_outstanding); 6744 6745 /* Never run out of rwqes, use allocb when running low */ 6746 if (bufs >= state->id_rx_bufs_outstanding_limit) { 6747 atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding); 6748 atomic_inc_32(&state->id_rx_allocb); 6749 mp = allocb(pkt_len, BPRI_HI); 6750 if (mp) { 6751 bcopy(rwqe->rwqe_im_mblk->b_rptr, mp->b_rptr, pkt_len); 6752 ibd_post_recv(state, rwqe); 6753 } else { /* no memory */ 6754 atomic_inc_32(&state->id_rx_allocb_failed); 6755 ibd_post_recv(state, rwqe); 6756 return (NULL); 6757 } 6758 } else { 6759 mp = rwqe->rwqe_im_mblk; 6760 } 6761 6762 6763 /* 6764 * Adjust write pointer depending on how much data came in. 6765 */ 6766 mp->b_wptr = mp->b_rptr + pkt_len; 6767 6768 /* 6769 * Make sure this is NULL or we're in trouble. 
6770 */ 6771 if (mp->b_next != NULL) { 6772 ibd_print_warn(state, 6773 "ibd_process_rx: got duplicate mp from rcq?"); 6774 mp->b_next = NULL; 6775 } 6776 6777 /* 6778 * the IB link will deliver one of the IB link layer 6779 * headers called, the Global Routing Header (GRH). 6780 * ibd driver uses the information in GRH to build the 6781 * Header_info structure and pass it with the datagram up 6782 * to GLDv3. 6783 * If the GRH is not valid, indicate to GLDv3 by setting 6784 * the VerTcFlow field to 0. 6785 */ 6786 phdr = (ib_header_info_t *)mp->b_rptr; 6787 if (wc->wc_flags & IBT_WC_GRH_PRESENT) { 6788 phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn); 6789 6790 /* if it is loop back packet, just drop it. */ 6791 if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr, 6792 IPOIB_ADDRL) == 0) { 6793 freemsg(mp); 6794 return (NULL); 6795 } 6796 6797 ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src, 6798 sizeof (ipoib_mac_t)); 6799 if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) { 6800 phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN); 6801 IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst); 6802 } else { 6803 phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn; 6804 } 6805 } else { 6806 /* 6807 * It can not be a IBA multicast packet. Must have been 6808 * unicast for us. Just copy the interface address to dst. 6809 */ 6810 phdr->ib_grh.ipoib_vertcflow = 0; 6811 ovbcopy(&state->id_macaddr, &phdr->ib_dst, 6812 sizeof (ipoib_mac_t)); 6813 } 6814 6815 /* 6816 * For ND6 packets, padding is at the front of the source/target 6817 * lladdr. However the inet6 layer is not aware of it, hence remove 6818 * the padding from such packets. 6819 */ 6820 ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t)); 6821 if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) { 6822 ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t)); 6823 len = ntohs(ip6h->ip6_plen); 6824 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 6825 /* LINTED: E_CONSTANT_CONDITION */ 6826 IBD_PAD_NSNA(ip6h, len, IBD_RECV); 6827 } 6828 } 6829 6830 /* 6831 * Update statistics 6832 */ 6833 atomic_add_64(&state->id_rcv_bytes, pkt_len); 6834 atomic_inc_64(&state->id_rcv_pkt); 6835 if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) 6836 atomic_inc_64(&state->id_brd_rcv); 6837 else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 6838 atomic_inc_64(&state->id_multi_rcv); 6839 6840 iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t)); 6841 /* 6842 * Set receive checksum status in mp 6843 * Hardware checksumming can be considered valid only if: 6844 * 1. CQE.IP_OK bit is set 6845 * 2. CQE.CKSUM = 0xffff 6846 * 3. IPv6 routing header is not present in the packet 6847 * 4. If there are no IP_OPTIONS in the IP HEADER 6848 */ 6849 6850 if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) && 6851 (wc->wc_cksum == 0xFFFF) && 6852 (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) { 6853 (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0, 6854 HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0); 6855 } 6856 6857 return (mp); 6858 } 6859 6860 /* 6861 * Callback code invoked from STREAMs when the receive data buffer is 6862 * free for recycling. 6863 */ 6864 static void 6865 ibd_freemsg_cb(char *arg) 6866 { 6867 ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg; 6868 ibd_state_t *state = rwqe->w_state; 6869 6870 atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding); 6871 6872 /* 6873 * If the driver is stopped, just free the rwqe. 
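     *
     * Editorial note on the recycle loop: every rx buffer is lent
     * upstream as a desballoc'd mblk whose free routine points back
     * here, so the normal round trip is
     *
     *	ibd_post_recv() -> rx completion -> mac_rx() -> upstream freeb()
     *	    -> ibd_freemsg_cb() -> desballoc() -> ibd_post_recv()
     *
     * Only a stopped interface, or a desballoc() failure below, breaks
     * the cycle by handing the rwqe to ibd_free_rwqe().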
6874 */ 6875 if (atomic_add_32_nv(&state->id_running, 0) == 0) { 6876 DPRINT(6, "ibd_freemsg: wqe being freed"); 6877 rwqe->rwqe_im_mblk = NULL; 6878 ibd_free_rwqe(state, rwqe); 6879 return; 6880 } 6881 6882 rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, 6883 state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb); 6884 if (rwqe->rwqe_im_mblk == NULL) { 6885 ibd_free_rwqe(state, rwqe); 6886 DPRINT(6, "ibd_freemsg: desballoc failed"); 6887 return; 6888 } 6889 6890 ibd_post_recv(state, rwqe); 6891 } 6892 6893 static uint_t 6894 ibd_tx_recycle(caddr_t arg) 6895 { 6896 ibd_state_t *state = (ibd_state_t *)arg; 6897 6898 /* 6899 * Poll for completed entries 6900 */ 6901 ibd_poll_scq(state, state->id_scq_hdl); 6902 6903 return (DDI_INTR_CLAIMED); 6904 } 6905 6906 #ifdef IBD_LOGGING 6907 static void 6908 ibd_log_init(void) 6909 { 6910 ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP); 6911 ibd_lbuf_ndx = 0; 6912 6913 mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL); 6914 } 6915 6916 static void 6917 ibd_log_fini(void) 6918 { 6919 if (ibd_lbuf) 6920 kmem_free(ibd_lbuf, IBD_LOG_SZ); 6921 ibd_lbuf_ndx = 0; 6922 ibd_lbuf = NULL; 6923 6924 mutex_destroy(&ibd_lbuf_lock); 6925 } 6926 6927 static void 6928 ibd_log(const char *fmt, ...) 6929 { 6930 va_list ap; 6931 uint32_t off; 6932 uint32_t msglen; 6933 char tmpbuf[IBD_DMAX_LINE]; 6934 6935 if (ibd_lbuf == NULL) 6936 return; 6937 6938 va_start(ap, fmt); 6939 msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap); 6940 va_end(ap); 6941 6942 if (msglen >= IBD_DMAX_LINE) 6943 msglen = IBD_DMAX_LINE - 1; 6944 6945 mutex_enter(&ibd_lbuf_lock); 6946 6947 off = ibd_lbuf_ndx; /* current msg should go here */ 6948 if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n')) 6949 ibd_lbuf[ibd_lbuf_ndx-1] = '\n'; 6950 6951 ibd_lbuf_ndx += msglen; /* place where next msg should start */ 6952 ibd_lbuf[ibd_lbuf_ndx] = 0; /* current msg should terminate */ 6953 6954 if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE)) 6955 ibd_lbuf_ndx = 0; 6956 6957 mutex_exit(&ibd_lbuf_lock); 6958 6959 bcopy(tmpbuf, ibd_lbuf+off, msglen); /* no lock needed for this */ 6960 } 6961 #endif 6962
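/*
 * Usage note for the IBD_LOGGING facility above (editorial sketch, not part
 * of the driver proper): when built with -DIBD_LOGGING, debug call sites in
 * this file can append printf-style records to the circular ibd_lbuf, e.g.
 *
 *	ibd_log("ibd_post_send: chan=%p n_wrs=%d", (void *)chan, n_wrs);
 *
 * (the call site and its arguments here are hypothetical).  Records longer
 * than IBD_DMAX_LINE are truncated by the vsnprintf() in ibd_log(), and
 * ibd_lbuf_ndx marks where the next record will land, which is useful when
 * examining the buffer from a crash dump with mdb.
 */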