/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * An implementation of the IPoIB standard based on PSARC 2001/289.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/dlpi.h>
#include <sys/mac_provider.h>

#include <sys/pattr.h>		/* for HCK_FULLCKSUM */
#include <sys/sysmacros.h>	/* for offsetof */
#include <sys/disp.h>		/* for async thread pri */
#include <sys/atomic.h>		/* for atomic_add*() */
#include <sys/ethernet.h>	/* for ETHERTYPE_IPV6 */
#include <netinet/in.h>		/* for netinet/ip.h below */
#include <netinet/ip.h>		/* for struct ip */
#include <netinet/udp.h>	/* for struct udphdr */
#include <inet/common.h>	/* for inet/ip.h below */
#include <inet/ip.h>		/* for ipha_t */
#include <inet/ip6.h>		/* for ip6_t */
#include <inet/tcp.h>		/* for tcph_t */
#include <netinet/icmp6.h>	/* for icmp6_t */
#include <sys/callb.h>
#include <sys/modhash.h>

#include <sys/ib/clients/ibd/ibd.h>
#include <sys/ib/mgt/sm_attr.h>	/* for SM_INIT_TYPE_* */
#include <sys/note.h>
#include <sys/multidata.h>

#include <sys/ib/mgt/ibmf/ibmf.h>	/* for ibd_get_portspeed */

/*
 * Per-interface tunables (for developers)
 *
 * ibd_tx_copy_thresh
 *     This sets the threshold at which ibd will attempt to do a bcopy of the
 *     outgoing data into a pre-mapped buffer.  The IPoIB driver's send
 *     behavior is restricted by various parameters, so this value should
 *     only be changed after careful consideration.  For instance, IB HCAs
 *     currently impose a relatively small limit (when compared to ethernet
 *     NICs) on the length of the SGL for transmit.  On the other hand, the
 *     ip stack could send down mp chains that are quite long when LSO is
 *     enabled.
 *
 * ibd_num_swqe
 *     Number of "send WQE" elements that will be allocated and used by ibd.
 *     When tuning this parameter, the size of the pre-allocated, pre-mapped
 *     copy buffer in each of these send wqes must be taken into account.
 *     This copy buffer size is determined by the value of IBD_TX_BUF_SZ
 *     (currently set to the same value as ibd_tx_copy_thresh, but it may be
 *     changed independently if needed).
 *
 * ibd_num_rwqe
 *     Number of "receive WQE" elements that will be allocated and used by
 *     ibd.  This parameter is limited by the maximum channel size of the
 *     HCA.  Each buffer in the receive wqe will be of MTU size.
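 *
 *     (Illustrative sizing note, using the defaults below: 4000 send wqes
 *     with 0x1000-byte copy buffers pre-map roughly 16 MB of Tx copy space
 *     per interface, and the receive side scales similarly with
 *     ibd_num_rwqe times the link MTU.)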
 *
 * ibd_num_lso_bufs
 *     Number of "larger-than-MTU" copy buffers to use for cases when the
 *     outgoing mblk chain is too fragmented to be used with
 *     ibt_map_mem_iov() and too large to be used with regular MTU-sized
 *     copy buffers.  It is not recommended to tune this variable without
 *     understanding the application environment and/or memory resources.
 *     The size of each of these lso buffers is determined by the value of
 *     IBD_LSO_BUFSZ.
 *
 * ibd_num_ah
 *     Number of AH cache entries to allocate
 *
 * ibd_hash_size
 *     Hash table size for the active AH list
 *
 * ibd_tx_softintr
 * ibd_rx_softintr
 *     The softintr mechanism allows ibd to avoid event queue overflows if
 *     the receive/completion handlers are expensive.  These are enabled
 *     by default.
 *
 * ibd_log_sz
 *     This specifies the size of the ibd log buffer in bytes.  The buffer is
 *     allocated and logging is enabled only when IBD_LOGGING is defined.
 *
 */
uint_t ibd_tx_copy_thresh = 0x1000;
uint_t ibd_num_swqe = 4000;
uint_t ibd_num_rwqe = 4000;
uint_t ibd_num_lso_bufs = 0x400;
uint_t ibd_num_ah = 256;
uint_t ibd_hash_size = 32;
uint_t ibd_rx_softintr = 1;
uint_t ibd_tx_softintr = 1;
uint_t ibd_create_broadcast_group = 1;
#ifdef IBD_LOGGING
uint_t ibd_log_sz = 0x20000;
#endif

#define	IBD_TX_COPY_THRESH		ibd_tx_copy_thresh
#define	IBD_TX_BUF_SZ			ibd_tx_copy_thresh
#define	IBD_NUM_SWQE			ibd_num_swqe
#define	IBD_NUM_RWQE			ibd_num_rwqe
#define	IBD_NUM_LSO_BUFS		ibd_num_lso_bufs
#define	IBD_NUM_AH			ibd_num_ah
#define	IBD_HASH_SIZE			ibd_hash_size
#ifdef IBD_LOGGING
#define	IBD_LOG_SZ			ibd_log_sz
#endif

/*
 * ibd_rc_tx_copy_thresh
 *     This sets the threshold up to which ibd will attempt to do a bcopy of
 *     the outgoing data into a pre-mapped buffer.
 */
uint_t ibd_rc_tx_copy_thresh = 0x1000;

/*
 * Receive CQ moderation parameters: tunable (for developers)
 */
uint_t ibd_rxcomp_count = 4;
uint_t ibd_rxcomp_usec = 10;

/*
 * Send CQ moderation parameters: tunable (for developers)
 */
uint_t ibd_txcomp_count = 16;
uint_t ibd_txcomp_usec = 300;

/*
 * Post IBD_RX_POST_CNT receive work requests at a time.
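 * (With the defaults that follow, receives are replenished in batches of
 * 8 across 1 << 4 = 16 rx post queues, so IBD_RWQE_MIN works out to
 * (8 << 4) * 4 = 512 receive work requests kept outstanding at minimum.)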
*/ 160 #define IBD_RX_POST_CNT 8 161 162 /* Hash into 1 << IBD_LOG_RX_POST number of rx post queues */ 163 #define IBD_LOG_RX_POST 4 164 165 /* Minimum number of receive work requests driver needs to always have */ 166 #define IBD_RWQE_MIN ((IBD_RX_POST_CNT << IBD_LOG_RX_POST) * 4) 167 168 /* 169 * LSO parameters 170 */ 171 #define IBD_LSO_MAXLEN 65536 172 #define IBD_LSO_BUFSZ 8192 173 #define IBD_PROP_LSO_POLICY "lso-policy" 174 175 /* 176 * Async operation states 177 */ 178 #define IBD_OP_NOTSTARTED 0 179 #define IBD_OP_ONGOING 1 180 #define IBD_OP_COMPLETED 2 181 #define IBD_OP_ERRORED 3 182 #define IBD_OP_ROUTERED 4 183 184 /* 185 * State of IBD driver initialization during attach/m_start 186 */ 187 #define IBD_DRV_STATE_INITIALIZED 0x00001 188 #define IBD_DRV_RXINTR_ADDED 0x00002 189 #define IBD_DRV_TXINTR_ADDED 0x00004 190 #define IBD_DRV_IBTL_ATTACH_DONE 0x00008 191 #define IBD_DRV_HCA_OPENED 0x00010 192 #define IBD_DRV_PD_ALLOCD 0x00020 193 #define IBD_DRV_MAC_REGISTERED 0x00040 194 #define IBD_DRV_PORT_DETAILS_OBTAINED 0x00080 195 #define IBD_DRV_BCAST_GROUP_FOUND 0x00100 196 #define IBD_DRV_ACACHE_INITIALIZED 0x00200 197 #define IBD_DRV_CQS_ALLOCD 0x00400 198 #define IBD_DRV_UD_CHANNEL_SETUP 0x00800 199 #define IBD_DRV_TXLIST_ALLOCD 0x01000 200 #define IBD_DRV_SCQ_NOTIFY_ENABLED 0x02000 201 #define IBD_DRV_RXLIST_ALLOCD 0x04000 202 #define IBD_DRV_BCAST_GROUP_JOINED 0x08000 203 #define IBD_DRV_ASYNC_THR_CREATED 0x10000 204 #define IBD_DRV_RCQ_NOTIFY_ENABLED 0x20000 205 #define IBD_DRV_SM_NOTICES_REGISTERED 0x40000 206 #define IBD_DRV_STARTED 0x80000 207 #define IBD_DRV_RC_SRQ_ALLOCD 0x100000 208 #define IBD_DRV_RC_LARGEBUF_ALLOCD 0x200000 209 #define IBD_DRV_RC_LISTEN 0x400000 210 #ifdef DEBUG 211 #define IBD_DRV_RC_PRIVATE_STATE 0x800000 212 #endif 213 214 /* 215 * Start/stop in-progress flags; note that restart must always remain 216 * the OR of start and stop flag values. 
217 */ 218 #define IBD_DRV_START_IN_PROGRESS 0x10000000 219 #define IBD_DRV_STOP_IN_PROGRESS 0x20000000 220 #define IBD_DRV_RESTART_IN_PROGRESS 0x30000000 221 222 /* 223 * Miscellaneous constants 224 */ 225 #define IB_MGID_IPV4_LOWGRP_MASK 0xFFFFFFFF 226 #define IBD_DEF_MAX_SDU 2044 227 #define IBD_DEFAULT_QKEY 0xB1B 228 #ifdef IBD_LOGGING 229 #define IBD_DMAX_LINE 100 230 #endif 231 232 /* 233 * Enumerations for link states 234 */ 235 typedef enum { 236 IBD_LINK_DOWN, 237 IBD_LINK_UP, 238 IBD_LINK_UP_ABSENT 239 } ibd_link_op_t; 240 241 /* 242 * Driver State Pointer 243 */ 244 void *ibd_list; 245 246 /* 247 * Driver Global Data 248 */ 249 ibd_global_state_t ibd_gstate; 250 251 /* 252 * Logging 253 */ 254 #ifdef IBD_LOGGING 255 kmutex_t ibd_lbuf_lock; 256 uint8_t *ibd_lbuf; 257 uint32_t ibd_lbuf_ndx; 258 #endif 259 260 /* 261 * Required system entry points 262 */ 263 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); 264 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); 265 266 /* 267 * Required driver entry points for GLDv3 268 */ 269 static int ibd_m_stat(void *, uint_t, uint64_t *); 270 static int ibd_m_start(void *); 271 static void ibd_m_stop(void *); 272 static int ibd_m_promisc(void *, boolean_t); 273 static int ibd_m_multicst(void *, boolean_t, const uint8_t *); 274 static int ibd_m_unicst(void *, const uint8_t *); 275 static mblk_t *ibd_m_tx(void *, mblk_t *); 276 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *); 277 278 /* 279 * Private driver entry points for GLDv3 280 */ 281 282 /* 283 * Initialization 284 */ 285 static int ibd_state_init(ibd_state_t *, dev_info_t *); 286 static int ibd_init_txlist(ibd_state_t *); 287 static int ibd_init_rxlist(ibd_state_t *); 288 static int ibd_acache_init(ibd_state_t *); 289 #ifdef IBD_LOGGING 290 static void ibd_log_init(void); 291 #endif 292 293 /* 294 * Termination/cleanup 295 */ 296 static void ibd_state_fini(ibd_state_t *); 297 static void ibd_fini_txlist(ibd_state_t *); 298 static void ibd_fini_rxlist(ibd_state_t *); 299 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *); 300 static void ibd_tx_cleanup_list(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *); 301 static void ibd_acache_fini(ibd_state_t *); 302 #ifdef IBD_LOGGING 303 static void ibd_log_fini(void); 304 #endif 305 306 /* 307 * Allocation/acquire/map routines 308 */ 309 static int ibd_alloc_tx_copybufs(ibd_state_t *); 310 static int ibd_alloc_rx_copybufs(ibd_state_t *); 311 static int ibd_alloc_tx_lsobufs(ibd_state_t *); 312 static ibd_swqe_t *ibd_acquire_swqe(ibd_state_t *); 313 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *, 314 uint32_t *); 315 316 /* 317 * Free/release/unmap routines 318 */ 319 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *); 320 static void ibd_free_tx_copybufs(ibd_state_t *); 321 static void ibd_free_rx_copybufs(ibd_state_t *); 322 static void ibd_free_rx_rsrcs(ibd_state_t *); 323 static void ibd_free_tx_lsobufs(ibd_state_t *); 324 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *, int); 325 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t); 326 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *); 327 328 /* 329 * Handlers/callback routines 330 */ 331 static uint_t ibd_intr(caddr_t); 332 static uint_t ibd_tx_recycle(caddr_t); 333 static void ibd_rcq_handler(ibt_cq_hdl_t, void *); 334 static void ibd_scq_handler(ibt_cq_hdl_t, void *); 335 static void ibd_poll_rcq(ibd_state_t *, ibt_cq_hdl_t); 336 static void ibd_poll_scq(ibd_state_t *, ibt_cq_hdl_t); 
337 static void ibd_drain_rcq(ibd_state_t *, ibt_cq_hdl_t); 338 static void ibd_drain_scq(ibd_state_t *, ibt_cq_hdl_t); 339 static void ibd_freemsg_cb(char *); 340 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t, 341 ibt_async_event_t *); 342 static void ibd_snet_notices_handler(void *, ib_gid_t, 343 ibt_subnet_event_code_t, ibt_subnet_event_t *); 344 345 /* 346 * Send/receive routines 347 */ 348 static boolean_t ibd_send(ibd_state_t *, mblk_t *); 349 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *); 350 static void ibd_post_recv(ibd_state_t *, ibd_rwqe_t *); 351 static mblk_t *ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *); 352 353 /* 354 * Threads 355 */ 356 static void ibd_async_work(ibd_state_t *); 357 358 /* 359 * Async tasks 360 */ 361 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *); 362 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int); 363 static void ibd_async_setprom(ibd_state_t *); 364 static void ibd_async_unsetprom(ibd_state_t *); 365 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t); 366 static void ibd_async_trap(ibd_state_t *, ibd_req_t *); 367 static void ibd_async_txsched(ibd_state_t *); 368 static void ibd_async_link(ibd_state_t *, ibd_req_t *); 369 370 /* 371 * Async task helpers 372 */ 373 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *); 374 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t); 375 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *); 376 static boolean_t ibd_get_allroutergroup(ibd_state_t *, 377 ipoib_mac_t *, ipoib_mac_t *); 378 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t); 379 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *); 380 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *); 381 static ibt_status_t ibd_find_bgroup(ibd_state_t *); 382 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *); 383 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t); 384 static uint64_t ibd_get_portspeed(ibd_state_t *); 385 static boolean_t ibd_async_safe(ibd_state_t *); 386 static void ibd_async_done(ibd_state_t *); 387 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int); 388 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *); 389 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t); 390 static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *); 391 392 /* 393 * Helpers for attach/start routines 394 */ 395 static int ibd_register_mac(ibd_state_t *, dev_info_t *); 396 static int ibd_record_capab(ibd_state_t *, dev_info_t *); 397 static int ibd_unattach(ibd_state_t *, dev_info_t *); 398 static int ibd_get_port_details(ibd_state_t *); 399 static int ibd_alloc_cqs(ibd_state_t *); 400 static int ibd_setup_ud_channel(ibd_state_t *); 401 static int ibd_start(ibd_state_t *); 402 static int ibd_undo_start(ibd_state_t *, link_state_t); 403 static void ibd_set_mac_progress(ibd_state_t *, uint_t); 404 static void ibd_clr_mac_progress(ibd_state_t *, uint_t); 405 406 407 /* 408 * Miscellaneous helpers 409 */ 410 static int ibd_sched_poll(ibd_state_t *, int, int); 411 static void ibd_resume_transmission(ibd_state_t *); 412 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t); 413 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t); 414 static void *list_get_head(list_t *); 415 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t); 416 static uint_t ibd_hash_by_id(void *, 
mod_hash_key_t); 417 #ifdef IBD_LOGGING 418 static void ibd_log(const char *, ...); 419 #endif 420 421 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach, 422 nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed); 423 424 /* Module Driver Info */ 425 static struct modldrv ibd_modldrv = { 426 &mod_driverops, /* This one is a driver */ 427 "InfiniBand GLDv3 Driver", /* short description */ 428 &ibd_dev_ops /* driver specific ops */ 429 }; 430 431 /* Module Linkage */ 432 static struct modlinkage ibd_modlinkage = { 433 MODREV_1, (void *)&ibd_modldrv, NULL 434 }; 435 436 /* 437 * Module (static) info passed to IBTL during ibt_attach 438 */ 439 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = { 440 IBTI_V_CURR, 441 IBT_NETWORK, 442 ibd_async_handler, 443 NULL, 444 "IPIB" 445 }; 446 447 /* 448 * GLDv3 entry points 449 */ 450 #define IBD_M_CALLBACK_FLAGS (MC_GETCAPAB) 451 static mac_callbacks_t ibd_m_callbacks = { 452 IBD_M_CALLBACK_FLAGS, 453 ibd_m_stat, 454 ibd_m_start, 455 ibd_m_stop, 456 ibd_m_promisc, 457 ibd_m_multicst, 458 ibd_m_unicst, 459 ibd_m_tx, 460 NULL, 461 NULL, 462 ibd_m_getcapab 463 }; 464 465 /* 466 * Fill/clear <scope> and <p_key> in multicast/broadcast address 467 */ 468 #define IBD_FILL_SCOPE_PKEY(maddr, scope, pkey) \ 469 { \ 470 *(uint32_t *)((char *)(maddr) + 4) |= \ 471 htonl((uint32_t)(scope) << 16); \ 472 *(uint32_t *)((char *)(maddr) + 8) |= \ 473 htonl((uint32_t)(pkey) << 16); \ 474 } 475 476 #define IBD_CLEAR_SCOPE_PKEY(maddr) \ 477 { \ 478 *(uint32_t *)((char *)(maddr) + 4) &= \ 479 htonl(~((uint32_t)0xF << 16)); \ 480 *(uint32_t *)((char *)(maddr) + 8) &= \ 481 htonl(~((uint32_t)0xFFFF << 16)); \ 482 } 483 484 /* 485 * Rudimentary debugging support 486 */ 487 #ifdef DEBUG 488 int ibd_debuglevel = 100; 489 void 490 debug_print(int l, char *fmt, ...) 491 { 492 va_list ap; 493 494 if (l < ibd_debuglevel) 495 return; 496 va_start(ap, fmt); 497 vcmn_err(CE_CONT, fmt, ap); 498 va_end(ap); 499 } 500 #endif 501 502 /* 503 * Common routine to print warning messages; adds in hca guid, port number 504 * and pkey to be able to identify the IBA interface. 505 */ 506 void 507 ibd_print_warn(ibd_state_t *state, char *fmt, ...) 508 { 509 ib_guid_t hca_guid; 510 char ibd_print_buf[256]; 511 int len; 512 va_list ap; 513 514 hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip, 515 0, "hca-guid", 0); 516 len = snprintf(ibd_print_buf, sizeof (ibd_print_buf), 517 "%s%d: HCA GUID %016llx port %d PKEY %02x ", 518 ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip), 519 (u_longlong_t)hca_guid, state->id_port, state->id_pkey); 520 va_start(ap, fmt); 521 (void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len, 522 fmt, ap); 523 cmn_err(CE_NOTE, "!%s", ibd_print_buf); 524 va_end(ap); 525 } 526 527 /* 528 * Warlock directives 529 */ 530 531 /* 532 * id_lso_lock 533 * 534 * state->id_lso->bkt_nfree may be accessed without a lock to 535 * determine the threshold at which we have to ask the nw layer 536 * to resume transmission (see ibd_resume_transmission()). 
537 */ 538 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock, 539 ibd_state_t::id_lso)) 540 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso)) 541 _NOTE(SCHEME_PROTECTS_DATA("init", ibd_state_t::id_lso_policy)) 542 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree)) 543 544 /* 545 * id_scq_poll_lock 546 */ 547 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_scq_poll_lock, 548 ibd_state_t::id_scq_poll_busy)) 549 550 /* 551 * id_txpost_lock 552 */ 553 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 554 ibd_state_t::id_tx_head)) 555 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 556 ibd_state_t::id_tx_busy)) 557 558 /* 559 * id_acache_req_lock 560 */ 561 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 562 ibd_state_t::id_acache_req_cv)) 563 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 564 ibd_state_t::id_req_list)) 565 _NOTE(SCHEME_PROTECTS_DATA("atomic", 566 ibd_acache_s::ac_ref)) 567 568 /* 569 * id_ac_mutex 570 * 571 * This mutex is actually supposed to protect id_ah_op as well, 572 * but this path of the code isn't clean (see update of id_ah_op 573 * in ibd_async_acache(), immediately after the call to 574 * ibd_async_mcache()). For now, we'll skip this check by 575 * declaring that id_ah_op is protected by some internal scheme 576 * that warlock isn't aware of. 577 */ 578 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 579 ibd_state_t::id_ah_active)) 580 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 581 ibd_state_t::id_ah_free)) 582 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 583 ibd_state_t::id_ah_addr)) 584 _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this", 585 ibd_state_t::id_ah_op)) 586 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 587 ibd_state_t::id_ah_error)) 588 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 589 ibd_state_t::id_ac_hot_ace)) 590 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error)) 591 592 /* 593 * id_mc_mutex 594 */ 595 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 596 ibd_state_t::id_mc_full)) 597 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 598 ibd_state_t::id_mc_non)) 599 600 /* 601 * id_trap_lock 602 */ 603 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 604 ibd_state_t::id_trap_cv)) 605 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 606 ibd_state_t::id_trap_stop)) 607 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 608 ibd_state_t::id_trap_inprog)) 609 610 /* 611 * id_prom_op 612 */ 613 _NOTE(SCHEME_PROTECTS_DATA("only by async thread", 614 ibd_state_t::id_prom_op)) 615 616 /* 617 * id_sched_lock 618 */ 619 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock, 620 ibd_state_t::id_sched_needed)) 621 622 /* 623 * id_link_mutex 624 */ 625 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex, 626 ibd_state_t::id_link_state)) 627 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state)) 628 _NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start", 629 ibd_state_t::id_link_speed)) 630 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_sgid)) 631 632 /* 633 * id_tx_list.dl_mutex 634 */ 635 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 636 ibd_state_t::id_tx_list.dl_head)) 637 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 638 ibd_state_t::id_tx_list.dl_pending_sends)) 639 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 640 ibd_state_t::id_tx_list.dl_cnt)) 641 642 /* 643 * id_rx_list.dl_mutex 644 */ 645 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 646 
ibd_state_t::id_rx_list.dl_bufs_outstanding)) 647 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 648 ibd_state_t::id_rx_list.dl_cnt)) 649 650 651 /* 652 * Items protected by atomic updates 653 */ 654 _NOTE(SCHEME_PROTECTS_DATA("atomic update only", 655 ibd_state_s::id_brd_rcv 656 ibd_state_s::id_brd_xmt 657 ibd_state_s::id_multi_rcv 658 ibd_state_s::id_multi_xmt 659 ibd_state_s::id_num_intrs 660 ibd_state_s::id_rcv_bytes 661 ibd_state_s::id_rcv_pkt 662 ibd_state_s::id_rx_post_queue_index 663 ibd_state_s::id_tx_short 664 ibd_state_s::id_xmt_bytes 665 ibd_state_s::id_xmt_pkt 666 ibd_state_s::rc_rcv_trans_byte 667 ibd_state_s::rc_rcv_trans_pkt 668 ibd_state_s::rc_rcv_copy_byte 669 ibd_state_s::rc_rcv_copy_pkt 670 ibd_state_s::rc_xmt_bytes 671 ibd_state_s::rc_xmt_small_pkt 672 ibd_state_s::rc_xmt_fragmented_pkt 673 ibd_state_s::rc_xmt_map_fail_pkt 674 ibd_state_s::rc_xmt_map_succ_pkt)) 675 676 /* 677 * Non-mutex protection schemes for data elements. Almost all of 678 * these are non-shared items. 679 */ 680 _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded", 681 callb_cpr 682 ib_gid_s 683 ib_header_info 684 ibd_acache_rq 685 ibd_acache_s::ac_mce 686 ibd_acache_s::ac_chan 687 ibd_mcache::mc_fullreap 688 ibd_mcache::mc_jstate 689 ibd_mcache::mc_req 690 ibd_rwqe_s 691 ibd_swqe_s 692 ibd_wqe_s 693 ibt_wr_ds_s::ds_va 694 ibt_wr_lso_s 695 ipoib_mac::ipoib_qpn 696 mac_capab_lso_s 697 msgb::b_next 698 msgb::b_cont 699 msgb::b_rptr 700 msgb::b_wptr 701 ibd_state_s::id_bgroup_created 702 ibd_state_s::id_mac_state 703 ibd_state_s::id_mtu 704 ibd_state_s::id_num_rwqe 705 ibd_state_s::id_num_swqe 706 ibd_state_s::id_qpnum 707 ibd_state_s::id_rcq_hdl 708 ibd_state_s::id_rx_buf_sz 709 ibd_state_s::id_rx_bufs 710 ibd_state_s::id_rx_mr_hdl 711 ibd_state_s::id_rx_wqes 712 ibd_state_s::id_rxwcs 713 ibd_state_s::id_rxwcs_size 714 ibd_state_s::id_rx_nqueues 715 ibd_state_s::id_rx_queues 716 ibd_state_s::id_scope 717 ibd_state_s::id_scq_hdl 718 ibd_state_s::id_tx_buf_sz 719 ibd_state_s::id_tx_bufs 720 ibd_state_s::id_tx_mr_hdl 721 ibd_state_s::id_tx_rel_list.dl_cnt 722 ibd_state_s::id_tx_wqes 723 ibd_state_s::id_txwcs 724 ibd_state_s::id_txwcs_size 725 ibd_state_s::rc_listen_hdl 726 ibd_state_s::rc_listen_hdl_OFED_interop 727 ibd_state_s::rc_srq_size 728 ibd_state_s::rc_srq_rwqes 729 ibd_state_s::rc_srq_rx_bufs 730 ibd_state_s::rc_srq_rx_mr_hdl 731 ibd_state_s::rc_tx_largebuf_desc_base 732 ibd_state_s::rc_tx_mr_bufs 733 ibd_state_s::rc_tx_mr_hdl 734 ipha_s 735 icmph_s 736 ibt_path_info_s::pi_sid 737 ibd_rc_chan_s::ace 738 ibd_rc_chan_s::chan_hdl 739 ibd_rc_chan_s::state 740 ibd_rc_chan_s::chan_state 741 ibd_rc_chan_s::is_tx_chan 742 ibd_rc_chan_s::rcq_hdl 743 ibd_rc_chan_s::rcq_size 744 ibd_rc_chan_s::scq_hdl 745 ibd_rc_chan_s::scq_size 746 ibd_rc_chan_s::requester_gid 747 ibd_rc_chan_s::requester_pkey 748 ibd_rc_chan_s::rx_bufs 749 ibd_rc_chan_s::rx_mr_hdl 750 ibd_rc_chan_s::rx_rwqes 751 ibd_rc_chan_s::tx_wqes 752 ibd_rc_chan_s::tx_mr_bufs 753 ibd_rc_chan_s::tx_mr_hdl 754 ibd_rc_chan_s::tx_rel_list.dl_cnt 755 ibd_rc_chan_s::tx_trans_error_cnt 756 ibd_rc_tx_largebuf_s::lb_buf 757 ibd_rc_msg_hello_s 758 ibt_cm_return_args_s)) 759 760 /* 761 * ibd_rc_chan_s::next is protected by two mutexes: 762 * 1) ibd_state_s::rc_pass_chan_list.chan_list_mutex 763 * 2) ibd_state_s::rc_obs_act_chan_list.chan_list_mutex. 
 */
_NOTE(SCHEME_PROTECTS_DATA("protected by two mutexes",
    ibd_rc_chan_s::next))

/*
 * ibd_state_s.rc_tx_large_bufs_lock
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
    ibd_state_s::rc_tx_largebuf_free_head))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
    ibd_state_s::rc_tx_largebuf_nfree))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
    ibd_rc_tx_largebuf_s::lb_next))

/*
 * ibd_acache_s.tx_too_big_mutex
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_acache_s::tx_too_big_mutex,
    ibd_acache_s::tx_too_big_ongoing))

/*
 * tx_wqe_list.dl_mutex
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
    ibd_rc_chan_s::tx_wqe_list.dl_head))
_NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
    ibd_rc_chan_s::tx_wqe_list.dl_pending_sends))
_NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
    ibd_rc_chan_s::tx_wqe_list.dl_cnt))

/*
 * ibd_state_s.rc_ace_recycle_lock
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_ace_recycle_lock,
    ibd_state_s::rc_ace_recycle))

/*
 * rc_srq_rwqe_list.dl_mutex
 */
_NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
    ibd_state_t::rc_srq_rwqe_list.dl_bufs_outstanding))
_NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
    ibd_state_t::rc_srq_rwqe_list.dl_cnt))

/*
 * Non-mutex protection schemes for data elements.  They are counters
 * used for problem diagnosis and do not need to be protected.
 */
_NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis",
    ibd_state_s::rc_rcv_alloc_fail
    ibd_state_s::rc_rcq_invoke
    ibd_state_s::rc_rcq_err
    ibd_state_s::rc_ace_not_found
    ibd_state_s::rc_xmt_drop_too_long_pkt
    ibd_state_s::rc_xmt_icmp_too_long_pkt
    ibd_state_s::rc_xmt_reenter_too_long_pkt
    ibd_state_s::rc_swqe_short
    ibd_state_s::rc_swqe_mac_update
    ibd_state_s::rc_xmt_buf_short
    ibd_state_s::rc_xmt_buf_mac_update
    ibd_state_s::rc_scq_no_swqe
    ibd_state_s::rc_scq_no_largebuf
    ibd_state_s::rc_scq_invoke
    ibd_state_s::rc_conn_succ
    ibd_state_s::rc_conn_fail
    ibd_state_s::rc_null_conn
    ibd_state_s::rc_no_estab_conn
    ibd_state_s::rc_act_close
    ibd_state_s::rc_pas_close
    ibd_state_s::rc_delay_ace_recycle
    ibd_state_s::rc_act_close_simultaneous
    ibd_state_s::rc_reset_cnt))

#ifdef DEBUG
/*
 * Non-mutex protection schemes for data elements.  They are counters
 * used for problem diagnosis and do not need to be protected.
841 */ 842 _NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis", 843 ibd_state_s::rc_rwqe_short 844 ibd_rc_stat_s::rc_rcv_trans_byte 845 ibd_rc_stat_s::rc_rcv_trans_pkt 846 ibd_rc_stat_s::rc_rcv_copy_byte 847 ibd_rc_stat_s::rc_rcv_copy_pkt 848 ibd_rc_stat_s::rc_rcv_alloc_fail 849 ibd_rc_stat_s::rc_rcq_invoke 850 ibd_rc_stat_s::rc_rcq_err 851 ibd_rc_stat_s::rc_scq_invoke 852 ibd_rc_stat_s::rc_rwqe_short 853 ibd_rc_stat_s::rc_xmt_bytes 854 ibd_rc_stat_s::rc_xmt_small_pkt 855 ibd_rc_stat_s::rc_xmt_fragmented_pkt 856 ibd_rc_stat_s::rc_xmt_map_fail_pkt 857 ibd_rc_stat_s::rc_xmt_map_succ_pkt 858 ibd_rc_stat_s::rc_ace_not_found 859 ibd_rc_stat_s::rc_scq_no_swqe 860 ibd_rc_stat_s::rc_scq_no_largebuf 861 ibd_rc_stat_s::rc_swqe_short 862 ibd_rc_stat_s::rc_swqe_mac_update 863 ibd_rc_stat_s::rc_xmt_buf_short 864 ibd_rc_stat_s::rc_xmt_buf_mac_update 865 ibd_rc_stat_s::rc_conn_succ 866 ibd_rc_stat_s::rc_conn_fail 867 ibd_rc_stat_s::rc_null_conn 868 ibd_rc_stat_s::rc_no_estab_conn 869 ibd_rc_stat_s::rc_act_close 870 ibd_rc_stat_s::rc_pas_close 871 ibd_rc_stat_s::rc_delay_ace_recycle 872 ibd_rc_stat_s::rc_act_close_simultaneous 873 ibd_rc_stat_s::rc_reset_cnt)) 874 #endif 875 876 int 877 _init() 878 { 879 int status; 880 881 status = ddi_soft_state_init(&ibd_list, max(sizeof (ibd_state_t), 882 PAGESIZE), 0); 883 if (status != 0) { 884 DPRINT(10, "_init:failed in ddi_soft_state_init()"); 885 return (status); 886 } 887 888 mac_init_ops(&ibd_dev_ops, "ibd"); 889 status = mod_install(&ibd_modlinkage); 890 if (status != 0) { 891 DPRINT(10, "_init:failed in mod_install()"); 892 ddi_soft_state_fini(&ibd_list); 893 mac_fini_ops(&ibd_dev_ops); 894 return (status); 895 } 896 897 mutex_init(&ibd_gstate.ig_mutex, NULL, MUTEX_DRIVER, NULL); 898 mutex_enter(&ibd_gstate.ig_mutex); 899 ibd_gstate.ig_ibt_hdl = NULL; 900 ibd_gstate.ig_ibt_hdl_ref_cnt = 0; 901 ibd_gstate.ig_service_list = NULL; 902 mutex_exit(&ibd_gstate.ig_mutex); 903 904 #ifdef IBD_LOGGING 905 ibd_log_init(); 906 #endif 907 return (0); 908 } 909 910 int 911 _info(struct modinfo *modinfop) 912 { 913 return (mod_info(&ibd_modlinkage, modinfop)); 914 } 915 916 int 917 _fini() 918 { 919 int status; 920 921 status = mod_remove(&ibd_modlinkage); 922 if (status != 0) 923 return (status); 924 925 mac_fini_ops(&ibd_dev_ops); 926 ddi_soft_state_fini(&ibd_list); 927 mutex_destroy(&ibd_gstate.ig_mutex); 928 #ifdef IBD_LOGGING 929 ibd_log_fini(); 930 #endif 931 return (0); 932 } 933 934 /* 935 * Convert the GID part of the mac address from network byte order 936 * to host order. 937 */ 938 static void 939 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid) 940 { 941 ib_sn_prefix_t nbopref; 942 ib_guid_t nboguid; 943 944 bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t)); 945 bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t)); 946 dgid->gid_prefix = b2h64(nbopref); 947 dgid->gid_guid = b2h64(nboguid); 948 } 949 950 /* 951 * Create the IPoIB address in network byte order from host order inputs. 952 */ 953 static void 954 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix, 955 ib_guid_t guid) 956 { 957 ib_sn_prefix_t nbopref; 958 ib_guid_t nboguid; 959 960 mac->ipoib_qpn = htonl(qpn); 961 nbopref = h2b64(prefix); 962 nboguid = h2b64(guid); 963 bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t)); 964 bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t)); 965 } 966 967 /* 968 * Send to the appropriate all-routers group when the IBA multicast group 969 * does not exist, based on whether the target group is v4 or v6. 
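 * (For example, with the interface's scope and pkey filled in, the IPv4
 * all-routers group 224.0.0.2 maps to roughly MGID
 * ff1<scope>:401b:<pkey>::2, which is what the construction below builds
 * into rmac.)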
970 */ 971 static boolean_t 972 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac, 973 ipoib_mac_t *rmac) 974 { 975 boolean_t retval = B_TRUE; 976 uint32_t adjscope = state->id_scope << 16; 977 uint32_t topword; 978 979 /* 980 * Copy the first 4 bytes in without assuming any alignment of 981 * input mac address; this will have IPoIB signature, flags and 982 * scope bits. 983 */ 984 bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t)); 985 topword = ntohl(topword); 986 987 /* 988 * Generate proper address for IPv4/v6, adding in the Pkey properly. 989 */ 990 if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) || 991 (topword == (IB_MCGID_IPV6_PREFIX | adjscope))) 992 ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) | 993 ((uint32_t)(state->id_pkey << 16))), 994 (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP)); 995 else 996 /* 997 * Does not have proper bits in the mgid address. 998 */ 999 retval = B_FALSE; 1000 1001 return (retval); 1002 } 1003 1004 /* 1005 * Membership states for different mcg's are tracked by two lists: 1006 * the "non" list is used for promiscuous mode, when all mcg traffic 1007 * needs to be inspected. This type of membership is never used for 1008 * transmission, so there can not be an AH in the active list 1009 * corresponding to a member in this list. This list does not need 1010 * any protection, since all operations are performed by the async 1011 * thread. 1012 * 1013 * "Full" and "SendOnly" membership is tracked using a single list, 1014 * the "full" list. This is because this single list can then be 1015 * searched during transmit to a multicast group (if an AH for the 1016 * mcg is not found in the active list), since at least one type 1017 * of membership must be present before initiating the transmit. 1018 * This list is also emptied during driver detach, since sendonly 1019 * membership acquired during transmit is dropped at detach time 1020 * along with ipv4 broadcast full membership. Insert/deletes to 1021 * this list are done only by the async thread, but it is also 1022 * searched in program context (see multicast disable case), thus 1023 * the id_mc_mutex protects the list. The driver detach path also 1024 * deconstructs the "full" list, but it ensures that the async 1025 * thread will not be accessing the list (by blocking out mcg 1026 * trap handling and making sure no more Tx reaping will happen). 1027 * 1028 * Currently, an IBA attach is done in the SendOnly case too, 1029 * although this is not required. 1030 */ 1031 #define IBD_MCACHE_INSERT_FULL(state, mce) \ 1032 list_insert_head(&state->id_mc_full, mce) 1033 #define IBD_MCACHE_INSERT_NON(state, mce) \ 1034 list_insert_head(&state->id_mc_non, mce) 1035 #define IBD_MCACHE_FIND_FULL(state, mgid) \ 1036 ibd_mcache_find(mgid, &state->id_mc_full) 1037 #define IBD_MCACHE_FIND_NON(state, mgid) \ 1038 ibd_mcache_find(mgid, &state->id_mc_non) 1039 #define IBD_MCACHE_PULLOUT_FULL(state, mce) \ 1040 list_remove(&state->id_mc_full, mce) 1041 #define IBD_MCACHE_PULLOUT_NON(state, mce) \ 1042 list_remove(&state->id_mc_non, mce) 1043 1044 static void * 1045 list_get_head(list_t *list) 1046 { 1047 list_node_t *lhead = list_head(list); 1048 1049 if (lhead != NULL) 1050 list_remove(list, lhead); 1051 return (lhead); 1052 } 1053 1054 /* 1055 * This is always guaranteed to be able to queue the work. 
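 * The caller passes in an already-allocated ibd_req_t, so nothing here can
 * fail; the slot is simply appended to id_req_list and the async thread is
 * signalled.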
 */
void
ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op)
{
	/* Initialize request */
	DPRINT(1, "ibd_queue_work_slot : op: %d \n", op);
	ptr->rq_op = op;

	/*
	 * Queue provided slot onto request pool.
	 */
	mutex_enter(&state->id_acache_req_lock);
	list_insert_tail(&state->id_req_list, ptr);

	/* Go, fetch, async thread */
	cv_signal(&state->id_acache_req_cv);
	mutex_exit(&state->id_acache_req_lock);
}

/*
 * Main body of the per interface async thread.
 */
static void
ibd_async_work(ibd_state_t *state)
{
	ibd_req_t *ptr;
	callb_cpr_t cprinfo;

	mutex_enter(&state->id_acache_req_lock);
	CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock,
	    callb_generic_cpr, "ibd_async_work");

	for (;;) {
		ptr = list_get_head(&state->id_req_list);
		if (ptr != NULL) {
			mutex_exit(&state->id_acache_req_lock);

			/*
			 * Once we have done the operation, there is no
			 * guarantee the request slot is going to be valid;
			 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP,
			 * TRAP).
			 *
			 * Perform the request.
			 */
			switch (ptr->rq_op) {
			case IBD_ASYNC_GETAH:
				ibd_async_acache(state, &ptr->rq_mac);
				break;
			case IBD_ASYNC_JOIN:
			case IBD_ASYNC_LEAVE:
				ibd_async_multicast(state,
				    ptr->rq_gid, ptr->rq_op);
				break;
			case IBD_ASYNC_PROMON:
				ibd_async_setprom(state);
				break;
			case IBD_ASYNC_PROMOFF:
				ibd_async_unsetprom(state);
				break;
			case IBD_ASYNC_REAP:
				ibd_async_reap_group(state,
				    ptr->rq_ptr, ptr->rq_gid,
				    IB_MC_JSTATE_FULL);
				/*
				 * The req buf is embedded in the mce
				 * structure, so we do not need to free
				 * it here.
				 */
				ptr = NULL;
				break;
			case IBD_ASYNC_TRAP:
				ibd_async_trap(state, ptr);
				break;
			case IBD_ASYNC_SCHED:
				ibd_async_txsched(state);
				break;
			case IBD_ASYNC_LINK:
				ibd_async_link(state, ptr);
				break;
			case IBD_ASYNC_EXIT:
				mutex_enter(&state->id_acache_req_lock);
#ifndef __lock_lint
				CALLB_CPR_EXIT(&cprinfo);
#else
				mutex_exit(&state->id_acache_req_lock);
#endif
				return;
			case IBD_ASYNC_RC_TOO_BIG:
				ibd_async_rc_process_too_big(state,
				    ptr);
				break;
			case IBD_ASYNC_RC_CLOSE_ACT_CHAN:
				ibd_async_rc_close_act_chan(state, ptr);
				break;
			case IBD_ASYNC_RC_RECYCLE_ACE:
				ibd_async_rc_recycle_ace(state, ptr);
				break;
			}
			if (ptr != NULL)
				kmem_cache_free(state->id_req_kmc, ptr);

			mutex_enter(&state->id_acache_req_lock);
		} else {
#ifndef __lock_lint
			/*
			 * Nothing to do: wait till new request arrives.
			 */
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&state->id_acache_req_cv,
			    &state->id_acache_req_lock);
			CALLB_CPR_SAFE_END(&cprinfo,
			    &state->id_acache_req_lock);
#endif
		}
	}

	/*NOTREACHED*/
	_NOTE(NOT_REACHED)
}

/*
 * Return when it is safe to queue requests to the async daemon; primarily
 * for subnet trap and async event handling.  Disallow requests before the
 * daemon is created, and when interface deinitialization starts.
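 * A B_TRUE return bumps id_trap_inprog; the caller must balance it with
 * ibd_async_done() once the queued work completes, so that ibd_m_stop()
 * can wait for in-progress handlers to drain.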
 */
static boolean_t
ibd_async_safe(ibd_state_t *state)
{
	mutex_enter(&state->id_trap_lock);
	if (state->id_trap_stop) {
		mutex_exit(&state->id_trap_lock);
		return (B_FALSE);
	}
	state->id_trap_inprog++;
	mutex_exit(&state->id_trap_lock);
	return (B_TRUE);
}

/*
 * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet
 * trap or event handling to complete to kill the async thread and deconstruct
 * the mcg/ace list.
 */
static void
ibd_async_done(ibd_state_t *state)
{
	mutex_enter(&state->id_trap_lock);
	if (--state->id_trap_inprog == 0)
		cv_signal(&state->id_trap_cv);
	mutex_exit(&state->id_trap_lock);
}

/*
 * Hash functions:
 * ibd_hash_by_id: Returns the qpn as the hash entry into the bucket.
 * ibd_hash_key_cmp: Compares two keys; returns 0 if they match, 1 otherwise.
 * These operate on mac addresses input into ibd_send, but there is no
 * guarantee on the alignment of the ipoib_mac_t structure.
 */
/*ARGSUSED*/
static uint_t
ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
{
	ulong_t ptraddr = (ulong_t)key;
	uint_t hval;

	/*
	 * If the input address is 4 byte aligned, we can just dereference
	 * it.  This is most common, since IP will send in a 4 byte aligned
	 * IP header, which implies the 24 byte IPoIB pseudo header will be
	 * 4 byte aligned too.
	 */
	if ((ptraddr & 3) == 0)
		return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn);

	bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t));
	return (hval);
}

static int
ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
{
	if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0)
		return (0);
	else
		return (1);
}

/*
 * Initialize all the per interface caches and lists; AH cache,
 * MCG list etc.
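 * This sets up the free/active AH lists, the mod_hash keyed by
 * ibd_hash_by_id()/ibd_hash_key_cmp(), the mc_full/mc_non membership lists,
 * the async request list, and pre-allocates IBD_NUM_AH UD destinations for
 * the address cache entries.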
1248 */ 1249 static int 1250 ibd_acache_init(ibd_state_t *state) 1251 { 1252 ibd_ace_t *ce; 1253 int i; 1254 1255 mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL); 1256 cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL); 1257 1258 mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL); 1259 mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL); 1260 mutex_enter(&state->id_ac_mutex); 1261 list_create(&state->id_ah_free, sizeof (ibd_ace_t), 1262 offsetof(ibd_ace_t, ac_list)); 1263 list_create(&state->id_ah_active, sizeof (ibd_ace_t), 1264 offsetof(ibd_ace_t, ac_list)); 1265 state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash", 1266 IBD_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor, 1267 ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP); 1268 list_create(&state->id_mc_full, sizeof (ibd_mce_t), 1269 offsetof(ibd_mce_t, mc_list)); 1270 list_create(&state->id_mc_non, sizeof (ibd_mce_t), 1271 offsetof(ibd_mce_t, mc_list)); 1272 list_create(&state->id_req_list, sizeof (ibd_req_t), 1273 offsetof(ibd_req_t, rq_list)); 1274 state->id_ac_hot_ace = NULL; 1275 1276 state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) * 1277 IBD_NUM_AH, KM_SLEEP); 1278 for (i = 0; i < IBD_NUM_AH; i++, ce++) { 1279 if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS, 1280 state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) { 1281 mutex_exit(&state->id_ac_mutex); 1282 ibd_acache_fini(state); 1283 return (DDI_FAILURE); 1284 } else { 1285 CLEAR_REFCYCLE(ce); 1286 ce->ac_mce = NULL; 1287 mutex_init(&ce->tx_too_big_mutex, NULL, 1288 MUTEX_DRIVER, NULL); 1289 IBD_ACACHE_INSERT_FREE(state, ce); 1290 } 1291 } 1292 mutex_exit(&state->id_ac_mutex); 1293 return (DDI_SUCCESS); 1294 } 1295 1296 static void 1297 ibd_acache_fini(ibd_state_t *state) 1298 { 1299 ibd_ace_t *ptr; 1300 1301 mutex_enter(&state->id_ac_mutex); 1302 1303 while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) { 1304 ASSERT(GET_REF(ptr) == 0); 1305 mutex_destroy(&ptr->tx_too_big_mutex); 1306 (void) ibt_free_ud_dest(ptr->ac_dest); 1307 } 1308 1309 while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) { 1310 ASSERT(GET_REF(ptr) == 0); 1311 mutex_destroy(&ptr->tx_too_big_mutex); 1312 (void) ibt_free_ud_dest(ptr->ac_dest); 1313 } 1314 1315 list_destroy(&state->id_ah_free); 1316 list_destroy(&state->id_ah_active); 1317 list_destroy(&state->id_mc_full); 1318 list_destroy(&state->id_mc_non); 1319 list_destroy(&state->id_req_list); 1320 kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * IBD_NUM_AH); 1321 mutex_exit(&state->id_ac_mutex); 1322 mutex_destroy(&state->id_ac_mutex); 1323 mutex_destroy(&state->id_mc_mutex); 1324 mutex_destroy(&state->id_acache_req_lock); 1325 cv_destroy(&state->id_acache_req_cv); 1326 } 1327 1328 /* 1329 * Search AH active hash list for a cached path to input destination. 1330 * If we are "just looking", hold == F. When we are in the Tx path, 1331 * we set hold == T to grab a reference on the AH so that it can not 1332 * be recycled to a new destination while the Tx request is posted. 1333 */ 1334 ibd_ace_t * 1335 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num) 1336 { 1337 ibd_ace_t *ptr; 1338 1339 ASSERT(mutex_owned(&state->id_ac_mutex)); 1340 1341 /* 1342 * Do hash search. 
1343 */ 1344 if (mod_hash_find(state->id_ah_active_hash, 1345 (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) { 1346 if (hold) 1347 INC_REF(ptr, num); 1348 return (ptr); 1349 } 1350 return (NULL); 1351 } 1352 1353 /* 1354 * This is called by the tx side; if an initialized AH is found in 1355 * the active list, it is locked down and can be used; if no entry 1356 * is found, an async request is queued to do path resolution. 1357 */ 1358 static ibd_ace_t * 1359 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe) 1360 { 1361 ibd_ace_t *ptr; 1362 ibd_req_t *req; 1363 1364 /* 1365 * Only attempt to print when we can; in the mdt pattr case, the 1366 * address is not aligned properly. 1367 */ 1368 if (((ulong_t)mac & 3) == 0) { 1369 DPRINT(4, 1370 "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X", 1371 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1372 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1373 htonl(mac->ipoib_gidsuff[1])); 1374 } 1375 1376 mutex_enter(&state->id_ac_mutex); 1377 1378 if (((ptr = state->id_ac_hot_ace) != NULL) && 1379 (memcmp(&ptr->ac_mac, mac, sizeof (*mac)) == 0)) { 1380 INC_REF(ptr, numwqe); 1381 mutex_exit(&state->id_ac_mutex); 1382 return (ptr); 1383 } 1384 if (((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL)) { 1385 state->id_ac_hot_ace = ptr; 1386 mutex_exit(&state->id_ac_mutex); 1387 return (ptr); 1388 } 1389 1390 /* 1391 * Implementation of a single outstanding async request; if 1392 * the operation is not started yet, queue a request and move 1393 * to ongoing state. Remember in id_ah_addr for which address 1394 * we are queueing the request, in case we need to flag an error; 1395 * Any further requests, for the same or different address, until 1396 * the operation completes, is sent back to GLDv3 to be retried. 1397 * The async thread will update id_ah_op with an error indication 1398 * or will set it to indicate the next look up can start; either 1399 * way, it will mac_tx_update() so that all blocked requests come 1400 * back here. 1401 */ 1402 *err = EAGAIN; 1403 if (state->id_ah_op == IBD_OP_NOTSTARTED) { 1404 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 1405 if (req != NULL) { 1406 /* 1407 * We did not even find the entry; queue a request 1408 * for it. 1409 */ 1410 bcopy(mac, &(req->rq_mac), IPOIB_ADDRL); 1411 ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH); 1412 state->id_ah_op = IBD_OP_ONGOING; 1413 bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL); 1414 } 1415 } else if ((state->id_ah_op != IBD_OP_ONGOING) && 1416 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) { 1417 /* 1418 * Check the status of the pathrecord lookup request 1419 * we had queued before. 1420 */ 1421 if (state->id_ah_op == IBD_OP_ERRORED) { 1422 *err = EFAULT; 1423 state->id_ah_error++; 1424 } else { 1425 /* 1426 * IBD_OP_ROUTERED case: We need to send to the 1427 * all-router MCG. If we can find the AH for 1428 * the mcg, the Tx will be attempted. If we 1429 * do not find the AH, we return NORESOURCES 1430 * to retry. 1431 */ 1432 ipoib_mac_t routermac; 1433 1434 (void) ibd_get_allroutergroup(state, mac, &routermac); 1435 ptr = ibd_acache_find(state, &routermac, B_TRUE, 1436 numwqe); 1437 } 1438 state->id_ah_op = IBD_OP_NOTSTARTED; 1439 } else if ((state->id_ah_op != IBD_OP_ONGOING) && 1440 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) { 1441 /* 1442 * This case can happen when we get a higher band 1443 * packet. 
The easiest way is to reset the state machine 1444 * to accommodate the higher priority packet. 1445 */ 1446 state->id_ah_op = IBD_OP_NOTSTARTED; 1447 } 1448 mutex_exit(&state->id_ac_mutex); 1449 1450 return (ptr); 1451 } 1452 1453 /* 1454 * Grab a not-currently-in-use AH/PathRecord from the active 1455 * list to recycle to a new destination. Only the async thread 1456 * executes this code. 1457 */ 1458 static ibd_ace_t * 1459 ibd_acache_get_unref(ibd_state_t *state) 1460 { 1461 ibd_ace_t *ptr = list_tail(&state->id_ah_active); 1462 boolean_t try_rc_chan_recycle = B_FALSE; 1463 1464 ASSERT(mutex_owned(&state->id_ac_mutex)); 1465 1466 /* 1467 * Do plain linear search. 1468 */ 1469 while (ptr != NULL) { 1470 /* 1471 * Note that it is possible that the "cycle" bit 1472 * is set on the AH w/o any reference count. The 1473 * mcg must have been deleted, and the tx cleanup 1474 * just decremented the reference count to 0, but 1475 * hasn't gotten around to grabbing the id_ac_mutex 1476 * to move the AH into the free list. 1477 */ 1478 if (GET_REF(ptr) == 0) { 1479 if (ptr->ac_chan != NULL) { 1480 ASSERT(state->id_enable_rc == B_TRUE); 1481 if (!try_rc_chan_recycle) { 1482 try_rc_chan_recycle = B_TRUE; 1483 ibd_rc_signal_ace_recycle(state, ptr); 1484 } 1485 } else { 1486 IBD_ACACHE_PULLOUT_ACTIVE(state, ptr); 1487 break; 1488 } 1489 } 1490 ptr = list_prev(&state->id_ah_active, ptr); 1491 } 1492 return (ptr); 1493 } 1494 1495 /* 1496 * Invoked to clean up AH from active list in case of multicast 1497 * disable and to handle sendonly memberships during mcg traps. 1498 * And for port up processing for multicast and unicast AHs. 1499 * Normally, the AH is taken off the active list, and put into 1500 * the free list to be recycled for a new destination. In case 1501 * Tx requests on the AH have not completed yet, the AH is marked 1502 * for reaping (which will put the AH on the free list) once the Tx's 1503 * complete; in this case, depending on the "force" input, we take 1504 * out the AH from the active list right now, or leave it also for 1505 * the reap operation. Returns TRUE if the AH is taken off the active 1506 * list (and either put into the free list right now, or arranged for 1507 * later), FALSE otherwise. 1508 */ 1509 boolean_t 1510 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force) 1511 { 1512 ibd_ace_t *acactive; 1513 boolean_t ret = B_TRUE; 1514 1515 ASSERT(mutex_owned(&state->id_ac_mutex)); 1516 1517 if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) { 1518 1519 /* 1520 * Note that the AH might already have the cycle bit set 1521 * on it; this might happen if sequences of multicast 1522 * enables and disables are coming so fast, that posted 1523 * Tx's to the mcg have not completed yet, and the cycle 1524 * bit is set successively by each multicast disable. 1525 */ 1526 if (SET_CYCLE_IF_REF(acactive)) { 1527 if (!force) { 1528 /* 1529 * The ace is kept on the active list, further 1530 * Tx's can still grab a reference on it; the 1531 * ace is reaped when all pending Tx's 1532 * referencing the AH complete. 1533 */ 1534 ret = B_FALSE; 1535 } else { 1536 /* 1537 * In the mcg trap case, we always pull the 1538 * AH from the active list. And also the port 1539 * up multi/unicast case. 
1540 */ 1541 ASSERT(acactive->ac_chan == NULL); 1542 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1543 acactive->ac_mce = NULL; 1544 } 1545 } else { 1546 /* 1547 * Determined the ref count is 0, thus reclaim 1548 * immediately after pulling out the ace from 1549 * the active list. 1550 */ 1551 ASSERT(acactive->ac_chan == NULL); 1552 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1553 acactive->ac_mce = NULL; 1554 IBD_ACACHE_INSERT_FREE(state, acactive); 1555 } 1556 1557 } 1558 return (ret); 1559 } 1560 1561 /* 1562 * Helper function for async path record lookup. If we are trying to 1563 * Tx to a MCG, check our membership, possibly trying to join the 1564 * group if required. If that fails, try to send the packet to the 1565 * all router group (indicated by the redirect output), pointing 1566 * the input mac address to the router mcg address. 1567 */ 1568 static ibd_mce_t * 1569 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect) 1570 { 1571 ib_gid_t mgid; 1572 ibd_mce_t *mce; 1573 ipoib_mac_t routermac; 1574 1575 *redirect = B_FALSE; 1576 ibd_n2h_gid(mac, &mgid); 1577 1578 /* 1579 * Check the FullMember+SendOnlyNonMember list. 1580 * Since we are the only one who manipulates the 1581 * id_mc_full list, no locks are needed. 1582 */ 1583 mce = IBD_MCACHE_FIND_FULL(state, mgid); 1584 if (mce != NULL) { 1585 DPRINT(4, "ibd_async_mcache : already joined to group"); 1586 return (mce); 1587 } 1588 1589 /* 1590 * Not found; try to join(SendOnlyNonMember) and attach. 1591 */ 1592 DPRINT(4, "ibd_async_mcache : not joined to group"); 1593 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1594 NULL) { 1595 DPRINT(4, "ibd_async_mcache : nonmem joined to group"); 1596 return (mce); 1597 } 1598 1599 /* 1600 * MCGroup not present; try to join the all-router group. If 1601 * any of the following steps succeed, we will be redirecting 1602 * to the all router group. 1603 */ 1604 DPRINT(4, "ibd_async_mcache : nonmem join failed"); 1605 if (!ibd_get_allroutergroup(state, mac, &routermac)) 1606 return (NULL); 1607 *redirect = B_TRUE; 1608 ibd_n2h_gid(&routermac, &mgid); 1609 bcopy(&routermac, mac, IPOIB_ADDRL); 1610 DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n", 1611 mgid.gid_prefix, mgid.gid_guid); 1612 1613 /* 1614 * Are we already joined to the router group? 1615 */ 1616 if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) { 1617 DPRINT(4, "ibd_async_mcache : using already joined router" 1618 "group\n"); 1619 return (mce); 1620 } 1621 1622 /* 1623 * Can we join(SendOnlyNonMember) the router group? 1624 */ 1625 DPRINT(4, "ibd_async_mcache : attempting join to router grp"); 1626 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1627 NULL) { 1628 DPRINT(4, "ibd_async_mcache : joined to router grp"); 1629 return (mce); 1630 } 1631 1632 return (NULL); 1633 } 1634 1635 /* 1636 * Async path record lookup code. 1637 */ 1638 static void 1639 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac) 1640 { 1641 ibd_ace_t *ce; 1642 ibd_mce_t *mce = NULL; 1643 ibt_path_attr_t path_attr; 1644 ibt_path_info_t path_info; 1645 ib_gid_t destgid; 1646 char ret = IBD_OP_NOTSTARTED; 1647 1648 DPRINT(4, "ibd_async_acache : %08X:%08X:%08X:%08X:%08X", 1649 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1650 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1651 htonl(mac->ipoib_gidsuff[1])); 1652 1653 /* 1654 * Check whether we are trying to transmit to a MCG. 1655 * In that case, we need to make sure we are a member of 1656 * the MCG. 
1657 */ 1658 if (mac->ipoib_qpn == htonl(IB_MC_QPN)) { 1659 boolean_t redirected; 1660 1661 /* 1662 * If we can not find or join the group or even 1663 * redirect, error out. 1664 */ 1665 if ((mce = ibd_async_mcache(state, mac, &redirected)) == 1666 NULL) { 1667 state->id_ah_op = IBD_OP_ERRORED; 1668 return; 1669 } 1670 1671 /* 1672 * If we got redirected, we need to determine whether 1673 * the AH for the new mcg is in the cache already, and 1674 * not pull it in then; otherwise proceed to get the 1675 * path for the new mcg. There is no guarantee that 1676 * if the AH is currently in the cache, it will still be 1677 * there when we look in ibd_acache_lookup(), but that's 1678 * okay, we will come back here. 1679 */ 1680 if (redirected) { 1681 ret = IBD_OP_ROUTERED; 1682 DPRINT(4, "ibd_async_acache : redirected to " 1683 "%08X:%08X:%08X:%08X:%08X", 1684 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1685 htonl(mac->ipoib_gidpref[1]), 1686 htonl(mac->ipoib_gidsuff[0]), 1687 htonl(mac->ipoib_gidsuff[1])); 1688 1689 mutex_enter(&state->id_ac_mutex); 1690 if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) { 1691 state->id_ah_op = IBD_OP_ROUTERED; 1692 mutex_exit(&state->id_ac_mutex); 1693 DPRINT(4, "ibd_async_acache : router AH found"); 1694 return; 1695 } 1696 mutex_exit(&state->id_ac_mutex); 1697 } 1698 } 1699 1700 /* 1701 * Get an AH from the free list. 1702 */ 1703 mutex_enter(&state->id_ac_mutex); 1704 if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) { 1705 /* 1706 * No free ones; try to grab an unreferenced active 1707 * one. Maybe we need to make the active list LRU, 1708 * but that will create more work for Tx callbacks. 1709 * Is there a way of not having to pull out the 1710 * entry from the active list, but just indicate it 1711 * is being recycled? Yes, but that creates one more 1712 * check in the fast lookup path. 1713 */ 1714 if ((ce = ibd_acache_get_unref(state)) == NULL) { 1715 /* 1716 * Pretty serious shortage now. 1717 */ 1718 state->id_ah_op = IBD_OP_NOTSTARTED; 1719 mutex_exit(&state->id_ac_mutex); 1720 DPRINT(10, "ibd_async_acache : failed to find AH " 1721 "slot\n"); 1722 return; 1723 } 1724 /* 1725 * We could check whether ac_mce points to a SendOnly 1726 * member and drop that membership now. Or do it lazily 1727 * at detach time. 1728 */ 1729 ce->ac_mce = NULL; 1730 } 1731 mutex_exit(&state->id_ac_mutex); 1732 ASSERT(ce->ac_mce == NULL); 1733 1734 /* 1735 * Update the entry. 1736 */ 1737 bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL); 1738 1739 bzero(&path_info, sizeof (path_info)); 1740 bzero(&path_attr, sizeof (ibt_path_attr_t)); 1741 path_attr.pa_sgid = state->id_sgid; 1742 path_attr.pa_num_dgids = 1; 1743 ibd_n2h_gid(&ce->ac_mac, &destgid); 1744 path_attr.pa_dgids = &destgid; 1745 path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 1746 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 1747 &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) { 1748 DPRINT(10, "ibd_async_acache : failed in ibt_get_paths"); 1749 goto error; 1750 } 1751 if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey, 1752 ntohl(ce->ac_mac.ipoib_qpn), 1753 &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) { 1754 DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest"); 1755 goto error; 1756 } 1757 1758 /* 1759 * mce is set whenever an AH is being associated with a 1760 * MCG; this will come in handy when we leave the MCG. The 1761 * lock protects Tx fastpath from scanning the active list. 
1762 */ 1763 if (mce != NULL) 1764 ce->ac_mce = mce; 1765 1766 /* 1767 * initiate a RC mode connection for unicast address 1768 */ 1769 if (state->id_enable_rc && (mac->ipoib_qpn != htonl(IB_MC_QPN)) && 1770 (htonl(mac->ipoib_qpn) & IBD_MAC_ADDR_RC)) { 1771 ASSERT(ce->ac_chan == NULL); 1772 DPRINT(10, "ibd_async_acache: call " 1773 "ibd_rc_try_connect(ace=%p)", ce); 1774 ibd_rc_try_connect(state, ce, &path_info); 1775 if (ce->ac_chan == NULL) { 1776 DPRINT(10, "ibd_async_acache: fail to setup RC" 1777 " channel"); 1778 state->rc_conn_fail++; 1779 goto error; 1780 } 1781 } 1782 1783 mutex_enter(&state->id_ac_mutex); 1784 IBD_ACACHE_INSERT_ACTIVE(state, ce); 1785 state->id_ah_op = ret; 1786 mutex_exit(&state->id_ac_mutex); 1787 return; 1788 error: 1789 /* 1790 * We might want to drop SendOnly membership here if we 1791 * joined above. The lock protects Tx callbacks inserting 1792 * into the free list. 1793 */ 1794 mutex_enter(&state->id_ac_mutex); 1795 state->id_ah_op = IBD_OP_ERRORED; 1796 IBD_ACACHE_INSERT_FREE(state, ce); 1797 mutex_exit(&state->id_ac_mutex); 1798 } 1799 1800 /* 1801 * While restoring port's presence on the subnet on a port up, it is possible 1802 * that the port goes down again. 1803 */ 1804 static void 1805 ibd_async_link(ibd_state_t *state, ibd_req_t *req) 1806 { 1807 ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr; 1808 link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN : 1809 LINK_STATE_UP; 1810 ibd_mce_t *mce, *pmce; 1811 ibd_ace_t *ace, *pace; 1812 1813 DPRINT(10, "ibd_async_link(): %d", opcode); 1814 1815 /* 1816 * On a link up, revalidate the link speed/width. No point doing 1817 * this on a link down, since we will be unable to do SA operations, 1818 * defaulting to the lowest speed. Also notice that we update our 1819 * notion of speed before calling mac_link_update(), which will do 1820 * necessary higher level notifications for speed changes. 1821 */ 1822 if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) { 1823 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) 1824 state->id_link_speed = ibd_get_portspeed(state); 1825 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 1826 } 1827 1828 /* 1829 * Do all the work required to establish our presence on 1830 * the subnet. 1831 */ 1832 if (opcode == IBD_LINK_UP_ABSENT) { 1833 /* 1834 * If in promiscuous mode ... 1835 */ 1836 if (state->id_prom_op == IBD_OP_COMPLETED) { 1837 /* 1838 * Drop all nonmembership. 1839 */ 1840 ibd_async_unsetprom(state); 1841 1842 /* 1843 * Then, try to regain nonmembership to all mcg's. 1844 */ 1845 ibd_async_setprom(state); 1846 1847 } 1848 1849 /* 1850 * Drop all sendonly membership (which also gets rid of the 1851 * AHs); try to reacquire all full membership. 1852 */ 1853 mce = list_head(&state->id_mc_full); 1854 while ((pmce = mce) != NULL) { 1855 mce = list_next(&state->id_mc_full, mce); 1856 if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON) 1857 ibd_leave_group(state, 1858 pmce->mc_info.mc_adds_vect.av_dgid, 1859 IB_MC_JSTATE_SEND_ONLY_NON); 1860 else 1861 ibd_reacquire_group(state, pmce); 1862 } 1863 1864 /* 1865 * Recycle all active AHs to free list (and if there are 1866 * pending posts, make sure they will go into the free list 1867 * once the Tx's complete). Grab the lock to prevent 1868 * concurrent Tx's as well as Tx cleanups. 
1869 */ 1870 mutex_enter(&state->id_ac_mutex); 1871 ace = list_head(&state->id_ah_active); 1872 while ((pace = ace) != NULL) { 1873 boolean_t cycled; 1874 1875 ace = list_next(&state->id_ah_active, ace); 1876 mce = pace->ac_mce; 1877 if (pace->ac_chan != NULL) { 1878 ASSERT(mce == NULL); 1879 ASSERT(state->id_enable_rc == B_TRUE); 1880 if (pace->ac_chan->chan_state == 1881 IBD_RC_STATE_ACT_ESTAB) { 1882 INC_REF(pace, 1); 1883 IBD_ACACHE_PULLOUT_ACTIVE(state, pace); 1884 pace->ac_chan->chan_state = 1885 IBD_RC_STATE_ACT_CLOSING; 1886 ibd_rc_signal_act_close(state, pace); 1887 } else { 1888 state->rc_act_close_simultaneous++; 1889 DPRINT(40, "ibd_async_link: other " 1890 "thread is closing it, ace=%p, " 1891 "ac_chan=%p, chan_state=%d", 1892 pace, pace->ac_chan, 1893 pace->ac_chan->chan_state); 1894 } 1895 } else { 1896 cycled = ibd_acache_recycle(state, 1897 &pace->ac_mac, B_TRUE); 1898 } 1899 /* 1900 * If this is for an mcg, it must be for a fullmember, 1901 * since we got rid of send-only members above when 1902 * processing the mce list. 1903 */ 1904 ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate == 1905 IB_MC_JSTATE_FULL))); 1906 1907 /* 1908 * Check if the fullmember mce needs to be torn down, 1909 * ie whether the DLPI disable has already been done. 1910 * If so, do some of the work of tx_cleanup, namely 1911 * causing leave (which will fail), detach and 1912 * mce-freeing. tx_cleanup will put the AH into free 1913 * list. The reason to duplicate some of this 1914 * tx_cleanup work is because we want to delete the 1915 * AH right now instead of waiting for tx_cleanup, to 1916 * force subsequent Tx's to reacquire an AH. 1917 */ 1918 if ((mce != NULL) && (mce->mc_fullreap)) 1919 ibd_async_reap_group(state, mce, 1920 mce->mc_info.mc_adds_vect.av_dgid, 1921 mce->mc_jstate); 1922 } 1923 mutex_exit(&state->id_ac_mutex); 1924 } 1925 1926 /* 1927 * mac handle is guaranteed to exist since driver does ibt_close_hca() 1928 * (which stops further events from being delivered) before 1929 * mac_unregister(). At this point, it is guaranteed that mac_register 1930 * has already been done. 1931 */ 1932 mutex_enter(&state->id_link_mutex); 1933 state->id_link_state = lstate; 1934 mac_link_update(state->id_mh, lstate); 1935 mutex_exit(&state->id_link_mutex); 1936 1937 ibd_async_done(state); 1938 } 1939 1940 /* 1941 * Check the pkey table to see if we can find the pkey we're looking for. 1942 * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on 1943 * failure. 1944 */ 1945 static int 1946 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey, 1947 uint16_t *pkix) 1948 { 1949 uint16_t ndx; 1950 1951 ASSERT(pkix != NULL); 1952 1953 for (ndx = 0; ndx < pkey_tbl_sz; ndx++) { 1954 if (pkey_tbl[ndx] == pkey) { 1955 *pkix = ndx; 1956 return (0); 1957 } 1958 } 1959 return (-1); 1960 } 1961 1962 /* 1963 * When the link is notified up, we need to do a few things, based 1964 * on the port's current p_init_type_reply claiming a reinit has been 1965 * done or not. The reinit steps are: 1966 * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify 1967 * the old Pkey and GID0 are correct. 1968 * 2. Register for mcg traps (already done by ibmf). 1969 * 3. If PreservePresenceReply indicates the SM has restored port's presence 1970 * in subnet, nothing more to do. Else go to next steps (on async daemon). 1971 * 4. Give up all sendonly memberships. 1972 * 5. Acquire all full memberships. 1973 * 6. In promiscuous mode, acquire all non memberships. 1974 * 7. 
Recycle all AHs to free list. 1975 */ 1976 static void 1977 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code) 1978 { 1979 ibt_hca_portinfo_t *port_infop = NULL; 1980 ibt_status_t ibt_status; 1981 uint_t psize, port_infosz; 1982 ibd_link_op_t opcode; 1983 ibd_req_t *req; 1984 link_state_t new_link_state = LINK_STATE_UP; 1985 uint8_t itreply; 1986 uint16_t pkix; 1987 int ret; 1988 1989 /* 1990 * Let's not race with a plumb or an unplumb; if we detect a 1991 * pkey relocation event later on here, we may have to restart. 1992 */ 1993 ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 1994 1995 mutex_enter(&state->id_link_mutex); 1996 1997 /* 1998 * If the init code in ibd_m_start hasn't yet set up the 1999 * pkey/gid, nothing to do; that code will set the link state. 2000 */ 2001 if (state->id_link_state == LINK_STATE_UNKNOWN) { 2002 mutex_exit(&state->id_link_mutex); 2003 goto link_mod_return; 2004 } 2005 2006 /* 2007 * If this routine was called in response to a port down event, 2008 * we just need to see if this should be informed. 2009 */ 2010 if (code == IBT_ERROR_PORT_DOWN) { 2011 new_link_state = LINK_STATE_DOWN; 2012 goto update_link_state; 2013 } 2014 2015 /* 2016 * If it's not a port down event we've received, try to get the port 2017 * attributes first. If we fail here, the port is as good as down. 2018 * Otherwise, if the link went down by the time the handler gets 2019 * here, give up - we cannot even validate the pkey/gid since those 2020 * are not valid and this is as bad as a port down anyway. 2021 */ 2022 ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 2023 &port_infop, &psize, &port_infosz); 2024 if ((ibt_status != IBT_SUCCESS) || (psize != 1) || 2025 (port_infop->p_linkstate != IBT_PORT_ACTIVE)) { 2026 new_link_state = LINK_STATE_DOWN; 2027 goto update_link_state; 2028 } 2029 2030 /* 2031 * Check the SM InitTypeReply flags. If both NoLoadReply and 2032 * PreserveContentReply are 0, we don't know anything about the 2033 * data loaded into the port attributes, so we need to verify 2034 * if gid0 and pkey are still valid. 2035 */ 2036 itreply = port_infop->p_init_type_reply; 2037 if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) && 2038 ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) { 2039 /* 2040 * Check to see if the subnet part of GID0 has changed. If 2041 * not, check the simple case first to see if the pkey 2042 * index is the same as before; finally check to see if the 2043 * pkey has been relocated to a different index in the table. 2044 */ 2045 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) 2046 if (bcmp(port_infop->p_sgid_tbl, 2047 &state->id_sgid, sizeof (ib_gid_t)) != 0) { 2048 2049 new_link_state = LINK_STATE_DOWN; 2050 2051 } else if (port_infop->p_pkey_tbl[state->id_pkix] == 2052 state->id_pkey) { 2053 2054 new_link_state = LINK_STATE_UP; 2055 2056 } else if (ibd_locate_pkey(port_infop->p_pkey_tbl, 2057 port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) { 2058 2059 ibt_free_portinfo(port_infop, port_infosz); 2060 mutex_exit(&state->id_link_mutex); 2061 2062 /* 2063 * Currently a restart is required if our pkey has moved 2064 * in the pkey table. If we get the ibt_recycle_ud() to 2065 * work as documented (expected), we may be able to 2066 * avoid a complete restart. Note that we've already 2067 * marked both the start and stop 'in-progress' flags, 2068 * so it is ok to go ahead and do this restart. 
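 *
 * To recap the checks above: a changed GID0 takes the link down, a
 * pkey still at the same index leaves it up, a pkey found at a new
 * index forces this restart, and a pkey that has vanished from the
 * table is treated as a link down.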
2069 */ 2070 (void) ibd_undo_start(state, LINK_STATE_DOWN); 2071 if ((ret = ibd_start(state)) != 0) { 2072 DPRINT(10, "ibd_restart: cannot restart, " 2073 "ret=%d", ret); 2074 } 2075 2076 goto link_mod_return; 2077 } else { 2078 new_link_state = LINK_STATE_DOWN; 2079 } 2080 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) 2081 } 2082 2083 update_link_state: 2084 if (port_infop) { 2085 ibt_free_portinfo(port_infop, port_infosz); 2086 } 2087 2088 /* 2089 * If we're reporting a link up, check InitTypeReply to see if 2090 * the SM has ensured that the port's presence in mcg, traps, 2091 * etc. is intact. 2092 */ 2093 if (new_link_state == LINK_STATE_DOWN) { 2094 opcode = IBD_LINK_DOWN; 2095 } else { 2096 if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == 2097 SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) { 2098 opcode = IBD_LINK_UP; 2099 } else { 2100 opcode = IBD_LINK_UP_ABSENT; 2101 } 2102 } 2103 2104 /* 2105 * If the old state is the same as the new state, and the SM indicated 2106 * no change in the port parameters, nothing to do. 2107 */ 2108 if ((state->id_link_state == new_link_state) && (opcode != 2109 IBD_LINK_UP_ABSENT)) { 2110 mutex_exit(&state->id_link_mutex); 2111 goto link_mod_return; 2112 } 2113 2114 /* 2115 * Ok, so there was a link state change; see if it's safe to ask 2116 * the async thread to do the work 2117 */ 2118 if (!ibd_async_safe(state)) { 2119 state->id_link_state = new_link_state; 2120 mutex_exit(&state->id_link_mutex); 2121 goto link_mod_return; 2122 } 2123 2124 mutex_exit(&state->id_link_mutex); 2125 2126 /* 2127 * Queue up a request for ibd_async_link() to handle this link 2128 * state change event 2129 */ 2130 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 2131 req->rq_ptr = (void *)opcode; 2132 ibd_queue_work_slot(state, req, IBD_ASYNC_LINK); 2133 2134 link_mod_return: 2135 ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 2136 } 2137 2138 /* 2139 * For the port up/down events, IBTL guarantees there will not be concurrent 2140 * invocations of the handler. IBTL might coalesce link transition events, 2141 * and not invoke the handler for _each_ up/down transition, but it will 2142 * invoke the handler with last known state 2143 */ 2144 static void 2145 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 2146 ibt_async_code_t code, ibt_async_event_t *event) 2147 { 2148 ibd_state_t *state = (ibd_state_t *)clnt_private; 2149 2150 switch (code) { 2151 case IBT_ERROR_CATASTROPHIC_CHAN: 2152 ibd_print_warn(state, "catastrophic channel error"); 2153 break; 2154 case IBT_ERROR_CQ: 2155 ibd_print_warn(state, "completion queue error"); 2156 break; 2157 case IBT_PORT_CHANGE_EVENT: 2158 /* 2159 * Events will be delivered to all instances that have 2160 * done ibt_open_hca() but not yet done ibt_close_hca(). 2161 * Only need to do work for our port; IBTF will deliver 2162 * events for other ports on the hca we have ibt_open_hca'ed 2163 * too. Note that id_port is initialized in ibd_attach() 2164 * before we do an ibt_open_hca() in ibd_attach(). 2165 */ 2166 ASSERT(state->id_hca_hdl == hca_hdl); 2167 if (state->id_port != event->ev_port) 2168 break; 2169 2170 if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) == 2171 IBT_PORT_CHANGE_PKEY) { 2172 ibd_link_mod(state, code); 2173 } 2174 break; 2175 case IBT_ERROR_PORT_DOWN: 2176 case IBT_CLNT_REREG_EVENT: 2177 case IBT_EVENT_PORT_UP: 2178 /* 2179 * Events will be delivered to all instances that have 2180 * done ibt_open_hca() but not yet done ibt_close_hca(). 
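 * (All three of these events funnel into ibd_link_mod(); only the
 * port-down case is special-cased there, so a client re-register is
 * handled the same way as a port up.)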
2181 * Only need to do work for our port; IBTF will deliver 2182 * events for other ports on the hca we have ibt_open_hca'ed 2183 * too. Note that id_port is initialized in ibd_attach() 2184 * before we do an ibt_open_hca() in ibd_attach(). 2185 */ 2186 ASSERT(state->id_hca_hdl == hca_hdl); 2187 if (state->id_port != event->ev_port) 2188 break; 2189 2190 ibd_link_mod(state, code); 2191 break; 2192 2193 case IBT_HCA_ATTACH_EVENT: 2194 case IBT_HCA_DETACH_EVENT: 2195 /* 2196 * When a new card is plugged to the system, attach_event is 2197 * invoked. Additionally, a cfgadm needs to be run to make the 2198 * card known to the system, and an ifconfig needs to be run to 2199 * plumb up any ibd interfaces on the card. In the case of card 2200 * unplug, a cfgadm is run that will trigger any RCM scripts to 2201 * unplumb the ibd interfaces on the card; when the card is 2202 * actually unplugged, the detach_event is invoked; 2203 * additionally, if any ibd instances are still active on the 2204 * card (eg there were no associated RCM scripts), driver's 2205 * detach routine is invoked. 2206 */ 2207 break; 2208 default: 2209 break; 2210 } 2211 } 2212 2213 static int 2214 ibd_register_mac(ibd_state_t *state, dev_info_t *dip) 2215 { 2216 mac_register_t *macp; 2217 int ret; 2218 2219 if ((macp = mac_alloc(MAC_VERSION)) == NULL) { 2220 DPRINT(10, "ibd_register_mac: mac_alloc() failed"); 2221 return (DDI_FAILURE); 2222 } 2223 2224 /* 2225 * Note that when we register with mac during attach, we don't 2226 * have the id_macaddr yet, so we'll simply be registering a 2227 * zero macaddr that we'll overwrite later during plumb (in 2228 * ibd_m_start()). Similar is the case with id_mtu - we'll 2229 * update the mac layer with the correct mtu during plumb. 2230 */ 2231 macp->m_type_ident = MAC_PLUGIN_IDENT_IB; 2232 macp->m_driver = state; 2233 macp->m_dip = dip; 2234 macp->m_src_addr = (uint8_t *)&state->id_macaddr; 2235 macp->m_callbacks = &ibd_m_callbacks; 2236 macp->m_min_sdu = 0; 2237 if (state->id_enable_rc) { 2238 macp->m_max_sdu = state->rc_mtu - IPOIB_HDRSIZE; 2239 } else { 2240 macp->m_max_sdu = IBD_DEF_MAX_SDU; 2241 } 2242 2243 /* 2244 * Register ourselves with the GLDv3 interface 2245 */ 2246 if ((ret = mac_register(macp, &state->id_mh)) != 0) { 2247 mac_free(macp); 2248 DPRINT(10, 2249 "ibd_register_mac: mac_register() failed, ret=%d", ret); 2250 return (DDI_FAILURE); 2251 } 2252 2253 mac_free(macp); 2254 return (DDI_SUCCESS); 2255 } 2256 2257 static int 2258 ibd_record_capab(ibd_state_t *state, dev_info_t *dip) 2259 { 2260 ibt_hca_attr_t hca_attrs; 2261 ibt_status_t ibt_status; 2262 2263 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) 2264 2265 /* 2266 * Query the HCA and fetch its attributes 2267 */ 2268 ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 2269 ASSERT(ibt_status == IBT_SUCCESS); 2270 2271 /* 2272 * 1. Set the Hardware Checksum capability. Currently we only consider 2273 * full checksum offload. 2274 */ 2275 if (state->id_enable_rc) { 2276 state->id_hwcksum_capab = 0; 2277 } else { 2278 if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL) 2279 == IBT_HCA_CKSUM_FULL) { 2280 state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL; 2281 } 2282 } 2283 2284 /* 2285 * 2. 
Set LSO policy, capability and maximum length 2286 */ 2287 if (state->id_enable_rc) { 2288 state->id_lso_policy = B_FALSE; 2289 state->id_lso_capable = B_FALSE; 2290 state->id_lso_maxlen = 0; 2291 } else { 2292 if (ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS 2293 |DDI_PROP_NOTPROM, IBD_PROP_LSO_POLICY, 1)) { 2294 state->id_lso_policy = B_TRUE; 2295 } else { 2296 state->id_lso_policy = B_FALSE; 2297 } 2298 2299 if (hca_attrs.hca_max_lso_size > 0) { 2300 state->id_lso_capable = B_TRUE; 2301 if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN) 2302 state->id_lso_maxlen = IBD_LSO_MAXLEN; 2303 else 2304 state->id_lso_maxlen = 2305 hca_attrs.hca_max_lso_size; 2306 } else { 2307 state->id_lso_capable = B_FALSE; 2308 state->id_lso_maxlen = 0; 2309 } 2310 } 2311 2312 /* 2313 * 3. Set Reserved L_Key capability 2314 */ 2315 if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) { 2316 state->id_hca_res_lkey_capab = 1; 2317 state->id_res_lkey = hca_attrs.hca_reserved_lkey; 2318 state->rc_enable_iov_map = B_TRUE; 2319 } else { 2320 /* If no reserved lkey, we will not use ibt_map_mem_iov */ 2321 state->rc_enable_iov_map = B_FALSE; 2322 } 2323 2324 /* 2325 * 4. Set maximum sqseg value after checking to see if extended sgl 2326 * size information is provided by the hca 2327 */ 2328 if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) { 2329 state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz; 2330 state->rc_tx_max_sqseg = hca_attrs.hca_conn_send_sgl_sz; 2331 } else { 2332 state->id_max_sqseg = hca_attrs.hca_max_sgl; 2333 state->rc_tx_max_sqseg = hca_attrs.hca_max_sgl; 2334 } 2335 if (state->id_max_sqseg > IBD_MAX_SQSEG) { 2336 state->id_max_sqseg = IBD_MAX_SQSEG; 2337 } else if (state->id_max_sqseg < IBD_MAX_SQSEG) { 2338 ibd_print_warn(state, "Set #sgl = %d instead of default %d", 2339 state->id_max_sqseg, IBD_MAX_SQSEG); 2340 } 2341 if (state->rc_tx_max_sqseg > IBD_MAX_SQSEG) { 2342 state->rc_tx_max_sqseg = IBD_MAX_SQSEG; 2343 } else if (state->rc_tx_max_sqseg < IBD_MAX_SQSEG) { 2344 ibd_print_warn(state, "RC mode: Set #sgl = %d instead of " 2345 "default %d", state->rc_tx_max_sqseg, IBD_MAX_SQSEG); 2346 } 2347 2348 /* 2349 * Translating the virtual address regions into physical regions 2350 * for using the Reserved LKey feature results in a wr sgl that 2351 * is a little longer. Since failing ibt_map_mem_iov() is costly, 2352 * we'll fix a high-water mark (65%) for when we should stop. 2353 */ 2354 state->id_max_sqseg_hiwm = (state->id_max_sqseg * 65) / 100; 2355 state->rc_max_sqseg_hiwm = (state->rc_tx_max_sqseg * 65) / 100; 2356 2357 /* 2358 * 5. 
Set number of recv and send wqes after checking hca maximum 2359 * channel size 2360 */ 2361 if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE) { 2362 state->id_num_rwqe = hca_attrs.hca_max_chan_sz; 2363 } else { 2364 state->id_num_rwqe = IBD_NUM_RWQE; 2365 } 2366 state->id_rx_bufs_outstanding_limit = state->id_num_rwqe - IBD_RWQE_MIN; 2367 if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE) { 2368 state->id_num_swqe = hca_attrs.hca_max_chan_sz; 2369 } else { 2370 state->id_num_swqe = IBD_NUM_SWQE; 2371 } 2372 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 2373 2374 return (DDI_SUCCESS); 2375 } 2376 2377 static int 2378 ibd_unattach(ibd_state_t *state, dev_info_t *dip) 2379 { 2380 int instance; 2381 uint32_t progress = state->id_mac_state; 2382 ibt_status_t ret; 2383 2384 if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) != 0) { 2385 cmn_err(CE_CONT, "ibd_detach: failed: rx bufs outstanding\n"); 2386 return (DDI_FAILURE); 2387 } 2388 2389 if (state->rc_srq_rwqe_list.dl_bufs_outstanding != 0) { 2390 cmn_err(CE_CONT, "ibd_detach: failed: srq bufs outstanding\n"); 2391 return (DDI_FAILURE); 2392 } 2393 2394 /* make sure rx resources are freed */ 2395 ibd_free_rx_rsrcs(state); 2396 2397 if (progress & IBD_DRV_RC_SRQ_ALLOCD) { 2398 ASSERT(state->id_enable_rc); 2399 ibd_rc_fini_srq_list(state); 2400 state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD); 2401 } 2402 2403 if (progress & IBD_DRV_MAC_REGISTERED) { 2404 (void) mac_unregister(state->id_mh); 2405 state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED); 2406 } 2407 2408 if (progress & IBD_DRV_PD_ALLOCD) { 2409 if ((ret = ibt_free_pd(state->id_hca_hdl, 2410 state->id_pd_hdl)) != IBT_SUCCESS) { 2411 ibd_print_warn(state, "failed to free " 2412 "protection domain, ret=%d", ret); 2413 } 2414 state->id_pd_hdl = NULL; 2415 state->id_mac_state &= (~IBD_DRV_PD_ALLOCD); 2416 } 2417 2418 if (progress & IBD_DRV_HCA_OPENED) { 2419 if ((ret = ibt_close_hca(state->id_hca_hdl)) != 2420 IBT_SUCCESS) { 2421 ibd_print_warn(state, "failed to close " 2422 "HCA device, ret=%d", ret); 2423 } 2424 state->id_hca_hdl = NULL; 2425 state->id_mac_state &= (~IBD_DRV_HCA_OPENED); 2426 } 2427 2428 mutex_enter(&ibd_gstate.ig_mutex); 2429 if (progress & IBD_DRV_IBTL_ATTACH_DONE) { 2430 if ((ret = ibt_detach(state->id_ibt_hdl)) != 2431 IBT_SUCCESS) { 2432 ibd_print_warn(state, 2433 "ibt_detach() failed, ret=%d", ret); 2434 } 2435 state->id_ibt_hdl = NULL; 2436 state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE); 2437 ibd_gstate.ig_ibt_hdl_ref_cnt--; 2438 } 2439 if ((ibd_gstate.ig_ibt_hdl_ref_cnt == 0) && 2440 (ibd_gstate.ig_ibt_hdl != NULL)) { 2441 if ((ret = ibt_detach(ibd_gstate.ig_ibt_hdl)) != 2442 IBT_SUCCESS) { 2443 ibd_print_warn(state, "ibt_detach(): global " 2444 "failed, ret=%d", ret); 2445 } 2446 ibd_gstate.ig_ibt_hdl = NULL; 2447 } 2448 mutex_exit(&ibd_gstate.ig_mutex); 2449 2450 if (progress & IBD_DRV_TXINTR_ADDED) { 2451 ddi_remove_softintr(state->id_tx); 2452 state->id_tx = NULL; 2453 state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED); 2454 } 2455 2456 if (progress & IBD_DRV_RXINTR_ADDED) { 2457 ddi_remove_softintr(state->id_rx); 2458 state->id_rx = NULL; 2459 state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED); 2460 } 2461 2462 #ifdef DEBUG 2463 if (progress & IBD_DRV_RC_PRIVATE_STATE) { 2464 kstat_delete(state->rc_ksp); 2465 state->id_mac_state &= (~IBD_DRV_RC_PRIVATE_STATE); 2466 } 2467 #endif 2468 2469 if (progress & IBD_DRV_STATE_INITIALIZED) { 2470 ibd_state_fini(state); 2471 state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED); 2472 } 2473 2474 instance = 
ddi_get_instance(dip); 2475 ddi_soft_state_free(ibd_list, instance); 2476 2477 return (DDI_SUCCESS); 2478 } 2479 2480 /* 2481 * Attach device to the IO framework. 2482 */ 2483 static int 2484 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2485 { 2486 ibd_state_t *state = NULL; 2487 ib_guid_t hca_guid; 2488 int instance; 2489 ibt_status_t ret; 2490 int rv; 2491 2492 /* 2493 * IBD doesn't support suspend/resume 2494 */ 2495 if (cmd != DDI_ATTACH) 2496 return (DDI_FAILURE); 2497 2498 /* 2499 * Allocate softstate structure 2500 */ 2501 instance = ddi_get_instance(dip); 2502 if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) 2503 return (DDI_FAILURE); 2504 state = ddi_get_soft_state(ibd_list, instance); 2505 2506 /* 2507 * Initialize mutexes and condition variables 2508 */ 2509 if (ibd_state_init(state, dip) != DDI_SUCCESS) { 2510 DPRINT(10, "ibd_attach: failed in ibd_state_init()"); 2511 goto attach_fail; 2512 } 2513 state->id_mac_state |= IBD_DRV_STATE_INITIALIZED; 2514 2515 /* 2516 * Allocate rx,tx softintr 2517 */ 2518 if (ibd_rx_softintr == 1) { 2519 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx, 2520 NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) { 2521 DPRINT(10, "ibd_attach: failed in " 2522 "ddi_add_softintr(id_rx), ret=%d", rv); 2523 goto attach_fail; 2524 } 2525 state->id_mac_state |= IBD_DRV_RXINTR_ADDED; 2526 } 2527 if (ibd_tx_softintr == 1) { 2528 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx, 2529 NULL, NULL, ibd_tx_recycle, 2530 (caddr_t)state)) != DDI_SUCCESS) { 2531 DPRINT(10, "ibd_attach: failed in " 2532 "ddi_add_softintr(id_tx), ret=%d", rv); 2533 goto attach_fail; 2534 } 2535 state->id_mac_state |= IBD_DRV_TXINTR_ADDED; 2536 } 2537 2538 /* 2539 * Obtain IBA P_Key, port number and HCA guid and validate 2540 * them (for P_Key, only full members are allowed as per 2541 * IPoIB specification; neither port number nor HCA guid 2542 * can be zero) 2543 */ 2544 if ((state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, 2545 "port-pkey", IB_PKEY_INVALID_LIMITED)) <= IB_PKEY_INVALID_FULL) { 2546 DPRINT(10, "ibd_attach: port device has wrong partition (0x%x)", 2547 state->id_pkey); 2548 goto attach_fail; 2549 } 2550 if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, 2551 "port-number", 0)) == 0) { 2552 DPRINT(10, "ibd_attach: invalid port number (%d)", 2553 state->id_port); 2554 goto attach_fail; 2555 } 2556 if ((hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0, 2557 "hca-guid", 0)) == 0) { 2558 DPRINT(10, "ibd_attach: port hca has invalid guid (0x%llx)", 2559 hca_guid); 2560 goto attach_fail; 2561 } 2562 2563 /* 2564 * Attach to IBTL 2565 */ 2566 mutex_enter(&ibd_gstate.ig_mutex); 2567 if (ibd_gstate.ig_ibt_hdl == NULL) { 2568 if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state, 2569 &ibd_gstate.ig_ibt_hdl)) != IBT_SUCCESS) { 2570 DPRINT(10, "ibd_attach: global: failed in " 2571 "ibt_attach(), ret=%d", ret); 2572 mutex_exit(&ibd_gstate.ig_mutex); 2573 goto attach_fail; 2574 } 2575 } 2576 if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state, 2577 &state->id_ibt_hdl)) != IBT_SUCCESS) { 2578 DPRINT(10, "ibd_attach: failed in ibt_attach(), ret=%d", 2579 ret); 2580 mutex_exit(&ibd_gstate.ig_mutex); 2581 goto attach_fail; 2582 } 2583 ibd_gstate.ig_ibt_hdl_ref_cnt++; 2584 mutex_exit(&ibd_gstate.ig_mutex); 2585 state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE; 2586 2587 /* 2588 * Open the HCA 2589 */ 2590 if ((ret = ibt_open_hca(state->id_ibt_hdl, hca_guid, 2591 &state->id_hca_hdl)) != IBT_SUCCESS) { 2592 DPRINT(10, 
"ibd_attach: ibt_open_hca() failed, ret=%d", ret); 2593 goto attach_fail; 2594 } 2595 state->id_mac_state |= IBD_DRV_HCA_OPENED; 2596 2597 /* Get RC config before ibd_record_capab */ 2598 ibd_rc_get_conf(state); 2599 2600 #ifdef DEBUG 2601 /* Initialize Driver Counters for Reliable Connected Mode */ 2602 if (state->id_enable_rc) { 2603 if (ibd_rc_init_stats(state) != DDI_SUCCESS) { 2604 DPRINT(10, "ibd_attach: failed in ibd_rc_init_stats"); 2605 goto attach_fail; 2606 } 2607 state->id_mac_state |= IBD_DRV_RC_PRIVATE_STATE; 2608 } 2609 #endif 2610 2611 /* 2612 * Record capabilities 2613 */ 2614 (void) ibd_record_capab(state, dip); 2615 2616 /* 2617 * Allocate a protection domain on the HCA 2618 */ 2619 if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS, 2620 &state->id_pd_hdl)) != IBT_SUCCESS) { 2621 DPRINT(10, "ibd_attach: ibt_alloc_pd() failed, ret=%d", ret); 2622 goto attach_fail; 2623 } 2624 state->id_mac_state |= IBD_DRV_PD_ALLOCD; 2625 2626 2627 /* 2628 * Register ibd interfaces with the Nemo framework 2629 */ 2630 if (ibd_register_mac(state, dip) != IBT_SUCCESS) { 2631 DPRINT(10, "ibd_attach: failed in ibd_register_mac()"); 2632 goto attach_fail; 2633 } 2634 state->id_mac_state |= IBD_DRV_MAC_REGISTERED; 2635 2636 /* 2637 * We're done with everything we could to make the attach 2638 * succeed. All the buffer allocations and IPoIB broadcast 2639 * group joins are deferred to when the interface instance 2640 * is actually plumbed to avoid wasting memory. 2641 */ 2642 return (DDI_SUCCESS); 2643 2644 attach_fail: 2645 (void) ibd_unattach(state, dip); 2646 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 2647 return (DDI_FAILURE); 2648 } 2649 2650 /* 2651 * Detach device from the IO framework. 2652 */ 2653 static int 2654 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2655 { 2656 ibd_state_t *state; 2657 int instance; 2658 2659 /* 2660 * IBD doesn't support suspend/resume 2661 */ 2662 if (cmd != DDI_DETACH) 2663 return (DDI_FAILURE); 2664 2665 /* 2666 * Get the instance softstate 2667 */ 2668 instance = ddi_get_instance(dip); 2669 state = ddi_get_soft_state(ibd_list, instance); 2670 2671 /* 2672 * Release all resources we're holding still. Note that if we'd 2673 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly 2674 * so far, we should find all the flags we need in id_mac_state. 
2675 */ 2676 return (ibd_unattach(state, dip)); 2677 } 2678 2679 /* 2680 * Pre ibt_attach() driver initialization 2681 */ 2682 static int 2683 ibd_state_init(ibd_state_t *state, dev_info_t *dip) 2684 { 2685 char buf[64]; 2686 2687 mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL); 2688 state->id_link_state = LINK_STATE_UNKNOWN; 2689 2690 mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL); 2691 cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL); 2692 state->id_trap_stop = B_TRUE; 2693 state->id_trap_inprog = 0; 2694 2695 mutex_init(&state->id_scq_poll_lock, NULL, MUTEX_DRIVER, NULL); 2696 mutex_init(&state->id_rcq_poll_lock, NULL, MUTEX_DRIVER, NULL); 2697 state->id_dip = dip; 2698 2699 mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL); 2700 2701 mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2702 mutex_init(&state->id_tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2703 mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL); 2704 state->id_tx_busy = 0; 2705 mutex_init(&state->id_lso_lock, NULL, MUTEX_DRIVER, NULL); 2706 2707 state->id_rx_list.dl_bufs_outstanding = 0; 2708 state->id_rx_list.dl_cnt = 0; 2709 mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2710 mutex_init(&state->id_rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2711 (void) sprintf(buf, "ibd_req%d", ddi_get_instance(dip)); 2712 state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t), 2713 0, NULL, NULL, NULL, NULL, NULL, 0); 2714 2715 mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL); 2716 cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL); 2717 2718 /* For Reliable Connected Mode */ 2719 mutex_init(&state->rc_rx_lock, NULL, MUTEX_DRIVER, NULL); 2720 mutex_init(&state->rc_tx_large_bufs_lock, NULL, MUTEX_DRIVER, NULL); 2721 mutex_init(&state->rc_srq_rwqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2722 mutex_init(&state->rc_srq_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2723 mutex_init(&state->rc_pass_chan_list.chan_list_mutex, NULL, 2724 MUTEX_DRIVER, NULL); 2725 2726 return (DDI_SUCCESS); 2727 } 2728 2729 /* 2730 * Post ibt_detach() driver deconstruction 2731 */ 2732 static void 2733 ibd_state_fini(ibd_state_t *state) 2734 { 2735 cv_destroy(&state->id_macst_cv); 2736 mutex_destroy(&state->id_macst_lock); 2737 2738 kmem_cache_destroy(state->id_req_kmc); 2739 2740 mutex_destroy(&state->id_rx_list.dl_mutex); 2741 mutex_destroy(&state->id_rx_free_list.dl_mutex); 2742 2743 mutex_destroy(&state->id_txpost_lock); 2744 mutex_destroy(&state->id_tx_list.dl_mutex); 2745 mutex_destroy(&state->id_tx_rel_list.dl_mutex); 2746 mutex_destroy(&state->id_lso_lock); 2747 2748 mutex_destroy(&state->id_sched_lock); 2749 mutex_destroy(&state->id_scq_poll_lock); 2750 mutex_destroy(&state->id_rcq_poll_lock); 2751 2752 cv_destroy(&state->id_trap_cv); 2753 mutex_destroy(&state->id_trap_lock); 2754 mutex_destroy(&state->id_link_mutex); 2755 2756 /* For Reliable Connected Mode */ 2757 mutex_destroy(&state->rc_srq_free_list.dl_mutex); 2758 mutex_destroy(&state->rc_srq_rwqe_list.dl_mutex); 2759 mutex_destroy(&state->rc_pass_chan_list.chan_list_mutex); 2760 mutex_destroy(&state->rc_tx_large_bufs_lock); 2761 mutex_destroy(&state->rc_rx_lock); 2762 } 2763 2764 /* 2765 * Fetch link speed from SA for snmp ifspeed reporting. 
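 *
 * The value reported is the 1X SDR data rate (2 Gb/s, i.e. 2.5 Gb/s
 * of signalling less the 8b/10b encoding overhead) scaled by the
 * width/rate multiplier of the loopback path; a 4X DDR port
 * (IBT_SRATE_20), for example, yields 8 * 2000000000 = 16 Gb/s.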
2766 */ 2767 static uint64_t 2768 ibd_get_portspeed(ibd_state_t *state) 2769 { 2770 int ret; 2771 ibt_path_info_t path; 2772 ibt_path_attr_t path_attr; 2773 uint8_t num_paths; 2774 uint64_t ifspeed; 2775 2776 /* 2777 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire 2778 * translates to 2 Gbps data rate. Thus, 1X single data rate is 2779 * 2000000000. Start with that as default. 2780 */ 2781 ifspeed = 2000000000; 2782 2783 bzero(&path_attr, sizeof (path_attr)); 2784 2785 /* 2786 * Get the port speed from Loopback path information. 2787 */ 2788 path_attr.pa_dgids = &state->id_sgid; 2789 path_attr.pa_num_dgids = 1; 2790 path_attr.pa_sgid = state->id_sgid; 2791 2792 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 2793 &path_attr, 1, &path, &num_paths) != IBT_SUCCESS) 2794 goto earlydone; 2795 2796 if (num_paths < 1) 2797 goto earlydone; 2798 2799 /* 2800 * In case SA does not return an expected value, report the default 2801 * speed as 1X. 2802 */ 2803 ret = 1; 2804 switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) { 2805 case IBT_SRATE_2: /* 1X SDR i.e 2.5 Gbps */ 2806 ret = 1; 2807 break; 2808 case IBT_SRATE_10: /* 4X SDR or 1X QDR i.e 10 Gbps */ 2809 ret = 4; 2810 break; 2811 case IBT_SRATE_30: /* 12X SDR i.e 30 Gbps */ 2812 ret = 12; 2813 break; 2814 case IBT_SRATE_5: /* 1X DDR i.e 5 Gbps */ 2815 ret = 2; 2816 break; 2817 case IBT_SRATE_20: /* 4X DDR or 8X SDR i.e 20 Gbps */ 2818 ret = 8; 2819 break; 2820 case IBT_SRATE_40: /* 8X DDR or 4X QDR i.e 40 Gbps */ 2821 ret = 16; 2822 break; 2823 case IBT_SRATE_60: /* 12X DDR i.e 60 Gbps */ 2824 ret = 24; 2825 break; 2826 case IBT_SRATE_80: /* 8X QDR i.e 80 Gbps */ 2827 ret = 32; 2828 break; 2829 case IBT_SRATE_120: /* 12X QDR i.e 120 Gbps */ 2830 ret = 48; 2831 break; 2832 } 2833 2834 ifspeed *= ret; 2835 2836 earlydone: 2837 return (ifspeed); 2838 } 2839 2840 /* 2841 * Search input mcg list (id_mc_full or id_mc_non) for an entry 2842 * representing the input mcg mgid. 2843 */ 2844 static ibd_mce_t * 2845 ibd_mcache_find(ib_gid_t mgid, struct list *mlist) 2846 { 2847 ibd_mce_t *ptr = list_head(mlist); 2848 2849 /* 2850 * Do plain linear search. 2851 */ 2852 while (ptr != NULL) { 2853 if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid, 2854 sizeof (ib_gid_t)) == 0) 2855 return (ptr); 2856 ptr = list_next(mlist, ptr); 2857 } 2858 return (NULL); 2859 } 2860 2861 /* 2862 * Execute IBA JOIN. 2863 */ 2864 static ibt_status_t 2865 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce) 2866 { 2867 ibt_mcg_attr_t mcg_attr; 2868 2869 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 2870 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 2871 mcg_attr.mc_mgid = mgid; 2872 mcg_attr.mc_join_state = mce->mc_jstate; 2873 mcg_attr.mc_scope = state->id_scope; 2874 mcg_attr.mc_pkey = state->id_pkey; 2875 mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow; 2876 mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 2877 mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass; 2878 return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info, 2879 NULL, NULL)); 2880 } 2881 2882 /* 2883 * This code JOINs the port in the proper way (depending on the join 2884 * state) so that IBA fabric will forward mcg packets to/from the port. 2885 * It also attaches the QPN to the mcg so it can receive those mcg 2886 * packets. 
This code makes sure not to attach the mcg to the QP if 2887 * that has been previously done due to the mcg being joined with a 2888 * different join state, even though this is not required by SWG_0216, 2889 * refid 3610. 2890 */ 2891 static ibd_mce_t * 2892 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 2893 { 2894 ibt_status_t ibt_status; 2895 ibd_mce_t *mce, *tmce, *omce = NULL; 2896 boolean_t do_attach = B_TRUE; 2897 2898 DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n", 2899 jstate, mgid.gid_prefix, mgid.gid_guid); 2900 2901 /* 2902 * For enable_multicast Full member joins, we need to do some 2903 * extra work. If there is already an mce on the list that 2904 * indicates full membership, that means the membership has 2905 * not yet been dropped (since the disable_multicast was issued) 2906 * because there are pending Tx's to the mcg; in that case, just 2907 * mark the mce not to be reaped when the Tx completion queues 2908 * an async reap operation. 2909 * 2910 * If there is already an mce on the list indicating sendonly 2911 * membership, try to promote to full membership. Be careful 2912 * not to deallocate the old mce, since there might be an AH 2913 * pointing to it; instead, update the old mce with new data 2914 * that tracks the full membership. 2915 */ 2916 if ((jstate == IB_MC_JSTATE_FULL) && ((omce = 2917 IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) { 2918 if (omce->mc_jstate == IB_MC_JSTATE_FULL) { 2919 ASSERT(omce->mc_fullreap); 2920 omce->mc_fullreap = B_FALSE; 2921 return (omce); 2922 } else { 2923 ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON); 2924 } 2925 } 2926 2927 /* 2928 * Allocate the ibd_mce_t to track this JOIN. 2929 */ 2930 mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP); 2931 mce->mc_fullreap = B_FALSE; 2932 mce->mc_jstate = jstate; 2933 2934 if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) { 2935 DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d", 2936 ibt_status); 2937 kmem_free(mce, sizeof (ibd_mce_t)); 2938 return (NULL); 2939 } 2940 2941 /* 2942 * Is an IBA attach required? Not if the interface is already joined 2943 * to the mcg in a different appropriate join state. 2944 */ 2945 if (jstate == IB_MC_JSTATE_NON) { 2946 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 2947 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 2948 do_attach = B_FALSE; 2949 } else if (jstate == IB_MC_JSTATE_FULL) { 2950 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 2951 do_attach = B_FALSE; 2952 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 2953 do_attach = B_FALSE; 2954 } 2955 2956 if (do_attach) { 2957 /* 2958 * Do the IBA attach. 2959 */ 2960 DPRINT(10, "ibd_join_group: ibt_attach_mcg \n"); 2961 if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl, 2962 &mce->mc_info)) != IBT_SUCCESS) { 2963 DPRINT(10, "ibd_join_group : failed qp attachment " 2964 "%d\n", ibt_status); 2965 /* 2966 * NOTE that we should probably preserve the join info 2967 * in the list and later try to leave again at detach 2968 * time. 2969 */ 2970 (void) ibt_leave_mcg(state->id_sgid, mgid, 2971 state->id_sgid, jstate); 2972 kmem_free(mce, sizeof (ibd_mce_t)); 2973 return (NULL); 2974 } 2975 } 2976 2977 /* 2978 * Insert the ibd_mce_t in the proper list. 2979 */ 2980 if (jstate == IB_MC_JSTATE_NON) { 2981 IBD_MCACHE_INSERT_NON(state, mce); 2982 } else { 2983 /* 2984 * Set up the mc_req fields used for reaping the 2985 * mcg in case of delayed tx completion (see 2986 * ibd_tx_cleanup()). 
Also done for sendonly join in 2987 * case we are promoted to fullmembership later and 2988 * keep using the same mce. 2989 */ 2990 mce->mc_req.rq_gid = mgid; 2991 mce->mc_req.rq_ptr = mce; 2992 /* 2993 * Check whether this is the case of trying to join 2994 * full member, and we were already joined send only. 2995 * We try to drop our SendOnly membership, but it is 2996 * possible that the mcg does not exist anymore (and 2997 * the subnet trap never reached us), so the leave 2998 * operation might fail. 2999 */ 3000 if (omce != NULL) { 3001 (void) ibt_leave_mcg(state->id_sgid, mgid, 3002 state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON); 3003 omce->mc_jstate = IB_MC_JSTATE_FULL; 3004 bcopy(&mce->mc_info, &omce->mc_info, 3005 sizeof (ibt_mcg_info_t)); 3006 kmem_free(mce, sizeof (ibd_mce_t)); 3007 return (omce); 3008 } 3009 mutex_enter(&state->id_mc_mutex); 3010 IBD_MCACHE_INSERT_FULL(state, mce); 3011 mutex_exit(&state->id_mc_mutex); 3012 } 3013 3014 return (mce); 3015 } 3016 3017 /* 3018 * Called during port up event handling to attempt to reacquire full 3019 * membership to an mcg. Stripped down version of ibd_join_group(). 3020 * Note that it is possible that the mcg might have gone away, and 3021 * gets recreated at this point. 3022 */ 3023 static void 3024 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce) 3025 { 3026 ib_gid_t mgid; 3027 3028 /* 3029 * If the mc_fullreap flag is set, or this join fails, a subsequent 3030 * reap/leave is going to try to leave the group. We could prevent 3031 * that by adding a boolean flag into ibd_mce_t, if required. 3032 */ 3033 if (mce->mc_fullreap) 3034 return; 3035 3036 mgid = mce->mc_info.mc_adds_vect.av_dgid; 3037 3038 DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix, 3039 mgid.gid_guid); 3040 3041 /* While reacquiring, leave and then join the MCG */ 3042 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, 3043 mce->mc_jstate); 3044 if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS) 3045 ibd_print_warn(state, "Failure on port up to rejoin " 3046 "multicast gid %016llx:%016llx", 3047 (u_longlong_t)mgid.gid_prefix, 3048 (u_longlong_t)mgid.gid_guid); 3049 } 3050 3051 /* 3052 * This code handles delayed Tx completion cleanups for mcg's to which 3053 * disable_multicast has been issued, regular mcg related cleanups during 3054 * disable_multicast, disable_promiscuous and mcg traps, as well as 3055 * cleanups during driver detach time. Depending on the join state, 3056 * it deletes the mce from the appropriate list and issues the IBA 3057 * leave/detach; except in the disable_multicast case when the mce 3058 * is left on the active list for a subsequent Tx completion cleanup. 3059 */ 3060 static void 3061 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid, 3062 uint8_t jstate) 3063 { 3064 ibd_mce_t *tmce; 3065 boolean_t do_detach = B_TRUE; 3066 3067 /* 3068 * Before detaching, we must check whether the other list 3069 * contains the mcg; if we detach blindly, the consumer 3070 * who set up the other list will also stop receiving 3071 * traffic. 3072 */ 3073 if (jstate == IB_MC_JSTATE_FULL) { 3074 /* 3075 * The following check is only relevant while coming 3076 * from the Tx completion path in the reap case. 
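 *
 * A cleared mc_fullreap here means enable_multicast re-joined the
 * group after the reap request was queued (ibd_join_group() clears
 * the flag in that case), so the delayed reap must not tear the
 * membership down.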
3077 */ 3078 if (!mce->mc_fullreap) 3079 return; 3080 mutex_enter(&state->id_mc_mutex); 3081 IBD_MCACHE_PULLOUT_FULL(state, mce); 3082 mutex_exit(&state->id_mc_mutex); 3083 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 3084 do_detach = B_FALSE; 3085 } else if (jstate == IB_MC_JSTATE_NON) { 3086 IBD_MCACHE_PULLOUT_NON(state, mce); 3087 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 3088 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 3089 do_detach = B_FALSE; 3090 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 3091 mutex_enter(&state->id_mc_mutex); 3092 IBD_MCACHE_PULLOUT_FULL(state, mce); 3093 mutex_exit(&state->id_mc_mutex); 3094 do_detach = B_FALSE; 3095 } 3096 3097 /* 3098 * If we are reacting to a mcg trap and leaving our sendonly or 3099 * non membership, the mcg is possibly already gone, so attempting 3100 * to leave might fail. On the other hand, we must try to leave 3101 * anyway, since this might be a trap from long ago, and we could 3102 * have potentially sendonly joined to a recent incarnation of 3103 * the mcg and are about to loose track of this information. 3104 */ 3105 if (do_detach) { 3106 DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : " 3107 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3108 (void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info); 3109 } 3110 3111 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate); 3112 kmem_free(mce, sizeof (ibd_mce_t)); 3113 } 3114 3115 /* 3116 * Async code executed due to multicast and promiscuous disable requests 3117 * and mcg trap handling; also executed during driver detach. Mostly, a 3118 * leave and detach is done; except for the fullmember case when Tx 3119 * requests are pending, whence arrangements are made for subsequent 3120 * cleanup on Tx completion. 3121 */ 3122 static void 3123 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 3124 { 3125 ipoib_mac_t mcmac; 3126 boolean_t recycled; 3127 ibd_mce_t *mce; 3128 3129 DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n", 3130 jstate, mgid.gid_prefix, mgid.gid_guid); 3131 3132 if (jstate == IB_MC_JSTATE_NON) { 3133 recycled = B_TRUE; 3134 mce = IBD_MCACHE_FIND_NON(state, mgid); 3135 /* 3136 * In case we are handling a mcg trap, we might not find 3137 * the mcg in the non list. 3138 */ 3139 if (mce == NULL) { 3140 return; 3141 } 3142 } else { 3143 mce = IBD_MCACHE_FIND_FULL(state, mgid); 3144 3145 /* 3146 * In case we are handling a mcg trap, make sure the trap 3147 * is not arriving late; if we have an mce that indicates 3148 * that we are already a fullmember, that would be a clear 3149 * indication that the trap arrived late (ie, is for a 3150 * previous incarnation of the mcg). 3151 */ 3152 if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) { 3153 if ((mce == NULL) || (mce->mc_jstate == 3154 IB_MC_JSTATE_FULL)) { 3155 return; 3156 } 3157 } else { 3158 ASSERT(jstate == IB_MC_JSTATE_FULL); 3159 3160 /* 3161 * If join group failed, mce will be NULL here. 3162 * This is because in GLDv3 driver, set multicast 3163 * will always return success. 3164 */ 3165 if (mce == NULL) { 3166 return; 3167 } 3168 3169 mce->mc_fullreap = B_TRUE; 3170 } 3171 3172 /* 3173 * If no pending Tx's remain that reference the AH 3174 * for the mcg, recycle it from active to free list. 
3175 * Else in the IB_MC_JSTATE_FULL case, just mark the AH, 3176 * so the last completing Tx will cause an async reap 3177 * operation to be invoked, at which time we will drop our 3178 * membership to the mcg so that the pending Tx's complete 3179 * successfully. Refer to comments on "AH and MCE active 3180 * list manipulation" at top of this file. The lock protects 3181 * against Tx fast path and Tx cleanup code. 3182 */ 3183 mutex_enter(&state->id_ac_mutex); 3184 ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid); 3185 recycled = ibd_acache_recycle(state, &mcmac, (jstate == 3186 IB_MC_JSTATE_SEND_ONLY_NON)); 3187 mutex_exit(&state->id_ac_mutex); 3188 } 3189 3190 if (recycled) { 3191 DPRINT(2, "ibd_leave_group : leave_group reaping : " 3192 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3193 ibd_async_reap_group(state, mce, mgid, jstate); 3194 } 3195 } 3196 3197 /* 3198 * Find the broadcast address as defined by IPoIB; implicitly 3199 * determines the IBA scope, mtu, tclass etc of the link the 3200 * interface is going to be a member of. 3201 */ 3202 static ibt_status_t 3203 ibd_find_bgroup(ibd_state_t *state) 3204 { 3205 ibt_mcg_attr_t mcg_attr; 3206 uint_t numg; 3207 uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL, 3208 IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL, 3209 IB_MC_SCOPE_GLOBAL }; 3210 int i, mcgmtu; 3211 boolean_t found = B_FALSE; 3212 int ret; 3213 ibt_mcg_info_t mcg_info; 3214 3215 state->id_bgroup_created = B_FALSE; 3216 3217 query_bcast_grp: 3218 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3219 mcg_attr.mc_pkey = state->id_pkey; 3220 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3221 state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK; 3222 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3223 3224 for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) { 3225 state->id_scope = mcg_attr.mc_scope = scopes[i]; 3226 3227 /* 3228 * Look for the IPoIB broadcast group. 3229 */ 3230 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3231 state->id_mgid.gid_prefix = 3232 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3233 ((uint64_t)state->id_scope << 48) | 3234 ((uint32_t)(state->id_pkey << 16))); 3235 mcg_attr.mc_mgid = state->id_mgid; 3236 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3237 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1, 3238 &state->id_mcinfo, &numg) == IBT_SUCCESS) { 3239 found = B_TRUE; 3240 break; 3241 } 3242 } 3243 3244 if (!found) { 3245 if (ibd_create_broadcast_group) { 3246 /* 3247 * If we created the broadcast group, but failed to 3248 * find it, we can't do anything except leave the 3249 * one we created and return failure. 3250 */ 3251 if (state->id_bgroup_created) { 3252 ibd_print_warn(state, "IPoIB broadcast group " 3253 "absent. 
Unable to query after create."); 3254 goto find_bgroup_fail; 3255 } 3256 3257 /* 3258 * Create the ipoib broadcast group if it didn't exist 3259 */ 3260 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3261 mcg_attr.mc_qkey = IBD_DEFAULT_QKEY; 3262 mcg_attr.mc_join_state = IB_MC_JSTATE_FULL; 3263 mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL; 3264 mcg_attr.mc_pkey = state->id_pkey; 3265 mcg_attr.mc_flow = 0; 3266 mcg_attr.mc_sl = 0; 3267 mcg_attr.mc_tclass = 0; 3268 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3269 state->id_mgid.gid_prefix = 3270 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3271 ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) | 3272 ((uint32_t)(state->id_pkey << 16))); 3273 mcg_attr.mc_mgid = state->id_mgid; 3274 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3275 3276 if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr, 3277 &mcg_info, NULL, NULL)) != IBT_SUCCESS) { 3278 ibd_print_warn(state, "IPoIB broadcast group " 3279 "absent, create failed: ret = %d\n", ret); 3280 state->id_bgroup_created = B_FALSE; 3281 return (IBT_FAILURE); 3282 } 3283 state->id_bgroup_created = B_TRUE; 3284 goto query_bcast_grp; 3285 } else { 3286 ibd_print_warn(state, "IPoIB broadcast group absent"); 3287 return (IBT_FAILURE); 3288 } 3289 } 3290 3291 /* 3292 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu. 3293 */ 3294 mcgmtu = (128 << state->id_mcinfo->mc_mtu); 3295 if (state->id_mtu < mcgmtu) { 3296 ibd_print_warn(state, "IPoIB broadcast group MTU %d " 3297 "greater than port's maximum MTU %d", mcgmtu, 3298 state->id_mtu); 3299 ibt_free_mcg_info(state->id_mcinfo, 1); 3300 goto find_bgroup_fail; 3301 } 3302 state->id_mtu = mcgmtu; 3303 3304 return (IBT_SUCCESS); 3305 3306 find_bgroup_fail: 3307 if (state->id_bgroup_created) { 3308 (void) ibt_leave_mcg(state->id_sgid, 3309 mcg_info.mc_adds_vect.av_dgid, state->id_sgid, 3310 IB_MC_JSTATE_FULL); 3311 } 3312 3313 return (IBT_FAILURE); 3314 } 3315 3316 static int 3317 ibd_alloc_tx_copybufs(ibd_state_t *state) 3318 { 3319 ibt_mr_attr_t mem_attr; 3320 3321 /* 3322 * Allocate one big chunk for all regular tx copy bufs 3323 */ 3324 state->id_tx_buf_sz = state->id_mtu; 3325 if (state->id_lso_policy && state->id_lso_capable && 3326 (IBD_TX_BUF_SZ > state->id_mtu)) { 3327 state->id_tx_buf_sz = IBD_TX_BUF_SZ; 3328 } 3329 3330 state->id_tx_bufs = kmem_zalloc(state->id_num_swqe * 3331 state->id_tx_buf_sz, KM_SLEEP); 3332 3333 state->id_tx_wqes = kmem_zalloc(state->id_num_swqe * 3334 sizeof (ibd_swqe_t), KM_SLEEP); 3335 3336 /* 3337 * Do one memory registration on the entire txbuf area 3338 */ 3339 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs; 3340 mem_attr.mr_len = state->id_num_swqe * state->id_tx_buf_sz; 3341 mem_attr.mr_as = NULL; 3342 mem_attr.mr_flags = IBT_MR_SLEEP; 3343 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3344 &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) { 3345 DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed"); 3346 kmem_free(state->id_tx_wqes, 3347 state->id_num_swqe * sizeof (ibd_swqe_t)); 3348 kmem_free(state->id_tx_bufs, 3349 state->id_num_swqe * state->id_tx_buf_sz); 3350 state->id_tx_bufs = NULL; 3351 return (DDI_FAILURE); 3352 } 3353 3354 return (DDI_SUCCESS); 3355 } 3356 3357 static int 3358 ibd_alloc_tx_lsobufs(ibd_state_t *state) 3359 { 3360 ibt_mr_attr_t mem_attr; 3361 ibd_lsobuf_t *buflist; 3362 ibd_lsobuf_t *lbufp; 3363 ibd_lsobuf_t *tail; 3364 ibd_lsobkt_t *bktp; 3365 uint8_t *membase; 3366 uint8_t *memp; 3367 uint_t memsz; 3368 int i; 3369 3370 /* 3371 * 
Allocate the lso bucket 3372 */ 3373 bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP); 3374 3375 /* 3376 * Allocate the entire lso memory and register it 3377 */ 3378 memsz = IBD_NUM_LSO_BUFS * IBD_LSO_BUFSZ; 3379 membase = kmem_zalloc(memsz, KM_SLEEP); 3380 3381 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase; 3382 mem_attr.mr_len = memsz; 3383 mem_attr.mr_as = NULL; 3384 mem_attr.mr_flags = IBT_MR_SLEEP; 3385 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, 3386 &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) { 3387 DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed"); 3388 kmem_free(membase, memsz); 3389 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3390 return (DDI_FAILURE); 3391 } 3392 3393 mutex_enter(&state->id_lso_lock); 3394 3395 /* 3396 * Now allocate the buflist. Note that the elements in the buflist and 3397 * the buffers in the lso memory have a permanent 1-1 relation, so we 3398 * can always derive the address of a buflist entry from the address of 3399 * an lso buffer. 3400 */ 3401 buflist = kmem_zalloc(IBD_NUM_LSO_BUFS * sizeof (ibd_lsobuf_t), 3402 KM_SLEEP); 3403 3404 /* 3405 * Set up the lso buf chain 3406 */ 3407 memp = membase; 3408 lbufp = buflist; 3409 for (i = 0; i < IBD_NUM_LSO_BUFS; i++) { 3410 lbufp->lb_isfree = 1; 3411 lbufp->lb_buf = memp; 3412 lbufp->lb_next = lbufp + 1; 3413 3414 tail = lbufp; 3415 3416 memp += IBD_LSO_BUFSZ; 3417 lbufp++; 3418 } 3419 tail->lb_next = NULL; 3420 3421 /* 3422 * Set up the LSO buffer information in ibd state 3423 */ 3424 bktp->bkt_bufl = buflist; 3425 bktp->bkt_free_head = buflist; 3426 bktp->bkt_mem = membase; 3427 bktp->bkt_nelem = IBD_NUM_LSO_BUFS; 3428 bktp->bkt_nfree = bktp->bkt_nelem; 3429 3430 state->id_lso = bktp; 3431 mutex_exit(&state->id_lso_lock); 3432 3433 return (DDI_SUCCESS); 3434 } 3435 3436 /* 3437 * Statically allocate Tx buffer list(s). 
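 *
 * A failure to allocate the regular copy buffers below is fatal to
 * the init, but a failure to set up the optional LSO buffer pool
 * only clears id_lso_policy, so the interface simply comes up
 * without LSO.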
3438 */ 3439 static int 3440 ibd_init_txlist(ibd_state_t *state) 3441 { 3442 ibd_swqe_t *swqe; 3443 ibt_lkey_t lkey; 3444 int i; 3445 uint_t len; 3446 uint8_t *bufaddr; 3447 3448 if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS) 3449 return (DDI_FAILURE); 3450 3451 if (state->id_lso_policy && state->id_lso_capable) { 3452 if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS) 3453 state->id_lso_policy = B_FALSE; 3454 } 3455 3456 mutex_enter(&state->id_tx_list.dl_mutex); 3457 state->id_tx_list.dl_head = NULL; 3458 state->id_tx_list.dl_pending_sends = B_FALSE; 3459 state->id_tx_list.dl_cnt = 0; 3460 mutex_exit(&state->id_tx_list.dl_mutex); 3461 mutex_enter(&state->id_tx_rel_list.dl_mutex); 3462 state->id_tx_rel_list.dl_head = NULL; 3463 state->id_tx_rel_list.dl_pending_sends = B_FALSE; 3464 state->id_tx_rel_list.dl_cnt = 0; 3465 mutex_exit(&state->id_tx_rel_list.dl_mutex); 3466 3467 /* 3468 * Allocate and setup the swqe list 3469 */ 3470 lkey = state->id_tx_mr_desc.md_lkey; 3471 bufaddr = state->id_tx_bufs; 3472 len = state->id_tx_buf_sz; 3473 swqe = state->id_tx_wqes; 3474 mutex_enter(&state->id_tx_list.dl_mutex); 3475 for (i = 0; i < state->id_num_swqe; i++, swqe++, bufaddr += len) { 3476 swqe->swqe_next = NULL; 3477 swqe->swqe_im_mblk = NULL; 3478 3479 swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t) 3480 bufaddr; 3481 swqe->swqe_copybuf.ic_sgl.ds_key = lkey; 3482 swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */ 3483 3484 swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe; 3485 swqe->w_swr.wr_flags = IBT_WR_NO_FLAGS; 3486 swqe->w_swr.wr_trans = IBT_UD_SRV; 3487 3488 /* These are set in send */ 3489 swqe->w_swr.wr_nds = 0; 3490 swqe->w_swr.wr_sgl = NULL; 3491 swqe->w_swr.wr_opcode = IBT_WRC_SEND; 3492 3493 /* add to list */ 3494 state->id_tx_list.dl_cnt++; 3495 swqe->swqe_next = state->id_tx_list.dl_head; 3496 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); 3497 } 3498 mutex_exit(&state->id_tx_list.dl_mutex); 3499 3500 return (DDI_SUCCESS); 3501 } 3502 3503 static int 3504 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p, 3505 uint32_t *nds_p) 3506 { 3507 ibd_lsobkt_t *bktp; 3508 ibd_lsobuf_t *lbufp; 3509 ibd_lsobuf_t *nextp; 3510 ibt_lkey_t lso_lkey; 3511 uint_t frag_sz; 3512 uint_t num_needed; 3513 int i; 3514 3515 ASSERT(sgl_p != NULL); 3516 ASSERT(nds_p != NULL); 3517 ASSERT(req_sz != 0); 3518 3519 /* 3520 * Determine how many bufs we'd need for the size requested 3521 */ 3522 num_needed = req_sz / IBD_LSO_BUFSZ; 3523 if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0) 3524 num_needed++; 3525 3526 mutex_enter(&state->id_lso_lock); 3527 3528 /* 3529 * If we don't have enough lso bufs, return failure 3530 */ 3531 ASSERT(state->id_lso != NULL); 3532 bktp = state->id_lso; 3533 if (bktp->bkt_nfree < num_needed) { 3534 mutex_exit(&state->id_lso_lock); 3535 return (-1); 3536 } 3537 3538 /* 3539 * Pick the first 'num_needed' bufs from the free list 3540 */ 3541 lso_lkey = bktp->bkt_mr_desc.md_lkey; 3542 lbufp = bktp->bkt_free_head; 3543 for (i = 0; i < num_needed; i++) { 3544 ASSERT(lbufp->lb_isfree != 0); 3545 ASSERT(lbufp->lb_buf != NULL); 3546 3547 nextp = lbufp->lb_next; 3548 3549 sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf; 3550 sgl_p[i].ds_key = lso_lkey; 3551 sgl_p[i].ds_len = IBD_LSO_BUFSZ; 3552 3553 lbufp->lb_isfree = 0; 3554 lbufp->lb_next = NULL; 3555 3556 lbufp = nextp; 3557 } 3558 bktp->bkt_free_head = lbufp; 3559 3560 /* 3561 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need 3562 * to adjust the last sgl entry's length. 
Since we know we need atleast 3563 * one, the i-1 use below is ok. 3564 */ 3565 if (frag_sz) { 3566 sgl_p[i-1].ds_len = frag_sz; 3567 } 3568 3569 /* 3570 * Update nfree count and return 3571 */ 3572 bktp->bkt_nfree -= num_needed; 3573 3574 mutex_exit(&state->id_lso_lock); 3575 3576 *nds_p = num_needed; 3577 3578 return (0); 3579 } 3580 3581 static void 3582 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds) 3583 { 3584 ibd_lsobkt_t *bktp; 3585 ibd_lsobuf_t *lbufp; 3586 uint8_t *lso_mem_end; 3587 uint_t ndx; 3588 int i; 3589 3590 mutex_enter(&state->id_lso_lock); 3591 3592 bktp = state->id_lso; 3593 ASSERT(bktp != NULL); 3594 3595 lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ; 3596 for (i = 0; i < nds; i++) { 3597 uint8_t *va; 3598 3599 va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va; 3600 ASSERT(va >= bktp->bkt_mem && va < lso_mem_end); 3601 3602 /* 3603 * Figure out the buflist element this sgl buffer corresponds 3604 * to and put it back at the head 3605 */ 3606 ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ; 3607 lbufp = bktp->bkt_bufl + ndx; 3608 3609 ASSERT(lbufp->lb_isfree == 0); 3610 ASSERT(lbufp->lb_buf == va); 3611 3612 lbufp->lb_isfree = 1; 3613 lbufp->lb_next = bktp->bkt_free_head; 3614 bktp->bkt_free_head = lbufp; 3615 } 3616 bktp->bkt_nfree += nds; 3617 3618 mutex_exit(&state->id_lso_lock); 3619 } 3620 3621 static void 3622 ibd_free_tx_copybufs(ibd_state_t *state) 3623 { 3624 /* 3625 * Unregister txbuf mr 3626 */ 3627 if (ibt_deregister_mr(state->id_hca_hdl, 3628 state->id_tx_mr_hdl) != IBT_SUCCESS) { 3629 DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed"); 3630 } 3631 state->id_tx_mr_hdl = NULL; 3632 3633 /* 3634 * Free txbuf memory 3635 */ 3636 kmem_free(state->id_tx_wqes, state->id_num_swqe * sizeof (ibd_swqe_t)); 3637 kmem_free(state->id_tx_bufs, state->id_num_swqe * state->id_tx_buf_sz); 3638 state->id_tx_wqes = NULL; 3639 state->id_tx_bufs = NULL; 3640 } 3641 3642 static void 3643 ibd_free_tx_lsobufs(ibd_state_t *state) 3644 { 3645 ibd_lsobkt_t *bktp; 3646 3647 mutex_enter(&state->id_lso_lock); 3648 3649 if ((bktp = state->id_lso) == NULL) { 3650 mutex_exit(&state->id_lso_lock); 3651 return; 3652 } 3653 3654 /* 3655 * First, free the buflist 3656 */ 3657 ASSERT(bktp->bkt_bufl != NULL); 3658 kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t)); 3659 3660 /* 3661 * Unregister the LSO memory and free it 3662 */ 3663 ASSERT(bktp->bkt_mr_hdl != NULL); 3664 if (ibt_deregister_mr(state->id_hca_hdl, 3665 bktp->bkt_mr_hdl) != IBT_SUCCESS) { 3666 DPRINT(10, 3667 "ibd_free_lsobufs: ibt_deregister_mr failed"); 3668 } 3669 ASSERT(bktp->bkt_mem); 3670 kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ); 3671 3672 /* 3673 * Finally free the bucket 3674 */ 3675 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3676 state->id_lso = NULL; 3677 3678 mutex_exit(&state->id_lso_lock); 3679 } 3680 3681 /* 3682 * Free the statically allocated Tx buffer list. 
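 *
 * The swqes themselves live in the id_tx_wqes array, so emptying the
 * dl_head lists is sufficient here; the array and the copy/LSO
 * buffer memory are then released by ibd_free_tx_lsobufs() and
 * ibd_free_tx_copybufs().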
3683 */ 3684 static void 3685 ibd_fini_txlist(ibd_state_t *state) 3686 { 3687 /* 3688 * Free the allocated swqes 3689 */ 3690 mutex_enter(&state->id_tx_list.dl_mutex); 3691 mutex_enter(&state->id_tx_rel_list.dl_mutex); 3692 state->id_tx_list.dl_head = NULL; 3693 state->id_tx_list.dl_pending_sends = B_FALSE; 3694 state->id_tx_list.dl_cnt = 0; 3695 state->id_tx_rel_list.dl_head = NULL; 3696 state->id_tx_rel_list.dl_pending_sends = B_FALSE; 3697 state->id_tx_rel_list.dl_cnt = 0; 3698 mutex_exit(&state->id_tx_rel_list.dl_mutex); 3699 mutex_exit(&state->id_tx_list.dl_mutex); 3700 3701 ibd_free_tx_lsobufs(state); 3702 ibd_free_tx_copybufs(state); 3703 } 3704 3705 /* 3706 * Post a list of rwqes, NULL terminated. 3707 */ 3708 static void 3709 ibd_post_recv_list(ibd_state_t *state, ibd_rwqe_t *rwqe) 3710 { 3711 uint_t i; 3712 uint_t num_posted; 3713 ibt_status_t ibt_status; 3714 ibt_recv_wr_t wrs[IBD_RX_POST_CNT]; 3715 3716 while (rwqe) { 3717 /* Post up to IBD_RX_POST_CNT receive work requests */ 3718 for (i = 0; i < IBD_RX_POST_CNT; i++) { 3719 wrs[i] = rwqe->w_rwr; 3720 rwqe = WQE_TO_RWQE(rwqe->rwqe_next); 3721 if (rwqe == NULL) { 3722 i++; 3723 break; 3724 } 3725 } 3726 3727 /* 3728 * If posting fails for some reason, we'll never receive 3729 * completion intimation, so we'll need to cleanup. But 3730 * we need to make sure we don't clean up nodes whose 3731 * wrs have been successfully posted. We assume that the 3732 * hca driver returns on the first failure to post and 3733 * therefore the first 'num_posted' entries don't need 3734 * cleanup here. 3735 */ 3736 atomic_add_32(&state->id_rx_list.dl_cnt, i); 3737 3738 num_posted = 0; 3739 ibt_status = ibt_post_recv(state->id_chnl_hdl, wrs, i, 3740 &num_posted); 3741 if (ibt_status != IBT_SUCCESS) { 3742 /* This cannot happen unless the device has an error. */ 3743 ibd_print_warn(state, "ibd_post_recv: FATAL: " 3744 "posting multiple wrs failed: " 3745 "requested=%d, done=%d, ret=%d", 3746 IBD_RX_POST_CNT, num_posted, ibt_status); 3747 atomic_add_32(&state->id_rx_list.dl_cnt, 3748 num_posted - i); 3749 } 3750 } 3751 } 3752 3753 /* 3754 * Grab a list of rwqes from the array of lists, and post the list. 3755 */ 3756 static void 3757 ibd_post_recv_intr(ibd_state_t *state) 3758 { 3759 ibd_rx_queue_t *rxp; 3760 ibd_rwqe_t *list; 3761 3762 /* rotate through the rx_queue array, expecting an adequate number */ 3763 state->id_rx_post_queue_index = 3764 (state->id_rx_post_queue_index + 1) & 3765 (state->id_rx_nqueues - 1); 3766 3767 rxp = state->id_rx_queues + state->id_rx_post_queue_index; 3768 mutex_enter(&rxp->rx_post_lock); 3769 list = WQE_TO_RWQE(rxp->rx_head); 3770 rxp->rx_head = NULL; 3771 rxp->rx_cnt = 0; 3772 mutex_exit(&rxp->rx_post_lock); 3773 ibd_post_recv_list(state, list); 3774 } 3775 3776 /* macro explained below */ 3777 #define RX_QUEUE_HASH(rwqe) \ 3778 (((uintptr_t)(rwqe) >> 8) & (state->id_rx_nqueues - 1)) 3779 3780 /* 3781 * Add a rwqe to one of the Rx lists. If the list is large enough 3782 * (exactly IBD_RX_POST_CNT), post the list to the hardware. 3783 * 3784 * Note: one of 2^N lists is chosen via a hash. This is done 3785 * because using one list causes lock contention. If the first list is busy 3786 * (mutex_tryenter fails), use a second list (just call mutex_enter). 3787 * 3788 * The shift of 8 in RX_QUEUE_HASH is an arbitrary choice that provides 3789 * an even distribution of rwqes across the 2^N queues.
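 *
 * Worked example (hypothetical address): with id_rx_nqueues set to
 * 8, an rwqe at address 0x30001a40 would hash to
 *
 *	(0x30001a40 >> 8) & (8 - 1) == 0x30001a & 7 == 2
 *
 * i.e. that rwqe would be queued on the third of the eight Rx lists.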
3790 */ 3791 static void 3792 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe) 3793 { 3794 ibd_rx_queue_t *rxp; 3795 3796 rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe); 3797 3798 if (!mutex_tryenter(&rxp->rx_post_lock)) { 3799 /* Failed. Try a different queue ("ptr + 16" ensures that). */ 3800 rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe + 16); 3801 mutex_enter(&rxp->rx_post_lock); 3802 } 3803 rwqe->rwqe_next = rxp->rx_head; 3804 if (++rxp->rx_cnt >= IBD_RX_POST_CNT - 2) { 3805 uint_t active = atomic_inc_32_nv(&state->id_rx_post_active); 3806 3807 /* only call ibt_post_recv() every Nth time through here */ 3808 if ((active & (state->id_rx_nqueues - 1)) == 0) { 3809 rxp->rx_head = NULL; 3810 rxp->rx_cnt = 0; 3811 mutex_exit(&rxp->rx_post_lock); 3812 ibd_post_recv_list(state, rwqe); 3813 return; 3814 } 3815 } 3816 rxp->rx_head = RWQE_TO_WQE(rwqe); 3817 mutex_exit(&rxp->rx_post_lock); 3818 } 3819 3820 static int 3821 ibd_alloc_rx_copybufs(ibd_state_t *state) 3822 { 3823 ibt_mr_attr_t mem_attr; 3824 int i; 3825 3826 /* 3827 * Allocate one big chunk for all regular rx copy bufs 3828 */ 3829 state->id_rx_buf_sz = state->id_mtu + IPOIB_GRH_SIZE; 3830 3831 state->id_rx_bufs = kmem_zalloc(state->id_num_rwqe * 3832 state->id_rx_buf_sz, KM_SLEEP); 3833 3834 state->id_rx_wqes = kmem_zalloc(state->id_num_rwqe * 3835 sizeof (ibd_rwqe_t), KM_SLEEP); 3836 3837 state->id_rx_nqueues = 1 << IBD_LOG_RX_POST; 3838 state->id_rx_queues = kmem_zalloc(state->id_rx_nqueues * 3839 sizeof (ibd_rx_queue_t), KM_SLEEP); 3840 for (i = 0; i < state->id_rx_nqueues; i++) { 3841 ibd_rx_queue_t *rxp = state->id_rx_queues + i; 3842 mutex_init(&rxp->rx_post_lock, NULL, MUTEX_DRIVER, NULL); 3843 } 3844 3845 /* 3846 * Do one memory registration on the entire rxbuf area 3847 */ 3848 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_rx_bufs; 3849 mem_attr.mr_len = state->id_num_rwqe * state->id_rx_buf_sz; 3850 mem_attr.mr_as = NULL; 3851 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3852 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3853 &state->id_rx_mr_hdl, &state->id_rx_mr_desc) != IBT_SUCCESS) { 3854 DPRINT(10, "ibd_alloc_rx_copybufs: ibt_register_mr failed"); 3855 kmem_free(state->id_rx_wqes, 3856 state->id_num_rwqe * sizeof (ibd_rwqe_t)); 3857 kmem_free(state->id_rx_bufs, 3858 state->id_num_rwqe * state->id_rx_buf_sz); 3859 state->id_rx_bufs = NULL; 3860 state->id_rx_wqes = NULL; 3861 return (DDI_FAILURE); 3862 } 3863 3864 return (DDI_SUCCESS); 3865 } 3866 3867 /* 3868 * Allocate the statically allocated Rx buffer list. 3869 */ 3870 static int 3871 ibd_init_rxlist(ibd_state_t *state) 3872 { 3873 ibd_rwqe_t *rwqe, *next; 3874 ibd_wqe_t *list; 3875 ibt_lkey_t lkey; 3876 int i; 3877 uint_t len; 3878 uint8_t *bufaddr; 3879 3880 mutex_enter(&state->id_rx_free_list.dl_mutex); 3881 if (state->id_rx_free_list.dl_head != NULL) { 3882 /* rx rsrcs were never freed. 
Just repost them */ 3883 len = state->id_rx_buf_sz; 3884 list = state->id_rx_free_list.dl_head; 3885 state->id_rx_free_list.dl_head = NULL; 3886 state->id_rx_free_list.dl_cnt = 0; 3887 mutex_exit(&state->id_rx_free_list.dl_mutex); 3888 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL; 3889 rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) { 3890 if ((rwqe->rwqe_im_mblk = desballoc( 3891 rwqe->rwqe_copybuf.ic_bufaddr, len, 0, 3892 &rwqe->w_freemsg_cb)) == NULL) { 3893 /* allow freemsg_cb to free the rwqes */ 3894 if (atomic_dec_32_nv(&state->id_running) != 0) { 3895 cmn_err(CE_WARN, "ibd_init_rxlist: " 3896 "id_running was not 1\n"); 3897 } 3898 DPRINT(10, "ibd_init_rxlist : " 3899 "failed in desballoc()"); 3900 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL; 3901 rwqe = next) { 3902 next = WQE_TO_RWQE(rwqe->rwqe_next); 3903 if (rwqe->rwqe_im_mblk) { 3904 atomic_inc_32(&state-> 3905 id_rx_list. 3906 dl_bufs_outstanding); 3907 freemsg(rwqe->rwqe_im_mblk); 3908 } else 3909 ibd_free_rwqe(state, rwqe); 3910 } 3911 atomic_inc_32(&state->id_running); 3912 return (DDI_FAILURE); 3913 } 3914 } 3915 ibd_post_recv_list(state, WQE_TO_RWQE(list)); 3916 return (DDI_SUCCESS); 3917 } 3918 mutex_exit(&state->id_rx_free_list.dl_mutex); 3919 3920 if (ibd_alloc_rx_copybufs(state) != DDI_SUCCESS) 3921 return (DDI_FAILURE); 3922 3923 /* 3924 * Allocate and setup the rwqe list 3925 */ 3926 len = state->id_rx_buf_sz; 3927 lkey = state->id_rx_mr_desc.md_lkey; 3928 rwqe = state->id_rx_wqes; 3929 bufaddr = state->id_rx_bufs; 3930 list = NULL; 3931 for (i = 0; i < state->id_num_rwqe; i++, rwqe++, bufaddr += len) { 3932 rwqe->w_state = state; 3933 rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb; 3934 rwqe->w_freemsg_cb.free_arg = (char *)rwqe; 3935 3936 rwqe->rwqe_copybuf.ic_bufaddr = bufaddr; 3937 3938 if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0, 3939 &rwqe->w_freemsg_cb)) == NULL) { 3940 DPRINT(10, "ibd_init_rxlist : failed in desballoc()"); 3941 /* allow freemsg_cb to free the rwqes */ 3942 if (atomic_dec_32_nv(&state->id_running) != 0) { 3943 cmn_err(CE_WARN, "ibd_init_rxlist: " 3944 "id_running was not 1\n"); 3945 } 3946 DPRINT(10, "ibd_init_rxlist : " 3947 "failed in desballoc()"); 3948 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL; 3949 rwqe = next) { 3950 next = WQE_TO_RWQE(rwqe->rwqe_next); 3951 freemsg(rwqe->rwqe_im_mblk); 3952 } 3953 atomic_inc_32(&state->id_running); 3954 3955 /* remove reference to free'd rwqes */ 3956 mutex_enter(&state->id_rx_free_list.dl_mutex); 3957 state->id_rx_free_list.dl_head = NULL; 3958 state->id_rx_free_list.dl_cnt = 0; 3959 mutex_exit(&state->id_rx_free_list.dl_mutex); 3960 3961 ibd_fini_rxlist(state); 3962 return (DDI_FAILURE); 3963 } 3964 3965 rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey; 3966 rwqe->rwqe_copybuf.ic_sgl.ds_va = 3967 (ib_vaddr_t)(uintptr_t)bufaddr; 3968 rwqe->rwqe_copybuf.ic_sgl.ds_len = len; 3969 rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe; 3970 rwqe->w_rwr.wr_nds = 1; 3971 rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl; 3972 3973 rwqe->rwqe_next = list; 3974 list = RWQE_TO_WQE(rwqe); 3975 } 3976 ibd_post_recv_list(state, WQE_TO_RWQE(list)); 3977 3978 return (DDI_SUCCESS); 3979 } 3980 3981 static void 3982 ibd_free_rx_copybufs(ibd_state_t *state) 3983 { 3984 int i; 3985 3986 /* 3987 * Unregister rxbuf mr 3988 */ 3989 if (ibt_deregister_mr(state->id_hca_hdl, 3990 state->id_rx_mr_hdl) != IBT_SUCCESS) { 3991 DPRINT(10, "ibd_free_rx_copybufs: ibt_deregister_mr failed"); 3992 } 3993 state->id_rx_mr_hdl = NULL; 3994 3995 /* 3996 * Free rxbuf memory 3997 */ 3998 for (i = 0; i < 
state->id_rx_nqueues; i++) { 3999 ibd_rx_queue_t *rxp = state->id_rx_queues + i; 4000 mutex_destroy(&rxp->rx_post_lock); 4001 } 4002 kmem_free(state->id_rx_queues, state->id_rx_nqueues * 4003 sizeof (ibd_rx_queue_t)); 4004 kmem_free(state->id_rx_wqes, state->id_num_rwqe * sizeof (ibd_rwqe_t)); 4005 kmem_free(state->id_rx_bufs, state->id_num_rwqe * state->id_rx_buf_sz); 4006 state->id_rx_queues = NULL; 4007 state->id_rx_wqes = NULL; 4008 state->id_rx_bufs = NULL; 4009 } 4010 4011 static void 4012 ibd_free_rx_rsrcs(ibd_state_t *state) 4013 { 4014 mutex_enter(&state->id_rx_free_list.dl_mutex); 4015 if (state->id_rx_free_list.dl_head == NULL) { 4016 /* already freed */ 4017 mutex_exit(&state->id_rx_free_list.dl_mutex); 4018 return; 4019 } 4020 ASSERT(state->id_rx_free_list.dl_cnt == state->id_num_rwqe); 4021 ibd_free_rx_copybufs(state); 4022 state->id_rx_free_list.dl_cnt = 0; 4023 state->id_rx_free_list.dl_head = NULL; 4024 mutex_exit(&state->id_rx_free_list.dl_mutex); 4025 } 4026 4027 /* 4028 * Free the statically allocated Rx buffer list. 4029 */ 4030 static void 4031 ibd_fini_rxlist(ibd_state_t *state) 4032 { 4033 ibd_rwqe_t *rwqe; 4034 int i; 4035 4036 /* run through the rx_queue's, calling freemsg() */ 4037 for (i = 0; i < state->id_rx_nqueues; i++) { 4038 ibd_rx_queue_t *rxp = state->id_rx_queues + i; 4039 mutex_enter(&rxp->rx_post_lock); 4040 for (rwqe = WQE_TO_RWQE(rxp->rx_head); rwqe; 4041 rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) { 4042 freemsg(rwqe->rwqe_im_mblk); 4043 rxp->rx_cnt--; 4044 } 4045 rxp->rx_head = NULL; 4046 mutex_exit(&rxp->rx_post_lock); 4047 } 4048 4049 /* cannot free rx resources unless gld returned everything */ 4050 if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) == 0) 4051 ibd_free_rx_rsrcs(state); 4052 } 4053 4054 /* 4055 * Free an allocated recv wqe. 4056 */ 4057 /* ARGSUSED */ 4058 static void 4059 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) 4060 { 4061 /* 4062 * desballoc() failed (no memory). 4063 * 4064 * This rwqe is placed on a free list so that it 4065 * can be reinstated when memory is available. 4066 * 4067 * NOTE: no code currently exists to reinstate 4068 * these "lost" rwqes. 4069 */ 4070 mutex_enter(&state->id_rx_free_list.dl_mutex); 4071 state->id_rx_free_list.dl_cnt++; 4072 rwqe->rwqe_next = state->id_rx_free_list.dl_head; 4073 state->id_rx_free_list.dl_head = RWQE_TO_WQE(rwqe); 4074 mutex_exit(&state->id_rx_free_list.dl_mutex); 4075 } 4076 4077 /* 4078 * IBA Rx completion queue handler. Guaranteed to be single 4079 * threaded and nonreentrant for this CQ. 4080 */ 4081 /* ARGSUSED */ 4082 static void 4083 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 4084 { 4085 ibd_state_t *state = (ibd_state_t *)arg; 4086 4087 atomic_inc_64(&state->id_num_intrs); 4088 4089 if (ibd_rx_softintr == 1) { 4090 mutex_enter(&state->id_rcq_poll_lock); 4091 if (state->id_rcq_poll_busy & IBD_CQ_POLLING) { 4092 state->id_rcq_poll_busy |= IBD_REDO_CQ_POLLING; 4093 mutex_exit(&state->id_rcq_poll_lock); 4094 return; 4095 } else { 4096 mutex_exit(&state->id_rcq_poll_lock); 4097 ddi_trigger_softintr(state->id_rx); 4098 } 4099 } else 4100 (void) ibd_intr((caddr_t)state); 4101 } 4102 4103 /* 4104 * CQ handler for Tx completions, when the Tx CQ is in 4105 * interrupt driven mode. 
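 *
 * The Tx handler below mirrors the Rx handler above. When the
 * softintr is already polling the CQ, the handler only sets
 * IBD_REDO_CQ_POLLING instead of triggering another softintr; the
 * CQ polling routine (not part of this section) is assumed to loop
 * along the lines of the following sketch, so that such a
 * completion is still picked up:
 *
 *	do {
 *		... drain the CQ and process the completions ...
 *		mutex_enter(&state->id_scq_poll_lock);
 *		redo = state->id_scq_poll_busy & IBD_REDO_CQ_POLLING;
 *		state->id_scq_poll_busy &= ~IBD_REDO_CQ_POLLING;
 *		mutex_exit(&state->id_scq_poll_lock);
 *	} while (redo);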
4106 */ 4107 /* ARGSUSED */ 4108 static void 4109 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 4110 { 4111 ibd_state_t *state = (ibd_state_t *)arg; 4112 4113 atomic_inc_64(&state->id_num_intrs); 4114 4115 if (ibd_tx_softintr == 1) { 4116 mutex_enter(&state->id_scq_poll_lock); 4117 if (state->id_scq_poll_busy & IBD_CQ_POLLING) { 4118 state->id_scq_poll_busy |= IBD_REDO_CQ_POLLING; 4119 mutex_exit(&state->id_scq_poll_lock); 4120 return; 4121 } else { 4122 mutex_exit(&state->id_scq_poll_lock); 4123 ddi_trigger_softintr(state->id_tx); 4124 } 4125 } else 4126 (void) ibd_tx_recycle((caddr_t)state); 4127 } 4128 4129 /* 4130 * Multicast group create/delete trap handler. These will be delivered 4131 * on a kernel thread (handling can thus block) and can be invoked 4132 * concurrently. The handler can be invoked anytime after it is 4133 * registered and before ibt_detach(). 4134 */ 4135 /* ARGSUSED */ 4136 static void 4137 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code, 4138 ibt_subnet_event_t *event) 4139 { 4140 ibd_state_t *state = (ibd_state_t *)arg; 4141 ibd_req_t *req; 4142 4143 /* 4144 * The trap handler will get invoked once for every event for 4145 * every port. The input "gid" is the GID0 of the port the 4146 * trap came in on; we just need to act on traps that came 4147 * to our port, meaning the port on which the ipoib interface 4148 * resides. Since ipoib uses GID0 of the port, we just match 4149 * the gids to check whether we need to handle the trap. 4150 */ 4151 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) 4152 if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0) 4153 return; 4154 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) 4155 4156 DPRINT(10, "ibd_notices_handler : %d\n", code); 4157 4158 switch (code) { 4159 case IBT_SM_EVENT_UNAVAILABLE: 4160 /* 4161 * If we are in promiscuous mode or have 4162 * sendnonmembers, we need to print a warning 4163 * message right now. Else, just store the 4164 * information, print when we enter promiscuous 4165 * mode or attempt nonmember send. We might 4166 * also want to stop caching sendnonmember. 4167 */ 4168 ibd_print_warn(state, "IBA multicast support " 4169 "degraded due to unavailability of multicast " 4170 "traps"); 4171 break; 4172 case IBT_SM_EVENT_AVAILABLE: 4173 /* 4174 * If we printed a warning message above or 4175 * while trying to nonmember send or get into 4176 * promiscuous mode, print an okay message. 4177 */ 4178 ibd_print_warn(state, "IBA multicast support " 4179 "restored due to availability of multicast " 4180 "traps"); 4181 break; 4182 case IBT_SM_EVENT_MCG_CREATED: 4183 case IBT_SM_EVENT_MCG_DELETED: 4184 /* 4185 * Common processing of creation/deletion traps. 4186 * First check if the instance is being 4187 * [de]initialized; back off then, without doing 4188 * anything more, since we are not sure if the 4189 * async thread is around, or whether we might 4190 * be racing with the detach code in ibd_m_stop() 4191 * that scans the mcg list. 
4192 */ 4193 if (!ibd_async_safe(state)) 4194 return; 4195 4196 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 4197 req->rq_gid = event->sm_notice_gid; 4198 req->rq_ptr = (void *)code; 4199 ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP); 4200 break; 4201 } 4202 } 4203 4204 static void 4205 ibd_async_trap(ibd_state_t *state, ibd_req_t *req) 4206 { 4207 ib_gid_t mgid = req->rq_gid; 4208 ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr; 4209 4210 DPRINT(10, "ibd_async_trap : %d\n", code); 4211 4212 /* 4213 * Atomically search the nonmember and sendonlymember lists and 4214 * delete. 4215 */ 4216 ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON); 4217 4218 if (state->id_prom_op == IBD_OP_COMPLETED) { 4219 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 4220 4221 /* 4222 * If in promiscuous mode, try to join/attach to the new 4223 * mcg. Given the unreliable out-of-order mode of trap 4224 * delivery, we can never be sure whether it is a problem 4225 * if the join fails. Thus, we warn the admin of a failure 4226 * if this was a creation trap. Note that the trap might 4227 * actually be reporting a long past event, and the mcg 4228 * might already have been deleted, thus we might be warning 4229 * in vain. 4230 */ 4231 if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == 4232 NULL) && (code == IBT_SM_EVENT_MCG_CREATED)) 4233 ibd_print_warn(state, "IBA promiscuous mode missed " 4234 "new multicast gid %016llx:%016llx", 4235 (u_longlong_t)mgid.gid_prefix, 4236 (u_longlong_t)mgid.gid_guid); 4237 } 4238 4239 /* 4240 * Free the request slot allocated by the subnet event thread. 4241 */ 4242 ibd_async_done(state); 4243 } 4244 4245 /* 4246 * GLDv3 entry point to get capabilities. 4247 */ 4248 static boolean_t 4249 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) 4250 { 4251 ibd_state_t *state = arg; 4252 4253 switch (cap) { 4254 case MAC_CAPAB_HCKSUM: { 4255 uint32_t *txflags = cap_data; 4256 4257 /* 4258 * We either do full checksum or not do it at all 4259 */ 4260 if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) 4261 *txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4; 4262 else 4263 return (B_FALSE); 4264 break; 4265 } 4266 4267 case MAC_CAPAB_LSO: { 4268 mac_capab_lso_t *cap_lso = cap_data; 4269 4270 /* 4271 * In addition to the capability and policy, since LSO 4272 * relies on hw checksum, we'll not enable LSO if we 4273 * don't have hw checksum. Of course, if the HCA doesn't 4274 * provide the reserved lkey capability, enabling LSO will 4275 * actually affect performance adversely, so we'll disable 4276 * LSO even for that case. 
4277 */ 4278 if (!state->id_lso_policy || !state->id_lso_capable) 4279 return (B_FALSE); 4280 4281 if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0) 4282 return (B_FALSE); 4283 4284 if (state->id_hca_res_lkey_capab == 0) { 4285 ibd_print_warn(state, "no reserved-lkey capability, " 4286 "disabling LSO"); 4287 return (B_FALSE); 4288 } 4289 4290 cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; 4291 cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1; 4292 break; 4293 } 4294 4295 default: 4296 return (B_FALSE); 4297 } 4298 4299 return (B_TRUE); 4300 } 4301 4302 static int 4303 ibd_get_port_details(ibd_state_t *state) 4304 { 4305 ibt_hca_portinfo_t *port_infop; 4306 ibt_status_t ret; 4307 uint_t psize, port_infosz; 4308 4309 mutex_enter(&state->id_link_mutex); 4310 4311 /* 4312 * Query for port information 4313 */ 4314 ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 4315 &port_infop, &psize, &port_infosz); 4316 if ((ret != IBT_SUCCESS) || (psize != 1)) { 4317 mutex_exit(&state->id_link_mutex); 4318 DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() " 4319 "failed, ret=%d", ret); 4320 return (ENETDOWN); 4321 } 4322 4323 /* 4324 * If the link already went down by the time we get here, 4325 * give up 4326 */ 4327 if (port_infop->p_linkstate != IBT_PORT_ACTIVE) { 4328 mutex_exit(&state->id_link_mutex); 4329 ibt_free_portinfo(port_infop, port_infosz); 4330 DPRINT(10, "ibd_get_port_details: port is not active"); 4331 return (ENETDOWN); 4332 } 4333 4334 /* 4335 * If the link is active, verify the pkey 4336 */ 4337 if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port, 4338 state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) { 4339 mutex_exit(&state->id_link_mutex); 4340 ibt_free_portinfo(port_infop, port_infosz); 4341 DPRINT(10, "ibd_get_port_details: ibt_pkey2index " 4342 "failed, ret=%d", ret); 4343 return (ENONET); 4344 } 4345 4346 state->id_mtu = (128 << port_infop->p_mtu); 4347 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) 4348 state->id_sgid = *port_infop->p_sgid_tbl; 4349 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) 4350 state->id_link_state = LINK_STATE_UP; 4351 4352 mutex_exit(&state->id_link_mutex); 4353 ibt_free_portinfo(port_infop, port_infosz); 4354 4355 /* 4356 * Now that the port is active, record the port speed 4357 */ 4358 state->id_link_speed = ibd_get_portspeed(state); 4359 4360 return (0); 4361 } 4362 4363 static int 4364 ibd_alloc_cqs(ibd_state_t *state) 4365 { 4366 ibt_hca_attr_t hca_attrs; 4367 ibt_cq_attr_t cq_attr; 4368 ibt_status_t ret; 4369 uint32_t real_size; 4370 4371 ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 4372 ASSERT(ret == IBT_SUCCESS); 4373 4374 /* 4375 * Allocate Rx/combined CQ: 4376 * Theoretically, there is no point in having more than #rwqe 4377 * plus #swqe cqe's, except that the CQ will be signaled for 4378 * overflow when the last wqe completes, if none of the previous 4379 * cqe's have been polled. Thus, we allocate just a few less wqe's 4380 * to make sure such overflow does not occur. 4381 */ 4382 cq_attr.cq_sched = NULL; 4383 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 4384 4385 /* 4386 * Allocate Receive CQ. 
4387 */ 4388 if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) { 4389 cq_attr.cq_size = state->id_num_rwqe + 1; 4390 } else { 4391 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 4392 state->id_num_rwqe = cq_attr.cq_size - 1; 4393 } 4394 4395 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 4396 &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) { 4397 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) " 4398 "failed, ret=%d\n", ret); 4399 return (DDI_FAILURE); 4400 } 4401 4402 if ((ret = ibt_modify_cq(state->id_rcq_hdl, 4403 ibd_rxcomp_count, ibd_rxcomp_usec, 0)) != IBT_SUCCESS) { 4404 DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt " 4405 "moderation failed, ret=%d\n", ret); 4406 } 4407 4408 /* make the #rx wc's the same as max rx chain size */ 4409 state->id_rxwcs_size = IBD_MAX_RX_MP_LEN; 4410 state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * 4411 state->id_rxwcs_size, KM_SLEEP); 4412 4413 /* 4414 * Allocate Send CQ. 4415 */ 4416 if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) { 4417 cq_attr.cq_size = state->id_num_swqe + 1; 4418 } else { 4419 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 4420 state->id_num_swqe = cq_attr.cq_size - 1; 4421 } 4422 4423 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 4424 &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) { 4425 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) " 4426 "failed, ret=%d\n", ret); 4427 kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * 4428 state->id_rxwcs_size); 4429 (void) ibt_free_cq(state->id_rcq_hdl); 4430 return (DDI_FAILURE); 4431 } 4432 if ((ret = ibt_modify_cq(state->id_scq_hdl, 4433 ibd_txcomp_count, ibd_txcomp_usec, 0)) != IBT_SUCCESS) { 4434 DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt " 4435 "moderation failed, ret=%d\n", ret); 4436 } 4437 4438 state->id_txwcs_size = IBD_TX_POLL_THRESH; 4439 state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) * 4440 state->id_txwcs_size, KM_SLEEP); 4441 4442 /* 4443 * Print message in case we could not allocate as many wqe's 4444 * as was requested. 
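 *
 * Worked example (hypothetical HCA limit): if the HCA reported
 * hca_max_cq_sz of 1024, the clamping above would have set cq_size
 * to 1024 and id_num_rwqe to 1023, and the message below would
 * report that value instead of the configured IBD_NUM_RWQE.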
4445 */ 4446 if (state->id_num_rwqe != IBD_NUM_RWQE) { 4447 ibd_print_warn(state, "Setting #rwqe = %d instead of default " 4448 "%d", state->id_num_rwqe, IBD_NUM_RWQE); 4449 } 4450 if (state->id_num_swqe != IBD_NUM_SWQE) { 4451 ibd_print_warn(state, "Setting #swqe = %d instead of default " 4452 "%d", state->id_num_swqe, IBD_NUM_SWQE); 4453 } 4454 4455 return (DDI_SUCCESS); 4456 } 4457 4458 static int 4459 ibd_setup_ud_channel(ibd_state_t *state) 4460 { 4461 ibt_ud_chan_alloc_args_t ud_alloc_attr; 4462 ibt_ud_chan_query_attr_t ud_chan_attr; 4463 ibt_status_t ret; 4464 4465 ud_alloc_attr.ud_flags = IBT_ALL_SIGNALED; 4466 if (state->id_hca_res_lkey_capab) 4467 ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY; 4468 if (state->id_lso_policy && state->id_lso_capable) 4469 ud_alloc_attr.ud_flags |= IBT_USES_LSO; 4470 4471 ud_alloc_attr.ud_hca_port_num = state->id_port; 4472 ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg; 4473 ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG; 4474 ud_alloc_attr.ud_sizes.cs_sq = state->id_num_swqe; 4475 ud_alloc_attr.ud_sizes.cs_rq = state->id_num_rwqe; 4476 ud_alloc_attr.ud_qkey = state->id_mcinfo->mc_qkey; 4477 ud_alloc_attr.ud_scq = state->id_scq_hdl; 4478 ud_alloc_attr.ud_rcq = state->id_rcq_hdl; 4479 ud_alloc_attr.ud_pd = state->id_pd_hdl; 4480 ud_alloc_attr.ud_pkey_ix = state->id_pkix; 4481 ud_alloc_attr.ud_clone_chan = NULL; 4482 4483 if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS, 4484 &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) { 4485 DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() " 4486 "failed, ret=%d\n", ret); 4487 return (DDI_FAILURE); 4488 } 4489 4490 if ((ret = ibt_query_ud_channel(state->id_chnl_hdl, 4491 &ud_chan_attr)) != IBT_SUCCESS) { 4492 DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() " 4493 "failed, ret=%d\n", ret); 4494 (void) ibt_free_channel(state->id_chnl_hdl); 4495 return (DDI_FAILURE); 4496 } 4497 4498 state->id_qpnum = ud_chan_attr.ud_qpn; 4499 4500 return (DDI_SUCCESS); 4501 } 4502 4503 static int 4504 ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state) 4505 { 4506 uint32_t progress = state->id_mac_state; 4507 uint_t attempts; 4508 ibt_status_t ret; 4509 ib_gid_t mgid; 4510 ibd_mce_t *mce; 4511 uint8_t jstate; 4512 4513 if (atomic_dec_32_nv(&state->id_running) != 0) 4514 cmn_err(CE_WARN, "ibd_undo_start: id_running was not 1\n"); 4515 4516 /* 4517 * Before we try to stop/undo whatever we did in ibd_start(), 4518 * we need to mark the link state appropriately to prevent the 4519 * ip layer from using this instance for any new transfers. Note 4520 * that if the original state of the link was "up" when we're 4521 * here, we'll set the final link state to "unknown", to behave 4522 * in the same fashion as other ethernet drivers. 
4523 */ 4524 mutex_enter(&state->id_link_mutex); 4525 if (cur_link_state == LINK_STATE_DOWN) { 4526 state->id_link_state = cur_link_state; 4527 } else { 4528 state->id_link_state = LINK_STATE_UNKNOWN; 4529 } 4530 mutex_exit(&state->id_link_mutex); 4531 mac_link_update(state->id_mh, state->id_link_state); 4532 4533 state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED); 4534 if (progress & IBD_DRV_STARTED) { 4535 state->id_mac_state &= (~IBD_DRV_STARTED); 4536 } 4537 4538 /* Stop listen under Reliable Connected Mode */ 4539 if (progress & IBD_DRV_RC_LISTEN) { 4540 ASSERT(state->id_enable_rc); 4541 if (state->rc_listen_hdl != NULL) { 4542 ibd_rc_stop_listen(state); 4543 } 4544 state->id_mac_state &= (~IBD_DRV_RC_LISTEN); 4545 } 4546 4547 if ((state->id_enable_rc) && (progress & IBD_DRV_ACACHE_INITIALIZED)) { 4548 (void) ibd_rc_close_all_chan(state); 4549 } 4550 4551 /* 4552 * First, stop receive interrupts; this stops the driver from 4553 * handing up buffers to higher layers. Wait for receive buffers 4554 * to be returned and give up after 1 second. 4555 */ 4556 if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) { 4557 attempts = 10; 4558 while (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 4559 0) > 0) { 4560 delay(drv_usectohz(100000)); 4561 if (--attempts == 0) { 4562 /* 4563 * There are pending bufs with the network 4564 * layer and we have no choice but to wait 4565 * for them to be done with. Reap all the 4566 * Tx/Rx completions that were posted since 4567 * we turned off the notification and 4568 * return failure. 4569 */ 4570 cmn_err(CE_CONT, "!ibd: bufs outstanding\n"); 4571 DPRINT(2, "ibd_undo_start: " 4572 "reclaiming failed"); 4573 break; 4574 } 4575 } 4576 state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED); 4577 } 4578 4579 if (progress & IBD_DRV_RC_LARGEBUF_ALLOCD) { 4580 ibd_rc_fini_tx_largebuf_list(state); 4581 state->id_mac_state &= (~IBD_DRV_RC_LARGEBUF_ALLOCD); 4582 } 4583 4584 if (progress & IBD_DRV_RC_SRQ_ALLOCD) { 4585 ASSERT(state->id_enable_rc); 4586 if (state->rc_srq_rwqe_list.dl_bufs_outstanding == 0) { 4587 ibd_rc_fini_srq_list(state); 4588 state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD); 4589 } else { 4590 cmn_err(CE_CONT, "ibd_undo_start: srq bufs " 4591 "outstanding\n"); 4592 } 4593 } 4594 4595 if (progress & IBD_DRV_SM_NOTICES_REGISTERED) { 4596 ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL); 4597 4598 mutex_enter(&state->id_trap_lock); 4599 state->id_trap_stop = B_TRUE; 4600 while (state->id_trap_inprog > 0) 4601 cv_wait(&state->id_trap_cv, &state->id_trap_lock); 4602 mutex_exit(&state->id_trap_lock); 4603 4604 state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED); 4605 } 4606 4607 if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) { 4608 /* 4609 * Flushing the channel ensures that all pending WQE's 4610 * are marked with flush_error and handed to the CQ. It 4611 * does not guarantee the invocation of the CQ handler. 4612 * This call is guaranteed to return successfully for 4613 * UD QPNs. 4614 */ 4615 if ((ret = ibt_flush_channel(state->id_chnl_hdl)) != 4616 IBT_SUCCESS) { 4617 DPRINT(10, "ibd_undo_start: flush_channel " 4618 "failed, ret=%d", ret); 4619 } 4620 4621 /* 4622 * Give some time for the TX CQ handler to process the 4623 * completions. 
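 *
 * The wait below is bounded: at most 10 attempts with a
 * drv_usectohz(100000) delay between them, i.e. roughly one second
 * in total, before we give up and warn that the tx resources were
 * not freed.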
4624 */ 4625 mutex_enter(&state->id_tx_list.dl_mutex); 4626 mutex_enter(&state->id_tx_rel_list.dl_mutex); 4627 attempts = 10; 4628 while (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt 4629 != state->id_num_swqe) { 4630 if (--attempts == 0) 4631 break; 4632 mutex_exit(&state->id_tx_rel_list.dl_mutex); 4633 mutex_exit(&state->id_tx_list.dl_mutex); 4634 delay(drv_usectohz(100000)); 4635 mutex_enter(&state->id_tx_list.dl_mutex); 4636 mutex_enter(&state->id_tx_rel_list.dl_mutex); 4637 } 4638 ibt_set_cq_handler(state->id_scq_hdl, 0, 0); 4639 if (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt != 4640 state->id_num_swqe) { 4641 cmn_err(CE_WARN, "tx resources not freed\n"); 4642 } 4643 mutex_exit(&state->id_tx_rel_list.dl_mutex); 4644 mutex_exit(&state->id_tx_list.dl_mutex); 4645 4646 attempts = 10; 4647 while (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) { 4648 if (--attempts == 0) 4649 break; 4650 delay(drv_usectohz(100000)); 4651 } 4652 ibt_set_cq_handler(state->id_rcq_hdl, 0, 0); 4653 if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) { 4654 cmn_err(CE_WARN, "rx resources not freed\n"); 4655 } 4656 4657 state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED); 4658 } 4659 4660 if (progress & IBD_DRV_ASYNC_THR_CREATED) { 4661 /* 4662 * No new async requests will be posted since the device 4663 * link state has been marked as unknown; completion handlers 4664 * have been turned off, so Tx handler will not cause any 4665 * more IBD_ASYNC_REAP requests. 4666 * 4667 * Queue a request for the async thread to exit, which will 4668 * be serviced after any pending ones. This can take a while, 4669 * specially if the SM is unreachable, since IBMF will slowly 4670 * timeout each SM request issued by the async thread. Reap 4671 * the thread before continuing on, we do not want it to be 4672 * lingering in modunloaded code (or we could move the reap 4673 * to ibd_detach(), provided we keep track of the current 4674 * id_async_thrid somewhere safe). 4675 */ 4676 ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT); 4677 thread_join(state->id_async_thrid); 4678 4679 state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED); 4680 } 4681 4682 if (progress & IBD_DRV_BCAST_GROUP_JOINED) { 4683 /* 4684 * Drop all residual full/non membership. This includes full 4685 * membership to the broadcast group, and any nonmembership 4686 * acquired during transmits. We do this after the Tx completion 4687 * handlers are done, since those might result in some late 4688 * leaves; this also eliminates a potential race with that 4689 * path wrt the mc full list insert/delete. Trap handling 4690 * has also been suppressed at this point. Thus, no locks 4691 * are required while traversing the mc full list. 
4692 */ 4693 DPRINT(2, "ibd_undo_start: clear full cache entries"); 4694 mce = list_head(&state->id_mc_full); 4695 while (mce != NULL) { 4696 mgid = mce->mc_info.mc_adds_vect.av_dgid; 4697 jstate = mce->mc_jstate; 4698 mce = list_next(&state->id_mc_full, mce); 4699 ibd_leave_group(state, mgid, jstate); 4700 } 4701 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED); 4702 } 4703 4704 if (progress & IBD_DRV_RXLIST_ALLOCD) { 4705 ibd_fini_rxlist(state); 4706 state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD); 4707 } 4708 4709 if (progress & IBD_DRV_TXLIST_ALLOCD) { 4710 ibd_fini_txlist(state); 4711 state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD); 4712 } 4713 4714 if (progress & IBD_DRV_UD_CHANNEL_SETUP) { 4715 if ((ret = ibt_free_channel(state->id_chnl_hdl)) != 4716 IBT_SUCCESS) { 4717 DPRINT(10, "ibd_undo_start: free_channel " 4718 "failed, ret=%d", ret); 4719 } 4720 4721 state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP); 4722 } 4723 4724 if (progress & IBD_DRV_CQS_ALLOCD) { 4725 kmem_free(state->id_txwcs, 4726 sizeof (ibt_wc_t) * state->id_txwcs_size); 4727 if ((ret = ibt_free_cq(state->id_scq_hdl)) != 4728 IBT_SUCCESS) { 4729 DPRINT(10, "ibd_undo_start: free_cq(scq) " 4730 "failed, ret=%d", ret); 4731 } 4732 4733 kmem_free(state->id_rxwcs, 4734 sizeof (ibt_wc_t) * state->id_rxwcs_size); 4735 if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) { 4736 DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, " 4737 "ret=%d", ret); 4738 } 4739 4740 state->id_txwcs = NULL; 4741 state->id_rxwcs = NULL; 4742 state->id_scq_hdl = NULL; 4743 state->id_rcq_hdl = NULL; 4744 4745 state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD); 4746 } 4747 4748 if (progress & IBD_DRV_ACACHE_INITIALIZED) { 4749 mutex_enter(&state->id_ac_mutex); 4750 mod_hash_destroy_hash(state->id_ah_active_hash); 4751 mutex_exit(&state->id_ac_mutex); 4752 ibd_acache_fini(state); 4753 4754 state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED); 4755 } 4756 4757 if (progress & IBD_DRV_BCAST_GROUP_FOUND) { 4758 /* 4759 * If we'd created the ipoib broadcast group and had 4760 * successfully joined it, leave it now 4761 */ 4762 if (state->id_bgroup_created) { 4763 mgid = state->id_mcinfo->mc_adds_vect.av_dgid; 4764 jstate = IB_MC_JSTATE_FULL; 4765 (void) ibt_leave_mcg(state->id_sgid, mgid, 4766 state->id_sgid, jstate); 4767 } 4768 ibt_free_mcg_info(state->id_mcinfo, 1); 4769 4770 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND); 4771 } 4772 4773 return (DDI_SUCCESS); 4774 } 4775 4776 /* 4777 * These pair of routines are used to set/clear the condition that 4778 * the caller is likely to do something to change the id_mac_state. 4779 * If there's already someone doing either a start or a stop (possibly 4780 * due to the async handler detecting a pkey relocation event, a plumb 4781 * or dlpi_open, or an unplumb or dlpi_close coming in), we wait until 4782 * that's done. 4783 */ 4784 static void 4785 ibd_set_mac_progress(ibd_state_t *state, uint_t flag) 4786 { 4787 mutex_enter(&state->id_macst_lock); 4788 while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS) 4789 cv_wait(&state->id_macst_cv, &state->id_macst_lock); 4790 4791 state->id_mac_state |= flag; 4792 mutex_exit(&state->id_macst_lock); 4793 } 4794 4795 static void 4796 ibd_clr_mac_progress(ibd_state_t *state, uint_t flag) 4797 { 4798 mutex_enter(&state->id_macst_lock); 4799 state->id_mac_state &= (~flag); 4800 cv_signal(&state->id_macst_cv); 4801 mutex_exit(&state->id_macst_lock); 4802 } 4803 4804 /* 4805 * GLDv3 entry point to start hardware. 
4806 */ 4807 /*ARGSUSED*/ 4808 static int 4809 ibd_m_start(void *arg) 4810 { 4811 ibd_state_t *state = arg; 4812 int ret; 4813 4814 ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 4815 4816 ret = ibd_start(state); 4817 4818 ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 4819 4820 return (ret); 4821 } 4822 4823 static int 4824 ibd_start(ibd_state_t *state) 4825 { 4826 kthread_t *kht; 4827 int err; 4828 ibt_status_t ret; 4829 4830 if (state->id_mac_state & IBD_DRV_STARTED) 4831 return (DDI_SUCCESS); 4832 4833 if (atomic_inc_32_nv(&state->id_running) != 1) { 4834 DPRINT(10, "ibd_start: id_running is non-zero"); 4835 cmn_err(CE_WARN, "ibd_start: id_running was not 0\n"); 4836 atomic_dec_32(&state->id_running); 4837 return (EINVAL); 4838 } 4839 4840 /* 4841 * Get port details; if we fail here, very likely the port 4842 * state is inactive or the pkey can't be verified. 4843 */ 4844 if ((err = ibd_get_port_details(state)) != 0) { 4845 DPRINT(10, "ibd_start: ibd_get_port_details() failed"); 4846 goto start_fail; 4847 } 4848 state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED; 4849 4850 /* 4851 * Find the IPoIB broadcast group 4852 */ 4853 if (ibd_find_bgroup(state) != IBT_SUCCESS) { 4854 DPRINT(10, "ibd_start: ibd_find_bgroup() failed"); 4855 err = ENOTACTIVE; 4856 goto start_fail; 4857 } 4858 state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND; 4859 4860 /* 4861 * Initialize per-interface caches and lists; if we fail here, 4862 * it is most likely due to a lack of resources 4863 */ 4864 if (ibd_acache_init(state) != DDI_SUCCESS) { 4865 DPRINT(10, "ibd_start: ibd_acache_init() failed"); 4866 err = ENOMEM; 4867 goto start_fail; 4868 } 4869 state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED; 4870 4871 /* 4872 * Allocate send and receive completion queues 4873 */ 4874 if (ibd_alloc_cqs(state) != DDI_SUCCESS) { 4875 DPRINT(10, "ibd_start: ibd_alloc_cqs() failed"); 4876 err = ENOMEM; 4877 goto start_fail; 4878 } 4879 state->id_mac_state |= IBD_DRV_CQS_ALLOCD; 4880 4881 /* 4882 * Setup a UD channel 4883 */ 4884 if (ibd_setup_ud_channel(state) != DDI_SUCCESS) { 4885 err = ENOMEM; 4886 DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed"); 4887 goto start_fail; 4888 } 4889 state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP; 4890 4891 /* 4892 * Allocate and initialize the tx buffer list 4893 */ 4894 if (ibd_init_txlist(state) != DDI_SUCCESS) { 4895 DPRINT(10, "ibd_start: ibd_init_txlist() failed"); 4896 err = ENOMEM; 4897 goto start_fail; 4898 } 4899 state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD; 4900 4901 /* 4902 * Create the send cq handler here 4903 */ 4904 ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state); 4905 if ((ret = ibt_enable_cq_notify(state->id_scq_hdl, 4906 IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { 4907 DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) " 4908 "failed, ret=%d", ret); 4909 err = EINVAL; 4910 goto start_fail; 4911 } 4912 state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED; 4913 4914 /* 4915 * Allocate and initialize the rx buffer list 4916 */ 4917 if (ibd_init_rxlist(state) != DDI_SUCCESS) { 4918 DPRINT(10, "ibd_start: ibd_init_rxlist() failed"); 4919 err = ENOMEM; 4920 goto start_fail; 4921 } 4922 state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD; 4923 4924 /* 4925 * Join IPoIB broadcast group 4926 */ 4927 if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) { 4928 DPRINT(10, "ibd_start: ibd_join_group() failed"); 4929 err = ENOTACTIVE; 4930 goto start_fail; 4931 } 4932 state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED; 4933 4934 /* 4935 * Create 
the async thread; thread_create never fails. 4936 */ 4937 kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0, 4938 TS_RUN, minclsyspri); 4939 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_async_thrid)) 4940 state->id_async_thrid = kht->t_did; 4941 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_async_thrid)) 4942 state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED; 4943 4944 /* 4945 * When we did mac_register() in ibd_attach(), we didn't register 4946 * the real macaddr and we didn't have the true port mtu. Now that 4947 * we're almost ready, set the local mac address and broadcast 4948 * addresses and update gldv3 about the real values of these 4949 * parameters. 4950 */ 4951 if (state->id_enable_rc) { 4952 ibd_h2n_mac(&state->id_macaddr, 4953 IBD_MAC_ADDR_RC + state->id_qpnum, 4954 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 4955 ibd_h2n_mac(&state->rc_macaddr_loopback, state->id_qpnum, 4956 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 4957 } else { 4958 ibd_h2n_mac(&state->id_macaddr, state->id_qpnum, 4959 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 4960 } 4961 ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK, 4962 state->id_mgid.gid_prefix, state->id_mgid.gid_guid); 4963 4964 if (!state->id_enable_rc) { 4965 (void) mac_maxsdu_update(state->id_mh, state->id_mtu 4966 - IPOIB_HDRSIZE); 4967 } 4968 mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr); 4969 4970 /* 4971 * Setup the receive cq handler 4972 */ 4973 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); 4974 if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl, 4975 IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { 4976 DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) " 4977 "failed, ret=%d", ret); 4978 err = EINVAL; 4979 goto start_fail; 4980 } 4981 state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED; 4982 4983 /* 4984 * Setup the subnet notices handler after we've initialized the acache/ 4985 * mcache and started the async thread, both of which are required for 4986 * the trap handler to function properly. 4987 * 4988 * Now that the async thread has been started (and we've already done 4989 * a mac_register() during attach so mac_tx_update() can be called 4990 * if necessary without any problem), we can enable the trap handler 4991 * to queue requests to the async thread. 
4992 */ 4993 ibt_register_subnet_notices(state->id_ibt_hdl, 4994 ibd_snet_notices_handler, state); 4995 mutex_enter(&state->id_trap_lock); 4996 state->id_trap_stop = B_FALSE; 4997 mutex_exit(&state->id_trap_lock); 4998 state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED; 4999 5000 if (state->id_enable_rc) { 5001 if (state->rc_enable_srq) { 5002 if (state->id_mac_state & IBD_DRV_RC_SRQ_ALLOCD) { 5003 if (ibd_rc_repost_srq_free_list(state) != 5004 IBT_SUCCESS) { 5005 err = ENOMEM; 5006 goto start_fail; 5007 } 5008 } else { 5009 /* Allocate SRQ resource */ 5010 if (ibd_rc_init_srq_list(state) != 5011 IBT_SUCCESS) { 5012 err = ENOMEM; 5013 goto start_fail; 5014 } 5015 state->id_mac_state |= IBD_DRV_RC_SRQ_ALLOCD; 5016 } 5017 } 5018 5019 if (ibd_rc_init_tx_largebuf_list(state) != IBT_SUCCESS) { 5020 DPRINT(10, "ibd_start: ibd_rc_init_tx_largebuf_list() " 5021 "failed"); 5022 err = ENOMEM; 5023 goto start_fail; 5024 } 5025 state->id_mac_state |= IBD_DRV_RC_LARGEBUF_ALLOCD; 5026 5027 /* RC: begin to listen only after everything is available */ 5028 if (ibd_rc_listen(state) != IBT_SUCCESS) { 5029 DPRINT(10, "ibd_start: ibd_rc_listen() failed"); 5030 err = EINVAL; 5031 goto start_fail; 5032 } 5033 state->id_mac_state |= IBD_DRV_RC_LISTEN; 5034 } 5035 5036 /* 5037 * Indicate link status to GLDv3 and higher layers. By default, 5038 * we assume we are in up state (which must have been true at 5039 * least at the time the broadcast mcg's were probed); if there 5040 * were any up/down transitions till the time we come here, the 5041 * async handler will have updated last known state, which we 5042 * use to tell GLDv3. The async handler will not send any 5043 * notifications to GLDv3 till we reach here in the initialization 5044 * sequence. 5045 */ 5046 state->id_mac_state |= IBD_DRV_STARTED; 5047 mac_link_update(state->id_mh, state->id_link_state); 5048 5049 return (DDI_SUCCESS); 5050 5051 start_fail: 5052 /* 5053 * If we ran into a problem during ibd_start() and ran into 5054 * some other problem during undoing our partial work, we can't 5055 * do anything about it. Ignore any errors we might get from 5056 * ibd_undo_start() and just return the original error we got. 5057 */ 5058 (void) ibd_undo_start(state, LINK_STATE_DOWN); 5059 return (err); 5060 } 5061 5062 /* 5063 * GLDv3 entry point to stop hardware from receiving packets. 5064 */ 5065 /*ARGSUSED*/ 5066 static void 5067 ibd_m_stop(void *arg) 5068 { 5069 ibd_state_t *state = (ibd_state_t *)arg; 5070 5071 ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS); 5072 5073 (void) ibd_undo_start(state, state->id_link_state); 5074 5075 ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS); 5076 } 5077 5078 /* 5079 * GLDv3 entry point to modify device's mac address. We do not 5080 * allow address modifications. 5081 */ 5082 static int 5083 ibd_m_unicst(void *arg, const uint8_t *macaddr) 5084 { 5085 ibd_state_t *state = arg; 5086 5087 /* 5088 * Don't bother even comparing the macaddr if we haven't 5089 * completed ibd_m_start(). 5090 */ 5091 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 5092 return (0); 5093 5094 if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0) 5095 return (0); 5096 else 5097 return (EINVAL); 5098 } 5099 5100 /* 5101 * The blocking part of the IBA join/leave operations are done out 5102 * of here on the async thread. 
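 *
 * A rough sketch of how a join request reaches this point
 * (illustrative only; the async work loop itself is not part of
 * this section):
 *
 *	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
 *	req->rq_gid = mgid;
 *	ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN);
 *		...
 *	(async thread)	ibd_async_multicast(state, req->rq_gid, op);
 *
 * ibd_m_multicst() below builds exactly this kind of request for
 * multicast enable/disable.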
5103 */ 5104 static void 5105 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op) 5106 { 5107 DPRINT(3, "ibd_async_multicast : async_setmc op %d :" 5108 "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid); 5109 5110 if (op == IBD_ASYNC_JOIN) { 5111 if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) { 5112 ibd_print_warn(state, "Join multicast group failed :" 5113 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); 5114 } 5115 } else { 5116 /* 5117 * Here, we must search for the proper mcg_info and 5118 * use that to leave the group. 5119 */ 5120 ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL); 5121 } 5122 } 5123 5124 /* 5125 * GLDv3 entry point for multicast enable/disable requests. 5126 * This function queues the operation to the async thread and 5127 * returns success for a valid multicast address. 5128 */ 5129 static int 5130 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac) 5131 { 5132 ibd_state_t *state = (ibd_state_t *)arg; 5133 ipoib_mac_t maddr, *mcast; 5134 ib_gid_t mgid; 5135 ibd_req_t *req; 5136 5137 /* 5138 * If we haven't completed ibd_m_start(), the async thread won't 5139 * have been started and id_bcaddr won't be set, so there's 5140 * no point in continuing. 5141 */ 5142 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 5143 return (0); 5144 5145 /* 5146 * The incoming multicast address might not be aligned properly 5147 * on a 4 byte boundary to be considered an ipoib_mac_t. We force 5148 * it to look like one though, to get the offsets of the mc gid, 5149 * since we know we are not going to dereference any values with 5150 * the ipoib_mac_t pointer. 5151 */ 5152 bcopy(mcmac, &maddr, sizeof (ipoib_mac_t)); 5153 mcast = &maddr; 5154 5155 /* 5156 * Check validity of MCG address. We could additionally check 5157 * that an enable/disable is not being issued on the "broadcast" 5158 * mcg, but since this operation is only invokable by privileged 5159 * programs anyway, we allow the flexibility to those dlpi apps. 5160 * Note that we do not validate the "scope" of the IBA mcg. 5161 */ 5162 if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN) 5163 return (EINVAL); 5164 5165 /* 5166 * fill in multicast pkey and scope 5167 */ 5168 IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey); 5169 5170 /* 5171 * If someone is trying to JOIN/LEAVE the broadcast group, we do 5172 * nothing (i.e. we stay JOINed to the broadcast group done in 5173 * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically 5174 * requires that we stay joined to broadcast groups at all times. 5175 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also 5176 * depends on this. 5177 */ 5178 if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0) 5179 return (0); 5180 5181 ibd_n2h_gid(mcast, &mgid); 5182 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 5183 if (req == NULL) 5184 return (ENOMEM); 5185 5186 req->rq_gid = mgid; 5187 5188 if (add) { 5189 DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n", 5190 mgid.gid_prefix, mgid.gid_guid); 5191 ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN); 5192 } else { 5193 DPRINT(1, "ibd_m_multicst : unset_multicast : " 5194 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); 5195 ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE); 5196 } 5197 return (0); 5198 } 5199 5200 /* 5201 * The blocking part of the IBA promiscuous operations is done 5202 * out of here on the async thread. The dlpireq parameter indicates 5203 * whether this invocation is due to a dlpi request or due to 5204 * a port up/down event.
5205 */ 5206 static void 5207 ibd_async_unsetprom(ibd_state_t *state) 5208 { 5209 ibd_mce_t *mce = list_head(&state->id_mc_non); 5210 ib_gid_t mgid; 5211 5212 DPRINT(2, "ibd_async_unsetprom : async_unset_promisc"); 5213 5214 while (mce != NULL) { 5215 mgid = mce->mc_info.mc_adds_vect.av_dgid; 5216 mce = list_next(&state->id_mc_non, mce); 5217 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 5218 } 5219 state->id_prom_op = IBD_OP_NOTSTARTED; 5220 } 5221 5222 /* 5223 * The blocking part of the IBA promiscuous operations are done 5224 * out of here on the async thread. The dlpireq parameter indicates 5225 * whether this invocation is due to a dlpi request or due to 5226 * a port up/down event. 5227 */ 5228 static void 5229 ibd_async_setprom(ibd_state_t *state) 5230 { 5231 ibt_mcg_attr_t mcg_attr; 5232 ibt_mcg_info_t *mcg_info; 5233 ib_gid_t mgid; 5234 uint_t numg; 5235 int i; 5236 char ret = IBD_OP_COMPLETED; 5237 5238 DPRINT(2, "ibd_async_setprom : async_set_promisc"); 5239 5240 /* 5241 * Obtain all active MC groups on the IB fabric with 5242 * specified criteria (scope + Pkey + Qkey + mtu). 5243 */ 5244 bzero(&mcg_attr, sizeof (mcg_attr)); 5245 mcg_attr.mc_pkey = state->id_pkey; 5246 mcg_attr.mc_scope = state->id_scope; 5247 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 5248 mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu; 5249 mcg_attr.mc_mtu_req.r_selector = IBT_EQU; 5250 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) != 5251 IBT_SUCCESS) { 5252 ibd_print_warn(state, "Could not get list of IBA multicast " 5253 "groups"); 5254 ret = IBD_OP_ERRORED; 5255 goto done; 5256 } 5257 5258 /* 5259 * Iterate over the returned mcg's and join as NonMember 5260 * to the IP mcg's. 5261 */ 5262 for (i = 0; i < numg; i++) { 5263 /* 5264 * Do a NonMember JOIN on the MC group. 5265 */ 5266 mgid = mcg_info[i].mc_adds_vect.av_dgid; 5267 if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL) 5268 ibd_print_warn(state, "IBA promiscuous mode missed " 5269 "multicast gid %016llx:%016llx", 5270 (u_longlong_t)mgid.gid_prefix, 5271 (u_longlong_t)mgid.gid_guid); 5272 } 5273 5274 ibt_free_mcg_info(mcg_info, numg); 5275 DPRINT(4, "ibd_async_setprom : async_set_promisc completes"); 5276 done: 5277 state->id_prom_op = ret; 5278 } 5279 5280 /* 5281 * GLDv3 entry point for multicast promiscuous enable/disable requests. 5282 * GLDv3 assumes phys state receives more packets than multi state, 5283 * which is not true for IPoIB. Thus, treat the multi and phys 5284 * promiscuous states the same way to work with GLDv3's assumption. 5285 */ 5286 static int 5287 ibd_m_promisc(void *arg, boolean_t on) 5288 { 5289 ibd_state_t *state = (ibd_state_t *)arg; 5290 ibd_req_t *req; 5291 5292 /* 5293 * Async thread wouldn't have been started if we haven't 5294 * passed ibd_m_start() 5295 */ 5296 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 5297 return (0); 5298 5299 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 5300 if (req == NULL) 5301 return (ENOMEM); 5302 if (on) { 5303 DPRINT(1, "ibd_m_promisc : set_promisc : %d", on); 5304 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON); 5305 } else { 5306 DPRINT(1, "ibd_m_promisc : unset_promisc"); 5307 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF); 5308 } 5309 5310 return (0); 5311 } 5312 5313 /* 5314 * GLDv3 entry point for gathering statistics. 
5315 */ 5316 static int 5317 ibd_m_stat(void *arg, uint_t stat, uint64_t *val) 5318 { 5319 ibd_state_t *state = (ibd_state_t *)arg; 5320 5321 switch (stat) { 5322 case MAC_STAT_IFSPEED: 5323 *val = state->id_link_speed; 5324 break; 5325 case MAC_STAT_MULTIRCV: 5326 *val = state->id_multi_rcv; 5327 break; 5328 case MAC_STAT_BRDCSTRCV: 5329 *val = state->id_brd_rcv; 5330 break; 5331 case MAC_STAT_MULTIXMT: 5332 *val = state->id_multi_xmt; 5333 break; 5334 case MAC_STAT_BRDCSTXMT: 5335 *val = state->id_brd_xmt; 5336 break; 5337 case MAC_STAT_RBYTES: 5338 *val = state->id_rcv_bytes + state->rc_rcv_trans_byte 5339 + state->rc_rcv_copy_byte; 5340 break; 5341 case MAC_STAT_IPACKETS: 5342 *val = state->id_rcv_pkt + state->rc_rcv_trans_pkt 5343 + state->rc_rcv_copy_pkt; 5344 break; 5345 case MAC_STAT_OBYTES: 5346 *val = state->id_xmt_bytes + state->rc_xmt_bytes; 5347 break; 5348 case MAC_STAT_OPACKETS: 5349 *val = state->id_xmt_pkt + state->rc_xmt_small_pkt + 5350 state->rc_xmt_fragmented_pkt + 5351 state->rc_xmt_map_fail_pkt + state->rc_xmt_map_succ_pkt; 5352 break; 5353 case MAC_STAT_OERRORS: 5354 *val = state->id_ah_error; /* failed AH translation */ 5355 break; 5356 case MAC_STAT_IERRORS: 5357 *val = 0; 5358 break; 5359 case MAC_STAT_NOXMTBUF: 5360 *val = state->id_tx_short + state->rc_swqe_short + 5361 state->rc_xmt_buf_short; 5362 break; 5363 case MAC_STAT_NORCVBUF: 5364 default: 5365 return (ENOTSUP); 5366 } 5367 5368 return (0); 5369 } 5370 5371 static void 5372 ibd_async_txsched(ibd_state_t *state) 5373 { 5374 ibd_resume_transmission(state); 5375 } 5376 5377 static void 5378 ibd_resume_transmission(ibd_state_t *state) 5379 { 5380 int flag; 5381 int met_thresh = 0; 5382 int thresh = 0; 5383 int ret = -1; 5384 5385 mutex_enter(&state->id_sched_lock); 5386 if (state->id_sched_needed & IBD_RSRC_SWQE) { 5387 mutex_enter(&state->id_tx_list.dl_mutex); 5388 mutex_enter(&state->id_tx_rel_list.dl_mutex); 5389 met_thresh = state->id_tx_list.dl_cnt + 5390 state->id_tx_rel_list.dl_cnt; 5391 mutex_exit(&state->id_tx_rel_list.dl_mutex); 5392 mutex_exit(&state->id_tx_list.dl_mutex); 5393 thresh = IBD_FREE_SWQES_THRESH; 5394 flag = IBD_RSRC_SWQE; 5395 } else if (state->id_sched_needed & IBD_RSRC_LSOBUF) { 5396 ASSERT(state->id_lso != NULL); 5397 mutex_enter(&state->id_lso_lock); 5398 met_thresh = state->id_lso->bkt_nfree; 5399 thresh = IBD_FREE_LSOS_THRESH; 5400 mutex_exit(&state->id_lso_lock); 5401 flag = IBD_RSRC_LSOBUF; 5402 if (met_thresh > thresh) 5403 state->id_sched_lso_cnt++; 5404 } 5405 if (met_thresh > thresh) { 5406 state->id_sched_needed &= ~flag; 5407 state->id_sched_cnt++; 5408 ret = 0; 5409 } 5410 mutex_exit(&state->id_sched_lock); 5411 5412 if (ret == 0) 5413 mac_tx_update(state->id_mh); 5414 } 5415 5416 /* 5417 * Release a chain of send wqes back into the free list. 5418 */ 5419 static void 5420 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail, int n) 5421 { 5422 /* 5423 * Add back on Tx list for reuse. 5424 */ 5425 ASSERT(tail->swqe_next == NULL); 5426 mutex_enter(&state->id_tx_rel_list.dl_mutex); 5427 state->id_tx_rel_list.dl_pending_sends = B_FALSE; 5428 tail->swqe_next = state->id_tx_rel_list.dl_head; 5429 state->id_tx_rel_list.dl_head = SWQE_TO_WQE(head); 5430 state->id_tx_rel_list.dl_cnt += n; 5431 mutex_exit(&state->id_tx_rel_list.dl_mutex); 5432 } 5433 5434 /* 5435 * Acquire a send wqe from the free list. 5436 * Returns the acquired swqe, or NULL if none is available.
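 *
 * Typical usage by a caller on the send path (illustrative sketch
 * only; the actual send routine is not part of this section):
 *
 *	if ((node = ibd_acquire_swqe(state)) == NULL) {
 *		mutex_enter(&state->id_sched_lock);
 *		state->id_sched_needed |= IBD_RSRC_SWQE;
 *		mutex_exit(&state->id_sched_lock);
 *		return;		(and retry once resources are back)
 *	}
 *	... fill in node->w_swr and hand it to ibd_post_send() ...
 *
 * Completed swqes are returned in batches via ibd_release_swqe(),
 * which makes them visible to the next ibd_acquire_swqe() through
 * id_tx_rel_list.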
5437 */ 5438 static ibd_swqe_t * 5439 ibd_acquire_swqe(ibd_state_t *state) 5440 { 5441 ibd_swqe_t *wqe; 5442 5443 mutex_enter(&state->id_tx_rel_list.dl_mutex); 5444 if (state->id_tx_rel_list.dl_head != NULL) { 5445 /* transfer id_tx_rel_list to id_tx_list */ 5446 state->id_tx_list.dl_head = 5447 state->id_tx_rel_list.dl_head; 5448 state->id_tx_list.dl_cnt = 5449 state->id_tx_rel_list.dl_cnt; 5450 state->id_tx_list.dl_pending_sends = B_FALSE; 5451 5452 /* clear id_tx_rel_list */ 5453 state->id_tx_rel_list.dl_head = NULL; 5454 state->id_tx_rel_list.dl_cnt = 0; 5455 mutex_exit(&state->id_tx_rel_list.dl_mutex); 5456 5457 wqe = WQE_TO_SWQE(state->id_tx_list.dl_head); 5458 state->id_tx_list.dl_cnt -= 1; 5459 state->id_tx_list.dl_head = wqe->swqe_next; 5460 } else { /* no free swqe */ 5461 mutex_exit(&state->id_tx_rel_list.dl_mutex); 5462 state->id_tx_list.dl_pending_sends = B_TRUE; 5463 DPRINT(5, "ibd_acquire_swqe: out of Tx wqe"); 5464 state->id_tx_short++; 5465 wqe = NULL; 5466 } 5467 return (wqe); 5468 } 5469 5470 static int 5471 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss, 5472 ibt_ud_dest_hdl_t ud_dest) 5473 { 5474 mblk_t *nmp; 5475 int iph_len, tcph_len; 5476 ibt_wr_lso_t *lso; 5477 uintptr_t ip_start, tcp_start; 5478 uint8_t *dst; 5479 uint_t pending, mblen; 5480 5481 /* 5482 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default; 5483 * we need to adjust it here for lso. 5484 */ 5485 lso = &(node->w_swr.wr.ud_lso); 5486 lso->lso_ud_dest = ud_dest; 5487 lso->lso_mss = mss; 5488 5489 /* 5490 * Calculate the LSO header size and set it in the UD LSO structure. 5491 * Note that the only assumption we make is that each of the IPoIB, 5492 * IP and TCP headers will be contained in a single mblk fragment; 5493 * together, the headers may span multiple mblk fragments. 5494 */ 5495 nmp = mp; 5496 ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE; 5497 if (ip_start >= (uintptr_t)(nmp->b_wptr)) { 5498 ip_start = (uintptr_t)nmp->b_cont->b_rptr 5499 + (ip_start - (uintptr_t)(nmp->b_wptr)); 5500 nmp = nmp->b_cont; 5501 5502 } 5503 iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start); 5504 5505 tcp_start = ip_start + iph_len; 5506 if (tcp_start >= (uintptr_t)(nmp->b_wptr)) { 5507 tcp_start = (uintptr_t)nmp->b_cont->b_rptr 5508 + (tcp_start - (uintptr_t)(nmp->b_wptr)); 5509 nmp = nmp->b_cont; 5510 } 5511 tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start); 5512 lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len; 5513 5514 /* 5515 * If the lso header fits entirely within a single mblk fragment, 5516 * we'll avoid an additional copy of the lso header here and just 5517 * pass the b_rptr of the mblk directly. 5518 * 5519 * If this isn't true, we'd have to allocate for it explicitly. 
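 *
 * As a concrete example (assuming the usual 4-byte IPoIB header and
 * no IP or TCP options), lso_hdr_sz computed above would be
 * 4 + 20 + 20 = 44 bytes, which will normally fit in the first mblk
 * and let us take the no-copy branch below.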
5520 */ 5521 if (lso->lso_hdr_sz <= MBLKL(mp)) { 5522 lso->lso_hdr = mp->b_rptr; 5523 } else { 5524 /* On work completion, remember to free this allocated hdr */ 5525 lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP); 5526 if (lso->lso_hdr == NULL) { 5527 DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, " 5528 "sz = %d", lso->lso_hdr_sz); 5529 lso->lso_hdr_sz = 0; 5530 lso->lso_mss = 0; 5531 return (-1); 5532 } 5533 } 5534 5535 /* 5536 * Copy in the lso header only if we need to 5537 */ 5538 if (lso->lso_hdr != mp->b_rptr) { 5539 dst = lso->lso_hdr; 5540 pending = lso->lso_hdr_sz; 5541 5542 for (nmp = mp; nmp && pending; nmp = nmp->b_cont) { 5543 mblen = MBLKL(nmp); 5544 if (pending > mblen) { 5545 bcopy(nmp->b_rptr, dst, mblen); 5546 dst += mblen; 5547 pending -= mblen; 5548 } else { 5549 bcopy(nmp->b_rptr, dst, pending); 5550 break; 5551 } 5552 } 5553 } 5554 5555 return (0); 5556 } 5557 5558 static void 5559 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp) 5560 { 5561 ibt_wr_lso_t *lso; 5562 5563 if ((!node) || (!mp)) 5564 return; 5565 5566 /* 5567 * Free any header space that we might've allocated if we 5568 * did an LSO 5569 */ 5570 if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) { 5571 lso = &(node->w_swr.wr.ud_lso); 5572 if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) { 5573 kmem_free(lso->lso_hdr, lso->lso_hdr_sz); 5574 lso->lso_hdr = NULL; 5575 lso->lso_hdr_sz = 0; 5576 } 5577 } 5578 } 5579 5580 static void 5581 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node) 5582 { 5583 uint_t i; 5584 uint_t num_posted; 5585 uint_t n_wrs; 5586 ibt_status_t ibt_status; 5587 ibt_send_wr_t wrs[IBD_MAX_TX_POST_MULTIPLE]; 5588 ibd_swqe_t *tx_head, *elem; 5589 ibd_swqe_t *nodes[IBD_MAX_TX_POST_MULTIPLE]; 5590 5591 /* post the one request, then check for more */ 5592 ibt_status = ibt_post_send(state->id_chnl_hdl, 5593 &node->w_swr, 1, NULL); 5594 if (ibt_status != IBT_SUCCESS) { 5595 ibd_print_warn(state, "ibd_post_send: " 5596 "posting one wr failed: ret=%d", ibt_status); 5597 ibd_tx_cleanup(state, node); 5598 } 5599 5600 tx_head = NULL; 5601 for (;;) { 5602 if (tx_head == NULL) { 5603 mutex_enter(&state->id_txpost_lock); 5604 tx_head = state->id_tx_head; 5605 if (tx_head == NULL) { 5606 state->id_tx_busy = 0; 5607 mutex_exit(&state->id_txpost_lock); 5608 return; 5609 } 5610 state->id_tx_head = NULL; 5611 mutex_exit(&state->id_txpost_lock); 5612 } 5613 5614 /* 5615 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs 5616 * at a time if possible, and keep posting them. 5617 */ 5618 for (n_wrs = 0, elem = tx_head; 5619 (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE); 5620 elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) { 5621 nodes[n_wrs] = elem; 5622 wrs[n_wrs] = elem->w_swr; 5623 } 5624 tx_head = elem; 5625 5626 ASSERT(n_wrs != 0); 5627 5628 /* 5629 * If posting fails for some reason, we'll never receive 5630 * completion intimation, so we'll need to cleanup. But 5631 * we need to make sure we don't clean up nodes whose 5632 * wrs have been successfully posted. We assume that the 5633 * hca driver returns on the first failure to post and 5634 * therefore the first 'num_posted' entries don't need 5635 * cleanup here. 
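 *
 * For instance, if n_wrs is 8 and ibt_post_send() reports num_posted
 * as 5, only nodes[5] through nodes[7] are handed to ibd_tx_cleanup()
 * below; the first five are reclaimed normally when their completions
 * show up on the send CQ.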
5636 */ 5637 num_posted = 0; 5638 ibt_status = ibt_post_send(state->id_chnl_hdl, 5639 wrs, n_wrs, &num_posted); 5640 if (ibt_status != IBT_SUCCESS) { 5641 ibd_print_warn(state, "ibd_post_send: " 5642 "posting multiple wrs failed: " 5643 "requested=%d, done=%d, ret=%d", 5644 n_wrs, num_posted, ibt_status); 5645 5646 for (i = num_posted; i < n_wrs; i++) 5647 ibd_tx_cleanup(state, nodes[i]); 5648 } 5649 } 5650 } 5651 5652 static int 5653 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node, 5654 uint_t lsohdr_sz) 5655 { 5656 ibt_wr_ds_t *sgl; 5657 ibt_status_t ibt_status; 5658 mblk_t *nmp; 5659 mblk_t *data_mp; 5660 uchar_t *bufp; 5661 size_t blksize; 5662 size_t skip; 5663 size_t avail; 5664 uint_t pktsize; 5665 uint_t frag_len; 5666 uint_t pending_hdr; 5667 int nmblks; 5668 int i; 5669 5670 /* 5671 * Let's skip ahead to the data if this is LSO 5672 */ 5673 data_mp = mp; 5674 pending_hdr = 0; 5675 if (lsohdr_sz) { 5676 pending_hdr = lsohdr_sz; 5677 for (nmp = mp; nmp; nmp = nmp->b_cont) { 5678 frag_len = nmp->b_wptr - nmp->b_rptr; 5679 if (frag_len > pending_hdr) 5680 break; 5681 pending_hdr -= frag_len; 5682 } 5683 data_mp = nmp; /* start of data past lso header */ 5684 ASSERT(data_mp != NULL); 5685 } 5686 5687 /* 5688 * Calculate the size of message data and number of msg blocks 5689 */ 5690 pktsize = 0; 5691 for (nmblks = 0, nmp = data_mp; nmp != NULL; 5692 nmp = nmp->b_cont, nmblks++) { 5693 pktsize += MBLKL(nmp); 5694 } 5695 pktsize -= pending_hdr; 5696 5697 /* 5698 * We only do ibt_map_mem_iov() if the pktsize is above the 5699 * "copy-threshold", and if the number of mp fragments is less than 5700 * the maximum acceptable. 5701 */ 5702 if ((state->id_hca_res_lkey_capab) && 5703 (pktsize > IBD_TX_COPY_THRESH) && 5704 (nmblks < state->id_max_sqseg_hiwm)) { 5705 ibt_iov_t iov_arr[IBD_MAX_SQSEG]; 5706 ibt_iov_attr_t iov_attr; 5707 5708 iov_attr.iov_as = NULL; 5709 iov_attr.iov = iov_arr; 5710 iov_attr.iov_buf = NULL; 5711 iov_attr.iov_list_len = nmblks; 5712 iov_attr.iov_wr_nds = state->id_max_sqseg; 5713 iov_attr.iov_lso_hdr_sz = lsohdr_sz; 5714 iov_attr.iov_flags = IBT_IOV_SLEEP; 5715 5716 for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) { 5717 iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr; 5718 iov_arr[i].iov_len = MBLKL(nmp); 5719 if (i == 0) { 5720 iov_arr[i].iov_addr += pending_hdr; 5721 iov_arr[i].iov_len -= pending_hdr; 5722 } 5723 } 5724 5725 node->w_buftype = IBD_WQE_MAPPED; 5726 node->w_swr.wr_sgl = node->w_sgl; 5727 5728 ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr, 5729 (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl); 5730 if (ibt_status != IBT_SUCCESS) { 5731 ibd_print_warn(state, "ibd_send: ibt_map_mem_iov " 5732 "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status); 5733 goto ibd_copy_path; 5734 } 5735 5736 return (0); 5737 } 5738 5739 ibd_copy_path: 5740 if (pktsize <= state->id_tx_buf_sz) { 5741 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 5742 node->w_swr.wr_nds = 1; 5743 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 5744 node->w_buftype = IBD_WQE_TXBUF; 5745 5746 /* 5747 * Even though this is the copy path for transfers less than 5748 * id_tx_buf_sz, it could still be an LSO packet. If so, it 5749 * is possible the first data mblk fragment (data_mp) still 5750 * contains part of the LSO header that we need to skip. 
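 *
 * As a concrete illustration: with lsohdr_sz at 44 and a first mblk
 * holding 100 bytes, the skip loop earlier leaves data_mp pointing at
 * that mblk with pending_hdr still 44, so the bcopy below starts at
 * b_rptr + 44 and pending_hdr drops to zero for the later fragments.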
5751 */ 5752 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 5753 for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) { 5754 blksize = MBLKL(nmp) - pending_hdr; 5755 bcopy(nmp->b_rptr + pending_hdr, bufp, blksize); 5756 bufp += blksize; 5757 pending_hdr = 0; 5758 } 5759 5760 return (0); 5761 } 5762 5763 /* 5764 * Copy path for transfers greater than id_tx_buf_sz 5765 */ 5766 node->w_swr.wr_sgl = node->w_sgl; 5767 if (ibd_acquire_lsobufs(state, pktsize, 5768 node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) { 5769 DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed"); 5770 return (-1); 5771 } 5772 node->w_buftype = IBD_WQE_LSOBUF; 5773 5774 /* 5775 * Copy the larger-than-id_tx_buf_sz packet into a set of 5776 * fixed-sized, pre-mapped LSO buffers. Note that we might 5777 * need to skip part of the LSO header in the first fragment 5778 * as before. 5779 */ 5780 nmp = data_mp; 5781 skip = pending_hdr; 5782 for (i = 0; i < node->w_swr.wr_nds; i++) { 5783 sgl = node->w_swr.wr_sgl + i; 5784 bufp = (uchar_t *)(uintptr_t)sgl->ds_va; 5785 avail = IBD_LSO_BUFSZ; 5786 while (nmp && avail) { 5787 blksize = MBLKL(nmp) - skip; 5788 if (blksize > avail) { 5789 bcopy(nmp->b_rptr + skip, bufp, avail); 5790 skip += avail; 5791 avail = 0; 5792 } else { 5793 bcopy(nmp->b_rptr + skip, bufp, blksize); 5794 skip = 0; 5795 avail -= blksize; 5796 bufp += blksize; 5797 nmp = nmp->b_cont; 5798 } 5799 } 5800 } 5801 5802 return (0); 5803 } 5804 5805 /* 5806 * Schedule a completion queue polling to reap the resource we're 5807 * short on. If we implement the change to reap tx completions 5808 * in a separate thread, we'll need to wake up that thread here. 5809 */ 5810 static int 5811 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag) 5812 { 5813 ibd_req_t *req; 5814 5815 mutex_enter(&state->id_sched_lock); 5816 state->id_sched_needed |= resource_type; 5817 mutex_exit(&state->id_sched_lock); 5818 5819 /* 5820 * If we are asked to queue a work entry, we need to do it 5821 */ 5822 if (q_flag) { 5823 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 5824 if (req == NULL) 5825 return (-1); 5826 5827 ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED); 5828 } 5829 5830 return (0); 5831 } 5832 5833 /* 5834 * The passed in packet has this format: 5835 * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data 5836 */ 5837 static boolean_t 5838 ibd_send(ibd_state_t *state, mblk_t *mp) 5839 { 5840 ibd_ace_t *ace; 5841 ibd_swqe_t *node; 5842 ipoib_mac_t *dest; 5843 ib_header_info_t *ipibp; 5844 ip6_t *ip6h; 5845 uint_t pktsize; 5846 uint32_t mss; 5847 uint32_t hckflags; 5848 uint32_t lsoflags = 0; 5849 uint_t lsohdr_sz = 0; 5850 int ret, len; 5851 boolean_t dofree = B_FALSE; 5852 boolean_t rc; 5853 /* if (rc_chan == NULL) send by UD; else send by RC; */ 5854 ibd_rc_chan_t *rc_chan; 5855 int nmblks; 5856 mblk_t *nmp; 5857 5858 /* 5859 * If we aren't done with the device initialization and start, 5860 * we shouldn't be here. 5861 */ 5862 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 5863 return (B_FALSE); 5864 5865 /* 5866 * Obtain an address handle for the destination. 
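 *
 * The destination is the IPOIB_ADDRL-byte IPoIB address at the head
 * of the mblk; for multicast destinations the scope and P_Key bits
 * are patched in first (IBD_FILL_SCOPE_PKEY) so that the address
 * cache lookup below matches the group-specific entry.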
5867 */ 5868 ipibp = (ib_header_info_t *)mp->b_rptr; 5869 dest = (ipoib_mac_t *)&ipibp->ib_dst; 5870 if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 5871 IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey); 5872 5873 rc_chan = NULL; 5874 ace = ibd_acache_lookup(state, dest, &ret, 1); 5875 if (state->id_enable_rc && (ace != NULL) && 5876 (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN))) { 5877 if (ace->ac_chan == NULL) { 5878 state->rc_null_conn++; 5879 } else { 5880 if (ace->ac_chan->chan_state == 5881 IBD_RC_STATE_ACT_ESTAB) { 5882 rc_chan = ace->ac_chan; 5883 mutex_enter(&rc_chan->tx_wqe_list.dl_mutex); 5884 node = WQE_TO_SWQE( 5885 rc_chan->tx_wqe_list.dl_head); 5886 if (node != NULL) { 5887 rc_chan->tx_wqe_list.dl_cnt -= 1; 5888 rc_chan->tx_wqe_list.dl_head = 5889 node->swqe_next; 5890 } else { 5891 node = ibd_rc_acquire_swqes(rc_chan); 5892 } 5893 mutex_exit(&rc_chan->tx_wqe_list.dl_mutex); 5894 5895 if (node == NULL) { 5896 state->rc_swqe_short++; 5897 mutex_enter(&state->id_sched_lock); 5898 state->id_sched_needed |= 5899 IBD_RSRC_RC_SWQE; 5900 mutex_exit(&state->id_sched_lock); 5901 ibd_dec_ref_ace(state, ace); 5902 return (B_FALSE); 5903 } 5904 } else { 5905 state->rc_no_estab_conn++; 5906 } 5907 } 5908 } 5909 5910 if (rc_chan == NULL) { 5911 mutex_enter(&state->id_tx_list.dl_mutex); 5912 node = WQE_TO_SWQE(state->id_tx_list.dl_head); 5913 if (node != NULL) { 5914 state->id_tx_list.dl_cnt -= 1; 5915 state->id_tx_list.dl_head = node->swqe_next; 5916 } else { 5917 node = ibd_acquire_swqe(state); 5918 } 5919 mutex_exit(&state->id_tx_list.dl_mutex); 5920 if (node == NULL) { 5921 /* 5922 * If we don't have an swqe available, schedule a 5923 * transmit completion queue cleanup and hold off on 5924 * sending more packets until we have some free swqes 5925 */ 5926 if (ibd_sched_poll(state, IBD_RSRC_SWQE, 0) == 0) { 5927 if (ace != NULL) { 5928 ibd_dec_ref_ace(state, ace); 5929 } 5930 return (B_FALSE); 5931 } 5932 5933 /* 5934 * If a poll cannot be scheduled, we have no choice but 5935 * to drop this packet 5936 */ 5937 ibd_print_warn(state, "ibd_send: no swqe, pkt drop"); 5938 if (ace != NULL) { 5939 ibd_dec_ref_ace(state, ace); 5940 } 5941 return (B_TRUE); 5942 } 5943 } 5944 5945 /* 5946 * Initialize the commonly used fields in swqe to NULL to protect 5947 * against ibd_tx_cleanup accidentally misinterpreting these on a 5948 * failure. 5949 */ 5950 node->swqe_im_mblk = NULL; 5951 node->w_swr.wr_nds = 0; 5952 node->w_swr.wr_sgl = NULL; 5953 node->w_swr.wr_opcode = IBT_WRC_SEND; 5954 5955 /* 5956 * Calculate the size of message data and number of msg blocks 5957 */ 5958 pktsize = 0; 5959 for (nmblks = 0, nmp = mp; nmp != NULL; 5960 nmp = nmp->b_cont, nmblks++) { 5961 pktsize += MBLKL(nmp); 5962 } 5963 5964 if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) 5965 atomic_inc_64(&state->id_brd_xmt); 5966 else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 5967 atomic_inc_64(&state->id_multi_xmt); 5968 5969 if (ace != NULL) { 5970 node->w_ahandle = ace; 5971 node->w_swr.wr.ud.udwr_dest = ace->ac_dest; 5972 } else { 5973 DPRINT(5, 5974 "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X", 5975 ((ret == EFAULT) ? 
"failed" : "queued"), 5976 htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]), 5977 htonl(dest->ipoib_gidpref[1]), 5978 htonl(dest->ipoib_gidsuff[0]), 5979 htonl(dest->ipoib_gidsuff[1])); 5980 state->rc_ace_not_found++; 5981 node->w_ahandle = NULL; 5982 5983 /* 5984 * Here if ibd_acache_lookup() returns EFAULT, it means ibd 5985 * can not find a path for the specific dest address. We 5986 * should get rid of this kind of packet. We also should get 5987 * rid of the packet if we cannot schedule a poll via the 5988 * async thread. For the normal case, ibd will return the 5989 * packet to upper layer and wait for AH creating. 5990 * 5991 * Note that we always queue a work slot entry for the async 5992 * thread when we fail AH lookup (even in intr mode); this is 5993 * due to the convoluted way the code currently looks for AH. 5994 */ 5995 if (ret == EFAULT) { 5996 dofree = B_TRUE; 5997 rc = B_TRUE; 5998 } else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) { 5999 dofree = B_TRUE; 6000 rc = B_TRUE; 6001 } else { 6002 dofree = B_FALSE; 6003 rc = B_FALSE; 6004 } 6005 goto ibd_send_fail; 6006 } 6007 6008 /* 6009 * For ND6 packets, padding is at the front of the source lladdr. 6010 * Insert the padding at front. 6011 */ 6012 if (ntohs(ipibp->ipib_rhdr.ipoib_type) == ETHERTYPE_IPV6) { 6013 if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) { 6014 if (!pullupmsg(mp, IPV6_HDR_LEN + 6015 sizeof (ib_header_info_t))) { 6016 DPRINT(10, "ibd_send: pullupmsg failure "); 6017 dofree = B_TRUE; 6018 rc = B_TRUE; 6019 goto ibd_send_fail; 6020 } 6021 ipibp = (ib_header_info_t *)mp->b_rptr; 6022 } 6023 ip6h = (ip6_t *)((uchar_t *)ipibp + 6024 sizeof (ib_header_info_t)); 6025 len = ntohs(ip6h->ip6_plen); 6026 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 6027 mblk_t *pad; 6028 6029 pad = allocb(4, 0); 6030 pad->b_wptr = (uchar_t *)pad->b_rptr + 4; 6031 linkb(mp, pad); 6032 if (MBLKL(mp) < sizeof (ib_header_info_t) + 6033 IPV6_HDR_LEN + len + 4) { 6034 if (!pullupmsg(mp, sizeof (ib_header_info_t) + 6035 IPV6_HDR_LEN + len + 4)) { 6036 DPRINT(10, "ibd_send: pullupmsg " 6037 "failure "); 6038 dofree = B_TRUE; 6039 rc = B_TRUE; 6040 goto ibd_send_fail; 6041 } 6042 ip6h = (ip6_t *)((uchar_t *)mp->b_rptr + 6043 sizeof (ib_header_info_t)); 6044 } 6045 6046 /* LINTED: E_CONSTANT_CONDITION */ 6047 IBD_PAD_NSNA(ip6h, len, IBD_SEND); 6048 } 6049 } 6050 6051 ASSERT(mp->b_wptr - mp->b_rptr >= sizeof (ib_addrs_t)); 6052 mp->b_rptr += sizeof (ib_addrs_t); 6053 pktsize -= sizeof (ib_addrs_t); 6054 6055 if (rc_chan) { /* send in RC mode */ 6056 ibt_iov_t iov_arr[IBD_MAX_SQSEG]; 6057 ibt_iov_attr_t iov_attr; 6058 uint_t i; 6059 size_t blksize; 6060 uchar_t *bufp; 6061 ibd_rc_tx_largebuf_t *lbufp; 6062 6063 atomic_add_64(&state->rc_xmt_bytes, pktsize); 6064 6065 /* 6066 * Upper layer does Tx checksum, we don't need do any 6067 * checksum here. 6068 */ 6069 ASSERT(node->w_swr.wr_trans == IBT_RC_SRV); 6070 6071 /* 6072 * We only do ibt_map_mem_iov() if the pktsize is above 6073 * the "copy-threshold", and if the number of mp 6074 * fragments is less than the maximum acceptable. 6075 */ 6076 if (pktsize <= ibd_rc_tx_copy_thresh) { 6077 atomic_inc_64(&state->rc_xmt_small_pkt); 6078 /* 6079 * Only process unicast packet in Reliable Connected 6080 * mode. 
6081 */ 6082 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 6083 node->w_swr.wr_nds = 1; 6084 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 6085 node->w_buftype = IBD_WQE_TXBUF; 6086 6087 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 6088 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { 6089 blksize = MBLKL(nmp); 6090 bcopy(nmp->b_rptr, bufp, blksize); 6091 bufp += blksize; 6092 } 6093 freemsg(mp); 6094 ASSERT(node->swqe_im_mblk == NULL); 6095 } else { 6096 if ((state->rc_enable_iov_map) && 6097 (nmblks < state->rc_max_sqseg_hiwm)) { 6098 6099 /* do ibt_map_mem_iov() */ 6100 iov_attr.iov_as = NULL; 6101 iov_attr.iov = iov_arr; 6102 iov_attr.iov_buf = NULL; 6103 iov_attr.iov_wr_nds = state->rc_tx_max_sqseg; 6104 iov_attr.iov_lso_hdr_sz = 0; 6105 iov_attr.iov_flags = IBT_IOV_SLEEP; 6106 6107 i = 0; 6108 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { 6109 iov_arr[i].iov_len = MBLKL(nmp); 6110 if (iov_arr[i].iov_len != 0) { 6111 iov_arr[i].iov_addr = (caddr_t) 6112 (void *)nmp->b_rptr; 6113 i++; 6114 } 6115 } 6116 iov_attr.iov_list_len = i; 6117 node->w_swr.wr_sgl = node->w_sgl; 6118 6119 ret = ibt_map_mem_iov(state->id_hca_hdl, 6120 &iov_attr, (ibt_all_wr_t *)&node->w_swr, 6121 &node->w_mi_hdl); 6122 if (ret != IBT_SUCCESS) { 6123 atomic_inc_64( 6124 &state->rc_xmt_map_fail_pkt); 6125 DPRINT(30, "ibd_send: ibt_map_mem_iov(" 6126 ") failed, nmblks=%d, real_nmblks" 6127 "=%d, ret=0x%x", nmblks, i, ret); 6128 goto ibd_rc_large_copy; 6129 } 6130 6131 atomic_inc_64(&state->rc_xmt_map_succ_pkt); 6132 node->w_buftype = IBD_WQE_MAPPED; 6133 node->swqe_im_mblk = mp; 6134 } else { 6135 atomic_inc_64(&state->rc_xmt_fragmented_pkt); 6136 ibd_rc_large_copy: 6137 mutex_enter(&state->rc_tx_large_bufs_lock); 6138 if (state->rc_tx_largebuf_nfree == 0) { 6139 state->rc_xmt_buf_short++; 6140 mutex_exit 6141 (&state->rc_tx_large_bufs_lock); 6142 mutex_enter(&state->id_sched_lock); 6143 state->id_sched_needed |= 6144 IBD_RSRC_RC_TX_LARGEBUF; 6145 mutex_exit(&state->id_sched_lock); 6146 dofree = B_FALSE; 6147 rc = B_FALSE; 6148 /* 6149 * If we don't have Tx large bufs, 6150 * return failure. 
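                     *
                     * Returning B_FALSE with dofree left at B_FALSE
                     * hands the mblk chain back to GLDv3 untouched, so
                     * the packet can presumably be retried once a
                     * large buffer frees up and the
                     * IBD_RSRC_RC_TX_LARGEBUF request recorded above
                     * is serviced.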
                     * node->w_buftype should not be left as
                     * IBD_WQE_RC_COPYBUF here, otherwise it will
                     * cause a problem in ibd_rc_tx_cleanup().
                     */
                    node->w_buftype = IBD_WQE_TXBUF;
                    goto ibd_send_fail;
                }

                lbufp = state->rc_tx_largebuf_free_head;
                ASSERT(lbufp->lb_buf != NULL);
                state->rc_tx_largebuf_free_head = lbufp->lb_next;
                lbufp->lb_next = NULL;
                /* Update nfree count */
                state->rc_tx_largebuf_nfree--;
                mutex_exit(&state->rc_tx_large_bufs_lock);
                bufp = lbufp->lb_buf;
                node->w_sgl[0].ds_va = (ib_vaddr_t)(uintptr_t)bufp;
                node->w_sgl[0].ds_key = state->rc_tx_mr_desc.md_lkey;
                node->w_sgl[0].ds_len = pktsize;
                node->w_swr.wr_sgl = node->w_sgl;
                node->w_swr.wr_nds = 1;
                node->w_buftype = IBD_WQE_RC_COPYBUF;
                node->w_rc_tx_largebuf = lbufp;

                for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
                    blksize = MBLKL(nmp);
                    if (blksize != 0) {
                        bcopy(nmp->b_rptr, bufp, blksize);
                        bufp += blksize;
                    }
                }
                freemsg(mp);
                ASSERT(node->swqe_im_mblk == NULL);
            }
        }

        node->swqe_next = NULL;
        mutex_enter(&rc_chan->tx_post_lock);
        if (rc_chan->tx_busy) {
            if (rc_chan->tx_head) {
                rc_chan->tx_tail->swqe_next = SWQE_TO_WQE(node);
            } else {
                rc_chan->tx_head = node;
            }
            rc_chan->tx_tail = node;
            mutex_exit(&rc_chan->tx_post_lock);
        } else {
            rc_chan->tx_busy = 1;
            mutex_exit(&rc_chan->tx_post_lock);
            ibd_rc_post_send(rc_chan, node);
        }

        return (B_TRUE);
    } /* send by RC */

    if ((state->id_enable_rc) && (pktsize > state->id_mtu)) {
        /*
         * The packet is too long: GLD should never hand us a packet
         * larger than state->id_mtu + sizeof (ib_addrs_t).
         */
        if (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN)) {
            ibd_req_t *req;

            mutex_enter(&ace->tx_too_big_mutex);
            if (ace->tx_too_big_ongoing) {
                mutex_exit(&ace->tx_too_big_mutex);
                state->rc_xmt_reenter_too_long_pkt++;
                dofree = B_TRUE;
            } else {
                ace->tx_too_big_ongoing = B_TRUE;
                mutex_exit(&ace->tx_too_big_mutex);
                state->rc_xmt_icmp_too_long_pkt++;

                req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
                if (req == NULL) {
                    ibd_print_warn(state, "ibd_send: alloc "
                        "ibd_req_t failed");
                    /* Drop it. */
                    dofree = B_TRUE;
                } else {
                    req->rq_ptr = mp;
                    req->rq_ptr2 = ace;
                    ibd_queue_work_slot(state, req,
                        IBD_ASYNC_RC_TOO_BIG);
                    dofree = B_FALSE;
                }
            }
        } else {
            ibd_print_warn(state, "Reliable Connected mode is on. "
                "Multicast packet length %d is too long to send "
                "(limit %d), drop it", pktsize, state->id_mtu);
            state->rc_xmt_drop_too_long_pkt++;
            /* Drop it. */
            dofree = B_TRUE;
        }
        rc = B_TRUE;
        goto ibd_send_fail;
    }

    atomic_add_64(&state->id_xmt_bytes, pktsize);
    atomic_inc_64(&state->id_xmt_pkt);

    /*
     * Do LSO and checksum related work here. For LSO send, adjust the
     * ud destination, the opcode and the LSO header information to the
     * work request.
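     *
     * mac_lso_get() reports whether GLDv3 marked the packet for
     * hardware LSO and, if so, the MSS to use; in that case the send
     * opcode becomes IBT_WRC_SEND_LSO and ibd_setup_lso() fills in
     * the LSO header and MSS for the work request.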
6264 */ 6265 mac_lso_get(mp, &mss, &lsoflags); 6266 if ((lsoflags & HW_LSO) != HW_LSO) { 6267 node->w_swr.wr_opcode = IBT_WRC_SEND; 6268 lsohdr_sz = 0; 6269 } else { 6270 if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) { 6271 /* 6272 * The routine can only fail if there's no memory; we 6273 * can only drop the packet if this happens 6274 */ 6275 ibd_print_warn(state, 6276 "ibd_send: no memory, lso posting failed"); 6277 dofree = B_TRUE; 6278 rc = B_TRUE; 6279 goto ibd_send_fail; 6280 } 6281 6282 node->w_swr.wr_opcode = IBT_WRC_SEND_LSO; 6283 lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz; 6284 } 6285 6286 mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &hckflags); 6287 if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM) 6288 node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM; 6289 else 6290 node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM; 6291 6292 /* 6293 * Prepare the sgl for posting; the routine can only fail if there's 6294 * no lso buf available for posting. If this is the case, we should 6295 * probably resched for lso bufs to become available and then try again. 6296 */ 6297 if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) { 6298 if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) { 6299 dofree = B_TRUE; 6300 rc = B_TRUE; 6301 } else { 6302 dofree = B_FALSE; 6303 rc = B_FALSE; 6304 } 6305 goto ibd_send_fail; 6306 } 6307 node->swqe_im_mblk = mp; 6308 6309 /* 6310 * Queue the wqe to hardware; since we can now simply queue a 6311 * post instead of doing it serially, we cannot assume anything 6312 * about the 'node' after ibd_post_send() returns. 6313 */ 6314 node->swqe_next = NULL; 6315 6316 mutex_enter(&state->id_txpost_lock); 6317 if (state->id_tx_busy) { 6318 if (state->id_tx_head) { 6319 state->id_tx_tail->swqe_next = 6320 SWQE_TO_WQE(node); 6321 } else { 6322 state->id_tx_head = node; 6323 } 6324 state->id_tx_tail = node; 6325 mutex_exit(&state->id_txpost_lock); 6326 } else { 6327 state->id_tx_busy = 1; 6328 mutex_exit(&state->id_txpost_lock); 6329 ibd_post_send(state, node); 6330 } 6331 6332 return (B_TRUE); 6333 6334 ibd_send_fail: 6335 if (node && mp) 6336 ibd_free_lsohdr(node, mp); 6337 6338 if (dofree) 6339 freemsg(mp); 6340 6341 if (node != NULL) { 6342 if (rc_chan) { 6343 ibd_rc_tx_cleanup(node); 6344 } else { 6345 ibd_tx_cleanup(state, node); 6346 } 6347 } 6348 6349 return (rc); 6350 } 6351 6352 /* 6353 * GLDv3 entry point for transmitting datagram. 6354 */ 6355 static mblk_t * 6356 ibd_m_tx(void *arg, mblk_t *mp) 6357 { 6358 ibd_state_t *state = (ibd_state_t *)arg; 6359 mblk_t *next; 6360 6361 if (state->id_link_state != LINK_STATE_UP) { 6362 freemsgchain(mp); 6363 mp = NULL; 6364 } 6365 6366 while (mp != NULL) { 6367 next = mp->b_next; 6368 mp->b_next = NULL; 6369 if (ibd_send(state, mp) == B_FALSE) { 6370 /* Send fail */ 6371 mp->b_next = next; 6372 break; 6373 } 6374 mp = next; 6375 } 6376 6377 return (mp); 6378 } 6379 6380 /* 6381 * this handles Tx and Rx completions. With separate CQs, this handles 6382 * only Rx completions. 
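 *
 * (ibd_intr() below only drains the receive CQ via ibd_poll_rcq();
 * the send CQ is drained by ibd_tx_recycle()/ibd_poll_scq() further
 * down in this file.)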
6383 */ 6384 static uint_t 6385 ibd_intr(caddr_t arg) 6386 { 6387 ibd_state_t *state = (ibd_state_t *)arg; 6388 6389 ibd_poll_rcq(state, state->id_rcq_hdl); 6390 6391 return (DDI_INTR_CLAIMED); 6392 } 6393 6394 /* 6395 * Poll and fully drain the send cq 6396 */ 6397 static void 6398 ibd_drain_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 6399 { 6400 ibt_wc_t *wcs = state->id_txwcs; 6401 uint_t numwcs = state->id_txwcs_size; 6402 ibd_wqe_t *wqe; 6403 ibd_swqe_t *head, *tail; 6404 ibt_wc_t *wc; 6405 uint_t num_polled; 6406 int i; 6407 6408 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) { 6409 head = tail = NULL; 6410 for (i = 0, wc = wcs; i < num_polled; i++, wc++) { 6411 wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id; 6412 if (wc->wc_status != IBT_WC_SUCCESS) { 6413 /* 6414 * Channel being torn down. 6415 */ 6416 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { 6417 DPRINT(5, "ibd_drain_scq: flush error"); 6418 DPRINT(10, "ibd_drain_scq: Bad " 6419 "status %d", wc->wc_status); 6420 } else { 6421 DPRINT(10, "ibd_drain_scq: " 6422 "unexpected wc_status %d", 6423 wc->wc_status); 6424 } 6425 /* 6426 * Fallthrough to invoke the Tx handler to 6427 * release held resources, e.g., AH refcount. 6428 */ 6429 } 6430 /* 6431 * Add this swqe to the list to be cleaned up. 6432 */ 6433 if (head) 6434 tail->swqe_next = wqe; 6435 else 6436 head = WQE_TO_SWQE(wqe); 6437 tail = WQE_TO_SWQE(wqe); 6438 } 6439 tail->swqe_next = NULL; 6440 ibd_tx_cleanup_list(state, head, tail); 6441 6442 /* 6443 * Resume any blocked transmissions if possible 6444 */ 6445 ibd_resume_transmission(state); 6446 } 6447 } 6448 6449 /* 6450 * Poll and fully drain the receive cq 6451 */ 6452 static void 6453 ibd_drain_rcq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 6454 { 6455 ibt_wc_t *wcs = state->id_rxwcs; 6456 uint_t numwcs = state->id_rxwcs_size; 6457 ibd_rwqe_t *rwqe; 6458 ibt_wc_t *wc; 6459 uint_t num_polled; 6460 int i; 6461 mblk_t *head, *tail, *mp; 6462 6463 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) { 6464 head = tail = NULL; 6465 for (i = 0, wc = wcs; i < num_polled; i++, wc++) { 6466 rwqe = (ibd_rwqe_t *)(uintptr_t)wc->wc_id; 6467 if (wc->wc_status != IBT_WC_SUCCESS) { 6468 /* 6469 * Channel being torn down. 6470 */ 6471 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { 6472 DPRINT(5, "ibd_drain_rcq: " 6473 "expected flushed rwqe"); 6474 } else { 6475 DPRINT(5, "ibd_drain_rcq: " 6476 "unexpected wc_status %d", 6477 wc->wc_status); 6478 } 6479 atomic_inc_32( 6480 &state->id_rx_list.dl_bufs_outstanding); 6481 freemsg(rwqe->rwqe_im_mblk); 6482 continue; 6483 } 6484 mp = ibd_process_rx(state, rwqe, wc); 6485 if (mp == NULL) 6486 continue; 6487 6488 /* 6489 * Add this mp to the list to send to the nw layer. 6490 */ 6491 if (head) 6492 tail->b_next = mp; 6493 else 6494 head = mp; 6495 tail = mp; 6496 } 6497 if (head) 6498 mac_rx(state->id_mh, state->id_rh, head); 6499 6500 /* 6501 * Account for #rwqes polled. 6502 * Post more here, if less than one fourth full. 6503 */ 6504 if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, -num_polled) < 6505 (state->id_num_rwqe / 4)) 6506 ibd_post_recv_intr(state); 6507 } 6508 } 6509 6510 /* 6511 * Common code for interrupt handling as well as for polling 6512 * for all completed wqe's while detaching. 
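 *
 * Concurrency is handled with two flag bits in id_scq_poll_busy:
 * IBD_CQ_POLLING marks that a thread is already draining this CQ,
 * and IBD_REDO_CQ_POLLING records that another poll request came in
 * meanwhile, in which case the active thread loops and drains once
 * more instead of a second thread polling the same CQ.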
6513 */ 6514 static void 6515 ibd_poll_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 6516 { 6517 int flag, redo_flag; 6518 int redo = 1; 6519 6520 flag = IBD_CQ_POLLING; 6521 redo_flag = IBD_REDO_CQ_POLLING; 6522 6523 mutex_enter(&state->id_scq_poll_lock); 6524 if (state->id_scq_poll_busy & flag) { 6525 ibd_print_warn(state, "ibd_poll_scq: multiple polling threads"); 6526 state->id_scq_poll_busy |= redo_flag; 6527 mutex_exit(&state->id_scq_poll_lock); 6528 return; 6529 } 6530 state->id_scq_poll_busy |= flag; 6531 mutex_exit(&state->id_scq_poll_lock); 6532 6533 /* 6534 * In some cases (eg detaching), this code can be invoked on 6535 * any cpu after disabling cq notification (thus no concurrency 6536 * exists). Apart from that, the following applies normally: 6537 * Transmit completion handling could be from any cpu if 6538 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ 6539 * is interrupt driven. 6540 */ 6541 6542 /* 6543 * Poll and drain the CQ 6544 */ 6545 ibd_drain_scq(state, cq_hdl); 6546 6547 /* 6548 * Enable CQ notifications and redrain the cq to catch any 6549 * completions we might have missed after the ibd_drain_scq() 6550 * above and before the ibt_enable_cq_notify() that follows. 6551 * Finally, service any new requests to poll the cq that 6552 * could've come in after the ibt_enable_cq_notify(). 6553 */ 6554 do { 6555 if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) != 6556 IBT_SUCCESS) { 6557 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); 6558 } 6559 6560 ibd_drain_scq(state, cq_hdl); 6561 6562 mutex_enter(&state->id_scq_poll_lock); 6563 if (state->id_scq_poll_busy & redo_flag) 6564 state->id_scq_poll_busy &= ~redo_flag; 6565 else { 6566 state->id_scq_poll_busy &= ~flag; 6567 redo = 0; 6568 } 6569 mutex_exit(&state->id_scq_poll_lock); 6570 6571 } while (redo); 6572 } 6573 6574 /* 6575 * Common code for interrupt handling as well as for polling 6576 * for all completed wqe's while detaching. 6577 */ 6578 static void 6579 ibd_poll_rcq(ibd_state_t *state, ibt_cq_hdl_t rcq) 6580 { 6581 int flag, redo_flag; 6582 int redo = 1; 6583 6584 flag = IBD_CQ_POLLING; 6585 redo_flag = IBD_REDO_CQ_POLLING; 6586 6587 mutex_enter(&state->id_rcq_poll_lock); 6588 if (state->id_rcq_poll_busy & flag) { 6589 ibd_print_warn(state, "ibd_poll_rcq: multiple polling threads"); 6590 state->id_rcq_poll_busy |= redo_flag; 6591 mutex_exit(&state->id_rcq_poll_lock); 6592 return; 6593 } 6594 state->id_rcq_poll_busy |= flag; 6595 mutex_exit(&state->id_rcq_poll_lock); 6596 6597 /* 6598 * Poll and drain the CQ 6599 */ 6600 ibd_drain_rcq(state, rcq); 6601 6602 /* 6603 * Enable CQ notifications and redrain the cq to catch any 6604 * completions we might have missed after the ibd_drain_cq() 6605 * above and before the ibt_enable_cq_notify() that follows. 6606 * Finally, service any new requests to poll the cq that 6607 * could've come in after the ibt_enable_cq_notify(). 6608 */ 6609 do { 6610 if (ibt_enable_cq_notify(rcq, IBT_NEXT_COMPLETION) != 6611 IBT_SUCCESS) { 6612 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); 6613 } 6614 6615 ibd_drain_rcq(state, rcq); 6616 6617 mutex_enter(&state->id_rcq_poll_lock); 6618 if (state->id_rcq_poll_busy & redo_flag) 6619 state->id_rcq_poll_busy &= ~redo_flag; 6620 else { 6621 state->id_rcq_poll_busy &= ~flag; 6622 redo = 0; 6623 } 6624 mutex_exit(&state->id_rcq_poll_lock); 6625 6626 } while (redo); 6627 } 6628 6629 /* 6630 * Unmap the memory area associated with a given swqe. 
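 *
 * (This applies only to swqes that went through ibt_map_mem_iov(),
 * i.e. w_buftype == IBD_WQE_MAPPED; for copy-buffer sends w_mi_hdl is
 * NULL and the call just clears wr_nds.)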
6631 */ 6632 void 6633 ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe) 6634 { 6635 ibt_status_t stat; 6636 6637 DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds); 6638 6639 if (swqe->w_mi_hdl) { 6640 if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl, 6641 swqe->w_mi_hdl)) != IBT_SUCCESS) { 6642 DPRINT(10, 6643 "failed in ibt_unmap_mem_iov, ret=%d\n", stat); 6644 } 6645 swqe->w_mi_hdl = NULL; 6646 } 6647 swqe->w_swr.wr_nds = 0; 6648 } 6649 6650 void 6651 ibd_dec_ref_ace(ibd_state_t *state, ibd_ace_t *ace) 6652 { 6653 /* 6654 * The recycling logic can be eliminated from here 6655 * and put into the async thread if we create another 6656 * list to hold ACE's for unjoined mcg's. 6657 */ 6658 if (DEC_REF_DO_CYCLE(ace)) { 6659 ibd_mce_t *mce; 6660 6661 /* 6662 * Check with the lock taken: we decremented 6663 * reference count without the lock, and some 6664 * transmitter might already have bumped the 6665 * reference count (possible in case of multicast 6666 * disable when we leave the AH on the active 6667 * list). If not still 0, get out, leaving the 6668 * recycle bit intact. 6669 * 6670 * Atomically transition the AH from active 6671 * to free list, and queue a work request to 6672 * leave the group and destroy the mce. No 6673 * transmitter can be looking at the AH or 6674 * the MCE in between, since we have the 6675 * ac_mutex lock. In the SendOnly reap case, 6676 * it is not necessary to hold the ac_mutex 6677 * and recheck the ref count (since the AH was 6678 * taken off the active list), we just do it 6679 * to have uniform processing with the Full 6680 * reap case. 6681 */ 6682 mutex_enter(&state->id_ac_mutex); 6683 mce = ace->ac_mce; 6684 if (GET_REF_CYCLE(ace) == 0) { 6685 CLEAR_REFCYCLE(ace); 6686 /* 6687 * Identify the case of fullmember reap as 6688 * opposed to mcg trap reap. Also, port up 6689 * might set ac_mce to NULL to indicate Tx 6690 * cleanup should do no more than put the 6691 * AH in the free list (see ibd_async_link). 6692 */ 6693 if (mce != NULL) { 6694 ace->ac_mce = NULL; 6695 IBD_ACACHE_PULLOUT_ACTIVE(state, ace); 6696 /* 6697 * mc_req was initialized at mce 6698 * creation time. 6699 */ 6700 ibd_queue_work_slot(state, 6701 &mce->mc_req, IBD_ASYNC_REAP); 6702 } 6703 IBD_ACACHE_INSERT_FREE(state, ace); 6704 } 6705 mutex_exit(&state->id_ac_mutex); 6706 } 6707 } 6708 6709 /* 6710 * Common code that deals with clean ups after a successful or 6711 * erroneous transmission attempt. 6712 */ 6713 static void 6714 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe) 6715 { 6716 ibd_ace_t *ace = swqe->w_ahandle; 6717 6718 DPRINT(20, "ibd_tx_cleanup %p\n", swqe); 6719 6720 /* 6721 * If this was a dynamic mapping in ibd_send(), we need to 6722 * unmap here. If this was an lso buffer we'd used for sending, 6723 * we need to release the lso buf to the pool, since the resource 6724 * is scarce. However, if this was simply a normal send using 6725 * the copybuf (present in each swqe), we don't need to release it. 6726 */ 6727 if (swqe->swqe_im_mblk != NULL) { 6728 if (swqe->w_buftype == IBD_WQE_MAPPED) { 6729 ibd_unmap_mem(state, swqe); 6730 } else if (swqe->w_buftype == IBD_WQE_LSOBUF) { 6731 ibd_release_lsobufs(state, 6732 swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds); 6733 } 6734 ibd_free_lsohdr(swqe, swqe->swqe_im_mblk); 6735 freemsg(swqe->swqe_im_mblk); 6736 swqe->swqe_im_mblk = NULL; 6737 } 6738 6739 /* 6740 * Drop the reference count on the AH; it can be reused 6741 * now for a different destination if there are no more 6742 * posted sends that will use it. 
This can be eliminated 6743 * if we can always associate each Tx buffer with an AH. 6744 * The ace can be null if we are cleaning up from the 6745 * ibd_send() error path. 6746 */ 6747 if (ace != NULL) { 6748 ibd_dec_ref_ace(state, ace); 6749 } 6750 6751 /* 6752 * Release the send wqe for reuse. 6753 */ 6754 swqe->swqe_next = NULL; 6755 ibd_release_swqe(state, swqe, swqe, 1); 6756 } 6757 6758 static void 6759 ibd_tx_cleanup_list(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail) 6760 { 6761 ibd_ace_t *ace; 6762 ibd_swqe_t *swqe; 6763 int n = 0; 6764 6765 DPRINT(20, "ibd_tx_cleanup_list %p %p\n", head, tail); 6766 6767 for (swqe = head; swqe != NULL; swqe = WQE_TO_SWQE(swqe->swqe_next)) { 6768 6769 /* 6770 * If this was a dynamic mapping in ibd_send(), we need to 6771 * unmap here. If this was an lso buffer we'd used for sending, 6772 * we need to release the lso buf to the pool, since the 6773 * resource is scarce. However, if this was simply a normal 6774 * send using the copybuf (present in each swqe), we don't need 6775 * to release it. 6776 */ 6777 if (swqe->swqe_im_mblk != NULL) { 6778 if (swqe->w_buftype == IBD_WQE_MAPPED) { 6779 ibd_unmap_mem(state, swqe); 6780 } else if (swqe->w_buftype == IBD_WQE_LSOBUF) { 6781 ibd_release_lsobufs(state, 6782 swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds); 6783 } 6784 ibd_free_lsohdr(swqe, swqe->swqe_im_mblk); 6785 freemsg(swqe->swqe_im_mblk); 6786 swqe->swqe_im_mblk = NULL; 6787 } 6788 6789 /* 6790 * Drop the reference count on the AH; it can be reused 6791 * now for a different destination if there are no more 6792 * posted sends that will use it. This can be eliminated 6793 * if we can always associate each Tx buffer with an AH. 6794 * The ace can be null if we are cleaning up from the 6795 * ibd_send() error path. 6796 */ 6797 ace = swqe->w_ahandle; 6798 if (ace != NULL) { 6799 ibd_dec_ref_ace(state, ace); 6800 } 6801 n++; 6802 } 6803 6804 /* 6805 * Release the send wqes for reuse. 6806 */ 6807 ibd_release_swqe(state, head, tail, n); 6808 } 6809 6810 /* 6811 * Processing to be done after receipt of a packet; hand off to GLD 6812 * in the format expected by GLD. The received packet has this 6813 * format: 2b sap :: 00 :: data. 6814 */ 6815 static mblk_t * 6816 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc) 6817 { 6818 ib_header_info_t *phdr; 6819 mblk_t *mp; 6820 ipoib_hdr_t *ipibp; 6821 ipha_t *iphap; 6822 ip6_t *ip6h; 6823 int len; 6824 ib_msglen_t pkt_len = wc->wc_bytes_xfer; 6825 uint32_t bufs; 6826 6827 /* 6828 * Track number handed to upper layer that need to be returned. 6829 */ 6830 bufs = atomic_inc_32_nv(&state->id_rx_list.dl_bufs_outstanding); 6831 6832 /* Never run out of rwqes, use allocb when running low */ 6833 if (bufs >= state->id_rx_bufs_outstanding_limit) { 6834 atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding); 6835 atomic_inc_32(&state->id_rx_allocb); 6836 mp = allocb(pkt_len, BPRI_HI); 6837 if (mp) { 6838 bcopy(rwqe->rwqe_im_mblk->b_rptr, mp->b_rptr, pkt_len); 6839 ibd_post_recv(state, rwqe); 6840 } else { /* no memory */ 6841 atomic_inc_32(&state->id_rx_allocb_failed); 6842 ibd_post_recv(state, rwqe); 6843 return (NULL); 6844 } 6845 } else { 6846 mp = rwqe->rwqe_im_mblk; 6847 } 6848 6849 6850 /* 6851 * Adjust write pointer depending on how much data came in. 6852 */ 6853 mp->b_wptr = mp->b_rptr + pkt_len; 6854 6855 /* 6856 * Make sure this is NULL or we're in trouble. 
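 *
 * (A non-NULL b_next here would mean this mblk is still linked into
 * some chain, which should never happen for a buffer the receive CQ
 * just handed back; we warn and unlink it rather than pass a chained
 * mblk up to GLDv3.)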
6857 */ 6858 if (mp->b_next != NULL) { 6859 ibd_print_warn(state, 6860 "ibd_process_rx: got duplicate mp from rcq?"); 6861 mp->b_next = NULL; 6862 } 6863 6864 /* 6865 * the IB link will deliver one of the IB link layer 6866 * headers called, the Global Routing Header (GRH). 6867 * ibd driver uses the information in GRH to build the 6868 * Header_info structure and pass it with the datagram up 6869 * to GLDv3. 6870 * If the GRH is not valid, indicate to GLDv3 by setting 6871 * the VerTcFlow field to 0. 6872 */ 6873 phdr = (ib_header_info_t *)mp->b_rptr; 6874 if (wc->wc_flags & IBT_WC_GRH_PRESENT) { 6875 phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn); 6876 6877 /* if it is loop back packet, just drop it. */ 6878 if (state->id_enable_rc) { 6879 if (bcmp(&phdr->ib_grh.ipoib_sqpn, 6880 &state->rc_macaddr_loopback, 6881 IPOIB_ADDRL) == 0) { 6882 freemsg(mp); 6883 return (NULL); 6884 } 6885 } else { 6886 if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr, 6887 IPOIB_ADDRL) == 0) { 6888 freemsg(mp); 6889 return (NULL); 6890 } 6891 } 6892 6893 ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src, 6894 sizeof (ipoib_mac_t)); 6895 if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) { 6896 phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN); 6897 IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst); 6898 } else { 6899 phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn; 6900 } 6901 } else { 6902 /* 6903 * It can not be a IBA multicast packet. Must have been 6904 * unicast for us. Just copy the interface address to dst. 6905 */ 6906 phdr->ib_grh.ipoib_vertcflow = 0; 6907 ovbcopy(&state->id_macaddr, &phdr->ib_dst, 6908 sizeof (ipoib_mac_t)); 6909 } 6910 6911 /* 6912 * For ND6 packets, padding is at the front of the source/target 6913 * lladdr. However the inet6 layer is not aware of it, hence remove 6914 * the padding from such packets. 6915 */ 6916 ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t)); 6917 if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) { 6918 ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t)); 6919 len = ntohs(ip6h->ip6_plen); 6920 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 6921 /* LINTED: E_CONSTANT_CONDITION */ 6922 IBD_PAD_NSNA(ip6h, len, IBD_RECV); 6923 } 6924 } 6925 6926 /* 6927 * Update statistics 6928 */ 6929 atomic_add_64(&state->id_rcv_bytes, pkt_len); 6930 atomic_inc_64(&state->id_rcv_pkt); 6931 if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) 6932 atomic_inc_64(&state->id_brd_rcv); 6933 else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 6934 atomic_inc_64(&state->id_multi_rcv); 6935 6936 iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t)); 6937 /* 6938 * Set receive checksum status in mp 6939 * Hardware checksumming can be considered valid only if: 6940 * 1. CQE.IP_OK bit is set 6941 * 2. CQE.CKSUM = 0xffff 6942 * 3. IPv6 routing header is not present in the packet 6943 * 4. If there are no IP_OPTIONS in the IP HEADER 6944 */ 6945 6946 if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) && 6947 (wc->wc_cksum == 0xFFFF) && 6948 (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) { 6949 mac_hcksum_set(mp, 0, 0, 0, 0, HCK_FULLCKSUM_OK); 6950 } 6951 6952 return (mp); 6953 } 6954 6955 /* 6956 * Callback code invoked from STREAMs when the receive data buffer is 6957 * free for recycling. 
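 *
 * (This is the desballoc() free routine registered in w_freemsg_cb:
 * once the network stack frees the loaned-up mblk, the rwqe gets a
 * fresh mblk wrapper and is re-posted to the receive queue, unless
 * the driver has already stopped, in which case the rwqe itself is
 * freed.)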
6958 */ 6959 static void 6960 ibd_freemsg_cb(char *arg) 6961 { 6962 ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg; 6963 ibd_state_t *state = rwqe->w_state; 6964 6965 atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding); 6966 6967 /* 6968 * If the driver is stopped, just free the rwqe. 6969 */ 6970 if (atomic_add_32_nv(&state->id_running, 0) == 0) { 6971 DPRINT(6, "ibd_freemsg: wqe being freed"); 6972 rwqe->rwqe_im_mblk = NULL; 6973 ibd_free_rwqe(state, rwqe); 6974 return; 6975 } 6976 6977 rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, 6978 state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb); 6979 if (rwqe->rwqe_im_mblk == NULL) { 6980 ibd_free_rwqe(state, rwqe); 6981 DPRINT(6, "ibd_freemsg: desballoc failed"); 6982 return; 6983 } 6984 6985 ibd_post_recv(state, rwqe); 6986 } 6987 6988 static uint_t 6989 ibd_tx_recycle(caddr_t arg) 6990 { 6991 ibd_state_t *state = (ibd_state_t *)arg; 6992 6993 /* 6994 * Poll for completed entries 6995 */ 6996 ibd_poll_scq(state, state->id_scq_hdl); 6997 6998 return (DDI_INTR_CLAIMED); 6999 } 7000 7001 #ifdef IBD_LOGGING 7002 static void 7003 ibd_log_init(void) 7004 { 7005 ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP); 7006 ibd_lbuf_ndx = 0; 7007 7008 mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL); 7009 } 7010 7011 static void 7012 ibd_log_fini(void) 7013 { 7014 if (ibd_lbuf) 7015 kmem_free(ibd_lbuf, IBD_LOG_SZ); 7016 ibd_lbuf_ndx = 0; 7017 ibd_lbuf = NULL; 7018 7019 mutex_destroy(&ibd_lbuf_lock); 7020 } 7021 7022 static void 7023 ibd_log(const char *fmt, ...) 7024 { 7025 va_list ap; 7026 uint32_t off; 7027 uint32_t msglen; 7028 char tmpbuf[IBD_DMAX_LINE]; 7029 7030 if (ibd_lbuf == NULL) 7031 return; 7032 7033 va_start(ap, fmt); 7034 msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap); 7035 va_end(ap); 7036 7037 if (msglen >= IBD_DMAX_LINE) 7038 msglen = IBD_DMAX_LINE - 1; 7039 7040 mutex_enter(&ibd_lbuf_lock); 7041 7042 off = ibd_lbuf_ndx; /* current msg should go here */ 7043 if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n')) 7044 ibd_lbuf[ibd_lbuf_ndx-1] = '\n'; 7045 7046 ibd_lbuf_ndx += msglen; /* place where next msg should start */ 7047 ibd_lbuf[ibd_lbuf_ndx] = 0; /* current msg should terminate */ 7048 7049 if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE)) 7050 ibd_lbuf_ndx = 0; 7051 7052 mutex_exit(&ibd_lbuf_lock); 7053 7054 bcopy(tmpbuf, ibd_lbuf+off, msglen); /* no lock needed for this */ 7055 } 7056 #endif 7057
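
/*
 * Usage sketch (illustrative only) for the IBD_LOGGING helpers above:
 * ibd_log() takes printf-style arguments and appends the formatted
 * line to the circular ibd_lbuf, wrapping around once fewer than two
 * IBD_DMAX_LINE-sized slots remain before the end of the buffer, e.g.
 *
 *	ibd_log("ibd_send: posted %d wrs, status %d", n_wrs, ibt_status);
 *
 * The accumulated log can then be examined post mortem (e.g. from a
 * kernel debugger) through the ibd_lbuf/ibd_lbuf_ndx symbols.
 */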