/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * An implementation of the IPoIB standard based on PSARC 2001/289.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/dlpi.h>
#include <sys/mac_provider.h>

#include <sys/pattr.h>		/* for HCK_FULLCKSUM */
#include <sys/sysmacros.h>	/* for offsetof */
#include <sys/disp.h>		/* for async thread pri */
#include <sys/atomic.h>		/* for atomic_add*() */
#include <sys/ethernet.h>	/* for ETHERTYPE_IPV6 */
#include <netinet/in.h>		/* for netinet/ip.h below */
#include <netinet/ip.h>		/* for struct ip */
#include <netinet/udp.h>	/* for struct udphdr */
#include <inet/common.h>	/* for inet/ip.h below */
#include <inet/ip.h>		/* for ipha_t */
#include <inet/ip6.h>		/* for ip6_t */
#include <inet/tcp.h>		/* for tcph_t */
#include <netinet/icmp6.h>	/* for icmp6_t */
#include <sys/callb.h>
#include <sys/modhash.h>

#include <sys/ib/clients/ibd/ibd.h>
#include <sys/ib/mgt/sm_attr.h>	/* for SM_INIT_TYPE_* */
#include <sys/note.h>
#include <sys/multidata.h>

#include <sys/ib/mgt/ibmf/ibmf.h>	/* for ibd_get_portspeed */

/*
 * Per-interface tunables (for developers)
 *
 * ibd_tx_copy_thresh
 *     This sets the threshold at which ibd will attempt to do a bcopy of the
 *     outgoing data into a pre-mapped buffer. The IPoIB driver's send behavior
 *     is restricted by various parameters, so this value should be changed
 *     only after careful consideration. For instance, IB HCAs currently
 *     impose a relatively small limit (when compared to ethernet NICs) on the
 *     length of the SGL for transmit. On the other hand, the ip stack could
 *     send down mp chains that are quite long when LSO is enabled.
 *
 * ibd_num_swqe
 *     Number of "send WQE" elements that will be allocated and used by ibd.
 *     When tuning this parameter, the size of the pre-allocated, pre-mapped
 *     copy buffer in each of these send wqes must be taken into account. This
 *     copy buffer size is determined by the value of IBD_TX_BUF_SZ (this is
 *     currently set to the same value as ibd_tx_copy_thresh, but may be
 *     changed independently if needed).
 *
 * ibd_num_rwqe
 *     Number of "receive WQE" elements that will be allocated and used by
 *     ibd. This parameter is limited by the maximum channel size of the HCA.
 *     Each buffer in the receive wqe will be of MTU size.
 *
 * ibd_num_lso_bufs
 *     Number of "larger-than-MTU" copy buffers to use for cases when the
 *     outgoing mblk chain is too fragmented to be used with ibt_map_mem_iov()
 *     and too large to be used with regular MTU-sized copy buffers. It is
 *     not recommended to tune this variable without understanding the
 *     application environment and/or memory resources. The size of each of
 *     these lso buffers is determined by the value of IBD_LSO_BUFSZ.
 *
 * ibd_num_ah
 *     Number of AH cache entries to allocate
 *
 * ibd_hash_size
 *     Hash table size for the active AH list
 *
 * ibd_tx_softintr
 * ibd_rx_softintr
 *     The softintr mechanism allows ibd to avoid event queue overflows if
 *     the receive/completion handlers turn out to be expensive. These are
 *     enabled by default.
 *
 * ibd_log_sz
 *     This specifies the size of the ibd log buffer in bytes. The buffer is
 *     allocated and logging is enabled only when IBD_LOGGING is defined.
 *
 */
uint_t ibd_tx_copy_thresh = 0x1000;
uint_t ibd_num_swqe = 4000;
uint_t ibd_num_rwqe = 4000;
uint_t ibd_num_lso_bufs = 0x400;
uint_t ibd_num_ah = 256;
uint_t ibd_hash_size = 32;
uint_t ibd_rx_softintr = 1;
uint_t ibd_tx_softintr = 1;
uint_t ibd_create_broadcast_group = 1;
#ifdef IBD_LOGGING
uint_t ibd_log_sz = 0x20000;
#endif

#define	IBD_TX_COPY_THRESH	ibd_tx_copy_thresh
#define	IBD_TX_BUF_SZ		ibd_tx_copy_thresh
#define	IBD_NUM_SWQE		ibd_num_swqe
#define	IBD_NUM_RWQE		ibd_num_rwqe
#define	IBD_NUM_LSO_BUFS	ibd_num_lso_bufs
#define	IBD_NUM_AH		ibd_num_ah
#define	IBD_HASH_SIZE		ibd_hash_size
#ifdef IBD_LOGGING
#define	IBD_LOG_SZ		ibd_log_sz
#endif

/*
 * ibd_rc_tx_copy_thresh
 *     This sets the threshold up to which ibd will attempt to do a bcopy of
 *     the outgoing data into a pre-mapped buffer.
 */
uint_t ibd_rc_tx_copy_thresh = 0x1000;

/*
 * Receive CQ moderation parameters: tunable (for developers)
 */
uint_t ibd_rxcomp_count = 4;
uint_t ibd_rxcomp_usec = 10;

/*
 * Send CQ moderation parameters: tunable (for developers)
 */
uint_t ibd_txcomp_count = 16;
uint_t ibd_txcomp_usec = 300;

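/*
 * Illustrative note (a sketch, not part of the original source): since the
 * tunables above are plain global variables, a developer could override them
 * at boot time from /etc/system using the standard module:variable syntax.
 * The values below are examples only:
 *
 *	set ibd:ibd_num_swqe=8000
 *	set ibd:ibd_rxcomp_usec=20
 *
 * The module name "ibd" and the variable names are taken from this file.
 */
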
/* Post IBD_RX_POST_CNT receive work requests at a time. */
#define	IBD_RX_POST_CNT		8

/* Hash into 1 << IBD_LOG_RX_POST number of rx post queues */
#define	IBD_LOG_RX_POST		4

/* Minimum number of receive work requests the driver needs to always have */
#define	IBD_RWQE_MIN	((IBD_RX_POST_CNT << IBD_LOG_RX_POST) * 4)

/*
 * LSO parameters
 */
#define	IBD_LSO_MAXLEN		65536
#define	IBD_LSO_BUFSZ		8192
#define	IBD_PROP_LSO_POLICY	"lso-policy"

/*
 * Async operation states
 */
#define	IBD_OP_NOTSTARTED	0
#define	IBD_OP_ONGOING		1
#define	IBD_OP_COMPLETED	2
#define	IBD_OP_ERRORED		3
#define	IBD_OP_ROUTERED		4

/*
 * State of IBD driver initialization during attach/m_start
 */
#define	IBD_DRV_STATE_INITIALIZED	0x00001
#define	IBD_DRV_RXINTR_ADDED		0x00002
#define	IBD_DRV_TXINTR_ADDED		0x00004
#define	IBD_DRV_IBTL_ATTACH_DONE	0x00008
#define	IBD_DRV_HCA_OPENED		0x00010
#define	IBD_DRV_PD_ALLOCD		0x00020
#define	IBD_DRV_MAC_REGISTERED		0x00040
#define	IBD_DRV_PORT_DETAILS_OBTAINED	0x00080
#define	IBD_DRV_BCAST_GROUP_FOUND	0x00100
#define	IBD_DRV_ACACHE_INITIALIZED	0x00200
#define	IBD_DRV_CQS_ALLOCD		0x00400
#define	IBD_DRV_UD_CHANNEL_SETUP	0x00800
#define	IBD_DRV_TXLIST_ALLOCD		0x01000
#define	IBD_DRV_SCQ_NOTIFY_ENABLED	0x02000
#define	IBD_DRV_RXLIST_ALLOCD		0x04000
#define	IBD_DRV_BCAST_GROUP_JOINED	0x08000
#define	IBD_DRV_ASYNC_THR_CREATED	0x10000
#define	IBD_DRV_RCQ_NOTIFY_ENABLED	0x20000
#define	IBD_DRV_SM_NOTICES_REGISTERED	0x40000
#define	IBD_DRV_STARTED			0x80000
#define	IBD_DRV_RC_SRQ_ALLOCD		0x100000
#define	IBD_DRV_RC_LARGEBUF_ALLOCD	0x200000
#define	IBD_DRV_RC_LISTEN		0x400000
#ifdef DEBUG
#define	IBD_DRV_RC_PRIVATE_STATE	0x800000
#endif

/*
 * Start/stop in-progress flags; note that restart must always remain
 * the OR of start and stop flag values.
 */
#define	IBD_DRV_START_IN_PROGRESS	0x10000000
#define	IBD_DRV_STOP_IN_PROGRESS	0x20000000
#define	IBD_DRV_RESTART_IN_PROGRESS	0x30000000

/*
 * Miscellaneous constants
 */
#define	IB_MGID_IPV4_LOWGRP_MASK	0xFFFFFFFF
#define	IBD_DEF_MAX_SDU			2044
#define	IBD_DEFAULT_QKEY		0xB1B
#ifdef IBD_LOGGING
#define	IBD_DMAX_LINE			100
#endif

/*
 * Enumerations for link states
 */
typedef enum {
	IBD_LINK_DOWN,
	IBD_LINK_UP,
	IBD_LINK_UP_ABSENT
} ibd_link_op_t;

/*
 * Driver State Pointer
 */
void *ibd_list;

/*
 * Driver Global Data
 */
ibd_global_state_t ibd_gstate;

/*
 * Logging
 */
#ifdef IBD_LOGGING
kmutex_t ibd_lbuf_lock;
uint8_t *ibd_lbuf;
uint32_t ibd_lbuf_ndx;
#endif

/*
 * Required system entry points
 */
static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);

/*
 * Required driver entry points for GLDv3
 */
static int ibd_m_stat(void *, uint_t, uint64_t *);
static int ibd_m_start(void *);
static void ibd_m_stop(void *);
static int ibd_m_promisc(void *, boolean_t);
static int ibd_m_multicst(void *, boolean_t, const uint8_t *);
static int ibd_m_unicst(void *, const uint8_t *);
static mblk_t *ibd_m_tx(void *, mblk_t *);
static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *);

/*
 * Private driver entry points for GLDv3
 */

/*
 * Initialization
 */
static int ibd_state_init(ibd_state_t *, dev_info_t *);
static int ibd_init_txlist(ibd_state_t *);
static int ibd_init_rxlist(ibd_state_t *);
static int ibd_acache_init(ibd_state_t *);
#ifdef IBD_LOGGING
static void ibd_log_init(void);
#endif

/*
 * Termination/cleanup
 */
static void ibd_state_fini(ibd_state_t *);
static void ibd_fini_txlist(ibd_state_t *);
static void ibd_fini_rxlist(ibd_state_t *);
static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *);
static void ibd_tx_cleanup_list(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *);
static void ibd_acache_fini(ibd_state_t *);
#ifdef IBD_LOGGING
static void ibd_log_fini(void);
#endif

/*
 * Allocation/acquire/map routines
 */
static int ibd_alloc_tx_copybufs(ibd_state_t *);
static int ibd_alloc_rx_copybufs(ibd_state_t *);
static int ibd_alloc_tx_lsobufs(ibd_state_t *);
static ibd_swqe_t *ibd_acquire_swqe(ibd_state_t *);
static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *,
    uint32_t *);

/*
 * Free/release/unmap routines
 */
static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
static void ibd_free_tx_copybufs(ibd_state_t *);
static void ibd_free_rx_copybufs(ibd_state_t *);
static void ibd_free_rx_rsrcs(ibd_state_t *);
static void ibd_free_tx_lsobufs(ibd_state_t *);
static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *, int);
static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t);
static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *);

/*
 * Handlers/callback routines
 */
static uint_t ibd_intr(caddr_t);
static uint_t ibd_tx_recycle(caddr_t);
static void ibd_rcq_handler(ibt_cq_hdl_t, void *);
static void ibd_scq_handler(ibt_cq_hdl_t, void *);
static void ibd_poll_rcq(ibd_state_t *, ibt_cq_hdl_t);
static void ibd_poll_scq(ibd_state_t *, ibt_cq_hdl_t);
static void ibd_drain_rcq(ibd_state_t *, ibt_cq_hdl_t);
static void ibd_drain_scq(ibd_state_t *, ibt_cq_hdl_t);
static void ibd_freemsg_cb(char *);
static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
    ibt_async_event_t *);
static void ibd_snet_notices_handler(void *, ib_gid_t,
    ibt_subnet_event_code_t, ibt_subnet_event_t *);

/*
 * Send/receive routines
 */
static boolean_t ibd_send(ibd_state_t *, mblk_t *);
static void ibd_post_send(ibd_state_t *, ibd_swqe_t *);
static void ibd_post_recv(ibd_state_t *, ibd_rwqe_t *);
static mblk_t *ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *);

/*
 * Threads
 */
static void ibd_async_work(ibd_state_t *);

/*
 * Async tasks
 */
static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *);
static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int);
static void ibd_async_setprom(ibd_state_t *);
static void ibd_async_unsetprom(ibd_state_t *);
static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t);
static void ibd_async_trap(ibd_state_t *, ibd_req_t *);
static void ibd_async_txsched(ibd_state_t *);
static void ibd_async_link(ibd_state_t *, ibd_req_t *);

/*
 * Async task helpers
 */
static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *);
static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t);
static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *);
static boolean_t ibd_get_allroutergroup(ibd_state_t *,
    ipoib_mac_t *, ipoib_mac_t *);
static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t);
static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *);
static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *);
static ibt_status_t ibd_find_bgroup(ibd_state_t *);
static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *);
static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t);
static uint64_t ibd_get_portspeed(ibd_state_t *);
static boolean_t ibd_async_safe(ibd_state_t *);
static void ibd_async_done(ibd_state_t *);
static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int);
static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *);
static void ibd_link_mod(ibd_state_t *, ibt_async_code_t);
static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *);

/*
 * Helpers for attach/start routines
 */
static int ibd_register_mac(ibd_state_t *, dev_info_t *);
static int ibd_record_capab(ibd_state_t *, dev_info_t *);
static int ibd_unattach(ibd_state_t *, dev_info_t *);
static int ibd_get_port_details(ibd_state_t *);
static int ibd_alloc_cqs(ibd_state_t *);
static int ibd_setup_ud_channel(ibd_state_t *);
static int ibd_start(ibd_state_t *);
static int ibd_undo_start(ibd_state_t *, link_state_t);
static void ibd_set_mac_progress(ibd_state_t *, uint_t);
static void ibd_clr_mac_progress(ibd_state_t *, uint_t);


/*
 * Miscellaneous helpers
 */
static int ibd_sched_poll(ibd_state_t *, int, int);
static void ibd_resume_transmission(ibd_state_t *);
static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t);
static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t);
static void *list_get_head(list_t *);
static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t);
static uint_t ibd_hash_by_id(void *, mod_hash_key_t);
#ifdef IBD_LOGGING
static void ibd_log(const char *, ...);
#endif

DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach,
    nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed);

/* Module Driver Info */
static struct modldrv ibd_modldrv = {
	&mod_driverops,			/* This one is a driver */
	"InfiniBand GLDv3 Driver",	/* short description */
	&ibd_dev_ops			/* driver specific ops */
};

/* Module Linkage */
static struct modlinkage ibd_modlinkage = {
	MODREV_1, (void *)&ibd_modldrv, NULL
};

/*
 * Module (static) info passed to IBTL during ibt_attach
 */
static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = {
	IBTI_V_CURR,
	IBT_NETWORK,
	ibd_async_handler,
	NULL,
	"IPIB"
};

/*
 * GLDv3 entry points
 */
#define	IBD_M_CALLBACK_FLAGS	(MC_GETCAPAB)
static mac_callbacks_t ibd_m_callbacks = {
	IBD_M_CALLBACK_FLAGS,
	ibd_m_stat,
	ibd_m_start,
	ibd_m_stop,
	ibd_m_promisc,
	ibd_m_multicst,
	ibd_m_unicst,
	ibd_m_tx,
	NULL,
	ibd_m_getcapab
};

/*
 * Fill/clear <scope> and <p_key> in multicast/broadcast address
 */
#define	IBD_FILL_SCOPE_PKEY(maddr, scope, pkey)		\
{							\
	*(uint32_t *)((char *)(maddr) + 4) |=		\
	    htonl((uint32_t)(scope) << 16);		\
	*(uint32_t *)((char *)(maddr) + 8) |=		\
	    htonl((uint32_t)(pkey) << 16);		\
}

#define	IBD_CLEAR_SCOPE_PKEY(maddr)			\
{							\
	*(uint32_t *)((char *)(maddr) + 4) &=		\
	    htonl(~((uint32_t)0xF << 16));		\
	*(uint32_t *)((char *)(maddr) + 8) &=		\
	    htonl(~((uint32_t)0xFFFF << 16));		\
}

/*
 * Rudimentary debugging support
 */
#ifdef DEBUG
int ibd_debuglevel = 100;
void
debug_print(int l, char *fmt, ...)
{
	va_list ap;

	if (l < ibd_debuglevel)
		return;
	va_start(ap, fmt);
	vcmn_err(CE_CONT, fmt, ap);
	va_end(ap);
}
#endif

/*
 * Common routine to print warning messages; adds in hca guid, port number
 * and pkey to be able to identify the IBA interface.
 */
void
ibd_print_warn(ibd_state_t *state, char *fmt, ...)
{
	ib_guid_t hca_guid;
	char ibd_print_buf[256];
	int len;
	va_list ap;

	hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip,
	    0, "hca-guid", 0);
	len = snprintf(ibd_print_buf, sizeof (ibd_print_buf),
	    "%s%d: HCA GUID %016llx port %d PKEY %02x ",
	    ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip),
	    (u_longlong_t)hca_guid, state->id_port, state->id_pkey);
	va_start(ap, fmt);
	(void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len,
	    fmt, ap);
	cmn_err(CE_NOTE, "!%s", ibd_print_buf);
	va_end(ap);
}

/*
 * Warlock directives
 */

/*
 * id_lso_lock
 *
 * state->id_lso->bkt_nfree may be accessed without a lock to
 * determine the threshold at which we have to ask the nw layer
 * to resume transmission (see ibd_resume_transmission()).
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock,
    ibd_state_t::id_lso))
_NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso))
_NOTE(SCHEME_PROTECTS_DATA("init", ibd_state_t::id_lso_policy))
_NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree))

/*
 * id_scq_poll_lock
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_scq_poll_lock,
    ibd_state_t::id_scq_poll_busy))

/*
 * id_txpost_lock
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
    ibd_state_t::id_tx_head))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
    ibd_state_t::id_tx_busy))

/*
 * id_acache_req_lock
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
    ibd_state_t::id_acache_req_cv))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
    ibd_state_t::id_req_list))
_NOTE(SCHEME_PROTECTS_DATA("atomic",
    ibd_acache_s::ac_ref))

/*
 * id_ac_mutex
 *
 * This mutex is actually supposed to protect id_ah_op as well,
 * but this path of the code isn't clean (see update of id_ah_op
 * in ibd_async_acache(), immediately after the call to
 * ibd_async_mcache()). For now, we'll skip this check by
 * declaring that id_ah_op is protected by some internal scheme
 * that warlock isn't aware of.
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
    ibd_state_t::id_ah_active))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
    ibd_state_t::id_ah_free))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
    ibd_state_t::id_ah_addr))
_NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this",
    ibd_state_t::id_ah_op))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
    ibd_state_t::id_ah_error))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
    ibd_state_t::id_ac_hot_ace))
_NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error))

/*
 * id_mc_mutex
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
    ibd_state_t::id_mc_full))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
    ibd_state_t::id_mc_non))

/*
 * id_trap_lock
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
    ibd_state_t::id_trap_cv))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
    ibd_state_t::id_trap_stop))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
    ibd_state_t::id_trap_inprog))

/*
 * id_prom_op
 */
_NOTE(SCHEME_PROTECTS_DATA("only by async thread",
    ibd_state_t::id_prom_op))

/*
 * id_sched_lock
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock,
    ibd_state_t::id_sched_needed))

/*
 * id_link_mutex
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex,
    ibd_state_t::id_link_state))
_NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state))
_NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start",
    ibd_state_t::id_link_speed))
_NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_sgid))

/*
 * id_tx_list.dl_mutex
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
    ibd_state_t::id_tx_list.dl_head))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
    ibd_state_t::id_tx_list.dl_pending_sends))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
    ibd_state_t::id_tx_list.dl_cnt))

/*
 * id_rx_list.dl_mutex
 */
_NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
    ibd_state_t::id_rx_list.dl_bufs_outstanding))
_NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
    ibd_state_t::id_rx_list.dl_cnt))


/*
 * Items protected by atomic updates
 */
_NOTE(SCHEME_PROTECTS_DATA("atomic update only",
    ibd_state_s::id_brd_rcv
    ibd_state_s::id_brd_xmt
    ibd_state_s::id_multi_rcv
    ibd_state_s::id_multi_xmt
    ibd_state_s::id_num_intrs
    ibd_state_s::id_rcv_bytes
    ibd_state_s::id_rcv_pkt
    ibd_state_s::id_rx_post_queue_index
    ibd_state_s::id_tx_short
    ibd_state_s::id_xmt_bytes
    ibd_state_s::id_xmt_pkt
    ibd_state_s::rc_rcv_trans_byte
    ibd_state_s::rc_rcv_trans_pkt
    ibd_state_s::rc_rcv_copy_byte
    ibd_state_s::rc_rcv_copy_pkt
    ibd_state_s::rc_xmt_bytes
    ibd_state_s::rc_xmt_small_pkt
    ibd_state_s::rc_xmt_fragmented_pkt
    ibd_state_s::rc_xmt_map_fail_pkt
    ibd_state_s::rc_xmt_map_succ_pkt))

/*
 * Non-mutex protection schemes for data elements. Almost all of
 * these are non-shared items.
 */
_NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded",
    callb_cpr
    ib_gid_s
    ib_header_info
    ibd_acache_rq
    ibd_acache_s::ac_mce
    ibd_acache_s::ac_chan
    ibd_mcache::mc_fullreap
    ibd_mcache::mc_jstate
    ibd_mcache::mc_req
    ibd_rwqe_s
    ibd_swqe_s
    ibd_wqe_s
    ibt_wr_ds_s::ds_va
    ibt_wr_lso_s
    ipoib_mac::ipoib_qpn
    mac_capab_lso_s
    msgb::b_next
    msgb::b_cont
    msgb::b_rptr
    msgb::b_wptr
    ibd_state_s::id_bgroup_created
    ibd_state_s::id_mac_state
    ibd_state_s::id_mtu
    ibd_state_s::id_num_rwqe
    ibd_state_s::id_num_swqe
    ibd_state_s::id_qpnum
    ibd_state_s::id_rcq_hdl
    ibd_state_s::id_rx_buf_sz
    ibd_state_s::id_rx_bufs
    ibd_state_s::id_rx_mr_hdl
    ibd_state_s::id_rx_wqes
    ibd_state_s::id_rxwcs
    ibd_state_s::id_rxwcs_size
    ibd_state_s::id_rx_nqueues
    ibd_state_s::id_rx_queues
    ibd_state_s::id_scope
    ibd_state_s::id_scq_hdl
    ibd_state_s::id_tx_buf_sz
    ibd_state_s::id_tx_bufs
    ibd_state_s::id_tx_mr_hdl
    ibd_state_s::id_tx_rel_list.dl_cnt
    ibd_state_s::id_tx_wqes
    ibd_state_s::id_txwcs
    ibd_state_s::id_txwcs_size
    ibd_state_s::rc_listen_hdl
    ibd_state_s::rc_listen_hdl_OFED_interop
    ibd_state_s::rc_srq_size
    ibd_state_s::rc_srq_rwqes
    ibd_state_s::rc_srq_rx_bufs
    ibd_state_s::rc_srq_rx_mr_hdl
    ibd_state_s::rc_tx_largebuf_desc_base
    ibd_state_s::rc_tx_mr_bufs
    ibd_state_s::rc_tx_mr_hdl
    ipha_s
    icmph_s
    ibt_path_info_s::pi_sid
    ibd_rc_chan_s::ace
    ibd_rc_chan_s::chan_hdl
    ibd_rc_chan_s::state
    ibd_rc_chan_s::chan_state
    ibd_rc_chan_s::is_tx_chan
    ibd_rc_chan_s::rcq_hdl
    ibd_rc_chan_s::rcq_size
    ibd_rc_chan_s::scq_hdl
    ibd_rc_chan_s::scq_size
    ibd_rc_chan_s::requester_gid
    ibd_rc_chan_s::requester_pkey
    ibd_rc_chan_s::rx_bufs
    ibd_rc_chan_s::rx_mr_hdl
    ibd_rc_chan_s::rx_rwqes
    ibd_rc_chan_s::tx_wqes
    ibd_rc_chan_s::tx_mr_bufs
    ibd_rc_chan_s::tx_mr_hdl
    ibd_rc_chan_s::tx_rel_list.dl_cnt
    ibd_rc_chan_s::tx_trans_error_cnt
    ibd_rc_tx_largebuf_s::lb_buf
    ibd_rc_msg_hello_s
    ibt_cm_return_args_s))

/*
 * ibd_rc_chan_s::next is protected by two mutexes:
 * 1) ibd_state_s::rc_pass_chan_list.chan_list_mutex
 * 2) ibd_state_s::rc_obs_act_chan_list.chan_list_mutex.
 */
_NOTE(SCHEME_PROTECTS_DATA("protected by two mutexes",
    ibd_rc_chan_s::next))

/*
 * ibd_state_s.rc_tx_large_bufs_lock
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
    ibd_state_s::rc_tx_largebuf_free_head))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
    ibd_state_s::rc_tx_largebuf_nfree))
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_tx_large_bufs_lock,
    ibd_rc_tx_largebuf_s::lb_next))

/*
 * ibd_acache_s.tx_too_big_mutex
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_acache_s::tx_too_big_mutex,
    ibd_acache_s::tx_too_big_ongoing))

/*
 * tx_wqe_list.dl_mutex
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
    ibd_rc_chan_s::tx_wqe_list.dl_head))
_NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
    ibd_rc_chan_s::tx_wqe_list.dl_pending_sends))
_NOTE(MUTEX_PROTECTS_DATA(ibd_rc_chan_s::tx_wqe_list.dl_mutex,
    ibd_rc_chan_s::tx_wqe_list.dl_cnt))

/*
 * ibd_state_s.rc_ace_recycle_lock
 */
_NOTE(MUTEX_PROTECTS_DATA(ibd_state_s::rc_ace_recycle_lock,
    ibd_state_s::rc_ace_recycle))

/*
 * rc_srq_rwqe_list.dl_mutex
 */
_NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
    ibd_state_t::rc_srq_rwqe_list.dl_bufs_outstanding))
_NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
    ibd_state_t::rc_srq_rwqe_list.dl_cnt))

/*
 * Non-mutex protection schemes for data elements. They are counters
 * for problem diagnosis and do not need to be protected.
 */
_NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis",
    ibd_state_s::rc_rcv_alloc_fail
    ibd_state_s::rc_rcq_invoke
    ibd_state_s::rc_rcq_err
    ibd_state_s::rc_ace_not_found
    ibd_state_s::rc_xmt_drop_too_long_pkt
    ibd_state_s::rc_xmt_icmp_too_long_pkt
    ibd_state_s::rc_xmt_reenter_too_long_pkt
    ibd_state_s::rc_swqe_short
    ibd_state_s::rc_swqe_mac_update
    ibd_state_s::rc_xmt_buf_short
    ibd_state_s::rc_xmt_buf_mac_update
    ibd_state_s::rc_scq_no_swqe
    ibd_state_s::rc_scq_no_largebuf
    ibd_state_s::rc_scq_invoke
    ibd_state_s::rc_conn_succ
    ibd_state_s::rc_conn_fail
    ibd_state_s::rc_null_conn
    ibd_state_s::rc_no_estab_conn
    ibd_state_s::rc_act_close
    ibd_state_s::rc_pas_close
    ibd_state_s::rc_delay_ace_recycle
    ibd_state_s::rc_act_close_simultaneous
    ibd_state_s::rc_reset_cnt))

#ifdef DEBUG
/*
 * Non-mutex protection schemes for data elements. They are counters
 * for problem diagnosis and do not need to be protected.
 */
_NOTE(SCHEME_PROTECTS_DATA("counters for problem diagnosis",
    ibd_state_s::rc_rwqe_short
    ibd_rc_stat_s::rc_rcv_trans_byte
    ibd_rc_stat_s::rc_rcv_trans_pkt
    ibd_rc_stat_s::rc_rcv_copy_byte
    ibd_rc_stat_s::rc_rcv_copy_pkt
    ibd_rc_stat_s::rc_rcv_alloc_fail
    ibd_rc_stat_s::rc_rcq_invoke
    ibd_rc_stat_s::rc_rcq_err
    ibd_rc_stat_s::rc_scq_invoke
    ibd_rc_stat_s::rc_rwqe_short
    ibd_rc_stat_s::rc_xmt_bytes
    ibd_rc_stat_s::rc_xmt_small_pkt
    ibd_rc_stat_s::rc_xmt_fragmented_pkt
    ibd_rc_stat_s::rc_xmt_map_fail_pkt
    ibd_rc_stat_s::rc_xmt_map_succ_pkt
    ibd_rc_stat_s::rc_ace_not_found
    ibd_rc_stat_s::rc_scq_no_swqe
    ibd_rc_stat_s::rc_scq_no_largebuf
    ibd_rc_stat_s::rc_swqe_short
    ibd_rc_stat_s::rc_swqe_mac_update
    ibd_rc_stat_s::rc_xmt_buf_short
    ibd_rc_stat_s::rc_xmt_buf_mac_update
    ibd_rc_stat_s::rc_conn_succ
    ibd_rc_stat_s::rc_conn_fail
    ibd_rc_stat_s::rc_null_conn
    ibd_rc_stat_s::rc_no_estab_conn
    ibd_rc_stat_s::rc_act_close
    ibd_rc_stat_s::rc_pas_close
    ibd_rc_stat_s::rc_delay_ace_recycle
    ibd_rc_stat_s::rc_act_close_simultaneous
    ibd_rc_stat_s::rc_reset_cnt))
#endif

int
_init()
{
	int status;

	status = ddi_soft_state_init(&ibd_list, max(sizeof (ibd_state_t),
	    PAGESIZE), 0);
	if (status != 0) {
		DPRINT(10, "_init:failed in ddi_soft_state_init()");
		return (status);
	}

	mac_init_ops(&ibd_dev_ops, "ibd");
	status = mod_install(&ibd_modlinkage);
	if (status != 0) {
		DPRINT(10, "_init:failed in mod_install()");
		ddi_soft_state_fini(&ibd_list);
		mac_fini_ops(&ibd_dev_ops);
		return (status);
	}

	mutex_init(&ibd_gstate.ig_mutex, NULL, MUTEX_DRIVER, NULL);
	mutex_enter(&ibd_gstate.ig_mutex);
	ibd_gstate.ig_ibt_hdl = NULL;
	ibd_gstate.ig_ibt_hdl_ref_cnt = 0;
	ibd_gstate.ig_service_list = NULL;
	mutex_exit(&ibd_gstate.ig_mutex);

#ifdef IBD_LOGGING
	ibd_log_init();
#endif
	return (0);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&ibd_modlinkage, modinfop));
}

int
_fini()
{
	int status;

	status = mod_remove(&ibd_modlinkage);
	if (status != 0)
		return (status);

	mac_fini_ops(&ibd_dev_ops);
	ddi_soft_state_fini(&ibd_list);
	mutex_destroy(&ibd_gstate.ig_mutex);
#ifdef IBD_LOGGING
	ibd_log_fini();
#endif
	return (0);
}

/*
 * Convert the GID part of the mac address from network byte order
 * to host order.
 */
static void
ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid)
{
	ib_sn_prefix_t nbopref;
	ib_guid_t nboguid;

	bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t));
	bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t));
	dgid->gid_prefix = b2h64(nbopref);
	dgid->gid_guid = b2h64(nboguid);
}

/*
 * Create the IPoIB address in network byte order from host order inputs.
 */
static void
ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix,
    ib_guid_t guid)
{
	ib_sn_prefix_t nbopref;
	ib_guid_t nboguid;

	mac->ipoib_qpn = htonl(qpn);
	nbopref = h2b64(prefix);
	nboguid = h2b64(guid);
	bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t));
	bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t));
}
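/*
 * Layout note (an illustrative sketch): the 20-byte IPoIB hardware address
 * handled by the two helpers above consists of a 4-byte flags+QPN word
 * followed by the 16-byte GID (8-byte subnet prefix plus 8-byte GUID), all
 * stored in network byte order. A hypothetical caller holding host-order
 * values would build and decompose an address as:
 *
 *	ipoib_mac_t mac;
 *	ib_gid_t dgid;
 *
 *	ibd_h2n_mac(&mac, qpn, gid_prefix, gid_guid);
 *	ibd_n2h_gid(&mac, &dgid);
 *
 * after which dgid.gid_prefix == gid_prefix and dgid.gid_guid == gid_guid.
 */
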
/*
 * Send to the appropriate all-routers group when the IBA multicast group
 * does not exist, based on whether the target group is v4 or v6.
 */
static boolean_t
ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac,
    ipoib_mac_t *rmac)
{
	boolean_t retval = B_TRUE;
	uint32_t adjscope = state->id_scope << 16;
	uint32_t topword;

	/*
	 * Copy the first 4 bytes in without assuming any alignment of
	 * input mac address; this will have IPoIB signature, flags and
	 * scope bits.
	 */
	bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t));
	topword = ntohl(topword);

	/*
	 * Generate proper address for IPv4/v6, adding in the Pkey properly.
	 */
	if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) ||
	    (topword == (IB_MCGID_IPV6_PREFIX | adjscope)))
		ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) |
		    ((uint32_t)(state->id_pkey << 16))),
		    (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP));
	else
		/*
		 * Does not have proper bits in the mgid address.
		 */
		retval = B_FALSE;

	return (retval);
}

/*
 * Membership states for different mcg's are tracked by two lists:
 * the "non" list is used for promiscuous mode, when all mcg traffic
 * needs to be inspected. This type of membership is never used for
 * transmission, so there can not be an AH in the active list
 * corresponding to a member in this list. This list does not need
 * any protection, since all operations are performed by the async
 * thread.
 *
 * "Full" and "SendOnly" membership is tracked using a single list,
 * the "full" list. This is because this single list can then be
 * searched during transmit to a multicast group (if an AH for the
 * mcg is not found in the active list), since at least one type
 * of membership must be present before initiating the transmit.
 * This list is also emptied during driver detach, since sendonly
 * membership acquired during transmit is dropped at detach time
 * along with ipv4 broadcast full membership. Insert/deletes to
 * this list are done only by the async thread, but it is also
 * searched in program context (see multicast disable case), thus
 * the id_mc_mutex protects the list. The driver detach path also
 * deconstructs the "full" list, but it ensures that the async
 * thread will not be accessing the list (by blocking out mcg
 * trap handling and making sure no more Tx reaping will happen).
 *
 * Currently, an IBA attach is done in the SendOnly case too,
 * although this is not required.
 */
#define	IBD_MCACHE_INSERT_FULL(state, mce) \
	list_insert_head(&state->id_mc_full, mce)
#define	IBD_MCACHE_INSERT_NON(state, mce) \
	list_insert_head(&state->id_mc_non, mce)
#define	IBD_MCACHE_FIND_FULL(state, mgid) \
	ibd_mcache_find(mgid, &state->id_mc_full)
#define	IBD_MCACHE_FIND_NON(state, mgid) \
	ibd_mcache_find(mgid, &state->id_mc_non)
#define	IBD_MCACHE_PULLOUT_FULL(state, mce) \
	list_remove(&state->id_mc_full, mce)
#define	IBD_MCACHE_PULLOUT_NON(state, mce) \
	list_remove(&state->id_mc_non, mce)

static void *
list_get_head(list_t *list)
{
	list_node_t *lhead = list_head(list);

	if (lhead != NULL)
		list_remove(list, lhead);
	return (lhead);
}
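/*
 * Illustrative use of the mcache macros above (a sketch of what
 * ibd_async_mcache() does further down in this file): before transmitting
 * to a multicast group, the async thread checks the "full" list and, if no
 * membership exists yet, attempts a SendOnlyNonMember join:
 *
 *	ibd_n2h_gid(mac, &mgid);
 *	mce = IBD_MCACHE_FIND_FULL(state, mgid);
 *	if (mce == NULL)
 *		mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON);
 */
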
/*
 * This is always guaranteed to be able to queue the work.
 */
void
ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op)
{
	/* Initialize request */
	DPRINT(1, "ibd_queue_work_slot : op: %d \n", op);
	ptr->rq_op = op;

	/*
	 * Queue provided slot onto request pool.
	 */
	mutex_enter(&state->id_acache_req_lock);
	list_insert_tail(&state->id_req_list, ptr);

	/* Go, fetch, async thread */
	cv_signal(&state->id_acache_req_cv);
	mutex_exit(&state->id_acache_req_lock);
}

/*
 * Main body of the per interface async thread.
 */
static void
ibd_async_work(ibd_state_t *state)
{
	ibd_req_t *ptr;
	callb_cpr_t cprinfo;

	mutex_enter(&state->id_acache_req_lock);
	CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock,
	    callb_generic_cpr, "ibd_async_work");

	for (;;) {
		ptr = list_get_head(&state->id_req_list);
		if (ptr != NULL) {
			mutex_exit(&state->id_acache_req_lock);

			/*
			 * Once we have done the operation, there is no
			 * guarantee the request slot is going to be valid,
			 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP,
			 * TRAP).
			 *
			 * Perform the request.
			 */
			switch (ptr->rq_op) {
			case IBD_ASYNC_GETAH:
				ibd_async_acache(state, &ptr->rq_mac);
				break;
			case IBD_ASYNC_JOIN:
			case IBD_ASYNC_LEAVE:
				ibd_async_multicast(state,
				    ptr->rq_gid, ptr->rq_op);
				break;
			case IBD_ASYNC_PROMON:
				ibd_async_setprom(state);
				break;
			case IBD_ASYNC_PROMOFF:
				ibd_async_unsetprom(state);
				break;
			case IBD_ASYNC_REAP:
				ibd_async_reap_group(state,
				    ptr->rq_ptr, ptr->rq_gid,
				    IB_MC_JSTATE_FULL);
				/*
				 * The req buf is contained in the mce
				 * structure, so we do not need to free
				 * it here.
				 */
				ptr = NULL;
				break;
			case IBD_ASYNC_TRAP:
				ibd_async_trap(state, ptr);
				break;
			case IBD_ASYNC_SCHED:
				ibd_async_txsched(state);
				break;
			case IBD_ASYNC_LINK:
				ibd_async_link(state, ptr);
				break;
			case IBD_ASYNC_EXIT:
				mutex_enter(&state->id_acache_req_lock);
#ifndef __lock_lint
				CALLB_CPR_EXIT(&cprinfo);
#else
				mutex_exit(&state->id_acache_req_lock);
#endif
				return;
			case IBD_ASYNC_RC_TOO_BIG:
				ibd_async_rc_process_too_big(state,
				    ptr);
				break;
			case IBD_ASYNC_RC_CLOSE_ACT_CHAN:
				ibd_async_rc_close_act_chan(state, ptr);
				break;
			case IBD_ASYNC_RC_RECYCLE_ACE:
				ibd_async_rc_recycle_ace(state, ptr);
				break;
			}
			if (ptr != NULL)
				kmem_cache_free(state->id_req_kmc, ptr);

			mutex_enter(&state->id_acache_req_lock);
		} else {
#ifndef __lock_lint
			/*
			 * Nothing to do: wait till new request arrives.
			 */
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&state->id_acache_req_cv,
			    &state->id_acache_req_lock);
			CALLB_CPR_SAFE_END(&cprinfo,
			    &state->id_acache_req_lock);
#endif
		}
	}

	/*NOTREACHED*/
	_NOTE(NOT_REACHED)
}
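/*
 * A typical producer of async work (a sketch mirroring what
 * ibd_acache_lookup() does later in this file): allocate a request from the
 * id_req_kmc cache, fill in the operation-specific fields and hand it to
 * ibd_queue_work_slot(), which wakes ibd_async_work() above:
 *
 *	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
 *	if (req != NULL) {
 *		bcopy(mac, &req->rq_mac, IPOIB_ADDRL);
 *		ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);
 *	}
 */
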
/*
 * Return when it is safe to queue requests to the async daemon; primarily
 * for subnet trap and async event handling. Disallow requests before the
 * daemon is created, and when interface deinitialization starts.
 */
static boolean_t
ibd_async_safe(ibd_state_t *state)
{
	mutex_enter(&state->id_trap_lock);
	if (state->id_trap_stop) {
		mutex_exit(&state->id_trap_lock);
		return (B_FALSE);
	}
	state->id_trap_inprog++;
	mutex_exit(&state->id_trap_lock);
	return (B_TRUE);
}

/*
 * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet
 * trap or event handling to complete to kill the async thread and deconstruct
 * the mcg/ace list.
 */
static void
ibd_async_done(ibd_state_t *state)
{
	mutex_enter(&state->id_trap_lock);
	if (--state->id_trap_inprog == 0)
		cv_signal(&state->id_trap_cv);
	mutex_exit(&state->id_trap_lock);
}

/*
 * Hash functions:
 * ibd_hash_by_id: Returns the qpn as the hash entry into bucket.
 * ibd_hash_key_cmp: Compares two keys, returns 0 if they match, else 1.
 * These operate on mac addresses input into ibd_send, but there is no
 * guarantee on the alignment of the ipoib_mac_t structure.
 */
/*ARGSUSED*/
static uint_t
ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
{
	ulong_t ptraddr = (ulong_t)key;
	uint_t hval;

	/*
	 * If the input address is 4 byte aligned, we can just dereference
	 * it. This is most common, since IP will send in a 4 byte aligned
	 * IP header, which implies the 24 byte IPoIB pseudo header will be
	 * 4 byte aligned too.
	 */
	if ((ptraddr & 3) == 0)
		return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn);

	bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t));
	return (hval);
}

static int
ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
{
	if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0)
		return (0);
	else
		return (1);
}
/*
 * Initialize all the per interface caches and lists; AH cache,
 * MCG list etc.
 */
static int
ibd_acache_init(ibd_state_t *state)
{
	ibd_ace_t *ce;
	int i;

	mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL);

	mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL);
	mutex_enter(&state->id_ac_mutex);
	list_create(&state->id_ah_free, sizeof (ibd_ace_t),
	    offsetof(ibd_ace_t, ac_list));
	list_create(&state->id_ah_active, sizeof (ibd_ace_t),
	    offsetof(ibd_ace_t, ac_list));
	state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash",
	    IBD_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor,
	    ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP);
	list_create(&state->id_mc_full, sizeof (ibd_mce_t),
	    offsetof(ibd_mce_t, mc_list));
	list_create(&state->id_mc_non, sizeof (ibd_mce_t),
	    offsetof(ibd_mce_t, mc_list));
	list_create(&state->id_req_list, sizeof (ibd_req_t),
	    offsetof(ibd_req_t, rq_list));
	state->id_ac_hot_ace = NULL;

	state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) *
	    IBD_NUM_AH, KM_SLEEP);
	for (i = 0; i < IBD_NUM_AH; i++, ce++) {
		if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS,
		    state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) {
			mutex_exit(&state->id_ac_mutex);
			ibd_acache_fini(state);
			return (DDI_FAILURE);
		} else {
			CLEAR_REFCYCLE(ce);
			ce->ac_mce = NULL;
			mutex_init(&ce->tx_too_big_mutex, NULL,
			    MUTEX_DRIVER, NULL);
			IBD_ACACHE_INSERT_FREE(state, ce);
		}
	}
	mutex_exit(&state->id_ac_mutex);
	return (DDI_SUCCESS);
}

static void
ibd_acache_fini(ibd_state_t *state)
{
	ibd_ace_t *ptr;

	mutex_enter(&state->id_ac_mutex);

	while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) {
		ASSERT(GET_REF(ptr) == 0);
		mutex_destroy(&ptr->tx_too_big_mutex);
		(void) ibt_free_ud_dest(ptr->ac_dest);
	}

	while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) {
		ASSERT(GET_REF(ptr) == 0);
		mutex_destroy(&ptr->tx_too_big_mutex);
		(void) ibt_free_ud_dest(ptr->ac_dest);
	}

	list_destroy(&state->id_ah_free);
	list_destroy(&state->id_ah_active);
	list_destroy(&state->id_mc_full);
	list_destroy(&state->id_mc_non);
	list_destroy(&state->id_req_list);
	kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * IBD_NUM_AH);
	mutex_exit(&state->id_ac_mutex);
	mutex_destroy(&state->id_ac_mutex);
	mutex_destroy(&state->id_mc_mutex);
	mutex_destroy(&state->id_acache_req_lock);
	cv_destroy(&state->id_acache_req_cv);
}

/*
 * Search AH active hash list for a cached path to input destination.
 * If we are "just looking", hold == F. When we are in the Tx path,
 * we set hold == T to grab a reference on the AH so that it can not
 * be recycled to a new destination while the Tx request is posted.
 */
ibd_ace_t *
ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num)
{
	ibd_ace_t *ptr;

	ASSERT(mutex_owned(&state->id_ac_mutex));

	/*
	 * Do hash search.
	 */
	if (mod_hash_find(state->id_ah_active_hash,
	    (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) {
		if (hold)
			INC_REF(ptr, num);
		return (ptr);
	}
	return (NULL);
}

/*
 * This is called by the tx side; if an initialized AH is found in
 * the active list, it is locked down and can be used; if no entry
 * is found, an async request is queued to do path resolution.
 */
static ibd_ace_t *
ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe)
{
	ibd_ace_t *ptr;
	ibd_req_t *req;

	/*
	 * Only attempt to print when we can; in the mdt pattr case, the
	 * address is not aligned properly.
	 */
	if (((ulong_t)mac & 3) == 0) {
		DPRINT(4,
		    "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X",
		    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
		    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
		    htonl(mac->ipoib_gidsuff[1]));
	}

	mutex_enter(&state->id_ac_mutex);

	if (((ptr = state->id_ac_hot_ace) != NULL) &&
	    (memcmp(&ptr->ac_mac, mac, sizeof (*mac)) == 0)) {
		INC_REF(ptr, numwqe);
		mutex_exit(&state->id_ac_mutex);
		return (ptr);
	}
	if (((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL)) {
		state->id_ac_hot_ace = ptr;
		mutex_exit(&state->id_ac_mutex);
		return (ptr);
	}

	/*
	 * Implementation of a single outstanding async request; if
	 * the operation is not started yet, queue a request and move
	 * to ongoing state. Remember in id_ah_addr for which address
	 * we are queueing the request, in case we need to flag an error;
	 * any further requests, for the same or different address, until
	 * the operation completes, are sent back to GLDv3 to be retried.
	 * The async thread will update id_ah_op with an error indication
	 * or will set it to indicate the next look up can start; either
	 * way, it will mac_tx_update() so that all blocked requests come
	 * back here.
	 */
	*err = EAGAIN;
	if (state->id_ah_op == IBD_OP_NOTSTARTED) {
		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
		if (req != NULL) {
			/*
			 * We did not even find the entry; queue a request
			 * for it.
			 */
			bcopy(mac, &(req->rq_mac), IPOIB_ADDRL);
			ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);
			state->id_ah_op = IBD_OP_ONGOING;
			bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL);
		}
	} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) {
		/*
		 * Check the status of the pathrecord lookup request
		 * we had queued before.
		 */
		if (state->id_ah_op == IBD_OP_ERRORED) {
			*err = EFAULT;
			state->id_ah_error++;
		} else {
			/*
			 * IBD_OP_ROUTERED case: We need to send to the
			 * all-router MCG. If we can find the AH for
			 * the mcg, the Tx will be attempted. If we
			 * do not find the AH, we return NORESOURCES
			 * to retry.
			 */
			ipoib_mac_t routermac;

			(void) ibd_get_allroutergroup(state, mac, &routermac);
			ptr = ibd_acache_find(state, &routermac, B_TRUE,
			    numwqe);
		}
		state->id_ah_op = IBD_OP_NOTSTARTED;
	} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) {
		/*
		 * This case can happen when we get a higher band
		 * packet. The easiest way is to reset the state machine
		 * to accommodate the higher priority packet.
		 */
		state->id_ah_op = IBD_OP_NOTSTARTED;
	}
	mutex_exit(&state->id_ac_mutex);

	return (ptr);
}

/*
 * Grab a not-currently-in-use AH/PathRecord from the active
 * list to recycle to a new destination. Only the async thread
 * executes this code.
 */
static ibd_ace_t *
ibd_acache_get_unref(ibd_state_t *state)
{
	ibd_ace_t *ptr = list_tail(&state->id_ah_active);
	boolean_t try_rc_chan_recycle = B_FALSE;

	ASSERT(mutex_owned(&state->id_ac_mutex));

	/*
	 * Do plain linear search.
	 */
	while (ptr != NULL) {
		/*
		 * Note that it is possible that the "cycle" bit
		 * is set on the AH w/o any reference count. The
		 * mcg must have been deleted, and the tx cleanup
		 * just decremented the reference count to 0, but
		 * hasn't gotten around to grabbing the id_ac_mutex
		 * to move the AH into the free list.
		 */
		if (GET_REF(ptr) == 0) {
			if (ptr->ac_chan != NULL) {
				ASSERT(state->id_enable_rc == B_TRUE);
				if (!try_rc_chan_recycle) {
					try_rc_chan_recycle = B_TRUE;
					ibd_rc_signal_ace_recycle(state, ptr);
				}
			} else {
				IBD_ACACHE_PULLOUT_ACTIVE(state, ptr);
				break;
			}
		}
		ptr = list_prev(&state->id_ah_active, ptr);
	}
	return (ptr);
}

/*
 * Invoked to clean up AH from active list in case of multicast
 * disable and to handle sendonly memberships during mcg traps.
 * And for port up processing for multicast and unicast AHs.
 * Normally, the AH is taken off the active list, and put into
 * the free list to be recycled for a new destination. In case
 * Tx requests on the AH have not completed yet, the AH is marked
 * for reaping (which will put the AH on the free list) once the Tx's
 * complete; in this case, depending on the "force" input, we take
 * out the AH from the active list right now, or leave it also for
 * the reap operation. Returns TRUE if the AH is taken off the active
 * list (and either put into the free list right now, or arranged for
 * later), FALSE otherwise.
 */
boolean_t
ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force)
{
	ibd_ace_t *acactive;
	boolean_t ret = B_TRUE;

	ASSERT(mutex_owned(&state->id_ac_mutex));

	if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) {

		/*
		 * Note that the AH might already have the cycle bit set
		 * on it; this might happen if sequences of multicast
		 * enables and disables are coming so fast, that posted
		 * Tx's to the mcg have not completed yet, and the cycle
		 * bit is set successively by each multicast disable.
		 */
		if (SET_CYCLE_IF_REF(acactive)) {
			if (!force) {
				/*
				 * The ace is kept on the active list, further
				 * Tx's can still grab a reference on it; the
				 * ace is reaped when all pending Tx's
				 * referencing the AH complete.
				 */
				ret = B_FALSE;
			} else {
				/*
				 * In the mcg trap case, we always pull the
				 * AH from the active list. And also the port
				 * up multi/unicast case.
				 */
				ASSERT(acactive->ac_chan == NULL);
				IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
				acactive->ac_mce = NULL;
			}
		} else {
			/*
			 * The ref count is 0, so reclaim the ace
			 * immediately after pulling it out of the
			 * active list.
			 */
			ASSERT(acactive->ac_chan == NULL);
			IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
			acactive->ac_mce = NULL;
			IBD_ACACHE_INSERT_FREE(state, acactive);
		}

	}
	return (ret);
}

/*
 * Helper function for async path record lookup. If we are trying to
 * Tx to a MCG, check our membership, possibly trying to join the
 * group if required. If that fails, try to send the packet to the
 * all router group (indicated by the redirect output), pointing
 * the input mac address to the router mcg address.
 */
static ibd_mce_t *
ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect)
{
	ib_gid_t mgid;
	ibd_mce_t *mce;
	ipoib_mac_t routermac;

	*redirect = B_FALSE;
	ibd_n2h_gid(mac, &mgid);

	/*
	 * Check the FullMember+SendOnlyNonMember list.
	 * Since we are the only one who manipulates the
	 * id_mc_full list, no locks are needed.
	 */
	mce = IBD_MCACHE_FIND_FULL(state, mgid);
	if (mce != NULL) {
		DPRINT(4, "ibd_async_mcache : already joined to group");
		return (mce);
	}

	/*
	 * Not found; try to join(SendOnlyNonMember) and attach.
	 */
	DPRINT(4, "ibd_async_mcache : not joined to group");
	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
	    NULL) {
		DPRINT(4, "ibd_async_mcache : nonmem joined to group");
		return (mce);
	}

	/*
	 * MCGroup not present; try to join the all-router group. If
	 * any of the following steps succeed, we will be redirecting
	 * to the all router group.
	 */
	DPRINT(4, "ibd_async_mcache : nonmem join failed");
	if (!ibd_get_allroutergroup(state, mac, &routermac))
		return (NULL);
	*redirect = B_TRUE;
	ibd_n2h_gid(&routermac, &mgid);
	bcopy(&routermac, mac, IPOIB_ADDRL);
	DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n",
	    mgid.gid_prefix, mgid.gid_guid);

	/*
	 * Are we already joined to the router group?
	 */
	if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) {
		DPRINT(4, "ibd_async_mcache : using already joined router"
		    "group\n");
		return (mce);
	}

	/*
	 * Can we join(SendOnlyNonMember) the router group?
	 */
	DPRINT(4, "ibd_async_mcache : attempting join to router grp");
	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
	    NULL) {
		DPRINT(4, "ibd_async_mcache : joined to router grp");
		return (mce);
	}

	return (NULL);
}

/*
 * Async path record lookup code.
 */
static void
ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac)
{
	ibd_ace_t *ce;
	ibd_mce_t *mce = NULL;
	ibt_path_attr_t path_attr;
	ibt_path_info_t path_info;
	ib_gid_t destgid;
	char ret = IBD_OP_NOTSTARTED;

	DPRINT(4, "ibd_async_acache : %08X:%08X:%08X:%08X:%08X",
	    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
	    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
	    htonl(mac->ipoib_gidsuff[1]));

	/*
	 * Check whether we are trying to transmit to a MCG.
	 * In that case, we need to make sure we are a member of
	 * the MCG.
	 */
	if (mac->ipoib_qpn == htonl(IB_MC_QPN)) {
		boolean_t redirected;

		/*
		 * If we can not find or join the group or even
		 * redirect, error out.
		 */
		if ((mce = ibd_async_mcache(state, mac, &redirected)) ==
		    NULL) {
			state->id_ah_op = IBD_OP_ERRORED;
			return;
		}

		/*
		 * If we got redirected, we need to determine whether
		 * the AH for the new mcg is already in the cache, in
		 * which case we do not need to pull it in; otherwise
		 * proceed to get the path for the new mcg. There is
		 * no guarantee that if the AH is currently in the
		 * cache, it will still be there when we look in
		 * ibd_acache_lookup(), but that's okay, we will come
		 * back here.
		 */
		if (redirected) {
			ret = IBD_OP_ROUTERED;
			DPRINT(4, "ibd_async_acache : redirected to "
			    "%08X:%08X:%08X:%08X:%08X",
			    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
			    htonl(mac->ipoib_gidpref[1]),
			    htonl(mac->ipoib_gidsuff[0]),
			    htonl(mac->ipoib_gidsuff[1]));

			mutex_enter(&state->id_ac_mutex);
			if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) {
				state->id_ah_op = IBD_OP_ROUTERED;
				mutex_exit(&state->id_ac_mutex);
				DPRINT(4, "ibd_async_acache : router AH found");
				return;
			}
			mutex_exit(&state->id_ac_mutex);
		}
	}

	/*
	 * Get an AH from the free list.
	 */
	mutex_enter(&state->id_ac_mutex);
	if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) {
		/*
		 * No free ones; try to grab an unreferenced active
		 * one. Maybe we need to make the active list LRU,
		 * but that will create more work for Tx callbacks.
		 * Is there a way of not having to pull out the
		 * entry from the active list, but just indicate it
		 * is being recycled? Yes, but that creates one more
		 * check in the fast lookup path.
		 */
		if ((ce = ibd_acache_get_unref(state)) == NULL) {
			/*
			 * Pretty serious shortage now.
			 */
			state->id_ah_op = IBD_OP_NOTSTARTED;
			mutex_exit(&state->id_ac_mutex);
			DPRINT(10, "ibd_async_acache : failed to find AH "
			    "slot\n");
			return;
		}
		/*
		 * We could check whether ac_mce points to a SendOnly
		 * member and drop that membership now. Or do it lazily
		 * at detach time.
		 */
		ce->ac_mce = NULL;
	}
	mutex_exit(&state->id_ac_mutex);
	ASSERT(ce->ac_mce == NULL);

	/*
	 * Update the entry.
	 */
	bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL);

	bzero(&path_info, sizeof (path_info));
	bzero(&path_attr, sizeof (ibt_path_attr_t));
	path_attr.pa_sgid = state->id_sgid;
	path_attr.pa_num_dgids = 1;
	ibd_n2h_gid(&ce->ac_mac, &destgid);
	path_attr.pa_dgids = &destgid;
	path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
	    &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) {
		DPRINT(10, "ibd_async_acache : failed in ibt_get_paths");
		goto error;
	}
	if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey,
	    ntohl(ce->ac_mac.ipoib_qpn),
	    &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) {
		DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest");
		goto error;
	}

	/*
	 * mce is set whenever an AH is being associated with a
	 * MCG; this will come in handy when we leave the MCG. The
	 * lock protects Tx fastpath from scanning the active list.
	 */
	if (mce != NULL)
		ce->ac_mce = mce;

	/*
	 * Initiate an RC mode connection for a unicast address.
	 */
	if (state->id_enable_rc && (mac->ipoib_qpn != htonl(IB_MC_QPN)) &&
	    (htonl(mac->ipoib_qpn) & IBD_MAC_ADDR_RC)) {
		ASSERT(ce->ac_chan == NULL);
		DPRINT(10, "ibd_async_acache: call "
		    "ibd_rc_try_connect(ace=%p)", ce);
		ibd_rc_try_connect(state, ce, &path_info);
		if (ce->ac_chan == NULL) {
			DPRINT(10, "ibd_async_acache: fail to setup RC"
			    " channel");
			state->rc_conn_fail++;
			goto error;
		}
	}

	mutex_enter(&state->id_ac_mutex);
	IBD_ACACHE_INSERT_ACTIVE(state, ce);
	state->id_ah_op = ret;
	mutex_exit(&state->id_ac_mutex);
	return;
error:
	/*
	 * We might want to drop SendOnly membership here if we
	 * joined above. The lock protects Tx callbacks inserting
	 * into the free list.
	 */
	mutex_enter(&state->id_ac_mutex);
	state->id_ah_op = IBD_OP_ERRORED;
	IBD_ACACHE_INSERT_FREE(state, ce);
	mutex_exit(&state->id_ac_mutex);
}

/*
 * While restoring port's presence on the subnet on a port up, it is possible
 * that the port goes down again.
 */
static void
ibd_async_link(ibd_state_t *state, ibd_req_t *req)
{
	ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr;
	link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN :
	    LINK_STATE_UP;
	ibd_mce_t *mce, *pmce;
	ibd_ace_t *ace, *pace;

	DPRINT(10, "ibd_async_link(): %d", opcode);

	/*
	 * On a link up, revalidate the link speed/width. No point doing
	 * this on a link down, since we will be unable to do SA operations,
	 * defaulting to the lowest speed. Also notice that we update our
	 * notion of speed before calling mac_link_update(), which will do
	 * necessary higher level notifications for speed changes.
	 */
	if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) {
		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
		state->id_link_speed = ibd_get_portspeed(state);
		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
	}

	/*
	 * Do all the work required to establish our presence on
	 * the subnet.
	 */
	if (opcode == IBD_LINK_UP_ABSENT) {
		/*
		 * If in promiscuous mode ...
		 */
		if (state->id_prom_op == IBD_OP_COMPLETED) {
			/*
			 * Drop all nonmembership.
			 */
			ibd_async_unsetprom(state);

			/*
			 * Then, try to regain nonmembership to all mcg's.
			 */
			ibd_async_setprom(state);

		}

		/*
		 * Drop all sendonly membership (which also gets rid of the
		 * AHs); try to reacquire all full membership.
		 */
		mce = list_head(&state->id_mc_full);
		while ((pmce = mce) != NULL) {
			mce = list_next(&state->id_mc_full, mce);
			if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON)
				ibd_leave_group(state,
				    pmce->mc_info.mc_adds_vect.av_dgid,
				    IB_MC_JSTATE_SEND_ONLY_NON);
			else
				ibd_reacquire_group(state, pmce);
		}

		/*
		 * Recycle all active AHs to free list (and if there are
		 * pending posts, make sure they will go into the free list
		 * once the Tx's complete). Grab the lock to prevent
		 * concurrent Tx's as well as Tx cleanups.
1868 */ 1869 mutex_enter(&state->id_ac_mutex); 1870 ace = list_head(&state->id_ah_active); 1871 while ((pace = ace) != NULL) { 1872 boolean_t cycled; 1873 1874 ace = list_next(&state->id_ah_active, ace); 1875 mce = pace->ac_mce; 1876 if (pace->ac_chan != NULL) { 1877 ASSERT(mce == NULL); 1878 ASSERT(state->id_enable_rc == B_TRUE); 1879 if (pace->ac_chan->chan_state == 1880 IBD_RC_STATE_ACT_ESTAB) { 1881 INC_REF(pace, 1); 1882 IBD_ACACHE_PULLOUT_ACTIVE(state, pace); 1883 pace->ac_chan->chan_state = 1884 IBD_RC_STATE_ACT_CLOSING; 1885 ibd_rc_signal_act_close(state, pace); 1886 } else { 1887 state->rc_act_close_simultaneous++; 1888 DPRINT(40, "ibd_async_link: other " 1889 "thread is closing it, ace=%p, " 1890 "ac_chan=%p, chan_state=%d", 1891 pace, pace->ac_chan, 1892 pace->ac_chan->chan_state); 1893 } 1894 } else { 1895 cycled = ibd_acache_recycle(state, 1896 &pace->ac_mac, B_TRUE); 1897 } 1898 /* 1899 * If this is for an mcg, it must be for a fullmember, 1900 * since we got rid of send-only members above when 1901 * processing the mce list. 1902 */ 1903 ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate == 1904 IB_MC_JSTATE_FULL))); 1905 1906 /* 1907 * Check if the fullmember mce needs to be torn down, 1908 * ie whether the DLPI disable has already been done. 1909 * If so, do some of the work of tx_cleanup, namely 1910 * causing leave (which will fail), detach and 1911 * mce-freeing. tx_cleanup will put the AH into free 1912 * list. The reason to duplicate some of this 1913 * tx_cleanup work is because we want to delete the 1914 * AH right now instead of waiting for tx_cleanup, to 1915 * force subsequent Tx's to reacquire an AH. 1916 */ 1917 if ((mce != NULL) && (mce->mc_fullreap)) 1918 ibd_async_reap_group(state, mce, 1919 mce->mc_info.mc_adds_vect.av_dgid, 1920 mce->mc_jstate); 1921 } 1922 mutex_exit(&state->id_ac_mutex); 1923 } 1924 1925 /* 1926 * mac handle is guaranteed to exist since driver does ibt_close_hca() 1927 * (which stops further events from being delivered) before 1928 * mac_unregister(). At this point, it is guaranteed that mac_register 1929 * has already been done. 1930 */ 1931 mutex_enter(&state->id_link_mutex); 1932 state->id_link_state = lstate; 1933 mac_link_update(state->id_mh, lstate); 1934 mutex_exit(&state->id_link_mutex); 1935 1936 ibd_async_done(state); 1937 } 1938 1939 /* 1940 * Check the pkey table to see if we can find the pkey we're looking for. 1941 * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on 1942 * failure. 1943 */ 1944 static int 1945 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey, 1946 uint16_t *pkix) 1947 { 1948 uint16_t ndx; 1949 1950 ASSERT(pkix != NULL); 1951 1952 for (ndx = 0; ndx < pkey_tbl_sz; ndx++) { 1953 if (pkey_tbl[ndx] == pkey) { 1954 *pkix = ndx; 1955 return (0); 1956 } 1957 } 1958 return (-1); 1959 } 1960 1961 /* 1962 * When the link is notified up, we need to do a few things, based 1963 * on the port's current p_init_type_reply claiming a reinit has been 1964 * done or not. The reinit steps are: 1965 * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify 1966 * the old Pkey and GID0 are correct. 1967 * 2. Register for mcg traps (already done by ibmf). 1968 * 3. If PreservePresenceReply indicates the SM has restored port's presence 1969 * in subnet, nothing more to do. Else go to next steps (on async daemon). 1970 * 4. Give up all sendonly memberships. 1971 * 5. Acquire all full memberships. 1972 * 6. In promiscuous mode, acquire all non memberships. 1973 * 7. 
Recycle all AHs to free list. 1974 */ 1975 static void 1976 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code) 1977 { 1978 ibt_hca_portinfo_t *port_infop = NULL; 1979 ibt_status_t ibt_status; 1980 uint_t psize, port_infosz; 1981 ibd_link_op_t opcode; 1982 ibd_req_t *req; 1983 link_state_t new_link_state = LINK_STATE_UP; 1984 uint8_t itreply; 1985 uint16_t pkix; 1986 int ret; 1987 1988 /* 1989 * Let's not race with a plumb or an unplumb; if we detect a 1990 * pkey relocation event later on here, we may have to restart. 1991 */ 1992 ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 1993 1994 mutex_enter(&state->id_link_mutex); 1995 1996 /* 1997 * If the init code in ibd_m_start hasn't yet set up the 1998 * pkey/gid, nothing to do; that code will set the link state. 1999 */ 2000 if (state->id_link_state == LINK_STATE_UNKNOWN) { 2001 mutex_exit(&state->id_link_mutex); 2002 goto link_mod_return; 2003 } 2004 2005 /* 2006 * If this routine was called in response to a port down event, 2007 * we just need to see whether the new state needs to be reported. 2008 */ 2009 if (code == IBT_ERROR_PORT_DOWN) { 2010 new_link_state = LINK_STATE_DOWN; 2011 goto update_link_state; 2012 } 2013 2014 /* 2015 * If it's not a port down event we've received, try to get the port 2016 * attributes first. If we fail here, the port is as good as down. 2017 * Otherwise, if the link went down by the time the handler gets 2018 * here, give up - we cannot even validate the pkey/gid since those 2019 * are not valid and this is as bad as a port down anyway. 2020 */ 2021 ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 2022 &port_infop, &psize, &port_infosz); 2023 if ((ibt_status != IBT_SUCCESS) || (psize != 1) || 2024 (port_infop->p_linkstate != IBT_PORT_ACTIVE)) { 2025 new_link_state = LINK_STATE_DOWN; 2026 goto update_link_state; 2027 } 2028 2029 /* 2030 * Check the SM InitTypeReply flags. If both NoLoadReply and 2031 * PreserveContentReply are 0, we don't know anything about the 2032 * data loaded into the port attributes, so we need to verify 2033 * if gid0 and pkey are still valid. 2034 */ 2035 itreply = port_infop->p_init_type_reply; 2036 if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) && 2037 ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) { 2038 /* 2039 * Check to see if the subnet part of GID0 has changed. If 2040 * not, check the simple case first to see if the pkey 2041 * index is the same as before; finally check to see if the 2042 * pkey has been relocated to a different index in the table. 2043 */ 2044 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) 2045 if (bcmp(port_infop->p_sgid_tbl, 2046 &state->id_sgid, sizeof (ib_gid_t)) != 0) { 2047 2048 new_link_state = LINK_STATE_DOWN; 2049 2050 } else if (port_infop->p_pkey_tbl[state->id_pkix] == 2051 state->id_pkey) { 2052 2053 new_link_state = LINK_STATE_UP; 2054 2055 } else if (ibd_locate_pkey(port_infop->p_pkey_tbl, 2056 port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) { 2057 2058 ibt_free_portinfo(port_infop, port_infosz); 2059 mutex_exit(&state->id_link_mutex); 2060 2061 /* 2062 * Currently a restart is required if our pkey has moved 2063 * in the pkey table. If we get the ibt_recycle_ud() to 2064 * work as documented (expected), we may be able to 2065 * avoid a complete restart. Note that we've already 2066 * marked both the start and stop 'in-progress' flags, 2067 * so it is ok to go ahead and do this restart. 
2068 */ 2069 (void) ibd_undo_start(state, LINK_STATE_DOWN); 2070 if ((ret = ibd_start(state)) != 0) { 2071 DPRINT(10, "ibd_restart: cannot restart, " 2072 "ret=%d", ret); 2073 } 2074 2075 goto link_mod_return; 2076 } else { 2077 new_link_state = LINK_STATE_DOWN; 2078 } 2079 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) 2080 } 2081 2082 update_link_state: 2083 if (port_infop) { 2084 ibt_free_portinfo(port_infop, port_infosz); 2085 } 2086 2087 /* 2088 * If the old state is the same as the new state, nothing to do 2089 */ 2090 if (state->id_link_state == new_link_state) { 2091 mutex_exit(&state->id_link_mutex); 2092 goto link_mod_return; 2093 } 2094 2095 /* 2096 * Ok, so there was a link state change; see if it's safe to ask 2097 * the async thread to do the work 2098 */ 2099 if (!ibd_async_safe(state)) { 2100 state->id_link_state = new_link_state; 2101 mutex_exit(&state->id_link_mutex); 2102 goto link_mod_return; 2103 } 2104 2105 mutex_exit(&state->id_link_mutex); 2106 2107 /* 2108 * If we're reporting a link up, check InitTypeReply to see if 2109 * the SM has ensured that the port's presence in mcg, traps, 2110 * etc. is intact. 2111 */ 2112 if (new_link_state == LINK_STATE_DOWN) { 2113 opcode = IBD_LINK_DOWN; 2114 } else { 2115 if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == 2116 SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) { 2117 opcode = IBD_LINK_UP; 2118 } else { 2119 opcode = IBD_LINK_UP_ABSENT; 2120 } 2121 } 2122 2123 /* 2124 * Queue up a request for ibd_async_link() to handle this link 2125 * state change event 2126 */ 2127 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 2128 req->rq_ptr = (void *)opcode; 2129 ibd_queue_work_slot(state, req, IBD_ASYNC_LINK); 2130 2131 link_mod_return: 2132 ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 2133 } 2134 2135 /* 2136 * For the port up/down events, IBTL guarantees there will not be concurrent 2137 * invocations of the handler. IBTL might coalesce link transition events, 2138 * and not invoke the handler for _each_ up/down transition, but it will 2139 * invoke the handler with last known state 2140 */ 2141 static void 2142 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 2143 ibt_async_code_t code, ibt_async_event_t *event) 2144 { 2145 ibd_state_t *state = (ibd_state_t *)clnt_private; 2146 2147 switch (code) { 2148 case IBT_ERROR_CATASTROPHIC_CHAN: 2149 ibd_print_warn(state, "catastrophic channel error"); 2150 break; 2151 case IBT_ERROR_CQ: 2152 ibd_print_warn(state, "completion queue error"); 2153 break; 2154 case IBT_PORT_CHANGE_EVENT: 2155 /* 2156 * Events will be delivered to all instances that have 2157 * done ibt_open_hca() but not yet done ibt_close_hca(). 2158 * Only need to do work for our port; IBTF will deliver 2159 * events for other ports on the hca we have ibt_open_hca'ed 2160 * too. Note that id_port is initialized in ibd_attach() 2161 * before we do an ibt_open_hca() in ibd_attach(). 2162 */ 2163 ASSERT(state->id_hca_hdl == hca_hdl); 2164 if (state->id_port != event->ev_port) 2165 break; 2166 2167 if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) == 2168 IBT_PORT_CHANGE_PKEY) { 2169 ibd_link_mod(state, code); 2170 } 2171 break; 2172 case IBT_ERROR_PORT_DOWN: 2173 case IBT_CLNT_REREG_EVENT: 2174 case IBT_EVENT_PORT_UP: 2175 /* 2176 * Events will be delivered to all instances that have 2177 * done ibt_open_hca() but not yet done ibt_close_hca(). 2178 * Only need to do work for our port; IBTF will deliver 2179 * events for other ports on the hca we have ibt_open_hca'ed 2180 * too. 
Note that id_port is initialized in ibd_attach() 2181 * before we do an ibt_open_hca() in ibd_attach(). 2182 */ 2183 ASSERT(state->id_hca_hdl == hca_hdl); 2184 if (state->id_port != event->ev_port) 2185 break; 2186 2187 ibd_link_mod(state, code); 2188 break; 2189 2190 case IBT_HCA_ATTACH_EVENT: 2191 case IBT_HCA_DETACH_EVENT: 2192 /* 2193 * When a new card is plugged to the system, attach_event is 2194 * invoked. Additionally, a cfgadm needs to be run to make the 2195 * card known to the system, and an ifconfig needs to be run to 2196 * plumb up any ibd interfaces on the card. In the case of card 2197 * unplug, a cfgadm is run that will trigger any RCM scripts to 2198 * unplumb the ibd interfaces on the card; when the card is 2199 * actually unplugged, the detach_event is invoked; 2200 * additionally, if any ibd instances are still active on the 2201 * card (eg there were no associated RCM scripts), driver's 2202 * detach routine is invoked. 2203 */ 2204 break; 2205 default: 2206 break; 2207 } 2208 } 2209 2210 static int 2211 ibd_register_mac(ibd_state_t *state, dev_info_t *dip) 2212 { 2213 mac_register_t *macp; 2214 int ret; 2215 2216 if ((macp = mac_alloc(MAC_VERSION)) == NULL) { 2217 DPRINT(10, "ibd_register_mac: mac_alloc() failed"); 2218 return (DDI_FAILURE); 2219 } 2220 2221 /* 2222 * Note that when we register with mac during attach, we don't 2223 * have the id_macaddr yet, so we'll simply be registering a 2224 * zero macaddr that we'll overwrite later during plumb (in 2225 * ibd_m_start()). Similar is the case with id_mtu - we'll 2226 * update the mac layer with the correct mtu during plumb. 2227 */ 2228 macp->m_type_ident = MAC_PLUGIN_IDENT_IB; 2229 macp->m_driver = state; 2230 macp->m_dip = dip; 2231 macp->m_src_addr = (uint8_t *)&state->id_macaddr; 2232 macp->m_callbacks = &ibd_m_callbacks; 2233 macp->m_min_sdu = 0; 2234 if (state->id_enable_rc) { 2235 macp->m_max_sdu = state->rc_mtu - IPOIB_HDRSIZE; 2236 } else { 2237 macp->m_max_sdu = IBD_DEF_MAX_SDU; 2238 } 2239 2240 /* 2241 * Register ourselves with the GLDv3 interface 2242 */ 2243 if ((ret = mac_register(macp, &state->id_mh)) != 0) { 2244 mac_free(macp); 2245 DPRINT(10, 2246 "ibd_register_mac: mac_register() failed, ret=%d", ret); 2247 return (DDI_FAILURE); 2248 } 2249 2250 mac_free(macp); 2251 return (DDI_SUCCESS); 2252 } 2253 2254 static int 2255 ibd_record_capab(ibd_state_t *state, dev_info_t *dip) 2256 { 2257 ibt_hca_attr_t hca_attrs; 2258 ibt_status_t ibt_status; 2259 2260 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) 2261 2262 /* 2263 * Query the HCA and fetch its attributes 2264 */ 2265 ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 2266 ASSERT(ibt_status == IBT_SUCCESS); 2267 2268 /* 2269 * 1. Set the Hardware Checksum capability. Currently we only consider 2270 * full checksum offload. 2271 */ 2272 if (state->id_enable_rc) { 2273 state->id_hwcksum_capab = 0; 2274 } else { 2275 if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL) 2276 == IBT_HCA_CKSUM_FULL) { 2277 state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL; 2278 } 2279 } 2280 2281 /* 2282 * 2. 
Set LSO policy, capability and maximum length 2283 */ 2284 if (state->id_enable_rc) { 2285 state->id_lso_policy = B_FALSE; 2286 state->id_lso_capable = B_FALSE; 2287 state->id_lso_maxlen = 0; 2288 } else { 2289 if (ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS 2290 |DDI_PROP_NOTPROM, IBD_PROP_LSO_POLICY, 1)) { 2291 state->id_lso_policy = B_TRUE; 2292 } else { 2293 state->id_lso_policy = B_FALSE; 2294 } 2295 2296 if (hca_attrs.hca_max_lso_size > 0) { 2297 state->id_lso_capable = B_TRUE; 2298 if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN) 2299 state->id_lso_maxlen = IBD_LSO_MAXLEN; 2300 else 2301 state->id_lso_maxlen = 2302 hca_attrs.hca_max_lso_size; 2303 } else { 2304 state->id_lso_capable = B_FALSE; 2305 state->id_lso_maxlen = 0; 2306 } 2307 } 2308 2309 /* 2310 * 3. Set Reserved L_Key capability 2311 */ 2312 if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) { 2313 state->id_hca_res_lkey_capab = 1; 2314 state->id_res_lkey = hca_attrs.hca_reserved_lkey; 2315 state->rc_enable_iov_map = B_TRUE; 2316 } else { 2317 /* If no reserved lkey, we will not use ibt_map_mem_iov */ 2318 state->rc_enable_iov_map = B_FALSE; 2319 } 2320 2321 /* 2322 * 4. Set maximum sqseg value after checking to see if extended sgl 2323 * size information is provided by the hca 2324 */ 2325 if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) { 2326 state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz; 2327 state->rc_tx_max_sqseg = hca_attrs.hca_conn_send_sgl_sz; 2328 } else { 2329 state->id_max_sqseg = hca_attrs.hca_max_sgl; 2330 state->rc_tx_max_sqseg = hca_attrs.hca_max_sgl; 2331 } 2332 if (state->id_max_sqseg > IBD_MAX_SQSEG) { 2333 state->id_max_sqseg = IBD_MAX_SQSEG; 2334 } else if (state->id_max_sqseg < IBD_MAX_SQSEG) { 2335 ibd_print_warn(state, "Set #sgl = %d instead of default %d", 2336 state->id_max_sqseg, IBD_MAX_SQSEG); 2337 } 2338 if (state->rc_tx_max_sqseg > IBD_MAX_SQSEG) { 2339 state->rc_tx_max_sqseg = IBD_MAX_SQSEG; 2340 } else if (state->rc_tx_max_sqseg < IBD_MAX_SQSEG) { 2341 ibd_print_warn(state, "RC mode: Set #sgl = %d instead of " 2342 "default %d", state->rc_tx_max_sqseg, IBD_MAX_SQSEG); 2343 } 2344 2345 /* 2346 * Translating the virtual address regions into physical regions 2347 * for using the Reserved LKey feature results in a wr sgl that 2348 * is a little longer. Since failing ibt_map_mem_iov() is costly, 2349 * we'll fix a high-water mark (65%) for when we should stop. 2350 */ 2351 state->id_max_sqseg_hiwm = (state->id_max_sqseg * 65) / 100; 2352 state->rc_max_sqseg_hiwm = (state->rc_tx_max_sqseg * 65) / 100; 2353 2354 /* 2355 * 5. 
Set number of recv and send wqes after checking hca maximum 2356 * channel size 2357 */ 2358 if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE) { 2359 state->id_num_rwqe = hca_attrs.hca_max_chan_sz; 2360 } else { 2361 state->id_num_rwqe = IBD_NUM_RWQE; 2362 } 2363 state->id_rx_bufs_outstanding_limit = state->id_num_rwqe - IBD_RWQE_MIN; 2364 if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE) { 2365 state->id_num_swqe = hca_attrs.hca_max_chan_sz; 2366 } else { 2367 state->id_num_swqe = IBD_NUM_SWQE; 2368 } 2369 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 2370 2371 return (DDI_SUCCESS); 2372 } 2373 2374 static int 2375 ibd_unattach(ibd_state_t *state, dev_info_t *dip) 2376 { 2377 int instance; 2378 uint32_t progress = state->id_mac_state; 2379 ibt_status_t ret; 2380 2381 if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) != 0) { 2382 cmn_err(CE_CONT, "ibd_detach: failed: rx bufs outstanding\n"); 2383 return (DDI_FAILURE); 2384 } 2385 2386 if (state->rc_srq_rwqe_list.dl_bufs_outstanding != 0) { 2387 cmn_err(CE_CONT, "ibd_detach: failed: srq bufs outstanding\n"); 2388 return (DDI_FAILURE); 2389 } 2390 2391 /* make sure rx resources are freed */ 2392 ibd_free_rx_rsrcs(state); 2393 2394 if (progress & IBD_DRV_RC_SRQ_ALLOCD) { 2395 ASSERT(state->id_enable_rc); 2396 ibd_rc_fini_srq_list(state); 2397 state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD); 2398 } 2399 2400 if (progress & IBD_DRV_MAC_REGISTERED) { 2401 (void) mac_unregister(state->id_mh); 2402 state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED); 2403 } 2404 2405 if (progress & IBD_DRV_PD_ALLOCD) { 2406 if ((ret = ibt_free_pd(state->id_hca_hdl, 2407 state->id_pd_hdl)) != IBT_SUCCESS) { 2408 ibd_print_warn(state, "failed to free " 2409 "protection domain, ret=%d", ret); 2410 } 2411 state->id_pd_hdl = NULL; 2412 state->id_mac_state &= (~IBD_DRV_PD_ALLOCD); 2413 } 2414 2415 if (progress & IBD_DRV_HCA_OPENED) { 2416 if ((ret = ibt_close_hca(state->id_hca_hdl)) != 2417 IBT_SUCCESS) { 2418 ibd_print_warn(state, "failed to close " 2419 "HCA device, ret=%d", ret); 2420 } 2421 state->id_hca_hdl = NULL; 2422 state->id_mac_state &= (~IBD_DRV_HCA_OPENED); 2423 } 2424 2425 mutex_enter(&ibd_gstate.ig_mutex); 2426 if (progress & IBD_DRV_IBTL_ATTACH_DONE) { 2427 if ((ret = ibt_detach(state->id_ibt_hdl)) != 2428 IBT_SUCCESS) { 2429 ibd_print_warn(state, 2430 "ibt_detach() failed, ret=%d", ret); 2431 } 2432 state->id_ibt_hdl = NULL; 2433 state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE); 2434 ibd_gstate.ig_ibt_hdl_ref_cnt--; 2435 } 2436 if ((ibd_gstate.ig_ibt_hdl_ref_cnt == 0) && 2437 (ibd_gstate.ig_ibt_hdl != NULL)) { 2438 if ((ret = ibt_detach(ibd_gstate.ig_ibt_hdl)) != 2439 IBT_SUCCESS) { 2440 ibd_print_warn(state, "ibt_detach(): global " 2441 "failed, ret=%d", ret); 2442 } 2443 ibd_gstate.ig_ibt_hdl = NULL; 2444 } 2445 mutex_exit(&ibd_gstate.ig_mutex); 2446 2447 if (progress & IBD_DRV_TXINTR_ADDED) { 2448 ddi_remove_softintr(state->id_tx); 2449 state->id_tx = NULL; 2450 state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED); 2451 } 2452 2453 if (progress & IBD_DRV_RXINTR_ADDED) { 2454 ddi_remove_softintr(state->id_rx); 2455 state->id_rx = NULL; 2456 state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED); 2457 } 2458 2459 #ifdef DEBUG 2460 if (progress & IBD_DRV_RC_PRIVATE_STATE) { 2461 kstat_delete(state->rc_ksp); 2462 state->id_mac_state &= (~IBD_DRV_RC_PRIVATE_STATE); 2463 } 2464 #endif 2465 2466 if (progress & IBD_DRV_STATE_INITIALIZED) { 2467 ibd_state_fini(state); 2468 state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED); 2469 } 2470 2471 instance = 
ddi_get_instance(dip); 2472 ddi_soft_state_free(ibd_list, instance); 2473 2474 return (DDI_SUCCESS); 2475 } 2476 2477 /* 2478 * Attach device to the IO framework. 2479 */ 2480 static int 2481 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2482 { 2483 ibd_state_t *state = NULL; 2484 ib_guid_t hca_guid; 2485 int instance; 2486 ibt_status_t ret; 2487 int rv; 2488 2489 /* 2490 * IBD doesn't support suspend/resume 2491 */ 2492 if (cmd != DDI_ATTACH) 2493 return (DDI_FAILURE); 2494 2495 /* 2496 * Allocate softstate structure 2497 */ 2498 instance = ddi_get_instance(dip); 2499 if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) 2500 return (DDI_FAILURE); 2501 state = ddi_get_soft_state(ibd_list, instance); 2502 2503 /* 2504 * Initialize mutexes and condition variables 2505 */ 2506 if (ibd_state_init(state, dip) != DDI_SUCCESS) { 2507 DPRINT(10, "ibd_attach: failed in ibd_state_init()"); 2508 goto attach_fail; 2509 } 2510 state->id_mac_state |= IBD_DRV_STATE_INITIALIZED; 2511 2512 /* 2513 * Allocate rx,tx softintr 2514 */ 2515 if (ibd_rx_softintr == 1) { 2516 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx, 2517 NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) { 2518 DPRINT(10, "ibd_attach: failed in " 2519 "ddi_add_softintr(id_rx), ret=%d", rv); 2520 goto attach_fail; 2521 } 2522 state->id_mac_state |= IBD_DRV_RXINTR_ADDED; 2523 } 2524 if (ibd_tx_softintr == 1) { 2525 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx, 2526 NULL, NULL, ibd_tx_recycle, 2527 (caddr_t)state)) != DDI_SUCCESS) { 2528 DPRINT(10, "ibd_attach: failed in " 2529 "ddi_add_softintr(id_tx), ret=%d", rv); 2530 goto attach_fail; 2531 } 2532 state->id_mac_state |= IBD_DRV_TXINTR_ADDED; 2533 } 2534 2535 /* 2536 * Obtain IBA P_Key, port number and HCA guid and validate 2537 * them (for P_Key, only full members are allowed as per 2538 * IPoIB specification; neither port number nor HCA guid 2539 * can be zero) 2540 */ 2541 if ((state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, 2542 "port-pkey", IB_PKEY_INVALID_LIMITED)) <= IB_PKEY_INVALID_FULL) { 2543 DPRINT(10, "ibd_attach: port device has wrong partition (0x%x)", 2544 state->id_pkey); 2545 goto attach_fail; 2546 } 2547 if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, 2548 "port-number", 0)) == 0) { 2549 DPRINT(10, "ibd_attach: invalid port number (%d)", 2550 state->id_port); 2551 goto attach_fail; 2552 } 2553 if ((hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0, 2554 "hca-guid", 0)) == 0) { 2555 DPRINT(10, "ibd_attach: port hca has invalid guid (0x%llx)", 2556 hca_guid); 2557 goto attach_fail; 2558 } 2559 2560 /* 2561 * Attach to IBTL 2562 */ 2563 mutex_enter(&ibd_gstate.ig_mutex); 2564 if (ibd_gstate.ig_ibt_hdl == NULL) { 2565 if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state, 2566 &ibd_gstate.ig_ibt_hdl)) != IBT_SUCCESS) { 2567 DPRINT(10, "ibd_attach: global: failed in " 2568 "ibt_attach(), ret=%d", ret); 2569 mutex_exit(&ibd_gstate.ig_mutex); 2570 goto attach_fail; 2571 } 2572 } 2573 if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state, 2574 &state->id_ibt_hdl)) != IBT_SUCCESS) { 2575 DPRINT(10, "ibd_attach: failed in ibt_attach(), ret=%d", 2576 ret); 2577 mutex_exit(&ibd_gstate.ig_mutex); 2578 goto attach_fail; 2579 } 2580 ibd_gstate.ig_ibt_hdl_ref_cnt++; 2581 mutex_exit(&ibd_gstate.ig_mutex); 2582 state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE; 2583 2584 /* 2585 * Open the HCA 2586 */ 2587 if ((ret = ibt_open_hca(state->id_ibt_hdl, hca_guid, 2588 &state->id_hca_hdl)) != IBT_SUCCESS) { 2589 DPRINT(10, 
"ibd_attach: ibt_open_hca() failed, ret=%d", ret); 2590 goto attach_fail; 2591 } 2592 state->id_mac_state |= IBD_DRV_HCA_OPENED; 2593 2594 /* Get RC config before ibd_record_capab */ 2595 ibd_rc_get_conf(state); 2596 2597 #ifdef DEBUG 2598 /* Initialize Driver Counters for Reliable Connected Mode */ 2599 if (state->id_enable_rc) { 2600 if (ibd_rc_init_stats(state) != DDI_SUCCESS) { 2601 DPRINT(10, "ibd_attach: failed in ibd_rc_init_stats"); 2602 goto attach_fail; 2603 } 2604 state->id_mac_state |= IBD_DRV_RC_PRIVATE_STATE; 2605 } 2606 #endif 2607 2608 /* 2609 * Record capabilities 2610 */ 2611 (void) ibd_record_capab(state, dip); 2612 2613 /* 2614 * Allocate a protection domain on the HCA 2615 */ 2616 if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS, 2617 &state->id_pd_hdl)) != IBT_SUCCESS) { 2618 DPRINT(10, "ibd_attach: ibt_alloc_pd() failed, ret=%d", ret); 2619 goto attach_fail; 2620 } 2621 state->id_mac_state |= IBD_DRV_PD_ALLOCD; 2622 2623 2624 /* 2625 * Register ibd interfaces with the Nemo framework 2626 */ 2627 if (ibd_register_mac(state, dip) != IBT_SUCCESS) { 2628 DPRINT(10, "ibd_attach: failed in ibd_register_mac()"); 2629 goto attach_fail; 2630 } 2631 state->id_mac_state |= IBD_DRV_MAC_REGISTERED; 2632 2633 /* 2634 * We're done with everything we could to make the attach 2635 * succeed. All the buffer allocations and IPoIB broadcast 2636 * group joins are deferred to when the interface instance 2637 * is actually plumbed to avoid wasting memory. 2638 */ 2639 return (DDI_SUCCESS); 2640 2641 attach_fail: 2642 (void) ibd_unattach(state, dip); 2643 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 2644 return (DDI_FAILURE); 2645 } 2646 2647 /* 2648 * Detach device from the IO framework. 2649 */ 2650 static int 2651 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2652 { 2653 ibd_state_t *state; 2654 int instance; 2655 2656 /* 2657 * IBD doesn't support suspend/resume 2658 */ 2659 if (cmd != DDI_DETACH) 2660 return (DDI_FAILURE); 2661 2662 /* 2663 * Get the instance softstate 2664 */ 2665 instance = ddi_get_instance(dip); 2666 state = ddi_get_soft_state(ibd_list, instance); 2667 2668 /* 2669 * Release all resources we're holding still. Note that if we'd 2670 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly 2671 * so far, we should find all the flags we need in id_mac_state. 
2672 */ 2673 return (ibd_unattach(state, dip)); 2674 } 2675 2676 /* 2677 * Pre ibt_attach() driver initialization 2678 */ 2679 static int 2680 ibd_state_init(ibd_state_t *state, dev_info_t *dip) 2681 { 2682 char buf[64]; 2683 2684 mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL); 2685 state->id_link_state = LINK_STATE_UNKNOWN; 2686 2687 mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL); 2688 cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL); 2689 state->id_trap_stop = B_TRUE; 2690 state->id_trap_inprog = 0; 2691 2692 mutex_init(&state->id_scq_poll_lock, NULL, MUTEX_DRIVER, NULL); 2693 mutex_init(&state->id_rcq_poll_lock, NULL, MUTEX_DRIVER, NULL); 2694 state->id_dip = dip; 2695 2696 mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL); 2697 2698 mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2699 mutex_init(&state->id_tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2700 mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL); 2701 state->id_tx_busy = 0; 2702 mutex_init(&state->id_lso_lock, NULL, MUTEX_DRIVER, NULL); 2703 2704 state->id_rx_list.dl_bufs_outstanding = 0; 2705 state->id_rx_list.dl_cnt = 0; 2706 mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2707 mutex_init(&state->id_rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2708 (void) sprintf(buf, "ibd_req%d", ddi_get_instance(dip)); 2709 state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t), 2710 0, NULL, NULL, NULL, NULL, NULL, 0); 2711 2712 mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL); 2713 cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL); 2714 2715 /* For Reliable Connected Mode */ 2716 mutex_init(&state->rc_rx_lock, NULL, MUTEX_DRIVER, NULL); 2717 mutex_init(&state->rc_tx_large_bufs_lock, NULL, MUTEX_DRIVER, NULL); 2718 mutex_init(&state->rc_srq_rwqe_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2719 mutex_init(&state->rc_srq_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2720 mutex_init(&state->rc_pass_chan_list.chan_list_mutex, NULL, 2721 MUTEX_DRIVER, NULL); 2722 2723 return (DDI_SUCCESS); 2724 } 2725 2726 /* 2727 * Post ibt_detach() driver deconstruction 2728 */ 2729 static void 2730 ibd_state_fini(ibd_state_t *state) 2731 { 2732 cv_destroy(&state->id_macst_cv); 2733 mutex_destroy(&state->id_macst_lock); 2734 2735 kmem_cache_destroy(state->id_req_kmc); 2736 2737 mutex_destroy(&state->id_rx_list.dl_mutex); 2738 mutex_destroy(&state->id_rx_free_list.dl_mutex); 2739 2740 mutex_destroy(&state->id_txpost_lock); 2741 mutex_destroy(&state->id_tx_list.dl_mutex); 2742 mutex_destroy(&state->id_tx_rel_list.dl_mutex); 2743 mutex_destroy(&state->id_lso_lock); 2744 2745 mutex_destroy(&state->id_sched_lock); 2746 mutex_destroy(&state->id_scq_poll_lock); 2747 mutex_destroy(&state->id_rcq_poll_lock); 2748 2749 cv_destroy(&state->id_trap_cv); 2750 mutex_destroy(&state->id_trap_lock); 2751 mutex_destroy(&state->id_link_mutex); 2752 2753 /* For Reliable Connected Mode */ 2754 mutex_destroy(&state->rc_srq_free_list.dl_mutex); 2755 mutex_destroy(&state->rc_srq_rwqe_list.dl_mutex); 2756 mutex_destroy(&state->rc_pass_chan_list.chan_list_mutex); 2757 mutex_destroy(&state->rc_tx_large_bufs_lock); 2758 mutex_destroy(&state->rc_rx_lock); 2759 } 2760 2761 /* 2762 * Fetch link speed from SA for snmp ifspeed reporting. 
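 * For example, going by the rate table below, a 4X DDR link reports
 * IBT_SRATE_20, so the returned ifspeed is 8 * 2000000000 = 16000000000 bps.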
2763 */ 2764 static uint64_t 2765 ibd_get_portspeed(ibd_state_t *state) 2766 { 2767 int ret; 2768 ibt_path_info_t path; 2769 ibt_path_attr_t path_attr; 2770 uint8_t num_paths; 2771 uint64_t ifspeed; 2772 2773 /* 2774 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire 2775 * translates to 2 Gbps data rate. Thus, 1X single data rate is 2776 * 2000000000. Start with that as default. 2777 */ 2778 ifspeed = 2000000000; 2779 2780 bzero(&path_attr, sizeof (path_attr)); 2781 2782 /* 2783 * Get the port speed from Loopback path information. 2784 */ 2785 path_attr.pa_dgids = &state->id_sgid; 2786 path_attr.pa_num_dgids = 1; 2787 path_attr.pa_sgid = state->id_sgid; 2788 2789 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 2790 &path_attr, 1, &path, &num_paths) != IBT_SUCCESS) 2791 goto earlydone; 2792 2793 if (num_paths < 1) 2794 goto earlydone; 2795 2796 /* 2797 * In case SA does not return an expected value, report the default 2798 * speed as 1X. 2799 */ 2800 ret = 1; 2801 switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) { 2802 case IBT_SRATE_2: /* 1X SDR i.e 2.5 Gbps */ 2803 ret = 1; 2804 break; 2805 case IBT_SRATE_10: /* 4X SDR or 1X QDR i.e 10 Gbps */ 2806 ret = 4; 2807 break; 2808 case IBT_SRATE_30: /* 12X SDR i.e 30 Gbps */ 2809 ret = 12; 2810 break; 2811 case IBT_SRATE_5: /* 1X DDR i.e 5 Gbps */ 2812 ret = 2; 2813 break; 2814 case IBT_SRATE_20: /* 4X DDR or 8X SDR i.e 20 Gbps */ 2815 ret = 8; 2816 break; 2817 case IBT_SRATE_40: /* 8X DDR or 4X QDR i.e 40 Gbps */ 2818 ret = 16; 2819 break; 2820 case IBT_SRATE_60: /* 12X DDR i.e 60 Gbps */ 2821 ret = 24; 2822 break; 2823 case IBT_SRATE_80: /* 8X QDR i.e 80 Gbps */ 2824 ret = 32; 2825 break; 2826 case IBT_SRATE_120: /* 12X QDR i.e 120 Gbps */ 2827 ret = 48; 2828 break; 2829 } 2830 2831 ifspeed *= ret; 2832 2833 earlydone: 2834 return (ifspeed); 2835 } 2836 2837 /* 2838 * Search input mcg list (id_mc_full or id_mc_non) for an entry 2839 * representing the input mcg mgid. 2840 */ 2841 static ibd_mce_t * 2842 ibd_mcache_find(ib_gid_t mgid, struct list *mlist) 2843 { 2844 ibd_mce_t *ptr = list_head(mlist); 2845 2846 /* 2847 * Do plain linear search. 2848 */ 2849 while (ptr != NULL) { 2850 if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid, 2851 sizeof (ib_gid_t)) == 0) 2852 return (ptr); 2853 ptr = list_next(mlist, ptr); 2854 } 2855 return (NULL); 2856 } 2857 2858 /* 2859 * Execute IBA JOIN. 2860 */ 2861 static ibt_status_t 2862 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce) 2863 { 2864 ibt_mcg_attr_t mcg_attr; 2865 2866 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 2867 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 2868 mcg_attr.mc_mgid = mgid; 2869 mcg_attr.mc_join_state = mce->mc_jstate; 2870 mcg_attr.mc_scope = state->id_scope; 2871 mcg_attr.mc_pkey = state->id_pkey; 2872 mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow; 2873 mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 2874 mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass; 2875 return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info, 2876 NULL, NULL)); 2877 } 2878 2879 /* 2880 * This code JOINs the port in the proper way (depending on the join 2881 * state) so that IBA fabric will forward mcg packets to/from the port. 2882 * It also attaches the QPN to the mcg so it can receive those mcg 2883 * packets. 
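 * Roughly speaking, full membership is what enable_multicast asks for (both
 * send and receive), non membership is used by the promiscuous code to listen
 * to groups we have not otherwise joined, and send-only non membership covers
 * groups we only transmit to.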
This code makes sure not to attach the mcg to the QP if 2884 * that has been previously done due to the mcg being joined with a 2885 * different join state, even though this is not required by SWG_0216, 2886 * refid 3610. 2887 */ 2888 static ibd_mce_t * 2889 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 2890 { 2891 ibt_status_t ibt_status; 2892 ibd_mce_t *mce, *tmce, *omce = NULL; 2893 boolean_t do_attach = B_TRUE; 2894 2895 DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n", 2896 jstate, mgid.gid_prefix, mgid.gid_guid); 2897 2898 /* 2899 * For enable_multicast Full member joins, we need to do some 2900 * extra work. If there is already an mce on the list that 2901 * indicates full membership, that means the membership has 2902 * not yet been dropped (since the disable_multicast was issued) 2903 * because there are pending Tx's to the mcg; in that case, just 2904 * mark the mce not to be reaped when the Tx completion queues 2905 * an async reap operation. 2906 * 2907 * If there is already an mce on the list indicating sendonly 2908 * membership, try to promote to full membership. Be careful 2909 * not to deallocate the old mce, since there might be an AH 2910 * pointing to it; instead, update the old mce with new data 2911 * that tracks the full membership. 2912 */ 2913 if ((jstate == IB_MC_JSTATE_FULL) && ((omce = 2914 IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) { 2915 if (omce->mc_jstate == IB_MC_JSTATE_FULL) { 2916 ASSERT(omce->mc_fullreap); 2917 omce->mc_fullreap = B_FALSE; 2918 return (omce); 2919 } else { 2920 ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON); 2921 } 2922 } 2923 2924 /* 2925 * Allocate the ibd_mce_t to track this JOIN. 2926 */ 2927 mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP); 2928 mce->mc_fullreap = B_FALSE; 2929 mce->mc_jstate = jstate; 2930 2931 if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) { 2932 DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d", 2933 ibt_status); 2934 kmem_free(mce, sizeof (ibd_mce_t)); 2935 return (NULL); 2936 } 2937 2938 /* 2939 * Is an IBA attach required? Not if the interface is already joined 2940 * to the mcg in a different appropriate join state. 2941 */ 2942 if (jstate == IB_MC_JSTATE_NON) { 2943 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 2944 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 2945 do_attach = B_FALSE; 2946 } else if (jstate == IB_MC_JSTATE_FULL) { 2947 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 2948 do_attach = B_FALSE; 2949 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 2950 do_attach = B_FALSE; 2951 } 2952 2953 if (do_attach) { 2954 /* 2955 * Do the IBA attach. 2956 */ 2957 DPRINT(10, "ibd_join_group: ibt_attach_mcg \n"); 2958 if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl, 2959 &mce->mc_info)) != IBT_SUCCESS) { 2960 DPRINT(10, "ibd_join_group : failed qp attachment " 2961 "%d\n", ibt_status); 2962 /* 2963 * NOTE that we should probably preserve the join info 2964 * in the list and later try to leave again at detach 2965 * time. 2966 */ 2967 (void) ibt_leave_mcg(state->id_sgid, mgid, 2968 state->id_sgid, jstate); 2969 kmem_free(mce, sizeof (ibd_mce_t)); 2970 return (NULL); 2971 } 2972 } 2973 2974 /* 2975 * Insert the ibd_mce_t in the proper list. 2976 */ 2977 if (jstate == IB_MC_JSTATE_NON) { 2978 IBD_MCACHE_INSERT_NON(state, mce); 2979 } else { 2980 /* 2981 * Set up the mc_req fields used for reaping the 2982 * mcg in case of delayed tx completion (see 2983 * ibd_tx_cleanup()). 
Also done for sendonly join in 2984 * case we are promoted to fullmembership later and 2985 * keep using the same mce. 2986 */ 2987 mce->mc_req.rq_gid = mgid; 2988 mce->mc_req.rq_ptr = mce; 2989 /* 2990 * Check whether this is the case of trying to join 2991 * full member, and we were already joined send only. 2992 * We try to drop our SendOnly membership, but it is 2993 * possible that the mcg does not exist anymore (and 2994 * the subnet trap never reached us), so the leave 2995 * operation might fail. 2996 */ 2997 if (omce != NULL) { 2998 (void) ibt_leave_mcg(state->id_sgid, mgid, 2999 state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON); 3000 omce->mc_jstate = IB_MC_JSTATE_FULL; 3001 bcopy(&mce->mc_info, &omce->mc_info, 3002 sizeof (ibt_mcg_info_t)); 3003 kmem_free(mce, sizeof (ibd_mce_t)); 3004 return (omce); 3005 } 3006 mutex_enter(&state->id_mc_mutex); 3007 IBD_MCACHE_INSERT_FULL(state, mce); 3008 mutex_exit(&state->id_mc_mutex); 3009 } 3010 3011 return (mce); 3012 } 3013 3014 /* 3015 * Called during port up event handling to attempt to reacquire full 3016 * membership to an mcg. Stripped down version of ibd_join_group(). 3017 * Note that it is possible that the mcg might have gone away, and 3018 * gets recreated at this point. 3019 */ 3020 static void 3021 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce) 3022 { 3023 ib_gid_t mgid; 3024 3025 /* 3026 * If the mc_fullreap flag is set, or this join fails, a subsequent 3027 * reap/leave is going to try to leave the group. We could prevent 3028 * that by adding a boolean flag into ibd_mce_t, if required. 3029 */ 3030 if (mce->mc_fullreap) 3031 return; 3032 3033 mgid = mce->mc_info.mc_adds_vect.av_dgid; 3034 3035 DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix, 3036 mgid.gid_guid); 3037 3038 if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS) 3039 ibd_print_warn(state, "Failure on port up to rejoin " 3040 "multicast gid %016llx:%016llx", 3041 (u_longlong_t)mgid.gid_prefix, 3042 (u_longlong_t)mgid.gid_guid); 3043 } 3044 3045 /* 3046 * This code handles delayed Tx completion cleanups for mcg's to which 3047 * disable_multicast has been issued, regular mcg related cleanups during 3048 * disable_multicast, disable_promiscuous and mcg traps, as well as 3049 * cleanups during driver detach time. Depending on the join state, 3050 * it deletes the mce from the appropriate list and issues the IBA 3051 * leave/detach; except in the disable_multicast case when the mce 3052 * is left on the active list for a subsequent Tx completion cleanup. 3053 */ 3054 static void 3055 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid, 3056 uint8_t jstate) 3057 { 3058 ibd_mce_t *tmce; 3059 boolean_t do_detach = B_TRUE; 3060 3061 /* 3062 * Before detaching, we must check whether the other list 3063 * contains the mcg; if we detach blindly, the consumer 3064 * who set up the other list will also stop receiving 3065 * traffic. 3066 */ 3067 if (jstate == IB_MC_JSTATE_FULL) { 3068 /* 3069 * The following check is only relevant while coming 3070 * from the Tx completion path in the reap case. 
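 * (If mc_fullreap has been cleared in the meantime, enable_multicast must
 * have rejoined the group before this deferred reap ran, so the membership
 * is left alone.)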
3071 */ 3072 if (!mce->mc_fullreap) 3073 return; 3074 mutex_enter(&state->id_mc_mutex); 3075 IBD_MCACHE_PULLOUT_FULL(state, mce); 3076 mutex_exit(&state->id_mc_mutex); 3077 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 3078 do_detach = B_FALSE; 3079 } else if (jstate == IB_MC_JSTATE_NON) { 3080 IBD_MCACHE_PULLOUT_NON(state, mce); 3081 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 3082 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 3083 do_detach = B_FALSE; 3084 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 3085 mutex_enter(&state->id_mc_mutex); 3086 IBD_MCACHE_PULLOUT_FULL(state, mce); 3087 mutex_exit(&state->id_mc_mutex); 3088 do_detach = B_FALSE; 3089 } 3090 3091 /* 3092 * If we are reacting to a mcg trap and leaving our sendonly or 3093 * non membership, the mcg is possibly already gone, so attempting 3094 * to leave might fail. On the other hand, we must try to leave 3095 * anyway, since this might be a trap from long ago, and we could 3096 * have potentially sendonly joined to a recent incarnation of 3097 * the mcg and are about to lose track of this information. 3098 */ 3099 if (do_detach) { 3100 DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : " 3101 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3102 (void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info); 3103 } 3104 3105 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate); 3106 kmem_free(mce, sizeof (ibd_mce_t)); 3107 } 3108 3109 /* 3110 * Async code executed due to multicast and promiscuous disable requests 3111 * and mcg trap handling; also executed during driver detach. Mostly, a 3112 * leave and detach is done; except for the fullmember case when Tx 3113 * requests are pending, whence arrangements are made for subsequent 3114 * cleanup on Tx completion. 3115 */ 3116 static void 3117 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 3118 { 3119 ipoib_mac_t mcmac; 3120 boolean_t recycled; 3121 ibd_mce_t *mce; 3122 3123 DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n", 3124 jstate, mgid.gid_prefix, mgid.gid_guid); 3125 3126 if (jstate == IB_MC_JSTATE_NON) { 3127 recycled = B_TRUE; 3128 mce = IBD_MCACHE_FIND_NON(state, mgid); 3129 /* 3130 * In case we are handling a mcg trap, we might not find 3131 * the mcg in the non list. 3132 */ 3133 if (mce == NULL) { 3134 return; 3135 } 3136 } else { 3137 mce = IBD_MCACHE_FIND_FULL(state, mgid); 3138 3139 /* 3140 * In case we are handling a mcg trap, make sure the trap 3141 * is not arriving late; if we have an mce that indicates 3142 * that we are already a fullmember, that would be a clear 3143 * indication that the trap arrived late (i.e., is for a 3144 * previous incarnation of the mcg). 3145 */ 3146 if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) { 3147 if ((mce == NULL) || (mce->mc_jstate == 3148 IB_MC_JSTATE_FULL)) { 3149 return; 3150 } 3151 } else { 3152 ASSERT(jstate == IB_MC_JSTATE_FULL); 3153 3154 /* 3155 * If join group failed, mce will be NULL here. 3156 * This is because in the GLDv3 driver, set multicast 3157 * will always return success. 3158 */ 3159 if (mce == NULL) { 3160 return; 3161 } 3162 3163 mce->mc_fullreap = B_TRUE; 3164 } 3165 3166 /* 3167 * If no pending Tx's remain that reference the AH 3168 * for the mcg, recycle it from active to free list. 
3169 * Else in the IB_MC_JSTATE_FULL case, just mark the AH, 3170 * so the last completing Tx will cause an async reap 3171 * operation to be invoked, at which time we will drop our 3172 * membership to the mcg so that the pending Tx's complete 3173 * successfully. Refer to comments on "AH and MCE active 3174 * list manipulation" at top of this file. The lock protects 3175 * against Tx fast path and Tx cleanup code. 3176 */ 3177 mutex_enter(&state->id_ac_mutex); 3178 ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid); 3179 recycled = ibd_acache_recycle(state, &mcmac, (jstate == 3180 IB_MC_JSTATE_SEND_ONLY_NON)); 3181 mutex_exit(&state->id_ac_mutex); 3182 } 3183 3184 if (recycled) { 3185 DPRINT(2, "ibd_leave_group : leave_group reaping : " 3186 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3187 ibd_async_reap_group(state, mce, mgid, jstate); 3188 } 3189 } 3190 3191 /* 3192 * Find the broadcast address as defined by IPoIB; implicitly 3193 * determines the IBA scope, mtu, tclass etc of the link the 3194 * interface is going to be a member of. 3195 */ 3196 static ibt_status_t 3197 ibd_find_bgroup(ibd_state_t *state) 3198 { 3199 ibt_mcg_attr_t mcg_attr; 3200 uint_t numg; 3201 uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL, 3202 IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL, 3203 IB_MC_SCOPE_GLOBAL }; 3204 int i, mcgmtu; 3205 boolean_t found = B_FALSE; 3206 int ret; 3207 ibt_mcg_info_t mcg_info; 3208 3209 state->id_bgroup_created = B_FALSE; 3210 3211 query_bcast_grp: 3212 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3213 mcg_attr.mc_pkey = state->id_pkey; 3214 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3215 state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK; 3216 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3217 3218 for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) { 3219 state->id_scope = mcg_attr.mc_scope = scopes[i]; 3220 3221 /* 3222 * Look for the IPoIB broadcast group. 3223 */ 3224 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3225 state->id_mgid.gid_prefix = 3226 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3227 ((uint64_t)state->id_scope << 48) | 3228 ((uint32_t)(state->id_pkey << 16))); 3229 mcg_attr.mc_mgid = state->id_mgid; 3230 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3231 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1, 3232 &state->id_mcinfo, &numg) == IBT_SUCCESS) { 3233 found = B_TRUE; 3234 break; 3235 } 3236 } 3237 3238 if (!found) { 3239 if (ibd_create_broadcast_group) { 3240 /* 3241 * If we created the broadcast group, but failed to 3242 * find it, we can't do anything except leave the 3243 * one we created and return failure. 3244 */ 3245 if (state->id_bgroup_created) { 3246 ibd_print_warn(state, "IPoIB broadcast group " 3247 "absent. 
Unable to query after create."); 3248 goto find_bgroup_fail; 3249 } 3250 3251 /* 3252 * Create the ipoib broadcast group if it didn't exist 3253 */ 3254 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3255 mcg_attr.mc_qkey = IBD_DEFAULT_QKEY; 3256 mcg_attr.mc_join_state = IB_MC_JSTATE_FULL; 3257 mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL; 3258 mcg_attr.mc_pkey = state->id_pkey; 3259 mcg_attr.mc_flow = 0; 3260 mcg_attr.mc_sl = 0; 3261 mcg_attr.mc_tclass = 0; 3262 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3263 state->id_mgid.gid_prefix = 3264 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3265 ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) | 3266 ((uint32_t)(state->id_pkey << 16))); 3267 mcg_attr.mc_mgid = state->id_mgid; 3268 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3269 3270 if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr, 3271 &mcg_info, NULL, NULL)) != IBT_SUCCESS) { 3272 ibd_print_warn(state, "IPoIB broadcast group " 3273 "absent, create failed: ret = %d\n", ret); 3274 state->id_bgroup_created = B_FALSE; 3275 return (IBT_FAILURE); 3276 } 3277 state->id_bgroup_created = B_TRUE; 3278 goto query_bcast_grp; 3279 } else { 3280 ibd_print_warn(state, "IPoIB broadcast group absent"); 3281 return (IBT_FAILURE); 3282 } 3283 } 3284 3285 /* 3286 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu. 3287 */ 3288 mcgmtu = (128 << state->id_mcinfo->mc_mtu); 3289 if (state->id_mtu < mcgmtu) { 3290 ibd_print_warn(state, "IPoIB broadcast group MTU %d " 3291 "greater than port's maximum MTU %d", mcgmtu, 3292 state->id_mtu); 3293 ibt_free_mcg_info(state->id_mcinfo, 1); 3294 goto find_bgroup_fail; 3295 } 3296 state->id_mtu = mcgmtu; 3297 3298 return (IBT_SUCCESS); 3299 3300 find_bgroup_fail: 3301 if (state->id_bgroup_created) { 3302 (void) ibt_leave_mcg(state->id_sgid, 3303 mcg_info.mc_adds_vect.av_dgid, state->id_sgid, 3304 IB_MC_JSTATE_FULL); 3305 } 3306 3307 return (IBT_FAILURE); 3308 } 3309 3310 static int 3311 ibd_alloc_tx_copybufs(ibd_state_t *state) 3312 { 3313 ibt_mr_attr_t mem_attr; 3314 3315 /* 3316 * Allocate one big chunk for all regular tx copy bufs 3317 */ 3318 state->id_tx_buf_sz = state->id_mtu; 3319 if (state->id_lso_policy && state->id_lso_capable && 3320 (IBD_TX_BUF_SZ > state->id_mtu)) { 3321 state->id_tx_buf_sz = IBD_TX_BUF_SZ; 3322 } 3323 3324 state->id_tx_bufs = kmem_zalloc(state->id_num_swqe * 3325 state->id_tx_buf_sz, KM_SLEEP); 3326 3327 state->id_tx_wqes = kmem_zalloc(state->id_num_swqe * 3328 sizeof (ibd_swqe_t), KM_SLEEP); 3329 3330 /* 3331 * Do one memory registration on the entire txbuf area 3332 */ 3333 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs; 3334 mem_attr.mr_len = state->id_num_swqe * state->id_tx_buf_sz; 3335 mem_attr.mr_as = NULL; 3336 mem_attr.mr_flags = IBT_MR_SLEEP; 3337 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3338 &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) { 3339 DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed"); 3340 kmem_free(state->id_tx_wqes, 3341 state->id_num_swqe * sizeof (ibd_swqe_t)); 3342 kmem_free(state->id_tx_bufs, 3343 state->id_num_swqe * state->id_tx_buf_sz); 3344 state->id_tx_bufs = NULL; 3345 return (DDI_FAILURE); 3346 } 3347 3348 return (DDI_SUCCESS); 3349 } 3350 3351 static int 3352 ibd_alloc_tx_lsobufs(ibd_state_t *state) 3353 { 3354 ibt_mr_attr_t mem_attr; 3355 ibd_lsobuf_t *buflist; 3356 ibd_lsobuf_t *lbufp; 3357 ibd_lsobuf_t *tail; 3358 ibd_lsobkt_t *bktp; 3359 uint8_t *membase; 3360 uint8_t *memp; 3361 uint_t memsz; 3362 int i; 3363 3364 /* 3365 * 
Allocate the lso bucket 3366 */ 3367 bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP); 3368 3369 /* 3370 * Allocate the entire lso memory and register it 3371 */ 3372 memsz = IBD_NUM_LSO_BUFS * IBD_LSO_BUFSZ; 3373 membase = kmem_zalloc(memsz, KM_SLEEP); 3374 3375 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase; 3376 mem_attr.mr_len = memsz; 3377 mem_attr.mr_as = NULL; 3378 mem_attr.mr_flags = IBT_MR_SLEEP; 3379 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, 3380 &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) { 3381 DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed"); 3382 kmem_free(membase, memsz); 3383 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3384 return (DDI_FAILURE); 3385 } 3386 3387 mutex_enter(&state->id_lso_lock); 3388 3389 /* 3390 * Now allocate the buflist. Note that the elements in the buflist and 3391 * the buffers in the lso memory have a permanent 1-1 relation, so we 3392 * can always derive the address of a buflist entry from the address of 3393 * an lso buffer. 3394 */ 3395 buflist = kmem_zalloc(IBD_NUM_LSO_BUFS * sizeof (ibd_lsobuf_t), 3396 KM_SLEEP); 3397 3398 /* 3399 * Set up the lso buf chain 3400 */ 3401 memp = membase; 3402 lbufp = buflist; 3403 for (i = 0; i < IBD_NUM_LSO_BUFS; i++) { 3404 lbufp->lb_isfree = 1; 3405 lbufp->lb_buf = memp; 3406 lbufp->lb_next = lbufp + 1; 3407 3408 tail = lbufp; 3409 3410 memp += IBD_LSO_BUFSZ; 3411 lbufp++; 3412 } 3413 tail->lb_next = NULL; 3414 3415 /* 3416 * Set up the LSO buffer information in ibd state 3417 */ 3418 bktp->bkt_bufl = buflist; 3419 bktp->bkt_free_head = buflist; 3420 bktp->bkt_mem = membase; 3421 bktp->bkt_nelem = IBD_NUM_LSO_BUFS; 3422 bktp->bkt_nfree = bktp->bkt_nelem; 3423 3424 state->id_lso = bktp; 3425 mutex_exit(&state->id_lso_lock); 3426 3427 return (DDI_SUCCESS); 3428 } 3429 3430 /* 3431 * Statically allocate Tx buffer list(s). 
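 * Two pools back the Tx path: one pre-mapped copy buffer of id_tx_buf_sz
 * bytes per send wqe (ibd_alloc_tx_copybufs), plus an optional bucket of
 * larger LSO buffers (ibd_alloc_tx_lsobufs) when LSO is enabled.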
3432 */ 3433 static int 3434 ibd_init_txlist(ibd_state_t *state) 3435 { 3436 ibd_swqe_t *swqe; 3437 ibt_lkey_t lkey; 3438 int i; 3439 uint_t len; 3440 uint8_t *bufaddr; 3441 3442 if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS) 3443 return (DDI_FAILURE); 3444 3445 if (state->id_lso_policy && state->id_lso_capable) { 3446 if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS) 3447 state->id_lso_policy = B_FALSE; 3448 } 3449 3450 mutex_enter(&state->id_tx_list.dl_mutex); 3451 state->id_tx_list.dl_head = NULL; 3452 state->id_tx_list.dl_pending_sends = B_FALSE; 3453 state->id_tx_list.dl_cnt = 0; 3454 mutex_exit(&state->id_tx_list.dl_mutex); 3455 mutex_enter(&state->id_tx_rel_list.dl_mutex); 3456 state->id_tx_rel_list.dl_head = NULL; 3457 state->id_tx_rel_list.dl_pending_sends = B_FALSE; 3458 state->id_tx_rel_list.dl_cnt = 0; 3459 mutex_exit(&state->id_tx_rel_list.dl_mutex); 3460 3461 /* 3462 * Allocate and setup the swqe list 3463 */ 3464 lkey = state->id_tx_mr_desc.md_lkey; 3465 bufaddr = state->id_tx_bufs; 3466 len = state->id_tx_buf_sz; 3467 swqe = state->id_tx_wqes; 3468 mutex_enter(&state->id_tx_list.dl_mutex); 3469 for (i = 0; i < state->id_num_swqe; i++, swqe++, bufaddr += len) { 3470 swqe->swqe_next = NULL; 3471 swqe->swqe_im_mblk = NULL; 3472 3473 swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t) 3474 bufaddr; 3475 swqe->swqe_copybuf.ic_sgl.ds_key = lkey; 3476 swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */ 3477 3478 swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe; 3479 swqe->w_swr.wr_flags = IBT_WR_NO_FLAGS; 3480 swqe->w_swr.wr_trans = IBT_UD_SRV; 3481 3482 /* These are set in send */ 3483 swqe->w_swr.wr_nds = 0; 3484 swqe->w_swr.wr_sgl = NULL; 3485 swqe->w_swr.wr_opcode = IBT_WRC_SEND; 3486 3487 /* add to list */ 3488 state->id_tx_list.dl_cnt++; 3489 swqe->swqe_next = state->id_tx_list.dl_head; 3490 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); 3491 } 3492 mutex_exit(&state->id_tx_list.dl_mutex); 3493 3494 return (DDI_SUCCESS); 3495 } 3496 3497 static int 3498 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p, 3499 uint32_t *nds_p) 3500 { 3501 ibd_lsobkt_t *bktp; 3502 ibd_lsobuf_t *lbufp; 3503 ibd_lsobuf_t *nextp; 3504 ibt_lkey_t lso_lkey; 3505 uint_t frag_sz; 3506 uint_t num_needed; 3507 int i; 3508 3509 ASSERT(sgl_p != NULL); 3510 ASSERT(nds_p != NULL); 3511 ASSERT(req_sz != 0); 3512 3513 /* 3514 * Determine how many bufs we'd need for the size requested 3515 */ 3516 num_needed = req_sz / IBD_LSO_BUFSZ; 3517 if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0) 3518 num_needed++; 3519 3520 mutex_enter(&state->id_lso_lock); 3521 3522 /* 3523 * If we don't have enough lso bufs, return failure 3524 */ 3525 ASSERT(state->id_lso != NULL); 3526 bktp = state->id_lso; 3527 if (bktp->bkt_nfree < num_needed) { 3528 mutex_exit(&state->id_lso_lock); 3529 return (-1); 3530 } 3531 3532 /* 3533 * Pick the first 'num_needed' bufs from the free list 3534 */ 3535 lso_lkey = bktp->bkt_mr_desc.md_lkey; 3536 lbufp = bktp->bkt_free_head; 3537 for (i = 0; i < num_needed; i++) { 3538 ASSERT(lbufp->lb_isfree != 0); 3539 ASSERT(lbufp->lb_buf != NULL); 3540 3541 nextp = lbufp->lb_next; 3542 3543 sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf; 3544 sgl_p[i].ds_key = lso_lkey; 3545 sgl_p[i].ds_len = IBD_LSO_BUFSZ; 3546 3547 lbufp->lb_isfree = 0; 3548 lbufp->lb_next = NULL; 3549 3550 lbufp = nextp; 3551 } 3552 bktp->bkt_free_head = lbufp; 3553 3554 /* 3555 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need 3556 * to adjust the last sgl entry's length. 
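 * (For instance, if IBD_LSO_BUFSZ were 8192, a 10000-byte request would
 * consume two buffers, with the second sgl entry trimmed to 1808 bytes.)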
Since we know we need at least 3557 * one, the i-1 use below is ok. 3558 */ 3559 if (frag_sz) { 3560 sgl_p[i-1].ds_len = frag_sz; 3561 } 3562 3563 /* 3564 * Update nfree count and return 3565 */ 3566 bktp->bkt_nfree -= num_needed; 3567 3568 mutex_exit(&state->id_lso_lock); 3569 3570 *nds_p = num_needed; 3571 3572 return (0); 3573 } 3574 3575 static void 3576 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds) 3577 { 3578 ibd_lsobkt_t *bktp; 3579 ibd_lsobuf_t *lbufp; 3580 uint8_t *lso_mem_end; 3581 uint_t ndx; 3582 int i; 3583 3584 mutex_enter(&state->id_lso_lock); 3585 3586 bktp = state->id_lso; 3587 ASSERT(bktp != NULL); 3588 3589 lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ; 3590 for (i = 0; i < nds; i++) { 3591 uint8_t *va; 3592 3593 va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va; 3594 ASSERT(va >= bktp->bkt_mem && va < lso_mem_end); 3595 3596 /* 3597 * Figure out the buflist element this sgl buffer corresponds 3598 * to and put it back at the head 3599 */ 3600 ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ; 3601 lbufp = bktp->bkt_bufl + ndx; 3602 3603 ASSERT(lbufp->lb_isfree == 0); 3604 ASSERT(lbufp->lb_buf == va); 3605 3606 lbufp->lb_isfree = 1; 3607 lbufp->lb_next = bktp->bkt_free_head; 3608 bktp->bkt_free_head = lbufp; 3609 } 3610 bktp->bkt_nfree += nds; 3611 3612 mutex_exit(&state->id_lso_lock); 3613 } 3614 3615 static void 3616 ibd_free_tx_copybufs(ibd_state_t *state) 3617 { 3618 /* 3619 * Unregister txbuf mr 3620 */ 3621 if (ibt_deregister_mr(state->id_hca_hdl, 3622 state->id_tx_mr_hdl) != IBT_SUCCESS) { 3623 DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed"); 3624 } 3625 state->id_tx_mr_hdl = NULL; 3626 3627 /* 3628 * Free txbuf memory 3629 */ 3630 kmem_free(state->id_tx_wqes, state->id_num_swqe * sizeof (ibd_swqe_t)); 3631 kmem_free(state->id_tx_bufs, state->id_num_swqe * state->id_tx_buf_sz); 3632 state->id_tx_wqes = NULL; 3633 state->id_tx_bufs = NULL; 3634 } 3635 3636 static void 3637 ibd_free_tx_lsobufs(ibd_state_t *state) 3638 { 3639 ibd_lsobkt_t *bktp; 3640 3641 mutex_enter(&state->id_lso_lock); 3642 3643 if ((bktp = state->id_lso) == NULL) { 3644 mutex_exit(&state->id_lso_lock); 3645 return; 3646 } 3647 3648 /* 3649 * First, free the buflist 3650 */ 3651 ASSERT(bktp->bkt_bufl != NULL); 3652 kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t)); 3653 3654 /* 3655 * Unregister the LSO memory and free it 3656 */ 3657 ASSERT(bktp->bkt_mr_hdl != NULL); 3658 if (ibt_deregister_mr(state->id_hca_hdl, 3659 bktp->bkt_mr_hdl) != IBT_SUCCESS) { 3660 DPRINT(10, 3661 "ibd_free_lsobufs: ibt_deregister_mr failed"); 3662 } 3663 ASSERT(bktp->bkt_mem); 3664 kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ); 3665 3666 /* 3667 * Finally free the bucket 3668 */ 3669 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3670 state->id_lso = NULL; 3671 3672 mutex_exit(&state->id_lso_lock); 3673 } 3674 3675 /* 3676 * Free the statically allocated Tx buffer list. 
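 * The wqe list heads are reset under both Tx list locks; the backing swqe,
 * copy buffer and LSO memory is then released by ibd_free_tx_lsobufs() and
 * ibd_free_tx_copybufs().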
3677 */ 3678 static void 3679 ibd_fini_txlist(ibd_state_t *state) 3680 { 3681 /* 3682 * Free the allocated swqes 3683 */ 3684 mutex_enter(&state->id_tx_list.dl_mutex); 3685 mutex_enter(&state->id_tx_rel_list.dl_mutex); 3686 state->id_tx_list.dl_head = NULL; 3687 state->id_tx_list.dl_pending_sends = B_FALSE; 3688 state->id_tx_list.dl_cnt = 0; 3689 state->id_tx_rel_list.dl_head = NULL; 3690 state->id_tx_rel_list.dl_pending_sends = B_FALSE; 3691 state->id_tx_rel_list.dl_cnt = 0; 3692 mutex_exit(&state->id_tx_rel_list.dl_mutex); 3693 mutex_exit(&state->id_tx_list.dl_mutex); 3694 3695 ibd_free_tx_lsobufs(state); 3696 ibd_free_tx_copybufs(state); 3697 } 3698 3699 /* 3700 * post a list of rwqes, NULL terminated. 3701 */ 3702 static void 3703 ibd_post_recv_list(ibd_state_t *state, ibd_rwqe_t *rwqe) 3704 { 3705 uint_t i; 3706 uint_t num_posted; 3707 ibt_status_t ibt_status; 3708 ibt_recv_wr_t wrs[IBD_RX_POST_CNT]; 3709 3710 while (rwqe) { 3711 /* Post up to IBD_RX_POST_CNT receive work requests */ 3712 for (i = 0; i < IBD_RX_POST_CNT; i++) { 3713 wrs[i] = rwqe->w_rwr; 3714 rwqe = WQE_TO_RWQE(rwqe->rwqe_next); 3715 if (rwqe == NULL) { 3716 i++; 3717 break; 3718 } 3719 } 3720 3721 /* 3722 * If posting fails for some reason, we'll never receive 3723 * completion intimation, so we'll need to cleanup. But 3724 * we need to make sure we don't clean up nodes whose 3725 * wrs have been successfully posted. We assume that the 3726 * hca driver returns on the first failure to post and 3727 * therefore the first 'num_posted' entries don't need 3728 * cleanup here. 3729 */ 3730 atomic_add_32(&state->id_rx_list.dl_cnt, i); 3731 3732 num_posted = 0; 3733 ibt_status = ibt_post_recv(state->id_chnl_hdl, wrs, i, 3734 &num_posted); 3735 if (ibt_status != IBT_SUCCESS) { 3736 /* This cannot happen unless the device has an error. */ 3737 ibd_print_warn(state, "ibd_post_recv: FATAL: " 3738 "posting multiple wrs failed: " 3739 "requested=%d, done=%d, ret=%d", 3740 IBD_RX_POST_CNT, num_posted, ibt_status); 3741 atomic_add_32(&state->id_rx_list.dl_cnt, 3742 num_posted - i); 3743 } 3744 } 3745 } 3746 3747 /* 3748 * Grab a list of rwqes from the array of lists, and post the list. 3749 */ 3750 static void 3751 ibd_post_recv_intr(ibd_state_t *state) 3752 { 3753 ibd_rx_queue_t *rxp; 3754 ibd_rwqe_t *list; 3755 3756 /* rotate through the rx_queue array, expecting an adequate number */ 3757 state->id_rx_post_queue_index = 3758 (state->id_rx_post_queue_index + 1) & 3759 (state->id_rx_nqueues - 1); 3760 3761 rxp = state->id_rx_queues + state->id_rx_post_queue_index; 3762 mutex_enter(&rxp->rx_post_lock); 3763 list = WQE_TO_RWQE(rxp->rx_head); 3764 rxp->rx_head = NULL; 3765 rxp->rx_cnt = 0; 3766 mutex_exit(&rxp->rx_post_lock); 3767 ibd_post_recv_list(state, list); 3768 } 3769 3770 /* macro explained below */ 3771 #define RX_QUEUE_HASH(rwqe) \ 3772 (((uintptr_t)(rwqe) >> 8) & (state->id_rx_nqueues - 1)) 3773 3774 /* 3775 * Add a rwqe to one of the the Rx lists. If the list is large enough 3776 * (exactly IBD_RX_POST_CNT), post the list to the hardware. 3777 * 3778 * Note: one of 2^N lists is chosen via a hash. This is done 3779 * because using one list is contentious. If the first list is busy 3780 * (mutex_tryenter fails), use a second list (just call mutex_enter). 3781 * 3782 * The number 8 in RX_QUEUE_HASH is a random choice that provides 3783 * even distribution of mapping rwqes to the 2^N queues. 
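 *
 * A worked example of the macro, using assumed values purely for
 * illustration (an 8-queue configuration and an arbitrary rwqe
 * address):
 *
 *	rwqe address = 0x12345a00, id_rx_nqueues = 8
 *	RX_QUEUE_HASH(rwqe) = (0x12345a00 >> 8) & (8 - 1)
 *	                    = 0x12345a & 7 = 2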
3784 */ 3785 static void 3786 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe) 3787 { 3788 ibd_rx_queue_t *rxp; 3789 3790 rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe); 3791 3792 if (!mutex_tryenter(&rxp->rx_post_lock)) { 3793 /* Failed. Try a different queue ("ptr + 16" ensures that). */ 3794 rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe + 16); 3795 mutex_enter(&rxp->rx_post_lock); 3796 } 3797 rwqe->rwqe_next = rxp->rx_head; 3798 if (++rxp->rx_cnt >= IBD_RX_POST_CNT - 2) { 3799 uint_t active = atomic_inc_32_nv(&state->id_rx_post_active); 3800 3801 /* only call ibt_post_recv() every Nth time through here */ 3802 if ((active & (state->id_rx_nqueues - 1)) == 0) { 3803 rxp->rx_head = NULL; 3804 rxp->rx_cnt = 0; 3805 mutex_exit(&rxp->rx_post_lock); 3806 ibd_post_recv_list(state, rwqe); 3807 return; 3808 } 3809 } 3810 rxp->rx_head = RWQE_TO_WQE(rwqe); 3811 mutex_exit(&rxp->rx_post_lock); 3812 } 3813 3814 static int 3815 ibd_alloc_rx_copybufs(ibd_state_t *state) 3816 { 3817 ibt_mr_attr_t mem_attr; 3818 int i; 3819 3820 /* 3821 * Allocate one big chunk for all regular rx copy bufs 3822 */ 3823 state->id_rx_buf_sz = state->id_mtu + IPOIB_GRH_SIZE; 3824 3825 state->id_rx_bufs = kmem_zalloc(state->id_num_rwqe * 3826 state->id_rx_buf_sz, KM_SLEEP); 3827 3828 state->id_rx_wqes = kmem_zalloc(state->id_num_rwqe * 3829 sizeof (ibd_rwqe_t), KM_SLEEP); 3830 3831 state->id_rx_nqueues = 1 << IBD_LOG_RX_POST; 3832 state->id_rx_queues = kmem_zalloc(state->id_rx_nqueues * 3833 sizeof (ibd_rx_queue_t), KM_SLEEP); 3834 for (i = 0; i < state->id_rx_nqueues; i++) { 3835 ibd_rx_queue_t *rxp = state->id_rx_queues + i; 3836 mutex_init(&rxp->rx_post_lock, NULL, MUTEX_DRIVER, NULL); 3837 } 3838 3839 /* 3840 * Do one memory registration on the entire rxbuf area 3841 */ 3842 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_rx_bufs; 3843 mem_attr.mr_len = state->id_num_rwqe * state->id_rx_buf_sz; 3844 mem_attr.mr_as = NULL; 3845 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3846 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3847 &state->id_rx_mr_hdl, &state->id_rx_mr_desc) != IBT_SUCCESS) { 3848 DPRINT(10, "ibd_alloc_rx_copybufs: ibt_register_mr failed"); 3849 kmem_free(state->id_rx_wqes, 3850 state->id_num_rwqe * sizeof (ibd_rwqe_t)); 3851 kmem_free(state->id_rx_bufs, 3852 state->id_num_rwqe * state->id_rx_buf_sz); 3853 state->id_rx_bufs = NULL; 3854 state->id_rx_wqes = NULL; 3855 return (DDI_FAILURE); 3856 } 3857 3858 return (DDI_SUCCESS); 3859 } 3860 3861 /* 3862 * Allocate the statically allocated Rx buffer list. 3863 */ 3864 static int 3865 ibd_init_rxlist(ibd_state_t *state) 3866 { 3867 ibd_rwqe_t *rwqe, *next; 3868 ibd_wqe_t *list; 3869 ibt_lkey_t lkey; 3870 int i; 3871 uint_t len; 3872 uint8_t *bufaddr; 3873 3874 mutex_enter(&state->id_rx_free_list.dl_mutex); 3875 if (state->id_rx_free_list.dl_head != NULL) { 3876 /* rx rsrcs were never freed. 
Just repost them */ 3877 len = state->id_rx_buf_sz; 3878 list = state->id_rx_free_list.dl_head; 3879 state->id_rx_free_list.dl_head = NULL; 3880 state->id_rx_free_list.dl_cnt = 0; 3881 mutex_exit(&state->id_rx_free_list.dl_mutex); 3882 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL; 3883 rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) { 3884 if ((rwqe->rwqe_im_mblk = desballoc( 3885 rwqe->rwqe_copybuf.ic_bufaddr, len, 0, 3886 &rwqe->w_freemsg_cb)) == NULL) { 3887 /* allow freemsg_cb to free the rwqes */ 3888 if (atomic_dec_32_nv(&state->id_running) != 0) { 3889 cmn_err(CE_WARN, "ibd_init_rxlist: " 3890 "id_running was not 1\n"); 3891 } 3892 DPRINT(10, "ibd_init_rxlist : " 3893 "failed in desballoc()"); 3894 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL; 3895 rwqe = next) { 3896 next = WQE_TO_RWQE(rwqe->rwqe_next); 3897 if (rwqe->rwqe_im_mblk) { 3898 atomic_inc_32(&state-> 3899 id_rx_list. 3900 dl_bufs_outstanding); 3901 freemsg(rwqe->rwqe_im_mblk); 3902 } else 3903 ibd_free_rwqe(state, rwqe); 3904 } 3905 atomic_inc_32(&state->id_running); 3906 return (DDI_FAILURE); 3907 } 3908 } 3909 ibd_post_recv_list(state, WQE_TO_RWQE(list)); 3910 return (DDI_SUCCESS); 3911 } 3912 mutex_exit(&state->id_rx_free_list.dl_mutex); 3913 3914 if (ibd_alloc_rx_copybufs(state) != DDI_SUCCESS) 3915 return (DDI_FAILURE); 3916 3917 /* 3918 * Allocate and setup the rwqe list 3919 */ 3920 len = state->id_rx_buf_sz; 3921 lkey = state->id_rx_mr_desc.md_lkey; 3922 rwqe = state->id_rx_wqes; 3923 bufaddr = state->id_rx_bufs; 3924 list = NULL; 3925 for (i = 0; i < state->id_num_rwqe; i++, rwqe++, bufaddr += len) { 3926 rwqe->w_state = state; 3927 rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb; 3928 rwqe->w_freemsg_cb.free_arg = (char *)rwqe; 3929 3930 rwqe->rwqe_copybuf.ic_bufaddr = bufaddr; 3931 3932 if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0, 3933 &rwqe->w_freemsg_cb)) == NULL) { 3934 DPRINT(10, "ibd_init_rxlist : failed in desballoc()"); 3935 /* allow freemsg_cb to free the rwqes */ 3936 if (atomic_dec_32_nv(&state->id_running) != 0) { 3937 cmn_err(CE_WARN, "ibd_init_rxlist: " 3938 "id_running was not 1\n"); 3939 } 3940 DPRINT(10, "ibd_init_rxlist : " 3941 "failed in desballoc()"); 3942 for (rwqe = WQE_TO_RWQE(list); rwqe != NULL; 3943 rwqe = next) { 3944 next = WQE_TO_RWQE(rwqe->rwqe_next); 3945 freemsg(rwqe->rwqe_im_mblk); 3946 } 3947 atomic_inc_32(&state->id_running); 3948 3949 /* remove reference to free'd rwqes */ 3950 mutex_enter(&state->id_rx_free_list.dl_mutex); 3951 state->id_rx_free_list.dl_head = NULL; 3952 state->id_rx_free_list.dl_cnt = 0; 3953 mutex_exit(&state->id_rx_free_list.dl_mutex); 3954 3955 ibd_fini_rxlist(state); 3956 return (DDI_FAILURE); 3957 } 3958 3959 rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey; 3960 rwqe->rwqe_copybuf.ic_sgl.ds_va = 3961 (ib_vaddr_t)(uintptr_t)bufaddr; 3962 rwqe->rwqe_copybuf.ic_sgl.ds_len = len; 3963 rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe; 3964 rwqe->w_rwr.wr_nds = 1; 3965 rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl; 3966 3967 rwqe->rwqe_next = list; 3968 list = RWQE_TO_WQE(rwqe); 3969 } 3970 ibd_post_recv_list(state, WQE_TO_RWQE(list)); 3971 3972 return (DDI_SUCCESS); 3973 } 3974 3975 static void 3976 ibd_free_rx_copybufs(ibd_state_t *state) 3977 { 3978 int i; 3979 3980 /* 3981 * Unregister rxbuf mr 3982 */ 3983 if (ibt_deregister_mr(state->id_hca_hdl, 3984 state->id_rx_mr_hdl) != IBT_SUCCESS) { 3985 DPRINT(10, "ibd_free_rx_copybufs: ibt_deregister_mr failed"); 3986 } 3987 state->id_rx_mr_hdl = NULL; 3988 3989 /* 3990 * Free rxbuf memory 3991 */ 3992 for (i = 0; i < 
state->id_rx_nqueues; i++) { 3993 ibd_rx_queue_t *rxp = state->id_rx_queues + i; 3994 mutex_destroy(&rxp->rx_post_lock); 3995 } 3996 kmem_free(state->id_rx_queues, state->id_rx_nqueues * 3997 sizeof (ibd_rx_queue_t)); 3998 kmem_free(state->id_rx_wqes, state->id_num_rwqe * sizeof (ibd_rwqe_t)); 3999 kmem_free(state->id_rx_bufs, state->id_num_rwqe * state->id_rx_buf_sz); 4000 state->id_rx_queues = NULL; 4001 state->id_rx_wqes = NULL; 4002 state->id_rx_bufs = NULL; 4003 } 4004 4005 static void 4006 ibd_free_rx_rsrcs(ibd_state_t *state) 4007 { 4008 mutex_enter(&state->id_rx_free_list.dl_mutex); 4009 if (state->id_rx_free_list.dl_head == NULL) { 4010 /* already freed */ 4011 mutex_exit(&state->id_rx_free_list.dl_mutex); 4012 return; 4013 } 4014 ASSERT(state->id_rx_free_list.dl_cnt == state->id_num_rwqe); 4015 ibd_free_rx_copybufs(state); 4016 state->id_rx_free_list.dl_cnt = 0; 4017 state->id_rx_free_list.dl_head = NULL; 4018 mutex_exit(&state->id_rx_free_list.dl_mutex); 4019 } 4020 4021 /* 4022 * Free the statically allocated Rx buffer list. 4023 */ 4024 static void 4025 ibd_fini_rxlist(ibd_state_t *state) 4026 { 4027 ibd_rwqe_t *rwqe; 4028 int i; 4029 4030 /* run through the rx_queue's, calling freemsg() */ 4031 for (i = 0; i < state->id_rx_nqueues; i++) { 4032 ibd_rx_queue_t *rxp = state->id_rx_queues + i; 4033 mutex_enter(&rxp->rx_post_lock); 4034 for (rwqe = WQE_TO_RWQE(rxp->rx_head); rwqe; 4035 rwqe = WQE_TO_RWQE(rwqe->rwqe_next)) { 4036 freemsg(rwqe->rwqe_im_mblk); 4037 rxp->rx_cnt--; 4038 } 4039 rxp->rx_head = NULL; 4040 mutex_exit(&rxp->rx_post_lock); 4041 } 4042 4043 /* cannot free rx resources unless gld returned everything */ 4044 if (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 0) == 0) 4045 ibd_free_rx_rsrcs(state); 4046 } 4047 4048 /* 4049 * Free an allocated recv wqe. 4050 */ 4051 /* ARGSUSED */ 4052 static void 4053 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) 4054 { 4055 /* 4056 * desballoc() failed (no memory). 4057 * 4058 * This rwqe is placed on a free list so that it 4059 * can be reinstated when memory is available. 4060 * 4061 * NOTE: no code currently exists to reinstate 4062 * these "lost" rwqes. 4063 */ 4064 mutex_enter(&state->id_rx_free_list.dl_mutex); 4065 state->id_rx_free_list.dl_cnt++; 4066 rwqe->rwqe_next = state->id_rx_free_list.dl_head; 4067 state->id_rx_free_list.dl_head = RWQE_TO_WQE(rwqe); 4068 mutex_exit(&state->id_rx_free_list.dl_mutex); 4069 } 4070 4071 /* 4072 * IBA Rx completion queue handler. Guaranteed to be single 4073 * threaded and nonreentrant for this CQ. 4074 */ 4075 /* ARGSUSED */ 4076 static void 4077 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 4078 { 4079 ibd_state_t *state = (ibd_state_t *)arg; 4080 4081 atomic_inc_64(&state->id_num_intrs); 4082 4083 if (ibd_rx_softintr == 1) { 4084 mutex_enter(&state->id_rcq_poll_lock); 4085 if (state->id_rcq_poll_busy & IBD_CQ_POLLING) { 4086 state->id_rcq_poll_busy |= IBD_REDO_CQ_POLLING; 4087 mutex_exit(&state->id_rcq_poll_lock); 4088 return; 4089 } else { 4090 mutex_exit(&state->id_rcq_poll_lock); 4091 ddi_trigger_softintr(state->id_rx); 4092 } 4093 } else 4094 (void) ibd_intr((caddr_t)state); 4095 } 4096 4097 /* 4098 * CQ handler for Tx completions, when the Tx CQ is in 4099 * interrupt driven mode. 
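 *
 * The gating below mirrors ibd_rcq_handler() above: if a poll is
 * already in progress (IBD_CQ_POLLING set), only IBD_REDO_CQ_POLLING
 * is recorded, presumably so the active poller makes another pass;
 * otherwise the Tx softint is triggered. When ibd_tx_softintr is
 * disabled, ibd_tx_recycle() is simply called directly.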
4100 */ 4101 /* ARGSUSED */ 4102 static void 4103 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 4104 { 4105 ibd_state_t *state = (ibd_state_t *)arg; 4106 4107 atomic_inc_64(&state->id_num_intrs); 4108 4109 if (ibd_tx_softintr == 1) { 4110 mutex_enter(&state->id_scq_poll_lock); 4111 if (state->id_scq_poll_busy & IBD_CQ_POLLING) { 4112 state->id_scq_poll_busy |= IBD_REDO_CQ_POLLING; 4113 mutex_exit(&state->id_scq_poll_lock); 4114 return; 4115 } else { 4116 mutex_exit(&state->id_scq_poll_lock); 4117 ddi_trigger_softintr(state->id_tx); 4118 } 4119 } else 4120 (void) ibd_tx_recycle((caddr_t)state); 4121 } 4122 4123 /* 4124 * Multicast group create/delete trap handler. These will be delivered 4125 * on a kernel thread (handling can thus block) and can be invoked 4126 * concurrently. The handler can be invoked anytime after it is 4127 * registered and before ibt_detach(). 4128 */ 4129 /* ARGSUSED */ 4130 static void 4131 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code, 4132 ibt_subnet_event_t *event) 4133 { 4134 ibd_state_t *state = (ibd_state_t *)arg; 4135 ibd_req_t *req; 4136 4137 /* 4138 * The trap handler will get invoked once for every event for 4139 * every port. The input "gid" is the GID0 of the port the 4140 * trap came in on; we just need to act on traps that came 4141 * to our port, meaning the port on which the ipoib interface 4142 * resides. Since ipoib uses GID0 of the port, we just match 4143 * the gids to check whether we need to handle the trap. 4144 */ 4145 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) 4146 if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0) 4147 return; 4148 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) 4149 4150 DPRINT(10, "ibd_notices_handler : %d\n", code); 4151 4152 switch (code) { 4153 case IBT_SM_EVENT_UNAVAILABLE: 4154 /* 4155 * If we are in promiscuous mode or have 4156 * sendnonmembers, we need to print a warning 4157 * message right now. Else, just store the 4158 * information, print when we enter promiscuous 4159 * mode or attempt nonmember send. We might 4160 * also want to stop caching sendnonmember. 4161 */ 4162 ibd_print_warn(state, "IBA multicast support " 4163 "degraded due to unavailability of multicast " 4164 "traps"); 4165 break; 4166 case IBT_SM_EVENT_AVAILABLE: 4167 /* 4168 * If we printed a warning message above or 4169 * while trying to nonmember send or get into 4170 * promiscuous mode, print an okay message. 4171 */ 4172 ibd_print_warn(state, "IBA multicast support " 4173 "restored due to availability of multicast " 4174 "traps"); 4175 break; 4176 case IBT_SM_EVENT_MCG_CREATED: 4177 case IBT_SM_EVENT_MCG_DELETED: 4178 /* 4179 * Common processing of creation/deletion traps. 4180 * First check if the instance is being 4181 * [de]initialized; back off then, without doing 4182 * anything more, since we are not sure if the 4183 * async thread is around, or whether we might 4184 * be racing with the detach code in ibd_m_stop() 4185 * that scans the mcg list. 
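 *
 * Once ibd_async_safe() says it is safe to proceed, the trap is not
 * processed here; the gid and the event code are stashed in an
 * ibd_req_t (allocated with KM_SLEEP, so this may block) and queued
 * as IBD_ASYNC_TRAP work for ibd_async_trap() below to handle on the
 * async thread.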
4186 */ 4187 if (!ibd_async_safe(state)) 4188 return; 4189 4190 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 4191 req->rq_gid = event->sm_notice_gid; 4192 req->rq_ptr = (void *)code; 4193 ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP); 4194 break; 4195 } 4196 } 4197 4198 static void 4199 ibd_async_trap(ibd_state_t *state, ibd_req_t *req) 4200 { 4201 ib_gid_t mgid = req->rq_gid; 4202 ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr; 4203 4204 DPRINT(10, "ibd_async_trap : %d\n", code); 4205 4206 /* 4207 * Atomically search the nonmember and sendonlymember lists and 4208 * delete. 4209 */ 4210 ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON); 4211 4212 if (state->id_prom_op == IBD_OP_COMPLETED) { 4213 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 4214 4215 /* 4216 * If in promiscuous mode, try to join/attach to the new 4217 * mcg. Given the unreliable out-of-order mode of trap 4218 * delivery, we can never be sure whether it is a problem 4219 * if the join fails. Thus, we warn the admin of a failure 4220 * if this was a creation trap. Note that the trap might 4221 * actually be reporting a long past event, and the mcg 4222 * might already have been deleted, thus we might be warning 4223 * in vain. 4224 */ 4225 if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == 4226 NULL) && (code == IBT_SM_EVENT_MCG_CREATED)) 4227 ibd_print_warn(state, "IBA promiscuous mode missed " 4228 "new multicast gid %016llx:%016llx", 4229 (u_longlong_t)mgid.gid_prefix, 4230 (u_longlong_t)mgid.gid_guid); 4231 } 4232 4233 /* 4234 * Free the request slot allocated by the subnet event thread. 4235 */ 4236 ibd_async_done(state); 4237 } 4238 4239 /* 4240 * GLDv3 entry point to get capabilities. 4241 */ 4242 static boolean_t 4243 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) 4244 { 4245 ibd_state_t *state = arg; 4246 4247 switch (cap) { 4248 case MAC_CAPAB_HCKSUM: { 4249 uint32_t *txflags = cap_data; 4250 4251 /* 4252 * We either do full checksum or not do it at all 4253 */ 4254 if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) 4255 *txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4; 4256 else 4257 return (B_FALSE); 4258 break; 4259 } 4260 4261 case MAC_CAPAB_LSO: { 4262 mac_capab_lso_t *cap_lso = cap_data; 4263 4264 /* 4265 * In addition to the capability and policy, since LSO 4266 * relies on hw checksum, we'll not enable LSO if we 4267 * don't have hw checksum. Of course, if the HCA doesn't 4268 * provide the reserved lkey capability, enabling LSO will 4269 * actually affect performance adversely, so we'll disable 4270 * LSO even for that case. 
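 *
 * When all three of the checks below pass, the driver advertises
 * LSO_TX_BASIC_TCP_IPV4 with a maximum of (id_lso_maxlen - 1) bytes.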
4271 */ 4272 if (!state->id_lso_policy || !state->id_lso_capable) 4273 return (B_FALSE); 4274 4275 if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0) 4276 return (B_FALSE); 4277 4278 if (state->id_hca_res_lkey_capab == 0) { 4279 ibd_print_warn(state, "no reserved-lkey capability, " 4280 "disabling LSO"); 4281 return (B_FALSE); 4282 } 4283 4284 cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; 4285 cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1; 4286 break; 4287 } 4288 4289 default: 4290 return (B_FALSE); 4291 } 4292 4293 return (B_TRUE); 4294 } 4295 4296 static int 4297 ibd_get_port_details(ibd_state_t *state) 4298 { 4299 ibt_hca_portinfo_t *port_infop; 4300 ibt_status_t ret; 4301 uint_t psize, port_infosz; 4302 4303 mutex_enter(&state->id_link_mutex); 4304 4305 /* 4306 * Query for port information 4307 */ 4308 ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 4309 &port_infop, &psize, &port_infosz); 4310 if ((ret != IBT_SUCCESS) || (psize != 1)) { 4311 mutex_exit(&state->id_link_mutex); 4312 DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() " 4313 "failed, ret=%d", ret); 4314 return (ENETDOWN); 4315 } 4316 4317 /* 4318 * If the link already went down by the time we get here, 4319 * give up 4320 */ 4321 if (port_infop->p_linkstate != IBT_PORT_ACTIVE) { 4322 mutex_exit(&state->id_link_mutex); 4323 ibt_free_portinfo(port_infop, port_infosz); 4324 DPRINT(10, "ibd_get_port_details: port is not active"); 4325 return (ENETDOWN); 4326 } 4327 4328 /* 4329 * If the link is active, verify the pkey 4330 */ 4331 if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port, 4332 state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) { 4333 mutex_exit(&state->id_link_mutex); 4334 ibt_free_portinfo(port_infop, port_infosz); 4335 DPRINT(10, "ibd_get_port_details: ibt_pkey2index " 4336 "failed, ret=%d", ret); 4337 return (ENONET); 4338 } 4339 4340 state->id_mtu = (128 << port_infop->p_mtu); 4341 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) 4342 state->id_sgid = *port_infop->p_sgid_tbl; 4343 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) 4344 state->id_link_state = LINK_STATE_UP; 4345 4346 mutex_exit(&state->id_link_mutex); 4347 ibt_free_portinfo(port_infop, port_infosz); 4348 4349 /* 4350 * Now that the port is active, record the port speed 4351 */ 4352 state->id_link_speed = ibd_get_portspeed(state); 4353 4354 return (0); 4355 } 4356 4357 static int 4358 ibd_alloc_cqs(ibd_state_t *state) 4359 { 4360 ibt_hca_attr_t hca_attrs; 4361 ibt_cq_attr_t cq_attr; 4362 ibt_status_t ret; 4363 uint32_t real_size; 4364 4365 ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 4366 ASSERT(ret == IBT_SUCCESS); 4367 4368 /* 4369 * Allocate Rx/combined CQ: 4370 * Theoretically, there is no point in having more than #rwqe 4371 * plus #swqe cqe's, except that the CQ will be signaled for 4372 * overflow when the last wqe completes, if none of the previous 4373 * cqe's have been polled. Thus, we allocate just a few less wqe's 4374 * to make sure such overflow does not occur. 4375 */ 4376 cq_attr.cq_sched = NULL; 4377 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 4378 4379 /* 4380 * Allocate Receive CQ. 
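 *
 * For illustration only (both numbers are assumed, not the driver
 * defaults): if the HCA reported hca_max_cq_sz of 0x4000 and
 * id_num_rwqe had been tuned up to 0x8000, cq_size would be clamped
 * to 0x4000 and id_num_rwqe reduced to 0x3fff; the warning near the
 * end of this routine then reports the adjustment.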
4381 */ 4382 if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) { 4383 cq_attr.cq_size = state->id_num_rwqe + 1; 4384 } else { 4385 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 4386 state->id_num_rwqe = cq_attr.cq_size - 1; 4387 } 4388 4389 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 4390 &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) { 4391 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) " 4392 "failed, ret=%d\n", ret); 4393 return (DDI_FAILURE); 4394 } 4395 4396 if ((ret = ibt_modify_cq(state->id_rcq_hdl, 4397 ibd_rxcomp_count, ibd_rxcomp_usec, 0)) != IBT_SUCCESS) { 4398 DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt " 4399 "moderation failed, ret=%d\n", ret); 4400 } 4401 4402 /* make the #rx wc's the same as max rx chain size */ 4403 state->id_rxwcs_size = IBD_MAX_RX_MP_LEN; 4404 state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * 4405 state->id_rxwcs_size, KM_SLEEP); 4406 4407 /* 4408 * Allocate Send CQ. 4409 */ 4410 if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) { 4411 cq_attr.cq_size = state->id_num_swqe + 1; 4412 } else { 4413 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 4414 state->id_num_swqe = cq_attr.cq_size - 1; 4415 } 4416 4417 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 4418 &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) { 4419 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) " 4420 "failed, ret=%d\n", ret); 4421 kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * 4422 state->id_rxwcs_size); 4423 (void) ibt_free_cq(state->id_rcq_hdl); 4424 return (DDI_FAILURE); 4425 } 4426 if ((ret = ibt_modify_cq(state->id_scq_hdl, 4427 ibd_txcomp_count, ibd_txcomp_usec, 0)) != IBT_SUCCESS) { 4428 DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt " 4429 "moderation failed, ret=%d\n", ret); 4430 } 4431 4432 state->id_txwcs_size = IBD_TX_POLL_THRESH; 4433 state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) * 4434 state->id_txwcs_size, KM_SLEEP); 4435 4436 /* 4437 * Print message in case we could not allocate as many wqe's 4438 * as was requested. 
4439 */ 4440 if (state->id_num_rwqe != IBD_NUM_RWQE) { 4441 ibd_print_warn(state, "Setting #rwqe = %d instead of default " 4442 "%d", state->id_num_rwqe, IBD_NUM_RWQE); 4443 } 4444 if (state->id_num_swqe != IBD_NUM_SWQE) { 4445 ibd_print_warn(state, "Setting #swqe = %d instead of default " 4446 "%d", state->id_num_swqe, IBD_NUM_SWQE); 4447 } 4448 4449 return (DDI_SUCCESS); 4450 } 4451 4452 static int 4453 ibd_setup_ud_channel(ibd_state_t *state) 4454 { 4455 ibt_ud_chan_alloc_args_t ud_alloc_attr; 4456 ibt_ud_chan_query_attr_t ud_chan_attr; 4457 ibt_status_t ret; 4458 4459 ud_alloc_attr.ud_flags = IBT_ALL_SIGNALED; 4460 if (state->id_hca_res_lkey_capab) 4461 ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY; 4462 if (state->id_lso_policy && state->id_lso_capable) 4463 ud_alloc_attr.ud_flags |= IBT_USES_LSO; 4464 4465 ud_alloc_attr.ud_hca_port_num = state->id_port; 4466 ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg; 4467 ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG; 4468 ud_alloc_attr.ud_sizes.cs_sq = state->id_num_swqe; 4469 ud_alloc_attr.ud_sizes.cs_rq = state->id_num_rwqe; 4470 ud_alloc_attr.ud_qkey = state->id_mcinfo->mc_qkey; 4471 ud_alloc_attr.ud_scq = state->id_scq_hdl; 4472 ud_alloc_attr.ud_rcq = state->id_rcq_hdl; 4473 ud_alloc_attr.ud_pd = state->id_pd_hdl; 4474 ud_alloc_attr.ud_pkey_ix = state->id_pkix; 4475 ud_alloc_attr.ud_clone_chan = NULL; 4476 4477 if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS, 4478 &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) { 4479 DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() " 4480 "failed, ret=%d\n", ret); 4481 return (DDI_FAILURE); 4482 } 4483 4484 if ((ret = ibt_query_ud_channel(state->id_chnl_hdl, 4485 &ud_chan_attr)) != IBT_SUCCESS) { 4486 DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() " 4487 "failed, ret=%d\n", ret); 4488 (void) ibt_free_channel(state->id_chnl_hdl); 4489 return (DDI_FAILURE); 4490 } 4491 4492 state->id_qpnum = ud_chan_attr.ud_qpn; 4493 4494 return (DDI_SUCCESS); 4495 } 4496 4497 static int 4498 ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state) 4499 { 4500 uint32_t progress = state->id_mac_state; 4501 uint_t attempts; 4502 ibt_status_t ret; 4503 ib_gid_t mgid; 4504 ibd_mce_t *mce; 4505 uint8_t jstate; 4506 4507 if (atomic_dec_32_nv(&state->id_running) != 0) 4508 cmn_err(CE_WARN, "ibd_undo_start: id_running was not 1\n"); 4509 4510 /* 4511 * Before we try to stop/undo whatever we did in ibd_start(), 4512 * we need to mark the link state appropriately to prevent the 4513 * ip layer from using this instance for any new transfers. Note 4514 * that if the original state of the link was "up" when we're 4515 * here, we'll set the final link state to "unknown", to behave 4516 * in the same fashion as other ethernet drivers. 
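 *
 * In other words, an undo driven by a port that is already down
 * passes LINK_STATE_DOWN through unchanged, whereas a stop issued
 * with the port still up reports LINK_STATE_UNKNOWN to GLDv3.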
4517 */ 4518 mutex_enter(&state->id_link_mutex); 4519 if (cur_link_state == LINK_STATE_DOWN) { 4520 state->id_link_state = cur_link_state; 4521 } else { 4522 state->id_link_state = LINK_STATE_UNKNOWN; 4523 } 4524 mutex_exit(&state->id_link_mutex); 4525 mac_link_update(state->id_mh, state->id_link_state); 4526 4527 state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED); 4528 if (progress & IBD_DRV_STARTED) { 4529 state->id_mac_state &= (~IBD_DRV_STARTED); 4530 } 4531 4532 /* Stop listen under Reliable Connected Mode */ 4533 if (progress & IBD_DRV_RC_LISTEN) { 4534 ASSERT(state->id_enable_rc); 4535 if (state->rc_listen_hdl != NULL) { 4536 ibd_rc_stop_listen(state); 4537 } 4538 state->id_mac_state &= (~IBD_DRV_RC_LISTEN); 4539 } 4540 4541 if ((state->id_enable_rc) && (progress & IBD_DRV_ACACHE_INITIALIZED)) { 4542 (void) ibd_rc_close_all_chan(state); 4543 } 4544 4545 /* 4546 * First, stop receive interrupts; this stops the driver from 4547 * handing up buffers to higher layers. Wait for receive buffers 4548 * to be returned and give up after 1 second. 4549 */ 4550 if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) { 4551 attempts = 10; 4552 while (atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 4553 0) > 0) { 4554 delay(drv_usectohz(100000)); 4555 if (--attempts == 0) { 4556 /* 4557 * There are pending bufs with the network 4558 * layer and we have no choice but to wait 4559 * for them to be done with. Reap all the 4560 * Tx/Rx completions that were posted since 4561 * we turned off the notification and 4562 * return failure. 4563 */ 4564 cmn_err(CE_CONT, "!ibd: bufs outstanding\n"); 4565 DPRINT(2, "ibd_undo_start: " 4566 "reclaiming failed"); 4567 break; 4568 } 4569 } 4570 state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED); 4571 } 4572 4573 if (progress & IBD_DRV_RC_LARGEBUF_ALLOCD) { 4574 ibd_rc_fini_tx_largebuf_list(state); 4575 state->id_mac_state &= (~IBD_DRV_RC_LARGEBUF_ALLOCD); 4576 } 4577 4578 if (progress & IBD_DRV_RC_SRQ_ALLOCD) { 4579 ASSERT(state->id_enable_rc); 4580 if (state->rc_srq_rwqe_list.dl_bufs_outstanding == 0) { 4581 ibd_rc_fini_srq_list(state); 4582 state->id_mac_state &= (~IBD_DRV_RC_SRQ_ALLOCD); 4583 } else { 4584 cmn_err(CE_CONT, "ibd_undo_start: srq bufs " 4585 "outstanding\n"); 4586 } 4587 } 4588 4589 if (progress & IBD_DRV_SM_NOTICES_REGISTERED) { 4590 ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL); 4591 4592 mutex_enter(&state->id_trap_lock); 4593 state->id_trap_stop = B_TRUE; 4594 while (state->id_trap_inprog > 0) 4595 cv_wait(&state->id_trap_cv, &state->id_trap_lock); 4596 mutex_exit(&state->id_trap_lock); 4597 4598 state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED); 4599 } 4600 4601 if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) { 4602 /* 4603 * Flushing the channel ensures that all pending WQE's 4604 * are marked with flush_error and handed to the CQ. It 4605 * does not guarantee the invocation of the CQ handler. 4606 * This call is guaranteed to return successfully for 4607 * UD QPNs. 4608 */ 4609 if ((ret = ibt_flush_channel(state->id_chnl_hdl)) != 4610 IBT_SUCCESS) { 4611 DPRINT(10, "ibd_undo_start: flush_channel " 4612 "failed, ret=%d", ret); 4613 } 4614 4615 /* 4616 * Give some time for the TX CQ handler to process the 4617 * completions. 
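 *
 * The loop below re-checks the swqe counts up to 10 times with a
 * delay of drv_usectohz(100000) between passes, so it waits roughly
 * a second in total before giving up, matching the Rx buffer wait
 * above.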
4618 */ 4619 mutex_enter(&state->id_tx_list.dl_mutex); 4620 mutex_enter(&state->id_tx_rel_list.dl_mutex); 4621 attempts = 10; 4622 while (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt 4623 != state->id_num_swqe) { 4624 if (--attempts == 0) 4625 break; 4626 mutex_exit(&state->id_tx_rel_list.dl_mutex); 4627 mutex_exit(&state->id_tx_list.dl_mutex); 4628 delay(drv_usectohz(100000)); 4629 mutex_enter(&state->id_tx_list.dl_mutex); 4630 mutex_enter(&state->id_tx_rel_list.dl_mutex); 4631 } 4632 ibt_set_cq_handler(state->id_scq_hdl, 0, 0); 4633 if (state->id_tx_list.dl_cnt + state->id_tx_rel_list.dl_cnt != 4634 state->id_num_swqe) { 4635 cmn_err(CE_WARN, "tx resources not freed\n"); 4636 } 4637 mutex_exit(&state->id_tx_rel_list.dl_mutex); 4638 mutex_exit(&state->id_tx_list.dl_mutex); 4639 4640 attempts = 10; 4641 while (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) { 4642 if (--attempts == 0) 4643 break; 4644 delay(drv_usectohz(100000)); 4645 } 4646 ibt_set_cq_handler(state->id_rcq_hdl, 0, 0); 4647 if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, 0) != 0) { 4648 cmn_err(CE_WARN, "rx resources not freed\n"); 4649 } 4650 4651 state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED); 4652 } 4653 4654 if (progress & IBD_DRV_ASYNC_THR_CREATED) { 4655 /* 4656 * No new async requests will be posted since the device 4657 * link state has been marked as unknown; completion handlers 4658 * have been turned off, so Tx handler will not cause any 4659 * more IBD_ASYNC_REAP requests. 4660 * 4661 * Queue a request for the async thread to exit, which will 4662 * be serviced after any pending ones. This can take a while, 4663 * specially if the SM is unreachable, since IBMF will slowly 4664 * timeout each SM request issued by the async thread. Reap 4665 * the thread before continuing on, we do not want it to be 4666 * lingering in modunloaded code (or we could move the reap 4667 * to ibd_detach(), provided we keep track of the current 4668 * id_async_thrid somewhere safe). 4669 */ 4670 ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT); 4671 thread_join(state->id_async_thrid); 4672 4673 state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED); 4674 } 4675 4676 if (progress & IBD_DRV_BCAST_GROUP_JOINED) { 4677 /* 4678 * Drop all residual full/non membership. This includes full 4679 * membership to the broadcast group, and any nonmembership 4680 * acquired during transmits. We do this after the Tx completion 4681 * handlers are done, since those might result in some late 4682 * leaves; this also eliminates a potential race with that 4683 * path wrt the mc full list insert/delete. Trap handling 4684 * has also been suppressed at this point. Thus, no locks 4685 * are required while traversing the mc full list. 
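 *
 * Note that the walk below saves mgid/jstate and advances the list
 * pointer before calling ibd_leave_group(), since the leave may tear
 * down the mce entry that was just examined.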
4686 */ 4687 DPRINT(2, "ibd_undo_start: clear full cache entries"); 4688 mce = list_head(&state->id_mc_full); 4689 while (mce != NULL) { 4690 mgid = mce->mc_info.mc_adds_vect.av_dgid; 4691 jstate = mce->mc_jstate; 4692 mce = list_next(&state->id_mc_full, mce); 4693 ibd_leave_group(state, mgid, jstate); 4694 } 4695 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED); 4696 } 4697 4698 if (progress & IBD_DRV_RXLIST_ALLOCD) { 4699 ibd_fini_rxlist(state); 4700 state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD); 4701 } 4702 4703 if (progress & IBD_DRV_TXLIST_ALLOCD) { 4704 ibd_fini_txlist(state); 4705 state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD); 4706 } 4707 4708 if (progress & IBD_DRV_UD_CHANNEL_SETUP) { 4709 if ((ret = ibt_free_channel(state->id_chnl_hdl)) != 4710 IBT_SUCCESS) { 4711 DPRINT(10, "ibd_undo_start: free_channel " 4712 "failed, ret=%d", ret); 4713 } 4714 4715 state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP); 4716 } 4717 4718 if (progress & IBD_DRV_CQS_ALLOCD) { 4719 kmem_free(state->id_txwcs, 4720 sizeof (ibt_wc_t) * state->id_txwcs_size); 4721 if ((ret = ibt_free_cq(state->id_scq_hdl)) != 4722 IBT_SUCCESS) { 4723 DPRINT(10, "ibd_undo_start: free_cq(scq) " 4724 "failed, ret=%d", ret); 4725 } 4726 4727 kmem_free(state->id_rxwcs, 4728 sizeof (ibt_wc_t) * state->id_rxwcs_size); 4729 if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) { 4730 DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, " 4731 "ret=%d", ret); 4732 } 4733 4734 state->id_txwcs = NULL; 4735 state->id_rxwcs = NULL; 4736 state->id_scq_hdl = NULL; 4737 state->id_rcq_hdl = NULL; 4738 4739 state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD); 4740 } 4741 4742 if (progress & IBD_DRV_ACACHE_INITIALIZED) { 4743 mutex_enter(&state->id_ac_mutex); 4744 mod_hash_destroy_hash(state->id_ah_active_hash); 4745 mutex_exit(&state->id_ac_mutex); 4746 ibd_acache_fini(state); 4747 4748 state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED); 4749 } 4750 4751 if (progress & IBD_DRV_BCAST_GROUP_FOUND) { 4752 /* 4753 * If we'd created the ipoib broadcast group and had 4754 * successfully joined it, leave it now 4755 */ 4756 if (state->id_bgroup_created) { 4757 mgid = state->id_mcinfo->mc_adds_vect.av_dgid; 4758 jstate = IB_MC_JSTATE_FULL; 4759 (void) ibt_leave_mcg(state->id_sgid, mgid, 4760 state->id_sgid, jstate); 4761 } 4762 ibt_free_mcg_info(state->id_mcinfo, 1); 4763 4764 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND); 4765 } 4766 4767 return (DDI_SUCCESS); 4768 } 4769 4770 /* 4771 * These pair of routines are used to set/clear the condition that 4772 * the caller is likely to do something to change the id_mac_state. 4773 * If there's already someone doing either a start or a stop (possibly 4774 * due to the async handler detecting a pkey relocation event, a plumb 4775 * or dlpi_open, or an unplumb or dlpi_close coming in), we wait until 4776 * that's done. 4777 */ 4778 static void 4779 ibd_set_mac_progress(ibd_state_t *state, uint_t flag) 4780 { 4781 mutex_enter(&state->id_macst_lock); 4782 while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS) 4783 cv_wait(&state->id_macst_cv, &state->id_macst_lock); 4784 4785 state->id_mac_state |= flag; 4786 mutex_exit(&state->id_macst_lock); 4787 } 4788 4789 static void 4790 ibd_clr_mac_progress(ibd_state_t *state, uint_t flag) 4791 { 4792 mutex_enter(&state->id_macst_lock); 4793 state->id_mac_state &= (~flag); 4794 cv_signal(&state->id_macst_cv); 4795 mutex_exit(&state->id_macst_lock); 4796 } 4797 4798 /* 4799 * GLDv3 entry point to start hardware. 
4800 */ 4801 /*ARGSUSED*/ 4802 static int 4803 ibd_m_start(void *arg) 4804 { 4805 ibd_state_t *state = arg; 4806 int ret; 4807 4808 ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 4809 4810 ret = ibd_start(state); 4811 4812 ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 4813 4814 return (ret); 4815 } 4816 4817 static int 4818 ibd_start(ibd_state_t *state) 4819 { 4820 kthread_t *kht; 4821 int err; 4822 ibt_status_t ret; 4823 4824 if (state->id_mac_state & IBD_DRV_STARTED) 4825 return (DDI_SUCCESS); 4826 4827 if (atomic_inc_32_nv(&state->id_running) != 1) { 4828 DPRINT(10, "ibd_start: id_running is non-zero"); 4829 cmn_err(CE_WARN, "ibd_start: id_running was not 0\n"); 4830 atomic_dec_32(&state->id_running); 4831 return (EINVAL); 4832 } 4833 4834 /* 4835 * Get port details; if we fail here, very likely the port 4836 * state is inactive or the pkey can't be verified. 4837 */ 4838 if ((err = ibd_get_port_details(state)) != 0) { 4839 DPRINT(10, "ibd_start: ibd_get_port_details() failed"); 4840 goto start_fail; 4841 } 4842 state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED; 4843 4844 /* 4845 * Find the IPoIB broadcast group 4846 */ 4847 if (ibd_find_bgroup(state) != IBT_SUCCESS) { 4848 DPRINT(10, "ibd_start: ibd_find_bgroup() failed"); 4849 err = ENOTACTIVE; 4850 goto start_fail; 4851 } 4852 state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND; 4853 4854 /* 4855 * Initialize per-interface caches and lists; if we fail here, 4856 * it is most likely due to a lack of resources 4857 */ 4858 if (ibd_acache_init(state) != DDI_SUCCESS) { 4859 DPRINT(10, "ibd_start: ibd_acache_init() failed"); 4860 err = ENOMEM; 4861 goto start_fail; 4862 } 4863 state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED; 4864 4865 /* 4866 * Allocate send and receive completion queues 4867 */ 4868 if (ibd_alloc_cqs(state) != DDI_SUCCESS) { 4869 DPRINT(10, "ibd_start: ibd_alloc_cqs() failed"); 4870 err = ENOMEM; 4871 goto start_fail; 4872 } 4873 state->id_mac_state |= IBD_DRV_CQS_ALLOCD; 4874 4875 /* 4876 * Setup a UD channel 4877 */ 4878 if (ibd_setup_ud_channel(state) != DDI_SUCCESS) { 4879 err = ENOMEM; 4880 DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed"); 4881 goto start_fail; 4882 } 4883 state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP; 4884 4885 /* 4886 * Allocate and initialize the tx buffer list 4887 */ 4888 if (ibd_init_txlist(state) != DDI_SUCCESS) { 4889 DPRINT(10, "ibd_start: ibd_init_txlist() failed"); 4890 err = ENOMEM; 4891 goto start_fail; 4892 } 4893 state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD; 4894 4895 /* 4896 * Create the send cq handler here 4897 */ 4898 ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state); 4899 if ((ret = ibt_enable_cq_notify(state->id_scq_hdl, 4900 IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { 4901 DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) " 4902 "failed, ret=%d", ret); 4903 err = EINVAL; 4904 goto start_fail; 4905 } 4906 state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED; 4907 4908 /* 4909 * Allocate and initialize the rx buffer list 4910 */ 4911 if (ibd_init_rxlist(state) != DDI_SUCCESS) { 4912 DPRINT(10, "ibd_start: ibd_init_rxlist() failed"); 4913 err = ENOMEM; 4914 goto start_fail; 4915 } 4916 state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD; 4917 4918 /* 4919 * Join IPoIB broadcast group 4920 */ 4921 if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) { 4922 DPRINT(10, "ibd_start: ibd_join_group() failed"); 4923 err = ENOTACTIVE; 4924 goto start_fail; 4925 } 4926 state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED; 4927 4928 /* 4929 * Create 
the async thread; thread_create never fails. 4930 */ 4931 kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0, 4932 TS_RUN, minclsyspri); 4933 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_async_thrid)) 4934 state->id_async_thrid = kht->t_did; 4935 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_async_thrid)) 4936 state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED; 4937 4938 /* 4939 * When we did mac_register() in ibd_attach(), we didn't register 4940 * the real macaddr and we didn't have the true port mtu. Now that 4941 * we're almost ready, set the local mac address and broadcast 4942 * addresses and update gldv3 about the real values of these 4943 * parameters. 4944 */ 4945 if (state->id_enable_rc) { 4946 ibd_h2n_mac(&state->id_macaddr, 4947 IBD_MAC_ADDR_RC + state->id_qpnum, 4948 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 4949 ibd_h2n_mac(&state->rc_macaddr_loopback, state->id_qpnum, 4950 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 4951 } else { 4952 ibd_h2n_mac(&state->id_macaddr, state->id_qpnum, 4953 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 4954 } 4955 ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK, 4956 state->id_mgid.gid_prefix, state->id_mgid.gid_guid); 4957 4958 if (!state->id_enable_rc) { 4959 (void) mac_maxsdu_update(state->id_mh, state->id_mtu 4960 - IPOIB_HDRSIZE); 4961 } 4962 mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr); 4963 4964 /* 4965 * Setup the receive cq handler 4966 */ 4967 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); 4968 if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl, 4969 IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { 4970 DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) " 4971 "failed, ret=%d", ret); 4972 err = EINVAL; 4973 goto start_fail; 4974 } 4975 state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED; 4976 4977 /* 4978 * Setup the subnet notices handler after we've initialized the acache/ 4979 * mcache and started the async thread, both of which are required for 4980 * the trap handler to function properly. 4981 * 4982 * Now that the async thread has been started (and we've already done 4983 * a mac_register() during attach so mac_tx_update() can be called 4984 * if necessary without any problem), we can enable the trap handler 4985 * to queue requests to the async thread. 
4986 */ 4987 ibt_register_subnet_notices(state->id_ibt_hdl, 4988 ibd_snet_notices_handler, state); 4989 mutex_enter(&state->id_trap_lock); 4990 state->id_trap_stop = B_FALSE; 4991 mutex_exit(&state->id_trap_lock); 4992 state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED; 4993 4994 if (state->id_enable_rc) { 4995 if (state->rc_enable_srq) { 4996 if (state->id_mac_state & IBD_DRV_RC_SRQ_ALLOCD) { 4997 if (ibd_rc_repost_srq_free_list(state) != 4998 IBT_SUCCESS) { 4999 err = ENOMEM; 5000 goto start_fail; 5001 } 5002 } else { 5003 /* Allocate SRQ resource */ 5004 if (ibd_rc_init_srq_list(state) != 5005 IBT_SUCCESS) { 5006 err = ENOMEM; 5007 goto start_fail; 5008 } 5009 state->id_mac_state |= IBD_DRV_RC_SRQ_ALLOCD; 5010 } 5011 } 5012 5013 if (ibd_rc_init_tx_largebuf_list(state) != IBT_SUCCESS) { 5014 DPRINT(10, "ibd_start: ibd_rc_init_tx_largebuf_list() " 5015 "failed"); 5016 err = ENOMEM; 5017 goto start_fail; 5018 } 5019 state->id_mac_state |= IBD_DRV_RC_LARGEBUF_ALLOCD; 5020 5021 /* RC: begin to listen only after everything is available */ 5022 if (ibd_rc_listen(state) != IBT_SUCCESS) { 5023 DPRINT(10, "ibd_start: ibd_rc_listen() failed"); 5024 err = EINVAL; 5025 goto start_fail; 5026 } 5027 state->id_mac_state |= IBD_DRV_RC_LISTEN; 5028 } 5029 5030 /* 5031 * Indicate link status to GLDv3 and higher layers. By default, 5032 * we assume we are in up state (which must have been true at 5033 * least at the time the broadcast mcg's were probed); if there 5034 * were any up/down transitions till the time we come here, the 5035 * async handler will have updated last known state, which we 5036 * use to tell GLDv3. The async handler will not send any 5037 * notifications to GLDv3 till we reach here in the initialization 5038 * sequence. 5039 */ 5040 state->id_mac_state |= IBD_DRV_STARTED; 5041 mac_link_update(state->id_mh, state->id_link_state); 5042 5043 return (DDI_SUCCESS); 5044 5045 start_fail: 5046 /* 5047 * If we ran into a problem during ibd_start() and ran into 5048 * some other problem during undoing our partial work, we can't 5049 * do anything about it. Ignore any errors we might get from 5050 * ibd_undo_start() and just return the original error we got. 5051 */ 5052 (void) ibd_undo_start(state, LINK_STATE_DOWN); 5053 return (err); 5054 } 5055 5056 /* 5057 * GLDv3 entry point to stop hardware from receiving packets. 5058 */ 5059 /*ARGSUSED*/ 5060 static void 5061 ibd_m_stop(void *arg) 5062 { 5063 ibd_state_t *state = (ibd_state_t *)arg; 5064 5065 ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS); 5066 5067 (void) ibd_undo_start(state, state->id_link_state); 5068 5069 ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS); 5070 } 5071 5072 /* 5073 * GLDv3 entry point to modify device's mac address. We do not 5074 * allow address modifications. 5075 */ 5076 static int 5077 ibd_m_unicst(void *arg, const uint8_t *macaddr) 5078 { 5079 ibd_state_t *state = arg; 5080 5081 /* 5082 * Don't bother even comparing the macaddr if we haven't 5083 * completed ibd_m_start(). 5084 */ 5085 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 5086 return (0); 5087 5088 if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0) 5089 return (0); 5090 else 5091 return (EINVAL); 5092 } 5093 5094 /* 5095 * The blocking part of the IBA join/leave operations are done out 5096 * of here on the async thread. 
5097 */ 5098 static void 5099 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op) 5100 { 5101 DPRINT(3, "ibd_async_multicast : async_setmc op %d :" 5102 "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid); 5103 5104 if (op == IBD_ASYNC_JOIN) { 5105 if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) { 5106 ibd_print_warn(state, "Join multicast group failed :" 5107 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); 5108 } 5109 } else { 5110 /* 5111 * Here, we must search for the proper mcg_info and 5112 * use that to leave the group. 5113 */ 5114 ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL); 5115 } 5116 } 5117 5118 /* 5119 * GLDv3 entry point for multicast enable/disable requests. 5120 * This function queues the operation to the async thread and 5121 * return success for a valid multicast address. 5122 */ 5123 static int 5124 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac) 5125 { 5126 ibd_state_t *state = (ibd_state_t *)arg; 5127 ipoib_mac_t maddr, *mcast; 5128 ib_gid_t mgid; 5129 ibd_req_t *req; 5130 5131 /* 5132 * If we haven't completed ibd_m_start(), async thread wouldn't 5133 * have been started and id_bcaddr wouldn't be set, so there's 5134 * no point in continuing. 5135 */ 5136 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 5137 return (0); 5138 5139 /* 5140 * The incoming multicast address might not be aligned properly 5141 * on a 4 byte boundary to be considered an ipoib_mac_t. We force 5142 * it to look like one though, to get the offsets of the mc gid, 5143 * since we know we are not going to dereference any values with 5144 * the ipoib_mac_t pointer. 5145 */ 5146 bcopy(mcmac, &maddr, sizeof (ipoib_mac_t)); 5147 mcast = &maddr; 5148 5149 /* 5150 * Check validity of MCG address. We could additionally check 5151 * that a enable/disable is not being issued on the "broadcast" 5152 * mcg, but since this operation is only invokable by privileged 5153 * programs anyway, we allow the flexibility to those dlpi apps. 5154 * Note that we do not validate the "scope" of the IBA mcg. 5155 */ 5156 if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN) 5157 return (EINVAL); 5158 5159 /* 5160 * fill in multicast pkey and scope 5161 */ 5162 IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey); 5163 5164 /* 5165 * If someone is trying to JOIN/LEAVE the broadcast group, we do 5166 * nothing (i.e. we stay JOINed to the broadcast group done in 5167 * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically 5168 * requires to be joined to broadcast groups at all times. 5169 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also 5170 * depends on this. 5171 */ 5172 if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0) 5173 return (0); 5174 5175 ibd_n2h_gid(mcast, &mgid); 5176 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 5177 if (req == NULL) 5178 return (ENOMEM); 5179 5180 req->rq_gid = mgid; 5181 5182 if (add) { 5183 DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n", 5184 mgid.gid_prefix, mgid.gid_guid); 5185 ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN); 5186 } else { 5187 DPRINT(1, "ibd_m_multicst : unset_multicast : " 5188 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); 5189 ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE); 5190 } 5191 return (0); 5192 } 5193 5194 /* 5195 * The blocking part of the IBA promiscuous operations are done 5196 * out of here on the async thread. The dlpireq parameter indicates 5197 * whether this invocation is due to a dlpi request or due to 5198 * a port up/down event. 
5199 */ 5200 static void 5201 ibd_async_unsetprom(ibd_state_t *state) 5202 { 5203 ibd_mce_t *mce = list_head(&state->id_mc_non); 5204 ib_gid_t mgid; 5205 5206 DPRINT(2, "ibd_async_unsetprom : async_unset_promisc"); 5207 5208 while (mce != NULL) { 5209 mgid = mce->mc_info.mc_adds_vect.av_dgid; 5210 mce = list_next(&state->id_mc_non, mce); 5211 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 5212 } 5213 state->id_prom_op = IBD_OP_NOTSTARTED; 5214 } 5215 5216 /* 5217 * The blocking part of the IBA promiscuous operations are done 5218 * out of here on the async thread. The dlpireq parameter indicates 5219 * whether this invocation is due to a dlpi request or due to 5220 * a port up/down event. 5221 */ 5222 static void 5223 ibd_async_setprom(ibd_state_t *state) 5224 { 5225 ibt_mcg_attr_t mcg_attr; 5226 ibt_mcg_info_t *mcg_info; 5227 ib_gid_t mgid; 5228 uint_t numg; 5229 int i; 5230 char ret = IBD_OP_COMPLETED; 5231 5232 DPRINT(2, "ibd_async_setprom : async_set_promisc"); 5233 5234 /* 5235 * Obtain all active MC groups on the IB fabric with 5236 * specified criteria (scope + Pkey + Qkey + mtu). 5237 */ 5238 bzero(&mcg_attr, sizeof (mcg_attr)); 5239 mcg_attr.mc_pkey = state->id_pkey; 5240 mcg_attr.mc_scope = state->id_scope; 5241 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 5242 mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu; 5243 mcg_attr.mc_mtu_req.r_selector = IBT_EQU; 5244 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) != 5245 IBT_SUCCESS) { 5246 ibd_print_warn(state, "Could not get list of IBA multicast " 5247 "groups"); 5248 ret = IBD_OP_ERRORED; 5249 goto done; 5250 } 5251 5252 /* 5253 * Iterate over the returned mcg's and join as NonMember 5254 * to the IP mcg's. 5255 */ 5256 for (i = 0; i < numg; i++) { 5257 /* 5258 * Do a NonMember JOIN on the MC group. 5259 */ 5260 mgid = mcg_info[i].mc_adds_vect.av_dgid; 5261 if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL) 5262 ibd_print_warn(state, "IBA promiscuous mode missed " 5263 "multicast gid %016llx:%016llx", 5264 (u_longlong_t)mgid.gid_prefix, 5265 (u_longlong_t)mgid.gid_guid); 5266 } 5267 5268 ibt_free_mcg_info(mcg_info, numg); 5269 DPRINT(4, "ibd_async_setprom : async_set_promisc completes"); 5270 done: 5271 state->id_prom_op = ret; 5272 } 5273 5274 /* 5275 * GLDv3 entry point for multicast promiscuous enable/disable requests. 5276 * GLDv3 assumes phys state receives more packets than multi state, 5277 * which is not true for IPoIB. Thus, treat the multi and phys 5278 * promiscuous states the same way to work with GLDv3's assumption. 5279 */ 5280 static int 5281 ibd_m_promisc(void *arg, boolean_t on) 5282 { 5283 ibd_state_t *state = (ibd_state_t *)arg; 5284 ibd_req_t *req; 5285 5286 /* 5287 * Async thread wouldn't have been started if we haven't 5288 * passed ibd_m_start() 5289 */ 5290 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 5291 return (0); 5292 5293 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 5294 if (req == NULL) 5295 return (ENOMEM); 5296 if (on) { 5297 DPRINT(1, "ibd_m_promisc : set_promisc : %d", on); 5298 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON); 5299 } else { 5300 DPRINT(1, "ibd_m_promisc : unset_promisc"); 5301 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF); 5302 } 5303 5304 return (0); 5305 } 5306 5307 /* 5308 * GLDv3 entry point for gathering statistics. 
5309 */ 5310 static int 5311 ibd_m_stat(void *arg, uint_t stat, uint64_t *val) 5312 { 5313 ibd_state_t *state = (ibd_state_t *)arg; 5314 5315 switch (stat) { 5316 case MAC_STAT_IFSPEED: 5317 *val = state->id_link_speed; 5318 break; 5319 case MAC_STAT_MULTIRCV: 5320 *val = state->id_multi_rcv; 5321 break; 5322 case MAC_STAT_BRDCSTRCV: 5323 *val = state->id_brd_rcv; 5324 break; 5325 case MAC_STAT_MULTIXMT: 5326 *val = state->id_multi_xmt; 5327 break; 5328 case MAC_STAT_BRDCSTXMT: 5329 *val = state->id_brd_xmt; 5330 break; 5331 case MAC_STAT_RBYTES: 5332 *val = state->id_rcv_bytes + state->rc_rcv_trans_byte 5333 + state->rc_rcv_copy_byte; 5334 break; 5335 case MAC_STAT_IPACKETS: 5336 *val = state->id_rcv_pkt + state->rc_rcv_trans_pkt 5337 + state->rc_rcv_copy_pkt; 5338 break; 5339 case MAC_STAT_OBYTES: 5340 *val = state->id_xmt_bytes + state->rc_xmt_bytes; 5341 break; 5342 case MAC_STAT_OPACKETS: 5343 *val = state->id_xmt_pkt + state->rc_xmt_small_pkt + 5344 state->rc_xmt_fragmented_pkt + 5345 state->rc_xmt_map_fail_pkt + state->rc_xmt_map_succ_pkt; 5346 break; 5347 case MAC_STAT_OERRORS: 5348 *val = state->id_ah_error; /* failed AH translation */ 5349 break; 5350 case MAC_STAT_IERRORS: 5351 *val = 0; 5352 break; 5353 case MAC_STAT_NOXMTBUF: 5354 *val = state->id_tx_short + state->rc_swqe_short + 5355 state->rc_xmt_buf_short; 5356 break; 5357 case MAC_STAT_NORCVBUF: 5358 default: 5359 return (ENOTSUP); 5360 } 5361 5362 return (0); 5363 } 5364 5365 static void 5366 ibd_async_txsched(ibd_state_t *state) 5367 { 5368 ibd_resume_transmission(state); 5369 } 5370 5371 static void 5372 ibd_resume_transmission(ibd_state_t *state) 5373 { 5374 int flag; 5375 int met_thresh = 0; 5376 int thresh = 0; 5377 int ret = -1; 5378 5379 mutex_enter(&state->id_sched_lock); 5380 if (state->id_sched_needed & IBD_RSRC_SWQE) { 5381 mutex_enter(&state->id_tx_list.dl_mutex); 5382 mutex_enter(&state->id_tx_rel_list.dl_mutex); 5383 met_thresh = state->id_tx_list.dl_cnt + 5384 state->id_tx_rel_list.dl_cnt; 5385 mutex_exit(&state->id_tx_rel_list.dl_mutex); 5386 mutex_exit(&state->id_tx_list.dl_mutex); 5387 thresh = IBD_FREE_SWQES_THRESH; 5388 flag = IBD_RSRC_SWQE; 5389 } else if (state->id_sched_needed & IBD_RSRC_LSOBUF) { 5390 ASSERT(state->id_lso != NULL); 5391 mutex_enter(&state->id_lso_lock); 5392 met_thresh = state->id_lso->bkt_nfree; 5393 thresh = IBD_FREE_LSOS_THRESH; 5394 mutex_exit(&state->id_lso_lock); 5395 flag = IBD_RSRC_LSOBUF; 5396 if (met_thresh > thresh) 5397 state->id_sched_lso_cnt++; 5398 } 5399 if (met_thresh > thresh) { 5400 state->id_sched_needed &= ~flag; 5401 state->id_sched_cnt++; 5402 ret = 0; 5403 } 5404 mutex_exit(&state->id_sched_lock); 5405 5406 if (ret == 0) 5407 mac_tx_update(state->id_mh); 5408 } 5409 5410 /* 5411 * Release the send wqe back into free list. 5412 */ 5413 static void 5414 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail, int n) 5415 { 5416 /* 5417 * Add back on Tx list for reuse. 5418 */ 5419 ASSERT(tail->swqe_next == NULL); 5420 mutex_enter(&state->id_tx_rel_list.dl_mutex); 5421 state->id_tx_rel_list.dl_pending_sends = B_FALSE; 5422 tail->swqe_next = state->id_tx_rel_list.dl_head; 5423 state->id_tx_rel_list.dl_head = SWQE_TO_WQE(head); 5424 state->id_tx_rel_list.dl_cnt += n; 5425 mutex_exit(&state->id_tx_rel_list.dl_mutex); 5426 } 5427 5428 /* 5429 * Acquire a send wqe from free list. 5430 * Returns error number and send wqe pointer. 
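 * (As implemented below, the routine returns the swqe pointer on
 * success; when the free list is empty it marks dl_pending_sends,
 * bumps id_tx_short and returns NULL.)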
5431 */ 5432 static ibd_swqe_t * 5433 ibd_acquire_swqe(ibd_state_t *state) 5434 { 5435 ibd_swqe_t *wqe; 5436 5437 mutex_enter(&state->id_tx_rel_list.dl_mutex); 5438 if (state->id_tx_rel_list.dl_head != NULL) { 5439 /* transfer id_tx_rel_list to id_tx_list */ 5440 state->id_tx_list.dl_head = 5441 state->id_tx_rel_list.dl_head; 5442 state->id_tx_list.dl_cnt = 5443 state->id_tx_rel_list.dl_cnt; 5444 state->id_tx_list.dl_pending_sends = B_FALSE; 5445 5446 /* clear id_tx_rel_list */ 5447 state->id_tx_rel_list.dl_head = NULL; 5448 state->id_tx_rel_list.dl_cnt = 0; 5449 mutex_exit(&state->id_tx_rel_list.dl_mutex); 5450 5451 wqe = WQE_TO_SWQE(state->id_tx_list.dl_head); 5452 state->id_tx_list.dl_cnt -= 1; 5453 state->id_tx_list.dl_head = wqe->swqe_next; 5454 } else { /* no free swqe */ 5455 mutex_exit(&state->id_tx_rel_list.dl_mutex); 5456 state->id_tx_list.dl_pending_sends = B_TRUE; 5457 DPRINT(5, "ibd_acquire_swqe: out of Tx wqe"); 5458 state->id_tx_short++; 5459 wqe = NULL; 5460 } 5461 return (wqe); 5462 } 5463 5464 static int 5465 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss, 5466 ibt_ud_dest_hdl_t ud_dest) 5467 { 5468 mblk_t *nmp; 5469 int iph_len, tcph_len; 5470 ibt_wr_lso_t *lso; 5471 uintptr_t ip_start, tcp_start; 5472 uint8_t *dst; 5473 uint_t pending, mblen; 5474 5475 /* 5476 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default; 5477 * we need to adjust it here for lso. 5478 */ 5479 lso = &(node->w_swr.wr.ud_lso); 5480 lso->lso_ud_dest = ud_dest; 5481 lso->lso_mss = mss; 5482 5483 /* 5484 * Calculate the LSO header size and set it in the UD LSO structure. 5485 * Note that the only assumption we make is that each of the IPoIB, 5486 * IP and TCP headers will be contained in a single mblk fragment; 5487 * together, the headers may span multiple mblk fragments. 5488 */ 5489 nmp = mp; 5490 ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE; 5491 if (ip_start >= (uintptr_t)(nmp->b_wptr)) { 5492 ip_start = (uintptr_t)nmp->b_cont->b_rptr 5493 + (ip_start - (uintptr_t)(nmp->b_wptr)); 5494 nmp = nmp->b_cont; 5495 5496 } 5497 iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start); 5498 5499 tcp_start = ip_start + iph_len; 5500 if (tcp_start >= (uintptr_t)(nmp->b_wptr)) { 5501 tcp_start = (uintptr_t)nmp->b_cont->b_rptr 5502 + (tcp_start - (uintptr_t)(nmp->b_wptr)); 5503 nmp = nmp->b_cont; 5504 } 5505 tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start); 5506 lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len; 5507 5508 /* 5509 * If the lso header fits entirely within a single mblk fragment, 5510 * we'll avoid an additional copy of the lso header here and just 5511 * pass the b_rptr of the mblk directly. 5512 * 5513 * If this isn't true, we'd have to allocate for it explicitly. 
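 *
 * With ordinary 20-byte IPv4 and TCP headers plus the IPoIB header,
 * lso_hdr_sz comes to a few dozen bytes, so the common case is that
 * the whole header sits in the first fragment and lso_hdr can simply
 * alias mp->b_rptr, with no copy here and nothing for
 * ibd_free_lsohdr() to release later.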
5514 */ 5515 if (lso->lso_hdr_sz <= MBLKL(mp)) { 5516 lso->lso_hdr = mp->b_rptr; 5517 } else { 5518 /* On work completion, remember to free this allocated hdr */ 5519 lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP); 5520 if (lso->lso_hdr == NULL) { 5521 DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, " 5522 "sz = %d", lso->lso_hdr_sz); 5523 lso->lso_hdr_sz = 0; 5524 lso->lso_mss = 0; 5525 return (-1); 5526 } 5527 } 5528 5529 /* 5530 * Copy in the lso header only if we need to 5531 */ 5532 if (lso->lso_hdr != mp->b_rptr) { 5533 dst = lso->lso_hdr; 5534 pending = lso->lso_hdr_sz; 5535 5536 for (nmp = mp; nmp && pending; nmp = nmp->b_cont) { 5537 mblen = MBLKL(nmp); 5538 if (pending > mblen) { 5539 bcopy(nmp->b_rptr, dst, mblen); 5540 dst += mblen; 5541 pending -= mblen; 5542 } else { 5543 bcopy(nmp->b_rptr, dst, pending); 5544 break; 5545 } 5546 } 5547 } 5548 5549 return (0); 5550 } 5551 5552 static void 5553 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp) 5554 { 5555 ibt_wr_lso_t *lso; 5556 5557 if ((!node) || (!mp)) 5558 return; 5559 5560 /* 5561 * Free any header space that we might've allocated if we 5562 * did an LSO 5563 */ 5564 if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) { 5565 lso = &(node->w_swr.wr.ud_lso); 5566 if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) { 5567 kmem_free(lso->lso_hdr, lso->lso_hdr_sz); 5568 lso->lso_hdr = NULL; 5569 lso->lso_hdr_sz = 0; 5570 } 5571 } 5572 } 5573 5574 static void 5575 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node) 5576 { 5577 uint_t i; 5578 uint_t num_posted; 5579 uint_t n_wrs; 5580 ibt_status_t ibt_status; 5581 ibt_send_wr_t wrs[IBD_MAX_TX_POST_MULTIPLE]; 5582 ibd_swqe_t *tx_head, *elem; 5583 ibd_swqe_t *nodes[IBD_MAX_TX_POST_MULTIPLE]; 5584 5585 /* post the one request, then check for more */ 5586 ibt_status = ibt_post_send(state->id_chnl_hdl, 5587 &node->w_swr, 1, NULL); 5588 if (ibt_status != IBT_SUCCESS) { 5589 ibd_print_warn(state, "ibd_post_send: " 5590 "posting one wr failed: ret=%d", ibt_status); 5591 ibd_tx_cleanup(state, node); 5592 } 5593 5594 tx_head = NULL; 5595 for (;;) { 5596 if (tx_head == NULL) { 5597 mutex_enter(&state->id_txpost_lock); 5598 tx_head = state->id_tx_head; 5599 if (tx_head == NULL) { 5600 state->id_tx_busy = 0; 5601 mutex_exit(&state->id_txpost_lock); 5602 return; 5603 } 5604 state->id_tx_head = NULL; 5605 mutex_exit(&state->id_txpost_lock); 5606 } 5607 5608 /* 5609 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs 5610 * at a time if possible, and keep posting them. 5611 */ 5612 for (n_wrs = 0, elem = tx_head; 5613 (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE); 5614 elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) { 5615 nodes[n_wrs] = elem; 5616 wrs[n_wrs] = elem->w_swr; 5617 } 5618 tx_head = elem; 5619 5620 ASSERT(n_wrs != 0); 5621 5622 /* 5623 * If posting fails for some reason, we'll never receive 5624 * completion intimation, so we'll need to cleanup. But 5625 * we need to make sure we don't clean up nodes whose 5626 * wrs have been successfully posted. We assume that the 5627 * hca driver returns on the first failure to post and 5628 * therefore the first 'num_posted' entries don't need 5629 * cleanup here. 
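 *
 * For example, if n_wrs is 8 and the HCA accepts only the first 5
 * requests (num_posted comes back as 5), nodes[5] through nodes[7]
 * are handed to ibd_tx_cleanup() below.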
5630 */ 5631 num_posted = 0; 5632 ibt_status = ibt_post_send(state->id_chnl_hdl, 5633 wrs, n_wrs, &num_posted); 5634 if (ibt_status != IBT_SUCCESS) { 5635 ibd_print_warn(state, "ibd_post_send: " 5636 "posting multiple wrs failed: " 5637 "requested=%d, done=%d, ret=%d", 5638 n_wrs, num_posted, ibt_status); 5639 5640 for (i = num_posted; i < n_wrs; i++) 5641 ibd_tx_cleanup(state, nodes[i]); 5642 } 5643 } 5644 } 5645 5646 static int 5647 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node, 5648 uint_t lsohdr_sz) 5649 { 5650 ibt_wr_ds_t *sgl; 5651 ibt_status_t ibt_status; 5652 mblk_t *nmp; 5653 mblk_t *data_mp; 5654 uchar_t *bufp; 5655 size_t blksize; 5656 size_t skip; 5657 size_t avail; 5658 uint_t pktsize; 5659 uint_t frag_len; 5660 uint_t pending_hdr; 5661 int nmblks; 5662 int i; 5663 5664 /* 5665 * Let's skip ahead to the data if this is LSO 5666 */ 5667 data_mp = mp; 5668 pending_hdr = 0; 5669 if (lsohdr_sz) { 5670 pending_hdr = lsohdr_sz; 5671 for (nmp = mp; nmp; nmp = nmp->b_cont) { 5672 frag_len = nmp->b_wptr - nmp->b_rptr; 5673 if (frag_len > pending_hdr) 5674 break; 5675 pending_hdr -= frag_len; 5676 } 5677 data_mp = nmp; /* start of data past lso header */ 5678 ASSERT(data_mp != NULL); 5679 } 5680 5681 /* 5682 * Calculate the size of message data and number of msg blocks 5683 */ 5684 pktsize = 0; 5685 for (nmblks = 0, nmp = data_mp; nmp != NULL; 5686 nmp = nmp->b_cont, nmblks++) { 5687 pktsize += MBLKL(nmp); 5688 } 5689 pktsize -= pending_hdr; 5690 5691 /* 5692 * We only do ibt_map_mem_iov() if the pktsize is above the 5693 * "copy-threshold", and if the number of mp fragments is less than 5694 * the maximum acceptable. 5695 */ 5696 if ((state->id_hca_res_lkey_capab) && 5697 (pktsize > IBD_TX_COPY_THRESH) && 5698 (nmblks < state->id_max_sqseg_hiwm)) { 5699 ibt_iov_t iov_arr[IBD_MAX_SQSEG]; 5700 ibt_iov_attr_t iov_attr; 5701 5702 iov_attr.iov_as = NULL; 5703 iov_attr.iov = iov_arr; 5704 iov_attr.iov_buf = NULL; 5705 iov_attr.iov_list_len = nmblks; 5706 iov_attr.iov_wr_nds = state->id_max_sqseg; 5707 iov_attr.iov_lso_hdr_sz = lsohdr_sz; 5708 iov_attr.iov_flags = IBT_IOV_SLEEP; 5709 5710 for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) { 5711 iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr; 5712 iov_arr[i].iov_len = MBLKL(nmp); 5713 if (i == 0) { 5714 iov_arr[i].iov_addr += pending_hdr; 5715 iov_arr[i].iov_len -= pending_hdr; 5716 } 5717 } 5718 5719 node->w_buftype = IBD_WQE_MAPPED; 5720 node->w_swr.wr_sgl = node->w_sgl; 5721 5722 ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr, 5723 (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl); 5724 if (ibt_status != IBT_SUCCESS) { 5725 ibd_print_warn(state, "ibd_send: ibt_map_mem_iov " 5726 "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status); 5727 goto ibd_copy_path; 5728 } 5729 5730 return (0); 5731 } 5732 5733 ibd_copy_path: 5734 if (pktsize <= state->id_tx_buf_sz) { 5735 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 5736 node->w_swr.wr_nds = 1; 5737 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 5738 node->w_buftype = IBD_WQE_TXBUF; 5739 5740 /* 5741 * Even though this is the copy path for transfers less than 5742 * id_tx_buf_sz, it could still be an LSO packet. If so, it 5743 * is possible the first data mblk fragment (data_mp) still 5744 * contains part of the LSO header that we need to skip. 
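 *
 * pending_hdr holds the number of LSO header bytes remaining in
 * data_mp; it is applied only to the first fragment and then reset
 * to zero for the rest of the chain.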
5745 */ 5746 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 5747 for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) { 5748 blksize = MBLKL(nmp) - pending_hdr; 5749 bcopy(nmp->b_rptr + pending_hdr, bufp, blksize); 5750 bufp += blksize; 5751 pending_hdr = 0; 5752 } 5753 5754 return (0); 5755 } 5756 5757 /* 5758 * Copy path for transfers greater than id_tx_buf_sz 5759 */ 5760 node->w_swr.wr_sgl = node->w_sgl; 5761 if (ibd_acquire_lsobufs(state, pktsize, 5762 node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) { 5763 DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed"); 5764 return (-1); 5765 } 5766 node->w_buftype = IBD_WQE_LSOBUF; 5767 5768 /* 5769 * Copy the larger-than-id_tx_buf_sz packet into a set of 5770 * fixed-sized, pre-mapped LSO buffers. Note that we might 5771 * need to skip part of the LSO header in the first fragment 5772 * as before. 5773 */ 5774 nmp = data_mp; 5775 skip = pending_hdr; 5776 for (i = 0; i < node->w_swr.wr_nds; i++) { 5777 sgl = node->w_swr.wr_sgl + i; 5778 bufp = (uchar_t *)(uintptr_t)sgl->ds_va; 5779 avail = IBD_LSO_BUFSZ; 5780 while (nmp && avail) { 5781 blksize = MBLKL(nmp) - skip; 5782 if (blksize > avail) { 5783 bcopy(nmp->b_rptr + skip, bufp, avail); 5784 skip += avail; 5785 avail = 0; 5786 } else { 5787 bcopy(nmp->b_rptr + skip, bufp, blksize); 5788 skip = 0; 5789 avail -= blksize; 5790 bufp += blksize; 5791 nmp = nmp->b_cont; 5792 } 5793 } 5794 } 5795 5796 return (0); 5797 } 5798 5799 /* 5800 * Schedule a completion queue polling to reap the resource we're 5801 * short on. If we implement the change to reap tx completions 5802 * in a separate thread, we'll need to wake up that thread here. 5803 */ 5804 static int 5805 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag) 5806 { 5807 ibd_req_t *req; 5808 5809 mutex_enter(&state->id_sched_lock); 5810 state->id_sched_needed |= resource_type; 5811 mutex_exit(&state->id_sched_lock); 5812 5813 /* 5814 * If we are asked to queue a work entry, we need to do it 5815 */ 5816 if (q_flag) { 5817 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 5818 if (req == NULL) 5819 return (-1); 5820 5821 ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED); 5822 } 5823 5824 return (0); 5825 } 5826 5827 /* 5828 * The passed in packet has this format: 5829 * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data 5830 */ 5831 static boolean_t 5832 ibd_send(ibd_state_t *state, mblk_t *mp) 5833 { 5834 ibd_ace_t *ace; 5835 ibd_swqe_t *node; 5836 ipoib_mac_t *dest; 5837 ib_header_info_t *ipibp; 5838 ip6_t *ip6h; 5839 uint_t pktsize; 5840 uint32_t mss; 5841 uint32_t hckflags; 5842 uint32_t lsoflags = 0; 5843 uint_t lsohdr_sz = 0; 5844 int ret, len; 5845 boolean_t dofree = B_FALSE; 5846 boolean_t rc; 5847 /* if (rc_chan == NULL) send by UD; else send by RC; */ 5848 ibd_rc_chan_t *rc_chan; 5849 int nmblks; 5850 mblk_t *nmp; 5851 5852 /* 5853 * If we aren't done with the device initialization and start, 5854 * we shouldn't be here. 5855 */ 5856 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 5857 return (B_FALSE); 5858 5859 /* 5860 * Obtain an address handle for the destination. 
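 *
 * ibd_acache_lookup() may return NULL with 'ret' set: EFAULT means
 * no path to the destination could be found and the packet is
 * dropped, while any other value means AH creation has been queued
 * to the async thread and the packet is returned to the stack to be
 * retried (see the error handling further below).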
5861 */ 5862 ipibp = (ib_header_info_t *)mp->b_rptr; 5863 dest = (ipoib_mac_t *)&ipibp->ib_dst; 5864 if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 5865 IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey); 5866 5867 rc_chan = NULL; 5868 ace = ibd_acache_lookup(state, dest, &ret, 1); 5869 if (state->id_enable_rc && (ace != NULL) && 5870 (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN))) { 5871 if (ace->ac_chan == NULL) { 5872 state->rc_null_conn++; 5873 } else { 5874 if (ace->ac_chan->chan_state == 5875 IBD_RC_STATE_ACT_ESTAB) { 5876 rc_chan = ace->ac_chan; 5877 mutex_enter(&rc_chan->tx_wqe_list.dl_mutex); 5878 node = WQE_TO_SWQE( 5879 rc_chan->tx_wqe_list.dl_head); 5880 if (node != NULL) { 5881 rc_chan->tx_wqe_list.dl_cnt -= 1; 5882 rc_chan->tx_wqe_list.dl_head = 5883 node->swqe_next; 5884 } else { 5885 node = ibd_rc_acquire_swqes(rc_chan); 5886 } 5887 mutex_exit(&rc_chan->tx_wqe_list.dl_mutex); 5888 5889 if (node == NULL) { 5890 state->rc_swqe_short++; 5891 mutex_enter(&state->id_sched_lock); 5892 state->id_sched_needed |= 5893 IBD_RSRC_RC_SWQE; 5894 mutex_exit(&state->id_sched_lock); 5895 ibd_dec_ref_ace(state, ace); 5896 return (B_FALSE); 5897 } 5898 } else { 5899 state->rc_no_estab_conn++; 5900 } 5901 } 5902 } 5903 5904 if (rc_chan == NULL) { 5905 mutex_enter(&state->id_tx_list.dl_mutex); 5906 node = WQE_TO_SWQE(state->id_tx_list.dl_head); 5907 if (node != NULL) { 5908 state->id_tx_list.dl_cnt -= 1; 5909 state->id_tx_list.dl_head = node->swqe_next; 5910 } else { 5911 node = ibd_acquire_swqe(state); 5912 } 5913 mutex_exit(&state->id_tx_list.dl_mutex); 5914 if (node == NULL) { 5915 /* 5916 * If we don't have an swqe available, schedule a 5917 * transmit completion queue cleanup and hold off on 5918 * sending more packets until we have some free swqes 5919 */ 5920 if (ibd_sched_poll(state, IBD_RSRC_SWQE, 0) == 0) { 5921 if (ace != NULL) { 5922 ibd_dec_ref_ace(state, ace); 5923 } 5924 return (B_FALSE); 5925 } 5926 5927 /* 5928 * If a poll cannot be scheduled, we have no choice but 5929 * to drop this packet 5930 */ 5931 ibd_print_warn(state, "ibd_send: no swqe, pkt drop"); 5932 if (ace != NULL) { 5933 ibd_dec_ref_ace(state, ace); 5934 } 5935 return (B_TRUE); 5936 } 5937 } 5938 5939 /* 5940 * Initialize the commonly used fields in swqe to NULL to protect 5941 * against ibd_tx_cleanup accidentally misinterpreting these on a 5942 * failure. 5943 */ 5944 node->swqe_im_mblk = NULL; 5945 node->w_swr.wr_nds = 0; 5946 node->w_swr.wr_sgl = NULL; 5947 node->w_swr.wr_opcode = IBT_WRC_SEND; 5948 5949 /* 5950 * Calculate the size of message data and number of msg blocks 5951 */ 5952 pktsize = 0; 5953 for (nmblks = 0, nmp = mp; nmp != NULL; 5954 nmp = nmp->b_cont, nmblks++) { 5955 pktsize += MBLKL(nmp); 5956 } 5957 5958 if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) 5959 atomic_inc_64(&state->id_brd_xmt); 5960 else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 5961 atomic_inc_64(&state->id_multi_xmt); 5962 5963 if (ace != NULL) { 5964 node->w_ahandle = ace; 5965 node->w_swr.wr.ud.udwr_dest = ace->ac_dest; 5966 } else { 5967 DPRINT(5, 5968 "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X", 5969 ((ret == EFAULT) ? 
"failed" : "queued"), 5970 htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]), 5971 htonl(dest->ipoib_gidpref[1]), 5972 htonl(dest->ipoib_gidsuff[0]), 5973 htonl(dest->ipoib_gidsuff[1])); 5974 state->rc_ace_not_found++; 5975 node->w_ahandle = NULL; 5976 5977 /* 5978 * Here if ibd_acache_lookup() returns EFAULT, it means ibd 5979 * can not find a path for the specific dest address. We 5980 * should get rid of this kind of packet. We also should get 5981 * rid of the packet if we cannot schedule a poll via the 5982 * async thread. For the normal case, ibd will return the 5983 * packet to upper layer and wait for AH creating. 5984 * 5985 * Note that we always queue a work slot entry for the async 5986 * thread when we fail AH lookup (even in intr mode); this is 5987 * due to the convoluted way the code currently looks for AH. 5988 */ 5989 if (ret == EFAULT) { 5990 dofree = B_TRUE; 5991 rc = B_TRUE; 5992 } else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) { 5993 dofree = B_TRUE; 5994 rc = B_TRUE; 5995 } else { 5996 dofree = B_FALSE; 5997 rc = B_FALSE; 5998 } 5999 goto ibd_send_fail; 6000 } 6001 6002 /* 6003 * For ND6 packets, padding is at the front of the source lladdr. 6004 * Insert the padding at front. 6005 */ 6006 if (ntohs(ipibp->ipib_rhdr.ipoib_type) == ETHERTYPE_IPV6) { 6007 if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) { 6008 if (!pullupmsg(mp, IPV6_HDR_LEN + 6009 sizeof (ib_header_info_t))) { 6010 DPRINT(10, "ibd_send: pullupmsg failure "); 6011 dofree = B_TRUE; 6012 rc = B_TRUE; 6013 goto ibd_send_fail; 6014 } 6015 ipibp = (ib_header_info_t *)mp->b_rptr; 6016 } 6017 ip6h = (ip6_t *)((uchar_t *)ipibp + 6018 sizeof (ib_header_info_t)); 6019 len = ntohs(ip6h->ip6_plen); 6020 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 6021 mblk_t *pad; 6022 6023 pad = allocb(4, 0); 6024 pad->b_wptr = (uchar_t *)pad->b_rptr + 4; 6025 linkb(mp, pad); 6026 if (MBLKL(mp) < sizeof (ib_header_info_t) + 6027 IPV6_HDR_LEN + len + 4) { 6028 if (!pullupmsg(mp, sizeof (ib_header_info_t) + 6029 IPV6_HDR_LEN + len + 4)) { 6030 DPRINT(10, "ibd_send: pullupmsg " 6031 "failure "); 6032 dofree = B_TRUE; 6033 rc = B_TRUE; 6034 goto ibd_send_fail; 6035 } 6036 ip6h = (ip6_t *)((uchar_t *)mp->b_rptr + 6037 sizeof (ib_header_info_t)); 6038 } 6039 6040 /* LINTED: E_CONSTANT_CONDITION */ 6041 IBD_PAD_NSNA(ip6h, len, IBD_SEND); 6042 } 6043 } 6044 6045 ASSERT(mp->b_wptr - mp->b_rptr >= sizeof (ib_addrs_t)); 6046 mp->b_rptr += sizeof (ib_addrs_t); 6047 pktsize -= sizeof (ib_addrs_t); 6048 6049 if (rc_chan) { /* send in RC mode */ 6050 ibt_iov_t iov_arr[IBD_MAX_SQSEG]; 6051 ibt_iov_attr_t iov_attr; 6052 uint_t i; 6053 size_t blksize; 6054 uchar_t *bufp; 6055 ibd_rc_tx_largebuf_t *lbufp; 6056 6057 atomic_add_64(&state->rc_xmt_bytes, pktsize); 6058 6059 /* 6060 * Upper layer does Tx checksum, we don't need do any 6061 * checksum here. 6062 */ 6063 ASSERT(node->w_swr.wr_trans == IBT_RC_SRV); 6064 6065 /* 6066 * We only do ibt_map_mem_iov() if the pktsize is above 6067 * the "copy-threshold", and if the number of mp 6068 * fragments is less than the maximum acceptable. 6069 */ 6070 if (pktsize <= ibd_rc_tx_copy_thresh) { 6071 atomic_inc_64(&state->rc_xmt_small_pkt); 6072 /* 6073 * Only process unicast packet in Reliable Connected 6074 * mode. 
6075 */ 6076 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 6077 node->w_swr.wr_nds = 1; 6078 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 6079 node->w_buftype = IBD_WQE_TXBUF; 6080 6081 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 6082 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { 6083 blksize = MBLKL(nmp); 6084 bcopy(nmp->b_rptr, bufp, blksize); 6085 bufp += blksize; 6086 } 6087 freemsg(mp); 6088 ASSERT(node->swqe_im_mblk == NULL); 6089 } else { 6090 if ((state->rc_enable_iov_map) && 6091 (nmblks < state->rc_max_sqseg_hiwm)) { 6092 6093 /* do ibt_map_mem_iov() */ 6094 iov_attr.iov_as = NULL; 6095 iov_attr.iov = iov_arr; 6096 iov_attr.iov_buf = NULL; 6097 iov_attr.iov_wr_nds = state->rc_tx_max_sqseg; 6098 iov_attr.iov_lso_hdr_sz = 0; 6099 iov_attr.iov_flags = IBT_IOV_SLEEP; 6100 6101 i = 0; 6102 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) { 6103 iov_arr[i].iov_len = MBLKL(nmp); 6104 if (iov_arr[i].iov_len != 0) { 6105 iov_arr[i].iov_addr = (caddr_t) 6106 (void *)nmp->b_rptr; 6107 i++; 6108 } 6109 } 6110 iov_attr.iov_list_len = i; 6111 node->w_swr.wr_sgl = node->w_sgl; 6112 6113 ret = ibt_map_mem_iov(state->id_hca_hdl, 6114 &iov_attr, (ibt_all_wr_t *)&node->w_swr, 6115 &node->w_mi_hdl); 6116 if (ret != IBT_SUCCESS) { 6117 atomic_inc_64( 6118 &state->rc_xmt_map_fail_pkt); 6119 DPRINT(30, "ibd_send: ibt_map_mem_iov(" 6120 ") failed, nmblks=%d, real_nmblks" 6121 "=%d, ret=0x%x", nmblks, i, ret); 6122 goto ibd_rc_large_copy; 6123 } 6124 6125 atomic_inc_64(&state->rc_xmt_map_succ_pkt); 6126 node->w_buftype = IBD_WQE_MAPPED; 6127 node->swqe_im_mblk = mp; 6128 } else { 6129 atomic_inc_64(&state->rc_xmt_fragmented_pkt); 6130 ibd_rc_large_copy: 6131 mutex_enter(&state->rc_tx_large_bufs_lock); 6132 if (state->rc_tx_largebuf_nfree == 0) { 6133 state->rc_xmt_buf_short++; 6134 mutex_exit 6135 (&state->rc_tx_large_bufs_lock); 6136 mutex_enter(&state->id_sched_lock); 6137 state->id_sched_needed |= 6138 IBD_RSRC_RC_TX_LARGEBUF; 6139 mutex_exit(&state->id_sched_lock); 6140 dofree = B_FALSE; 6141 rc = B_FALSE; 6142 /* 6143 * If we don't have Tx large bufs, 6144 * return failure. 
node->w_buftype
6145 * should not be IBD_WQE_RC_COPYBUF,
6146 * otherwise it will cause problems
6147 * in ibd_rc_tx_cleanup().
6148 */
6149 node->w_buftype = IBD_WQE_TXBUF;
6150 goto ibd_send_fail;
6151 }
6152
6153 lbufp = state->rc_tx_largebuf_free_head;
6154 ASSERT(lbufp->lb_buf != NULL);
6155 state->rc_tx_largebuf_free_head =
6156 lbufp->lb_next;
6157 lbufp->lb_next = NULL;
6158 /* Update nfree count */
6159 state->rc_tx_largebuf_nfree--;
6160 mutex_exit(&state->rc_tx_large_bufs_lock);
6161 bufp = lbufp->lb_buf;
6162 node->w_sgl[0].ds_va =
6163 (ib_vaddr_t)(uintptr_t)bufp;
6164 node->w_sgl[0].ds_key =
6165 state->rc_tx_mr_desc.md_lkey;
6166 node->w_sgl[0].ds_len = pktsize;
6167 node->w_swr.wr_sgl = node->w_sgl;
6168 node->w_swr.wr_nds = 1;
6169 node->w_buftype = IBD_WQE_RC_COPYBUF;
6170 node->w_rc_tx_largebuf = lbufp;
6171
6172 for (nmp = mp; nmp != NULL; nmp = nmp->b_cont) {
6173 blksize = MBLKL(nmp);
6174 if (blksize != 0) {
6175 bcopy(nmp->b_rptr, bufp,
6176 blksize);
6177 bufp += blksize;
6178 }
6179 }
6180 freemsg(mp);
6181 ASSERT(node->swqe_im_mblk == NULL);
6182 }
6183 }
6184
6185 node->swqe_next = NULL;
6186 mutex_enter(&rc_chan->tx_post_lock);
6187 if (rc_chan->tx_busy) {
6188 if (rc_chan->tx_head) {
6189 rc_chan->tx_tail->swqe_next =
6190 SWQE_TO_WQE(node);
6191 } else {
6192 rc_chan->tx_head = node;
6193 }
6194 rc_chan->tx_tail = node;
6195 mutex_exit(&rc_chan->tx_post_lock);
6196 } else {
6197 rc_chan->tx_busy = 1;
6198 mutex_exit(&rc_chan->tx_post_lock);
6199 ibd_rc_post_send(rc_chan, node);
6200 }
6201
6202 return (B_TRUE);
6203 } /* send by RC */
6204
6205 if ((state->id_enable_rc) && (pktsize > state->id_mtu)) {
6206 /*
6207 * The packet is too long. The packet size from GLD should be
6208 * <= state->id_mtu + sizeof (ib_addrs_t).
6209 */
6210 if (ace->ac_mac.ipoib_qpn != htonl(IB_MC_QPN)) {
6211 ibd_req_t *req;
6212
6213 mutex_enter(&ace->tx_too_big_mutex);
6214 if (ace->tx_too_big_ongoing) {
6215 mutex_exit(&ace->tx_too_big_mutex);
6216 state->rc_xmt_reenter_too_long_pkt++;
6217 dofree = B_TRUE;
6218 } else {
6219 ace->tx_too_big_ongoing = B_TRUE;
6220 mutex_exit(&ace->tx_too_big_mutex);
6221 state->rc_xmt_icmp_too_long_pkt++;
6222
6223 req = kmem_cache_alloc(state->id_req_kmc,
6224 KM_NOSLEEP);
6225 if (req == NULL) {
6226 ibd_print_warn(state, "ibd_send: alloc "
6227 "ibd_req_t fail");
6228 /* Drop it. */
6229 dofree = B_TRUE;
6230 } else {
6231 req->rq_ptr = mp;
6232 req->rq_ptr2 = ace;
6233 ibd_queue_work_slot(state, req,
6234 IBD_ASYNC_RC_TOO_BIG);
6235 dofree = B_FALSE;
6236 }
6237 }
6238 } else {
6239 ibd_print_warn(state, "Reliable Connected mode is on. "
6240 "Multicast packet length %d > %d is too long to "
6241 "send, drop it",
6242 pktsize, state->id_mtu);
6243 state->rc_xmt_drop_too_long_pkt++;
6244 /* Drop it. */
6245 dofree = B_TRUE;
6246 }
6247 rc = B_TRUE;
6248 goto ibd_send_fail;
6249 }
6250
6251 atomic_add_64(&state->id_xmt_bytes, pktsize);
6252 atomic_inc_64(&state->id_xmt_pkt);
6253
6254 /*
6255 * Do LSO and checksum related work here. For LSO send, adjust the
6256 * ud destination, the opcode and the LSO header information in the
6257 * work request.
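 *
 * When the stack sets HW_LSO, the opcode is switched to
 * IBT_WRC_SEND_LSO and ibd_setup_lso() fills in the wr.ud_lso part
 * of the work request (MSS, LSO header pointer and header size).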
6258 */ 6259 lso_info_get(mp, &mss, &lsoflags); 6260 if ((lsoflags & HW_LSO) != HW_LSO) { 6261 node->w_swr.wr_opcode = IBT_WRC_SEND; 6262 lsohdr_sz = 0; 6263 } else { 6264 if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) { 6265 /* 6266 * The routine can only fail if there's no memory; we 6267 * can only drop the packet if this happens 6268 */ 6269 ibd_print_warn(state, 6270 "ibd_send: no memory, lso posting failed"); 6271 dofree = B_TRUE; 6272 rc = B_TRUE; 6273 goto ibd_send_fail; 6274 } 6275 6276 node->w_swr.wr_opcode = IBT_WRC_SEND_LSO; 6277 lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz; 6278 } 6279 6280 hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &hckflags); 6281 if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM) 6282 node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM; 6283 else 6284 node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM; 6285 6286 /* 6287 * Prepare the sgl for posting; the routine can only fail if there's 6288 * no lso buf available for posting. If this is the case, we should 6289 * probably resched for lso bufs to become available and then try again. 6290 */ 6291 if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) { 6292 if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) { 6293 dofree = B_TRUE; 6294 rc = B_TRUE; 6295 } else { 6296 dofree = B_FALSE; 6297 rc = B_FALSE; 6298 } 6299 goto ibd_send_fail; 6300 } 6301 node->swqe_im_mblk = mp; 6302 6303 /* 6304 * Queue the wqe to hardware; since we can now simply queue a 6305 * post instead of doing it serially, we cannot assume anything 6306 * about the 'node' after ibd_post_send() returns. 6307 */ 6308 node->swqe_next = NULL; 6309 6310 mutex_enter(&state->id_txpost_lock); 6311 if (state->id_tx_busy) { 6312 if (state->id_tx_head) { 6313 state->id_tx_tail->swqe_next = 6314 SWQE_TO_WQE(node); 6315 } else { 6316 state->id_tx_head = node; 6317 } 6318 state->id_tx_tail = node; 6319 mutex_exit(&state->id_txpost_lock); 6320 } else { 6321 state->id_tx_busy = 1; 6322 mutex_exit(&state->id_txpost_lock); 6323 ibd_post_send(state, node); 6324 } 6325 6326 return (B_TRUE); 6327 6328 ibd_send_fail: 6329 if (node && mp) 6330 ibd_free_lsohdr(node, mp); 6331 6332 if (dofree) 6333 freemsg(mp); 6334 6335 if (node != NULL) { 6336 if (rc_chan) { 6337 ibd_rc_tx_cleanup(node); 6338 } else { 6339 ibd_tx_cleanup(state, node); 6340 } 6341 } 6342 6343 return (rc); 6344 } 6345 6346 /* 6347 * GLDv3 entry point for transmitting datagram. 6348 */ 6349 static mblk_t * 6350 ibd_m_tx(void *arg, mblk_t *mp) 6351 { 6352 ibd_state_t *state = (ibd_state_t *)arg; 6353 mblk_t *next; 6354 6355 if (state->id_link_state != LINK_STATE_UP) { 6356 freemsgchain(mp); 6357 mp = NULL; 6358 } 6359 6360 while (mp != NULL) { 6361 next = mp->b_next; 6362 mp->b_next = NULL; 6363 if (ibd_send(state, mp) == B_FALSE) { 6364 /* Send fail */ 6365 mp->b_next = next; 6366 break; 6367 } 6368 mp = next; 6369 } 6370 6371 return (mp); 6372 } 6373 6374 /* 6375 * this handles Tx and Rx completions. With separate CQs, this handles 6376 * only Rx completions. 
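 *
 * With separate CQs, Tx completions are reaped through
 * ibd_poll_scq() instead, e.g. from the ibd_tx_recycle() handler.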
6377 */ 6378 static uint_t 6379 ibd_intr(caddr_t arg) 6380 { 6381 ibd_state_t *state = (ibd_state_t *)arg; 6382 6383 ibd_poll_rcq(state, state->id_rcq_hdl); 6384 6385 return (DDI_INTR_CLAIMED); 6386 } 6387 6388 /* 6389 * Poll and fully drain the send cq 6390 */ 6391 static void 6392 ibd_drain_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 6393 { 6394 ibt_wc_t *wcs = state->id_txwcs; 6395 uint_t numwcs = state->id_txwcs_size; 6396 ibd_wqe_t *wqe; 6397 ibd_swqe_t *head, *tail; 6398 ibt_wc_t *wc; 6399 uint_t num_polled; 6400 int i; 6401 6402 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) { 6403 head = tail = NULL; 6404 for (i = 0, wc = wcs; i < num_polled; i++, wc++) { 6405 wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id; 6406 if (wc->wc_status != IBT_WC_SUCCESS) { 6407 /* 6408 * Channel being torn down. 6409 */ 6410 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { 6411 DPRINT(5, "ibd_drain_scq: flush error"); 6412 DPRINT(10, "ibd_drain_scq: Bad " 6413 "status %d", wc->wc_status); 6414 } else { 6415 DPRINT(10, "ibd_drain_scq: " 6416 "unexpected wc_status %d", 6417 wc->wc_status); 6418 } 6419 /* 6420 * Fallthrough to invoke the Tx handler to 6421 * release held resources, e.g., AH refcount. 6422 */ 6423 } 6424 /* 6425 * Add this swqe to the list to be cleaned up. 6426 */ 6427 if (head) 6428 tail->swqe_next = wqe; 6429 else 6430 head = WQE_TO_SWQE(wqe); 6431 tail = WQE_TO_SWQE(wqe); 6432 } 6433 tail->swqe_next = NULL; 6434 ibd_tx_cleanup_list(state, head, tail); 6435 6436 /* 6437 * Resume any blocked transmissions if possible 6438 */ 6439 ibd_resume_transmission(state); 6440 } 6441 } 6442 6443 /* 6444 * Poll and fully drain the receive cq 6445 */ 6446 static void 6447 ibd_drain_rcq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 6448 { 6449 ibt_wc_t *wcs = state->id_rxwcs; 6450 uint_t numwcs = state->id_rxwcs_size; 6451 ibd_rwqe_t *rwqe; 6452 ibt_wc_t *wc; 6453 uint_t num_polled; 6454 int i; 6455 mblk_t *head, *tail, *mp; 6456 6457 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) { 6458 head = tail = NULL; 6459 for (i = 0, wc = wcs; i < num_polled; i++, wc++) { 6460 rwqe = (ibd_rwqe_t *)(uintptr_t)wc->wc_id; 6461 if (wc->wc_status != IBT_WC_SUCCESS) { 6462 /* 6463 * Channel being torn down. 6464 */ 6465 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { 6466 DPRINT(5, "ibd_drain_rcq: " 6467 "expected flushed rwqe"); 6468 } else { 6469 DPRINT(5, "ibd_drain_rcq: " 6470 "unexpected wc_status %d", 6471 wc->wc_status); 6472 } 6473 atomic_inc_32( 6474 &state->id_rx_list.dl_bufs_outstanding); 6475 freemsg(rwqe->rwqe_im_mblk); 6476 continue; 6477 } 6478 mp = ibd_process_rx(state, rwqe, wc); 6479 if (mp == NULL) 6480 continue; 6481 6482 /* 6483 * Add this mp to the list to send to the nw layer. 6484 */ 6485 if (head) 6486 tail->b_next = mp; 6487 else 6488 head = mp; 6489 tail = mp; 6490 } 6491 if (head) 6492 mac_rx(state->id_mh, state->id_rh, head); 6493 6494 /* 6495 * Account for #rwqes polled. 6496 * Post more here, if less than one fourth full. 6497 */ 6498 if (atomic_add_32_nv(&state->id_rx_list.dl_cnt, -num_polled) < 6499 (state->id_num_rwqe / 4)) 6500 ibd_post_recv_intr(state); 6501 } 6502 } 6503 6504 /* 6505 * Common code for interrupt handling as well as for polling 6506 * for all completed wqe's while detaching. 
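 *
 * The id_scq_poll_busy flags serialize pollers: a thread that finds
 * another poller active merely sets IBD_REDO_CQ_POLLING and leaves,
 * and the active poller keeps re-draining the CQ until no redo
 * request is pending.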
6507 */ 6508 static void 6509 ibd_poll_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 6510 { 6511 int flag, redo_flag; 6512 int redo = 1; 6513 6514 flag = IBD_CQ_POLLING; 6515 redo_flag = IBD_REDO_CQ_POLLING; 6516 6517 mutex_enter(&state->id_scq_poll_lock); 6518 if (state->id_scq_poll_busy & flag) { 6519 ibd_print_warn(state, "ibd_poll_scq: multiple polling threads"); 6520 state->id_scq_poll_busy |= redo_flag; 6521 mutex_exit(&state->id_scq_poll_lock); 6522 return; 6523 } 6524 state->id_scq_poll_busy |= flag; 6525 mutex_exit(&state->id_scq_poll_lock); 6526 6527 /* 6528 * In some cases (eg detaching), this code can be invoked on 6529 * any cpu after disabling cq notification (thus no concurrency 6530 * exists). Apart from that, the following applies normally: 6531 * Transmit completion handling could be from any cpu if 6532 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ 6533 * is interrupt driven. 6534 */ 6535 6536 /* 6537 * Poll and drain the CQ 6538 */ 6539 ibd_drain_scq(state, cq_hdl); 6540 6541 /* 6542 * Enable CQ notifications and redrain the cq to catch any 6543 * completions we might have missed after the ibd_drain_scq() 6544 * above and before the ibt_enable_cq_notify() that follows. 6545 * Finally, service any new requests to poll the cq that 6546 * could've come in after the ibt_enable_cq_notify(). 6547 */ 6548 do { 6549 if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) != 6550 IBT_SUCCESS) { 6551 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); 6552 } 6553 6554 ibd_drain_scq(state, cq_hdl); 6555 6556 mutex_enter(&state->id_scq_poll_lock); 6557 if (state->id_scq_poll_busy & redo_flag) 6558 state->id_scq_poll_busy &= ~redo_flag; 6559 else { 6560 state->id_scq_poll_busy &= ~flag; 6561 redo = 0; 6562 } 6563 mutex_exit(&state->id_scq_poll_lock); 6564 6565 } while (redo); 6566 } 6567 6568 /* 6569 * Common code for interrupt handling as well as for polling 6570 * for all completed wqe's while detaching. 6571 */ 6572 static void 6573 ibd_poll_rcq(ibd_state_t *state, ibt_cq_hdl_t rcq) 6574 { 6575 int flag, redo_flag; 6576 int redo = 1; 6577 6578 flag = IBD_CQ_POLLING; 6579 redo_flag = IBD_REDO_CQ_POLLING; 6580 6581 mutex_enter(&state->id_rcq_poll_lock); 6582 if (state->id_rcq_poll_busy & flag) { 6583 ibd_print_warn(state, "ibd_poll_rcq: multiple polling threads"); 6584 state->id_rcq_poll_busy |= redo_flag; 6585 mutex_exit(&state->id_rcq_poll_lock); 6586 return; 6587 } 6588 state->id_rcq_poll_busy |= flag; 6589 mutex_exit(&state->id_rcq_poll_lock); 6590 6591 /* 6592 * Poll and drain the CQ 6593 */ 6594 ibd_drain_rcq(state, rcq); 6595 6596 /* 6597 * Enable CQ notifications and redrain the cq to catch any 6598 * completions we might have missed after the ibd_drain_cq() 6599 * above and before the ibt_enable_cq_notify() that follows. 6600 * Finally, service any new requests to poll the cq that 6601 * could've come in after the ibt_enable_cq_notify(). 6602 */ 6603 do { 6604 if (ibt_enable_cq_notify(rcq, IBT_NEXT_COMPLETION) != 6605 IBT_SUCCESS) { 6606 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); 6607 } 6608 6609 ibd_drain_rcq(state, rcq); 6610 6611 mutex_enter(&state->id_rcq_poll_lock); 6612 if (state->id_rcq_poll_busy & redo_flag) 6613 state->id_rcq_poll_busy &= ~redo_flag; 6614 else { 6615 state->id_rcq_poll_busy &= ~flag; 6616 redo = 0; 6617 } 6618 mutex_exit(&state->id_rcq_poll_lock); 6619 6620 } while (redo); 6621 } 6622 6623 /* 6624 * Unmap the memory area associated with a given swqe. 
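 *
 * This applies to IBD_WQE_MAPPED sends set up via ibt_map_mem_iov();
 * it releases the w_mi_hdl mapping (if any) and clears the SGL
 * count.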
6625 */ 6626 void 6627 ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe) 6628 { 6629 ibt_status_t stat; 6630 6631 DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds); 6632 6633 if (swqe->w_mi_hdl) { 6634 if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl, 6635 swqe->w_mi_hdl)) != IBT_SUCCESS) { 6636 DPRINT(10, 6637 "failed in ibt_unmap_mem_iov, ret=%d\n", stat); 6638 } 6639 swqe->w_mi_hdl = NULL; 6640 } 6641 swqe->w_swr.wr_nds = 0; 6642 } 6643 6644 void 6645 ibd_dec_ref_ace(ibd_state_t *state, ibd_ace_t *ace) 6646 { 6647 /* 6648 * The recycling logic can be eliminated from here 6649 * and put into the async thread if we create another 6650 * list to hold ACE's for unjoined mcg's. 6651 */ 6652 if (DEC_REF_DO_CYCLE(ace)) { 6653 ibd_mce_t *mce; 6654 6655 /* 6656 * Check with the lock taken: we decremented 6657 * reference count without the lock, and some 6658 * transmitter might already have bumped the 6659 * reference count (possible in case of multicast 6660 * disable when we leave the AH on the active 6661 * list). If not still 0, get out, leaving the 6662 * recycle bit intact. 6663 * 6664 * Atomically transition the AH from active 6665 * to free list, and queue a work request to 6666 * leave the group and destroy the mce. No 6667 * transmitter can be looking at the AH or 6668 * the MCE in between, since we have the 6669 * ac_mutex lock. In the SendOnly reap case, 6670 * it is not necessary to hold the ac_mutex 6671 * and recheck the ref count (since the AH was 6672 * taken off the active list), we just do it 6673 * to have uniform processing with the Full 6674 * reap case. 6675 */ 6676 mutex_enter(&state->id_ac_mutex); 6677 mce = ace->ac_mce; 6678 if (GET_REF_CYCLE(ace) == 0) { 6679 CLEAR_REFCYCLE(ace); 6680 /* 6681 * Identify the case of fullmember reap as 6682 * opposed to mcg trap reap. Also, port up 6683 * might set ac_mce to NULL to indicate Tx 6684 * cleanup should do no more than put the 6685 * AH in the free list (see ibd_async_link). 6686 */ 6687 if (mce != NULL) { 6688 ace->ac_mce = NULL; 6689 IBD_ACACHE_PULLOUT_ACTIVE(state, ace); 6690 /* 6691 * mc_req was initialized at mce 6692 * creation time. 6693 */ 6694 ibd_queue_work_slot(state, 6695 &mce->mc_req, IBD_ASYNC_REAP); 6696 } 6697 IBD_ACACHE_INSERT_FREE(state, ace); 6698 } 6699 mutex_exit(&state->id_ac_mutex); 6700 } 6701 } 6702 6703 /* 6704 * Common code that deals with clean ups after a successful or 6705 * erroneous transmission attempt. 6706 */ 6707 static void 6708 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe) 6709 { 6710 ibd_ace_t *ace = swqe->w_ahandle; 6711 6712 DPRINT(20, "ibd_tx_cleanup %p\n", swqe); 6713 6714 /* 6715 * If this was a dynamic mapping in ibd_send(), we need to 6716 * unmap here. If this was an lso buffer we'd used for sending, 6717 * we need to release the lso buf to the pool, since the resource 6718 * is scarce. However, if this was simply a normal send using 6719 * the copybuf (present in each swqe), we don't need to release it. 6720 */ 6721 if (swqe->swqe_im_mblk != NULL) { 6722 if (swqe->w_buftype == IBD_WQE_MAPPED) { 6723 ibd_unmap_mem(state, swqe); 6724 } else if (swqe->w_buftype == IBD_WQE_LSOBUF) { 6725 ibd_release_lsobufs(state, 6726 swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds); 6727 } 6728 ibd_free_lsohdr(swqe, swqe->swqe_im_mblk); 6729 freemsg(swqe->swqe_im_mblk); 6730 swqe->swqe_im_mblk = NULL; 6731 } 6732 6733 /* 6734 * Drop the reference count on the AH; it can be reused 6735 * now for a different destination if there are no more 6736 * posted sends that will use it. 
This can be eliminated 6737 * if we can always associate each Tx buffer with an AH. 6738 * The ace can be null if we are cleaning up from the 6739 * ibd_send() error path. 6740 */ 6741 if (ace != NULL) { 6742 ibd_dec_ref_ace(state, ace); 6743 } 6744 6745 /* 6746 * Release the send wqe for reuse. 6747 */ 6748 swqe->swqe_next = NULL; 6749 ibd_release_swqe(state, swqe, swqe, 1); 6750 } 6751 6752 static void 6753 ibd_tx_cleanup_list(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail) 6754 { 6755 ibd_ace_t *ace; 6756 ibd_swqe_t *swqe; 6757 int n = 0; 6758 6759 DPRINT(20, "ibd_tx_cleanup_list %p %p\n", head, tail); 6760 6761 for (swqe = head; swqe != NULL; swqe = WQE_TO_SWQE(swqe->swqe_next)) { 6762 6763 /* 6764 * If this was a dynamic mapping in ibd_send(), we need to 6765 * unmap here. If this was an lso buffer we'd used for sending, 6766 * we need to release the lso buf to the pool, since the 6767 * resource is scarce. However, if this was simply a normal 6768 * send using the copybuf (present in each swqe), we don't need 6769 * to release it. 6770 */ 6771 if (swqe->swqe_im_mblk != NULL) { 6772 if (swqe->w_buftype == IBD_WQE_MAPPED) { 6773 ibd_unmap_mem(state, swqe); 6774 } else if (swqe->w_buftype == IBD_WQE_LSOBUF) { 6775 ibd_release_lsobufs(state, 6776 swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds); 6777 } 6778 ibd_free_lsohdr(swqe, swqe->swqe_im_mblk); 6779 freemsg(swqe->swqe_im_mblk); 6780 swqe->swqe_im_mblk = NULL; 6781 } 6782 6783 /* 6784 * Drop the reference count on the AH; it can be reused 6785 * now for a different destination if there are no more 6786 * posted sends that will use it. This can be eliminated 6787 * if we can always associate each Tx buffer with an AH. 6788 * The ace can be null if we are cleaning up from the 6789 * ibd_send() error path. 6790 */ 6791 ace = swqe->w_ahandle; 6792 if (ace != NULL) { 6793 ibd_dec_ref_ace(state, ace); 6794 } 6795 n++; 6796 } 6797 6798 /* 6799 * Release the send wqes for reuse. 6800 */ 6801 ibd_release_swqe(state, head, tail, n); 6802 } 6803 6804 /* 6805 * Processing to be done after receipt of a packet; hand off to GLD 6806 * in the format expected by GLD. The received packet has this 6807 * format: 2b sap :: 00 :: data. 6808 */ 6809 static mblk_t * 6810 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc) 6811 { 6812 ib_header_info_t *phdr; 6813 mblk_t *mp; 6814 ipoib_hdr_t *ipibp; 6815 ipha_t *iphap; 6816 ip6_t *ip6h; 6817 int len; 6818 ib_msglen_t pkt_len = wc->wc_bytes_xfer; 6819 uint32_t bufs; 6820 6821 /* 6822 * Track number handed to upper layer that need to be returned. 6823 */ 6824 bufs = atomic_inc_32_nv(&state->id_rx_list.dl_bufs_outstanding); 6825 6826 /* Never run out of rwqes, use allocb when running low */ 6827 if (bufs >= state->id_rx_bufs_outstanding_limit) { 6828 atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding); 6829 atomic_inc_32(&state->id_rx_allocb); 6830 mp = allocb(pkt_len, BPRI_HI); 6831 if (mp) { 6832 bcopy(rwqe->rwqe_im_mblk->b_rptr, mp->b_rptr, pkt_len); 6833 ibd_post_recv(state, rwqe); 6834 } else { /* no memory */ 6835 atomic_inc_32(&state->id_rx_allocb_failed); 6836 ibd_post_recv(state, rwqe); 6837 return (NULL); 6838 } 6839 } else { 6840 mp = rwqe->rwqe_im_mblk; 6841 } 6842 6843 6844 /* 6845 * Adjust write pointer depending on how much data came in. 6846 */ 6847 mp->b_wptr = mp->b_rptr + pkt_len; 6848 6849 /* 6850 * Make sure this is NULL or we're in trouble. 
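 * ibd_drain_rcq() chains the mblks it passes to mac_rx() through
 * b_next, so a stale b_next here would corrupt that chain.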
6851 */
6852 if (mp->b_next != NULL) {
6853 ibd_print_warn(state,
6854 "ibd_process_rx: got duplicate mp from rcq?");
6855 mp->b_next = NULL;
6856 }
6857
6858 /*
6859 * The IB link may deliver one of the IB link-layer headers,
6860 * the Global Routing Header (GRH).  The ibd driver uses the
6861 * information in the GRH to build the ib_header_info_t
6862 * structure and passes it up to GLDv3 along with the datagram.
6863 *
6864 * If the GRH is not valid, indicate this to GLDv3 by setting
6865 * the VerTcFlow field to 0.
6866 */
6867 phdr = (ib_header_info_t *)mp->b_rptr;
6868 if (wc->wc_flags & IBT_WC_GRH_PRESENT) {
6869 phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn);
6870
6871 /* if it is a loopback packet, just drop it. */
6872 if (state->id_enable_rc) {
6873 if (bcmp(&phdr->ib_grh.ipoib_sqpn,
6874 &state->rc_macaddr_loopback,
6875 IPOIB_ADDRL) == 0) {
6876 freemsg(mp);
6877 return (NULL);
6878 }
6879 } else {
6880 if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr,
6881 IPOIB_ADDRL) == 0) {
6882 freemsg(mp);
6883 return (NULL);
6884 }
6885 }
6886
6887 ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src,
6888 sizeof (ipoib_mac_t));
6889 if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) {
6890 phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN);
6891 IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst);
6892 } else {
6893 phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn;
6894 }
6895 } else {
6896 /*
6897 * It cannot be an IBA multicast packet; it must have been
6898 * unicast to us.  Just copy the interface address to dst.
6899 */
6900 phdr->ib_grh.ipoib_vertcflow = 0;
6901 ovbcopy(&state->id_macaddr, &phdr->ib_dst,
6902 sizeof (ipoib_mac_t));
6903 }
6904
6905 /*
6906 * For ND6 packets, padding is at the front of the source/target
6907 * lladdr.  However, the inet6 layer is not aware of it, hence remove
6908 * the padding from such packets.
6909 */
6910 ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t));
6911 if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) {
6912 ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
6913 len = ntohs(ip6h->ip6_plen);
6914 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
6915 /* LINTED: E_CONSTANT_CONDITION */
6916 IBD_PAD_NSNA(ip6h, len, IBD_RECV);
6917 }
6918 }
6919
6920 /*
6921 * Update statistics
6922 */
6923 atomic_add_64(&state->id_rcv_bytes, pkt_len);
6924 atomic_inc_64(&state->id_rcv_pkt);
6925 if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
6926 atomic_inc_64(&state->id_brd_rcv);
6927 else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
6928 atomic_inc_64(&state->id_multi_rcv);
6929
6930 iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
6931 /*
6932 * Set the receive checksum status in mp.
6933 * Hardware checksumming can be considered valid only if:
6934 * 1. CQE.IP_OK bit is set
6935 * 2. CQE.CKSUM = 0xffff
6936 * 3. IPv6 routing header is not present in the packet
6937 * 4. There are no IP options in the IP header
6938 */
6939
6940 if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) &&
6941 (wc->wc_cksum == 0xFFFF) &&
6942 (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) {
6943 (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
6944 HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);
6945 }
6946
6947 return (mp);
6948 }
6949
6950 /*
6951 * Callback code invoked from STREAMS when the receive data buffer is
6952 * free for recycling.
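 *
 * If the driver is still running, the rwqe is re-armed with a fresh
 * mblk via desballoc() and posted back to the receive queue;
 * otherwise the rwqe itself is freed.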
6953 */ 6954 static void 6955 ibd_freemsg_cb(char *arg) 6956 { 6957 ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg; 6958 ibd_state_t *state = rwqe->w_state; 6959 6960 atomic_dec_32(&state->id_rx_list.dl_bufs_outstanding); 6961 6962 /* 6963 * If the driver is stopped, just free the rwqe. 6964 */ 6965 if (atomic_add_32_nv(&state->id_running, 0) == 0) { 6966 DPRINT(6, "ibd_freemsg: wqe being freed"); 6967 rwqe->rwqe_im_mblk = NULL; 6968 ibd_free_rwqe(state, rwqe); 6969 return; 6970 } 6971 6972 rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, 6973 state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb); 6974 if (rwqe->rwqe_im_mblk == NULL) { 6975 ibd_free_rwqe(state, rwqe); 6976 DPRINT(6, "ibd_freemsg: desballoc failed"); 6977 return; 6978 } 6979 6980 ibd_post_recv(state, rwqe); 6981 } 6982 6983 static uint_t 6984 ibd_tx_recycle(caddr_t arg) 6985 { 6986 ibd_state_t *state = (ibd_state_t *)arg; 6987 6988 /* 6989 * Poll for completed entries 6990 */ 6991 ibd_poll_scq(state, state->id_scq_hdl); 6992 6993 return (DDI_INTR_CLAIMED); 6994 } 6995 6996 #ifdef IBD_LOGGING 6997 static void 6998 ibd_log_init(void) 6999 { 7000 ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP); 7001 ibd_lbuf_ndx = 0; 7002 7003 mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL); 7004 } 7005 7006 static void 7007 ibd_log_fini(void) 7008 { 7009 if (ibd_lbuf) 7010 kmem_free(ibd_lbuf, IBD_LOG_SZ); 7011 ibd_lbuf_ndx = 0; 7012 ibd_lbuf = NULL; 7013 7014 mutex_destroy(&ibd_lbuf_lock); 7015 } 7016 7017 static void 7018 ibd_log(const char *fmt, ...) 7019 { 7020 va_list ap; 7021 uint32_t off; 7022 uint32_t msglen; 7023 char tmpbuf[IBD_DMAX_LINE]; 7024 7025 if (ibd_lbuf == NULL) 7026 return; 7027 7028 va_start(ap, fmt); 7029 msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap); 7030 va_end(ap); 7031 7032 if (msglen >= IBD_DMAX_LINE) 7033 msglen = IBD_DMAX_LINE - 1; 7034 7035 mutex_enter(&ibd_lbuf_lock); 7036 7037 off = ibd_lbuf_ndx; /* current msg should go here */ 7038 if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n')) 7039 ibd_lbuf[ibd_lbuf_ndx-1] = '\n'; 7040 7041 ibd_lbuf_ndx += msglen; /* place where next msg should start */ 7042 ibd_lbuf[ibd_lbuf_ndx] = 0; /* current msg should terminate */ 7043 7044 if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE)) 7045 ibd_lbuf_ndx = 0; 7046 7047 mutex_exit(&ibd_lbuf_lock); 7048 7049 bcopy(tmpbuf, ibd_lbuf+off, msglen); /* no lock needed for this */ 7050 } 7051 #endif 7052