1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * An implementation of the IPoIB standard based on PSARC 2001/289. 29 */ 30 31 #include <sys/types.h> 32 #include <sys/conf.h> 33 #include <sys/ddi.h> 34 #include <sys/sunddi.h> 35 #include <sys/modctl.h> 36 #include <sys/stropts.h> 37 #include <sys/stream.h> 38 #include <sys/strsun.h> 39 #include <sys/strsubr.h> 40 #include <sys/dlpi.h> 41 #include <sys/mac_provider.h> 42 43 #include <sys/pattr.h> /* for HCK_FULLCKSUM */ 44 #include <sys/sysmacros.h> /* for offsetof */ 45 #include <sys/disp.h> /* for async thread pri */ 46 #include <sys/atomic.h> /* for atomic_add*() */ 47 #include <sys/ethernet.h> /* for ETHERTYPE_IP */ 48 #include <netinet/in.h> /* for netinet/ip.h below */ 49 #include <netinet/ip.h> /* for struct ip */ 50 #include <netinet/udp.h> /* for struct udphdr */ 51 #include <inet/common.h> /* for inet/ip.h below */ 52 #include <inet/ip.h> /* for ipha_t */ 53 #include <inet/ip_if.h> /* for IP6_DL_SAP */ 54 #include <inet/ip6.h> /* for ip6_t */ 55 #include <inet/tcp.h> /* for tcph_t */ 56 #include <netinet/icmp6.h> /* for icmp6_t */ 57 #include <sys/callb.h> 58 #include <sys/modhash.h> 59 60 #include <sys/ib/clients/ibd/ibd.h> 61 #include <sys/ib/mgt/sm_attr.h> /* for SM_INIT_TYPE_* */ 62 #include <sys/note.h> 63 #include <sys/multidata.h> 64 65 #include <sys/ib/mgt/ibmf/ibmf.h> /* for ibd_get_portspeed */ 66 67 /* 68 * Per-interface tunables 69 * 70 * ibd_tx_copy_thresh 71 * This sets the threshold at which ibd will attempt to do a bcopy of the 72 * outgoing data into a pre-mapped buffer. The IPoIB driver's send behavior 73 * is restricted by various parameters, so setting of this value must be 74 * made after careful considerations only. For instance, IB HCAs currently 75 * impose a relatively small limit (when compared to ethernet NICs) on the 76 * length of the SGL for transmit. On the other hand, the ip stack could 77 * send down mp chains that are quite long when LSO is enabled. 78 * 79 * ibd_num_swqe 80 * Number of "send WQE" elements that will be allocated and used by ibd. 81 * When tuning this parameter, the size of pre-allocated, pre-mapped copy 82 * buffer in each of these send wqes must be taken into account. This 83 * copy buffer size is determined by the value of IBD_TX_BUF_SZ (this is 84 * currently set to the same value of ibd_tx_copy_thresh, but may be 85 * changed independently if needed). 86 * 87 * ibd_num_rwqe 88 * Number of "receive WQE" elements that will be allocated and used by 89 * ibd. This parameter is limited by the maximum channel size of the HCA. 
90 * Each buffer in the receive wqe will be of MTU size. 91 * 92 * ibd_num_lso_bufs 93 * Number of "larger-than-MTU" copy buffers to use for cases when the 94 * outgoing mblk chain is too fragmented to be used with ibt_map_mem_iov() 95 * and too large to be used with regular MTU-sized copy buffers. It is 96 * not recommended to tune this variable without understanding the 97 * application environment and/or memory resources. The size of each of 98 * these lso buffers is determined by the value of IBD_LSO_BUFSZ. 99 * 100 * ibd_num_ah 101 * Number of AH cache entries to allocate 102 * 103 * ibd_hash_size 104 * Hash table size for the active AH list 105 * 106 * ibd_separate_cqs 107 * ibd_txcomp_poll 108 * These boolean variables (1 or 0) may be used to tune the behavior of 109 * ibd in managing the send and receive completion queues and in deciding 110 * whether or not transmit completions should be polled or interrupt 111 * driven (when the completion queues are separate). If both the completion 112 * queues are interrupt driven, it may not be possible for the handlers to 113 * be invoked concurrently, depending on how the interrupts are tied on 114 * the PCI intr line. Note that some combination of these two parameters 115 * may not be meaningful (and therefore not allowed). 116 * 117 * ibd_tx_softintr 118 * ibd_rx_softintr 119 * The softintr mechanism allows ibd to avoid event queue overflows if 120 * the receive/completion handlers are to be expensive. These are enabled 121 * by default. 122 * 123 * ibd_log_sz 124 * This specifies the size of the ibd log buffer in bytes. The buffer is 125 * allocated and logging is enabled only when IBD_LOGGING is defined. 126 * 127 */ 128 uint_t ibd_tx_copy_thresh = 0x1000; 129 uint_t ibd_num_swqe = 4000; 130 uint_t ibd_num_rwqe = 4000; 131 uint_t ibd_num_lso_bufs = 0x400; 132 uint_t ibd_num_ah = 64; 133 uint_t ibd_hash_size = 32; 134 uint_t ibd_separate_cqs = 1; 135 uint_t ibd_txcomp_poll = 0; 136 uint_t ibd_rx_softintr = 1; 137 uint_t ibd_tx_softintr = 1; 138 #ifdef IBD_LOGGING 139 uint_t ibd_log_sz = 0x20000; 140 #endif 141 142 #define IBD_TX_COPY_THRESH ibd_tx_copy_thresh 143 #define IBD_TX_BUF_SZ ibd_tx_copy_thresh 144 #define IBD_NUM_SWQE ibd_num_swqe 145 #define IBD_NUM_RWQE ibd_num_rwqe 146 #define IBD_NUM_LSO_BUFS ibd_num_lso_bufs 147 #define IBD_NUM_AH ibd_num_ah 148 #define IBD_HASH_SIZE ibd_hash_size 149 #ifdef IBD_LOGGING 150 #define IBD_LOG_SZ ibd_log_sz 151 #endif 152 153 /* 154 * Receive CQ moderation parameters: NOT tunables 155 */ 156 static uint_t ibd_rxcomp_count = 4; 157 static uint_t ibd_rxcomp_usec = 10; 158 159 /* 160 * Thresholds 161 * 162 * When waiting for resources (swqes or lso buffers) to become available, 163 * the first two thresholds below determine how long to wait before informing 164 * the network layer to start sending packets again. The IBD_TX_POLL_THRESH 165 * determines how low the available swqes should go before we start polling 166 * the completion queue. 167 */ 168 #define IBD_FREE_LSOS_THRESH 8 169 #define IBD_FREE_SWQES_THRESH 20 170 #define IBD_TX_POLL_THRESH 80 171 172 /* 173 * When doing multiple-send-wr or multiple-recv-wr posts, this value 174 * determines how many to do at a time (in a single ibt_post_send/recv). 
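 *
 * To illustrate the batching idea only (a sketch, not the driver's actual
 * transmit path; "prepared[]" is a hypothetical array of ready swqes, and
 * the w_swr/id_chnl_hdl names are assumed from ibd.h):
 *
 *	ibt_send_wr_t	wrs[IBD_MAX_POST_MULTIPLE];
 *	uint_t		n, posted;
 *
 *	for (n = 0; n < IBD_MAX_POST_MULTIPLE && prepared[n] != NULL; n++)
 *		wrs[n] = prepared[n]->w_swr;
 *	(void) ibt_post_send(state->id_chnl_hdl, wrs, n, &posted);
 *
 * Posting several work requests per ibt_post_send() call amortizes the
 * per-call overhead across the batch; the same idea applies to
 * ibt_post_recv() on the receive side.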
175 */ 176 #define IBD_MAX_POST_MULTIPLE 4 177 178 /* 179 * Maximum length for returning chained mps back to crossbow 180 */ 181 #define IBD_MAX_RX_MP_LEN 16 182 183 /* 184 * LSO parameters 185 */ 186 #define IBD_LSO_MAXLEN 65536 187 #define IBD_LSO_BUFSZ 8192 188 #define IBD_PROP_LSO_POLICY "lso-policy" 189 190 /* 191 * Completion queue polling control 192 */ 193 #define IBD_RX_CQ_POLLING 0x1 194 #define IBD_TX_CQ_POLLING 0x2 195 #define IBD_REDO_RX_CQ_POLLING 0x4 196 #define IBD_REDO_TX_CQ_POLLING 0x8 197 198 /* 199 * Flag bits for resources to reap 200 */ 201 #define IBD_RSRC_SWQE 0x1 202 #define IBD_RSRC_LSOBUF 0x2 203 204 /* 205 * Async operation types 206 */ 207 #define IBD_ASYNC_GETAH 1 208 #define IBD_ASYNC_JOIN 2 209 #define IBD_ASYNC_LEAVE 3 210 #define IBD_ASYNC_PROMON 4 211 #define IBD_ASYNC_PROMOFF 5 212 #define IBD_ASYNC_REAP 6 213 #define IBD_ASYNC_TRAP 7 214 #define IBD_ASYNC_SCHED 8 215 #define IBD_ASYNC_LINK 9 216 #define IBD_ASYNC_EXIT 10 217 218 /* 219 * Async operation states 220 */ 221 #define IBD_OP_NOTSTARTED 0 222 #define IBD_OP_ONGOING 1 223 #define IBD_OP_COMPLETED 2 224 #define IBD_OP_ERRORED 3 225 #define IBD_OP_ROUTERED 4 226 227 /* 228 * Miscellaneous constants 229 */ 230 #define IBD_SEND 0 231 #define IBD_RECV 1 232 #define IB_MGID_IPV4_LOWGRP_MASK 0xFFFFFFFF 233 #ifdef IBD_LOGGING 234 #define IBD_DMAX_LINE 100 235 #endif 236 237 /* 238 * Enumerations for link states 239 */ 240 typedef enum { 241 IBD_LINK_DOWN, 242 IBD_LINK_UP, 243 IBD_LINK_UP_ABSENT 244 } ibd_link_op_t; 245 246 /* 247 * Driver State Pointer 248 */ 249 void *ibd_list; 250 251 /* 252 * Logging 253 */ 254 #ifdef IBD_LOGGING 255 kmutex_t ibd_lbuf_lock; 256 uint8_t *ibd_lbuf; 257 uint32_t ibd_lbuf_ndx; 258 #endif 259 260 /* 261 * Required system entry points 262 */ 263 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); 264 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); 265 266 /* 267 * Required driver entry points for GLDv3 268 */ 269 static int ibd_m_stat(void *, uint_t, uint64_t *); 270 static int ibd_m_start(void *); 271 static void ibd_m_stop(void *); 272 static int ibd_m_promisc(void *, boolean_t); 273 static int ibd_m_multicst(void *, boolean_t, const uint8_t *); 274 static int ibd_m_unicst(void *, const uint8_t *); 275 static mblk_t *ibd_m_tx(void *, mblk_t *); 276 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *); 277 278 /* 279 * Private driver entry points for GLDv3 280 */ 281 282 /* 283 * Initialization 284 */ 285 static int ibd_state_init(ibd_state_t *, dev_info_t *); 286 static int ibd_drv_init(ibd_state_t *); 287 static int ibd_init_txlist(ibd_state_t *); 288 static int ibd_init_rxlist(ibd_state_t *); 289 static int ibd_acache_init(ibd_state_t *); 290 #ifdef IBD_LOGGING 291 static void ibd_log_init(void); 292 #endif 293 294 /* 295 * Termination/cleanup 296 */ 297 static void ibd_state_fini(ibd_state_t *); 298 static void ibd_drv_fini(ibd_state_t *); 299 static void ibd_fini_txlist(ibd_state_t *); 300 static void ibd_fini_rxlist(ibd_state_t *); 301 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *); 302 static void ibd_acache_fini(ibd_state_t *); 303 #ifdef IBD_LOGGING 304 static void ibd_log_fini(void); 305 #endif 306 307 /* 308 * Allocation/acquire/map routines 309 */ 310 static int ibd_alloc_swqe(ibd_state_t *, ibd_swqe_t **, int, ibt_lkey_t); 311 static int ibd_alloc_rwqe(ibd_state_t *, ibd_rwqe_t **); 312 static int ibd_alloc_tx_copybufs(ibd_state_t *); 313 static int ibd_alloc_tx_lsobufs(ibd_state_t *); 314 static int 
ibd_acquire_swqe(ibd_state_t *, ibd_swqe_t **); 315 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *, 316 uint32_t *); 317 318 /* 319 * Free/release/unmap routines 320 */ 321 static void ibd_free_swqe(ibd_state_t *, ibd_swqe_t *); 322 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *); 323 static void ibd_delete_rwqe(ibd_state_t *, ibd_rwqe_t *); 324 static void ibd_free_tx_copybufs(ibd_state_t *); 325 static void ibd_free_tx_lsobufs(ibd_state_t *); 326 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *); 327 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t); 328 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *); 329 static void ibd_unmap_mem(ibd_state_t *, ibd_swqe_t *); 330 331 /* 332 * Handlers/callback routines 333 */ 334 static uint_t ibd_intr(char *); 335 static uint_t ibd_tx_recycle(char *); 336 static void ibd_rcq_handler(ibt_cq_hdl_t, void *); 337 static void ibd_scq_handler(ibt_cq_hdl_t, void *); 338 static void ibd_poll_compq(ibd_state_t *, ibt_cq_hdl_t); 339 static uint_t ibd_drain_cq(ibd_state_t *, ibt_cq_hdl_t, ibt_wc_t *, uint_t); 340 static void ibd_freemsg_cb(char *); 341 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t, 342 ibt_async_event_t *); 343 static void ibd_snet_notices_handler(void *, ib_gid_t, 344 ibt_subnet_event_code_t, ibt_subnet_event_t *); 345 346 /* 347 * Send/receive routines 348 */ 349 static boolean_t ibd_send(ibd_state_t *, mblk_t *); 350 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *); 351 static int ibd_post_rwqe(ibd_state_t *, ibd_rwqe_t *, boolean_t); 352 static void ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *); 353 static void ibd_flush_rx(ibd_state_t *, mblk_t *); 354 355 /* 356 * Threads 357 */ 358 static void ibd_async_work(ibd_state_t *); 359 360 /* 361 * Async tasks 362 */ 363 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *); 364 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int); 365 static void ibd_async_setprom(ibd_state_t *); 366 static void ibd_async_unsetprom(ibd_state_t *); 367 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t); 368 static void ibd_async_trap(ibd_state_t *, ibd_req_t *); 369 static void ibd_async_txsched(ibd_state_t *); 370 static void ibd_async_link(ibd_state_t *, ibd_req_t *); 371 372 /* 373 * Async task helpers 374 */ 375 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *); 376 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t); 377 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *); 378 static boolean_t ibd_get_allroutergroup(ibd_state_t *, 379 ipoib_mac_t *, ipoib_mac_t *); 380 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t); 381 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *); 382 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *); 383 static ibt_status_t ibd_find_bgroup(ibd_state_t *); 384 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *); 385 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t); 386 static uint64_t ibd_get_portspeed(ibd_state_t *); 387 static int ibd_get_portpkey(ibd_state_t *, ib_guid_t *); 388 static boolean_t ibd_async_safe(ibd_state_t *); 389 static void ibd_async_done(ibd_state_t *); 390 static ibd_ace_t *ibd_acache_find(ibd_state_t *, ipoib_mac_t *, boolean_t, int); 391 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int); 392 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *); 393 static boolean_t 
ibd_acache_recycle(ibd_state_t *, ipoib_mac_t *, boolean_t); 394 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t); 395 396 /* 397 * Miscellaneous helpers 398 */ 399 static int ibd_sched_poll(ibd_state_t *, int, int); 400 static void ibd_queue_work_slot(ibd_state_t *, ibd_req_t *, int); 401 static int ibd_resume_transmission(ibd_state_t *); 402 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t); 403 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t); 404 static void *list_get_head(list_t *); 405 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t); 406 static uint_t ibd_hash_by_id(void *, mod_hash_key_t); 407 static void ibd_print_warn(ibd_state_t *, char *, ...); 408 #ifdef IBD_LOGGING 409 static void ibd_log(const char *, ...); 410 #endif 411 412 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach, 413 nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed); 414 415 /* Module Driver Info */ 416 static struct modldrv ibd_modldrv = { 417 &mod_driverops, /* This one is a driver */ 418 "InfiniBand GLDv3 Driver", /* short description */ 419 &ibd_dev_ops /* driver specific ops */ 420 }; 421 422 /* Module Linkage */ 423 static struct modlinkage ibd_modlinkage = { 424 MODREV_1, (void *)&ibd_modldrv, NULL 425 }; 426 427 /* 428 * Module (static) info passed to IBTL during ibt_attach 429 */ 430 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = { 431 IBTI_V_CURR, 432 IBT_NETWORK, 433 ibd_async_handler, 434 NULL, 435 "IPIB" 436 }; 437 438 /* 439 * GLDv3 entry points 440 */ 441 #define IBD_M_CALLBACK_FLAGS (MC_GETCAPAB) 442 static mac_callbacks_t ib_m_callbacks = { 443 IBD_M_CALLBACK_FLAGS, 444 ibd_m_stat, 445 ibd_m_start, 446 ibd_m_stop, 447 ibd_m_promisc, 448 ibd_m_multicst, 449 ibd_m_unicst, 450 ibd_m_tx, 451 NULL, 452 ibd_m_getcapab 453 }; 454 455 /* 456 * Fill/clear <scope> and <p_key> in multicast/broadcast address 457 */ 458 #define IBD_FILL_SCOPE_PKEY(maddr, scope, pkey) \ 459 { \ 460 *(uint32_t *)((char *)(maddr) + 4) |= \ 461 htonl((uint32_t)(scope) << 16); \ 462 *(uint32_t *)((char *)(maddr) + 8) |= \ 463 htonl((uint32_t)(pkey) << 16); \ 464 } 465 466 #define IBD_CLEAR_SCOPE_PKEY(maddr) \ 467 { \ 468 *(uint32_t *)((char *)(maddr) + 4) &= \ 469 htonl(~((uint32_t)0xF << 16)); \ 470 *(uint32_t *)((char *)(maddr) + 8) &= \ 471 htonl(~((uint32_t)0xFFFF << 16)); \ 472 } 473 474 /* 475 * Rudimentary debugging support 476 */ 477 #ifdef DEBUG 478 int ibd_debuglevel = 100; 479 static void 480 debug_print(int l, char *fmt, ...) 481 { 482 va_list ap; 483 484 if (l < ibd_debuglevel) 485 return; 486 va_start(ap, fmt); 487 vcmn_err(CE_CONT, fmt, ap); 488 va_end(ap); 489 } 490 #define DPRINT debug_print 491 #else 492 #define DPRINT 493 #endif 494 495 /* 496 * Common routine to print warning messages; adds in hca guid, port number 497 * and pkey to be able to identify the IBA interface. 498 */ 499 static void 500 ibd_print_warn(ibd_state_t *state, char *fmt, ...) 
501 { 502 ib_guid_t hca_guid; 503 char ibd_print_buf[256]; 504 int len; 505 va_list ap; 506 507 hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip, 508 0, "hca-guid", 0); 509 len = snprintf(ibd_print_buf, sizeof (ibd_print_buf), 510 "%s%d: HCA GUID %016llx port %d PKEY %02x ", 511 ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip), 512 (u_longlong_t)hca_guid, state->id_port, state->id_pkey); 513 va_start(ap, fmt); 514 (void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len, 515 fmt, ap); 516 cmn_err(CE_NOTE, "!%s", ibd_print_buf); 517 va_end(ap); 518 } 519 520 /* 521 * Warlock directives 522 */ 523 524 /* 525 * id_lso_lock 526 * 527 * state->id_lso->bkt_nfree may be accessed without a lock to 528 * determine the threshold at which we have to ask the nw layer 529 * to resume transmission (see ibd_resume_transmission()). 530 */ 531 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock, 532 ibd_state_t::id_lso)) 533 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso)) 534 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree)) 535 536 /* 537 * id_cq_poll_lock 538 */ 539 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_cq_poll_lock, 540 ibd_state_t::id_cq_poll_busy)) 541 542 /* 543 * id_txpost_lock 544 */ 545 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 546 ibd_state_t::id_tx_head)) 547 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 548 ibd_state_t::id_tx_busy)) 549 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 550 ibd_state_t::id_tx_tailp)) 551 552 /* 553 * id_rxpost_lock 554 */ 555 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock, 556 ibd_state_t::id_rx_head)) 557 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock, 558 ibd_state_t::id_rx_busy)) 559 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock, 560 ibd_state_t::id_rx_tailp)) 561 562 /* 563 * id_acache_req_lock 564 */ 565 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 566 ibd_state_t::id_acache_req_cv)) 567 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 568 ibd_state_t::id_req_list)) 569 570 /* 571 * id_ac_mutex 572 * 573 * This mutex is actually supposed to protect id_ah_op as well, 574 * but this path of the code isn't clean (see update of id_ah_op 575 * in ibd_async_acache(), immediately after the call to 576 * ibd_async_mcache()). For now, we'll skip this check by 577 * declaring that id_ah_op is protected by some internal scheme 578 * that warlock isn't aware of. 
579 */ 580 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 581 ibd_state_t::id_ah_active)) 582 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 583 ibd_state_t::id_ah_free)) 584 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 585 ibd_state_t::id_ah_addr)) 586 _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this", 587 ibd_state_t::id_ah_op)) 588 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 589 ibd_state_t::id_ah_error)) 590 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error)) 591 592 /* 593 * id_mc_mutex 594 */ 595 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 596 ibd_state_t::id_mc_full)) 597 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 598 ibd_state_t::id_mc_non)) 599 600 /* 601 * id_trap_lock 602 */ 603 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 604 ibd_state_t::id_trap_cv)) 605 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 606 ibd_state_t::id_trap_stop)) 607 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 608 ibd_state_t::id_trap_inprog)) 609 610 /* 611 * id_prom_op 612 */ 613 _NOTE(SCHEME_PROTECTS_DATA("only by async thread", 614 ibd_state_t::id_prom_op)) 615 616 /* 617 * id_sched_lock 618 */ 619 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock, 620 ibd_state_t::id_sched_needed)) 621 622 /* 623 * id_link_mutex 624 */ 625 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex, 626 ibd_state_t::id_link_state)) 627 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state)) 628 _NOTE(SCHEME_PROTECTS_DATA("only async thr and drv init", 629 ibd_state_t::id_link_speed)) 630 631 /* 632 * id_tx_list.dl_mutex 633 */ 634 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 635 ibd_state_t::id_tx_list.dl_head)) 636 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 637 ibd_state_t::id_tx_list.dl_tail)) 638 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 639 ibd_state_t::id_tx_list.dl_pending_sends)) 640 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 641 ibd_state_t::id_tx_list.dl_cnt)) 642 643 /* 644 * id_rx_list.dl_mutex 645 */ 646 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex, 647 ibd_state_t::id_rx_list.dl_head)) 648 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex, 649 ibd_state_t::id_rx_list.dl_tail)) 650 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 651 ibd_state_t::id_rx_list.dl_bufs_outstanding)) 652 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 653 ibd_state_t::id_rx_list.dl_cnt)) 654 655 656 /* 657 * Items protected by atomic updates 658 */ 659 _NOTE(SCHEME_PROTECTS_DATA("atomic update only", 660 ibd_state_s::id_brd_rcv 661 ibd_state_s::id_brd_xmt 662 ibd_state_s::id_multi_rcv 663 ibd_state_s::id_multi_xmt 664 ibd_state_s::id_num_intrs 665 ibd_state_s::id_rcv_bytes 666 ibd_state_s::id_rcv_pkt 667 ibd_state_s::id_tx_short 668 ibd_state_s::id_xmt_bytes 669 ibd_state_s::id_xmt_pkt)) 670 671 /* 672 * Non-mutex protection schemes for data elements. Almost all of 673 * these are non-shared items. 674 */ 675 _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded", 676 callb_cpr 677 ib_gid_s 678 ib_header_info 679 ibd_acache_rq 680 ibd_acache_s::ac_mce 681 ibd_mcache::mc_fullreap 682 ibd_mcache::mc_jstate 683 ibd_mcache::mc_req 684 ibd_rwqe_s 685 ibd_swqe_s 686 ibd_wqe_s 687 ibt_wr_ds_s::ds_va 688 ibt_wr_lso_s 689 ipoib_mac::ipoib_qpn 690 mac_capab_lso_s 691 msgb::b_next 692 msgb::b_rptr 693 msgb::b_wptr)) 694 695 int 696 _init() 697 { 698 int status; 699 700 /* 701 * Sanity check some parameter settings. 
Tx completion polling 702 * only makes sense with separate CQs for Tx and Rx. 703 */ 704 if ((ibd_txcomp_poll == 1) && (ibd_separate_cqs == 0)) { 705 cmn_err(CE_NOTE, "!ibd: %s", 706 "Setting ibd_txcomp_poll = 0 for combined CQ"); 707 ibd_txcomp_poll = 0; 708 } 709 710 status = ddi_soft_state_init(&ibd_list, sizeof (ibd_state_t), 0); 711 if (status != 0) { 712 DPRINT(10, "_init:failed in ddi_soft_state_init()"); 713 return (status); 714 } 715 716 mac_init_ops(&ibd_dev_ops, "ibd"); 717 status = mod_install(&ibd_modlinkage); 718 if (status != 0) { 719 DPRINT(10, "_init:failed in mod_install()"); 720 ddi_soft_state_fini(&ibd_list); 721 mac_fini_ops(&ibd_dev_ops); 722 return (status); 723 } 724 725 #ifdef IBD_LOGGING 726 ibd_log_init(); 727 #endif 728 return (0); 729 } 730 731 int 732 _info(struct modinfo *modinfop) 733 { 734 return (mod_info(&ibd_modlinkage, modinfop)); 735 } 736 737 int 738 _fini() 739 { 740 int status; 741 742 status = mod_remove(&ibd_modlinkage); 743 if (status != 0) 744 return (status); 745 746 mac_fini_ops(&ibd_dev_ops); 747 ddi_soft_state_fini(&ibd_list); 748 #ifdef IBD_LOGGING 749 ibd_log_fini(); 750 #endif 751 return (0); 752 } 753 754 /* 755 * Convert the GID part of the mac address from network byte order 756 * to host order. 757 */ 758 static void 759 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid) 760 { 761 ib_sn_prefix_t nbopref; 762 ib_guid_t nboguid; 763 764 bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t)); 765 bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t)); 766 dgid->gid_prefix = b2h64(nbopref); 767 dgid->gid_guid = b2h64(nboguid); 768 } 769 770 /* 771 * Create the IPoIB address in network byte order from host order inputs. 772 */ 773 static void 774 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix, 775 ib_guid_t guid) 776 { 777 ib_sn_prefix_t nbopref; 778 ib_guid_t nboguid; 779 780 mac->ipoib_qpn = htonl(qpn); 781 nbopref = h2b64(prefix); 782 nboguid = h2b64(guid); 783 bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t)); 784 bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t)); 785 } 786 787 /* 788 * Send to the appropriate all-routers group when the IBA multicast group 789 * does not exist, based on whether the target group is v4 or v6. 790 */ 791 static boolean_t 792 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac, 793 ipoib_mac_t *rmac) 794 { 795 boolean_t retval = B_TRUE; 796 uint32_t adjscope = state->id_scope << 16; 797 uint32_t topword; 798 799 /* 800 * Copy the first 4 bytes in without assuming any alignment of 801 * input mac address; this will have IPoIB signature, flags and 802 * scope bits. 803 */ 804 bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t)); 805 topword = ntohl(topword); 806 807 /* 808 * Generate proper address for IPv4/v6, adding in the Pkey properly. 809 */ 810 if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) || 811 (topword == (IB_MCGID_IPV6_PREFIX | adjscope))) 812 ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) | 813 ((uint32_t)(state->id_pkey << 16))), 814 (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP)); 815 else 816 /* 817 * Does not have proper bits in the mgid address. 818 */ 819 retval = B_FALSE; 820 821 return (retval); 822 } 823 824 /* 825 * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at 826 * front of optional src/tgt link layer address. Right now Solaris inserts 827 * padding by default at the end. The routine which is doing is nce_xmit() 828 * in ip_ndp.c. It copies the nd_lla_addr after the nd_opt_hdr_t. 
So when 829 * the packet comes down from the IP layer to the IBD driver, it is in the 830 * following format: [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T] 831 * The OPT_ND_HDR_T is 2 bytes, followed by [22 bytes of ipoib_machdr]. As a result 832 * machdr is not 4 byte aligned and has 2 bytes of padding at the end. 833 * 834 * The send routine in the IBD driver changes this packet as follows: 835 * [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding] 836 * followed by [22 bytes of ipoib_machdr] resulting in machdr 4 byte 837 * aligned. 838 * 839 * At the receiving side, ibd_process_rx again takes the above packet and 840 * removes the two bytes of front padding and inserts them at the end. This 841 * is because the IP layer does not understand padding at the front. 842 */ 843 #define IBD_PAD_NSNA(ip6h, len, type) { \ 844 uchar_t *nd_lla_ptr; \ 845 icmp6_t *icmp6; \ 846 nd_opt_hdr_t *opt; \ 847 int i; \ 848 \ 849 icmp6 = (icmp6_t *)&ip6h[1]; \ 850 len -= sizeof (nd_neighbor_advert_t); \ 851 if (((icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT) || \ 852 (icmp6->icmp6_type == ND_NEIGHBOR_ADVERT)) && \ 853 (len != 0)) { \ 854 opt = (nd_opt_hdr_t *)((uint8_t *)ip6h \ 855 + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t)); \ 856 ASSERT(opt != NULL); \ 857 nd_lla_ptr = (uchar_t *)&opt[1]; \ 858 if (type == IBD_SEND) { \ 859 for (i = IPOIB_ADDRL; i > 0; i--) \ 860 *(nd_lla_ptr + i + 1) = \ 861 *(nd_lla_ptr + i - 1); \ 862 } else { \ 863 for (i = 0; i < IPOIB_ADDRL; i++) \ 864 *(nd_lla_ptr + i) = \ 865 *(nd_lla_ptr + i + 2); \ 866 } \ 867 *(nd_lla_ptr + i) = 0; \ 868 *(nd_lla_ptr + i + 1) = 0; \ 869 } \ 870 } 871 872 /* 873 * Address handle entries maintained by the driver are kept in the 874 * free and active lists. Each entry starts out in the free list; 875 * it migrates to the active list when primed using ibt_get_paths() 876 * and ibt_modify_ud_dest() for transmission to a specific destination. 877 * In the active list, the entry has a reference count indicating the 878 * number of ongoing/uncompleted transmits that reference it. The 879 * entry is left in the active list even after the reference count 880 * goes to 0, since successive transmits can find it there and do 881 * not need to set up another entry (ie the path information is 882 * cached using the active list). Entries on the active list are 883 * also hashed using the destination link address as a key for faster 884 * lookups during transmits. 885 * 886 * For any destination address (unicast or multicast, whatever the 887 * join states), there will be at most one entry in the active list. 888 * Entries with a 0 reference count on the active list can be reused 889 * for a transmit to a new destination, if the free list is empty. 890 * 891 * The AH free list insertion/deletion is protected with the id_ac_mutex, 892 * since the async thread and Tx callback handlers insert/delete. The 893 * active list does not need a lock (all operations are done by the 894 * async thread) but updates to the reference count are atomically 895 * done (increments done by Tx path, decrements by the Tx callback handler).
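 *
 * A rough sketch of the intended Tx-side usage (illustrative only; "dst"
 * stands for the packet's ipoib_mac_t destination and error handling is
 * omitted):
 *
 *	mutex_enter(&state->id_ac_mutex);
 *	ace = ibd_acache_find(state, dst, B_TRUE, 1);
 *	mutex_exit(&state->id_ac_mutex);
 *	if (ace != NULL)
 *		post the send using ace->ac_dest;
 *	else
 *		queue IBD_ASYNC_GETAH work and have GLDv3 retry later;
 *
 * The B_TRUE/1 arguments take a reference on the entry; the Tx completion
 * handler drops that reference when the send finishes.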
896 */ 897 #define IBD_ACACHE_INSERT_FREE(state, ce) \ 898 list_insert_head(&state->id_ah_free, ce) 899 #define IBD_ACACHE_GET_FREE(state) \ 900 list_get_head(&state->id_ah_free) 901 #define IBD_ACACHE_INSERT_ACTIVE(state, ce) { \ 902 int _ret_; \ 903 list_insert_head(&state->id_ah_active, ce); \ 904 _ret_ = mod_hash_insert(state->id_ah_active_hash, \ 905 (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce); \ 906 ASSERT(_ret_ == 0); \ 907 } 908 #define IBD_ACACHE_PULLOUT_ACTIVE(state, ce) { \ 909 list_remove(&state->id_ah_active, ce); \ 910 (void) mod_hash_remove(state->id_ah_active_hash, \ 911 (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce); \ 912 } 913 #define IBD_ACACHE_GET_ACTIVE(state) \ 914 list_get_head(&state->id_ah_active) 915 916 /* 917 * Membership states for different mcg's are tracked by two lists: 918 * the "non" list is used for promiscuous mode, when all mcg traffic 919 * needs to be inspected. This type of membership is never used for 920 * transmission, so there can not be an AH in the active list 921 * corresponding to a member in this list. This list does not need 922 * any protection, since all operations are performed by the async 923 * thread. 924 * 925 * "Full" and "SendOnly" membership is tracked using a single list, 926 * the "full" list. This is because this single list can then be 927 * searched during transmit to a multicast group (if an AH for the 928 * mcg is not found in the active list), since at least one type 929 * of membership must be present before initiating the transmit. 930 * This list is also emptied during driver detach, since sendonly 931 * membership acquired during transmit is dropped at detach time 932 * along with ipv4 broadcast full membership. Inserts/deletes to 933 * this list are done only by the async thread, but it is also 934 * searched in program context (see multicast disable case), thus 935 * the id_mc_mutex protects the list. The driver detach path also 936 * deconstructs the "full" list, but it ensures that the async 937 * thread will not be accessing the list (by blocking out mcg 938 * trap handling and making sure no more Tx reaping will happen). 939 * 940 * Currently, an IBA attach is done in the SendOnly case too, 941 * although this is not required. 942 */ 943 #define IBD_MCACHE_INSERT_FULL(state, mce) \ 944 list_insert_head(&state->id_mc_full, mce) 945 #define IBD_MCACHE_INSERT_NON(state, mce) \ 946 list_insert_head(&state->id_mc_non, mce) 947 #define IBD_MCACHE_FIND_FULL(state, mgid) \ 948 ibd_mcache_find(mgid, &state->id_mc_full) 949 #define IBD_MCACHE_FIND_NON(state, mgid) \ 950 ibd_mcache_find(mgid, &state->id_mc_non) 951 #define IBD_MCACHE_PULLOUT_FULL(state, mce) \ 952 list_remove(&state->id_mc_full, mce) 953 #define IBD_MCACHE_PULLOUT_NON(state, mce) \ 954 list_remove(&state->id_mc_non, mce) 955 956 /* 957 * AH and MCE active list manipulation: 958 * 959 * Multicast disable requests and MCG delete traps are two cases 960 * where the active AH entry for the mcg (if any unreferenced one exists) 961 * will be moved to the free list (to force the next Tx to the mcg to 962 * join the MCG in SendOnly mode). Port up handling will also move AHs 963 * from active to free list. 964 * 965 * In the case when some transmits are still pending on an entry 966 * for an mcg, but a multicast disable has already been issued on the 967 * mcg, there are some options to consider to preserve the join state 968 * to ensure the emitted packet is properly routed on the IBA fabric. 969 * For the AH, we can 970 * 1.
take out of active list at multicast disable time. 971 * 2. take out of active list only when last pending Tx completes. 972 * For the MCE, we can 973 * 3. take out of active list at multicast disable time. 974 * 4. take out of active list only when last pending Tx completes. 975 * 5. move from active list to stale list at multicast disable time. 976 * We choose to use 2,4. We use option 4 so that if a multicast enable 977 * is tried before the pending Tx completes, the enable code finds the 978 * mce in the active list and just has to make sure it will not be reaped 979 * (ie the mcg leave done) when the pending Tx does complete. Alternatively, 980 * a stale list (#5) that would be checked in the enable code would need 981 * to be implemented. Option 2 is used, because otherwise, a Tx attempt 982 * after the multicast disable would try to put an AH in the active list, 983 * and associate the mce it finds in the active list to this new AH, 984 * whereas the mce is already associated with the previous AH (taken off 985 * the active list), and will be removed once the pending Tx's complete 986 * (unless a reference count on mce's is implemented). One implication of 987 * using 2,4 is that new Tx's posted before the pending Tx's complete will 988 * grab new references on the AH, further delaying the leave. 989 * 990 * In the case of mcg delete (or create) trap when the port is sendonly 991 * joined, the AH and MCE handling is different: the AH and MCE has to be 992 * immediately taken off the active lists (forcing a join and path lookup 993 * at the next Tx is the only guaranteed means of ensuring a proper Tx 994 * to an mcg as it is repeatedly created and deleted and goes thru 995 * reincarnations). 996 * 997 * When a port is already sendonly joined, and a multicast enable is 998 * attempted, the same mce structure is promoted; this ensures only a 999 * single mce on the active list tracks the most powerful join state. 1000 * 1001 * In the case of port up event handling, the MCE for sendonly membership 1002 * is freed up, and the ACE is put into the free list as soon as possible 1003 * (depending on whether posted Tx's have completed). For fullmembership 1004 * MCE's though, the ACE is similarly handled; but the MCE is kept around 1005 * (a re-JOIN is attempted) only if the DLPI leave has not already been 1006 * done; else the mce is deconstructed (mc_fullreap case). 1007 * 1008 * MCG creation and deletion trap handling: 1009 * 1010 * These traps are unreliable (meaning sometimes the trap might never 1011 * be delivered to the subscribed nodes) and may arrive out-of-order 1012 * since they use UD transport. An alternative to relying on these 1013 * unreliable traps is to poll for mcg presence every so often, but 1014 * instead of doing that, we try to be as conservative as possible 1015 * while handling the traps, and hope that the traps do arrive at 1016 * the subscribed nodes soon. Note that if a node is fullmember 1017 * joined to an mcg, it can not possibly receive a mcg create/delete 1018 * trap for that mcg (by fullmember definition); if it does, it is 1019 * an old trap from a previous incarnation of the mcg. 1020 * 1021 * Whenever a trap is received, the driver cleans up its sendonly 1022 * membership to the group; we choose to do a sendonly leave even 1023 * on a creation trap to handle the case of a prior deletion of the mcg 1024 * having gone unnoticed. Consider an example scenario: 1025 * T1: MCG M is deleted, and fires off deletion trap D1. 
1026 * T2: MCG M is recreated, fires off creation trap C1, which is lost. 1027 * T3: Node N tries to transmit to M, joining in sendonly mode. 1028 * T4: MCG M is deleted, and fires off deletion trap D2. 1029 * T5: N receives a deletion trap, but can not distinguish D1 from D2. 1030 * If the trap is D2, then a LEAVE is not required, since the mcg 1031 * is already deleted; but if it is D1, a LEAVE is required. A safe 1032 * approach is to always LEAVE, but the SM may be confused if it 1033 * receives a LEAVE without a prior JOIN. 1034 * 1035 * Management of the non-membership to an mcg is similar to the above, 1036 * except that if the interface is in promiscuous mode, it is required 1037 * to attempt to re-join the mcg after receiving a trap. Unfortunately, 1038 * if the re-join attempt fails (in which case a warning message needs 1039 * to be printed), it is not clear whether it failed due to the mcg not 1040 * existing, or some fabric/hca issues, due to the delayed nature of 1041 * trap delivery. Querying the SA to establish presence/absence of the 1042 * mcg is also racy at best. Thus, the driver just prints a warning 1043 * message when it can not rejoin after receiving a create trap, although 1044 * this might be (on rare occasions) a spurious warning if the create trap is 1045 * received after the mcg was deleted. 1046 */ 1047 1048 /* 1049 * Implementation of atomic "recycle" bits and reference count 1050 * on address handles. This utilizes the fact that the maximum reference 1051 * count on any handle is limited by the number of send wqes, thus 1052 * high bits in the ac_ref field can be used as the recycle bits, 1053 * and only the low bits hold the number of pending Tx requests. 1054 * This atomic AH reference counting allows the Tx completion 1055 * handler not to acquire the id_ac_mutex to process every completion, 1056 * thus reducing lock contention problems between completion and 1057 * the Tx path. 1058 */ 1059 #define CYCLEVAL 0x80000 1060 #define CLEAR_REFCYCLE(ace) (ace)->ac_ref = 0 1061 #define CYCLE_SET(ace) (((ace)->ac_ref & CYCLEVAL) == CYCLEVAL) 1062 #define GET_REF(ace) ((ace)->ac_ref) 1063 #define GET_REF_CYCLE(ace) ( \ 1064 /* \ 1065 * Make sure "cycle" bit is set. \ 1066 */ \ 1067 ASSERT(CYCLE_SET(ace)), \ 1068 ((ace)->ac_ref & ~(CYCLEVAL)) \ 1069 ) 1070 #define INC_REF(ace, num) { \ 1071 atomic_add_32(&(ace)->ac_ref, num); \ 1072 } 1073 #define SET_CYCLE_IF_REF(ace) ( \ 1074 CYCLE_SET(ace) ? B_TRUE : \ 1075 atomic_add_32_nv(&ace->ac_ref, CYCLEVAL) == \ 1076 CYCLEVAL ? \ 1077 /* \ 1078 * Clear the "cycle" bit we just set; \ 1079 * ref count known to be 0 from above. \ 1080 */ \ 1081 CLEAR_REFCYCLE(ace), B_FALSE : \ 1082 /* \ 1083 * We set "cycle" bit; let caller know. \ 1084 */ \ 1085 B_TRUE \ 1086 ) 1087 #define DEC_REF_DO_CYCLE(ace) ( \ 1088 atomic_add_32_nv(&ace->ac_ref, -1) == \ 1089 CYCLEVAL ? \ 1090 /* \ 1091 * Ref count known to be 0 from above. \ 1092 */ \ 1093 B_TRUE : \ 1094 B_FALSE \ 1095 ) 1096 1097 static void * 1098 list_get_head(list_t *list) 1099 { 1100 list_node_t *lhead = list_head(list); 1101 1102 if (lhead != NULL) 1103 list_remove(list, lhead); 1104 return (lhead); 1105 } 1106 1107 /* 1108 * This is always guaranteed to be able to queue the work. 1109 */ 1110 static void 1111 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op) 1112 { 1113 /* Initialize request */ 1114 DPRINT(1, "ibd_queue_work_slot : op: %d \n", op); 1115 ptr->rq_op = op; 1116 1117 /* 1118 * Queue provided slot onto request pool.
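 *
 * A typical caller (see ibd_acache_lookup() for the real code, NULL
 * check omitted here) prepares the request roughly as follows before
 * handing it to us:
 *
 *	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
 *	bcopy(mac, &req->rq_mac, IPOIB_ADDRL);
 *	ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);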
1119 */ 1120 mutex_enter(&state->id_acache_req_lock); 1121 list_insert_tail(&state->id_req_list, ptr); 1122 1123 /* Go, fetch, async thread */ 1124 cv_signal(&state->id_acache_req_cv); 1125 mutex_exit(&state->id_acache_req_lock); 1126 } 1127 1128 /* 1129 * Main body of the per interface async thread. 1130 */ 1131 static void 1132 ibd_async_work(ibd_state_t *state) 1133 { 1134 ibd_req_t *ptr; 1135 callb_cpr_t cprinfo; 1136 1137 mutex_enter(&state->id_acache_req_lock); 1138 CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock, 1139 callb_generic_cpr, "ibd_async_work"); 1140 1141 for (;;) { 1142 ptr = list_get_head(&state->id_req_list); 1143 if (ptr != NULL) { 1144 mutex_exit(&state->id_acache_req_lock); 1145 1146 /* 1147 * Once we have done the operation, there is no 1148 * guarantee the request slot is going to be valid; 1149 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP, 1150 * TRAP). 1151 * 1152 * Perform the request. 1153 */ 1154 switch (ptr->rq_op) { 1155 case IBD_ASYNC_GETAH: 1156 ibd_async_acache(state, &ptr->rq_mac); 1157 break; 1158 case IBD_ASYNC_JOIN: 1159 case IBD_ASYNC_LEAVE: 1160 ibd_async_multicast(state, 1161 ptr->rq_gid, ptr->rq_op); 1162 break; 1163 case IBD_ASYNC_PROMON: 1164 ibd_async_setprom(state); 1165 break; 1166 case IBD_ASYNC_PROMOFF: 1167 ibd_async_unsetprom(state); 1168 break; 1169 case IBD_ASYNC_REAP: 1170 ibd_async_reap_group(state, 1171 ptr->rq_ptr, ptr->rq_gid, 1172 IB_MC_JSTATE_FULL); 1173 /* 1174 * the req buf is contained in the mce 1175 * structure, so we do not need 1176 * to free it here. 1177 */ 1178 ptr = NULL; 1179 break; 1180 case IBD_ASYNC_TRAP: 1181 ibd_async_trap(state, ptr); 1182 break; 1183 case IBD_ASYNC_SCHED: 1184 ibd_async_txsched(state); 1185 break; 1186 case IBD_ASYNC_LINK: 1187 ibd_async_link(state, ptr); 1188 break; 1189 case IBD_ASYNC_EXIT: 1190 mutex_enter(&state->id_acache_req_lock); 1191 #ifndef __lock_lint 1192 CALLB_CPR_EXIT(&cprinfo); 1193 #else 1194 mutex_exit(&state->id_acache_req_lock); 1195 #endif 1196 return; 1197 } 1198 if (ptr != NULL) 1199 kmem_cache_free(state->id_req_kmc, ptr); 1200 1201 mutex_enter(&state->id_acache_req_lock); 1202 } else { 1203 #ifndef __lock_lint 1204 /* 1205 * Nothing to do: wait till a new request arrives. 1206 */ 1207 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1208 cv_wait(&state->id_acache_req_cv, 1209 &state->id_acache_req_lock); 1210 CALLB_CPR_SAFE_END(&cprinfo, 1211 &state->id_acache_req_lock); 1212 #endif 1213 } 1214 } 1215 1216 /*NOTREACHED*/ 1217 _NOTE(NOT_REACHED) 1218 } 1219 1220 /* 1221 * Return when it is safe to queue requests to the async daemon; primarily 1222 * for subnet trap and async event handling. Disallow requests before the 1223 * daemon is created, and when interface deinitialization starts. 1224 */ 1225 static boolean_t 1226 ibd_async_safe(ibd_state_t *state) 1227 { 1228 mutex_enter(&state->id_trap_lock); 1229 if (state->id_trap_stop) { 1230 mutex_exit(&state->id_trap_lock); 1231 return (B_FALSE); 1232 } 1233 state->id_trap_inprog++; 1234 mutex_exit(&state->id_trap_lock); 1235 return (B_TRUE); 1236 } 1237 1238 /* 1239 * Wake up ibd_drv_fini() if the detach code is waiting for pending subnet 1240 * trap or event handling to complete to kill the async thread and deconstruct 1241 * the mcg/ace list.
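 *
 * The expected bracketing is roughly the following (a sketch; the real
 * callers are the subnet trap and async event handlers):
 *
 *	if (!ibd_async_safe(state))
 *		return;		(detach in progress; drop the event)
 *	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
 *	... fill in the request ...
 *	ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP);
 *
 * and the queued task (e.g. ibd_async_trap()) calls ibd_async_done() once
 * it has finished, allowing a waiting ibd_drv_fini() to proceed.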
1242 */ 1243 static void 1244 ibd_async_done(ibd_state_t *state) 1245 { 1246 mutex_enter(&state->id_trap_lock); 1247 if (--state->id_trap_inprog == 0) 1248 cv_signal(&state->id_trap_cv); 1249 mutex_exit(&state->id_trap_lock); 1250 } 1251 1252 /* 1253 * Hash functions: 1254 * ibd_hash_by_id: Returns the qpn as the hash entry into the bucket. 1255 * ibd_hash_key_cmp: Compares two keys; returns 0 on a match, else 1. 1256 * These operate on mac addresses input into ibd_send, but there is no 1257 * guarantee on the alignment of the ipoib_mac_t structure. 1258 */ 1259 /*ARGSUSED*/ 1260 static uint_t 1261 ibd_hash_by_id(void *hash_data, mod_hash_key_t key) 1262 { 1263 ulong_t ptraddr = (ulong_t)key; 1264 uint_t hval; 1265 1266 /* 1267 * If the input address is 4 byte aligned, we can just dereference 1268 * it. This is most common, since IP will send in a 4 byte aligned 1269 * IP header, which implies the 24 byte IPoIB pseudo header will be 1270 * 4 byte aligned too. 1271 */ 1272 if ((ptraddr & 3) == 0) 1273 return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn); 1274 1275 bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t)); 1276 return (hval); 1277 } 1278 1279 static int 1280 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2) 1281 { 1282 if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0) 1283 return (0); 1284 else 1285 return (1); 1286 } 1287 1288 /* 1289 * Initialize all the per interface caches and lists; AH cache, 1290 * MCG list etc. 1291 */ 1292 static int 1293 ibd_acache_init(ibd_state_t *state) 1294 { 1295 ibd_ace_t *ce; 1296 int i; 1297 1298 mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL); 1299 cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL); 1300 1301 mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL); 1302 mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL); 1303 list_create(&state->id_ah_free, sizeof (ibd_ace_t), 1304 offsetof(ibd_ace_t, ac_list)); 1305 list_create(&state->id_ah_active, sizeof (ibd_ace_t), 1306 offsetof(ibd_ace_t, ac_list)); 1307 state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash", 1308 IBD_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor, 1309 ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP); 1310 list_create(&state->id_mc_full, sizeof (ibd_mce_t), 1311 offsetof(ibd_mce_t, mc_list)); 1312 list_create(&state->id_mc_non, sizeof (ibd_mce_t), 1313 offsetof(ibd_mce_t, mc_list)); 1314 list_create(&state->id_req_list, sizeof (ibd_req_t), 1315 offsetof(ibd_req_t, rq_list)); 1316 1317 state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) * 1318 IBD_NUM_AH, KM_SLEEP); 1319 for (i = 0; i < IBD_NUM_AH; i++, ce++) { 1320 if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS, 1321 state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) { 1322 ibd_acache_fini(state); 1323 return (DDI_FAILURE); 1324 } else { 1325 CLEAR_REFCYCLE(ce); 1326 ce->ac_mce = NULL; 1327 IBD_ACACHE_INSERT_FREE(state, ce); 1328 } 1329 } 1330 return (DDI_SUCCESS); 1331 } 1332 1333 static void 1334 ibd_acache_fini(ibd_state_t *state) 1335 { 1336 ibd_ace_t *ptr; 1337 1338 mutex_enter(&state->id_ac_mutex); 1339 1340 while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) { 1341 ASSERT(GET_REF(ptr) == 0); 1342 (void) ibt_free_ud_dest(ptr->ac_dest); 1343 } 1344 1345 while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) { 1346 ASSERT(GET_REF(ptr) == 0); 1347 (void) ibt_free_ud_dest(ptr->ac_dest); 1348 } 1349 1350 list_destroy(&state->id_ah_free); 1351 list_destroy(&state->id_ah_active); 1352 list_destroy(&state->id_mc_full); 1353
list_destroy(&state->id_mc_non); 1354 list_destroy(&state->id_req_list); 1355 kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * IBD_NUM_AH); 1356 mutex_exit(&state->id_ac_mutex); 1357 mutex_destroy(&state->id_ac_mutex); 1358 mutex_destroy(&state->id_mc_mutex); 1359 mutex_destroy(&state->id_acache_req_lock); 1360 cv_destroy(&state->id_acache_req_cv); 1361 } 1362 1363 /* 1364 * Search AH active hash list for a cached path to input destination. 1365 * If we are "just looking", hold == F. When we are in the Tx path, 1366 * we set hold == T to grab a reference on the AH so that it can not 1367 * be recycled to a new destination while the Tx request is posted. 1368 */ 1369 static ibd_ace_t * 1370 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num) 1371 { 1372 ibd_ace_t *ptr; 1373 1374 ASSERT(mutex_owned(&state->id_ac_mutex)); 1375 1376 /* 1377 * Do hash search. 1378 */ 1379 if (mod_hash_find(state->id_ah_active_hash, 1380 (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) { 1381 if (hold) 1382 INC_REF(ptr, num); 1383 return (ptr); 1384 } 1385 return (NULL); 1386 } 1387 1388 /* 1389 * This is called by the tx side; if an initialized AH is found in 1390 * the active list, it is locked down and can be used; if no entry 1391 * is found, an async request is queued to do path resolution. 1392 */ 1393 static ibd_ace_t * 1394 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe) 1395 { 1396 ibd_ace_t *ptr; 1397 ibd_req_t *req; 1398 1399 /* 1400 * Only attempt to print when we can; in the mdt pattr case, the 1401 * address is not aligned properly. 1402 */ 1403 if (((ulong_t)mac & 3) == 0) { 1404 DPRINT(4, 1405 "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X", 1406 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1407 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1408 htonl(mac->ipoib_gidsuff[1])); 1409 } 1410 1411 mutex_enter(&state->id_ac_mutex); 1412 1413 if ((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL) { 1414 mutex_exit(&state->id_ac_mutex); 1415 return (ptr); 1416 } 1417 1418 /* 1419 * Implementation of a single outstanding async request; if 1420 * the operation is not started yet, queue a request and move 1421 * to ongoing state. Remember in id_ah_addr for which address 1422 * we are queueing the request, in case we need to flag an error; 1423 * Any further requests, for the same or different address, until 1424 * the operation completes, is sent back to GLDv3 to be retried. 1425 * The async thread will update id_ah_op with an error indication 1426 * or will set it to indicate the next look up can start; either 1427 * way, it will mac_tx_update() so that all blocked requests come 1428 * back here. 1429 */ 1430 *err = EAGAIN; 1431 if (state->id_ah_op == IBD_OP_NOTSTARTED) { 1432 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 1433 if (req != NULL) { 1434 /* 1435 * We did not even find the entry; queue a request 1436 * for it. 1437 */ 1438 bcopy(mac, &(req->rq_mac), IPOIB_ADDRL); 1439 ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH); 1440 state->id_ah_op = IBD_OP_ONGOING; 1441 bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL); 1442 } 1443 } else if ((state->id_ah_op != IBD_OP_ONGOING) && 1444 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) { 1445 /* 1446 * Check the status of the pathrecord lookup request 1447 * we had queued before. 
1448 */ 1449 if (state->id_ah_op == IBD_OP_ERRORED) { 1450 *err = EFAULT; 1451 state->id_ah_error++; 1452 } else { 1453 /* 1454 * IBD_OP_ROUTERED case: We need to send to the 1455 * all-router MCG. If we can find the AH for 1456 * the mcg, the Tx will be attempted. If we 1457 * do not find the AH, we return NORESOURCES 1458 * to retry. 1459 */ 1460 ipoib_mac_t routermac; 1461 1462 (void) ibd_get_allroutergroup(state, mac, &routermac); 1463 ptr = ibd_acache_find(state, &routermac, B_TRUE, 1464 numwqe); 1465 } 1466 state->id_ah_op = IBD_OP_NOTSTARTED; 1467 } else if ((state->id_ah_op != IBD_OP_ONGOING) && 1468 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) { 1469 /* 1470 * This case can happen when we get a higher band 1471 * packet. The easiest way is to reset the state machine 1472 * to accommodate the higher priority packet. 1473 */ 1474 state->id_ah_op = IBD_OP_NOTSTARTED; 1475 } 1476 mutex_exit(&state->id_ac_mutex); 1477 1478 return (ptr); 1479 } 1480 1481 /* 1482 * Grab a not-currently-in-use AH/PathRecord from the active 1483 * list to recycle to a new destination. Only the async thread 1484 * executes this code. 1485 */ 1486 static ibd_ace_t * 1487 ibd_acache_get_unref(ibd_state_t *state) 1488 { 1489 ibd_ace_t *ptr = list_head(&state->id_ah_active); 1490 1491 ASSERT(mutex_owned(&state->id_ac_mutex)); 1492 1493 /* 1494 * Do plain linear search. 1495 */ 1496 while (ptr != NULL) { 1497 /* 1498 * Note that it is possible that the "cycle" bit 1499 * is set on the AH w/o any reference count. The 1500 * mcg must have been deleted, and the tx cleanup 1501 * just decremented the reference count to 0, but 1502 * hasn't gotten around to grabbing the id_ac_mutex 1503 * to move the AH into the free list. 1504 */ 1505 if (GET_REF(ptr) == 0) { 1506 IBD_ACACHE_PULLOUT_ACTIVE(state, ptr); 1507 break; 1508 } 1509 ptr = list_next(&state->id_ah_active, ptr); 1510 } 1511 return (ptr); 1512 } 1513 1514 /* 1515 * Invoked to clean up AH from active list in case of multicast 1516 * disable and to handle sendonly memberships during mcg traps. 1517 * And for port up processing for multicast and unicast AHs. 1518 * Normally, the AH is taken off the active list, and put into 1519 * the free list to be recycled for a new destination. In case 1520 * Tx requests on the AH have not completed yet, the AH is marked 1521 * for reaping (which will put the AH on the free list) once the Tx's 1522 * complete; in this case, depending on the "force" input, we take 1523 * out the AH from the active list right now, or leave it also for 1524 * the reap operation. Returns TRUE if the AH is taken off the active 1525 * list (and either put into the free list right now, or arranged for 1526 * later), FALSE otherwise. 1527 */ 1528 static boolean_t 1529 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force) 1530 { 1531 ibd_ace_t *acactive; 1532 boolean_t ret = B_TRUE; 1533 1534 ASSERT(mutex_owned(&state->id_ac_mutex)); 1535 1536 if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) { 1537 1538 /* 1539 * Note that the AH might already have the cycle bit set 1540 * on it; this might happen if sequences of multicast 1541 * enables and disables are coming so fast, that posted 1542 * Tx's to the mcg have not completed yet, and the cycle 1543 * bit is set successively by each multicast disable. 
1544 */ 1545 if (SET_CYCLE_IF_REF(acactive)) { 1546 if (!force) { 1547 /* 1548 * The ace is kept on the active list, further 1549 * Tx's can still grab a reference on it; the 1550 * ace is reaped when all pending Tx's 1551 * referencing the AH complete. 1552 */ 1553 ret = B_FALSE; 1554 } else { 1555 /* 1556 * In the mcg trap case, we always pull the 1557 * AH from the active list. And also the port 1558 * up multi/unicast case. 1559 */ 1560 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1561 acactive->ac_mce = NULL; 1562 } 1563 } else { 1564 /* 1565 * Determined the ref count is 0, thus reclaim 1566 * immediately after pulling out the ace from 1567 * the active list. 1568 */ 1569 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1570 acactive->ac_mce = NULL; 1571 IBD_ACACHE_INSERT_FREE(state, acactive); 1572 } 1573 1574 } 1575 return (ret); 1576 } 1577 1578 /* 1579 * Helper function for async path record lookup. If we are trying to 1580 * Tx to a MCG, check our membership, possibly trying to join the 1581 * group if required. If that fails, try to send the packet to the 1582 * all router group (indicated by the redirect output), pointing 1583 * the input mac address to the router mcg address. 1584 */ 1585 static ibd_mce_t * 1586 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect) 1587 { 1588 ib_gid_t mgid; 1589 ibd_mce_t *mce; 1590 ipoib_mac_t routermac; 1591 1592 *redirect = B_FALSE; 1593 ibd_n2h_gid(mac, &mgid); 1594 1595 /* 1596 * Check the FullMember+SendOnlyNonMember list. 1597 * Since we are the only one who manipulates the 1598 * id_mc_full list, no locks are needed. 1599 */ 1600 mce = IBD_MCACHE_FIND_FULL(state, mgid); 1601 if (mce != NULL) { 1602 DPRINT(4, "ibd_async_mcache : already joined to group"); 1603 return (mce); 1604 } 1605 1606 /* 1607 * Not found; try to join(SendOnlyNonMember) and attach. 1608 */ 1609 DPRINT(4, "ibd_async_mcache : not joined to group"); 1610 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1611 NULL) { 1612 DPRINT(4, "ibd_async_mcache : nonmem joined to group"); 1613 return (mce); 1614 } 1615 1616 /* 1617 * MCGroup not present; try to join the all-router group. If 1618 * any of the following steps succeed, we will be redirecting 1619 * to the all router group. 1620 */ 1621 DPRINT(4, "ibd_async_mcache : nonmem join failed"); 1622 if (!ibd_get_allroutergroup(state, mac, &routermac)) 1623 return (NULL); 1624 *redirect = B_TRUE; 1625 ibd_n2h_gid(&routermac, &mgid); 1626 bcopy(&routermac, mac, IPOIB_ADDRL); 1627 DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n", 1628 mgid.gid_prefix, mgid.gid_guid); 1629 1630 /* 1631 * Are we already joined to the router group? 1632 */ 1633 if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) { 1634 DPRINT(4, "ibd_async_mcache : using already joined router" 1635 "group\n"); 1636 return (mce); 1637 } 1638 1639 /* 1640 * Can we join(SendOnlyNonMember) the router group? 1641 */ 1642 DPRINT(4, "ibd_async_mcache : attempting join to router grp"); 1643 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1644 NULL) { 1645 DPRINT(4, "ibd_async_mcache : joined to router grp"); 1646 return (mce); 1647 } 1648 1649 return (NULL); 1650 } 1651 1652 /* 1653 * Async path record lookup code. 
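 *
 * In outline, the function below:
 *
 *	1. For a multicast destination, ensures some form of membership via
 *	   ibd_async_mcache(), possibly redirecting to the all-router group.
 *	2. Takes an ACE off the free list, or recycles an unreferenced one
 *	   from the active list.
 *	3. Resolves the path with ibt_get_paths() and programs the UD
 *	   destination with ibt_modify_ud_dest().
 *	4. Inserts the ACE into the active list/hash and records the outcome
 *	   in id_ah_op so that blocked transmits can be retried.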
1654 */ 1655 static void 1656 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac) 1657 { 1658 ibd_ace_t *ce; 1659 ibd_mce_t *mce = NULL; 1660 ibt_path_attr_t path_attr; 1661 ibt_path_info_t path_info; 1662 ib_gid_t destgid; 1663 int ret = IBD_OP_NOTSTARTED; 1664 1665 DPRINT(4, "ibd_async_acache : %08X:%08X:%08X:%08X:%08X", 1666 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1667 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1668 htonl(mac->ipoib_gidsuff[1])); 1669 1670 /* 1671 * Check whether we are trying to transmit to a MCG. 1672 * In that case, we need to make sure we are a member of 1673 * the MCG. 1674 */ 1675 if (mac->ipoib_qpn == htonl(IB_MC_QPN)) { 1676 boolean_t redirected; 1677 1678 /* 1679 * If we can not find or join the group or even 1680 * redirect, error out. 1681 */ 1682 if ((mce = ibd_async_mcache(state, mac, &redirected)) == 1683 NULL) { 1684 state->id_ah_op = IBD_OP_ERRORED; 1685 return; 1686 } 1687 1688 /* 1689 * If we got redirected, we need to determine whether 1690 * the AH for the new mcg is in the cache already, and 1691 * not pull it in then; otherwise proceed to get the 1692 * path for the new mcg. There is no guarantee that 1693 * if the AH is currently in the cache, it will still be 1694 * there when we look in ibd_acache_lookup(), but that's 1695 * okay, we will come back here. 1696 */ 1697 if (redirected) { 1698 ret = IBD_OP_ROUTERED; 1699 DPRINT(4, "ibd_async_acache : redirected to " 1700 "%08X:%08X:%08X:%08X:%08X", 1701 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1702 htonl(mac->ipoib_gidpref[1]), 1703 htonl(mac->ipoib_gidsuff[0]), 1704 htonl(mac->ipoib_gidsuff[1])); 1705 1706 mutex_enter(&state->id_ac_mutex); 1707 if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) { 1708 state->id_ah_op = IBD_OP_ROUTERED; 1709 mutex_exit(&state->id_ac_mutex); 1710 DPRINT(4, "ibd_async_acache : router AH found"); 1711 return; 1712 } 1713 mutex_exit(&state->id_ac_mutex); 1714 } 1715 } 1716 1717 /* 1718 * Get an AH from the free list. 1719 */ 1720 mutex_enter(&state->id_ac_mutex); 1721 if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) { 1722 /* 1723 * No free ones; try to grab an unreferenced active 1724 * one. Maybe we need to make the active list LRU, 1725 * but that will create more work for Tx callbacks. 1726 * Is there a way of not having to pull out the 1727 * entry from the active list, but just indicate it 1728 * is being recycled? Yes, but that creates one more 1729 * check in the fast lookup path. 1730 */ 1731 if ((ce = ibd_acache_get_unref(state)) == NULL) { 1732 /* 1733 * Pretty serious shortage now. 1734 */ 1735 state->id_ah_op = IBD_OP_NOTSTARTED; 1736 mutex_exit(&state->id_ac_mutex); 1737 DPRINT(10, "ibd_async_acache : failed to find AH " 1738 "slot\n"); 1739 return; 1740 } 1741 /* 1742 * We could check whether ac_mce points to a SendOnly 1743 * member and drop that membership now. Or do it lazily 1744 * at detach time. 1745 */ 1746 ce->ac_mce = NULL; 1747 } 1748 mutex_exit(&state->id_ac_mutex); 1749 ASSERT(ce->ac_mce == NULL); 1750 1751 /* 1752 * Update the entry. 
1753 */ 1754 bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL); 1755
1756 bzero(&path_info, sizeof (path_info)); 1757 bzero(&path_attr, sizeof (ibt_path_attr_t));
1758 path_attr.pa_sgid = state->id_sgid; 1759 path_attr.pa_num_dgids = 1;
1760 ibd_n2h_gid(&ce->ac_mac, &destgid); 1761 path_attr.pa_dgids = &destgid;
1762 path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
1763 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 1764 &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) {
1765 DPRINT(10, "ibd_async_acache : failed in ibt_get_paths"); 1766 goto error; 1767 }
1768 if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey, 1769 ntohl(ce->ac_mac.ipoib_qpn),
1770 &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) {
1771 DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest"); 1772 goto error; 1773 } 1774
1775 /* 1776 * mce is set whenever an AH is being associated with a
1777 * MCG; this will come in handy when we leave the MCG. The
1778 * lock protects Tx fastpath from scanning the active list. 1779 */
1780 if (mce != NULL) 1781 ce->ac_mce = mce;
1782 mutex_enter(&state->id_ac_mutex); 1783 IBD_ACACHE_INSERT_ACTIVE(state, ce);
1784 state->id_ah_op = ret; 1785 mutex_exit(&state->id_ac_mutex); 1786 return;
1787 error: 1788 /* 1789 * We might want to drop SendOnly membership here if we
1790 * joined above. The lock protects Tx callbacks inserting 1791 * into the free list. 1792 */
1793 mutex_enter(&state->id_ac_mutex); 1794 state->id_ah_op = IBD_OP_ERRORED;
1795 IBD_ACACHE_INSERT_FREE(state, ce); 1796 mutex_exit(&state->id_ac_mutex); 1797 } 1798
1799 /* 1800 * While restoring the port's presence on the subnet on a port up, it is
1801 * possible that the port goes down again. 1802 */
1803 static void 1804 ibd_async_link(ibd_state_t *state, ibd_req_t *req) 1805 {
1806 ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr;
1807 link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN : 1808 LINK_STATE_UP;
1809 ibd_mce_t *mce, *pmce; 1810 ibd_ace_t *ace, *pace; 1811
1812 DPRINT(10, "ibd_async_link(): %d", opcode); 1813
1814 /* 1815 * On a link up, revalidate the link speed/width. No point doing
1816 * this on a link down, since we will be unable to do SA operations,
1817 * defaulting to the lowest speed. Also notice that we update our
1818 * notion of speed before calling mac_link_update(), which will do
1819 * the necessary higher level notifications for speed changes. 1820 */
1821 if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) {
1822 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
1823 state->id_link_speed = ibd_get_portspeed(state);
1824 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 1825 } 1826
1827 /* 1828 * Do all the work required to establish our presence on 1829 * the subnet. 1830 */
1831 if (opcode == IBD_LINK_UP_ABSENT) {
1832 /* 1833 * If in promiscuous mode ... 1834 */
1835 if (state->id_prom_op == IBD_OP_COMPLETED) {
1836 /* 1837 * Drop all nonmembership. 1838 */ 1839 ibd_async_unsetprom(state); 1840
1841 /* 1842 * Then, try to regain nonmembership to all mcg's. 1843 */
1844 ibd_async_setprom(state); 1845 1846 } 1847
1848 /* 1849 * Drop all sendonly membership (which also gets rid of the
1850 * AHs); try to reacquire all full membership.
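 *
 * Note that this walk (and the active-AH walk further below) captures
 * the next element before operating on the current one, since
 * ibd_leave_group()/ibd_async_reap_group() may free the element being
 * processed:
 *
 *	mce = list_head(&state->id_mc_full);
 *	while ((pmce = mce) != NULL) {
 *		mce = list_next(&state->id_mc_full, mce);
 *		... operate on pmce, which may be freed here ...
 *	}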
1851 */ 1852 mce = list_head(&state->id_mc_full); 1853 while ((pmce = mce) != NULL) { 1854 mce = list_next(&state->id_mc_full, mce); 1855 if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON) 1856 ibd_leave_group(state, 1857 pmce->mc_info.mc_adds_vect.av_dgid, 1858 IB_MC_JSTATE_SEND_ONLY_NON); 1859 else 1860 ibd_reacquire_group(state, pmce); 1861 } 1862 1863 /* 1864 * Recycle all active AHs to free list (and if there are 1865 * pending posts, make sure they will go into the free list 1866 * once the Tx's complete). Grab the lock to prevent 1867 * concurrent Tx's as well as Tx cleanups. 1868 */ 1869 mutex_enter(&state->id_ac_mutex); 1870 ace = list_head(&state->id_ah_active); 1871 while ((pace = ace) != NULL) { 1872 boolean_t cycled; 1873 1874 ace = list_next(&state->id_ah_active, ace); 1875 mce = pace->ac_mce; 1876 cycled = ibd_acache_recycle(state, &pace->ac_mac, 1877 B_TRUE); 1878 /* 1879 * If this is for an mcg, it must be for a fullmember, 1880 * since we got rid of send-only members above when 1881 * processing the mce list. 1882 */ 1883 ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate == 1884 IB_MC_JSTATE_FULL))); 1885 1886 /* 1887 * Check if the fullmember mce needs to be torn down, 1888 * ie whether the DLPI disable has already been done. 1889 * If so, do some of the work of tx_cleanup, namely 1890 * causing leave (which will fail), detach and 1891 * mce-freeing. tx_cleanup will put the AH into free 1892 * list. The reason to duplicate some of this 1893 * tx_cleanup work is because we want to delete the 1894 * AH right now instead of waiting for tx_cleanup, to 1895 * force subsequent Tx's to reacquire an AH. 1896 */ 1897 if ((mce != NULL) && (mce->mc_fullreap)) 1898 ibd_async_reap_group(state, mce, 1899 mce->mc_info.mc_adds_vect.av_dgid, 1900 mce->mc_jstate); 1901 } 1902 mutex_exit(&state->id_ac_mutex); 1903 } 1904 1905 /* 1906 * mac handle is guaranteed to exist since driver does ibt_close_hca() 1907 * (which stops further events from being delivered) before 1908 * mac_unregister(). At this point, it is guaranteed that mac_register 1909 * has already been done. 1910 */ 1911 mutex_enter(&state->id_link_mutex); 1912 state->id_link_state = lstate; 1913 mac_link_update(state->id_mh, lstate); 1914 mutex_exit(&state->id_link_mutex); 1915 1916 ibd_async_done(state); 1917 } 1918 1919 /* 1920 * When the link is notified up, we need to do a few things, based 1921 * on the port's current p_init_type_reply claiming a reinit has been 1922 * done or not. The reinit steps are: 1923 * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify 1924 * the old Pkey and GID0 are correct. 1925 * 2. Register for mcg traps (already done by ibmf). 1926 * 3. If PreservePresenceReply indicates the SM has restored port's presence 1927 * in subnet, nothing more to do. Else go to next steps (on async daemon). 1928 * 4. Give up all sendonly memberships. 1929 * 5. Acquire all full memberships. 1930 * 6. In promiscuous mode, acquire all non memberships. 1931 * 7. Recycle all AHs to free list. 1932 */ 1933 static void 1934 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code) 1935 { 1936 ibt_hca_portinfo_t *port_infop; 1937 ibt_status_t ibt_status; 1938 uint_t psize, port_infosz; 1939 ibd_link_op_t opcode; 1940 ibd_req_t *req; 1941 1942 /* 1943 * Do not send a request to the async daemon if it has not 1944 * yet been created or is being destroyed. If the async 1945 * daemon has not yet been created, we still need to track 1946 * last known state of the link. 
If this code races with the 1947 * detach path, then we are assured that the detach path has 1948 * not yet done the ibt_close_hca (which waits for all async 1949 * events to complete). If the code races with the attach path, 1950 * we need to validate the pkey/gid (in the link_up case) if 1951 * the initialization path has already set these up and created 1952 * IBTF resources based on the values. 1953 */ 1954 mutex_enter(&state->id_link_mutex); 1955 1956 /* 1957 * If the init code in ibd_drv_init hasn't yet set up the 1958 * pkey/gid, nothing to do; that code will set the link state. 1959 */ 1960 if (state->id_link_state == LINK_STATE_UNKNOWN) { 1961 mutex_exit(&state->id_link_mutex); 1962 return; 1963 } 1964 1965 if ((code == IBT_EVENT_PORT_UP) || (code == IBT_CLNT_REREG_EVENT) || 1966 (code == IBT_PORT_CHANGE_EVENT)) { 1967 uint8_t itreply; 1968 boolean_t badup = B_FALSE; 1969 1970 ibt_status = ibt_query_hca_ports(state->id_hca_hdl, 1971 state->id_port, &port_infop, &psize, &port_infosz); 1972 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) { 1973 mutex_exit(&state->id_link_mutex); 1974 DPRINT(10, "ibd_link_up : failed in" 1975 " ibt_query_port()\n"); 1976 return; 1977 } 1978 1979 /* 1980 * If the link already went down by the time the handler gets 1981 * here, give up; we can not even validate pkey/gid since those 1982 * are not valid. 1983 */ 1984 if (port_infop->p_linkstate != IBT_PORT_ACTIVE) 1985 badup = B_TRUE; 1986 1987 itreply = port_infop->p_init_type_reply; 1988 1989 /* 1990 * In InitTypeReply, check if NoLoadReply == 1991 * PreserveContentReply == 0, in which case, verify Pkey/GID0. 1992 */ 1993 if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) && 1994 ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0) && 1995 (!badup)) { 1996 /* 1997 * Check that the subnet part of GID0 has not changed. 1998 */ 1999 if (bcmp(port_infop->p_sgid_tbl, &state->id_sgid, 2000 sizeof (ib_gid_t)) != 0) 2001 badup = B_TRUE; 2002 2003 /* 2004 * Check that Pkey/index mapping is still valid. 2005 */ 2006 if ((port_infop->p_pkey_tbl_sz <= state->id_pkix) || 2007 (port_infop->p_pkey_tbl[state->id_pkix] != 2008 state->id_pkey)) 2009 badup = B_TRUE; 2010 } 2011 2012 /* 2013 * In InitTypeReply, if PreservePresenceReply indicates the SM 2014 * has ensured that the port's presence in mcg, traps etc is 2015 * intact, nothing more to do. 2016 */ 2017 opcode = IBD_LINK_UP_ABSENT; 2018 if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == 2019 SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) 2020 opcode = IBD_LINK_UP; 2021 2022 ibt_free_portinfo(port_infop, port_infosz); 2023 2024 if (badup) { 2025 code = IBT_ERROR_PORT_DOWN; 2026 } else if (code == IBT_PORT_CHANGE_EVENT) { 2027 mutex_exit(&state->id_link_mutex); 2028 return; 2029 } 2030 } 2031 2032 if (!ibd_async_safe(state)) { 2033 state->id_link_state = (((code == IBT_EVENT_PORT_UP) || 2034 (code == IBT_CLNT_REREG_EVENT)) ? LINK_STATE_UP : 2035 LINK_STATE_DOWN); 2036 mutex_exit(&state->id_link_mutex); 2037 return; 2038 } 2039 mutex_exit(&state->id_link_mutex); 2040 2041 if (code == IBT_ERROR_PORT_DOWN) 2042 opcode = IBD_LINK_DOWN; 2043 2044 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 2045 req->rq_ptr = (void *)opcode; 2046 ibd_queue_work_slot(state, req, IBD_ASYNC_LINK); 2047 } 2048 2049 /* 2050 * For the port up/down events, IBTL guarantees there will not be concurrent 2051 * invocations of the handler. 
IBTL might coalesce link transition events, 2052 * and not invoke the handler for _each_ up/down transition, but it will 2053 * invoke the handler with last known state 2054 */ 2055 static void 2056 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 2057 ibt_async_code_t code, ibt_async_event_t *event) 2058 { 2059 ibd_state_t *state = (ibd_state_t *)clnt_private; 2060 2061 switch (code) { 2062 case IBT_ERROR_CATASTROPHIC_CHAN: 2063 ibd_print_warn(state, "catastrophic channel error"); 2064 break; 2065 case IBT_ERROR_CQ: 2066 ibd_print_warn(state, "completion queue error"); 2067 break; 2068 case IBT_PORT_CHANGE_EVENT: 2069 /* 2070 * Events will be delivered to all instances that have 2071 * done ibt_open_hca() but not yet done ibt_close_hca(). 2072 * Only need to do work for our port; IBTF will deliver 2073 * events for other ports on the hca we have ibt_open_hca'ed 2074 * too. Note that ibd_drv_init() initializes id_port before 2075 * doing ibt_open_hca(). 2076 */ 2077 ASSERT(state->id_hca_hdl == hca_hdl); 2078 if (state->id_port != event->ev_port) 2079 break; 2080 2081 if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) == 2082 IBT_PORT_CHANGE_PKEY) { 2083 ibd_link_mod(state, code); 2084 } 2085 break; 2086 case IBT_ERROR_PORT_DOWN: 2087 case IBT_CLNT_REREG_EVENT: 2088 case IBT_EVENT_PORT_UP: 2089 /* 2090 * Events will be delivered to all instances that have 2091 * done ibt_open_hca() but not yet done ibt_close_hca(). 2092 * Only need to do work for our port; IBTF will deliver 2093 * events for other ports on the hca we have ibt_open_hca'ed 2094 * too. Note that ibd_drv_init() initializes id_port before 2095 * doing ibt_open_hca(). 2096 */ 2097 ASSERT(state->id_hca_hdl == hca_hdl); 2098 if (state->id_port != event->ev_port) 2099 break; 2100 2101 ibd_link_mod(state, code); 2102 break; 2103 2104 case IBT_HCA_ATTACH_EVENT: 2105 case IBT_HCA_DETACH_EVENT: 2106 /* 2107 * When a new card is plugged to the system, attach_event is 2108 * invoked. Additionally, a cfgadm needs to be run to make the 2109 * card known to the system, and an ifconfig needs to be run to 2110 * plumb up any ibd interfaces on the card. In the case of card 2111 * unplug, a cfgadm is run that will trigger any RCM scripts to 2112 * unplumb the ibd interfaces on the card; when the card is 2113 * actually unplugged, the detach_event is invoked; 2114 * additionally, if any ibd instances are still active on the 2115 * card (eg there were no associated RCM scripts), driver's 2116 * detach routine is invoked. 2117 */ 2118 break; 2119 default: 2120 break; 2121 } 2122 } 2123 2124 /* 2125 * Attach device to the IO framework. 
2126 */ 2127 static int 2128 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2129 { 2130 mac_register_t *macp; 2131 ibd_state_t *state; 2132 int instance; 2133 int err; 2134 2135 switch (cmd) { 2136 case DDI_ATTACH: 2137 break; 2138 case DDI_RESUME: 2139 /* This driver does not support resume */ 2140 default: 2141 return (DDI_FAILURE); 2142 } 2143 2144 /* 2145 * Allocate soft device data structure 2146 */ 2147 instance = ddi_get_instance(dip); 2148 if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) 2149 return (DDI_FAILURE); 2150 state = ddi_get_soft_state(ibd_list, instance); 2151 2152 /* pre ibt_attach() soft state initialization */ 2153 if (ibd_state_init(state, dip) != DDI_SUCCESS) { 2154 DPRINT(10, "ibd_attach : failed in ibd_state_init()"); 2155 goto attach_fail_state_init; 2156 } 2157 2158 /* alloc rx soft intr */ 2159 if ((ibd_rx_softintr == 1) && 2160 ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx, 2161 NULL, NULL, ibd_intr, (caddr_t)state) != DDI_SUCCESS) { 2162 DPRINT(10, "ibd_attach : failed in ddi_add_softintr()"); 2163 goto attach_fail_ddi_add_rx_softintr; 2164 } 2165 2166 /* alloc tx soft intr */ 2167 if ((ibd_tx_softintr == 1) && 2168 ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx, 2169 NULL, NULL, ibd_tx_recycle, (caddr_t)state) != DDI_SUCCESS) { 2170 DPRINT(10, "ibd_attach : failed in ddi_add_softintr()"); 2171 goto attach_fail_ddi_add_tx_softintr; 2172 } 2173 2174 /* "attach" to IBTL */ 2175 if (ibt_attach(&ibd_clnt_modinfo, dip, state, 2176 &state->id_ibt_hdl) != IBT_SUCCESS) { 2177 DPRINT(10, "ibd_attach : failed in ibt_attach()"); 2178 goto attach_fail_ibt_attach; 2179 } 2180 2181 /* Finish initializing this driver */ 2182 if (ibd_drv_init(state) != DDI_SUCCESS) { 2183 DPRINT(10, "ibd_attach : failed in ibd_drv_init()\n"); 2184 goto attach_fail_drv_init; 2185 } 2186 2187 /* 2188 * Initialize pointers to device specific functions which will be 2189 * used by the generic layer. 2190 */ 2191 if ((macp = mac_alloc(MAC_VERSION)) == NULL) { 2192 DPRINT(10, "ibd_attach : failed in mac_alloc()"); 2193 goto attach_fail_drv_init; 2194 } 2195 2196 macp->m_type_ident = MAC_PLUGIN_IDENT_IB; 2197 macp->m_driver = state; 2198 macp->m_dip = state->id_dip; 2199 macp->m_src_addr = (uint8_t *)&state->id_macaddr; 2200 macp->m_callbacks = &ib_m_callbacks; 2201 macp->m_min_sdu = 0; 2202 macp->m_max_sdu = state->id_mtu - IPOIB_HDRSIZE; 2203 2204 /* 2205 * Register ourselves with the GLDv3 interface 2206 */ 2207 err = mac_register(macp, &state->id_mh); 2208 mac_free(macp); 2209 if (err != 0) { 2210 DPRINT(10, "ibd_attach : failed in mac_register()"); 2211 goto attach_fail_mac_register; 2212 } 2213 2214 /* 2215 * Setup the handler we will use for regular DLPI stuff. Its important 2216 * to setup the recv handler after registering with gldv3. 2217 */ 2218 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); 2219 if (ibt_enable_cq_notify(state->id_rcq_hdl, IBT_NEXT_COMPLETION) != 2220 IBT_SUCCESS) { 2221 DPRINT(10, "ibd_attach : failed in ibt_enable_cq_notify()\n"); 2222 goto attach_fail_setup_handler; 2223 } 2224 2225 /* 2226 * Setup the subnet notices handler after we initialize the a/mcaches 2227 * and start the async thread, both of which are required for the 2228 * trap handler to function properly. Enable the trap handler to 2229 * queue requests to the async thread after the mac_register, because 2230 * the async daemon invokes mac_tx_update(), which must be done after 2231 * mac_register(). 
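 *
 * To summarize the resulting order in this function: the a/mcaches and
 * the async thread are brought up inside ibd_drv_init(), then
 * mac_register() is done, then the receive CQ handler is armed, then
 * the subnet notices handler is registered and the trap handler is
 * enabled, and only then is the initial link state reported through
 * mac_link_update().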
2232 */ 2233 ibt_register_subnet_notices(state->id_ibt_hdl, 2234 ibd_snet_notices_handler, state); 2235 mutex_enter(&state->id_trap_lock); 2236 state->id_trap_stop = B_FALSE; 2237 mutex_exit(&state->id_trap_lock); 2238 2239 /* 2240 * Indicate link status to GLDv3 and higher layers. By default, 2241 * we assume we are in up state (which must have been true at 2242 * least at the time the broadcast mcg's were probed); if there 2243 * were any up/down transitions till the time we come here, the 2244 * async handler will have updated last known state, which we 2245 * use to tell GLDv3. The async handler will not send any 2246 * notifications to GLDv3 till we reach here in the initialization 2247 * sequence. 2248 */ 2249 mac_link_update(state->id_mh, state->id_link_state); 2250 2251 return (DDI_SUCCESS); 2252 2253 /* Attach failure points, cleanup */ 2254 attach_fail_setup_handler: 2255 (void) mac_unregister(state->id_mh); 2256 2257 attach_fail_mac_register: 2258 ibd_drv_fini(state); 2259 2260 attach_fail_drv_init: 2261 if (ibt_detach(state->id_ibt_hdl) != IBT_SUCCESS) 2262 ibd_print_warn(state, "failed to free IB resources"); 2263 2264 attach_fail_ibt_attach: 2265 if (ibd_tx_softintr == 1) 2266 ddi_remove_softintr(state->id_tx); 2267 2268 attach_fail_ddi_add_tx_softintr: 2269 if (ibd_rx_softintr == 1) 2270 ddi_remove_softintr(state->id_rx); 2271 2272 attach_fail_ddi_add_rx_softintr: 2273 ibd_state_fini(state); 2274 2275 attach_fail_state_init: 2276 ddi_soft_state_free(ibd_list, instance); 2277 2278 return (DDI_FAILURE); 2279 } 2280 2281 /* 2282 * Detach device from the IO framework. 2283 */ 2284 static int 2285 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2286 { 2287 ibd_state_t *state; 2288 int status; 2289 int instance; 2290 2291 switch (cmd) { 2292 case DDI_DETACH: 2293 break; 2294 case DDI_SUSPEND: 2295 default: 2296 return (DDI_FAILURE); 2297 } 2298 2299 instance = ddi_get_instance(dip); 2300 state = ddi_get_soft_state(ibd_list, instance); 2301 2302 /* 2303 * First, stop receive interrupts; this stops the 2304 * driver from handing up buffers to higher layers. 2305 * Wait for receive buffers to be returned; give up 2306 * after 5 seconds. 2307 */ 2308 ibt_set_cq_handler(state->id_rcq_hdl, 0, 0); 2309 status = 50; 2310 while (state->id_rx_list.dl_bufs_outstanding > 0) { 2311 delay(drv_usectohz(100000)); 2312 if (--status == 0) { 2313 DPRINT(2, "ibd_detach : reclaiming failed"); 2314 goto failed; 2315 } 2316 } 2317 2318 if (mac_unregister(state->id_mh) != DDI_SUCCESS) { 2319 DPRINT(10, "ibd_detach : failed in mac_unregister()"); 2320 goto failed; 2321 } 2322 2323 if (ibd_rx_softintr == 1) 2324 ddi_remove_softintr(state->id_rx); 2325 2326 if (ibd_tx_softintr == 1) 2327 ddi_remove_softintr(state->id_tx); 2328 2329 ibd_drv_fini(state); 2330 2331 if (ibt_detach(state->id_ibt_hdl) != IBT_SUCCESS) 2332 ibd_print_warn(state, "failed to free all IB resources at " 2333 "driver detach time"); 2334 2335 ibd_state_fini(state); 2336 ddi_soft_state_free(ibd_list, instance); 2337 return (DDI_SUCCESS); 2338 2339 failed: 2340 /* 2341 * Reap all the Tx/Rx completions that were posted since we 2342 * turned off the notification. Turn on notifications. There 2343 * is a race in that we do not reap completions that come in 2344 * after the poll and before notifications get turned on. That 2345 * is okay, the next rx/tx packet will trigger a completion 2346 * that will reap any missed completions. 
2347 */ 2348 ibd_poll_compq(state, state->id_rcq_hdl); 2349 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); 2350 return (DDI_FAILURE); 2351 } 2352 2353 /* 2354 * Pre ibt_attach() driver initialization 2355 */ 2356 static int 2357 ibd_state_init(ibd_state_t *state, dev_info_t *dip) 2358 { 2359 char buf[64]; 2360 2361 mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL); 2362 state->id_link_state = LINK_STATE_UNKNOWN; 2363 2364 mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL); 2365 cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL); 2366 state->id_trap_stop = B_TRUE; 2367 state->id_trap_inprog = 0; 2368 2369 mutex_init(&state->id_cq_poll_lock, NULL, MUTEX_DRIVER, NULL); 2370 state->id_dip = dip; 2371 2372 mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL); 2373 2374 state->id_tx_list.dl_head = NULL; 2375 state->id_tx_list.dl_tail = NULL; 2376 state->id_tx_list.dl_pending_sends = B_FALSE; 2377 state->id_tx_list.dl_cnt = 0; 2378 mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2379 mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL); 2380 state->id_tx_busy = 0; 2381 2382 state->id_rx_list.dl_head = NULL; 2383 state->id_rx_list.dl_tail = NULL; 2384 state->id_rx_list.dl_bufs_outstanding = 0; 2385 state->id_rx_list.dl_cnt = 0; 2386 mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2387 mutex_init(&state->id_rxpost_lock, NULL, MUTEX_DRIVER, NULL); 2388 2389 (void) sprintf(buf, "ibd_req%d", ddi_get_instance(dip)); 2390 state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t), 2391 0, NULL, NULL, NULL, NULL, NULL, 0); 2392 2393 #ifdef IBD_LOGGING 2394 mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL); 2395 #endif 2396 2397 return (DDI_SUCCESS); 2398 } 2399 2400 /* 2401 * Post ibt_detach() driver deconstruction 2402 */ 2403 static void 2404 ibd_state_fini(ibd_state_t *state) 2405 { 2406 kmem_cache_destroy(state->id_req_kmc); 2407 2408 mutex_destroy(&state->id_rxpost_lock); 2409 mutex_destroy(&state->id_rx_list.dl_mutex); 2410 2411 mutex_destroy(&state->id_txpost_lock); 2412 mutex_destroy(&state->id_tx_list.dl_mutex); 2413 2414 mutex_destroy(&state->id_sched_lock); 2415 mutex_destroy(&state->id_cq_poll_lock); 2416 2417 cv_destroy(&state->id_trap_cv); 2418 mutex_destroy(&state->id_trap_lock); 2419 mutex_destroy(&state->id_link_mutex); 2420 2421 #ifdef IBD_LOGGING 2422 mutex_destroy(&ibd_lbuf_lock); 2423 #endif 2424 } 2425 2426 /* 2427 * Fetch IBA parameters for the network device from IB nexus. 2428 */ 2429 static int 2430 ibd_get_portpkey(ibd_state_t *state, ib_guid_t *hca_guid) 2431 { 2432 /* 2433 * Get the IBA Pkey ... allow only fullmembers, per IPoIB spec. 2434 * Note that the default partition is also allowed. 2435 */ 2436 state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, state->id_dip, 2437 0, "port-pkey", IB_PKEY_INVALID_LIMITED); 2438 if (state->id_pkey <= IB_PKEY_INVALID_FULL) { 2439 DPRINT(10, "ibd_get_portpkey : ERROR: IBport device has wrong" 2440 "partition\n"); 2441 return (DDI_FAILURE); 2442 } 2443 2444 /* 2445 * ... the IBA port ... 2446 */ 2447 state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, state->id_dip, 2448 0, "port-number", 0); 2449 if (state->id_port == 0) { 2450 DPRINT(10, "ibd_get_portpkey : ERROR: invalid port number\n"); 2451 return (DDI_FAILURE); 2452 } 2453 2454 /* 2455 * ... and HCA GUID. 
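 */

/*
 * A note on the "fullmembers only" pkey check above: assuming the
 * usual ib_types.h encodings (IB_PKEY_INVALID_LIMITED 0x0000 and
 * IB_PKEY_INVALID_FULL 0x8000; values not taken from this file), the
 * test id_pkey <= IB_PKEY_INVALID_FULL accepts only pkeys in the range
 * 0x8001-0xffff, i.e. full-membership pkeys such as the default
 * 0xffff, and rejects limited-membership pkeys such as 0x7fff as well
 * as the two invalid values.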
2456 */ 2457 *hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip, 2458 0, "hca-guid", 0); 2459 if (*hca_guid == 0) { 2460 DPRINT(10, "ibd_get_portpkey : ERROR: IBport hca has wrong " 2461 "guid\n"); 2462 return (DDI_FAILURE); 2463 } 2464 2465 return (DDI_SUCCESS); 2466 } 2467 2468 /* 2469 * Fetch link speed from SA for snmp ifspeed reporting. 2470 */ 2471 static uint64_t 2472 ibd_get_portspeed(ibd_state_t *state) 2473 { 2474 int ret; 2475 ibt_path_info_t path; 2476 ibt_path_attr_t path_attr; 2477 uint8_t num_paths; 2478 uint64_t ifspeed; 2479 2480 /* 2481 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire 2482 * translates to 2 Gbps data rate. Thus, 1X single data rate is 2483 * 2000000000. Start with that as default. 2484 */ 2485 ifspeed = 2000000000; 2486 2487 bzero(&path_attr, sizeof (path_attr)); 2488 2489 /* 2490 * Get the port speed from Loopback path information. 2491 */ 2492 path_attr.pa_dgids = &state->id_sgid; 2493 path_attr.pa_num_dgids = 1; 2494 path_attr.pa_sgid = state->id_sgid; 2495 2496 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 2497 &path_attr, 1, &path, &num_paths) != IBT_SUCCESS) 2498 goto earlydone; 2499 2500 if (num_paths < 1) 2501 goto earlydone; 2502 2503 /* 2504 * In case SA does not return an expected value, report the default 2505 * speed as 1X. 2506 */ 2507 ret = 1; 2508 switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) { 2509 case IBT_SRATE_2: /* 1X SDR i.e 2.5 Gbps */ 2510 ret = 1; 2511 break; 2512 case IBT_SRATE_10: /* 4X SDR or 1X QDR i.e 10 Gbps */ 2513 ret = 4; 2514 break; 2515 case IBT_SRATE_30: /* 12X SDR i.e 30 Gbps */ 2516 ret = 12; 2517 break; 2518 case IBT_SRATE_5: /* 1X DDR i.e 5 Gbps */ 2519 ret = 2; 2520 break; 2521 case IBT_SRATE_20: /* 4X DDR or 8X SDR i.e 20 Gbps */ 2522 ret = 8; 2523 break; 2524 case IBT_SRATE_40: /* 8X DDR or 4X QDR i.e 40 Gbps */ 2525 ret = 16; 2526 break; 2527 case IBT_SRATE_60: /* 12X DDR i.e 60 Gbps */ 2528 ret = 24; 2529 break; 2530 case IBT_SRATE_80: /* 8X QDR i.e 80 Gbps */ 2531 ret = 32; 2532 break; 2533 case IBT_SRATE_120: /* 12X QDR i.e 120 Gbps */ 2534 ret = 48; 2535 break; 2536 } 2537 2538 ifspeed *= ret; 2539 2540 earlydone: 2541 return (ifspeed); 2542 } 2543 2544 /* 2545 * Search input mcg list (id_mc_full or id_mc_non) for an entry 2546 * representing the input mcg mgid. 2547 */ 2548 static ibd_mce_t * 2549 ibd_mcache_find(ib_gid_t mgid, struct list *mlist) 2550 { 2551 ibd_mce_t *ptr = list_head(mlist); 2552 2553 /* 2554 * Do plain linear search. 2555 */ 2556 while (ptr != NULL) { 2557 if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid, 2558 sizeof (ib_gid_t)) == 0) 2559 return (ptr); 2560 ptr = list_next(mlist, ptr); 2561 } 2562 return (NULL); 2563 } 2564 2565 /* 2566 * Execute IBA JOIN. 
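 */

/*
 * Worked example for ibd_get_portspeed() above: a 4X DDR link reports
 * IBT_SRATE_20, so the multiplier is 8 and the advertised ifspeed is
 * 8 * 2000000000 = 16 Gbps of data rate (20 Gbps signalling minus the
 * 8b10b overhead); a 1X SDR link reports IBT_SRATE_2 and stays at the
 * default 2 Gbps.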
2567 */ 2568 static ibt_status_t 2569 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce) 2570 { 2571 ibt_mcg_attr_t mcg_attr; 2572 2573 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 2574 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 2575 mcg_attr.mc_mgid = mgid; 2576 mcg_attr.mc_join_state = mce->mc_jstate; 2577 mcg_attr.mc_scope = state->id_scope; 2578 mcg_attr.mc_pkey = state->id_pkey; 2579 mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow; 2580 mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 2581 mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass; 2582 return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info, 2583 NULL, NULL)); 2584 } 2585 2586 /* 2587 * This code JOINs the port in the proper way (depending on the join 2588 * state) so that IBA fabric will forward mcg packets to/from the port. 2589 * It also attaches the QPN to the mcg so it can receive those mcg 2590 * packets. This code makes sure not to attach the mcg to the QP if 2591 * that has been previously done due to the mcg being joined with a 2592 * different join state, even though this is not required by SWG_0216, 2593 * refid 3610. 2594 */ 2595 static ibd_mce_t * 2596 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 2597 { 2598 ibt_status_t ibt_status; 2599 ibd_mce_t *mce, *tmce, *omce = NULL; 2600 boolean_t do_attach = B_TRUE; 2601 2602 DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n", 2603 jstate, mgid.gid_prefix, mgid.gid_guid); 2604 2605 /* 2606 * For enable_multicast Full member joins, we need to do some 2607 * extra work. If there is already an mce on the list that 2608 * indicates full membership, that means the membership has 2609 * not yet been dropped (since the disable_multicast was issued) 2610 * because there are pending Tx's to the mcg; in that case, just 2611 * mark the mce not to be reaped when the Tx completion queues 2612 * an async reap operation. 2613 * 2614 * If there is already an mce on the list indicating sendonly 2615 * membership, try to promote to full membership. Be careful 2616 * not to deallocate the old mce, since there might be an AH 2617 * pointing to it; instead, update the old mce with new data 2618 * that tracks the full membership. 2619 */ 2620 if ((jstate == IB_MC_JSTATE_FULL) && ((omce = 2621 IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) { 2622 if (omce->mc_jstate == IB_MC_JSTATE_FULL) { 2623 ASSERT(omce->mc_fullreap); 2624 omce->mc_fullreap = B_FALSE; 2625 return (omce); 2626 } else { 2627 ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON); 2628 } 2629 } 2630 2631 /* 2632 * Allocate the ibd_mce_t to track this JOIN. 2633 */ 2634 mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP); 2635 mce->mc_fullreap = B_FALSE; 2636 mce->mc_jstate = jstate; 2637 2638 if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) { 2639 DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d", 2640 ibt_status); 2641 kmem_free(mce, sizeof (ibd_mce_t)); 2642 return (NULL); 2643 } 2644 2645 /* 2646 * Is an IBA attach required? Not if the interface is already joined 2647 * to the mcg in a different appropriate join state. 
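 *
 * Summarizing the checks below:
 *	- joining NON while already a FULL member: no attach needed
 *	- joining FULL while already a NON member: no attach needed
 *	- joining SEND_ONLY_NON: never attach, since the attach is only
 *	  needed to receive mcg traffic and a sendonly member does not
 *	  receive any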
2648 */ 2649 if (jstate == IB_MC_JSTATE_NON) { 2650 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 2651 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 2652 do_attach = B_FALSE; 2653 } else if (jstate == IB_MC_JSTATE_FULL) { 2654 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 2655 do_attach = B_FALSE; 2656 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 2657 do_attach = B_FALSE; 2658 } 2659 2660 if (do_attach) { 2661 /* 2662 * Do the IBA attach. 2663 */ 2664 DPRINT(10, "ibd_join_group: ibt_attach_mcg \n"); 2665 if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl, 2666 &mce->mc_info)) != IBT_SUCCESS) { 2667 DPRINT(10, "ibd_join_group : failed qp attachment " 2668 "%d\n", ibt_status); 2669 /* 2670 * NOTE that we should probably preserve the join info 2671 * in the list and later try to leave again at detach 2672 * time. 2673 */ 2674 (void) ibt_leave_mcg(state->id_sgid, mgid, 2675 state->id_sgid, jstate); 2676 kmem_free(mce, sizeof (ibd_mce_t)); 2677 return (NULL); 2678 } 2679 } 2680 2681 /* 2682 * Insert the ibd_mce_t in the proper list. 2683 */ 2684 if (jstate == IB_MC_JSTATE_NON) { 2685 IBD_MCACHE_INSERT_NON(state, mce); 2686 } else { 2687 /* 2688 * Set up the mc_req fields used for reaping the 2689 * mcg in case of delayed tx completion (see 2690 * ibd_tx_cleanup()). Also done for sendonly join in 2691 * case we are promoted to fullmembership later and 2692 * keep using the same mce. 2693 */ 2694 mce->mc_req.rq_gid = mgid; 2695 mce->mc_req.rq_ptr = mce; 2696 /* 2697 * Check whether this is the case of trying to join 2698 * full member, and we were already joined send only. 2699 * We try to drop our SendOnly membership, but it is 2700 * possible that the mcg does not exist anymore (and 2701 * the subnet trap never reached us), so the leave 2702 * operation might fail. 2703 */ 2704 if (omce != NULL) { 2705 (void) ibt_leave_mcg(state->id_sgid, mgid, 2706 state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON); 2707 omce->mc_jstate = IB_MC_JSTATE_FULL; 2708 bcopy(&mce->mc_info, &omce->mc_info, 2709 sizeof (ibt_mcg_info_t)); 2710 kmem_free(mce, sizeof (ibd_mce_t)); 2711 return (omce); 2712 } 2713 mutex_enter(&state->id_mc_mutex); 2714 IBD_MCACHE_INSERT_FULL(state, mce); 2715 mutex_exit(&state->id_mc_mutex); 2716 } 2717 2718 return (mce); 2719 } 2720 2721 /* 2722 * Called during port up event handling to attempt to reacquire full 2723 * membership to an mcg. Stripped down version of ibd_join_group(). 2724 * Note that it is possible that the mcg might have gone away, and 2725 * gets recreated at this point. 2726 */ 2727 static void 2728 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce) 2729 { 2730 ib_gid_t mgid; 2731 2732 /* 2733 * If the mc_fullreap flag is set, or this join fails, a subsequent 2734 * reap/leave is going to try to leave the group. We could prevent 2735 * that by adding a boolean flag into ibd_mce_t, if required. 
2736 */ 2737 if (mce->mc_fullreap) 2738 return; 2739
2740 mgid = mce->mc_info.mc_adds_vect.av_dgid; 2741
2742 DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix, 2743 mgid.gid_guid); 2744
2745 if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS)
2746 ibd_print_warn(state, "Failure on port up to rejoin " 2747 "multicast gid %016llx:%016llx",
2748 (u_longlong_t)mgid.gid_prefix, 2749 (u_longlong_t)mgid.gid_guid); 2750 } 2751
2752 /* 2753 * This code handles delayed Tx completion cleanups for mcg's to which
2754 * disable_multicast has been issued, regular mcg related cleanups during
2755 * disable_multicast, disable_promiscuous and mcg traps, as well as
2756 * cleanups during driver detach time. Depending on the join state,
2757 * it deletes the mce from the appropriate list and issues the IBA
2758 * leave/detach; except in the disable_multicast case when the mce
2759 * is left on the active list for a subsequent Tx completion cleanup. 2760 */
2761 static void 2762 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid, 2763 uint8_t jstate) 2764 {
2765 ibd_mce_t *tmce; 2766 boolean_t do_detach = B_TRUE; 2767
2768 /* 2769 * Before detaching, we must check whether the other list
2770 * contains the mcg; if we detach blindly, the consumer
2771 * who set up the other list will also stop receiving 2772 * traffic. 2773 */
2774 if (jstate == IB_MC_JSTATE_FULL) {
2775 /* 2776 * The following check is only relevant while coming
2777 * from the Tx completion path in the reap case. 2778 */
2779 if (!mce->mc_fullreap) 2780 return;
2781 mutex_enter(&state->id_mc_mutex); 2782 IBD_MCACHE_PULLOUT_FULL(state, mce); 2783 mutex_exit(&state->id_mc_mutex);
2784 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 2785 do_detach = B_FALSE;
2786 } else if (jstate == IB_MC_JSTATE_NON) {
2787 IBD_MCACHE_PULLOUT_NON(state, mce);
2788 tmce = IBD_MCACHE_FIND_FULL(state, mgid);
2789 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 2790 do_detach = B_FALSE;
2791 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
2792 mutex_enter(&state->id_mc_mutex); 2793 IBD_MCACHE_PULLOUT_FULL(state, mce); 2794 mutex_exit(&state->id_mc_mutex);
2795 do_detach = B_FALSE; 2796 } 2797
2798 /* 2799 * If we are reacting to a mcg trap and leaving our sendonly or
2800 * non membership, the mcg is possibly already gone, so attempting
2801 * to leave might fail. On the other hand, we must try to leave
2802 * anyway, since this might be a trap from long ago, and we could
2803 * have potentially sendonly joined to a recent incarnation of
2804 * the mcg and are about to lose track of this information. 2805 */
2806 if (do_detach) {
2807 DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : "
2808 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
2809 (void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info); 2810 } 2811
2812 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate);
2813 kmem_free(mce, sizeof (ibd_mce_t)); 2814 } 2815
2816 /* 2817 * Async code executed due to multicast and promiscuous disable requests
2818 * and mcg trap handling; also executed during driver detach. Mostly, a
2819 * leave and detach is done; except for the fullmember case when Tx
2820 * requests are pending, whence arrangements are made for subsequent
2821 * cleanup on Tx completion.
2822 */ 2823 static void 2824 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 2825 { 2826 ipoib_mac_t mcmac; 2827 boolean_t recycled; 2828 ibd_mce_t *mce; 2829 2830 DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n", 2831 jstate, mgid.gid_prefix, mgid.gid_guid); 2832 2833 if (jstate == IB_MC_JSTATE_NON) { 2834 recycled = B_TRUE; 2835 mce = IBD_MCACHE_FIND_NON(state, mgid); 2836 /* 2837 * In case we are handling a mcg trap, we might not find 2838 * the mcg in the non list. 2839 */ 2840 if (mce == NULL) 2841 return; 2842 } else { 2843 mce = IBD_MCACHE_FIND_FULL(state, mgid); 2844 2845 /* 2846 * In case we are handling a mcg trap, make sure the trap 2847 * is not arriving late; if we have an mce that indicates 2848 * that we are already a fullmember, that would be a clear 2849 * indication that the trap arrived late (ie, is for a 2850 * previous incarnation of the mcg). 2851 */ 2852 if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) { 2853 if ((mce == NULL) || (mce->mc_jstate == 2854 IB_MC_JSTATE_FULL)) 2855 return; 2856 } else { 2857 ASSERT(jstate == IB_MC_JSTATE_FULL); 2858 2859 /* 2860 * If join group failed, mce will be NULL here. 2861 * This is because in GLDv3 driver, set multicast 2862 * will always return success. 2863 */ 2864 if (mce == NULL) 2865 return; 2866 2867 mce->mc_fullreap = B_TRUE; 2868 } 2869 2870 /* 2871 * If no pending Tx's remain that reference the AH 2872 * for the mcg, recycle it from active to free list. 2873 * Else in the IB_MC_JSTATE_FULL case, just mark the AH, 2874 * so the last completing Tx will cause an async reap 2875 * operation to be invoked, at which time we will drop our 2876 * membership to the mcg so that the pending Tx's complete 2877 * successfully. Refer to comments on "AH and MCE active 2878 * list manipulation" at top of this file. The lock protects 2879 * against Tx fast path and Tx cleanup code. 2880 */ 2881 mutex_enter(&state->id_ac_mutex); 2882 ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid); 2883 recycled = ibd_acache_recycle(state, &mcmac, (jstate == 2884 IB_MC_JSTATE_SEND_ONLY_NON)); 2885 mutex_exit(&state->id_ac_mutex); 2886 } 2887 2888 if (recycled) { 2889 DPRINT(2, "ibd_leave_group : leave_group reaping : " 2890 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 2891 ibd_async_reap_group(state, mce, mgid, jstate); 2892 } 2893 } 2894 2895 /* 2896 * Find the broadcast address as defined by IPoIB; implicitly 2897 * determines the IBA scope, mtu, tclass etc of the link the 2898 * interface is going to be a member of. 2899 */ 2900 static ibt_status_t 2901 ibd_find_bgroup(ibd_state_t *state) 2902 { 2903 ibt_mcg_attr_t mcg_attr; 2904 uint_t numg; 2905 uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL, 2906 IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL, 2907 IB_MC_SCOPE_GLOBAL }; 2908 int i, mcgmtu; 2909 boolean_t found = B_FALSE; 2910 2911 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 2912 mcg_attr.mc_pkey = state->id_pkey; 2913 state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK; 2914 2915 for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) { 2916 state->id_scope = mcg_attr.mc_scope = scopes[i]; 2917 2918 /* 2919 * Look for the IPoIB broadcast group. 
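 *
 * Per the IPoIB spec, this group's MGID has the form
 * ff1S:401b:PPPP::ffff:ffff, where S is the scope nibble being tried
 * in this loop and PPPP is the partition key; e.g. with the default
 * pkey 0xffff and link-local scope this is the well known
 * ff12:401b:ffff::ffff:ffff group.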
2920 */ 2921 state->id_mgid.gid_prefix = 2922 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 2923 ((uint64_t)state->id_scope << 48) | 2924 ((uint32_t)(state->id_pkey << 16))); 2925 mcg_attr.mc_mgid = state->id_mgid; 2926 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1, 2927 &state->id_mcinfo, &numg) == IBT_SUCCESS) { 2928 found = B_TRUE; 2929 break; 2930 } 2931 2932 } 2933 2934 if (!found) { 2935 ibd_print_warn(state, "IPoIB broadcast group absent"); 2936 return (IBT_FAILURE); 2937 } 2938 2939 /* 2940 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu. 2941 */ 2942 mcgmtu = (128 << state->id_mcinfo->mc_mtu); 2943 if (state->id_mtu < mcgmtu) { 2944 ibd_print_warn(state, "IPoIB broadcast group MTU %d " 2945 "greater than port's maximum MTU %d", mcgmtu, 2946 state->id_mtu); 2947 return (IBT_FAILURE); 2948 } 2949 state->id_mtu = mcgmtu; 2950 2951 return (IBT_SUCCESS); 2952 } 2953 2954 /* 2955 * Post ibt_attach() initialization. 2956 */ 2957 static int 2958 ibd_drv_init(ibd_state_t *state) 2959 { 2960 kthread_t *kht; 2961 ibt_ud_chan_alloc_args_t ud_alloc_attr; 2962 ibt_ud_chan_query_attr_t ud_chan_attr; 2963 ibt_hca_portinfo_t *port_infop; 2964 ibt_hca_attr_t hca_attrs; 2965 ibt_status_t ibt_status; 2966 ibt_cq_attr_t cq_attr; 2967 ib_guid_t hca_guid; 2968 uint32_t real_size; 2969 uint32_t *ptr; 2970 char pathname[OBP_MAXPATHLEN]; 2971 uint_t psize, port_infosz; 2972 2973 /* 2974 * Initialize id_port before ibt_open_hca because of 2975 * ordering requirements in port up/down handling. 2976 */ 2977 if (ibd_get_portpkey(state, &hca_guid) != DDI_SUCCESS) 2978 return (DDI_FAILURE); 2979 2980 if (ibt_open_hca(state->id_ibt_hdl, hca_guid, 2981 &state->id_hca_hdl) != IBT_SUCCESS) { 2982 DPRINT(10, "ibd_drv_init : failed in ibt_open_hca()\n"); 2983 return (DDI_FAILURE); 2984 } 2985 2986 mutex_enter(&state->id_link_mutex); 2987 ibt_status = ibt_query_hca_ports(state->id_hca_hdl, 2988 state->id_port, &port_infop, &psize, 2989 &port_infosz); 2990 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) { 2991 mutex_exit(&state->id_link_mutex); 2992 DPRINT(10, "ibd_drv_init : failed in ibt_query_port()\n"); 2993 (void) ibt_close_hca(state->id_hca_hdl); 2994 return (DDI_FAILURE); 2995 } 2996 2997 /* 2998 * If the link already went down by the time we get here, give up; 2999 * we can not even get the gid since that is not valid. We would 3000 * fail in ibd_find_bgroup() anyway. 3001 */ 3002 if (port_infop->p_linkstate != IBT_PORT_ACTIVE) { 3003 mutex_exit(&state->id_link_mutex); 3004 ibt_free_portinfo(port_infop, port_infosz); 3005 (void) ibt_close_hca(state->id_hca_hdl); 3006 ibd_print_warn(state, "Port is not active"); 3007 return (DDI_FAILURE); 3008 } 3009 3010 /* 3011 * This verifies the Pkey ibnexus handed us is still valid. 3012 * This is also the point from which the pkey table for the 3013 * port must hold the exact pkey value at the exact index 3014 * across port up/downs. 
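 */

/*
 * For reference, the (128 << mtu) computations above (mcgmtu) and
 * below (id_mtu) expand the standard IB MTU codes: 1 -> 256, 2 -> 512,
 * 3 -> 1024, 4 -> 2048, 5 -> 4096 bytes. A broadcast group created
 * with a 2K MTU on a 4K-capable port thus ends up with id_mtu = 2048.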
3015 */ 3016 if (ibt_pkey2index(state->id_hca_hdl, state->id_port, 3017 state->id_pkey, &state->id_pkix) != IBT_SUCCESS) { 3018 mutex_exit(&state->id_link_mutex); 3019 ibt_free_portinfo(port_infop, port_infosz); 3020 DPRINT(10, "ibd_drv_init : failed in ibt_pkey2index()\n"); 3021 (void) ibt_close_hca(state->id_hca_hdl); 3022 return (DDI_FAILURE); 3023 } 3024 3025 state->id_mtu = (128 << port_infop->p_mtu); 3026 state->id_sgid = *port_infop->p_sgid_tbl; 3027 state->id_link_state = LINK_STATE_UP; 3028 mutex_exit(&state->id_link_mutex); 3029 3030 ibt_free_portinfo(port_infop, port_infosz); 3031 3032 state->id_link_speed = ibd_get_portspeed(state); 3033 3034 /* 3035 * Read drv conf and record what the policy is on enabling LSO 3036 */ 3037 if (ddi_prop_get_int(DDI_DEV_T_ANY, state->id_dip, 3038 DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, IBD_PROP_LSO_POLICY, 1)) { 3039 state->id_lso_policy = B_TRUE; 3040 } else { 3041 state->id_lso_policy = B_FALSE; 3042 } 3043 3044 ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 3045 ASSERT(ibt_status == IBT_SUCCESS); 3046 3047 if (ibd_find_bgroup(state) != IBT_SUCCESS) { 3048 DPRINT(10, "ibd_drv_init : failed in ibd_find_bgroup\n"); 3049 goto drv_init_fail_find_bgroup; 3050 } 3051 3052 if (ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS, 3053 &state->id_pd_hdl) != IBT_SUCCESS) { 3054 DPRINT(10, "ibd_drv_init : failed in ibt_alloc_pd()\n"); 3055 goto drv_init_fail_alloc_pd; 3056 } 3057 3058 /* Initialize the parallel ARP cache and AHs */ 3059 if (ibd_acache_init(state) != DDI_SUCCESS) { 3060 DPRINT(10, "ibd_drv_init : failed in ibd_acache_init()\n"); 3061 goto drv_init_fail_acache; 3062 } 3063 3064 if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) { 3065 state->id_hca_res_lkey_capab = 1; 3066 state->id_res_lkey = hca_attrs.hca_reserved_lkey; 3067 } 3068 3069 /* 3070 * Check various tunable limits. 3071 */ 3072 3073 /* 3074 * See if extended sgl size information is provided by the hca; if yes, 3075 * use the correct one and set the maximum sqseg value. 3076 */ 3077 if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) 3078 state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz; 3079 else 3080 state->id_max_sqseg = hca_attrs.hca_max_sgl; 3081 3082 /* 3083 * Set LSO capability and maximum length 3084 */ 3085 if (hca_attrs.hca_max_lso_size > 0) { 3086 state->id_lso_capable = B_TRUE; 3087 if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN) 3088 state->id_lso_maxlen = IBD_LSO_MAXLEN; 3089 else 3090 state->id_lso_maxlen = hca_attrs.hca_max_lso_size; 3091 } else { 3092 state->id_lso_capable = B_FALSE; 3093 state->id_lso_maxlen = 0; 3094 } 3095 3096 3097 /* 3098 * Check #r/s wqes against max channel size. 3099 */ 3100 if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE) 3101 state->id_num_rwqe = hca_attrs.hca_max_chan_sz; 3102 else 3103 state->id_num_rwqe = IBD_NUM_RWQE; 3104 3105 if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE) 3106 state->id_num_swqe = hca_attrs.hca_max_chan_sz; 3107 else 3108 state->id_num_swqe = IBD_NUM_SWQE; 3109 3110 /* 3111 * Check the hardware checksum capability. Currently we only consider 3112 * full checksum offload. 3113 */ 3114 if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL) == IBT_HCA_CKSUM_FULL) { 3115 state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL; 3116 } 3117 3118 /* 3119 * Allocate Rx/combined CQ: 3120 * Theoretically, there is no point in having more than #rwqe 3121 * plus #swqe cqe's, except that the CQ will be signalled for 3122 * overflow when the last wqe completes, if none of the previous 3123 * cqe's have been polled. 
Thus, we allocate just a few less wqe's 3124 * to make sure such overflow does not occur. 3125 */ 3126 cq_attr.cq_sched = NULL; 3127 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 3128 3129 if (ibd_separate_cqs == 1) { 3130 /* 3131 * Allocate Receive CQ. 3132 */ 3133 if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) { 3134 cq_attr.cq_size = state->id_num_rwqe + 1; 3135 } else { 3136 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 3137 state->id_num_rwqe = cq_attr.cq_size - 1; 3138 } 3139 3140 if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 3141 &state->id_rcq_hdl, &real_size) != IBT_SUCCESS) { 3142 DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n"); 3143 goto drv_init_fail_alloc_rcq; 3144 } 3145 3146 if (ibt_modify_cq(state->id_rcq_hdl, 3147 ibd_rxcomp_count, ibd_rxcomp_usec, 0) != IBT_SUCCESS) { 3148 DPRINT(10, "ibd_drv_init: Receive CQ interrupt " 3149 "moderation failed\n"); 3150 } 3151 3152 state->id_rxwcs_size = state->id_num_rwqe + 1; 3153 state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * 3154 state->id_rxwcs_size, KM_SLEEP); 3155 3156 /* 3157 * Allocate Send CQ. 3158 */ 3159 if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) { 3160 cq_attr.cq_size = state->id_num_swqe + 1; 3161 } else { 3162 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 3163 state->id_num_swqe = cq_attr.cq_size - 1; 3164 } 3165 3166 if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 3167 &state->id_scq_hdl, &real_size) != IBT_SUCCESS) { 3168 DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n"); 3169 goto drv_init_fail_alloc_scq; 3170 } 3171 if (ibt_modify_cq(state->id_scq_hdl, 3172 10, 300, 0) != IBT_SUCCESS) { 3173 DPRINT(10, "ibd_drv_init: Send CQ interrupt " 3174 "moderation failed\n"); 3175 } 3176 3177 state->id_txwcs_size = state->id_num_swqe + 1; 3178 state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) * 3179 state->id_txwcs_size, KM_SLEEP); 3180 } else { 3181 /* 3182 * Allocate combined Send/Receive CQ. 3183 */ 3184 if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 3185 state->id_num_swqe + 1)) { 3186 cq_attr.cq_size = state->id_num_rwqe + 3187 state->id_num_swqe + 1; 3188 } else { 3189 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 3190 state->id_num_rwqe = ((cq_attr.cq_size - 1) * 3191 state->id_num_rwqe) / (state->id_num_rwqe + 3192 state->id_num_swqe); 3193 state->id_num_swqe = cq_attr.cq_size - 1 - 3194 state->id_num_rwqe; 3195 } 3196 3197 state->id_rxwcs_size = cq_attr.cq_size; 3198 state->id_txwcs_size = state->id_rxwcs_size; 3199 3200 if (ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 3201 &state->id_rcq_hdl, &real_size) != IBT_SUCCESS) { 3202 DPRINT(10, "ibd_drv_init : failed in ibt_alloc_cq()\n"); 3203 goto drv_init_fail_alloc_rcq; 3204 } 3205 state->id_scq_hdl = state->id_rcq_hdl; 3206 state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * 3207 state->id_rxwcs_size, KM_SLEEP); 3208 state->id_txwcs = state->id_rxwcs; 3209 } 3210 3211 /* 3212 * Print message in case we could not allocate as many wqe's 3213 * as was requested. Note that in the combined CQ case, we will 3214 * get the following message. 
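 *
 * For example, with a hypothetical hca_max_cq_sz of 4096 and the
 * default 4000/4000 settings, the combined-CQ sizing above yields
 * id_num_rwqe = (4095 * 4000) / 8000 = 2047 and id_num_swqe =
 * 4095 - 2047 = 2048, so both warnings below would be printed.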
3215 */ 3216 if (state->id_num_rwqe != IBD_NUM_RWQE) 3217 ibd_print_warn(state, "Setting #rwqe = %d instead of default " 3218 "%d", state->id_num_rwqe, IBD_NUM_RWQE); 3219 if (state->id_num_swqe != IBD_NUM_SWQE) 3220 ibd_print_warn(state, "Setting #swqe = %d instead of default " 3221 "%d", state->id_num_swqe, IBD_NUM_SWQE); 3222 3223 ud_alloc_attr.ud_flags = IBT_WR_SIGNALED; 3224 if (state->id_hca_res_lkey_capab) 3225 ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY; 3226 if (state->id_lso_policy && state->id_lso_capable) 3227 ud_alloc_attr.ud_flags |= IBT_USES_LSO; 3228 3229 ud_alloc_attr.ud_hca_port_num = state->id_port; 3230 ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg; 3231 ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG; 3232 ud_alloc_attr.ud_sizes.cs_sq = state->id_num_swqe; 3233 ud_alloc_attr.ud_sizes.cs_rq = state->id_num_rwqe; 3234 ud_alloc_attr.ud_qkey = state->id_mcinfo->mc_qkey; 3235 ud_alloc_attr.ud_scq = state->id_scq_hdl; 3236 ud_alloc_attr.ud_rcq = state->id_rcq_hdl; 3237 ud_alloc_attr.ud_pd = state->id_pd_hdl; 3238 ud_alloc_attr.ud_pkey_ix = state->id_pkix; 3239 ud_alloc_attr.ud_clone_chan = NULL; 3240 3241 if (ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS, 3242 &ud_alloc_attr, &state->id_chnl_hdl, NULL) != IBT_SUCCESS) { 3243 DPRINT(10, "ibd_drv_init : failed in ibt_alloc_ud_channel()" 3244 "\n"); 3245 goto drv_init_fail_alloc_chan; 3246 } 3247 3248 if (ibt_query_ud_channel(state->id_chnl_hdl, &ud_chan_attr) != 3249 DDI_SUCCESS) { 3250 DPRINT(10, "ibd_drv_init : failed in ibt_query_ud_channel()"); 3251 goto drv_init_fail_query_chan; 3252 } 3253 3254 state->id_qpnum = ud_chan_attr.ud_qpn; 3255 /* state->id_max_sqseg = ud_chan_attr.ud_chan_sizes.cs_sq_sgl; */ 3256 3257 if (state->id_max_sqseg > IBD_MAX_SQSEG) { 3258 state->id_max_sqseg = IBD_MAX_SQSEG; 3259 } else if (state->id_max_sqseg < IBD_MAX_SQSEG) { 3260 ibd_print_warn(state, "Set #sgl = %d instead of default %d", 3261 state->id_max_sqseg, IBD_MAX_SQSEG); 3262 } 3263 3264 /* Initialize the Transmit buffer list */ 3265 if (ibd_init_txlist(state) != DDI_SUCCESS) { 3266 DPRINT(10, "ibd_drv_init : failed in ibd_init_txlist()\n"); 3267 goto drv_init_fail_txlist_init; 3268 } 3269 3270 if ((ibd_separate_cqs == 1) && (ibd_txcomp_poll == 0)) { 3271 /* 3272 * Setup the handler we will use for regular DLPI stuff 3273 */ 3274 ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state); 3275 if (ibt_enable_cq_notify(state->id_scq_hdl, 3276 IBT_NEXT_COMPLETION) != IBT_SUCCESS) { 3277 DPRINT(10, "ibd_drv_init : failed in" 3278 " ibt_enable_cq_notify()\n"); 3279 goto drv_init_fail_cq_notify; 3280 } 3281 } 3282 3283 /* Initialize the Receive buffer list */ 3284 if (ibd_init_rxlist(state) != DDI_SUCCESS) { 3285 DPRINT(10, "ibd_drv_init : failed in ibd_init_rxlist()\n"); 3286 goto drv_init_fail_rxlist_init; 3287 } 3288 3289 /* Join to IPoIB broadcast group as required by IPoIB */ 3290 if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) { 3291 DPRINT(10, "ibd_drv_init : failed in ibd_join_group\n"); 3292 goto drv_init_fail_join_group; 3293 } 3294 3295 /* 3296 * Create the async thread; thread_create never fails. 3297 */ 3298 kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0, 3299 TS_RUN, minclsyspri); 3300 3301 state->id_async_thrid = kht->t_did; 3302 3303 /* 3304 * The local mac address is now known. Create the IPoIB 3305 * address. 
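 *
 * The 20 byte IPoIB hardware address laid down here is simply the
 * 4 byte QPN field followed by the 16 byte GID, i.e. in terms of
 * ipoib_mac_t: ipoib_qpn, then ipoib_gidpref[2], then
 * ipoib_gidsuff[2], all in network order (which is what the
 * ibd_h2n_mac() calls below produce).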
3306 */ 3307 ibd_h2n_mac(&state->id_macaddr, state->id_qpnum, 3308 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 3309 /* 3310 * Similarly, program in the broadcast mac address. 3311 */ 3312 ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK, state->id_mgid.gid_prefix, 3313 state->id_mgid.gid_guid); 3314 3315 ptr = (uint32_t *)&state->id_macaddr; 3316 DPRINT(10, "ibd_drv_init : INFO: MAC %08X:%08X:%08X:%08X:%08X\n", 3317 *ptr, *(ptr+1), *(ptr+2), *(ptr+3), *(ptr+4)); 3318 ptr = (uint32_t *)&state->id_bcaddr; 3319 DPRINT(10, "ibd_drv_init : INFO: BCMAC %08X:%08X:%08X:%08X:%08X\n", 3320 *ptr, *(ptr+1), *(ptr+2), *(ptr+3), *(ptr+4)); 3321 DPRINT(10, "ibd_drv_init : INFO: Pkey 0x%x, Mgid %016llx%016llx\n", 3322 state->id_pkey, state->id_mgid.gid_prefix, 3323 state->id_mgid.gid_guid); 3324 DPRINT(10, "ibd_drv_init : INFO: GID %016llx%016llx\n", 3325 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 3326 DPRINT(10, "ibd_drv_init : INFO: PKEY %04x\n", state->id_pkey); 3327 DPRINT(10, "ibd_drv_init : INFO: MTU %d\n", state->id_mtu); 3328 (void) ddi_pathname(state->id_dip, pathname); 3329 DPRINT(10, "ibd_drv_init : INFO: Pathname %s\n", pathname); 3330 3331 return (DDI_SUCCESS); 3332 3333 drv_init_fail_join_group: 3334 ibd_fini_rxlist(state); 3335 3336 drv_init_fail_rxlist_init: 3337 drv_init_fail_cq_notify: 3338 ibd_fini_txlist(state); 3339 3340 drv_init_fail_txlist_init: 3341 drv_init_fail_query_chan: 3342 if (ibt_free_channel(state->id_chnl_hdl) != IBT_SUCCESS) 3343 DPRINT(10, "ibd_drv_init : failed in ibt_free_channel()"); 3344 3345 drv_init_fail_alloc_chan: 3346 if ((ibd_separate_cqs == 1) && (ibt_free_cq(state->id_scq_hdl) != 3347 IBT_SUCCESS)) 3348 DPRINT(10, "ibd_drv_init : Tx ibt_free_cq()"); 3349 3350 if (ibd_separate_cqs == 1) 3351 kmem_free(state->id_txwcs, sizeof (ibt_wc_t) * 3352 state->id_txwcs_size); 3353 3354 drv_init_fail_alloc_scq: 3355 if (ibt_free_cq(state->id_rcq_hdl) != IBT_SUCCESS) 3356 DPRINT(10, "ibd_drv_init : Rx ibt_free_cq()"); 3357 kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * state->id_rxwcs_size); 3358 3359 drv_init_fail_alloc_rcq: 3360 ibd_acache_fini(state); 3361 drv_init_fail_acache: 3362 if (ibt_free_pd(state->id_hca_hdl, state->id_pd_hdl) != IBT_SUCCESS) 3363 DPRINT(10, "ibd_drv_init : failed in ibt_free_pd()"); 3364 3365 drv_init_fail_alloc_pd: 3366 ibt_free_mcg_info(state->id_mcinfo, 1); 3367 drv_init_fail_find_bgroup: 3368 if (ibt_close_hca(state->id_hca_hdl) != IBT_SUCCESS) 3369 DPRINT(10, "ibd_drv_init : failed in ibt_close_hca()"); 3370 3371 return (DDI_FAILURE); 3372 } 3373 3374 3375 static int 3376 ibd_alloc_tx_copybufs(ibd_state_t *state) 3377 { 3378 ibt_mr_attr_t mem_attr; 3379 3380 /* 3381 * Allocate one big chunk for all regular tx copy bufs 3382 */ 3383 state->id_tx_buf_sz = state->id_mtu; 3384 if (state->id_lso_policy && state->id_lso_capable && 3385 (IBD_TX_BUF_SZ > state->id_mtu)) { 3386 state->id_tx_buf_sz = IBD_TX_BUF_SZ; 3387 } 3388 3389 state->id_tx_bufs = kmem_zalloc(state->id_num_swqe * 3390 state->id_tx_buf_sz, KM_SLEEP); 3391 3392 /* 3393 * Do one memory registration on the entire txbuf area 3394 */ 3395 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs; 3396 mem_attr.mr_len = state->id_num_swqe * state->id_tx_buf_sz; 3397 mem_attr.mr_as = NULL; 3398 mem_attr.mr_flags = IBT_MR_SLEEP; 3399 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3400 &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) { 3401 DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed"); 3402 kmem_free(state->id_tx_bufs, 3403 
state->id_num_swqe * state->id_tx_buf_sz); 3404 state->id_tx_bufs = NULL; 3405 return (DDI_FAILURE); 3406 } 3407 3408 return (DDI_SUCCESS); 3409 } 3410 3411 static int 3412 ibd_alloc_tx_lsobufs(ibd_state_t *state) 3413 { 3414 ibt_mr_attr_t mem_attr; 3415 ibd_lsobuf_t *buflist; 3416 ibd_lsobuf_t *lbufp; 3417 ibd_lsobuf_t *tail; 3418 ibd_lsobkt_t *bktp; 3419 uint8_t *membase; 3420 uint8_t *memp; 3421 uint_t memsz; 3422 int i; 3423 3424 /* 3425 * Allocate the lso bucket 3426 */ 3427 bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP); 3428 3429 /* 3430 * Allocate the entire lso memory and register it 3431 */ 3432 memsz = IBD_NUM_LSO_BUFS * IBD_LSO_BUFSZ; 3433 membase = kmem_zalloc(memsz, KM_SLEEP); 3434 3435 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase; 3436 mem_attr.mr_len = memsz; 3437 mem_attr.mr_as = NULL; 3438 mem_attr.mr_flags = IBT_MR_SLEEP; 3439 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, 3440 &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) { 3441 DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed"); 3442 kmem_free(membase, memsz); 3443 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3444 return (DDI_FAILURE); 3445 } 3446 3447 /* 3448 * Now allocate the buflist. Note that the elements in the buflist and 3449 * the buffers in the lso memory have a permanent 1-1 relation, so we 3450 * can always derive the address of a buflist entry from the address of 3451 * an lso buffer. 3452 */ 3453 buflist = kmem_zalloc(IBD_NUM_LSO_BUFS * sizeof (ibd_lsobuf_t), 3454 KM_SLEEP); 3455 3456 /* 3457 * Set up the lso buf chain 3458 */ 3459 memp = membase; 3460 lbufp = buflist; 3461 for (i = 0; i < IBD_NUM_LSO_BUFS; i++) { 3462 lbufp->lb_isfree = 1; 3463 lbufp->lb_buf = memp; 3464 lbufp->lb_next = lbufp + 1; 3465 3466 tail = lbufp; 3467 3468 memp += IBD_LSO_BUFSZ; 3469 lbufp++; 3470 } 3471 tail->lb_next = NULL; 3472 3473 /* 3474 * Set up the LSO buffer information in ibd state 3475 */ 3476 bktp->bkt_bufl = buflist; 3477 bktp->bkt_free_head = buflist; 3478 bktp->bkt_mem = membase; 3479 bktp->bkt_nelem = IBD_NUM_LSO_BUFS; 3480 bktp->bkt_nfree = bktp->bkt_nelem; 3481 3482 state->id_lso = bktp; 3483 3484 return (DDI_SUCCESS); 3485 } 3486 3487 /* 3488 * Statically allocate Tx buffer list(s). 
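 */

/*
 * A small sketch of the 1-1 mapping noted above between the lso
 * buffers and their ibd_lsobuf_t entries; this is how
 * ibd_release_lsobufs() later finds the bookkeeping entry for a
 * returned buffer:
 *
 *	ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ;
 *	lbufp = bktp->bkt_bufl + ndx;
 *
 * where va is the ds_va address of the sgl entry being released.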
3489 */ 3490 static int 3491 ibd_init_txlist(ibd_state_t *state) 3492 { 3493 ibd_swqe_t *swqe; 3494 ibt_lkey_t lkey; 3495 int i; 3496 3497 if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS) 3498 return (DDI_FAILURE); 3499 3500 if (state->id_lso_policy && state->id_lso_capable) { 3501 if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS) 3502 state->id_lso_policy = B_FALSE; 3503 } 3504 3505 /* 3506 * Allocate and setup the swqe list 3507 */ 3508 lkey = state->id_tx_mr_desc.md_lkey; 3509 for (i = 0; i < state->id_num_swqe; i++) { 3510 if (ibd_alloc_swqe(state, &swqe, i, lkey) != DDI_SUCCESS) { 3511 DPRINT(10, "ibd_init_txlist: ibd_alloc_swqe failed"); 3512 ibd_fini_txlist(state); 3513 return (DDI_FAILURE); 3514 } 3515 3516 /* add to list */ 3517 state->id_tx_list.dl_cnt++; 3518 if (state->id_tx_list.dl_head == NULL) { 3519 swqe->swqe_prev = NULL; 3520 swqe->swqe_next = NULL; 3521 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); 3522 state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); 3523 } else { 3524 swqe->swqe_prev = state->id_tx_list.dl_tail; 3525 swqe->swqe_next = NULL; 3526 state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe); 3527 state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); 3528 } 3529 } 3530 3531 return (DDI_SUCCESS); 3532 } 3533 3534 static int 3535 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p, 3536 uint32_t *nds_p) 3537 { 3538 ibd_lsobkt_t *bktp; 3539 ibd_lsobuf_t *lbufp; 3540 ibd_lsobuf_t *nextp; 3541 ibt_lkey_t lso_lkey; 3542 uint_t frag_sz; 3543 uint_t num_needed; 3544 int i; 3545 3546 ASSERT(sgl_p != NULL); 3547 ASSERT(nds_p != NULL); 3548 ASSERT(req_sz != 0); 3549 3550 /* 3551 * Determine how many bufs we'd need for the size requested 3552 */ 3553 num_needed = req_sz / IBD_LSO_BUFSZ; 3554 if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0) 3555 num_needed++; 3556 3557 mutex_enter(&state->id_lso_lock); 3558 3559 /* 3560 * If we don't have enough lso bufs, return failure 3561 */ 3562 ASSERT(state->id_lso != NULL); 3563 bktp = state->id_lso; 3564 if (bktp->bkt_nfree < num_needed) { 3565 mutex_exit(&state->id_lso_lock); 3566 return (-1); 3567 } 3568 3569 /* 3570 * Pick the first 'num_needed' bufs from the free list 3571 */ 3572 lso_lkey = bktp->bkt_mr_desc.md_lkey; 3573 lbufp = bktp->bkt_free_head; 3574 for (i = 0; i < num_needed; i++) { 3575 ASSERT(lbufp->lb_isfree != 0); 3576 ASSERT(lbufp->lb_buf != NULL); 3577 3578 nextp = lbufp->lb_next; 3579 3580 sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf; 3581 sgl_p[i].ds_key = lso_lkey; 3582 sgl_p[i].ds_len = IBD_LSO_BUFSZ; 3583 3584 lbufp->lb_isfree = 0; 3585 lbufp->lb_next = NULL; 3586 3587 lbufp = nextp; 3588 } 3589 bktp->bkt_free_head = lbufp; 3590 3591 /* 3592 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need 3593 * to adjust the last sgl entry's length. Since we know we need atleast 3594 * one, the i-1 use below is ok. 
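 *
 * As an illustration (buffer size assumed for the example only; the
 * real IBD_LSO_BUFSZ is defined elsewhere): if IBD_LSO_BUFSZ were
 * 0x10000 and req_sz were 0x1a000, num_needed would end up as 2 with
 * frag_sz of 0xa000, so sgl_p[1].ds_len gets trimmed from 0x10000
 * down to 0xa000 below.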
3595 */ 3596 if (frag_sz) { 3597 sgl_p[i-1].ds_len = frag_sz; 3598 } 3599 3600 /* 3601 * Update nfree count and return 3602 */ 3603 bktp->bkt_nfree -= num_needed; 3604 3605 mutex_exit(&state->id_lso_lock); 3606 3607 *nds_p = num_needed; 3608 3609 return (0); 3610 } 3611 3612 static void 3613 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds) 3614 { 3615 ibd_lsobkt_t *bktp; 3616 ibd_lsobuf_t *lbufp; 3617 uint8_t *lso_mem_end; 3618 uint_t ndx; 3619 int i; 3620 3621 mutex_enter(&state->id_lso_lock); 3622 3623 bktp = state->id_lso; 3624 ASSERT(bktp != NULL); 3625 3626 lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ; 3627 for (i = 0; i < nds; i++) { 3628 uint8_t *va; 3629 3630 va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va; 3631 ASSERT(va >= bktp->bkt_mem && va < lso_mem_end); 3632 3633 /* 3634 * Figure out the buflist element this sgl buffer corresponds 3635 * to and put it back at the head 3636 */ 3637 ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ; 3638 lbufp = bktp->bkt_bufl + ndx; 3639 3640 ASSERT(lbufp->lb_isfree == 0); 3641 ASSERT(lbufp->lb_buf == va); 3642 3643 lbufp->lb_isfree = 1; 3644 lbufp->lb_next = bktp->bkt_free_head; 3645 bktp->bkt_free_head = lbufp; 3646 } 3647 bktp->bkt_nfree += nds; 3648 3649 mutex_exit(&state->id_lso_lock); 3650 } 3651 3652 static void 3653 ibd_free_tx_copybufs(ibd_state_t *state) 3654 { 3655 /* 3656 * Unregister txbuf mr 3657 */ 3658 if (ibt_deregister_mr(state->id_hca_hdl, 3659 state->id_tx_mr_hdl) != IBT_SUCCESS) { 3660 DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed"); 3661 } 3662 state->id_tx_mr_hdl = NULL; 3663 3664 /* 3665 * Free txbuf memory 3666 */ 3667 kmem_free(state->id_tx_bufs, state->id_num_swqe * state->id_tx_buf_sz); 3668 state->id_tx_bufs = NULL; 3669 } 3670 3671 static void 3672 ibd_free_tx_lsobufs(ibd_state_t *state) 3673 { 3674 ibd_lsobkt_t *bktp; 3675 3676 mutex_enter(&state->id_lso_lock); 3677 3678 if ((bktp = state->id_lso) == NULL) { 3679 mutex_exit(&state->id_lso_lock); 3680 return; 3681 } 3682 3683 /* 3684 * First, free the buflist 3685 */ 3686 ASSERT(bktp->bkt_bufl != NULL); 3687 kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t)); 3688 3689 /* 3690 * Unregister the LSO memory and free it 3691 */ 3692 ASSERT(bktp->bkt_mr_hdl != NULL); 3693 if (ibt_deregister_mr(state->id_hca_hdl, 3694 bktp->bkt_mr_hdl) != IBT_SUCCESS) { 3695 DPRINT(10, 3696 "ibd_free_lsobufs: ibt_deregister_mr failed"); 3697 } 3698 ASSERT(bktp->bkt_mem); 3699 kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ); 3700 3701 /* 3702 * Finally free the bucket 3703 */ 3704 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3705 state->id_lso = NULL; 3706 3707 mutex_exit(&state->id_lso_lock); 3708 } 3709 3710 /* 3711 * Free the statically allocated Tx buffer list. 3712 */ 3713 static void 3714 ibd_fini_txlist(ibd_state_t *state) 3715 { 3716 ibd_swqe_t *node; 3717 3718 /* 3719 * Free the allocated swqes 3720 */ 3721 mutex_enter(&state->id_tx_list.dl_mutex); 3722 while (state->id_tx_list.dl_head != NULL) { 3723 node = WQE_TO_SWQE(state->id_tx_list.dl_head); 3724 state->id_tx_list.dl_head = node->swqe_next; 3725 state->id_tx_list.dl_cnt--; 3726 ASSERT(state->id_tx_list.dl_cnt >= 0); 3727 ibd_free_swqe(state, node); 3728 } 3729 mutex_exit(&state->id_tx_list.dl_mutex); 3730 3731 ibd_free_tx_lsobufs(state); 3732 ibd_free_tx_copybufs(state); 3733 } 3734 3735 /* 3736 * Allocate a single send wqe and register it so it is almost 3737 * ready to be posted to the hardware. 
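 *
 * Note that no per-wqe registration actually happens here: the copybuf
 * sgl simply points at this wqe's slot (ndx * id_tx_buf_sz) within the
 * id_tx_bufs area that was registered once in ibd_alloc_tx_copybufs(),
 * using the lkey handed in by ibd_init_txlist().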
3738 */ 3739 static int 3740 ibd_alloc_swqe(ibd_state_t *state, ibd_swqe_t **wqe, int ndx, ibt_lkey_t lkey) 3741 { 3742 ibd_swqe_t *swqe; 3743 3744 swqe = kmem_zalloc(sizeof (ibd_swqe_t), KM_SLEEP); 3745 *wqe = swqe; 3746 3747 swqe->swqe_type = IBD_WQE_SEND; 3748 swqe->swqe_next = NULL; 3749 swqe->swqe_prev = NULL; 3750 swqe->swqe_im_mblk = NULL; 3751 3752 swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t) 3753 (state->id_tx_bufs + ndx * state->id_tx_buf_sz); 3754 swqe->swqe_copybuf.ic_sgl.ds_key = lkey; 3755 swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */ 3756 3757 swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe; 3758 swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL; 3759 swqe->w_swr.wr_trans = IBT_UD_SRV; 3760 3761 /* These are set in send */ 3762 swqe->w_swr.wr_nds = 0; 3763 swqe->w_swr.wr_sgl = NULL; 3764 swqe->w_swr.wr_opcode = IBT_WRC_SEND; 3765 3766 return (DDI_SUCCESS); 3767 } 3768 3769 /* 3770 * Free an allocated send wqe. 3771 */ 3772 /*ARGSUSED*/ 3773 static void 3774 ibd_free_swqe(ibd_state_t *state, ibd_swqe_t *swqe) 3775 { 3776 kmem_free(swqe, sizeof (ibd_swqe_t)); 3777 } 3778 3779 /* 3780 * Post a rwqe to the hardware and add it to the Rx list. The 3781 * "recycle" parameter indicates whether an old rwqe is being 3782 * recycled, or this is a new one. 3783 */ 3784 static int 3785 ibd_post_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe, boolean_t recycle) 3786 { 3787 ibt_status_t ibt_status; 3788 3789 if (recycle == B_FALSE) { 3790 mutex_enter(&state->id_rx_list.dl_mutex); 3791 if (state->id_rx_list.dl_head == NULL) { 3792 rwqe->rwqe_prev = NULL; 3793 rwqe->rwqe_next = NULL; 3794 state->id_rx_list.dl_head = RWQE_TO_WQE(rwqe); 3795 state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe); 3796 } else { 3797 rwqe->rwqe_prev = state->id_rx_list.dl_tail; 3798 rwqe->rwqe_next = NULL; 3799 state->id_rx_list.dl_tail->w_next = RWQE_TO_WQE(rwqe); 3800 state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe); 3801 } 3802 mutex_exit(&state->id_rx_list.dl_mutex); 3803 } 3804 3805 mutex_enter(&state->id_rxpost_lock); 3806 if (state->id_rx_busy) { 3807 rwqe->w_post_link = NULL; 3808 if (state->id_rx_head) 3809 *(state->id_rx_tailp) = (ibd_wqe_t *)rwqe; 3810 else 3811 state->id_rx_head = rwqe; 3812 state->id_rx_tailp = &(rwqe->w_post_link); 3813 } else { 3814 state->id_rx_busy = 1; 3815 do { 3816 mutex_exit(&state->id_rxpost_lock); 3817 3818 /* 3819 * Here we should add dl_cnt before post recv, because 3820 * we would have to make sure dl_cnt is updated before 3821 * the corresponding ibd_process_rx() is called. 3822 */ 3823 atomic_add_32(&state->id_rx_list.dl_cnt, 1); 3824 3825 ibt_status = ibt_post_recv(state->id_chnl_hdl, 3826 &rwqe->w_rwr, 1, NULL); 3827 if (ibt_status != IBT_SUCCESS) { 3828 (void) atomic_add_32_nv( 3829 &state->id_rx_list.dl_cnt, -1); 3830 ibd_print_warn(state, "ibd_post_rwqe: " 3831 "posting failed, ret=%d", ibt_status); 3832 return (DDI_FAILURE); 3833 } 3834 3835 mutex_enter(&state->id_rxpost_lock); 3836 rwqe = state->id_rx_head; 3837 if (rwqe) { 3838 state->id_rx_head = 3839 (ibd_rwqe_t *)(rwqe->w_post_link); 3840 } 3841 } while (rwqe); 3842 state->id_rx_busy = 0; 3843 } 3844 mutex_exit(&state->id_rxpost_lock); 3845 3846 return (DDI_SUCCESS); 3847 } 3848 3849 /* 3850 * Allocate the statically allocated Rx buffer list. 
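 *
 * Each of the id_num_rwqe receive wqes is allocated via ibd_alloc_rwqe()
 * and immediately handed to the hardware with ibd_post_rwqe(..., B_FALSE),
 * which also links it onto id_rx_list; any failure unwinds the whole
 * list through ibd_fini_rxlist().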
3851 */ 3852 static int 3853 ibd_init_rxlist(ibd_state_t *state) 3854 { 3855 ibd_rwqe_t *rwqe; 3856 int i; 3857 3858 for (i = 0; i < state->id_num_rwqe; i++) { 3859 if (ibd_alloc_rwqe(state, &rwqe) != DDI_SUCCESS) { 3860 ibd_fini_rxlist(state); 3861 return (DDI_FAILURE); 3862 } 3863 3864 if (ibd_post_rwqe(state, rwqe, B_FALSE) == DDI_FAILURE) { 3865 ibd_free_rwqe(state, rwqe); 3866 ibd_fini_rxlist(state); 3867 return (DDI_FAILURE); 3868 } 3869 } 3870 3871 return (DDI_SUCCESS); 3872 } 3873 3874 /* 3875 * Free the statically allocated Rx buffer list. 3876 * 3877 */ 3878 static void 3879 ibd_fini_rxlist(ibd_state_t *state) 3880 { 3881 ibd_rwqe_t *node; 3882 3883 mutex_enter(&state->id_rx_list.dl_mutex); 3884 while (state->id_rx_list.dl_head != NULL) { 3885 node = WQE_TO_RWQE(state->id_rx_list.dl_head); 3886 state->id_rx_list.dl_head = state->id_rx_list.dl_head->w_next; 3887 state->id_rx_list.dl_cnt--; 3888 ASSERT(state->id_rx_list.dl_cnt >= 0); 3889 3890 ibd_free_rwqe(state, node); 3891 } 3892 mutex_exit(&state->id_rx_list.dl_mutex); 3893 } 3894 3895 /* 3896 * Allocate a single recv wqe and register it so it is almost 3897 * ready to be posted to the hardware. 3898 */ 3899 static int 3900 ibd_alloc_rwqe(ibd_state_t *state, ibd_rwqe_t **wqe) 3901 { 3902 ibt_mr_attr_t mem_attr; 3903 ibd_rwqe_t *rwqe; 3904 3905 if ((rwqe = kmem_zalloc(sizeof (ibd_rwqe_t), KM_NOSLEEP)) == NULL) { 3906 DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc"); 3907 return (DDI_FAILURE); 3908 } 3909 *wqe = rwqe; 3910 rwqe->rwqe_type = IBD_WQE_RECV; 3911 rwqe->w_state = state; 3912 rwqe->rwqe_next = NULL; 3913 rwqe->rwqe_prev = NULL; 3914 rwqe->w_freeing_wqe = B_FALSE; 3915 rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb; 3916 rwqe->w_freemsg_cb.free_arg = (char *)rwqe; 3917 3918 rwqe->rwqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu + 3919 IPOIB_GRH_SIZE, KM_NOSLEEP); 3920 if (rwqe->rwqe_copybuf.ic_bufaddr == NULL) { 3921 DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc"); 3922 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3923 return (DDI_FAILURE); 3924 } 3925 3926 if ((rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, 3927 state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb)) == 3928 NULL) { 3929 DPRINT(10, "ibd_alloc_rwqe : failed in desballoc()"); 3930 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3931 state->id_mtu + IPOIB_GRH_SIZE); 3932 rwqe->rwqe_copybuf.ic_bufaddr = NULL; 3933 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3934 return (DDI_FAILURE); 3935 } 3936 3937 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr; 3938 mem_attr.mr_len = state->id_mtu + IPOIB_GRH_SIZE; 3939 mem_attr.mr_as = NULL; 3940 mem_attr.mr_flags = IBT_MR_NOSLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3941 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3942 &rwqe->rwqe_copybuf.ic_mr_hdl, &rwqe->rwqe_copybuf.ic_mr_desc) != 3943 IBT_SUCCESS) { 3944 DPRINT(10, "ibd_alloc_rwqe : failed in ibt_register_mem()"); 3945 rwqe->w_freeing_wqe = B_TRUE; 3946 freemsg(rwqe->rwqe_im_mblk); 3947 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3948 state->id_mtu + IPOIB_GRH_SIZE); 3949 rwqe->rwqe_copybuf.ic_bufaddr = NULL; 3950 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3951 return (DDI_FAILURE); 3952 } 3953 3954 rwqe->rwqe_copybuf.ic_sgl.ds_va = 3955 (ib_vaddr_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr; 3956 rwqe->rwqe_copybuf.ic_sgl.ds_key = 3957 rwqe->rwqe_copybuf.ic_mr_desc.md_lkey; 3958 rwqe->rwqe_copybuf.ic_sgl.ds_len = state->id_mtu + IPOIB_GRH_SIZE; 3959 rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe; 3960 rwqe->w_rwr.wr_nds = 1; 3961 
rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl; 3962 3963 return (DDI_SUCCESS); 3964 } 3965 3966 /* 3967 * Free an allocated recv wqe. 3968 */ 3969 static void 3970 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) 3971 { 3972 if (ibt_deregister_mr(state->id_hca_hdl, 3973 rwqe->rwqe_copybuf.ic_mr_hdl) != IBT_SUCCESS) { 3974 DPRINT(10, "ibd_free_rwqe: failed in ibt_deregister_mr()"); 3975 return; 3976 } 3977 3978 /* 3979 * Indicate to the callback function that this rwqe/mblk 3980 * should not be recycled. The freemsg() will invoke 3981 * ibd_freemsg_cb(). 3982 */ 3983 if (rwqe->rwqe_im_mblk != NULL) { 3984 rwqe->w_freeing_wqe = B_TRUE; 3985 freemsg(rwqe->rwqe_im_mblk); 3986 } 3987 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3988 state->id_mtu + IPOIB_GRH_SIZE); 3989 rwqe->rwqe_copybuf.ic_bufaddr = NULL; 3990 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3991 } 3992 3993 /* 3994 * Delete the rwqe being freed from the rx list. 3995 */ 3996 static void 3997 ibd_delete_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) 3998 { 3999 mutex_enter(&state->id_rx_list.dl_mutex); 4000 if (state->id_rx_list.dl_head == RWQE_TO_WQE(rwqe)) 4001 state->id_rx_list.dl_head = rwqe->rwqe_next; 4002 else 4003 rwqe->rwqe_prev->w_next = rwqe->rwqe_next; 4004 if (state->id_rx_list.dl_tail == RWQE_TO_WQE(rwqe)) 4005 state->id_rx_list.dl_tail = rwqe->rwqe_prev; 4006 else 4007 rwqe->rwqe_next->w_prev = rwqe->rwqe_prev; 4008 mutex_exit(&state->id_rx_list.dl_mutex); 4009 } 4010 4011 /* 4012 * Pre ibt_detach() deconstruction. 4013 */ 4014 static void 4015 ibd_drv_fini(ibd_state_t *state) 4016 { 4017 ib_gid_t mgid; 4018 ibd_mce_t *mce; 4019 ibt_status_t status; 4020 uint8_t jstate; 4021 4022 /* 4023 * Desubscribe from trap notices; we will be tearing down 4024 * the mcg lists soon. Make sure the trap handler does nothing 4025 * even if it is invoked (ie till we invoke ibt_detach()). 4026 */ 4027 ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL); 4028 mutex_enter(&state->id_trap_lock); 4029 state->id_trap_stop = B_TRUE; 4030 while (state->id_trap_inprog > 0) 4031 cv_wait(&state->id_trap_cv, &state->id_trap_lock); 4032 mutex_exit(&state->id_trap_lock); 4033 4034 /* 4035 * Flushing the channel ensures that all pending WQE's 4036 * are marked with flush_error and handed to the CQ. It 4037 * does not guarantee the invocation of the CQ handler. 4038 * This call is guaranteed to return successfully for UD QPNs. 4039 */ 4040 status = ibt_flush_channel(state->id_chnl_hdl); 4041 ASSERT(status == IBT_SUCCESS); 4042 4043 /* 4044 * We possibly need a loop here to wait for all the Tx 4045 * callbacks to happen. The Tx handlers will retrieve 4046 * held resources like AH ac_ref count, registered memory 4047 * and possibly IBD_ASYNC_REAP requests. Rx interrupts were already 4048 * turned off (in ibd_detach()); turn off Tx interrupts and 4049 * poll. By the time the polling returns an empty indicator, 4050 * we are sure we have seen all pending Tx callbacks. Note 4051 * that after the ibt_set_cq_handler() returns, the old handler 4052 * is guaranteed not to be invoked anymore. 4053 */ 4054 if (ibd_separate_cqs == 1) 4055 ibt_set_cq_handler(state->id_scq_hdl, 0, 0); 4056 ibd_poll_compq(state, state->id_scq_hdl); 4057 4058 /* 4059 * No more async requests will be posted since the device has been 4060 * unregistered; completion handlers have been turned off, so Tx 4061 * handler will not cause any more IBD_ASYNC_REAP requests. Queue a 4062 * request for the async thread to exit, which will be serviced 4063 * after any pending ones. 
This can take a while, specially if the 4064 * SM is unreachable, since IBMF will slowly timeout each SM request 4065 * issued by the async thread. Reap the thread before continuing on, 4066 * we do not want it to be lingering in modunloaded code. 4067 */ 4068 ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT); 4069 thread_join(state->id_async_thrid); 4070 4071 /* 4072 * We can not be in promiscuous mode anymore, upper layers 4073 * would have made a request to disable it (if ever set previously) 4074 * before the detach is allowed to progress to this point; and the 4075 * aysnc thread would have processed that request by now. Thus the 4076 * nonmember list is guaranteed empty at this point. 4077 */ 4078 ASSERT(state->id_prom_op != IBD_OP_COMPLETED); 4079 4080 /* 4081 * Drop all residual full/non membership. This includes full 4082 * membership to the broadcast group, and any nonmembership 4083 * acquired during transmits. We do this after the Tx completion 4084 * handlers are done, since those might result in some late 4085 * leaves; this also eliminates a potential race with that 4086 * path wrt the mc full list insert/delete. Trap handling 4087 * has also been suppressed at this point. Thus, no locks 4088 * are required while traversing the mc full list. 4089 */ 4090 DPRINT(2, "ibd_drv_fini : clear full cache entries"); 4091 mce = list_head(&state->id_mc_full); 4092 while (mce != NULL) { 4093 mgid = mce->mc_info.mc_adds_vect.av_dgid; 4094 jstate = mce->mc_jstate; 4095 mce = list_next(&state->id_mc_full, mce); 4096 ibd_leave_group(state, mgid, jstate); 4097 } 4098 4099 ibt_free_mcg_info(state->id_mcinfo, 1); 4100 4101 /* 4102 * Kill the channel now; guaranteed to return successfully 4103 * for UD QPNs. 4104 */ 4105 status = ibt_free_channel(state->id_chnl_hdl); 4106 ASSERT(status == IBT_SUCCESS); 4107 4108 /* 4109 * Kill the CQ; all completion handlers are guaranteed to 4110 * have terminated by the time this returns. Since we killed 4111 * the QPN above, we can not receive the IBT_CQ_BUSY error. 4112 */ 4113 status = ibt_free_cq(state->id_rcq_hdl); 4114 ASSERT(status == IBT_SUCCESS); 4115 kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * state->id_rxwcs_size); 4116 4117 if (ibd_separate_cqs == 1) { 4118 status = ibt_free_cq(state->id_scq_hdl); 4119 ASSERT(status == IBT_SUCCESS); 4120 kmem_free(state->id_txwcs, sizeof (ibt_wc_t) * 4121 state->id_txwcs_size); 4122 } 4123 4124 /* 4125 * Since these following will act on the Rx/Tx list, which 4126 * is also looked at by the Rx/Tx handlers, keep them around 4127 * till all handlers are guaranteed to have completed. 4128 */ 4129 ibd_fini_rxlist(state); 4130 ibd_fini_txlist(state); 4131 4132 /* 4133 * Clean up the active AH hash list. 4134 */ 4135 mod_hash_destroy_hash(state->id_ah_active_hash); 4136 4137 /* 4138 * Free parallel ARP cache and AHs; we are sure all of these 4139 * resources have been released by the Tx completion handler. 4140 */ 4141 ibd_acache_fini(state); 4142 4143 /* 4144 * We freed the QPN, all the MRs and AHs. This step should not 4145 * fail; print a warning message if it does fail, due to a bug 4146 * in the driver. 4147 */ 4148 if (ibt_free_pd(state->id_hca_hdl, state->id_pd_hdl) != IBT_SUCCESS) 4149 ibd_print_warn(state, "failed to free protection domain"); 4150 4151 if (ibt_close_hca(state->id_hca_hdl) != IBT_SUCCESS) 4152 ibd_print_warn(state, "failed to close HCA device"); 4153 } 4154 4155 /* 4156 * IBA Rx/Tx completion queue handler. Guaranteed to be single 4157 * threaded and nonreentrant for this CQ. 
When using combined CQ, 4158 * this handles Tx and Rx completions. With separate CQs, this handles 4159 * only Rx completions. 4160 */ 4161 /* ARGSUSED */ 4162 static void 4163 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 4164 { 4165 ibd_state_t *state = (ibd_state_t *)arg; 4166 4167 atomic_add_64(&state->id_num_intrs, 1); 4168 4169 if (ibd_rx_softintr == 1) 4170 ddi_trigger_softintr(state->id_rx); 4171 else 4172 (void) ibd_intr((char *)state); 4173 } 4174 4175 /* 4176 * Separate CQ handler for Tx completions, when the Tx CQ is in 4177 * interrupt driven mode. 4178 */ 4179 /* ARGSUSED */ 4180 static void 4181 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 4182 { 4183 ibd_state_t *state = (ibd_state_t *)arg; 4184 4185 atomic_add_64(&state->id_num_intrs, 1); 4186 4187 if (ibd_tx_softintr == 1) 4188 ddi_trigger_softintr(state->id_tx); 4189 else 4190 (void) ibd_tx_recycle((char *)state); 4191 } 4192 4193 /* 4194 * Multicast group create/delete trap handler. These will be delivered 4195 * on a kernel thread (handling can thus block) and can be invoked 4196 * concurrently. The handler can be invoked anytime after it is 4197 * registered and before ibt_detach(). 4198 */ 4199 /* ARGSUSED */ 4200 static void 4201 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code, 4202 ibt_subnet_event_t *event) 4203 { 4204 ibd_state_t *state = (ibd_state_t *)arg; 4205 ibd_req_t *req; 4206 4207 /* 4208 * The trap handler will get invoked once for every event on 4209 * every port. The input "gid" is the GID0 of the port the 4210 * trap came in on; we just need to act on traps that came 4211 * to our port, meaning the port on which the ipoib interface 4212 * resides. Since ipoib uses GID0 of the port, we just match 4213 * the gids to check whether we need to handle the trap. 4214 */ 4215 if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0) 4216 return; 4217 4218 DPRINT(10, "ibd_notices_handler : %d\n", code); 4219 4220 switch (code) { 4221 case IBT_SM_EVENT_UNAVAILABLE: 4222 /* 4223 * If we are in promiscuous mode or have 4224 * sendnonmembers, we need to print a warning 4225 * message right now. Else, just store the 4226 * information, print when we enter promiscuous 4227 * mode or attempt nonmember send. We might 4228 * also want to stop caching sendnonmember. 4229 */ 4230 ibd_print_warn(state, "IBA multicast support " 4231 "degraded due to unavailability of multicast " 4232 "traps"); 4233 break; 4234 case IBT_SM_EVENT_AVAILABLE: 4235 /* 4236 * If we printed a warning message above or 4237 * while trying to nonmember send or get into 4238 * promiscuous mode, print an okay message. 4239 */ 4240 ibd_print_warn(state, "IBA multicast support " 4241 "restored due to availability of multicast " 4242 "traps"); 4243 break; 4244 case IBT_SM_EVENT_MCG_CREATED: 4245 case IBT_SM_EVENT_MCG_DELETED: 4246 /* 4247 * Common processing of creation/deletion traps. 4248 * First check if the instance is being 4249 * [de]initialized; back off then, without doing 4250 * anything more, since we are not sure if the 4251 * async thread is around, or whether we might 4252 * be racing with the detach code in ibd_drv_fini() 4253 * that scans the mcg list.
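 *
 * If it is safe to proceed, a request carrying the notice GID and
 * the event code is queued for ibd_async_trap() below, which drops
 * any stale SendOnlyNonMember/NonMember state for that mcg and,
 * when in promiscuous mode, attempts a fresh NonMember join.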
4254 */ 4255 if (!ibd_async_safe(state)) 4256 return; 4257 4258 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 4259 req->rq_gid = event->sm_notice_gid; 4260 req->rq_ptr = (void *)code; 4261 ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP); 4262 break; 4263 } 4264 } 4265 4266 static void 4267 ibd_async_trap(ibd_state_t *state, ibd_req_t *req) 4268 { 4269 ib_gid_t mgid = req->rq_gid; 4270 ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr; 4271 4272 DPRINT(10, "ibd_async_trap : %d\n", code); 4273 4274 /* 4275 * Atomically search the nonmember and sendonlymember lists and 4276 * delete. 4277 */ 4278 ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON); 4279 4280 if (state->id_prom_op == IBD_OP_COMPLETED) { 4281 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 4282 4283 /* 4284 * If in promiscuous mode, try to join/attach to the new 4285 * mcg. Given the unreliable out-of-order mode of trap 4286 * delivery, we can never be sure whether it is a problem 4287 * if the join fails. Thus, we warn the admin of a failure 4288 * if this was a creation trap. Note that the trap might 4289 * actually be reporting a long past event, and the mcg 4290 * might already have been deleted, thus we might be warning 4291 * in vain. 4292 */ 4293 if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == 4294 NULL) && (code == IBT_SM_EVENT_MCG_CREATED)) 4295 ibd_print_warn(state, "IBA promiscuous mode missed " 4296 "new multicast gid %016llx:%016llx", 4297 (u_longlong_t)mgid.gid_prefix, 4298 (u_longlong_t)mgid.gid_guid); 4299 } 4300 4301 /* 4302 * Free the request slot allocated by the subnet event thread. 4303 */ 4304 ibd_async_done(state); 4305 } 4306 4307 /* 4308 * GLDv3 entry point to get capabilities. 4309 */ 4310 static boolean_t 4311 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) 4312 { 4313 ibd_state_t *state = arg; 4314 4315 switch (cap) { 4316 case MAC_CAPAB_HCKSUM: { 4317 uint32_t *txflags = cap_data; 4318 4319 /* 4320 * We either do full checksum or not do it at all 4321 */ 4322 if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) 4323 *txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4; 4324 else 4325 return (B_FALSE); 4326 break; 4327 } 4328 4329 case MAC_CAPAB_LSO: { 4330 mac_capab_lso_t *cap_lso = cap_data; 4331 4332 /* 4333 * In addition to the capability and policy, since LSO 4334 * relies on hw checksum, we'll not enable LSO if we 4335 * don't have hw checksum. Of course, if the HCA doesn't 4336 * provide the reserved lkey capability, enabling LSO will 4337 * actually affect performance adversely, so we'll disable 4338 * LSO even for that case. 4339 */ 4340 if (!state->id_lso_policy || !state->id_lso_capable) 4341 return (B_FALSE); 4342 4343 if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0) 4344 return (B_FALSE); 4345 4346 if (state->id_hca_res_lkey_capab == 0) { 4347 ibd_print_warn(state, "no reserved-lkey capability, " 4348 "disabling LSO"); 4349 return (B_FALSE); 4350 } 4351 4352 cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; 4353 cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1; 4354 break; 4355 } 4356 4357 default: 4358 return (B_FALSE); 4359 } 4360 4361 return (B_TRUE); 4362 } 4363 4364 /* 4365 * GLDv3 entry point to start hardware. 4366 */ 4367 /*ARGSUSED*/ 4368 static int 4369 ibd_m_start(void *arg) 4370 { 4371 return (0); 4372 } 4373 4374 /* 4375 * GLDv3 entry point to stop hardware from receiving packets. 
4376 */ 4377 /*ARGSUSED*/ 4378 static void 4379 ibd_m_stop(void *arg) 4380 { 4381 } 4382 4383 /* 4384 * GLDv3 entry point to modify device's mac address. We do not 4385 * allow address modifications. 4386 */ 4387 static int 4388 ibd_m_unicst(void *arg, const uint8_t *macaddr) 4389 { 4390 ibd_state_t *state; 4391 4392 state = (ibd_state_t *)arg; 4393 if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0) 4394 return (0); 4395 else 4396 return (EINVAL); 4397 } 4398 4399 /* 4400 * The blocking part of the IBA join/leave operations is done out 4401 * of here on the async thread. 4402 */ 4403 static void 4404 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op) 4405 { 4406 DPRINT(3, "ibd_async_multicast : async_setmc op %d :" 4407 "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid); 4408 4409 if (op == IBD_ASYNC_JOIN) { 4410 4411 if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) { 4412 ibd_print_warn(state, "Join multicast group failed :" 4413 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); 4414 } 4415 } else { 4416 /* 4417 * Here, we must search for the proper mcg_info and 4418 * use that to leave the group. 4419 */ 4420 ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL); 4421 } 4422 } 4423 4424 /* 4425 * GLDv3 entry point for multicast enable/disable requests. 4426 * This function queues the operation to the async thread and 4427 * returns success for a valid multicast address. 4428 */ 4429 static int 4430 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac) 4431 { 4432 ibd_state_t *state = (ibd_state_t *)arg; 4433 ipoib_mac_t maddr, *mcast; 4434 ib_gid_t mgid; 4435 ibd_req_t *req; 4436 4437 /* 4438 * The incoming multicast address might not be aligned properly 4439 * on a 4 byte boundary to be considered an ipoib_mac_t. We force 4440 * it to look like one though, to get the offsets of the mc gid, 4441 * since we know we are not going to dereference any values with 4442 * the ipoib_mac_t pointer. 4443 */ 4444 bcopy(mcmac, &maddr, sizeof (ipoib_mac_t)); 4445 mcast = &maddr; 4446 4447 /* 4448 * Check validity of MCG address. We could additionally check 4449 * that an enable/disable is not being issued on the "broadcast" 4450 * mcg, but since this operation is only invokable by privileged 4451 * programs anyway, we allow the flexibility to those dlpi apps. 4452 * Note that we do not validate the "scope" of the IBA mcg. 4453 */ 4454 if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN) 4455 return (EINVAL); 4456 4457 /* 4458 * Fill in multicast pkey and scope. 4459 */ 4460 IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey); 4461 4462 /* 4463 * If someone is trying to JOIN/LEAVE the broadcast group, we do 4464 * nothing (ie we stay JOINed to the broadcast group done in 4465 * ibd_drv_init()), to mimic ethernet behavior. IPv4 specifically 4466 * requires being joined to broadcast groups at all times. 4467 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also 4468 * depends on this.
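 *
 * For any other group, the work is merely queued here; the blocking
 * ibt join/leave is done later by ibd_async_multicast() on the async
 * thread, which is why this entry point can return before the fabric
 * operation has actually completed.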
4469 */ 4470 if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0) 4471 return (0); 4472 4473 ibd_n2h_gid(mcast, &mgid); 4474 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 4475 if (req == NULL) 4476 return (ENOMEM); 4477 4478 req->rq_gid = mgid; 4479 4480 if (add) { 4481 DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n", 4482 mgid.gid_prefix, mgid.gid_guid); 4483 ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN); 4484 } else { 4485 DPRINT(1, "ibd_m_multicst : unset_multicast : " 4486 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); 4487 ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE); 4488 } 4489 return (0); 4490 } 4491 4492 /* 4493 * The blocking part of the IBA promiscuous operations are done 4494 * out of here on the async thread. The dlpireq parameter indicates 4495 * whether this invocation is due to a dlpi request or due to 4496 * a port up/down event. 4497 */ 4498 static void 4499 ibd_async_unsetprom(ibd_state_t *state) 4500 { 4501 ibd_mce_t *mce = list_head(&state->id_mc_non); 4502 ib_gid_t mgid; 4503 4504 DPRINT(2, "ibd_async_unsetprom : async_unset_promisc"); 4505 4506 while (mce != NULL) { 4507 mgid = mce->mc_info.mc_adds_vect.av_dgid; 4508 mce = list_next(&state->id_mc_non, mce); 4509 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 4510 } 4511 state->id_prom_op = IBD_OP_NOTSTARTED; 4512 } 4513 4514 /* 4515 * The blocking part of the IBA promiscuous operations are done 4516 * out of here on the async thread. The dlpireq parameter indicates 4517 * whether this invocation is due to a dlpi request or due to 4518 * a port up/down event. 4519 */ 4520 static void 4521 ibd_async_setprom(ibd_state_t *state) 4522 { 4523 ibt_mcg_attr_t mcg_attr; 4524 ibt_mcg_info_t *mcg_info; 4525 ib_gid_t mgid; 4526 uint_t numg; 4527 int i, ret = IBD_OP_COMPLETED; 4528 4529 DPRINT(2, "ibd_async_setprom : async_set_promisc"); 4530 4531 /* 4532 * Obtain all active MC groups on the IB fabric with 4533 * specified criteria (scope + Pkey + Qkey + mtu). 4534 */ 4535 bzero(&mcg_attr, sizeof (mcg_attr)); 4536 mcg_attr.mc_pkey = state->id_pkey; 4537 mcg_attr.mc_scope = state->id_scope; 4538 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 4539 mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu; 4540 mcg_attr.mc_mtu_req.r_selector = IBT_EQU; 4541 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) != 4542 IBT_SUCCESS) { 4543 ibd_print_warn(state, "Could not get list of IBA multicast " 4544 "groups"); 4545 ret = IBD_OP_ERRORED; 4546 goto done; 4547 } 4548 4549 /* 4550 * Iterate over the returned mcg's and join as NonMember 4551 * to the IP mcg's. 4552 */ 4553 for (i = 0; i < numg; i++) { 4554 /* 4555 * Do a NonMember JOIN on the MC group. 4556 */ 4557 mgid = mcg_info[i].mc_adds_vect.av_dgid; 4558 if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL) 4559 ibd_print_warn(state, "IBA promiscuous mode missed " 4560 "multicast gid %016llx:%016llx", 4561 (u_longlong_t)mgid.gid_prefix, 4562 (u_longlong_t)mgid.gid_guid); 4563 } 4564 4565 ibt_free_mcg_info(mcg_info, numg); 4566 DPRINT(4, "ibd_async_setprom : async_set_promisc completes"); 4567 done: 4568 state->id_prom_op = ret; 4569 } 4570 4571 /* 4572 * GLDv3 entry point for multicast promiscuous enable/disable requests. 4573 * GLDv3 assumes phys state receives more packets than multi state, 4574 * which is not true for IPoIB. Thus, treat the multi and phys 4575 * promiscuous states the same way to work with GLDv3's assumption. 
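 *
 * Both the enable and the disable are therefore just queued as
 * IBD_ASYNC_PROMON/IBD_ASYNC_PROMOFF requests; the blocking
 * ibt_query_mcg() walk and the NonMember joins/leaves happen in
 * ibd_async_setprom() and ibd_async_unsetprom() above.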
4576 */ 4577 static int 4578 ibd_m_promisc(void *arg, boolean_t on) 4579 { 4580 ibd_state_t *state = (ibd_state_t *)arg; 4581 ibd_req_t *req; 4582 4583 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 4584 if (req == NULL) 4585 return (ENOMEM); 4586 if (on) { 4587 DPRINT(1, "ibd_m_promisc : set_promisc : %d", on); 4588 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON); 4589 } else { 4590 DPRINT(1, "ibd_m_promisc : unset_promisc"); 4591 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF); 4592 } 4593 4594 return (0); 4595 } 4596 4597 /* 4598 * GLDv3 entry point for gathering statistics. 4599 */ 4600 static int 4601 ibd_m_stat(void *arg, uint_t stat, uint64_t *val) 4602 { 4603 ibd_state_t *state = (ibd_state_t *)arg; 4604 4605 switch (stat) { 4606 case MAC_STAT_IFSPEED: 4607 *val = state->id_link_speed; 4608 break; 4609 case MAC_STAT_MULTIRCV: 4610 *val = state->id_multi_rcv; 4611 break; 4612 case MAC_STAT_BRDCSTRCV: 4613 *val = state->id_brd_rcv; 4614 break; 4615 case MAC_STAT_MULTIXMT: 4616 *val = state->id_multi_xmt; 4617 break; 4618 case MAC_STAT_BRDCSTXMT: 4619 *val = state->id_brd_xmt; 4620 break; 4621 case MAC_STAT_RBYTES: 4622 *val = state->id_rcv_bytes; 4623 break; 4624 case MAC_STAT_IPACKETS: 4625 *val = state->id_rcv_pkt; 4626 break; 4627 case MAC_STAT_OBYTES: 4628 *val = state->id_xmt_bytes; 4629 break; 4630 case MAC_STAT_OPACKETS: 4631 *val = state->id_xmt_pkt; 4632 break; 4633 case MAC_STAT_OERRORS: 4634 *val = state->id_ah_error; /* failed AH translation */ 4635 break; 4636 case MAC_STAT_IERRORS: 4637 *val = 0; 4638 break; 4639 case MAC_STAT_NOXMTBUF: 4640 *val = state->id_tx_short; 4641 break; 4642 case MAC_STAT_NORCVBUF: 4643 default: 4644 return (ENOTSUP); 4645 } 4646 4647 return (0); 4648 } 4649 4650 static void 4651 ibd_async_txsched(ibd_state_t *state) 4652 { 4653 ibd_req_t *req; 4654 int ret; 4655 4656 if (ibd_txcomp_poll) 4657 ibd_poll_compq(state, state->id_scq_hdl); 4658 4659 ret = ibd_resume_transmission(state); 4660 if (ret && ibd_txcomp_poll) { 4661 if (req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP)) 4662 ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED); 4663 else { 4664 ibd_print_warn(state, "ibd_async_txsched: " 4665 "no memory, can't schedule work slot"); 4666 } 4667 } 4668 } 4669 4670 static int 4671 ibd_resume_transmission(ibd_state_t *state) 4672 { 4673 int flag; 4674 int met_thresh = 0; 4675 int ret = -1; 4676 4677 mutex_enter(&state->id_sched_lock); 4678 if (state->id_sched_needed & IBD_RSRC_SWQE) { 4679 met_thresh = (state->id_tx_list.dl_cnt > 4680 IBD_FREE_SWQES_THRESH); 4681 flag = IBD_RSRC_SWQE; 4682 } else if (state->id_sched_needed & IBD_RSRC_LSOBUF) { 4683 ASSERT(state->id_lso != NULL); 4684 met_thresh = (state->id_lso->bkt_nfree > 4685 IBD_FREE_LSOS_THRESH); 4686 flag = IBD_RSRC_LSOBUF; 4687 } 4688 if (met_thresh) { 4689 state->id_sched_needed &= ~flag; 4690 ret = 0; 4691 } 4692 mutex_exit(&state->id_sched_lock); 4693 4694 if (ret == 0) 4695 mac_tx_update(state->id_mh); 4696 4697 return (ret); 4698 } 4699 4700 /* 4701 * Release the send wqe back into free list. 4702 */ 4703 static void 4704 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *swqe) 4705 { 4706 /* 4707 * Add back on Tx list for reuse. 
4708 */ 4709 swqe->swqe_next = NULL; 4710 mutex_enter(&state->id_tx_list.dl_mutex); 4711 if (state->id_tx_list.dl_pending_sends) { 4712 state->id_tx_list.dl_pending_sends = B_FALSE; 4713 } 4714 if (state->id_tx_list.dl_head == NULL) { 4715 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); 4716 } else { 4717 state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe); 4718 } 4719 state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); 4720 state->id_tx_list.dl_cnt++; 4721 mutex_exit(&state->id_tx_list.dl_mutex); 4722 } 4723 4724 /* 4725 * Acquire a send wqe from free list. 4726 * Returns error number and send wqe pointer. 4727 */ 4728 static int 4729 ibd_acquire_swqe(ibd_state_t *state, ibd_swqe_t **swqe) 4730 { 4731 int rc = 0; 4732 ibd_swqe_t *wqe; 4733 4734 /* 4735 * Check and reclaim some of the completed Tx requests. 4736 * If someone else is already in this code and pulling Tx 4737 * completions, no need to poll, since the current lock holder 4738 * will do the work anyway. Normally, we poll for completions 4739 * every few Tx attempts, but if we are short on Tx descriptors, 4740 * we always try to poll. 4741 */ 4742 if ((ibd_txcomp_poll == 1) && 4743 (state->id_tx_list.dl_cnt < IBD_TX_POLL_THRESH)) { 4744 ibd_poll_compq(state, state->id_scq_hdl); 4745 } 4746 4747 /* 4748 * Grab required transmit wqes. 4749 */ 4750 mutex_enter(&state->id_tx_list.dl_mutex); 4751 wqe = WQE_TO_SWQE(state->id_tx_list.dl_head); 4752 if (wqe != NULL) { 4753 state->id_tx_list.dl_cnt -= 1; 4754 state->id_tx_list.dl_head = wqe->swqe_next; 4755 if (state->id_tx_list.dl_tail == SWQE_TO_WQE(wqe)) 4756 state->id_tx_list.dl_tail = NULL; 4757 } else { 4758 /* 4759 * If we did not find the number we were looking for, flag 4760 * no resource. Adjust list appropriately in either case. 4761 */ 4762 rc = ENOENT; 4763 state->id_tx_list.dl_pending_sends = B_TRUE; 4764 DPRINT(5, "ibd_acquire_swqe: out of Tx wqe"); 4765 atomic_add_64(&state->id_tx_short, 1); 4766 } 4767 mutex_exit(&state->id_tx_list.dl_mutex); 4768 *swqe = wqe; 4769 4770 return (rc); 4771 } 4772 4773 static int 4774 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss, 4775 ibt_ud_dest_hdl_t ud_dest) 4776 { 4777 mblk_t *nmp; 4778 int iph_len, tcph_len; 4779 ibt_wr_lso_t *lso; 4780 uintptr_t ip_start, tcp_start; 4781 uint8_t *dst; 4782 uint_t pending, mblen; 4783 4784 /* 4785 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default; 4786 * we need to adjust it here for lso. 4787 */ 4788 lso = &(node->w_swr.wr.ud_lso); 4789 lso->lso_ud_dest = ud_dest; 4790 lso->lso_mss = mss; 4791 4792 /* 4793 * Calculate the LSO header size and set it in the UD LSO structure. 4794 * Note that the only assumption we make is that each of the IPoIB, 4795 * IP and TCP headers will be contained in a single mblk fragment; 4796 * together, the headers may span multiple mblk fragments. 
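 *
 * As an illustration (header sizes assumed here, not derived from this
 * code): with the usual 4-byte IPoIB encapsulation header and
 * option-less 20-byte IPv4 and TCP headers, lso_hdr_sz works out to
 * 44 bytes; if all 44 bytes sit in the first mblk, the header is not
 * copied at all and lso_hdr simply aliases mp->b_rptr.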
4797 */ 4798 nmp = mp; 4799 ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE; 4800 if (ip_start >= (uintptr_t)(nmp->b_wptr)) { 4801 ip_start = (uintptr_t)nmp->b_cont->b_rptr 4802 + (ip_start - (uintptr_t)(nmp->b_wptr)); 4803 nmp = nmp->b_cont; 4804 4805 } 4806 iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start); 4807 4808 tcp_start = ip_start + iph_len; 4809 if (tcp_start >= (uintptr_t)(nmp->b_wptr)) { 4810 tcp_start = (uintptr_t)nmp->b_cont->b_rptr 4811 + (tcp_start - (uintptr_t)(nmp->b_wptr)); 4812 nmp = nmp->b_cont; 4813 } 4814 tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start); 4815 lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len; 4816 4817 /* 4818 * If the lso header fits entirely within a single mblk fragment, 4819 * we'll avoid an additional copy of the lso header here and just 4820 * pass the b_rptr of the mblk directly. 4821 * 4822 * If this isn't true, we'd have to allocate for it explicitly. 4823 */ 4824 if (lso->lso_hdr_sz <= MBLKL(mp)) { 4825 lso->lso_hdr = mp->b_rptr; 4826 } else { 4827 /* On work completion, remember to free this allocated hdr */ 4828 lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP); 4829 if (lso->lso_hdr == NULL) { 4830 DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, " 4831 "sz = %d", lso->lso_hdr_sz); 4832 lso->lso_hdr_sz = 0; 4833 lso->lso_mss = 0; 4834 return (-1); 4835 } 4836 } 4837 4838 /* 4839 * Copy in the lso header only if we need to 4840 */ 4841 if (lso->lso_hdr != mp->b_rptr) { 4842 dst = lso->lso_hdr; 4843 pending = lso->lso_hdr_sz; 4844 4845 for (nmp = mp; nmp && pending; nmp = nmp->b_cont) { 4846 mblen = MBLKL(nmp); 4847 if (pending > mblen) { 4848 bcopy(nmp->b_rptr, dst, mblen); 4849 dst += mblen; 4850 pending -= mblen; 4851 } else { 4852 bcopy(nmp->b_rptr, dst, pending); 4853 break; 4854 } 4855 } 4856 } 4857 4858 return (0); 4859 } 4860 4861 static void 4862 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp) 4863 { 4864 ibt_wr_lso_t *lso; 4865 4866 if ((!node) || (!mp)) 4867 return; 4868 4869 /* 4870 * Free any header space that we might've allocated if we 4871 * did an LSO 4872 */ 4873 if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) { 4874 lso = &(node->w_swr.wr.ud_lso); 4875 if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) { 4876 kmem_free(lso->lso_hdr, lso->lso_hdr_sz); 4877 lso->lso_hdr = NULL; 4878 lso->lso_hdr_sz = 0; 4879 } 4880 } 4881 } 4882 4883 static void 4884 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node) 4885 { 4886 uint_t i; 4887 uint_t num_posted; 4888 uint_t n_wrs; 4889 ibt_status_t ibt_status; 4890 ibt_send_wr_t wrs[IBD_MAX_POST_MULTIPLE]; 4891 ibd_swqe_t *elem; 4892 ibd_swqe_t *nodes[IBD_MAX_POST_MULTIPLE]; 4893 4894 node->swqe_next = NULL; 4895 4896 mutex_enter(&state->id_txpost_lock); 4897 4898 /* 4899 * Enqueue the new node in chain of wqes to send 4900 */ 4901 if (state->id_tx_head) { 4902 *(state->id_tx_tailp) = (ibd_wqe_t *)node; 4903 } else { 4904 state->id_tx_head = node; 4905 } 4906 state->id_tx_tailp = &(node->swqe_next); 4907 4908 /* 4909 * If someone else is helping out with the sends, 4910 * just go back 4911 */ 4912 if (state->id_tx_busy) { 4913 mutex_exit(&state->id_txpost_lock); 4914 return; 4915 } 4916 4917 /* 4918 * Otherwise, mark the flag to indicate that we'll be 4919 * doing the dispatch of what's there in the wqe chain 4920 */ 4921 state->id_tx_busy = 1; 4922 4923 while (state->id_tx_head) { 4924 /* 4925 * Collect pending requests, IBD_MAX_POST_MULTIPLE wrs 4926 * at a time if possible, and keep posting them. 
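 *
 * For example, if IBD_MAX_POST_MULTIPLE were 4 (an illustrative
 * value only) and six swqes were queued, the first pass would hand
 * four wrs to a single ibt_post_send() and the second pass would
 * post the remaining two.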
4927 */ 4928 for (n_wrs = 0, elem = state->id_tx_head; 4929 (elem) && (n_wrs < IBD_MAX_POST_MULTIPLE); 4930 elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) { 4931 4932 nodes[n_wrs] = elem; 4933 wrs[n_wrs] = elem->w_swr; 4934 } 4935 state->id_tx_head = elem; 4936 4937 /* 4938 * Release the txpost lock before posting the 4939 * send request to the hca; if the posting fails 4940 * for some reason, we'll never receive completion 4941 * intimation, so we'll need to cleanup. 4942 */ 4943 mutex_exit(&state->id_txpost_lock); 4944 4945 ASSERT(n_wrs != 0); 4946 4947 /* 4948 * If posting fails for some reason, we'll never receive 4949 * completion intimation, so we'll need to cleanup. But 4950 * we need to make sure we don't clean up nodes whose 4951 * wrs have been successfully posted. We assume that the 4952 * hca driver returns on the first failure to post and 4953 * therefore the first 'num_posted' entries don't need 4954 * cleanup here. 4955 */ 4956 num_posted = 0; 4957 ibt_status = ibt_post_send(state->id_chnl_hdl, 4958 wrs, n_wrs, &num_posted); 4959 if (ibt_status != IBT_SUCCESS) { 4960 4961 ibd_print_warn(state, "ibd_post_send: " 4962 "posting multiple wrs failed: " 4963 "requested=%d, done=%d, ret=%d", 4964 n_wrs, num_posted, ibt_status); 4965 4966 for (i = num_posted; i < n_wrs; i++) 4967 ibd_tx_cleanup(state, nodes[i]); 4968 } 4969 4970 /* 4971 * Grab the mutex before we go and check the tx Q again 4972 */ 4973 mutex_enter(&state->id_txpost_lock); 4974 } 4975 4976 state->id_tx_busy = 0; 4977 mutex_exit(&state->id_txpost_lock); 4978 } 4979 4980 static int 4981 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node, 4982 uint_t lsohdr_sz) 4983 { 4984 ibt_wr_ds_t *sgl; 4985 ibt_status_t ibt_status; 4986 mblk_t *nmp; 4987 mblk_t *data_mp; 4988 uchar_t *bufp; 4989 size_t blksize; 4990 size_t skip; 4991 size_t avail; 4992 uint_t pktsize; 4993 uint_t frag_len; 4994 uint_t pending_hdr; 4995 uint_t hiwm; 4996 int nmblks; 4997 int i; 4998 4999 /* 5000 * Let's skip ahead to the data if this is LSO 5001 */ 5002 data_mp = mp; 5003 pending_hdr = 0; 5004 if (lsohdr_sz) { 5005 pending_hdr = lsohdr_sz; 5006 for (nmp = mp; nmp; nmp = nmp->b_cont) { 5007 frag_len = nmp->b_wptr - nmp->b_rptr; 5008 if (frag_len > pending_hdr) 5009 break; 5010 pending_hdr -= frag_len; 5011 } 5012 data_mp = nmp; /* start of data past lso header */ 5013 ASSERT(data_mp != NULL); 5014 } 5015 5016 /* 5017 * Calculate the size of message data and number of msg blocks 5018 */ 5019 pktsize = 0; 5020 for (nmblks = 0, nmp = data_mp; nmp != NULL; 5021 nmp = nmp->b_cont, nmblks++) { 5022 pktsize += MBLKL(nmp); 5023 } 5024 pktsize -= pending_hdr; 5025 5026 /* 5027 * Translating the virtual address regions into physical regions 5028 * for using the Reserved LKey feature results in a wr sgl that 5029 * is a little longer. Since failing ibt_map_mem_iov() is costly, 5030 * we'll fix a high-water mark (65%) for when we should stop. 5031 */ 5032 hiwm = (state->id_max_sqseg * 65) / 100; 5033 5034 /* 5035 * We only do ibt_map_mem_iov() if the pktsize is above the 5036 * "copy-threshold", and if the number of mp fragments is less than 5037 * the maximum acceptable. 
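 *
 * For instance, with a hypothetical id_max_sqseg of 59 the high-water
 * mark computes to 38, so an mblk chain of 38 or more fragments skips
 * ibt_map_mem_iov() and drops into the copy path below.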
5038 */ 5039 if ((state->id_hca_res_lkey_capab) && 5040 (pktsize > IBD_TX_COPY_THRESH) && 5041 (nmblks < hiwm)) { 5042 ibt_iov_t iov_arr[IBD_MAX_SQSEG]; 5043 ibt_iov_attr_t iov_attr; 5044 5045 iov_attr.iov_as = NULL; 5046 iov_attr.iov = iov_arr; 5047 iov_attr.iov_buf = NULL; 5048 iov_attr.iov_list_len = nmblks; 5049 iov_attr.iov_wr_nds = state->id_max_sqseg; 5050 iov_attr.iov_lso_hdr_sz = lsohdr_sz; 5051 iov_attr.iov_flags = IBT_IOV_SLEEP; 5052 5053 for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) { 5054 iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr; 5055 iov_arr[i].iov_len = MBLKL(nmp); 5056 if (i == 0) { 5057 iov_arr[i].iov_addr += pending_hdr; 5058 iov_arr[i].iov_len -= pending_hdr; 5059 } 5060 } 5061 5062 node->w_buftype = IBD_WQE_MAPPED; 5063 node->w_swr.wr_sgl = node->w_sgl; 5064 5065 ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr, 5066 (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl); 5067 if (ibt_status != IBT_SUCCESS) { 5068 ibd_print_warn(state, "ibd_send: ibt_map_mem_iov " 5069 "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status); 5070 goto ibd_copy_path; 5071 } 5072 5073 return (0); 5074 } 5075 5076 ibd_copy_path: 5077 if (pktsize <= state->id_tx_buf_sz) { 5078 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 5079 node->w_swr.wr_nds = 1; 5080 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 5081 node->w_buftype = IBD_WQE_TXBUF; 5082 5083 /* 5084 * Even though this is the copy path for transfers less than 5085 * id_tx_buf_sz, it could still be an LSO packet. If so, it 5086 * is possible the first data mblk fragment (data_mp) still 5087 * contains part of the LSO header that we need to skip. 5088 */ 5089 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 5090 for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) { 5091 blksize = MBLKL(nmp) - pending_hdr; 5092 bcopy(nmp->b_rptr + pending_hdr, bufp, blksize); 5093 bufp += blksize; 5094 pending_hdr = 0; 5095 } 5096 5097 return (0); 5098 } 5099 5100 /* 5101 * Copy path for transfers greater than id_tx_buf_sz 5102 */ 5103 node->w_swr.wr_sgl = node->w_sgl; 5104 if (ibd_acquire_lsobufs(state, pktsize, 5105 node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) { 5106 DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed"); 5107 return (-1); 5108 } 5109 node->w_buftype = IBD_WQE_LSOBUF; 5110 5111 /* 5112 * Copy the larger-than-id_tx_buf_sz packet into a set of 5113 * fixed-sized, pre-mapped LSO buffers. Note that we might 5114 * need to skip part of the LSO header in the first fragment 5115 * as before. 5116 */ 5117 nmp = data_mp; 5118 skip = pending_hdr; 5119 for (i = 0; i < node->w_swr.wr_nds; i++) { 5120 sgl = node->w_swr.wr_sgl + i; 5121 bufp = (uchar_t *)(uintptr_t)sgl->ds_va; 5122 avail = IBD_LSO_BUFSZ; 5123 while (nmp && avail) { 5124 blksize = MBLKL(nmp) - skip; 5125 if (blksize > avail) { 5126 bcopy(nmp->b_rptr + skip, bufp, avail); 5127 skip += avail; 5128 avail = 0; 5129 } else { 5130 bcopy(nmp->b_rptr + skip, bufp, blksize); 5131 skip = 0; 5132 avail -= blksize; 5133 bufp += blksize; 5134 nmp = nmp->b_cont; 5135 } 5136 } 5137 } 5138 5139 return (0); 5140 } 5141 5142 /* 5143 * Schedule a completion queue polling to reap the resource we're 5144 * short on. If we implement the change to reap tx completions 5145 * in a separate thread, we'll need to wake up that thread here. 
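 *
 * Callers pass IBD_RSRC_SWQE or IBD_RSRC_LSOBUF; the bit stays set in
 * id_sched_needed until ibd_resume_transmission() sees the free count
 * climb back above the corresponding threshold and calls
 * mac_tx_update() to restart the mac layer.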
5146 */ 5147 static int 5148 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag) 5149 { 5150 ibd_req_t *req; 5151 5152 mutex_enter(&state->id_sched_lock); 5153 state->id_sched_needed |= resource_type; 5154 mutex_exit(&state->id_sched_lock); 5155 5156 /* 5157 * If we are asked to queue a work entry, we need to do it 5158 */ 5159 if (q_flag) { 5160 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 5161 if (req == NULL) 5162 return (-1); 5163 5164 ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED); 5165 } 5166 5167 return (0); 5168 } 5169 5170 /* 5171 * The passed in packet has this format: 5172 * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data 5173 */ 5174 static boolean_t 5175 ibd_send(ibd_state_t *state, mblk_t *mp) 5176 { 5177 ibd_ace_t *ace; 5178 ibd_swqe_t *node; 5179 ipoib_mac_t *dest; 5180 ib_header_info_t *ipibp; 5181 ip6_t *ip6h; 5182 uint_t pktsize; 5183 uint32_t mss; 5184 uint32_t hckflags; 5185 uint32_t lsoflags = 0; 5186 uint_t lsohdr_sz = 0; 5187 int ret, len; 5188 boolean_t dofree = B_FALSE; 5189 boolean_t rc; 5190 5191 node = NULL; 5192 if (ibd_acquire_swqe(state, &node) != 0) { 5193 /* 5194 * If we don't have an swqe available, schedule a transmit 5195 * completion queue cleanup and hold off on sending more 5196 * more packets until we have some free swqes 5197 */ 5198 if (ibd_sched_poll(state, IBD_RSRC_SWQE, ibd_txcomp_poll) == 0) 5199 return (B_FALSE); 5200 5201 /* 5202 * If a poll cannot be scheduled, we have no choice but 5203 * to drop this packet 5204 */ 5205 ibd_print_warn(state, "ibd_send: no swqe, pkt drop"); 5206 return (B_TRUE); 5207 } 5208 5209 /* 5210 * Initialize the commonly used fields in swqe to NULL to protect 5211 * against ibd_tx_cleanup accidentally misinterpreting these on a 5212 * failure. 5213 */ 5214 node->swqe_im_mblk = NULL; 5215 node->w_swr.wr_nds = 0; 5216 node->w_swr.wr_sgl = NULL; 5217 node->w_swr.wr_opcode = IBT_WRC_SEND; 5218 5219 /* 5220 * Obtain an address handle for the destination. 5221 */ 5222 ipibp = (ib_header_info_t *)mp->b_rptr; 5223 dest = (ipoib_mac_t *)&ipibp->ib_dst; 5224 if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 5225 IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey); 5226 5227 pktsize = msgsize(mp); 5228 5229 atomic_add_64(&state->id_xmt_bytes, pktsize); 5230 atomic_inc_64(&state->id_xmt_pkt); 5231 if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) 5232 atomic_inc_64(&state->id_brd_xmt); 5233 else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 5234 atomic_inc_64(&state->id_multi_xmt); 5235 5236 if ((ace = ibd_acache_lookup(state, dest, &ret, 1)) != NULL) { 5237 node->w_ahandle = ace; 5238 node->w_swr.wr.ud.udwr_dest = ace->ac_dest; 5239 } else { 5240 DPRINT(5, 5241 "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X", 5242 ((ret == EFAULT) ? "failed" : "queued"), 5243 htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]), 5244 htonl(dest->ipoib_gidpref[1]), 5245 htonl(dest->ipoib_gidsuff[0]), 5246 htonl(dest->ipoib_gidsuff[1])); 5247 node->w_ahandle = NULL; 5248 5249 /* 5250 * for the poll mode, it is probably some cqe pending in the 5251 * cq. So ibd has to poll cq here, otherwise acache probably 5252 * may not be recycled. 5253 */ 5254 if (ibd_txcomp_poll == 1) 5255 ibd_poll_compq(state, state->id_scq_hdl); 5256 5257 /* 5258 * Here if ibd_acache_lookup() returns EFAULT, it means ibd 5259 * can not find a path for the specific dest address. We 5260 * should get rid of this kind of packet. 
We also should get 5261 * rid of the packet if we cannot schedule a poll via the 5262 * async thread. For the normal case, ibd will return the 5263 * packet to upper layer and wait for AH creating. 5264 * 5265 * Note that we always queue a work slot entry for the async 5266 * thread when we fail AH lookup (even in intr mode); this is 5267 * due to the convoluted way the code currently looks for AH. 5268 */ 5269 if (ret == EFAULT) { 5270 dofree = B_TRUE; 5271 rc = B_TRUE; 5272 } else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) { 5273 dofree = B_TRUE; 5274 rc = B_TRUE; 5275 } else { 5276 dofree = B_FALSE; 5277 rc = B_FALSE; 5278 } 5279 goto ibd_send_fail; 5280 } 5281 5282 /* 5283 * For ND6 packets, padding is at the front of the source lladdr. 5284 * Insert the padding at front. 5285 */ 5286 if (ntohs(ipibp->ipib_rhdr.ipoib_type) == IP6_DL_SAP) { 5287 if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) { 5288 if (!pullupmsg(mp, IPV6_HDR_LEN + 5289 sizeof (ib_header_info_t))) { 5290 DPRINT(10, "ibd_send: pullupmsg failure "); 5291 dofree = B_TRUE; 5292 rc = B_TRUE; 5293 goto ibd_send_fail; 5294 } 5295 ipibp = (ib_header_info_t *)mp->b_rptr; 5296 } 5297 ip6h = (ip6_t *)((uchar_t *)ipibp + 5298 sizeof (ib_header_info_t)); 5299 len = ntohs(ip6h->ip6_plen); 5300 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 5301 mblk_t *pad; 5302 5303 pad = allocb(4, 0); 5304 pad->b_wptr = (uchar_t *)pad->b_rptr + 4; 5305 linkb(mp, pad); 5306 if (MBLKL(mp) < sizeof (ib_header_info_t) + 5307 IPV6_HDR_LEN + len + 4) { 5308 if (!pullupmsg(mp, sizeof (ib_header_info_t) + 5309 IPV6_HDR_LEN + len + 4)) { 5310 DPRINT(10, "ibd_send: pullupmsg " 5311 "failure "); 5312 dofree = B_TRUE; 5313 rc = B_TRUE; 5314 goto ibd_send_fail; 5315 } 5316 ip6h = (ip6_t *)((uchar_t *)mp->b_rptr + 5317 sizeof (ib_header_info_t)); 5318 } 5319 5320 /* LINTED: E_CONSTANT_CONDITION */ 5321 IBD_PAD_NSNA(ip6h, len, IBD_SEND); 5322 } 5323 } 5324 5325 mp->b_rptr += sizeof (ib_addrs_t); 5326 5327 /* 5328 * Do LSO and checksum related work here. For LSO send, adjust the 5329 * ud destination, the opcode and the LSO header information to the 5330 * work request. 5331 */ 5332 lso_info_get(mp, &mss, &lsoflags); 5333 if ((lsoflags & HW_LSO) != HW_LSO) { 5334 node->w_swr.wr_opcode = IBT_WRC_SEND; 5335 lsohdr_sz = 0; 5336 } else { 5337 if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) { 5338 /* 5339 * The routine can only fail if there's no memory; we 5340 * can only drop the packet if this happens 5341 */ 5342 ibd_print_warn(state, 5343 "ibd_send: no memory, lso posting failed"); 5344 dofree = B_TRUE; 5345 rc = B_TRUE; 5346 goto ibd_send_fail; 5347 } 5348 5349 node->w_swr.wr_opcode = IBT_WRC_SEND_LSO; 5350 lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz; 5351 } 5352 5353 hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &hckflags); 5354 if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM) 5355 node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM; 5356 else 5357 node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM; 5358 5359 /* 5360 * Prepare the sgl for posting; the routine can only fail if there's 5361 * no lso buf available for posting. If this is the case, we should 5362 * probably resched for lso bufs to become available and then try again. 
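 *
 * The return value matters here: rc of B_FALSE tells ibd_m_tx() to
 * keep the mblk and retry the send later, while B_TRUE means the
 * mblk is considered consumed (and is freed here when dofree is set).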
5363 */ 5364 if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) { 5365 if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) { 5366 dofree = B_TRUE; 5367 rc = B_TRUE; 5368 } else { 5369 dofree = B_FALSE; 5370 rc = B_FALSE; 5371 } 5372 goto ibd_send_fail; 5373 } 5374 node->swqe_im_mblk = mp; 5375 5376 /* 5377 * Queue the wqe to hardware; since we can now simply queue a 5378 * post instead of doing it serially, we cannot assume anything 5379 * about the 'node' after ibd_post_send() returns. 5380 */ 5381 ibd_post_send(state, node); 5382 5383 return (B_TRUE); 5384 5385 ibd_send_fail: 5386 if (node && mp) 5387 ibd_free_lsohdr(node, mp); 5388 5389 if (dofree) 5390 freemsg(mp); 5391 5392 if (node != NULL) 5393 ibd_tx_cleanup(state, node); 5394 5395 return (rc); 5396 } 5397 5398 /* 5399 * GLDv3 entry point for transmitting datagram. 5400 */ 5401 static mblk_t * 5402 ibd_m_tx(void *arg, mblk_t *mp) 5403 { 5404 ibd_state_t *state = (ibd_state_t *)arg; 5405 mblk_t *next; 5406 5407 while (mp != NULL) { 5408 next = mp->b_next; 5409 mp->b_next = NULL; 5410 if (ibd_send(state, mp) == B_FALSE) { 5411 /* Send fail */ 5412 mp->b_next = next; 5413 break; 5414 } 5415 mp = next; 5416 } 5417 5418 return (mp); 5419 } 5420 5421 /* 5422 * this handles Tx and Rx completions. With separate CQs, this handles 5423 * only Rx completions. 5424 */ 5425 static uint_t 5426 ibd_intr(char *arg) 5427 { 5428 ibd_state_t *state = (ibd_state_t *)arg; 5429 5430 ibd_poll_compq(state, state->id_rcq_hdl); 5431 5432 return (DDI_INTR_CLAIMED); 5433 } 5434 5435 /* 5436 * Poll and drain the cq 5437 */ 5438 static uint_t 5439 ibd_drain_cq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl, ibt_wc_t *wcs, 5440 uint_t numwcs) 5441 { 5442 ibd_wqe_t *wqe; 5443 ibt_wc_t *wc; 5444 uint_t total_polled = 0; 5445 uint_t num_polled; 5446 int i; 5447 5448 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) { 5449 total_polled += num_polled; 5450 for (i = 0, wc = wcs; i < num_polled; i++, wc++) { 5451 wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id; 5452 ASSERT((wqe->w_type == IBD_WQE_SEND) || 5453 (wqe->w_type == IBD_WQE_RECV)); 5454 if (wc->wc_status != IBT_WC_SUCCESS) { 5455 /* 5456 * Channel being torn down. 5457 */ 5458 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { 5459 DPRINT(5, "ibd_drain_cq: flush error"); 5460 /* 5461 * Only invoke the Tx handler to 5462 * release possibly held resources 5463 * like AH refcount etc. Can not 5464 * invoke Rx handler because it might 5465 * try adding buffers to the Rx pool 5466 * when we are trying to deinitialize. 5467 */ 5468 if (wqe->w_type == IBD_WQE_RECV) { 5469 continue; 5470 } else { 5471 DPRINT(10, "ibd_drain_cq: Bad " 5472 "status %d", wc->wc_status); 5473 } 5474 } 5475 } 5476 if (wqe->w_type == IBD_WQE_SEND) { 5477 ibd_tx_cleanup(state, WQE_TO_SWQE(wqe)); 5478 } else { 5479 ibd_process_rx(state, WQE_TO_RWQE(wqe), wc); 5480 } 5481 } 5482 } 5483 5484 return (total_polled); 5485 } 5486 5487 /* 5488 * Common code for interrupt handling as well as for polling 5489 * for all completed wqe's while detaching. 
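 *
 * Concurrent callers coordinate through id_cq_poll_busy: if the
 * polling flag for this CQ is already held, the caller just sets the
 * matching redo flag and returns, and the current holder keeps
 * re-draining until it can clear its flag with no redo pending.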
5490 */ 5491 static void 5492 ibd_poll_compq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 5493 { 5494 ibt_wc_t *wcs; 5495 uint_t numwcs; 5496 int flag, redo_flag; 5497 int redo = 1; 5498 uint_t num_polled = 0; 5499 5500 if (ibd_separate_cqs == 1) { 5501 if (cq_hdl == state->id_rcq_hdl) { 5502 flag = IBD_RX_CQ_POLLING; 5503 redo_flag = IBD_REDO_RX_CQ_POLLING; 5504 } else { 5505 flag = IBD_TX_CQ_POLLING; 5506 redo_flag = IBD_REDO_TX_CQ_POLLING; 5507 } 5508 } else { 5509 flag = IBD_RX_CQ_POLLING | IBD_TX_CQ_POLLING; 5510 redo_flag = IBD_REDO_RX_CQ_POLLING | IBD_REDO_TX_CQ_POLLING; 5511 } 5512 5513 mutex_enter(&state->id_cq_poll_lock); 5514 if (state->id_cq_poll_busy & flag) { 5515 state->id_cq_poll_busy |= redo_flag; 5516 mutex_exit(&state->id_cq_poll_lock); 5517 return; 5518 } 5519 state->id_cq_poll_busy |= flag; 5520 mutex_exit(&state->id_cq_poll_lock); 5521 5522 /* 5523 * In some cases (eg detaching), this code can be invoked on 5524 * any cpu after disabling cq notification (thus no concurrency 5525 * exists). Apart from that, the following applies normally: 5526 * The receive completion handling is always on the Rx interrupt 5527 * cpu. Transmit completion handling could be from any cpu if 5528 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ 5529 * is interrupt driven. Combined completion handling is always 5530 * on the interrupt cpu. Thus, lock accordingly and use the 5531 * proper completion array. 5532 */ 5533 if (ibd_separate_cqs == 1) { 5534 if (cq_hdl == state->id_rcq_hdl) { 5535 wcs = state->id_rxwcs; 5536 numwcs = state->id_rxwcs_size; 5537 } else { 5538 wcs = state->id_txwcs; 5539 numwcs = state->id_txwcs_size; 5540 } 5541 } else { 5542 wcs = state->id_rxwcs; 5543 numwcs = state->id_rxwcs_size; 5544 } 5545 5546 /* 5547 * Poll and drain the CQ 5548 */ 5549 num_polled = ibd_drain_cq(state, cq_hdl, wcs, numwcs); 5550 5551 /* 5552 * Enable CQ notifications and redrain the cq to catch any 5553 * completions we might have missed after the ibd_drain_cq() 5554 * above and before the ibt_enable_cq_notify() that follows. 5555 * Finally, service any new requests to poll the cq that 5556 * could've come in after the ibt_enable_cq_notify(). 5557 */ 5558 do { 5559 if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) != 5560 IBT_SUCCESS) { 5561 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); 5562 } 5563 5564 num_polled += ibd_drain_cq(state, cq_hdl, wcs, numwcs); 5565 5566 mutex_enter(&state->id_cq_poll_lock); 5567 if (state->id_cq_poll_busy & redo_flag) 5568 state->id_cq_poll_busy &= ~redo_flag; 5569 else { 5570 state->id_cq_poll_busy &= ~flag; 5571 redo = 0; 5572 } 5573 mutex_exit(&state->id_cq_poll_lock); 5574 5575 } while (redo); 5576 5577 /* 5578 * If we polled the receive cq and found anything, we need to flush 5579 * it out to the nw layer here. 5580 */ 5581 if ((flag & IBD_RX_CQ_POLLING) && (num_polled > 0)) { 5582 ibd_flush_rx(state, NULL); 5583 } 5584 } 5585 5586 /* 5587 * Unmap the memory area associated with a given swqe. 
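 *
 * The w_mi_hdl released here is the handle obtained when the outgoing
 * mblk chain was mapped in place on the transmit side; the pairing is
 * roughly of the form below (a sketch only -- see ibd_prepare_sgl() for
 * the exact iov attributes that are filled in):
 *
 *	ibt_map_mem_iov(state->id_hca_hdl, &iov_attr,
 *	    (ibt_all_wr_t *)&swqe->w_swr, &swqe->w_mi_hdl);
 *	... post the send, reap its completion ...
 *	ibt_unmap_mem_iov(state->id_hca_hdl, swqe->w_mi_hdl);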
5588 */
5589 static void
5590 ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe)
5591 {
5592 ibt_status_t stat;
5593
5594 DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds);
5595
5596 if (swqe->w_mi_hdl) {
5597 if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl,
5598 swqe->w_mi_hdl)) != IBT_SUCCESS) {
5599 DPRINT(10,
5600 "failed in ibt_unmap_mem_iov, ret=%d\n", stat);
5601 }
5602 swqe->w_mi_hdl = NULL;
5603 }
5604 swqe->w_swr.wr_nds = 0;
5605 }
5606
5607 /*
5608 * Common code that deals with cleanups after a successful or
5609 * erroneous transmission attempt.
5610 */
5611 static void
5612 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe)
5613 {
5614 ibd_ace_t *ace = swqe->w_ahandle;
5615
5616 DPRINT(20, "ibd_tx_cleanup %p\n", swqe);
5617
5618 /*
5619 * If this was a dynamic mapping in ibd_send(), we need to
5620 * unmap here. If this was an lso buffer we'd used for sending,
5621 * we need to release the lso buf to the pool, since the resource
5622 * is scarce. However, if this was simply a normal send using
5623 * the copybuf (present in each swqe), we don't need to release it.
5624 */
5625 if (swqe->swqe_im_mblk != NULL) {
5626 if (swqe->w_buftype == IBD_WQE_MAPPED) {
5627 ibd_unmap_mem(state, swqe);
5628 } else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
5629 ibd_release_lsobufs(state,
5630 swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
5631 }
5632 ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
5633 freemsg(swqe->swqe_im_mblk);
5634 swqe->swqe_im_mblk = NULL;
5635 }
5636
5637 /*
5638 * Drop the reference count on the AH; it can be reused
5639 * now for a different destination if there are no more
5640 * posted sends that will use it. This can be eliminated
5641 * if we can always associate each Tx buffer with an AH.
5642 * The ace can be null if we are cleaning up from the
5643 * ibd_send() error path.
5644 */
5645 if (ace != NULL) {
5646 /*
5647 * The recycling logic can be eliminated from here
5648 * and put into the async thread if we create another
5649 * list to hold ACE's for unjoined mcg's.
5650 */
5651 if (DEC_REF_DO_CYCLE(ace)) {
5652 ibd_mce_t *mce;
5653
5654 /*
5655 * Check with the lock taken: we decremented
5656 * reference count without the lock, and some
5657 * transmitter might already have bumped the
5658 * reference count (possible in case of multicast
5659 * disable when we leave the AH on the active
5660 * list). If it is no longer 0, get out, leaving the
5661 * recycle bit intact.
5662 *
5663 * Atomically transition the AH from active
5664 * to free list, and queue a work request to
5665 * leave the group and destroy the mce. No
5666 * transmitter can be looking at the AH or
5667 * the MCE in between, since we have the
5668 * ac_mutex lock. In the SendOnly reap case,
5669 * it is not necessary to hold the ac_mutex
5670 * and recheck the ref count (since the AH was
5671 * taken off the active list); we just do it
5672 * to have uniform processing with the Full
5673 * reap case.
5674 */
5675 mutex_enter(&state->id_ac_mutex);
5676 mce = ace->ac_mce;
5677 if (GET_REF_CYCLE(ace) == 0) {
5678 CLEAR_REFCYCLE(ace);
5679 /*
5680 * Identify the case of fullmember reap as
5681 * opposed to mcg trap reap. Also, port up
5682 * might set ac_mce to NULL to indicate Tx
5683 * cleanup should do no more than put the
5684 * AH in the free list (see ibd_async_link).
5685 */
5686 if (mce != NULL) {
5687 ace->ac_mce = NULL;
5688 IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
5689 /*
5690 * mc_req was initialized at mce
5691 * creation time.
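 *
 * (To recap the recycle path in this block: the reference was dropped
 * above without holding id_ac_mutex; only when the count reached zero
 * with the CYCLE bit set do we retake the mutex, re-check the
 * count/cycle state, detach the mce, move the ACE from the active list
 * to the free list, and hand the actual mcg leave/destroy work to the
 * async thread via IBD_ASYNC_REAP.)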
5692 */
5693 ibd_queue_work_slot(state,
5694 &mce->mc_req, IBD_ASYNC_REAP);
5695 }
5696 IBD_ACACHE_INSERT_FREE(state, ace);
5697 }
5698 mutex_exit(&state->id_ac_mutex);
5699 }
5700 }
5701
5702 /*
5703 * Release the send wqe for reuse.
5704 */
5705 ibd_release_swqe(state, swqe);
5706 }
5707
5708 /*
5709 * Hand off the processed rx mp chain to mac_rx()
5710 */
5711 static void
5712 ibd_flush_rx(ibd_state_t *state, mblk_t *mpc)
5713 {
5714 if (mpc == NULL) {
5715 mutex_enter(&state->id_rx_lock);
5716
5717 mpc = state->id_rx_mp;
5718
5719 state->id_rx_mp = NULL;
5720 state->id_rx_mp_tail = NULL;
5721 state->id_rx_mp_len = 0;
5722
5723 mutex_exit(&state->id_rx_lock);
5724 }
5725
5726 if (mpc) {
5727 mac_rx(state->id_mh, state->id_rh, mpc);
5728 }
5729 }
5730
5731 /*
5732 * Processing to be done after receipt of a packet; hand it off to GLDv3
5733 * in the format GLDv3 expects. The received packet has this
5734 * format: 2b sap :: 00 :: data.
5735 */
5736 static void
5737 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
5738 {
5739 ib_header_info_t *phdr;
5740 mblk_t *mp;
5741 mblk_t *mpc = NULL;
5742 ipoib_hdr_t *ipibp;
5743 ipha_t *iphap;
5744 ip6_t *ip6h;
5745 int rxcnt, len;
5746
5747 /*
5748 * Track number handed to upper layer, and number still
5749 * available to receive packets.
5750 */
5751 rxcnt = atomic_add_32_nv(&state->id_rx_list.dl_cnt, -1);
5752 ASSERT(rxcnt >= 0);
5753 atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, 1);
5754
5755 /*
5756 * Adjust write pointer depending on how much data came in.
5757 */
5758 mp = rwqe->rwqe_im_mblk;
5759 mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer;
5760
5761 /*
5762 * Make sure this is NULL or we're in trouble.
5763 */
5764 if (mp->b_next != NULL) {
5765 ibd_print_warn(state,
5766 "ibd_process_rx: got duplicate mp from rcq?");
5767 mp->b_next = NULL;
5768 }
5769
5770 /*
5771 * The IB link will deliver one of the IB link layer
5772 * headers, called the Global Routing Header (GRH).
5773 * The ibd driver uses the information in the GRH to build the
5774 * Header_info structure and pass it with the datagram up
5775 * to GLDv3.
5776 * If the GRH is not valid, indicate to GLDv3 by setting
5777 * the VerTcFlow field to 0.
5778 */
5779 phdr = (ib_header_info_t *)mp->b_rptr;
5780 if (wc->wc_flags & IBT_WC_GRH_PRESENT) {
5781 phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn);
5782
5783 /* if it is a loopback packet, just drop it. */
5784 if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr,
5785 IPOIB_ADDRL) == 0) {
5786 freemsg(mp);
5787 return;
5788 }
5789
5790 ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src,
5791 sizeof (ipoib_mac_t));
5792 if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) {
5793 phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN);
5794 IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst);
5795 } else {
5796 phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn;
5797 }
5798 } else {
5799 /*
5800 * It cannot be an IBA multicast packet. It must have been
5801 * unicast to us. Just copy the interface address to dst.
5802 */
5803 phdr->ib_grh.ipoib_vertcflow = 0;
5804 ovbcopy(&state->id_macaddr, &phdr->ib_dst,
5805 sizeof (ipoib_mac_t));
5806 }
5807
5808 /*
5809 * For ND6 packets, padding is at the front of the source/target
5810 * lladdr. However, the inet6 layer is not aware of it, so remove
5811 * the padding from such packets.
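 *
 * This is the receive-side counterpart of the adjustment made in
 * ibd_send(): there, 4 bytes of padding are linked onto outgoing NS/NA
 * messages before calling IBD_PAD_NSNA(..., IBD_SEND); here the macro
 * is invoked with IBD_RECV so that the padding in front of the lladdr
 * is removed again and ip6 sees the option layout it expects.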
5812 */
5813 ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t));
5814 if (ntohs(ipibp->ipoib_type) == IP6_DL_SAP) {
5815 if (MBLKL(mp) < sizeof (ipoib_hdr_t) + IPV6_HDR_LEN) {
5816 if (!pullupmsg(mp, IPV6_HDR_LEN +
5817 sizeof (ipoib_hdr_t))) {
5818 DPRINT(10, "ibd_process_rx: pullupmsg failed");
5819 freemsg(mp);
5820 return;
5821 }
5822 ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr +
5823 sizeof (ipoib_pgrh_t));
5824 }
5825 ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
5826 len = ntohs(ip6h->ip6_plen);
5827 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
5828 if (MBLKL(mp) < sizeof (ipoib_hdr_t) +
5829 IPV6_HDR_LEN + len) {
5830 if (!pullupmsg(mp, sizeof (ipoib_hdr_t) +
5831 IPV6_HDR_LEN + len)) {
5832 DPRINT(10, "ibd_process_rx: pullupmsg"
5833 " failed");
5834 freemsg(mp);
5835 return;
5836 }
5837 ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
5838 sizeof (ipoib_pgrh_t) +
5839 sizeof (ipoib_hdr_t));
5840 }
5841 /* LINTED: E_CONSTANT_CONDITION */
5842 IBD_PAD_NSNA(ip6h, len, IBD_RECV);
5843 }
5844 }
5845
5846 /*
5847 * Update statistics
5848 */
5849 atomic_add_64(&state->id_rcv_bytes, wc->wc_bytes_xfer);
5850 atomic_inc_64(&state->id_rcv_pkt);
5851 if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
5852 atomic_inc_64(&state->id_brd_rcv);
5853 else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
5854 atomic_inc_64(&state->id_multi_rcv);
5855
5856 iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
5857 /*
5858 * Set receive checksum status in mp
5859 * Hardware checksumming can be considered valid only if:
5860 * 1. CQE.IP_OK bit is set
5861 * 2. CQE.CKSUM = 0xffff
5862 * 3. IPv6 routing header is not present in the packet
5863 * 4. There are no IP options in the IP header
5864 */
5865
5866 if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) &&
5867 (wc->wc_cksum == 0xFFFF) &&
5868 (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) {
5869 (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
5870 HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);
5871 }
5872
5873 /*
5874 * Add this mp to the list of processed mp's to send to
5875 * the nw layer
5876 */
5877 mutex_enter(&state->id_rx_lock);
5878 if (state->id_rx_mp) {
5879 ASSERT(state->id_rx_mp_tail != NULL);
5880 state->id_rx_mp_tail->b_next = mp;
5881 } else {
5882 ASSERT(state->id_rx_mp_tail == NULL);
5883 state->id_rx_mp = mp;
5884 }
5885
5886 state->id_rx_mp_tail = mp;
5887 state->id_rx_mp_len++;
5888
5889 if (state->id_rx_mp_len >= IBD_MAX_RX_MP_LEN) {
5890 mpc = state->id_rx_mp;
5891
5892 state->id_rx_mp = NULL;
5893 state->id_rx_mp_tail = NULL;
5894 state->id_rx_mp_len = 0;
5895 }
5896
5897 mutex_exit(&state->id_rx_lock);
5898
5899 if (mpc) {
5900 ibd_flush_rx(state, mpc);
5901 }
5902 }
5903
5904 /*
5905 * Callback code invoked from STREAMS when the receive data buffer is
5906 * free for recycling.
5907 */
5908 static void
5909 ibd_freemsg_cb(char *arg)
5910 {
5911 ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
5912 ibd_state_t *state = rwqe->w_state;
5913
5914 /*
5915 * If the wqe is being destroyed, do not attempt recycling.
5916 */
5917 if (rwqe->w_freeing_wqe == B_TRUE) {
5918 DPRINT(6, "ibd_freemsg: wqe being freed");
5919 return;
5920 } else {
5921 /*
5922 * The upper layer has released the held mblk, so we
5923 * no longer need to keep the old pointer in
5924 * our rwqe.
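 *
 * The recycling below follows the usual desballoc() pattern: wrap the
 * same receive copybuf in a fresh mblk, passing w_freemsg_cb again as
 * the free routine so that the next free of the message brings us back
 * here, and then repost the rwqe to the channel via ibd_post_rwqe().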
5925 */ 5926 rwqe->rwqe_im_mblk = NULL; 5927 } 5928 5929 rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, 5930 state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb); 5931 if (rwqe->rwqe_im_mblk == NULL) { 5932 ibd_delete_rwqe(state, rwqe); 5933 ibd_free_rwqe(state, rwqe); 5934 DPRINT(6, "ibd_freemsg: desballoc failed"); 5935 return; 5936 } 5937 5938 if (ibd_post_rwqe(state, rwqe, B_TRUE) == DDI_FAILURE) { 5939 ibd_delete_rwqe(state, rwqe); 5940 ibd_free_rwqe(state, rwqe); 5941 return; 5942 } 5943 5944 atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, -1); 5945 } 5946 5947 static uint_t 5948 ibd_tx_recycle(char *arg) 5949 { 5950 ibd_state_t *state = (ibd_state_t *)arg; 5951 5952 /* 5953 * Poll for completed entries 5954 */ 5955 ibd_poll_compq(state, state->id_scq_hdl); 5956 5957 /* 5958 * Resume any blocked transmissions if possible 5959 */ 5960 (void) ibd_resume_transmission(state); 5961 5962 return (DDI_INTR_CLAIMED); 5963 } 5964 5965 #ifdef IBD_LOGGING 5966 static void 5967 ibd_log_init(void) 5968 { 5969 ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP); 5970 ibd_lbuf_ndx = 0; 5971 } 5972 5973 static void 5974 ibd_log_fini(void) 5975 { 5976 if (ibd_lbuf) 5977 kmem_free(ibd_lbuf, IBD_LOG_SZ); 5978 ibd_lbuf_ndx = 0; 5979 ibd_lbuf = NULL; 5980 } 5981 5982 static void 5983 ibd_log(const char *fmt, ...) 5984 { 5985 va_list ap; 5986 uint32_t off; 5987 uint32_t msglen; 5988 char tmpbuf[IBD_DMAX_LINE]; 5989 5990 if (ibd_lbuf == NULL) 5991 return; 5992 5993 va_start(ap, fmt); 5994 msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap); 5995 va_end(ap); 5996 5997 if (msglen >= IBD_DMAX_LINE) 5998 msglen = IBD_DMAX_LINE - 1; 5999 6000 mutex_enter(&ibd_lbuf_lock); 6001 6002 off = ibd_lbuf_ndx; /* current msg should go here */ 6003 if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n')) 6004 ibd_lbuf[ibd_lbuf_ndx-1] = '\n'; 6005 6006 ibd_lbuf_ndx += msglen; /* place where next msg should start */ 6007 ibd_lbuf[ibd_lbuf_ndx] = 0; /* current msg should terminate */ 6008 6009 if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE)) 6010 ibd_lbuf_ndx = 0; 6011 6012 mutex_exit(&ibd_lbuf_lock); 6013 6014 bcopy(tmpbuf, ibd_lbuf+off, msglen); /* no lock needed for this */ 6015 } 6016 #endif 6017
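/*
 * For reference, the wrap-around debug log implemented by ibd_log() above
 * reduces to the following self-contained sketch (a user-level analogue,
 * not part of the driver; names and sizes are illustrative, and the
 * locking plus the newline fix-up done by ibd_log() are omitted):
 *
 *	#include <stdarg.h>
 *	#include <stdio.h>
 *	#include <string.h>
 *
 *	#define	LOG_SZ		0x20000
 *	#define	MAX_LINE	1024
 *
 *	static char log_buf[LOG_SZ];
 *	static unsigned int log_ndx;
 *
 *	static void
 *	log_msg(const char *fmt, ...)
 *	{
 *		char tmp[MAX_LINE];
 *		va_list ap;
 *		unsigned int len, off;
 *
 *		va_start(ap, fmt);
 *		len = vsnprintf(tmp, MAX_LINE, fmt, ap);
 *		va_end(ap);
 *		if (len >= MAX_LINE)
 *			len = MAX_LINE - 1;
 *
 *		off = log_ndx;			-- this message lands here
 *		log_ndx += len;			-- next message starts here
 *		log_buf[log_ndx] = '\0';
 *		if (log_ndx >= LOG_SZ - 2 * MAX_LINE)
 *			log_ndx = 0;		-- wrap well before the end
 *		(void) memcpy(log_buf + off, tmp, len);
 *	}
 */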