1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * An implementation of the IPoIB standard based on PSARC 2001/289. 29 */ 30 31 #include <sys/types.h> 32 #include <sys/conf.h> 33 #include <sys/ddi.h> 34 #include <sys/sunddi.h> 35 #include <sys/modctl.h> 36 #include <sys/stropts.h> 37 #include <sys/stream.h> 38 #include <sys/strsun.h> 39 #include <sys/strsubr.h> 40 #include <sys/dlpi.h> 41 #include <sys/mac_provider.h> 42 43 #include <sys/pattr.h> /* for HCK_FULLCKSUM */ 44 #include <sys/sysmacros.h> /* for offsetof */ 45 #include <sys/disp.h> /* for async thread pri */ 46 #include <sys/atomic.h> /* for atomic_add*() */ 47 #include <sys/ethernet.h> /* for ETHERTYPE_IP */ 48 #include <netinet/in.h> /* for netinet/ip.h below */ 49 #include <netinet/ip.h> /* for struct ip */ 50 #include <netinet/udp.h> /* for struct udphdr */ 51 #include <inet/common.h> /* for inet/ip.h below */ 52 #include <inet/ip.h> /* for ipha_t */ 53 #include <inet/ip_if.h> /* for IP6_DL_SAP */ 54 #include <inet/ip6.h> /* for ip6_t */ 55 #include <inet/tcp.h> /* for tcph_t */ 56 #include <netinet/icmp6.h> /* for icmp6_t */ 57 #include <sys/callb.h> 58 #include <sys/modhash.h> 59 60 #include <sys/ib/clients/ibd/ibd.h> 61 #include <sys/ib/mgt/sm_attr.h> /* for SM_INIT_TYPE_* */ 62 #include <sys/note.h> 63 #include <sys/multidata.h> 64 65 #include <sys/ib/mgt/ibmf/ibmf.h> /* for ibd_get_portspeed */ 66 67 /* 68 * Per-interface tunables 69 * 70 * ibd_tx_copy_thresh 71 * This sets the threshold at which ibd will attempt to do a bcopy of the 72 * outgoing data into a pre-mapped buffer. The IPoIB driver's send behavior 73 * is restricted by various parameters, so setting of this value must be 74 * made after careful considerations only. For instance, IB HCAs currently 75 * impose a relatively small limit (when compared to ethernet NICs) on the 76 * length of the SGL for transmit. On the other hand, the ip stack could 77 * send down mp chains that are quite long when LSO is enabled. 78 * 79 * ibd_num_swqe 80 * Number of "send WQE" elements that will be allocated and used by ibd. 81 * When tuning this parameter, the size of pre-allocated, pre-mapped copy 82 * buffer in each of these send wqes must be taken into account. This 83 * copy buffer size is determined by the value of IBD_TX_BUF_SZ (this is 84 * currently set to the same value of ibd_tx_copy_thresh, but may be 85 * changed independently if needed). 86 * 87 * ibd_num_rwqe 88 * Number of "receive WQE" elements that will be allocated and used by 89 * ibd. This parameter is limited by the maximum channel size of the HCA. 
90 * Each buffer in the receive wqe will be of MTU size. 91 * 92 * ibd_num_lso_bufs 93 * Number of "larger-than-MTU" copy buffers to use for cases when the 94 * outgoing mblk chain is too fragmented to be used with ibt_map_mem_iov() 95 * and too large to be used with regular MTU-sized copy buffers. It is 96 * not recommended to tune this variable without understanding the 97 * application environment and/or memory resources. The size of each of 98 * these lso buffers is determined by the value of IBD_LSO_BUFSZ. 99 * 100 * ibd_num_ah 101 * Number of AH cache entries to allocate 102 * 103 * ibd_hash_size 104 * Hash table size for the active AH list 105 * 106 * ibd_separate_cqs 107 * ibd_txcomp_poll 108 * These boolean variables (1 or 0) may be used to tune the behavior of 109 * ibd in managing the send and receive completion queues and in deciding 110 * whether or not transmit completions should be polled or interrupt 111 * driven (when the completion queues are separate). If both the completion 112 * queues are interrupt driven, it may not be possible for the handlers to 113 * be invoked concurrently, depending on how the interrupts are tied on 114 * the PCI intr line. Note that some combination of these two parameters 115 * may not be meaningful (and therefore not allowed). 116 * 117 * ibd_tx_softintr 118 * ibd_rx_softintr 119 * The softintr mechanism allows ibd to avoid event queue overflows if 120 * the receive/completion handlers are to be expensive. These are enabled 121 * by default. 122 * 123 * ibd_log_sz 124 * This specifies the size of the ibd log buffer in bytes. The buffer is 125 * allocated and logging is enabled only when IBD_LOGGING is defined. 126 * 127 */ 128 uint_t ibd_tx_copy_thresh = 0x1000; 129 uint_t ibd_num_swqe = 4000; 130 uint_t ibd_num_rwqe = 4000; 131 uint_t ibd_num_lso_bufs = 0x400; 132 uint_t ibd_num_ah = 64; 133 uint_t ibd_hash_size = 32; 134 uint_t ibd_separate_cqs = 1; 135 uint_t ibd_txcomp_poll = 0; 136 uint_t ibd_rx_softintr = 1; 137 uint_t ibd_tx_softintr = 1; 138 #ifdef IBD_LOGGING 139 uint_t ibd_log_sz = 0x20000; 140 #endif 141 142 #define IBD_TX_COPY_THRESH ibd_tx_copy_thresh 143 #define IBD_TX_BUF_SZ ibd_tx_copy_thresh 144 #define IBD_NUM_SWQE ibd_num_swqe 145 #define IBD_NUM_RWQE ibd_num_rwqe 146 #define IBD_NUM_LSO_BUFS ibd_num_lso_bufs 147 #define IBD_NUM_AH ibd_num_ah 148 #define IBD_HASH_SIZE ibd_hash_size 149 #ifdef IBD_LOGGING 150 #define IBD_LOG_SZ ibd_log_sz 151 #endif 152 153 /* 154 * Receive CQ moderation parameters: NOT tunables 155 */ 156 static uint_t ibd_rxcomp_count = 4; 157 static uint_t ibd_rxcomp_usec = 10; 158 159 /* 160 * Send CQ moderation parameters: NOT tunables 161 */ 162 #define IBD_TXCOMP_COUNT 10 163 #define IBD_TXCOMP_USEC 300 164 165 /* 166 * Thresholds 167 * 168 * When waiting for resources (swqes or lso buffers) to become available, 169 * the first two thresholds below determine how long to wait before informing 170 * the network layer to start sending packets again. The IBD_TX_POLL_THRESH 171 * determines how low the available swqes should go before we start polling 172 * the completion queue. 173 */ 174 #define IBD_FREE_LSOS_THRESH 8 175 #define IBD_FREE_SWQES_THRESH 20 176 #define IBD_TX_POLL_THRESH 80 177 178 /* 179 * When doing multiple-send-wr or multiple-recv-wr posts, this value 180 * determines how many to do at a time (in a single ibt_post_send/recv). 
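 *
 * As an illustrative sketch only (assuming "chan" is the UD channel handle,
 * "wrs" is an array of prepared ibt_send_wr_t entries and "npending" is the
 * number queued; this is not the driver's actual post loop), batching under
 * this limit looks roughly like:
 *
 *	uint_t npost, posted;
 *
 *	npost = MIN(npending, IBD_MAX_POST_MULTIPLE);
 *	if (ibt_post_send(chan, wrs, npost, &posted) != IBT_SUCCESS) {
 *		requeue wrs[posted] onward and retry later
 *	}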
181 */ 182 #define IBD_MAX_POST_MULTIPLE 4 183 184 /* 185 * Maximum length for returning chained mps back to crossbow 186 */ 187 #define IBD_MAX_RX_MP_LEN 16 188 189 /* 190 * LSO parameters 191 */ 192 #define IBD_LSO_MAXLEN 65536 193 #define IBD_LSO_BUFSZ 8192 194 #define IBD_PROP_LSO_POLICY "lso-policy" 195 196 /* 197 * Completion queue polling control 198 */ 199 #define IBD_RX_CQ_POLLING 0x1 200 #define IBD_TX_CQ_POLLING 0x2 201 #define IBD_REDO_RX_CQ_POLLING 0x4 202 #define IBD_REDO_TX_CQ_POLLING 0x8 203 204 /* 205 * Flag bits for resources to reap 206 */ 207 #define IBD_RSRC_SWQE 0x1 208 #define IBD_RSRC_LSOBUF 0x2 209 210 /* 211 * Async operation types 212 */ 213 #define IBD_ASYNC_GETAH 1 214 #define IBD_ASYNC_JOIN 2 215 #define IBD_ASYNC_LEAVE 3 216 #define IBD_ASYNC_PROMON 4 217 #define IBD_ASYNC_PROMOFF 5 218 #define IBD_ASYNC_REAP 6 219 #define IBD_ASYNC_TRAP 7 220 #define IBD_ASYNC_SCHED 8 221 #define IBD_ASYNC_LINK 9 222 #define IBD_ASYNC_EXIT 10 223 224 /* 225 * Async operation states 226 */ 227 #define IBD_OP_NOTSTARTED 0 228 #define IBD_OP_ONGOING 1 229 #define IBD_OP_COMPLETED 2 230 #define IBD_OP_ERRORED 3 231 #define IBD_OP_ROUTERED 4 232 233 /* 234 * State of IBD driver initialization during attach/m_start 235 */ 236 #define IBD_DRV_STATE_INITIALIZED 0x00001 237 #define IBD_DRV_RXINTR_ADDED 0x00002 238 #define IBD_DRV_TXINTR_ADDED 0x00004 239 #define IBD_DRV_IBTL_ATTACH_DONE 0x00008 240 #define IBD_DRV_HCA_OPENED 0x00010 241 #define IBD_DRV_PD_ALLOCD 0x00020 242 #define IBD_DRV_MAC_REGISTERED 0x00040 243 #define IBD_DRV_PORT_DETAILS_OBTAINED 0x00080 244 #define IBD_DRV_BCAST_GROUP_FOUND 0x00100 245 #define IBD_DRV_ACACHE_INITIALIZED 0x00200 246 #define IBD_DRV_CQS_ALLOCD 0x00400 247 #define IBD_DRV_UD_CHANNEL_SETUP 0x00800 248 #define IBD_DRV_TXLIST_ALLOCD 0x01000 249 #define IBD_DRV_SCQ_NOTIFY_ENABLED 0x02000 250 #define IBD_DRV_RXLIST_ALLOCD 0x04000 251 #define IBD_DRV_BCAST_GROUP_JOINED 0x08000 252 #define IBD_DRV_ASYNC_THR_CREATED 0x10000 253 #define IBD_DRV_RCQ_NOTIFY_ENABLED 0x20000 254 #define IBD_DRV_SM_NOTICES_REGISTERED 0x40000 255 #define IBD_DRV_STARTED 0x80000 256 257 /* 258 * Miscellaneous constants 259 */ 260 #define IBD_SEND 0 261 #define IBD_RECV 1 262 #define IB_MGID_IPV4_LOWGRP_MASK 0xFFFFFFFF 263 #define IBD_DEF_MAX_SDU 2044 264 #ifdef IBD_LOGGING 265 #define IBD_DMAX_LINE 100 266 #endif 267 268 /* 269 * Enumerations for link states 270 */ 271 typedef enum { 272 IBD_LINK_DOWN, 273 IBD_LINK_UP, 274 IBD_LINK_UP_ABSENT 275 } ibd_link_op_t; 276 277 /* 278 * Driver State Pointer 279 */ 280 void *ibd_list; 281 282 /* 283 * Logging 284 */ 285 #ifdef IBD_LOGGING 286 kmutex_t ibd_lbuf_lock; 287 uint8_t *ibd_lbuf; 288 uint32_t ibd_lbuf_ndx; 289 #endif 290 291 /* 292 * Required system entry points 293 */ 294 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); 295 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); 296 297 /* 298 * Required driver entry points for GLDv3 299 */ 300 static int ibd_m_stat(void *, uint_t, uint64_t *); 301 static int ibd_m_start(void *); 302 static void ibd_m_stop(void *); 303 static int ibd_m_promisc(void *, boolean_t); 304 static int ibd_m_multicst(void *, boolean_t, const uint8_t *); 305 static int ibd_m_unicst(void *, const uint8_t *); 306 static mblk_t *ibd_m_tx(void *, mblk_t *); 307 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *); 308 309 /* 310 * Private driver entry points for GLDv3 311 */ 312 313 /* 314 * Initialization 315 */ 316 static int ibd_state_init(ibd_state_t *, dev_info_t *); 317 
static int ibd_init_txlist(ibd_state_t *); 318 static int ibd_init_rxlist(ibd_state_t *); 319 static int ibd_acache_init(ibd_state_t *); 320 #ifdef IBD_LOGGING 321 static void ibd_log_init(void); 322 #endif 323 324 /* 325 * Termination/cleanup 326 */ 327 static void ibd_state_fini(ibd_state_t *); 328 static void ibd_fini_txlist(ibd_state_t *); 329 static void ibd_fini_rxlist(ibd_state_t *); 330 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *); 331 static void ibd_acache_fini(ibd_state_t *); 332 #ifdef IBD_LOGGING 333 static void ibd_log_fini(void); 334 #endif 335 336 /* 337 * Allocation/acquire/map routines 338 */ 339 static int ibd_alloc_swqe(ibd_state_t *, ibd_swqe_t **, int, ibt_lkey_t); 340 static int ibd_alloc_rwqe(ibd_state_t *, ibd_rwqe_t **); 341 static int ibd_alloc_tx_copybufs(ibd_state_t *); 342 static int ibd_alloc_tx_lsobufs(ibd_state_t *); 343 static int ibd_acquire_swqe(ibd_state_t *, ibd_swqe_t **); 344 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *, 345 uint32_t *); 346 347 /* 348 * Free/release/unmap routines 349 */ 350 static void ibd_free_swqe(ibd_state_t *, ibd_swqe_t *); 351 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *); 352 static void ibd_delete_rwqe(ibd_state_t *, ibd_rwqe_t *); 353 static void ibd_free_tx_copybufs(ibd_state_t *); 354 static void ibd_free_tx_lsobufs(ibd_state_t *); 355 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *); 356 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t); 357 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *); 358 static void ibd_unmap_mem(ibd_state_t *, ibd_swqe_t *); 359 360 /* 361 * Handlers/callback routines 362 */ 363 static uint_t ibd_intr(char *); 364 static uint_t ibd_tx_recycle(char *); 365 static void ibd_rcq_handler(ibt_cq_hdl_t, void *); 366 static void ibd_scq_handler(ibt_cq_hdl_t, void *); 367 static void ibd_poll_compq(ibd_state_t *, ibt_cq_hdl_t); 368 static uint_t ibd_drain_cq(ibd_state_t *, ibt_cq_hdl_t, ibt_wc_t *, uint_t); 369 static void ibd_freemsg_cb(char *); 370 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t, 371 ibt_async_event_t *); 372 static void ibd_snet_notices_handler(void *, ib_gid_t, 373 ibt_subnet_event_code_t, ibt_subnet_event_t *); 374 375 /* 376 * Send/receive routines 377 */ 378 static boolean_t ibd_send(ibd_state_t *, mblk_t *); 379 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *); 380 static int ibd_post_recv(ibd_state_t *, ibd_rwqe_t *, boolean_t); 381 static void ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *); 382 static void ibd_flush_rx(ibd_state_t *, mblk_t *); 383 384 /* 385 * Threads 386 */ 387 static void ibd_async_work(ibd_state_t *); 388 389 /* 390 * Async tasks 391 */ 392 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *); 393 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int); 394 static void ibd_async_setprom(ibd_state_t *); 395 static void ibd_async_unsetprom(ibd_state_t *); 396 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t); 397 static void ibd_async_trap(ibd_state_t *, ibd_req_t *); 398 static void ibd_async_txsched(ibd_state_t *); 399 static void ibd_async_link(ibd_state_t *, ibd_req_t *); 400 401 /* 402 * Async task helpers 403 */ 404 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *); 405 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t); 406 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *); 407 static boolean_t ibd_get_allroutergroup(ibd_state_t *, 408 ipoib_mac_t 
*, ipoib_mac_t *); 409 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t); 410 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *); 411 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *); 412 static ibt_status_t ibd_find_bgroup(ibd_state_t *); 413 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *); 414 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t); 415 static uint64_t ibd_get_portspeed(ibd_state_t *); 416 static boolean_t ibd_async_safe(ibd_state_t *); 417 static void ibd_async_done(ibd_state_t *); 418 static ibd_ace_t *ibd_acache_find(ibd_state_t *, ipoib_mac_t *, boolean_t, int); 419 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int); 420 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *); 421 static boolean_t ibd_acache_recycle(ibd_state_t *, ipoib_mac_t *, boolean_t); 422 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t); 423 424 /* 425 * Helpers for attach/start routines 426 */ 427 static int ibd_register_mac(ibd_state_t *, dev_info_t *); 428 static int ibd_record_capab(ibd_state_t *, dev_info_t *); 429 static int ibd_unattach(ibd_state_t *, dev_info_t *); 430 static int ibd_get_port_details(ibd_state_t *); 431 static int ibd_alloc_cqs(ibd_state_t *); 432 static int ibd_setup_ud_channel(ibd_state_t *); 433 static int ibd_undo_m_start(ibd_state_t *); 434 435 436 /* 437 * Miscellaneous helpers 438 */ 439 static int ibd_sched_poll(ibd_state_t *, int, int); 440 static void ibd_queue_work_slot(ibd_state_t *, ibd_req_t *, int); 441 static int ibd_resume_transmission(ibd_state_t *); 442 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t); 443 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t); 444 static void *list_get_head(list_t *); 445 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t); 446 static uint_t ibd_hash_by_id(void *, mod_hash_key_t); 447 static void ibd_print_warn(ibd_state_t *, char *, ...); 448 #ifdef IBD_LOGGING 449 static void ibd_log(const char *, ...); 450 #endif 451 452 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach, 453 nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed); 454 455 /* Module Driver Info */ 456 static struct modldrv ibd_modldrv = { 457 &mod_driverops, /* This one is a driver */ 458 "InfiniBand GLDv3 Driver", /* short description */ 459 &ibd_dev_ops /* driver specific ops */ 460 }; 461 462 /* Module Linkage */ 463 static struct modlinkage ibd_modlinkage = { 464 MODREV_1, (void *)&ibd_modldrv, NULL 465 }; 466 467 /* 468 * Module (static) info passed to IBTL during ibt_attach 469 */ 470 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = { 471 IBTI_V_CURR, 472 IBT_NETWORK, 473 ibd_async_handler, 474 NULL, 475 "IPIB" 476 }; 477 478 /* 479 * GLDv3 entry points 480 */ 481 #define IBD_M_CALLBACK_FLAGS (MC_GETCAPAB) 482 static mac_callbacks_t ibd_m_callbacks = { 483 IBD_M_CALLBACK_FLAGS, 484 ibd_m_stat, 485 ibd_m_start, 486 ibd_m_stop, 487 ibd_m_promisc, 488 ibd_m_multicst, 489 ibd_m_unicst, 490 ibd_m_tx, 491 NULL, 492 ibd_m_getcapab 493 }; 494 495 /* 496 * Fill/clear <scope> and <p_key> in multicast/broadcast address 497 */ 498 #define IBD_FILL_SCOPE_PKEY(maddr, scope, pkey) \ 499 { \ 500 *(uint32_t *)((char *)(maddr) + 4) |= \ 501 htonl((uint32_t)(scope) << 16); \ 502 *(uint32_t *)((char *)(maddr) + 8) |= \ 503 htonl((uint32_t)(pkey) << 16); \ 504 } 505 506 #define IBD_CLEAR_SCOPE_PKEY(maddr) \ 507 { \ 508 *(uint32_t *)((char *)(maddr) + 4) &= \ 509 htonl(~((uint32_t)0xF << 
16)); \ 510 *(uint32_t *)((char *)(maddr) + 8) &= \ 511 htonl(~((uint32_t)0xFFFF << 16)); \ 512 } 513 514 /* 515 * Rudimentary debugging support 516 */ 517 #ifdef DEBUG 518 int ibd_debuglevel = 100; 519 static void 520 debug_print(int l, char *fmt, ...) 521 { 522 va_list ap; 523 524 if (l < ibd_debuglevel) 525 return; 526 va_start(ap, fmt); 527 vcmn_err(CE_CONT, fmt, ap); 528 va_end(ap); 529 } 530 #define DPRINT debug_print 531 #else 532 #define DPRINT 533 #endif 534 535 /* 536 * Common routine to print warning messages; adds in hca guid, port number 537 * and pkey to be able to identify the IBA interface. 538 */ 539 static void 540 ibd_print_warn(ibd_state_t *state, char *fmt, ...) 541 { 542 ib_guid_t hca_guid; 543 char ibd_print_buf[256]; 544 int len; 545 va_list ap; 546 547 hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip, 548 0, "hca-guid", 0); 549 len = snprintf(ibd_print_buf, sizeof (ibd_print_buf), 550 "%s%d: HCA GUID %016llx port %d PKEY %02x ", 551 ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip), 552 (u_longlong_t)hca_guid, state->id_port, state->id_pkey); 553 va_start(ap, fmt); 554 (void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len, 555 fmt, ap); 556 cmn_err(CE_NOTE, "!%s", ibd_print_buf); 557 va_end(ap); 558 } 559 560 /* 561 * Warlock directives 562 */ 563 564 /* 565 * id_lso_lock 566 * 567 * state->id_lso->bkt_nfree may be accessed without a lock to 568 * determine the threshold at which we have to ask the nw layer 569 * to resume transmission (see ibd_resume_transmission()). 570 */ 571 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock, 572 ibd_state_t::id_lso)) 573 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso)) 574 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree)) 575 576 /* 577 * id_cq_poll_lock 578 */ 579 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_cq_poll_lock, 580 ibd_state_t::id_cq_poll_busy)) 581 582 /* 583 * id_txpost_lock 584 */ 585 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 586 ibd_state_t::id_tx_head)) 587 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 588 ibd_state_t::id_tx_busy)) 589 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 590 ibd_state_t::id_tx_tailp)) 591 592 /* 593 * id_rxpost_lock 594 */ 595 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock, 596 ibd_state_t::id_rx_head)) 597 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock, 598 ibd_state_t::id_rx_busy)) 599 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock, 600 ibd_state_t::id_rx_tailp)) 601 602 /* 603 * id_acache_req_lock 604 */ 605 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 606 ibd_state_t::id_acache_req_cv)) 607 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 608 ibd_state_t::id_req_list)) 609 610 /* 611 * id_ac_mutex 612 * 613 * This mutex is actually supposed to protect id_ah_op as well, 614 * but this path of the code isn't clean (see update of id_ah_op 615 * in ibd_async_acache(), immediately after the call to 616 * ibd_async_mcache()). For now, we'll skip this check by 617 * declaring that id_ah_op is protected by some internal scheme 618 * that warlock isn't aware of. 
619 */ 620 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 621 ibd_state_t::id_ah_active)) 622 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 623 ibd_state_t::id_ah_free)) 624 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 625 ibd_state_t::id_ah_addr)) 626 _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this", 627 ibd_state_t::id_ah_op)) 628 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 629 ibd_state_t::id_ah_error)) 630 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error)) 631 632 /* 633 * id_mc_mutex 634 */ 635 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 636 ibd_state_t::id_mc_full)) 637 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 638 ibd_state_t::id_mc_non)) 639 640 /* 641 * id_trap_lock 642 */ 643 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 644 ibd_state_t::id_trap_cv)) 645 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 646 ibd_state_t::id_trap_stop)) 647 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 648 ibd_state_t::id_trap_inprog)) 649 650 /* 651 * id_prom_op 652 */ 653 _NOTE(SCHEME_PROTECTS_DATA("only by async thread", 654 ibd_state_t::id_prom_op)) 655 656 /* 657 * id_sched_lock 658 */ 659 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock, 660 ibd_state_t::id_sched_needed)) 661 662 /* 663 * id_link_mutex 664 */ 665 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex, 666 ibd_state_t::id_link_state)) 667 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state)) 668 _NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start", 669 ibd_state_t::id_link_speed)) 670 671 /* 672 * id_tx_list.dl_mutex 673 */ 674 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 675 ibd_state_t::id_tx_list.dl_head)) 676 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 677 ibd_state_t::id_tx_list.dl_tail)) 678 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 679 ibd_state_t::id_tx_list.dl_pending_sends)) 680 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 681 ibd_state_t::id_tx_list.dl_cnt)) 682 683 /* 684 * id_rx_list.dl_mutex 685 */ 686 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex, 687 ibd_state_t::id_rx_list.dl_head)) 688 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex, 689 ibd_state_t::id_rx_list.dl_tail)) 690 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 691 ibd_state_t::id_rx_list.dl_bufs_outstanding)) 692 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 693 ibd_state_t::id_rx_list.dl_cnt)) 694 695 696 /* 697 * Items protected by atomic updates 698 */ 699 _NOTE(SCHEME_PROTECTS_DATA("atomic update only", 700 ibd_state_s::id_brd_rcv 701 ibd_state_s::id_brd_xmt 702 ibd_state_s::id_multi_rcv 703 ibd_state_s::id_multi_xmt 704 ibd_state_s::id_num_intrs 705 ibd_state_s::id_rcv_bytes 706 ibd_state_s::id_rcv_pkt 707 ibd_state_s::id_tx_short 708 ibd_state_s::id_xmt_bytes 709 ibd_state_s::id_xmt_pkt)) 710 711 /* 712 * Non-mutex protection schemes for data elements. Almost all of 713 * these are non-shared items. 714 */ 715 _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded", 716 callb_cpr 717 ib_gid_s 718 ib_header_info 719 ibd_acache_rq 720 ibd_acache_s::ac_mce 721 ibd_mcache::mc_fullreap 722 ibd_mcache::mc_jstate 723 ibd_mcache::mc_req 724 ibd_rwqe_s 725 ibd_swqe_s 726 ibd_wqe_s 727 ibt_wr_ds_s::ds_va 728 ibt_wr_lso_s 729 ipoib_mac::ipoib_qpn 730 mac_capab_lso_s 731 msgb::b_next 732 msgb::b_rptr 733 msgb::b_wptr)) 734 735 int 736 _init() 737 { 738 int status; 739 740 /* 741 * Sanity check some parameter settings. 
Tx completion polling 742 * only makes sense with separate CQs for Tx and Rx. 743 */ 744 if ((ibd_txcomp_poll == 1) && (ibd_separate_cqs == 0)) { 745 cmn_err(CE_NOTE, "!ibd: %s", 746 "Setting ibd_txcomp_poll = 0 for combined CQ"); 747 ibd_txcomp_poll = 0; 748 } 749 750 status = ddi_soft_state_init(&ibd_list, sizeof (ibd_state_t), 0); 751 if (status != 0) { 752 DPRINT(10, "_init:failed in ddi_soft_state_init()"); 753 return (status); 754 } 755 756 mac_init_ops(&ibd_dev_ops, "ibd"); 757 status = mod_install(&ibd_modlinkage); 758 if (status != 0) { 759 DPRINT(10, "_init:failed in mod_install()"); 760 ddi_soft_state_fini(&ibd_list); 761 mac_fini_ops(&ibd_dev_ops); 762 return (status); 763 } 764 765 #ifdef IBD_LOGGING 766 ibd_log_init(); 767 #endif 768 return (0); 769 } 770 771 int 772 _info(struct modinfo *modinfop) 773 { 774 return (mod_info(&ibd_modlinkage, modinfop)); 775 } 776 777 int 778 _fini() 779 { 780 int status; 781 782 status = mod_remove(&ibd_modlinkage); 783 if (status != 0) 784 return (status); 785 786 mac_fini_ops(&ibd_dev_ops); 787 ddi_soft_state_fini(&ibd_list); 788 #ifdef IBD_LOGGING 789 ibd_log_fini(); 790 #endif 791 return (0); 792 } 793 794 /* 795 * Convert the GID part of the mac address from network byte order 796 * to host order. 797 */ 798 static void 799 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid) 800 { 801 ib_sn_prefix_t nbopref; 802 ib_guid_t nboguid; 803 804 bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t)); 805 bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t)); 806 dgid->gid_prefix = b2h64(nbopref); 807 dgid->gid_guid = b2h64(nboguid); 808 } 809 810 /* 811 * Create the IPoIB address in network byte order from host order inputs. 812 */ 813 static void 814 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix, 815 ib_guid_t guid) 816 { 817 ib_sn_prefix_t nbopref; 818 ib_guid_t nboguid; 819 820 mac->ipoib_qpn = htonl(qpn); 821 nbopref = h2b64(prefix); 822 nboguid = h2b64(guid); 823 bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t)); 824 bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t)); 825 } 826 827 /* 828 * Send to the appropriate all-routers group when the IBA multicast group 829 * does not exist, based on whether the target group is v4 or v6. 830 */ 831 static boolean_t 832 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac, 833 ipoib_mac_t *rmac) 834 { 835 boolean_t retval = B_TRUE; 836 uint32_t adjscope = state->id_scope << 16; 837 uint32_t topword; 838 839 /* 840 * Copy the first 4 bytes in without assuming any alignment of 841 * input mac address; this will have IPoIB signature, flags and 842 * scope bits. 843 */ 844 bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t)); 845 topword = ntohl(topword); 846 847 /* 848 * Generate proper address for IPv4/v6, adding in the Pkey properly. 849 */ 850 if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) || 851 (topword == (IB_MCGID_IPV6_PREFIX | adjscope))) 852 ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) | 853 ((uint32_t)(state->id_pkey << 16))), 854 (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP)); 855 else 856 /* 857 * Does not have proper bits in the mgid address. 858 */ 859 retval = B_FALSE; 860 861 return (retval); 862 } 863 864 /* 865 * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at 866 * front of optional src/tgt link layer address. Right now Solaris inserts 867 * padding by default at the end. The routine which is doing is nce_xmit() 868 * in ip_ndp.c. It copies the nd_lla_addr after the nd_opt_hdr_t. 
 * So when the packet comes down from the IP layer to the IBD driver, it is
 * in the following format: [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T]
 * The OPT_ND_HDR_T itself is 2 bytes, followed by [22 bytes of ipoib_machdr];
 * as a result the machdr is not 4 byte aligned and has 2 bytes of padding
 * at the end.
 *
 * The send routine in the IBD driver changes this packet as follows:
 * [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding]
 * followed by [22 bytes of ipoib_machdr], resulting in the machdr being
 * 4 byte aligned.
 *
 * On the receive side, ibd_process_rx() takes the above packet, removes
 * the two bytes of front padding and re-inserts them at the end, since the
 * IP layer does not understand padding at the front.
 */
#define	IBD_PAD_NSNA(ip6h, len, type) {				\
	uchar_t		*nd_lla_ptr;				\
	icmp6_t		*icmp6;					\
	nd_opt_hdr_t	*opt;					\
	int		i;					\
								\
	icmp6 = (icmp6_t *)&ip6h[1];				\
	len -= sizeof (nd_neighbor_advert_t);			\
	if (((icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT) ||	\
	    (icmp6->icmp6_type == ND_NEIGHBOR_ADVERT)) &&	\
	    (len != 0)) {					\
		opt = (nd_opt_hdr_t *)((uint8_t *)ip6h		\
		    + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t)); \
		ASSERT(opt != NULL);				\
		nd_lla_ptr = (uchar_t *)&opt[1];		\
		if (type == IBD_SEND) {				\
			for (i = IPOIB_ADDRL; i > 0; i--)	\
				*(nd_lla_ptr + i + 1) =		\
				    *(nd_lla_ptr + i - 1);	\
		} else {					\
			for (i = 0; i < IPOIB_ADDRL; i++)	\
				*(nd_lla_ptr + i) =		\
				    *(nd_lla_ptr + i + 2);	\
		}						\
		*(nd_lla_ptr + i) = 0;				\
		*(nd_lla_ptr + i + 1) = 0;			\
	}							\
}

/*
 * Address handle entries maintained by the driver are kept in the
 * free and active lists. Each entry starts out in the free list;
 * it migrates to the active list when primed using ibt_get_paths()
 * and ibt_modify_ud_dest() for transmission to a specific destination.
 * In the active list, the entry has a reference count indicating the
 * number of ongoing/uncompleted transmits that reference it. The
 * entry is left in the active list even after the reference count
 * goes to 0, since successive transmits can find it there and do
 * not need to set up another entry (i.e., the path information is
 * cached using the active list). Entries on the active list are
 * also hashed using the destination link address as a key for faster
 * lookups during transmits.
 *
 * For any destination address (unicast or multicast, whatever the
 * join states), there will be at most one entry in the active list.
 * Entries with a 0 reference count on the active list can be reused
 * for a transmit to a new destination, if the free list is empty.
 *
 * The AH free list insertion/deletion is protected with the id_ac_mutex,
 * since the async thread and Tx callback handlers insert/delete. The
 * active list does not need a lock (all operations are done by the
 * async thread) but updates to the reference count are atomically
 * done (increments done by Tx path, decrements by the Tx callback handler).
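 *
 * As a rough usage sketch (not a verbatim copy of the Tx path below), a
 * transmit needing an AH for destination "mac" does approximately:
 *
 *	mutex_enter(&state->id_ac_mutex);
 *	ace = ibd_acache_find(state, mac, B_TRUE, 1);	(takes a reference)
 *	mutex_exit(&state->id_ac_mutex);
 *	if (ace == NULL)
 *		queue an IBD_ASYNC_GETAH request and retry the send later;
 *
 * the Tx completion handler later drops that reference atomically (see
 * DEC_REF_DO_CYCLE below), which may make the entry reusable or reapable.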
936 */ 937 #define IBD_ACACHE_INSERT_FREE(state, ce) \ 938 list_insert_head(&state->id_ah_free, ce) 939 #define IBD_ACACHE_GET_FREE(state) \ 940 list_get_head(&state->id_ah_free) 941 #define IBD_ACACHE_INSERT_ACTIVE(state, ce) { \ 942 int _ret_; \ 943 list_insert_head(&state->id_ah_active, ce); \ 944 _ret_ = mod_hash_insert(state->id_ah_active_hash, \ 945 (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce); \ 946 ASSERT(_ret_ == 0); \ 947 } 948 #define IBD_ACACHE_PULLOUT_ACTIVE(state, ce) { \ 949 list_remove(&state->id_ah_active, ce); \ 950 (void) mod_hash_remove(state->id_ah_active_hash, \ 951 (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce); \ 952 } 953 #define IBD_ACACHE_GET_ACTIVE(state) \ 954 list_get_head(&state->id_ah_active) 955 956 /* 957 * Membership states for different mcg's are tracked by two lists: 958 * the "non" list is used for promiscuous mode, when all mcg traffic 959 * needs to be inspected. This type of membership is never used for 960 * transmission, so there can not be an AH in the active list 961 * corresponding to a member in this list. This list does not need 962 * any protection, since all operations are performed by the async 963 * thread. 964 * 965 * "Full" and "SendOnly" membership is tracked using a single list, 966 * the "full" list. This is because this single list can then be 967 * searched during transmit to a multicast group (if an AH for the 968 * mcg is not found in the active list), since at least one type 969 * of membership must be present before initiating the transmit. 970 * This list is also emptied during driver detach, since sendonly 971 * membership acquired during transmit is dropped at detach time 972 * alongwith ipv4 broadcast full membership. Insert/deletes to 973 * this list are done only by the async thread, but it is also 974 * searched in program context (see multicast disable case), thus 975 * the id_mc_mutex protects the list. The driver detach path also 976 * deconstructs the "full" list, but it ensures that the async 977 * thread will not be accessing the list (by blocking out mcg 978 * trap handling and making sure no more Tx reaping will happen). 979 * 980 * Currently, an IBA attach is done in the SendOnly case too, 981 * although this is not required. 982 */ 983 #define IBD_MCACHE_INSERT_FULL(state, mce) \ 984 list_insert_head(&state->id_mc_full, mce) 985 #define IBD_MCACHE_INSERT_NON(state, mce) \ 986 list_insert_head(&state->id_mc_non, mce) 987 #define IBD_MCACHE_FIND_FULL(state, mgid) \ 988 ibd_mcache_find(mgid, &state->id_mc_full) 989 #define IBD_MCACHE_FIND_NON(state, mgid) \ 990 ibd_mcache_find(mgid, &state->id_mc_non) 991 #define IBD_MCACHE_PULLOUT_FULL(state, mce) \ 992 list_remove(&state->id_mc_full, mce) 993 #define IBD_MCACHE_PULLOUT_NON(state, mce) \ 994 list_remove(&state->id_mc_non, mce) 995 996 /* 997 * AH and MCE active list manipulation: 998 * 999 * Multicast disable requests and MCG delete traps are two cases 1000 * where the active AH entry for the mcg (if any unreferenced one exists) 1001 * will be moved to the free list (to force the next Tx to the mcg to 1002 * join the MCG in SendOnly mode). Port up handling will also move AHs 1003 * from active to free list. 1004 * 1005 * In the case when some transmits are still pending on an entry 1006 * for an mcg, but a multicast disable has already been issued on the 1007 * mcg, there are some options to consider to preserve the join state 1008 * to ensure the emitted packet is properly routed on the IBA fabric. 1009 * For the AH, we can 1010 * 1. 
take out of active list at multicast disable time. 1011 * 2. take out of active list only when last pending Tx completes. 1012 * For the MCE, we can 1013 * 3. take out of active list at multicast disable time. 1014 * 4. take out of active list only when last pending Tx completes. 1015 * 5. move from active list to stale list at multicast disable time. 1016 * We choose to use 2,4. We use option 4 so that if a multicast enable 1017 * is tried before the pending Tx completes, the enable code finds the 1018 * mce in the active list and just has to make sure it will not be reaped 1019 * (ie the mcg leave done) when the pending Tx does complete. Alternatively, 1020 * a stale list (#5) that would be checked in the enable code would need 1021 * to be implemented. Option 2 is used, because otherwise, a Tx attempt 1022 * after the multicast disable would try to put an AH in the active list, 1023 * and associate the mce it finds in the active list to this new AH, 1024 * whereas the mce is already associated with the previous AH (taken off 1025 * the active list), and will be removed once the pending Tx's complete 1026 * (unless a reference count on mce's is implemented). One implication of 1027 * using 2,4 is that new Tx's posted before the pending Tx's complete will 1028 * grab new references on the AH, further delaying the leave. 1029 * 1030 * In the case of mcg delete (or create) trap when the port is sendonly 1031 * joined, the AH and MCE handling is different: the AH and MCE has to be 1032 * immediately taken off the active lists (forcing a join and path lookup 1033 * at the next Tx is the only guaranteed means of ensuring a proper Tx 1034 * to an mcg as it is repeatedly created and deleted and goes thru 1035 * reincarnations). 1036 * 1037 * When a port is already sendonly joined, and a multicast enable is 1038 * attempted, the same mce structure is promoted; this ensures only a 1039 * single mce on the active list tracks the most powerful join state. 1040 * 1041 * In the case of port up event handling, the MCE for sendonly membership 1042 * is freed up, and the ACE is put into the free list as soon as possible 1043 * (depending on whether posted Tx's have completed). For fullmembership 1044 * MCE's though, the ACE is similarly handled; but the MCE is kept around 1045 * (a re-JOIN is attempted) only if the DLPI leave has not already been 1046 * done; else the mce is deconstructed (mc_fullreap case). 1047 * 1048 * MCG creation and deletion trap handling: 1049 * 1050 * These traps are unreliable (meaning sometimes the trap might never 1051 * be delivered to the subscribed nodes) and may arrive out-of-order 1052 * since they use UD transport. An alternative to relying on these 1053 * unreliable traps is to poll for mcg presence every so often, but 1054 * instead of doing that, we try to be as conservative as possible 1055 * while handling the traps, and hope that the traps do arrive at 1056 * the subscribed nodes soon. Note that if a node is fullmember 1057 * joined to an mcg, it can not possibly receive a mcg create/delete 1058 * trap for that mcg (by fullmember definition); if it does, it is 1059 * an old trap from a previous incarnation of the mcg. 1060 * 1061 * Whenever a trap is received, the driver cleans up its sendonly 1062 * membership to the group; we choose to do a sendonly leave even 1063 * on a creation trap to handle the case of a prior deletion of the mcg 1064 * having gone unnoticed. Consider an example scenario: 1065 * T1: MCG M is deleted, and fires off deletion trap D1. 
1066 * T2: MCG M is recreated, fires off creation trap C1, which is lost. 1067 * T3: Node N tries to transmit to M, joining in sendonly mode. 1068 * T4: MCG M is deleted, and fires off deletion trap D2. 1069 * T5: N receives a deletion trap, but can not distinguish D1 from D2. 1070 * If the trap is D2, then a LEAVE is not required, since the mcg 1071 * is already deleted; but if it is D1, a LEAVE is required. A safe 1072 * approach is to always LEAVE, but the SM may be confused if it 1073 * receives a LEAVE without a prior JOIN. 1074 * 1075 * Management of the non-membership to an mcg is similar to the above, 1076 * except that if the interface is in promiscuous mode, it is required 1077 * to attempt to re-join the mcg after receiving a trap. Unfortunately, 1078 * if the re-join attempt fails (in which case a warning message needs 1079 * to be printed), it is not clear whether it failed due to the mcg not 1080 * existing, or some fabric/hca issues, due to the delayed nature of 1081 * trap delivery. Querying the SA to establish presence/absence of the 1082 * mcg is also racy at best. Thus, the driver just prints a warning 1083 * message when it can not rejoin after receiving a create trap, although 1084 * this might be (on rare occassions) a mis-warning if the create trap is 1085 * received after the mcg was deleted. 1086 */ 1087 1088 /* 1089 * Implementation of atomic "recycle" bits and reference count 1090 * on address handles. This utilizes the fact that max reference 1091 * count on any handle is limited by number of send wqes, thus 1092 * high bits in the ac_ref field can be used as the recycle bits, 1093 * and only the low bits hold the number of pending Tx requests. 1094 * This atomic AH reference counting allows the Tx completion 1095 * handler not to acquire the id_ac_mutex to process every completion, 1096 * thus reducing lock contention problems between completion and 1097 * the Tx path. 1098 */ 1099 #define CYCLEVAL 0x80000 1100 #define CLEAR_REFCYCLE(ace) (ace)->ac_ref = 0 1101 #define CYCLE_SET(ace) (((ace)->ac_ref & CYCLEVAL) == CYCLEVAL) 1102 #define GET_REF(ace) ((ace)->ac_ref) 1103 #define GET_REF_CYCLE(ace) ( \ 1104 /* \ 1105 * Make sure "cycle" bit is set. \ 1106 */ \ 1107 ASSERT(CYCLE_SET(ace)), \ 1108 ((ace)->ac_ref & ~(CYCLEVAL)) \ 1109 ) 1110 #define INC_REF(ace, num) { \ 1111 atomic_add_32(&(ace)->ac_ref, num); \ 1112 } 1113 #define SET_CYCLE_IF_REF(ace) ( \ 1114 CYCLE_SET(ace) ? B_TRUE : \ 1115 atomic_add_32_nv(&ace->ac_ref, CYCLEVAL) == \ 1116 CYCLEVAL ? \ 1117 /* \ 1118 * Clear the "cycle" bit we just set; \ 1119 * ref count known to be 0 from above. \ 1120 */ \ 1121 CLEAR_REFCYCLE(ace), B_FALSE : \ 1122 /* \ 1123 * We set "cycle" bit; let caller know. \ 1124 */ \ 1125 B_TRUE \ 1126 ) 1127 #define DEC_REF_DO_CYCLE(ace) ( \ 1128 atomic_add_32_nv(&ace->ac_ref, -1) == \ 1129 CYCLEVAL ? \ 1130 /* \ 1131 * Ref count known to be 0 from above. \ 1132 */ \ 1133 B_TRUE : \ 1134 B_FALSE \ 1135 ) 1136 1137 static void * 1138 list_get_head(list_t *list) 1139 { 1140 list_node_t *lhead = list_head(list); 1141 1142 if (lhead != NULL) 1143 list_remove(list, lhead); 1144 return (lhead); 1145 } 1146 1147 /* 1148 * This is always guaranteed to be able to queue the work. 1149 */ 1150 static void 1151 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op) 1152 { 1153 /* Initialize request */ 1154 DPRINT(1, "ibd_queue_work_slot : op: %d \n", op); 1155 ptr->rq_op = op; 1156 1157 /* 1158 * Queue provided slot onto request pool. 
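 *
 * A typical caller (sketch modeled on ibd_acache_lookup() below) allocates
 * the request from the id_req_kmc cache and hands it off:
 *
 *	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
 *	if (req != NULL) {
 *		bcopy(mac, &req->rq_mac, IPOIB_ADDRL);
 *		ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);
 *	}
 *
 * The async thread frees the request after servicing it, except for ops
 * (such as IBD_ASYNC_REAP) where the request is embedded in another
 * structure and is not freed by the async thread.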
 */
	mutex_enter(&state->id_acache_req_lock);
	list_insert_tail(&state->id_req_list, ptr);

	/* Go, fetch, async thread */
	cv_signal(&state->id_acache_req_cv);
	mutex_exit(&state->id_acache_req_lock);
}

/*
 * Main body of the per interface async thread.
 */
static void
ibd_async_work(ibd_state_t *state)
{
	ibd_req_t *ptr;
	callb_cpr_t cprinfo;

	mutex_enter(&state->id_acache_req_lock);
	CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock,
	    callb_generic_cpr, "ibd_async_work");

	for (;;) {
		ptr = list_get_head(&state->id_req_list);
		if (ptr != NULL) {
			mutex_exit(&state->id_acache_req_lock);

			/*
			 * Once we have done the operation, there is no
			 * guarantee the request slot is going to be valid,
			 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP,
			 * TRAP).
			 *
			 * Perform the request.
			 */
			switch (ptr->rq_op) {
			case IBD_ASYNC_GETAH:
				ibd_async_acache(state, &ptr->rq_mac);
				break;
			case IBD_ASYNC_JOIN:
			case IBD_ASYNC_LEAVE:
				ibd_async_multicast(state,
				    ptr->rq_gid, ptr->rq_op);
				break;
			case IBD_ASYNC_PROMON:
				ibd_async_setprom(state);
				break;
			case IBD_ASYNC_PROMOFF:
				ibd_async_unsetprom(state);
				break;
			case IBD_ASYNC_REAP:
				ibd_async_reap_group(state,
				    ptr->rq_ptr, ptr->rq_gid,
				    IB_MC_JSTATE_FULL);
				/*
				 * The req is embedded in the mce
				 * structure, so it must not be
				 * freed here.
				 */
				ptr = NULL;
				break;
			case IBD_ASYNC_TRAP:
				ibd_async_trap(state, ptr);
				break;
			case IBD_ASYNC_SCHED:
				ibd_async_txsched(state);
				break;
			case IBD_ASYNC_LINK:
				ibd_async_link(state, ptr);
				break;
			case IBD_ASYNC_EXIT:
				mutex_enter(&state->id_acache_req_lock);
#ifndef __lock_lint
				CALLB_CPR_EXIT(&cprinfo);
#else
				mutex_exit(&state->id_acache_req_lock);
#endif
				return;
			}
			if (ptr != NULL)
				kmem_cache_free(state->id_req_kmc, ptr);

			mutex_enter(&state->id_acache_req_lock);
		} else {
#ifndef __lock_lint
			/*
			 * Nothing to do: wait till a new request arrives.
			 */
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&state->id_acache_req_cv,
			    &state->id_acache_req_lock);
			CALLB_CPR_SAFE_END(&cprinfo,
			    &state->id_acache_req_lock);
#endif
		}
	}

	/*NOTREACHED*/
	_NOTE(NOT_REACHED)
}

/*
 * Return when it is safe to queue requests to the async daemon; primarily
 * for subnet trap and async event handling. Disallow requests before the
 * daemon is created, and when interface deinitialization starts.
 */
static boolean_t
ibd_async_safe(ibd_state_t *state)
{
	mutex_enter(&state->id_trap_lock);
	if (state->id_trap_stop) {
		mutex_exit(&state->id_trap_lock);
		return (B_FALSE);
	}
	state->id_trap_inprog++;
	mutex_exit(&state->id_trap_lock);
	return (B_TRUE);
}

/*
 * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet
 * trap or event handling to complete to kill the async thread and deconstruct
 * the mcg/ace list.
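 *
 * The expected pairing is sketched below (the trap/event handlers
 * themselves are not part of this excerpt):
 *
 *	if (!ibd_async_safe(state))
 *		return;
 *	... allocate an ibd_req_t and ibd_queue_work_slot() it ...
 *	(the queued async task then calls ibd_async_done() once it has
 *	finished, allowing id_trap_inprog to drain)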
1282 */ 1283 static void 1284 ibd_async_done(ibd_state_t *state) 1285 { 1286 mutex_enter(&state->id_trap_lock); 1287 if (--state->id_trap_inprog == 0) 1288 cv_signal(&state->id_trap_cv); 1289 mutex_exit(&state->id_trap_lock); 1290 } 1291 1292 /* 1293 * Hash functions: 1294 * ibd_hash_by_id: Returns the qpn as the hash entry into bucket. 1295 * ibd_hash_key_cmp: Compares two keys, return 0 on success or else 1. 1296 * These operate on mac addresses input into ibd_send, but there is no 1297 * guarantee on the alignment of the ipoib_mac_t structure. 1298 */ 1299 /*ARGSUSED*/ 1300 static uint_t 1301 ibd_hash_by_id(void *hash_data, mod_hash_key_t key) 1302 { 1303 ulong_t ptraddr = (ulong_t)key; 1304 uint_t hval; 1305 1306 /* 1307 * If the input address is 4 byte aligned, we can just dereference 1308 * it. This is most common, since IP will send in a 4 byte aligned 1309 * IP header, which implies the 24 byte IPoIB psuedo header will be 1310 * 4 byte aligned too. 1311 */ 1312 if ((ptraddr & 3) == 0) 1313 return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn); 1314 1315 bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t)); 1316 return (hval); 1317 } 1318 1319 static int 1320 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2) 1321 { 1322 if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0) 1323 return (0); 1324 else 1325 return (1); 1326 } 1327 1328 /* 1329 * Initialize all the per interface caches and lists; AH cache, 1330 * MCG list etc. 1331 */ 1332 static int 1333 ibd_acache_init(ibd_state_t *state) 1334 { 1335 ibd_ace_t *ce; 1336 int i; 1337 1338 mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL); 1339 cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL); 1340 1341 mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL); 1342 mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL); 1343 list_create(&state->id_ah_free, sizeof (ibd_ace_t), 1344 offsetof(ibd_ace_t, ac_list)); 1345 list_create(&state->id_ah_active, sizeof (ibd_ace_t), 1346 offsetof(ibd_ace_t, ac_list)); 1347 state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash", 1348 IBD_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor, 1349 ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP); 1350 list_create(&state->id_mc_full, sizeof (ibd_mce_t), 1351 offsetof(ibd_mce_t, mc_list)); 1352 list_create(&state->id_mc_non, sizeof (ibd_mce_t), 1353 offsetof(ibd_mce_t, mc_list)); 1354 list_create(&state->id_req_list, sizeof (ibd_req_t), 1355 offsetof(ibd_req_t, rq_list)); 1356 1357 state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) * 1358 IBD_NUM_AH, KM_SLEEP); 1359 for (i = 0; i < IBD_NUM_AH; i++, ce++) { 1360 if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS, 1361 state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) { 1362 ibd_acache_fini(state); 1363 return (DDI_FAILURE); 1364 } else { 1365 CLEAR_REFCYCLE(ce); 1366 ce->ac_mce = NULL; 1367 IBD_ACACHE_INSERT_FREE(state, ce); 1368 } 1369 } 1370 return (DDI_SUCCESS); 1371 } 1372 1373 static void 1374 ibd_acache_fini(ibd_state_t *state) 1375 { 1376 ibd_ace_t *ptr; 1377 1378 mutex_enter(&state->id_ac_mutex); 1379 1380 while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) { 1381 ASSERT(GET_REF(ptr) == 0); 1382 (void) ibt_free_ud_dest(ptr->ac_dest); 1383 } 1384 1385 while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) { 1386 ASSERT(GET_REF(ptr) == 0); 1387 (void) ibt_free_ud_dest(ptr->ac_dest); 1388 } 1389 1390 list_destroy(&state->id_ah_free); 1391 list_destroy(&state->id_ah_active); 1392 list_destroy(&state->id_mc_full); 1393 
list_destroy(&state->id_mc_non); 1394 list_destroy(&state->id_req_list); 1395 kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * IBD_NUM_AH); 1396 mutex_exit(&state->id_ac_mutex); 1397 mutex_destroy(&state->id_ac_mutex); 1398 mutex_destroy(&state->id_mc_mutex); 1399 mutex_destroy(&state->id_acache_req_lock); 1400 cv_destroy(&state->id_acache_req_cv); 1401 } 1402 1403 /* 1404 * Search AH active hash list for a cached path to input destination. 1405 * If we are "just looking", hold == F. When we are in the Tx path, 1406 * we set hold == T to grab a reference on the AH so that it can not 1407 * be recycled to a new destination while the Tx request is posted. 1408 */ 1409 static ibd_ace_t * 1410 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num) 1411 { 1412 ibd_ace_t *ptr; 1413 1414 ASSERT(mutex_owned(&state->id_ac_mutex)); 1415 1416 /* 1417 * Do hash search. 1418 */ 1419 if (mod_hash_find(state->id_ah_active_hash, 1420 (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) { 1421 if (hold) 1422 INC_REF(ptr, num); 1423 return (ptr); 1424 } 1425 return (NULL); 1426 } 1427 1428 /* 1429 * This is called by the tx side; if an initialized AH is found in 1430 * the active list, it is locked down and can be used; if no entry 1431 * is found, an async request is queued to do path resolution. 1432 */ 1433 static ibd_ace_t * 1434 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe) 1435 { 1436 ibd_ace_t *ptr; 1437 ibd_req_t *req; 1438 1439 /* 1440 * Only attempt to print when we can; in the mdt pattr case, the 1441 * address is not aligned properly. 1442 */ 1443 if (((ulong_t)mac & 3) == 0) { 1444 DPRINT(4, 1445 "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X", 1446 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1447 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1448 htonl(mac->ipoib_gidsuff[1])); 1449 } 1450 1451 mutex_enter(&state->id_ac_mutex); 1452 1453 if ((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL) { 1454 mutex_exit(&state->id_ac_mutex); 1455 return (ptr); 1456 } 1457 1458 /* 1459 * Implementation of a single outstanding async request; if 1460 * the operation is not started yet, queue a request and move 1461 * to ongoing state. Remember in id_ah_addr for which address 1462 * we are queueing the request, in case we need to flag an error; 1463 * Any further requests, for the same or different address, until 1464 * the operation completes, is sent back to GLDv3 to be retried. 1465 * The async thread will update id_ah_op with an error indication 1466 * or will set it to indicate the next look up can start; either 1467 * way, it will mac_tx_update() so that all blocked requests come 1468 * back here. 1469 */ 1470 *err = EAGAIN; 1471 if (state->id_ah_op == IBD_OP_NOTSTARTED) { 1472 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 1473 if (req != NULL) { 1474 /* 1475 * We did not even find the entry; queue a request 1476 * for it. 1477 */ 1478 bcopy(mac, &(req->rq_mac), IPOIB_ADDRL); 1479 ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH); 1480 state->id_ah_op = IBD_OP_ONGOING; 1481 bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL); 1482 } 1483 } else if ((state->id_ah_op != IBD_OP_ONGOING) && 1484 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) { 1485 /* 1486 * Check the status of the pathrecord lookup request 1487 * we had queued before. 
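 *
 * To summarize the id_ah_op state machine: it moves from
 * IBD_OP_NOTSTARTED to IBD_OP_ONGOING when a request is queued
 * above; ibd_async_acache() then sets it to IBD_OP_ERRORED
 * (path/AH setup failed), IBD_OP_ROUTERED (redirected to the
 * all-router group), or back to IBD_OP_NOTSTARTED (AH installed);
 * the checks below consume an ERRORED/ROUTERED result and reset
 * the state to IBD_OP_NOTSTARTED.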
1488 */ 1489 if (state->id_ah_op == IBD_OP_ERRORED) { 1490 *err = EFAULT; 1491 state->id_ah_error++; 1492 } else { 1493 /* 1494 * IBD_OP_ROUTERED case: We need to send to the 1495 * all-router MCG. If we can find the AH for 1496 * the mcg, the Tx will be attempted. If we 1497 * do not find the AH, we return NORESOURCES 1498 * to retry. 1499 */ 1500 ipoib_mac_t routermac; 1501 1502 (void) ibd_get_allroutergroup(state, mac, &routermac); 1503 ptr = ibd_acache_find(state, &routermac, B_TRUE, 1504 numwqe); 1505 } 1506 state->id_ah_op = IBD_OP_NOTSTARTED; 1507 } else if ((state->id_ah_op != IBD_OP_ONGOING) && 1508 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) { 1509 /* 1510 * This case can happen when we get a higher band 1511 * packet. The easiest way is to reset the state machine 1512 * to accommodate the higher priority packet. 1513 */ 1514 state->id_ah_op = IBD_OP_NOTSTARTED; 1515 } 1516 mutex_exit(&state->id_ac_mutex); 1517 1518 return (ptr); 1519 } 1520 1521 /* 1522 * Grab a not-currently-in-use AH/PathRecord from the active 1523 * list to recycle to a new destination. Only the async thread 1524 * executes this code. 1525 */ 1526 static ibd_ace_t * 1527 ibd_acache_get_unref(ibd_state_t *state) 1528 { 1529 ibd_ace_t *ptr = list_head(&state->id_ah_active); 1530 1531 ASSERT(mutex_owned(&state->id_ac_mutex)); 1532 1533 /* 1534 * Do plain linear search. 1535 */ 1536 while (ptr != NULL) { 1537 /* 1538 * Note that it is possible that the "cycle" bit 1539 * is set on the AH w/o any reference count. The 1540 * mcg must have been deleted, and the tx cleanup 1541 * just decremented the reference count to 0, but 1542 * hasn't gotten around to grabbing the id_ac_mutex 1543 * to move the AH into the free list. 1544 */ 1545 if (GET_REF(ptr) == 0) { 1546 IBD_ACACHE_PULLOUT_ACTIVE(state, ptr); 1547 break; 1548 } 1549 ptr = list_next(&state->id_ah_active, ptr); 1550 } 1551 return (ptr); 1552 } 1553 1554 /* 1555 * Invoked to clean up AH from active list in case of multicast 1556 * disable and to handle sendonly memberships during mcg traps. 1557 * And for port up processing for multicast and unicast AHs. 1558 * Normally, the AH is taken off the active list, and put into 1559 * the free list to be recycled for a new destination. In case 1560 * Tx requests on the AH have not completed yet, the AH is marked 1561 * for reaping (which will put the AH on the free list) once the Tx's 1562 * complete; in this case, depending on the "force" input, we take 1563 * out the AH from the active list right now, or leave it also for 1564 * the reap operation. Returns TRUE if the AH is taken off the active 1565 * list (and either put into the free list right now, or arranged for 1566 * later), FALSE otherwise. 1567 */ 1568 static boolean_t 1569 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force) 1570 { 1571 ibd_ace_t *acactive; 1572 boolean_t ret = B_TRUE; 1573 1574 ASSERT(mutex_owned(&state->id_ac_mutex)); 1575 1576 if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) { 1577 1578 /* 1579 * Note that the AH might already have the cycle bit set 1580 * on it; this might happen if sequences of multicast 1581 * enables and disables are coming so fast, that posted 1582 * Tx's to the mcg have not completed yet, and the cycle 1583 * bit is set successively by each multicast disable. 
1584 */ 1585 if (SET_CYCLE_IF_REF(acactive)) { 1586 if (!force) { 1587 /* 1588 * The ace is kept on the active list, further 1589 * Tx's can still grab a reference on it; the 1590 * ace is reaped when all pending Tx's 1591 * referencing the AH complete. 1592 */ 1593 ret = B_FALSE; 1594 } else { 1595 /* 1596 * In the mcg trap case, we always pull the 1597 * AH from the active list. And also the port 1598 * up multi/unicast case. 1599 */ 1600 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1601 acactive->ac_mce = NULL; 1602 } 1603 } else { 1604 /* 1605 * Determined the ref count is 0, thus reclaim 1606 * immediately after pulling out the ace from 1607 * the active list. 1608 */ 1609 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1610 acactive->ac_mce = NULL; 1611 IBD_ACACHE_INSERT_FREE(state, acactive); 1612 } 1613 1614 } 1615 return (ret); 1616 } 1617 1618 /* 1619 * Helper function for async path record lookup. If we are trying to 1620 * Tx to a MCG, check our membership, possibly trying to join the 1621 * group if required. If that fails, try to send the packet to the 1622 * all router group (indicated by the redirect output), pointing 1623 * the input mac address to the router mcg address. 1624 */ 1625 static ibd_mce_t * 1626 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect) 1627 { 1628 ib_gid_t mgid; 1629 ibd_mce_t *mce; 1630 ipoib_mac_t routermac; 1631 1632 *redirect = B_FALSE; 1633 ibd_n2h_gid(mac, &mgid); 1634 1635 /* 1636 * Check the FullMember+SendOnlyNonMember list. 1637 * Since we are the only one who manipulates the 1638 * id_mc_full list, no locks are needed. 1639 */ 1640 mce = IBD_MCACHE_FIND_FULL(state, mgid); 1641 if (mce != NULL) { 1642 DPRINT(4, "ibd_async_mcache : already joined to group"); 1643 return (mce); 1644 } 1645 1646 /* 1647 * Not found; try to join(SendOnlyNonMember) and attach. 1648 */ 1649 DPRINT(4, "ibd_async_mcache : not joined to group"); 1650 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1651 NULL) { 1652 DPRINT(4, "ibd_async_mcache : nonmem joined to group"); 1653 return (mce); 1654 } 1655 1656 /* 1657 * MCGroup not present; try to join the all-router group. If 1658 * any of the following steps succeed, we will be redirecting 1659 * to the all router group. 1660 */ 1661 DPRINT(4, "ibd_async_mcache : nonmem join failed"); 1662 if (!ibd_get_allroutergroup(state, mac, &routermac)) 1663 return (NULL); 1664 *redirect = B_TRUE; 1665 ibd_n2h_gid(&routermac, &mgid); 1666 bcopy(&routermac, mac, IPOIB_ADDRL); 1667 DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n", 1668 mgid.gid_prefix, mgid.gid_guid); 1669 1670 /* 1671 * Are we already joined to the router group? 1672 */ 1673 if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) { 1674 DPRINT(4, "ibd_async_mcache : using already joined router" 1675 "group\n"); 1676 return (mce); 1677 } 1678 1679 /* 1680 * Can we join(SendOnlyNonMember) the router group? 1681 */ 1682 DPRINT(4, "ibd_async_mcache : attempting join to router grp"); 1683 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1684 NULL) { 1685 DPRINT(4, "ibd_async_mcache : joined to router grp"); 1686 return (mce); 1687 } 1688 1689 return (NULL); 1690 } 1691 1692 /* 1693 * Async path record lookup code. 
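 *
 * In outline: if the destination is a multicast group, verify or establish
 * membership (possibly redirecting to the all-router group); pick up an ACE
 * from the free list or recycle an unreferenced active one; resolve the
 * path with ibt_get_paths(); program the UD destination with
 * ibt_modify_ud_dest(); and finally insert the ACE into the active list,
 * recording the outcome in id_ah_op.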
1694 */ 1695 static void 1696 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac) 1697 { 1698 ibd_ace_t *ce; 1699 ibd_mce_t *mce = NULL; 1700 ibt_path_attr_t path_attr; 1701 ibt_path_info_t path_info; 1702 ib_gid_t destgid; 1703 char ret = IBD_OP_NOTSTARTED; 1704 1705 DPRINT(4, "ibd_async_acache : %08X:%08X:%08X:%08X:%08X", 1706 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1707 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1708 htonl(mac->ipoib_gidsuff[1])); 1709 1710 /* 1711 * Check whether we are trying to transmit to a MCG. 1712 * In that case, we need to make sure we are a member of 1713 * the MCG. 1714 */ 1715 if (mac->ipoib_qpn == htonl(IB_MC_QPN)) { 1716 boolean_t redirected; 1717 1718 /* 1719 * If we can not find or join the group or even 1720 * redirect, error out. 1721 */ 1722 if ((mce = ibd_async_mcache(state, mac, &redirected)) == 1723 NULL) { 1724 state->id_ah_op = IBD_OP_ERRORED; 1725 return; 1726 } 1727 1728 /* 1729 * If we got redirected, we need to determine whether 1730 * the AH for the new mcg is in the cache already, and 1731 * not pull it in then; otherwise proceed to get the 1732 * path for the new mcg. There is no guarantee that 1733 * if the AH is currently in the cache, it will still be 1734 * there when we look in ibd_acache_lookup(), but that's 1735 * okay, we will come back here. 1736 */ 1737 if (redirected) { 1738 ret = IBD_OP_ROUTERED; 1739 DPRINT(4, "ibd_async_acache : redirected to " 1740 "%08X:%08X:%08X:%08X:%08X", 1741 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1742 htonl(mac->ipoib_gidpref[1]), 1743 htonl(mac->ipoib_gidsuff[0]), 1744 htonl(mac->ipoib_gidsuff[1])); 1745 1746 mutex_enter(&state->id_ac_mutex); 1747 if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) { 1748 state->id_ah_op = IBD_OP_ROUTERED; 1749 mutex_exit(&state->id_ac_mutex); 1750 DPRINT(4, "ibd_async_acache : router AH found"); 1751 return; 1752 } 1753 mutex_exit(&state->id_ac_mutex); 1754 } 1755 } 1756 1757 /* 1758 * Get an AH from the free list. 1759 */ 1760 mutex_enter(&state->id_ac_mutex); 1761 if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) { 1762 /* 1763 * No free ones; try to grab an unreferenced active 1764 * one. Maybe we need to make the active list LRU, 1765 * but that will create more work for Tx callbacks. 1766 * Is there a way of not having to pull out the 1767 * entry from the active list, but just indicate it 1768 * is being recycled? Yes, but that creates one more 1769 * check in the fast lookup path. 1770 */ 1771 if ((ce = ibd_acache_get_unref(state)) == NULL) { 1772 /* 1773 * Pretty serious shortage now. 1774 */ 1775 state->id_ah_op = IBD_OP_NOTSTARTED; 1776 mutex_exit(&state->id_ac_mutex); 1777 DPRINT(10, "ibd_async_acache : failed to find AH " 1778 "slot\n"); 1779 return; 1780 } 1781 /* 1782 * We could check whether ac_mce points to a SendOnly 1783 * member and drop that membership now. Or do it lazily 1784 * at detach time. 1785 */ 1786 ce->ac_mce = NULL; 1787 } 1788 mutex_exit(&state->id_ac_mutex); 1789 ASSERT(ce->ac_mce == NULL); 1790 1791 /* 1792 * Update the entry. 
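 * This amounts to copying in the destination mac, querying a unicast
 * path (single DGID, with the SL taken from the broadcast group info)
 * and pointing the preallocated UD destination at the resulting
 * address vector via ibt_modify_ud_dest().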
1793 */ 1794 bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL); 1795 1796 bzero(&path_info, sizeof (path_info)); 1797 bzero(&path_attr, sizeof (ibt_path_attr_t)); 1798 path_attr.pa_sgid = state->id_sgid; 1799 path_attr.pa_num_dgids = 1; 1800 ibd_n2h_gid(&ce->ac_mac, &destgid); 1801 path_attr.pa_dgids = &destgid; 1802 path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 1803 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 1804 &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) { 1805 DPRINT(10, "ibd_async_acache : failed in ibt_get_paths"); 1806 goto error; 1807 } 1808 if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey, 1809 ntohl(ce->ac_mac.ipoib_qpn), 1810 &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) { 1811 DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest"); 1812 goto error; 1813 } 1814 1815 /* 1816 * mce is set whenever an AH is being associated with a 1817 * MCG; this will come in handy when we leave the MCG. The 1818 * lock protects Tx fastpath from scanning the active list. 1819 */ 1820 if (mce != NULL) 1821 ce->ac_mce = mce; 1822 mutex_enter(&state->id_ac_mutex); 1823 IBD_ACACHE_INSERT_ACTIVE(state, ce); 1824 state->id_ah_op = ret; 1825 mutex_exit(&state->id_ac_mutex); 1826 return; 1827 error: 1828 /* 1829 * We might want to drop SendOnly membership here if we 1830 * joined above. The lock protects Tx callbacks inserting 1831 * into the free list. 1832 */ 1833 mutex_enter(&state->id_ac_mutex); 1834 state->id_ah_op = IBD_OP_ERRORED; 1835 IBD_ACACHE_INSERT_FREE(state, ce); 1836 mutex_exit(&state->id_ac_mutex); 1837 } 1838 1839 /* 1840 * While restoring port's presence on the subnet on a port up, it is possible 1841 * that the port goes down again. 1842 */ 1843 static void 1844 ibd_async_link(ibd_state_t *state, ibd_req_t *req) 1845 { 1846 ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr; 1847 link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN : 1848 LINK_STATE_UP; 1849 ibd_mce_t *mce, *pmce; 1850 ibd_ace_t *ace, *pace; 1851 1852 DPRINT(10, "ibd_async_link(): %d", opcode); 1853 1854 /* 1855 * On a link up, revalidate the link speed/width. No point doing 1856 * this on a link down, since we will be unable to do SA operations, 1857 * defaulting to the lowest speed. Also notice that we update our 1858 * notion of speed before calling mac_link_update(), which will do 1859 * the necessary higher-level notifications for speed changes. 1860 */ 1861 if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) { 1862 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) 1863 state->id_link_speed = ibd_get_portspeed(state); 1864 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 1865 } 1866 1867 /* 1868 * Do all the work required to establish our presence on 1869 * the subnet. 1870 */ 1871 if (opcode == IBD_LINK_UP_ABSENT) { 1872 /* 1873 * If in promiscuous mode ... 1874 */ 1875 if (state->id_prom_op == IBD_OP_COMPLETED) { 1876 /* 1877 * Drop all nonmembership. 1878 */ 1879 ibd_async_unsetprom(state); 1880 1881 /* 1882 * Then, try to regain nonmembership to all mcg's. 1883 */ 1884 ibd_async_setprom(state); 1885 1886 } 1887 1888 /* 1889 * Drop all sendonly membership (which also gets rid of the 1890 * AHs); try to reacquire all full membership.
1891 */ 1892 mce = list_head(&state->id_mc_full); 1893 while ((pmce = mce) != NULL) { 1894 mce = list_next(&state->id_mc_full, mce); 1895 if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON) 1896 ibd_leave_group(state, 1897 pmce->mc_info.mc_adds_vect.av_dgid, 1898 IB_MC_JSTATE_SEND_ONLY_NON); 1899 else 1900 ibd_reacquire_group(state, pmce); 1901 } 1902 1903 /* 1904 * Recycle all active AHs to free list (and if there are 1905 * pending posts, make sure they will go into the free list 1906 * once the Tx's complete). Grab the lock to prevent 1907 * concurrent Tx's as well as Tx cleanups. 1908 */ 1909 mutex_enter(&state->id_ac_mutex); 1910 ace = list_head(&state->id_ah_active); 1911 while ((pace = ace) != NULL) { 1912 boolean_t cycled; 1913 1914 ace = list_next(&state->id_ah_active, ace); 1915 mce = pace->ac_mce; 1916 cycled = ibd_acache_recycle(state, &pace->ac_mac, 1917 B_TRUE); 1918 /* 1919 * If this is for an mcg, it must be for a fullmember, 1920 * since we got rid of send-only members above when 1921 * processing the mce list. 1922 */ 1923 ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate == 1924 IB_MC_JSTATE_FULL))); 1925 1926 /* 1927 * Check if the fullmember mce needs to be torn down, 1928 * ie whether the DLPI disable has already been done. 1929 * If so, do some of the work of tx_cleanup, namely 1930 * causing leave (which will fail), detach and 1931 * mce-freeing. tx_cleanup will put the AH into free 1932 * list. The reason to duplicate some of this 1933 * tx_cleanup work is because we want to delete the 1934 * AH right now instead of waiting for tx_cleanup, to 1935 * force subsequent Tx's to reacquire an AH. 1936 */ 1937 if ((mce != NULL) && (mce->mc_fullreap)) 1938 ibd_async_reap_group(state, mce, 1939 mce->mc_info.mc_adds_vect.av_dgid, 1940 mce->mc_jstate); 1941 } 1942 mutex_exit(&state->id_ac_mutex); 1943 } 1944 1945 /* 1946 * mac handle is guaranteed to exist since driver does ibt_close_hca() 1947 * (which stops further events from being delivered) before 1948 * mac_unregister(). At this point, it is guaranteed that mac_register 1949 * has already been done. 1950 */ 1951 mutex_enter(&state->id_link_mutex); 1952 state->id_link_state = lstate; 1953 mac_link_update(state->id_mh, lstate); 1954 mutex_exit(&state->id_link_mutex); 1955 1956 ibd_async_done(state); 1957 } 1958 1959 /* 1960 * When the link is notified up, we need to do a few things, based 1961 * on the port's current p_init_type_reply claiming a reinit has been 1962 * done or not. The reinit steps are: 1963 * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify 1964 * the old Pkey and GID0 are correct. 1965 * 2. Register for mcg traps (already done by ibmf). 1966 * 3. If PreservePresenceReply indicates the SM has restored port's presence 1967 * in subnet, nothing more to do. Else go to next steps (on async daemon). 1968 * 4. Give up all sendonly memberships. 1969 * 5. Acquire all full memberships. 1970 * 6. In promiscuous mode, acquire all non memberships. 1971 * 7. Recycle all AHs to free list. 1972 */ 1973 static void 1974 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code) 1975 { 1976 ibt_hca_portinfo_t *port_infop; 1977 ibt_status_t ibt_status; 1978 uint_t psize, port_infosz; 1979 ibd_link_op_t opcode; 1980 ibd_req_t *req; 1981 1982 /* 1983 * Do not send a request to the async daemon if it has not 1984 * yet been created or is being destroyed. If the async 1985 * daemon has not yet been created, we still need to track 1986 * last known state of the link. 
If this code races with the 1987 * detach path, then we are assured that the detach path has 1988 * not yet done the ibt_close_hca (which waits for all async 1989 * events to complete). If the code races with the attach path, 1990 * we need to validate the pkey/gid (in the link_up case) if 1991 * the initialization path has already set these up and created 1992 * IBTF resources based on the values. 1993 */ 1994 mutex_enter(&state->id_link_mutex); 1995 1996 /* 1997 * If the init code in ibd_m_start hasn't yet set up the 1998 * pkey/gid, nothing to do; that code will set the link state. 1999 */ 2000 if (state->id_link_state == LINK_STATE_UNKNOWN) { 2001 mutex_exit(&state->id_link_mutex); 2002 return; 2003 } 2004 2005 if ((code == IBT_EVENT_PORT_UP) || (code == IBT_CLNT_REREG_EVENT) || 2006 (code == IBT_PORT_CHANGE_EVENT)) { 2007 uint8_t itreply; 2008 boolean_t badup = B_FALSE; 2009 2010 ibt_status = ibt_query_hca_ports(state->id_hca_hdl, 2011 state->id_port, &port_infop, &psize, &port_infosz); 2012 if ((ibt_status != IBT_SUCCESS) || (psize != 1)) { 2013 mutex_exit(&state->id_link_mutex); 2014 DPRINT(10, "ibd_link_up : failed in" 2015 " ibt_query_port()\n"); 2016 return; 2017 } 2018 2019 /* 2020 * If the link already went down by the time the handler gets 2021 * here, give up; we can not even validate pkey/gid since those 2022 * are not valid. 2023 */ 2024 if (port_infop->p_linkstate != IBT_PORT_ACTIVE) 2025 badup = B_TRUE; 2026 2027 itreply = port_infop->p_init_type_reply; 2028 2029 /* 2030 * In InitTypeReply, check if NoLoadReply == 2031 * PreserveContentReply == 0, in which case, verify Pkey/GID0. 2032 */ 2033 if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) && 2034 ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0) && 2035 (!badup)) { 2036 /* 2037 * Check that the subnet part of GID0 has not changed. 2038 */ 2039 if (bcmp(port_infop->p_sgid_tbl, &state->id_sgid, 2040 sizeof (ib_gid_t)) != 0) 2041 badup = B_TRUE; 2042 2043 /* 2044 * Check that Pkey/index mapping is still valid. 2045 */ 2046 if ((port_infop->p_pkey_tbl_sz <= state->id_pkix) || 2047 (port_infop->p_pkey_tbl[state->id_pkix] != 2048 state->id_pkey)) 2049 badup = B_TRUE; 2050 } 2051 2052 /* 2053 * In InitTypeReply, if PreservePresenceReply indicates the SM 2054 * has ensured that the port's presence in mcg, traps etc is 2055 * intact, nothing more to do. 2056 */ 2057 opcode = IBD_LINK_UP_ABSENT; 2058 if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == 2059 SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) 2060 opcode = IBD_LINK_UP; 2061 2062 ibt_free_portinfo(port_infop, port_infosz); 2063 2064 if (badup) { 2065 code = IBT_ERROR_PORT_DOWN; 2066 } else if (code == IBT_PORT_CHANGE_EVENT) { 2067 mutex_exit(&state->id_link_mutex); 2068 return; 2069 } 2070 } 2071 2072 if (!ibd_async_safe(state)) { 2073 state->id_link_state = (((code == IBT_EVENT_PORT_UP) || 2074 (code == IBT_CLNT_REREG_EVENT)) ? LINK_STATE_UP : 2075 LINK_STATE_DOWN); 2076 mutex_exit(&state->id_link_mutex); 2077 return; 2078 } 2079 mutex_exit(&state->id_link_mutex); 2080 2081 if (code == IBT_ERROR_PORT_DOWN) 2082 opcode = IBD_LINK_DOWN; 2083 2084 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 2085 req->rq_ptr = (void *)opcode; 2086 ibd_queue_work_slot(state, req, IBD_ASYNC_LINK); 2087 } 2088 2089 /* 2090 * For the port up/down events, IBTL guarantees there will not be concurrent 2091 * invocations of the handler. 
IBTL might coalesce link transition events, 2092 * and not invoke the handler for _each_ up/down transition, but it will 2093 * invoke the handler with last known state 2094 */ 2095 static void 2096 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 2097 ibt_async_code_t code, ibt_async_event_t *event) 2098 { 2099 ibd_state_t *state = (ibd_state_t *)clnt_private; 2100 2101 switch (code) { 2102 case IBT_ERROR_CATASTROPHIC_CHAN: 2103 ibd_print_warn(state, "catastrophic channel error"); 2104 break; 2105 case IBT_ERROR_CQ: 2106 ibd_print_warn(state, "completion queue error"); 2107 break; 2108 case IBT_PORT_CHANGE_EVENT: 2109 /* 2110 * Events will be delivered to all instances that have 2111 * done ibt_open_hca() but not yet done ibt_close_hca(). 2112 * Only need to do work for our port; IBTF will deliver 2113 * events for other ports on the hca we have ibt_open_hca'ed 2114 * too. Note that id_port is initialized in ibd_attach() 2115 * before we do an ibt_open_hca() in ibd_attach(). 2116 */ 2117 ASSERT(state->id_hca_hdl == hca_hdl); 2118 if (state->id_port != event->ev_port) 2119 break; 2120 2121 if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) == 2122 IBT_PORT_CHANGE_PKEY) { 2123 ibd_link_mod(state, code); 2124 } 2125 break; 2126 case IBT_ERROR_PORT_DOWN: 2127 case IBT_CLNT_REREG_EVENT: 2128 case IBT_EVENT_PORT_UP: 2129 /* 2130 * Events will be delivered to all instances that have 2131 * done ibt_open_hca() but not yet done ibt_close_hca(). 2132 * Only need to do work for our port; IBTF will deliver 2133 * events for other ports on the hca we have ibt_open_hca'ed 2134 * too. Note that id_port is initialized in ibd_attach() 2135 * before we do an ibt_open_hca() in ibd_attach(). 2136 */ 2137 ASSERT(state->id_hca_hdl == hca_hdl); 2138 if (state->id_port != event->ev_port) 2139 break; 2140 2141 ibd_link_mod(state, code); 2142 break; 2143 2144 case IBT_HCA_ATTACH_EVENT: 2145 case IBT_HCA_DETACH_EVENT: 2146 /* 2147 * When a new card is plugged to the system, attach_event is 2148 * invoked. Additionally, a cfgadm needs to be run to make the 2149 * card known to the system, and an ifconfig needs to be run to 2150 * plumb up any ibd interfaces on the card. In the case of card 2151 * unplug, a cfgadm is run that will trigger any RCM scripts to 2152 * unplumb the ibd interfaces on the card; when the card is 2153 * actually unplugged, the detach_event is invoked; 2154 * additionally, if any ibd instances are still active on the 2155 * card (eg there were no associated RCM scripts), driver's 2156 * detach routine is invoked. 2157 */ 2158 break; 2159 default: 2160 break; 2161 } 2162 } 2163 2164 static int 2165 ibd_register_mac(ibd_state_t *state, dev_info_t *dip) 2166 { 2167 mac_register_t *macp; 2168 int ret; 2169 2170 if ((macp = mac_alloc(MAC_VERSION)) == NULL) { 2171 DPRINT(10, "ibd_register_mac: mac_alloc() failed"); 2172 return (DDI_FAILURE); 2173 } 2174 2175 /* 2176 * Note that when we register with mac during attach, we don't 2177 * have the id_macaddr yet, so we'll simply be registering a 2178 * zero macaddr that we'll overwrite later during plumb (in 2179 * ibd_m_start()). Similar is the case with id_mtu - we'll 2180 * update the mac layer with the correct mtu during plumb. 
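 *
 * As an illustration only (the exact SDU value used at plumb time is
 * an assumption here), the deferred update would use the standard
 * GLDv3 calls from ibd_m_start():
 *
 *	mac_maxsdu_update(state->id_mh, real_sdu);
 *	mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);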
2181 */ 2182 macp->m_type_ident = MAC_PLUGIN_IDENT_IB; 2183 macp->m_driver = state; 2184 macp->m_dip = dip; 2185 macp->m_src_addr = (uint8_t *)&state->id_macaddr; 2186 macp->m_callbacks = &ibd_m_callbacks; 2187 macp->m_min_sdu = 0; 2188 macp->m_max_sdu = IBD_DEF_MAX_SDU; 2189 2190 /* 2191 * Register ourselves with the GLDv3 interface 2192 */ 2193 if ((ret = mac_register(macp, &state->id_mh)) != 0) { 2194 mac_free(macp); 2195 DPRINT(10, 2196 "ibd_register_mac: mac_register() failed, ret=%d", ret); 2197 return (DDI_FAILURE); 2198 } 2199 2200 mac_free(macp); 2201 return (DDI_SUCCESS); 2202 } 2203 2204 static int 2205 ibd_record_capab(ibd_state_t *state, dev_info_t *dip) 2206 { 2207 ibt_hca_attr_t hca_attrs; 2208 ibt_status_t ibt_status; 2209 2210 /* 2211 * Query the HCA and fetch its attributes 2212 */ 2213 ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 2214 ASSERT(ibt_status == IBT_SUCCESS); 2215 2216 /* 2217 * 1. Set the Hardware Checksum capability. Currently we only consider 2218 * full checksum offload. 2219 */ 2220 if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL) == IBT_HCA_CKSUM_FULL) { 2221 state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL; 2222 } 2223 2224 /* 2225 * 2. Set LSO policy, capability and maximum length 2226 */ 2227 if (ddi_prop_get_int(DDI_DEV_T_ANY, dip, 2228 DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, IBD_PROP_LSO_POLICY, 1)) { 2229 state->id_lso_policy = B_TRUE; 2230 } else { 2231 state->id_lso_policy = B_FALSE; 2232 } 2233 if (hca_attrs.hca_max_lso_size > 0) { 2234 state->id_lso_capable = B_TRUE; 2235 if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN) 2236 state->id_lso_maxlen = IBD_LSO_MAXLEN; 2237 else 2238 state->id_lso_maxlen = hca_attrs.hca_max_lso_size; 2239 } else { 2240 state->id_lso_capable = B_FALSE; 2241 state->id_lso_maxlen = 0; 2242 } 2243 2244 /* 2245 * 3. Set Reserved L_Key capability 2246 */ 2247 if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) { 2248 state->id_hca_res_lkey_capab = 1; 2249 state->id_res_lkey = hca_attrs.hca_reserved_lkey; 2250 } 2251 2252 /* 2253 * 4. Set maximum sqseg value after checking to see if extended sgl 2254 * size information is provided by the hca 2255 */ 2256 if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) { 2257 state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz; 2258 } else { 2259 state->id_max_sqseg = hca_attrs.hca_max_sgl; 2260 } 2261 if (state->id_max_sqseg > IBD_MAX_SQSEG) { 2262 state->id_max_sqseg = IBD_MAX_SQSEG; 2263 } else if (state->id_max_sqseg < IBD_MAX_SQSEG) { 2264 ibd_print_warn(state, "Set #sgl = %d instead of default %d", 2265 state->id_max_sqseg, IBD_MAX_SQSEG); 2266 } 2267 2268 /* 2269 * 5. 
Set number of recv and send wqes after checking hca maximum 2270 * channel size 2271 */ 2272 if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE) { 2273 state->id_num_rwqe = hca_attrs.hca_max_chan_sz; 2274 } else { 2275 state->id_num_rwqe = IBD_NUM_RWQE; 2276 } 2277 if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE) { 2278 state->id_num_swqe = hca_attrs.hca_max_chan_sz; 2279 } else { 2280 state->id_num_swqe = IBD_NUM_SWQE; 2281 } 2282 2283 return (DDI_SUCCESS); 2284 } 2285 2286 static int 2287 ibd_unattach(ibd_state_t *state, dev_info_t *dip) 2288 { 2289 int instance; 2290 uint32_t progress = state->id_mac_state; 2291 ibt_status_t ret; 2292 2293 if (progress & IBD_DRV_MAC_REGISTERED) { 2294 (void) mac_unregister(state->id_mh); 2295 state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED); 2296 } 2297 2298 if (progress & IBD_DRV_PD_ALLOCD) { 2299 if ((ret = ibt_free_pd(state->id_hca_hdl, 2300 state->id_pd_hdl)) != IBT_SUCCESS) { 2301 ibd_print_warn(state, "failed to free " 2302 "protection domain, ret=%d", ret); 2303 } 2304 state->id_pd_hdl = NULL; 2305 state->id_mac_state &= (~IBD_DRV_PD_ALLOCD); 2306 } 2307 2308 if (progress & IBD_DRV_HCA_OPENED) { 2309 if ((ret = ibt_close_hca(state->id_hca_hdl)) != 2310 IBT_SUCCESS) { 2311 ibd_print_warn(state, "failed to close " 2312 "HCA device, ret=%d", ret); 2313 } 2314 state->id_hca_hdl = NULL; 2315 state->id_mac_state &= (~IBD_DRV_HCA_OPENED); 2316 } 2317 2318 if (progress & IBD_DRV_IBTL_ATTACH_DONE) { 2319 if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) { 2320 ibd_print_warn(state, 2321 "ibt_detach() failed, ret=%d", ret); 2322 } 2323 state->id_ibt_hdl = NULL; 2324 state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE); 2325 } 2326 2327 if (progress & IBD_DRV_TXINTR_ADDED) { 2328 ddi_remove_softintr(state->id_tx); 2329 state->id_tx = NULL; 2330 state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED); 2331 } 2332 2333 if (progress & IBD_DRV_RXINTR_ADDED) { 2334 ddi_remove_softintr(state->id_rx); 2335 state->id_rx = NULL; 2336 state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED); 2337 } 2338 2339 if (progress & IBD_DRV_STATE_INITIALIZED) { 2340 ibd_state_fini(state); 2341 state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED); 2342 } 2343 2344 instance = ddi_get_instance(dip); 2345 ddi_soft_state_free(ibd_list, instance); 2346 2347 return (DDI_SUCCESS); 2348 } 2349 2350 /* 2351 * Attach device to the IO framework. 
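 * Each step that succeeds below sets an IBD_DRV_* bit in id_mac_state;
 * on any failure (and again from ibd_detach()), ibd_unattach() walks
 * those bits to undo exactly the work that was completed.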
2352 */ 2353 static int 2354 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2355 { 2356 ibd_state_t *state = NULL; 2357 ib_guid_t hca_guid; 2358 int instance; 2359 ibt_status_t ret; 2360 int rv; 2361 2362 /* 2363 * IBD doesn't support suspend/resume 2364 */ 2365 if (cmd != DDI_ATTACH) 2366 return (DDI_FAILURE); 2367 2368 /* 2369 * Allocate softstate structure 2370 */ 2371 instance = ddi_get_instance(dip); 2372 if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) 2373 return (DDI_FAILURE); 2374 state = ddi_get_soft_state(ibd_list, instance); 2375 2376 /* 2377 * Initialize mutexes and condition variables 2378 */ 2379 if (ibd_state_init(state, dip) != DDI_SUCCESS) { 2380 DPRINT(10, "ibd_attach: failed in ibd_state_init()"); 2381 goto attach_fail; 2382 } 2383 state->id_mac_state |= IBD_DRV_STATE_INITIALIZED; 2384 2385 /* 2386 * Allocate rx,tx softintr 2387 */ 2388 if (ibd_rx_softintr == 1) { 2389 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx, 2390 NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) { 2391 DPRINT(10, "ibd_attach: failed in " 2392 "ddi_add_softintr(id_rx), ret=%d", rv); 2393 goto attach_fail; 2394 } 2395 state->id_mac_state |= IBD_DRV_RXINTR_ADDED; 2396 } 2397 if (ibd_tx_softintr == 1) { 2398 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx, 2399 NULL, NULL, ibd_tx_recycle, 2400 (caddr_t)state)) != DDI_SUCCESS) { 2401 DPRINT(10, "ibd_attach: failed in " 2402 "ddi_add_softintr(id_tx), ret=%d", rv); 2403 goto attach_fail; 2404 } 2405 state->id_mac_state |= IBD_DRV_TXINTR_ADDED; 2406 } 2407 2408 /* 2409 * Obtain IBA P_Key, port number and HCA guid and validate 2410 * them (for P_Key, only full members are allowed as per 2411 * IPoIB specification; neither port number nor HCA guid 2412 * can be zero) 2413 */ 2414 if ((state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, 2415 "port-pkey", IB_PKEY_INVALID_LIMITED)) <= IB_PKEY_INVALID_FULL) { 2416 DPRINT(10, "ibd_attach: port device has wrong partition (0x%x)", 2417 state->id_pkey); 2418 goto attach_fail; 2419 } 2420 if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, 2421 "port-number", 0)) == 0) { 2422 DPRINT(10, "ibd_attach: invalid port number (%d)", 2423 state->id_port); 2424 goto attach_fail; 2425 } 2426 if ((hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0, 2427 "hca-guid", 0)) == 0) { 2428 DPRINT(10, "ibd_attach: port hca has invalid guid (0x%llx)", 2429 hca_guid); 2430 goto attach_fail; 2431 } 2432 2433 /* 2434 * Attach to IBTL 2435 */ 2436 if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state, 2437 &state->id_ibt_hdl)) != IBT_SUCCESS) { 2438 DPRINT(10, "ibd_attach: failed in ibt_attach(), ret=%d", ret); 2439 goto attach_fail; 2440 } 2441 state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE; 2442 2443 /* 2444 * Open the HCA 2445 */ 2446 if ((ret = ibt_open_hca(state->id_ibt_hdl, hca_guid, 2447 &state->id_hca_hdl)) != IBT_SUCCESS) { 2448 DPRINT(10, "ibd_attach: ibt_open_hca() failed, ret=%d", ret); 2449 goto attach_fail; 2450 } 2451 state->id_mac_state |= IBD_DRV_HCA_OPENED; 2452 2453 /* 2454 * Record capabilities 2455 */ 2456 (void) ibd_record_capab(state, dip); 2457 2458 /* 2459 * Allocate a protection domain on the HCA 2460 */ 2461 if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS, 2462 &state->id_pd_hdl)) != IBT_SUCCESS) { 2463 DPRINT(10, "ibd_attach: ibt_alloc_pd() failed, ret=%d", ret); 2464 goto attach_fail; 2465 } 2466 state->id_mac_state |= IBD_DRV_PD_ALLOCD; 2467 2468 2469 /* 2470 * Register ibd interfaces with the Nemo framework 2471 */ 2472 if 
(ibd_register_mac(state, dip) != IBT_SUCCESS) { 2473 DPRINT(10, "ibd_attach: failed in ibd_register_mac()"); 2474 goto attach_fail; 2475 } 2476 state->id_mac_state |= IBD_DRV_MAC_REGISTERED; 2477 2478 /* 2479 * We're done with everything we could to make the attach 2480 * succeed. All the buffer allocations and IPoIB broadcast 2481 * group joins are deferred to when the interface instance 2482 * is actually plumbed to avoid wasting memory. 2483 */ 2484 return (DDI_SUCCESS); 2485 2486 attach_fail: 2487 ibd_unattach(state, dip); 2488 return (DDI_FAILURE); 2489 } 2490 2491 /* 2492 * Detach device from the IO framework. 2493 */ 2494 static int 2495 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2496 { 2497 ibd_state_t *state; 2498 int instance; 2499 2500 /* 2501 * IBD doesn't support suspend/resume 2502 */ 2503 if (cmd != DDI_DETACH) 2504 return (DDI_FAILURE); 2505 2506 /* 2507 * Get the instance softstate 2508 */ 2509 instance = ddi_get_instance(dip); 2510 state = ddi_get_soft_state(ibd_list, instance); 2511 2512 /* 2513 * Release all resources we're holding still. Note that if we'd 2514 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly 2515 * so far, we should find all the flags we need in id_mac_state. 2516 */ 2517 (void) ibd_unattach(state, dip); 2518 2519 return (DDI_SUCCESS); 2520 } 2521 2522 /* 2523 * Pre ibt_attach() driver initialization 2524 */ 2525 static int 2526 ibd_state_init(ibd_state_t *state, dev_info_t *dip) 2527 { 2528 char buf[64]; 2529 2530 mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL); 2531 state->id_link_state = LINK_STATE_UNKNOWN; 2532 2533 mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL); 2534 cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL); 2535 state->id_trap_stop = B_TRUE; 2536 state->id_trap_inprog = 0; 2537 2538 mutex_init(&state->id_cq_poll_lock, NULL, MUTEX_DRIVER, NULL); 2539 state->id_dip = dip; 2540 2541 mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL); 2542 2543 state->id_tx_list.dl_head = NULL; 2544 state->id_tx_list.dl_tail = NULL; 2545 state->id_tx_list.dl_pending_sends = B_FALSE; 2546 state->id_tx_list.dl_cnt = 0; 2547 mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2548 mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL); 2549 state->id_tx_busy = 0; 2550 2551 state->id_rx_list.dl_head = NULL; 2552 state->id_rx_list.dl_tail = NULL; 2553 state->id_rx_list.dl_bufs_outstanding = 0; 2554 state->id_rx_list.dl_cnt = 0; 2555 mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2556 mutex_init(&state->id_rxpost_lock, NULL, MUTEX_DRIVER, NULL); 2557 2558 (void) sprintf(buf, "ibd_req%d", ddi_get_instance(dip)); 2559 state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t), 2560 0, NULL, NULL, NULL, NULL, NULL, 0); 2561 2562 #ifdef IBD_LOGGING 2563 mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL); 2564 #endif 2565 2566 return (DDI_SUCCESS); 2567 } 2568 2569 /* 2570 * Post ibt_detach() driver deconstruction 2571 */ 2572 static void 2573 ibd_state_fini(ibd_state_t *state) 2574 { 2575 kmem_cache_destroy(state->id_req_kmc); 2576 2577 mutex_destroy(&state->id_rxpost_lock); 2578 mutex_destroy(&state->id_rx_list.dl_mutex); 2579 2580 mutex_destroy(&state->id_txpost_lock); 2581 mutex_destroy(&state->id_tx_list.dl_mutex); 2582 2583 mutex_destroy(&state->id_sched_lock); 2584 mutex_destroy(&state->id_cq_poll_lock); 2585 2586 cv_destroy(&state->id_trap_cv); 2587 mutex_destroy(&state->id_trap_lock); 2588 mutex_destroy(&state->id_link_mutex); 2589 2590 #ifdef IBD_LOGGING 2591 
mutex_destroy(&ibd_lbuf_lock); 2592 #endif 2593 } 2594 2595 /* 2596 * Fetch link speed from SA for snmp ifspeed reporting. 2597 */ 2598 static uint64_t 2599 ibd_get_portspeed(ibd_state_t *state) 2600 { 2601 int ret; 2602 ibt_path_info_t path; 2603 ibt_path_attr_t path_attr; 2604 uint8_t num_paths; 2605 uint64_t ifspeed; 2606 2607 /* 2608 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire 2609 * translates to 2 Gbps data rate. Thus, 1X single data rate is 2610 * 2000000000. Start with that as default. 2611 */ 2612 ifspeed = 2000000000; 2613 2614 bzero(&path_attr, sizeof (path_attr)); 2615 2616 /* 2617 * Get the port speed from Loopback path information. 2618 */ 2619 path_attr.pa_dgids = &state->id_sgid; 2620 path_attr.pa_num_dgids = 1; 2621 path_attr.pa_sgid = state->id_sgid; 2622 2623 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 2624 &path_attr, 1, &path, &num_paths) != IBT_SUCCESS) 2625 goto earlydone; 2626 2627 if (num_paths < 1) 2628 goto earlydone; 2629 2630 /* 2631 * In case SA does not return an expected value, report the default 2632 * speed as 1X. 2633 */ 2634 ret = 1; 2635 switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) { 2636 case IBT_SRATE_2: /* 1X SDR i.e 2.5 Gbps */ 2637 ret = 1; 2638 break; 2639 case IBT_SRATE_10: /* 4X SDR or 1X QDR i.e 10 Gbps */ 2640 ret = 4; 2641 break; 2642 case IBT_SRATE_30: /* 12X SDR i.e 30 Gbps */ 2643 ret = 12; 2644 break; 2645 case IBT_SRATE_5: /* 1X DDR i.e 5 Gbps */ 2646 ret = 2; 2647 break; 2648 case IBT_SRATE_20: /* 4X DDR or 8X SDR i.e 20 Gbps */ 2649 ret = 8; 2650 break; 2651 case IBT_SRATE_40: /* 8X DDR or 4X QDR i.e 40 Gbps */ 2652 ret = 16; 2653 break; 2654 case IBT_SRATE_60: /* 12X DDR i.e 60 Gbps */ 2655 ret = 24; 2656 break; 2657 case IBT_SRATE_80: /* 8X QDR i.e 80 Gbps */ 2658 ret = 32; 2659 break; 2660 case IBT_SRATE_120: /* 12X QDR i.e 120 Gbps */ 2661 ret = 48; 2662 break; 2663 } 2664 2665 ifspeed *= ret; 2666 2667 earlydone: 2668 return (ifspeed); 2669 } 2670 2671 /* 2672 * Search input mcg list (id_mc_full or id_mc_non) for an entry 2673 * representing the input mcg mgid. 2674 */ 2675 static ibd_mce_t * 2676 ibd_mcache_find(ib_gid_t mgid, struct list *mlist) 2677 { 2678 ibd_mce_t *ptr = list_head(mlist); 2679 2680 /* 2681 * Do plain linear search. 2682 */ 2683 while (ptr != NULL) { 2684 if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid, 2685 sizeof (ib_gid_t)) == 0) 2686 return (ptr); 2687 ptr = list_next(mlist, ptr); 2688 } 2689 return (NULL); 2690 } 2691 2692 /* 2693 * Execute IBA JOIN. 2694 */ 2695 static ibt_status_t 2696 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce) 2697 { 2698 ibt_mcg_attr_t mcg_attr; 2699 2700 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 2701 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 2702 mcg_attr.mc_mgid = mgid; 2703 mcg_attr.mc_join_state = mce->mc_jstate; 2704 mcg_attr.mc_scope = state->id_scope; 2705 mcg_attr.mc_pkey = state->id_pkey; 2706 mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow; 2707 mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 2708 mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass; 2709 return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info, 2710 NULL, NULL)); 2711 } 2712 2713 /* 2714 * This code JOINs the port in the proper way (depending on the join 2715 * state) so that IBA fabric will forward mcg packets to/from the port. 2716 * It also attaches the QPN to the mcg so it can receive those mcg 2717 * packets. 
This code makes sure not to attach the mcg to the QP if 2718 * that has been previously done due to the mcg being joined with a 2719 * different join state, even though this is not required by SWG_0216, 2720 * refid 3610. 2721 */ 2722 static ibd_mce_t * 2723 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 2724 { 2725 ibt_status_t ibt_status; 2726 ibd_mce_t *mce, *tmce, *omce = NULL; 2727 boolean_t do_attach = B_TRUE; 2728 2729 DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n", 2730 jstate, mgid.gid_prefix, mgid.gid_guid); 2731 2732 /* 2733 * For enable_multicast Full member joins, we need to do some 2734 * extra work. If there is already an mce on the list that 2735 * indicates full membership, that means the membership has 2736 * not yet been dropped (since the disable_multicast was issued) 2737 * because there are pending Tx's to the mcg; in that case, just 2738 * mark the mce not to be reaped when the Tx completion queues 2739 * an async reap operation. 2740 * 2741 * If there is already an mce on the list indicating sendonly 2742 * membership, try to promote to full membership. Be careful 2743 * not to deallocate the old mce, since there might be an AH 2744 * pointing to it; instead, update the old mce with new data 2745 * that tracks the full membership. 2746 */ 2747 if ((jstate == IB_MC_JSTATE_FULL) && ((omce = 2748 IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) { 2749 if (omce->mc_jstate == IB_MC_JSTATE_FULL) { 2750 ASSERT(omce->mc_fullreap); 2751 omce->mc_fullreap = B_FALSE; 2752 return (omce); 2753 } else { 2754 ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON); 2755 } 2756 } 2757 2758 /* 2759 * Allocate the ibd_mce_t to track this JOIN. 2760 */ 2761 mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP); 2762 mce->mc_fullreap = B_FALSE; 2763 mce->mc_jstate = jstate; 2764 2765 if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) { 2766 DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d", 2767 ibt_status); 2768 kmem_free(mce, sizeof (ibd_mce_t)); 2769 return (NULL); 2770 } 2771 2772 /* 2773 * Is an IBA attach required? Not if the interface is already joined 2774 * to the mcg in a different appropriate join state. 2775 */ 2776 if (jstate == IB_MC_JSTATE_NON) { 2777 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 2778 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 2779 do_attach = B_FALSE; 2780 } else if (jstate == IB_MC_JSTATE_FULL) { 2781 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 2782 do_attach = B_FALSE; 2783 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 2784 do_attach = B_FALSE; 2785 } 2786 2787 if (do_attach) { 2788 /* 2789 * Do the IBA attach. 2790 */ 2791 DPRINT(10, "ibd_join_group: ibt_attach_mcg \n"); 2792 if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl, 2793 &mce->mc_info)) != IBT_SUCCESS) { 2794 DPRINT(10, "ibd_join_group : failed qp attachment " 2795 "%d\n", ibt_status); 2796 /* 2797 * NOTE that we should probably preserve the join info 2798 * in the list and later try to leave again at detach 2799 * time. 2800 */ 2801 (void) ibt_leave_mcg(state->id_sgid, mgid, 2802 state->id_sgid, jstate); 2803 kmem_free(mce, sizeof (ibd_mce_t)); 2804 return (NULL); 2805 } 2806 } 2807 2808 /* 2809 * Insert the ibd_mce_t in the proper list. 2810 */ 2811 if (jstate == IB_MC_JSTATE_NON) { 2812 IBD_MCACHE_INSERT_NON(state, mce); 2813 } else { 2814 /* 2815 * Set up the mc_req fields used for reaping the 2816 * mcg in case of delayed tx completion (see 2817 * ibd_tx_cleanup()). 
Also done for sendonly join in 2818 * case we are promoted to fullmembership later and 2819 * keep using the same mce. 2820 */ 2821 mce->mc_req.rq_gid = mgid; 2822 mce->mc_req.rq_ptr = mce; 2823 /* 2824 * Check whether this is the case of trying to join as a 2825 * full member when we were already joined send only. 2826 * We try to drop our SendOnly membership, but it is 2827 * possible that the mcg does not exist anymore (and 2828 * the subnet trap never reached us), so the leave 2829 * operation might fail. 2830 */ 2831 if (omce != NULL) { 2832 (void) ibt_leave_mcg(state->id_sgid, mgid, 2833 state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON); 2834 omce->mc_jstate = IB_MC_JSTATE_FULL; 2835 bcopy(&mce->mc_info, &omce->mc_info, 2836 sizeof (ibt_mcg_info_t)); 2837 kmem_free(mce, sizeof (ibd_mce_t)); 2838 return (omce); 2839 } 2840 mutex_enter(&state->id_mc_mutex); 2841 IBD_MCACHE_INSERT_FULL(state, mce); 2842 mutex_exit(&state->id_mc_mutex); 2843 } 2844 2845 return (mce); 2846 } 2847 2848 /* 2849 * Called during port up event handling to attempt to reacquire full 2850 * membership to an mcg. Stripped down version of ibd_join_group(). 2851 * Note that it is possible that the mcg might have gone away, and 2852 * gets recreated at this point. 2853 */ 2854 static void 2855 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce) 2856 { 2857 ib_gid_t mgid; 2858 2859 /* 2860 * If the mc_fullreap flag is set, or this join fails, a subsequent 2861 * reap/leave is going to try to leave the group. We could prevent 2862 * that by adding a boolean flag into ibd_mce_t, if required. 2863 */ 2864 if (mce->mc_fullreap) 2865 return; 2866 2867 mgid = mce->mc_info.mc_adds_vect.av_dgid; 2868 2869 DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix, 2870 mgid.gid_guid); 2871 2872 if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS) 2873 ibd_print_warn(state, "Failure on port up to rejoin " 2874 "multicast gid %016llx:%016llx", 2875 (u_longlong_t)mgid.gid_prefix, 2876 (u_longlong_t)mgid.gid_guid); 2877 } 2878 2879 /* 2880 * This code handles delayed Tx completion cleanups for mcg's to which 2881 * disable_multicast has been issued, regular mcg related cleanups during 2882 * disable_multicast, disable_promiscuous and mcg traps, as well as 2883 * cleanups during driver detach time. Depending on the join state, 2884 * it deletes the mce from the appropriate list and issues the IBA 2885 * leave/detach; except in the disable_multicast case when the mce 2886 * is left on the active list for a subsequent Tx completion cleanup. 2887 */ 2888 static void 2889 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid, 2890 uint8_t jstate) 2891 { 2892 ibd_mce_t *tmce; 2893 boolean_t do_detach = B_TRUE; 2894 2895 /* 2896 * Before detaching, we must check whether the other list 2897 * contains the mcg; if we detach blindly, the consumer 2898 * who set up the other list will also stop receiving 2899 * traffic. 2900 */ 2901 if (jstate == IB_MC_JSTATE_FULL) { 2902 /* 2903 * The following check is only relevant while coming 2904 * from the Tx completion path in the reap case.
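 * If mc_fullreap was never set, the full membership has been
 * re-enabled in the meantime (ibd_join_group() clears the flag), so
 * there is nothing left to reap.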
2905 */ 2906 if (!mce->mc_fullreap) 2907 return; 2908 mutex_enter(&state->id_mc_mutex); 2909 IBD_MCACHE_PULLOUT_FULL(state, mce); 2910 mutex_exit(&state->id_mc_mutex); 2911 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 2912 do_detach = B_FALSE; 2913 } else if (jstate == IB_MC_JSTATE_NON) { 2914 IBD_MCACHE_PULLOUT_NON(state, mce); 2915 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 2916 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 2917 do_detach = B_FALSE; 2918 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 2919 mutex_enter(&state->id_mc_mutex); 2920 IBD_MCACHE_PULLOUT_FULL(state, mce); 2921 mutex_exit(&state->id_mc_mutex); 2922 do_detach = B_FALSE; 2923 } 2924 2925 /* 2926 * If we are reacting to a mcg trap and leaving our sendonly or 2927 * non membership, the mcg is possibly already gone, so attempting 2928 * to leave might fail. On the other hand, we must try to leave 2929 * anyway, since this might be a trap from long ago, and we could 2930 * have potentially sendonly joined to a recent incarnation of 2931 * the mcg and are about to lose track of this information. 2932 */ 2933 if (do_detach) { 2934 DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : " 2935 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 2936 (void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info); 2937 } 2938 2939 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate); 2940 kmem_free(mce, sizeof (ibd_mce_t)); 2941 } 2942 2943 /* 2944 * Async code executed due to multicast and promiscuous disable requests 2945 * and mcg trap handling; also executed during driver detach. Mostly, a 2946 * leave and detach is done; except for the fullmember case when Tx 2947 * requests are pending, whence arrangements are made for subsequent 2948 * cleanup on Tx completion. 2949 */ 2950 static void 2951 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 2952 { 2953 ipoib_mac_t mcmac; 2954 boolean_t recycled; 2955 ibd_mce_t *mce; 2956 2957 DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n", 2958 jstate, mgid.gid_prefix, mgid.gid_guid); 2959 2960 if (jstate == IB_MC_JSTATE_NON) { 2961 recycled = B_TRUE; 2962 mce = IBD_MCACHE_FIND_NON(state, mgid); 2963 /* 2964 * In case we are handling a mcg trap, we might not find 2965 * the mcg in the non list. 2966 */ 2967 if (mce == NULL) 2968 return; 2969 } else { 2970 mce = IBD_MCACHE_FIND_FULL(state, mgid); 2971 2972 /* 2973 * In case we are handling a mcg trap, make sure the trap 2974 * is not arriving late; if we have an mce that indicates 2975 * that we are already a fullmember, that would be a clear 2976 * indication that the trap arrived late (ie, is for a 2977 * previous incarnation of the mcg). 2978 */ 2979 if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) { 2980 if ((mce == NULL) || (mce->mc_jstate == 2981 IB_MC_JSTATE_FULL)) 2982 return; 2983 } else { 2984 ASSERT(jstate == IB_MC_JSTATE_FULL); 2985 2986 /* 2987 * If join group failed, mce will be NULL here. 2988 * This is because in a GLDv3 driver, set multicast 2989 * will always return success. 2990 */ 2991 if (mce == NULL) 2992 return; 2993 2994 mce->mc_fullreap = B_TRUE; 2995 } 2996 2997 /* 2998 * If no pending Tx's remain that reference the AH 2999 * for the mcg, recycle it from active to free list. 3000 * Else in the IB_MC_JSTATE_FULL case, just mark the AH, 3001 * so the last completing Tx will cause an async reap 3002 * operation to be invoked, at which time we will drop our 3003 * membership to the mcg so that the pending Tx's complete 3004 * successfully.
Refer to comments on "AH and MCE active 3005 * list manipulation" at top of this file. The lock protects 3006 * against Tx fast path and Tx cleanup code. 3007 */ 3008 mutex_enter(&state->id_ac_mutex); 3009 ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid); 3010 recycled = ibd_acache_recycle(state, &mcmac, (jstate == 3011 IB_MC_JSTATE_SEND_ONLY_NON)); 3012 mutex_exit(&state->id_ac_mutex); 3013 } 3014 3015 if (recycled) { 3016 DPRINT(2, "ibd_leave_group : leave_group reaping : " 3017 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3018 ibd_async_reap_group(state, mce, mgid, jstate); 3019 } 3020 } 3021 3022 /* 3023 * Find the broadcast address as defined by IPoIB; implicitly 3024 * determines the IBA scope, mtu, tclass etc of the link the 3025 * interface is going to be a member of. 3026 */ 3027 static ibt_status_t 3028 ibd_find_bgroup(ibd_state_t *state) 3029 { 3030 ibt_mcg_attr_t mcg_attr; 3031 uint_t numg; 3032 uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL, 3033 IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL, 3034 IB_MC_SCOPE_GLOBAL }; 3035 int i, mcgmtu; 3036 boolean_t found = B_FALSE; 3037 3038 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3039 mcg_attr.mc_pkey = state->id_pkey; 3040 state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK; 3041 3042 for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) { 3043 state->id_scope = mcg_attr.mc_scope = scopes[i]; 3044 3045 /* 3046 * Look for the IPoIB broadcast group. 3047 */ 3048 state->id_mgid.gid_prefix = 3049 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3050 ((uint64_t)state->id_scope << 48) | 3051 ((uint32_t)(state->id_pkey << 16))); 3052 mcg_attr.mc_mgid = state->id_mgid; 3053 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1, 3054 &state->id_mcinfo, &numg) == IBT_SUCCESS) { 3055 found = B_TRUE; 3056 break; 3057 } 3058 3059 } 3060 3061 if (!found) { 3062 ibd_print_warn(state, "IPoIB broadcast group absent"); 3063 return (IBT_FAILURE); 3064 } 3065 3066 /* 3067 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu. 
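 * mc_mtu holds the IB-encoded MTU, so the byte value is 128 << mc_mtu;
 * as a worked example, an encoding of 4 corresponds to 2048 bytes and
 * 5 corresponds to 4096 bytes.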
3068 */ 3069 mcgmtu = (128 << state->id_mcinfo->mc_mtu); 3070 if (state->id_mtu < mcgmtu) { 3071 ibd_print_warn(state, "IPoIB broadcast group MTU %d " 3072 "greater than port's maximum MTU %d", mcgmtu, 3073 state->id_mtu); 3074 return (IBT_FAILURE); 3075 } 3076 state->id_mtu = mcgmtu; 3077 3078 return (IBT_SUCCESS); 3079 } 3080 3081 static int 3082 ibd_alloc_tx_copybufs(ibd_state_t *state) 3083 { 3084 ibt_mr_attr_t mem_attr; 3085 3086 /* 3087 * Allocate one big chunk for all regular tx copy bufs 3088 */ 3089 state->id_tx_buf_sz = state->id_mtu; 3090 if (state->id_lso_policy && state->id_lso_capable && 3091 (IBD_TX_BUF_SZ > state->id_mtu)) { 3092 state->id_tx_buf_sz = IBD_TX_BUF_SZ; 3093 } 3094 3095 state->id_tx_bufs = kmem_zalloc(state->id_num_swqe * 3096 state->id_tx_buf_sz, KM_SLEEP); 3097 3098 /* 3099 * Do one memory registration on the entire txbuf area 3100 */ 3101 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs; 3102 mem_attr.mr_len = state->id_num_swqe * state->id_tx_buf_sz; 3103 mem_attr.mr_as = NULL; 3104 mem_attr.mr_flags = IBT_MR_SLEEP; 3105 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3106 &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) { 3107 DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed"); 3108 kmem_free(state->id_tx_bufs, 3109 state->id_num_swqe * state->id_tx_buf_sz); 3110 state->id_tx_bufs = NULL; 3111 return (DDI_FAILURE); 3112 } 3113 3114 return (DDI_SUCCESS); 3115 } 3116 3117 static int 3118 ibd_alloc_tx_lsobufs(ibd_state_t *state) 3119 { 3120 ibt_mr_attr_t mem_attr; 3121 ibd_lsobuf_t *buflist; 3122 ibd_lsobuf_t *lbufp; 3123 ibd_lsobuf_t *tail; 3124 ibd_lsobkt_t *bktp; 3125 uint8_t *membase; 3126 uint8_t *memp; 3127 uint_t memsz; 3128 int i; 3129 3130 /* 3131 * Allocate the lso bucket 3132 */ 3133 bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP); 3134 3135 /* 3136 * Allocate the entire lso memory and register it 3137 */ 3138 memsz = IBD_NUM_LSO_BUFS * IBD_LSO_BUFSZ; 3139 membase = kmem_zalloc(memsz, KM_SLEEP); 3140 3141 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase; 3142 mem_attr.mr_len = memsz; 3143 mem_attr.mr_as = NULL; 3144 mem_attr.mr_flags = IBT_MR_SLEEP; 3145 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, 3146 &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) { 3147 DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed"); 3148 kmem_free(membase, memsz); 3149 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3150 return (DDI_FAILURE); 3151 } 3152 3153 /* 3154 * Now allocate the buflist. Note that the elements in the buflist and 3155 * the buffers in the lso memory have a permanent 1-1 relation, so we 3156 * can always derive the address of a buflist entry from the address of 3157 * an lso buffer. 
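 *
 * The reverse mapping used when buffers are returned (see
 * ibd_release_lsobufs()) is simply:
 *
 *	ndx   = (buf_va - bktp->bkt_mem) / IBD_LSO_BUFSZ;
 *	lbufp = bktp->bkt_bufl + ndx;
 *
 * where buf_va is the buffer address taken from the sgl entry.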
3158 */ 3159 buflist = kmem_zalloc(IBD_NUM_LSO_BUFS * sizeof (ibd_lsobuf_t), 3160 KM_SLEEP); 3161 3162 /* 3163 * Set up the lso buf chain 3164 */ 3165 memp = membase; 3166 lbufp = buflist; 3167 for (i = 0; i < IBD_NUM_LSO_BUFS; i++) { 3168 lbufp->lb_isfree = 1; 3169 lbufp->lb_buf = memp; 3170 lbufp->lb_next = lbufp + 1; 3171 3172 tail = lbufp; 3173 3174 memp += IBD_LSO_BUFSZ; 3175 lbufp++; 3176 } 3177 tail->lb_next = NULL; 3178 3179 /* 3180 * Set up the LSO buffer information in ibd state 3181 */ 3182 bktp->bkt_bufl = buflist; 3183 bktp->bkt_free_head = buflist; 3184 bktp->bkt_mem = membase; 3185 bktp->bkt_nelem = IBD_NUM_LSO_BUFS; 3186 bktp->bkt_nfree = bktp->bkt_nelem; 3187 3188 state->id_lso = bktp; 3189 3190 return (DDI_SUCCESS); 3191 } 3192 3193 /* 3194 * Statically allocate Tx buffer list(s). 3195 */ 3196 static int 3197 ibd_init_txlist(ibd_state_t *state) 3198 { 3199 ibd_swqe_t *swqe; 3200 ibt_lkey_t lkey; 3201 int i; 3202 3203 if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS) 3204 return (DDI_FAILURE); 3205 3206 if (state->id_lso_policy && state->id_lso_capable) { 3207 if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS) 3208 state->id_lso_policy = B_FALSE; 3209 } 3210 3211 /* 3212 * Allocate and setup the swqe list 3213 */ 3214 lkey = state->id_tx_mr_desc.md_lkey; 3215 for (i = 0; i < state->id_num_swqe; i++) { 3216 if (ibd_alloc_swqe(state, &swqe, i, lkey) != DDI_SUCCESS) { 3217 DPRINT(10, "ibd_init_txlist: ibd_alloc_swqe failed"); 3218 ibd_fini_txlist(state); 3219 return (DDI_FAILURE); 3220 } 3221 3222 /* add to list */ 3223 state->id_tx_list.dl_cnt++; 3224 if (state->id_tx_list.dl_head == NULL) { 3225 swqe->swqe_prev = NULL; 3226 swqe->swqe_next = NULL; 3227 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); 3228 state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); 3229 } else { 3230 swqe->swqe_prev = state->id_tx_list.dl_tail; 3231 swqe->swqe_next = NULL; 3232 state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe); 3233 state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); 3234 } 3235 } 3236 3237 return (DDI_SUCCESS); 3238 } 3239 3240 static int 3241 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p, 3242 uint32_t *nds_p) 3243 { 3244 ibd_lsobkt_t *bktp; 3245 ibd_lsobuf_t *lbufp; 3246 ibd_lsobuf_t *nextp; 3247 ibt_lkey_t lso_lkey; 3248 uint_t frag_sz; 3249 uint_t num_needed; 3250 int i; 3251 3252 ASSERT(sgl_p != NULL); 3253 ASSERT(nds_p != NULL); 3254 ASSERT(req_sz != 0); 3255 3256 /* 3257 * Determine how many bufs we'd need for the size requested 3258 */ 3259 num_needed = req_sz / IBD_LSO_BUFSZ; 3260 if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0) 3261 num_needed++; 3262 3263 mutex_enter(&state->id_lso_lock); 3264 3265 /* 3266 * If we don't have enough lso bufs, return failure 3267 */ 3268 ASSERT(state->id_lso != NULL); 3269 bktp = state->id_lso; 3270 if (bktp->bkt_nfree < num_needed) { 3271 mutex_exit(&state->id_lso_lock); 3272 return (-1); 3273 } 3274 3275 /* 3276 * Pick the first 'num_needed' bufs from the free list 3277 */ 3278 lso_lkey = bktp->bkt_mr_desc.md_lkey; 3279 lbufp = bktp->bkt_free_head; 3280 for (i = 0; i < num_needed; i++) { 3281 ASSERT(lbufp->lb_isfree != 0); 3282 ASSERT(lbufp->lb_buf != NULL); 3283 3284 nextp = lbufp->lb_next; 3285 3286 sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf; 3287 sgl_p[i].ds_key = lso_lkey; 3288 sgl_p[i].ds_len = IBD_LSO_BUFSZ; 3289 3290 lbufp->lb_isfree = 0; 3291 lbufp->lb_next = NULL; 3292 3293 lbufp = nextp; 3294 } 3295 bktp->bkt_free_head = lbufp; 3296 3297 /* 3298 * If the requested size is not a multiple of IBD_LSO_BUFSZ, 
we need 3299 * to adjust the last sgl entry's length. Since we know we need at least 3300 * one, the i-1 use below is ok. 3301 */ 3302 if (frag_sz) { 3303 sgl_p[i-1].ds_len = frag_sz; 3304 } 3305 3306 /* 3307 * Update nfree count and return 3308 */ 3309 bktp->bkt_nfree -= num_needed; 3310 3311 mutex_exit(&state->id_lso_lock); 3312 3313 *nds_p = num_needed; 3314 3315 return (0); 3316 } 3317 3318 static void 3319 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds) 3320 { 3321 ibd_lsobkt_t *bktp; 3322 ibd_lsobuf_t *lbufp; 3323 uint8_t *lso_mem_end; 3324 uint_t ndx; 3325 int i; 3326 3327 mutex_enter(&state->id_lso_lock); 3328 3329 bktp = state->id_lso; 3330 ASSERT(bktp != NULL); 3331 3332 lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ; 3333 for (i = 0; i < nds; i++) { 3334 uint8_t *va; 3335 3336 va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va; 3337 ASSERT(va >= bktp->bkt_mem && va < lso_mem_end); 3338 3339 /* 3340 * Figure out the buflist element this sgl buffer corresponds 3341 * to and put it back at the head 3342 */ 3343 ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ; 3344 lbufp = bktp->bkt_bufl + ndx; 3345 3346 ASSERT(lbufp->lb_isfree == 0); 3347 ASSERT(lbufp->lb_buf == va); 3348 3349 lbufp->lb_isfree = 1; 3350 lbufp->lb_next = bktp->bkt_free_head; 3351 bktp->bkt_free_head = lbufp; 3352 } 3353 bktp->bkt_nfree += nds; 3354 3355 mutex_exit(&state->id_lso_lock); 3356 } 3357 3358 static void 3359 ibd_free_tx_copybufs(ibd_state_t *state) 3360 { 3361 /* 3362 * Unregister txbuf mr 3363 */ 3364 if (ibt_deregister_mr(state->id_hca_hdl, 3365 state->id_tx_mr_hdl) != IBT_SUCCESS) { 3366 DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed"); 3367 } 3368 state->id_tx_mr_hdl = NULL; 3369 3370 /* 3371 * Free txbuf memory 3372 */ 3373 kmem_free(state->id_tx_bufs, state->id_num_swqe * state->id_tx_buf_sz); 3374 state->id_tx_bufs = NULL; 3375 } 3376 3377 static void 3378 ibd_free_tx_lsobufs(ibd_state_t *state) 3379 { 3380 ibd_lsobkt_t *bktp; 3381 3382 mutex_enter(&state->id_lso_lock); 3383 3384 if ((bktp = state->id_lso) == NULL) { 3385 mutex_exit(&state->id_lso_lock); 3386 return; 3387 } 3388 3389 /* 3390 * First, free the buflist 3391 */ 3392 ASSERT(bktp->bkt_bufl != NULL); 3393 kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t)); 3394 3395 /* 3396 * Unregister the LSO memory and free it 3397 */ 3398 ASSERT(bktp->bkt_mr_hdl != NULL); 3399 if (ibt_deregister_mr(state->id_hca_hdl, 3400 bktp->bkt_mr_hdl) != IBT_SUCCESS) { 3401 DPRINT(10, 3402 "ibd_free_tx_lsobufs: ibt_deregister_mr failed"); 3403 } 3404 ASSERT(bktp->bkt_mem); 3405 kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ); 3406 3407 /* 3408 * Finally free the bucket 3409 */ 3410 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3411 state->id_lso = NULL; 3412 3413 mutex_exit(&state->id_lso_lock); 3414 } 3415 3416 /* 3417 * Free the statically allocated Tx buffer list.
3418 */ 3419 static void 3420 ibd_fini_txlist(ibd_state_t *state) 3421 { 3422 ibd_swqe_t *node; 3423 3424 /* 3425 * Free the allocated swqes 3426 */ 3427 mutex_enter(&state->id_tx_list.dl_mutex); 3428 while (state->id_tx_list.dl_head != NULL) { 3429 node = WQE_TO_SWQE(state->id_tx_list.dl_head); 3430 state->id_tx_list.dl_head = node->swqe_next; 3431 ASSERT(state->id_tx_list.dl_cnt > 0); 3432 state->id_tx_list.dl_cnt--; 3433 ibd_free_swqe(state, node); 3434 } 3435 mutex_exit(&state->id_tx_list.dl_mutex); 3436 3437 ibd_free_tx_lsobufs(state); 3438 ibd_free_tx_copybufs(state); 3439 } 3440 3441 /* 3442 * Allocate a single send wqe and register it so it is almost 3443 * ready to be posted to the hardware. 3444 */ 3445 static int 3446 ibd_alloc_swqe(ibd_state_t *state, ibd_swqe_t **wqe, int ndx, ibt_lkey_t lkey) 3447 { 3448 ibd_swqe_t *swqe; 3449 3450 swqe = kmem_zalloc(sizeof (ibd_swqe_t), KM_SLEEP); 3451 *wqe = swqe; 3452 3453 swqe->swqe_type = IBD_WQE_SEND; 3454 swqe->swqe_next = NULL; 3455 swqe->swqe_prev = NULL; 3456 swqe->swqe_im_mblk = NULL; 3457 3458 swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t) 3459 (state->id_tx_bufs + ndx * state->id_tx_buf_sz); 3460 swqe->swqe_copybuf.ic_sgl.ds_key = lkey; 3461 swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */ 3462 3463 swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe; 3464 swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL; 3465 swqe->w_swr.wr_trans = IBT_UD_SRV; 3466 3467 /* These are set in send */ 3468 swqe->w_swr.wr_nds = 0; 3469 swqe->w_swr.wr_sgl = NULL; 3470 swqe->w_swr.wr_opcode = IBT_WRC_SEND; 3471 3472 return (DDI_SUCCESS); 3473 } 3474 3475 /* 3476 * Free an allocated send wqe. 3477 */ 3478 /*ARGSUSED*/ 3479 static void 3480 ibd_free_swqe(ibd_state_t *state, ibd_swqe_t *swqe) 3481 { 3482 kmem_free(swqe, sizeof (ibd_swqe_t)); 3483 } 3484 3485 /* 3486 * Post a rwqe to the hardware and add it to the Rx list. The 3487 * "recycle" parameter indicates whether an old rwqe is being 3488 * recycled, or this is a new one. 3489 */ 3490 static int 3491 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe, boolean_t recycle) 3492 { 3493 ibt_status_t ibt_status; 3494 3495 if (recycle == B_FALSE) { 3496 mutex_enter(&state->id_rx_list.dl_mutex); 3497 if (state->id_rx_list.dl_head == NULL) { 3498 rwqe->rwqe_prev = NULL; 3499 rwqe->rwqe_next = NULL; 3500 state->id_rx_list.dl_head = RWQE_TO_WQE(rwqe); 3501 state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe); 3502 } else { 3503 rwqe->rwqe_prev = state->id_rx_list.dl_tail; 3504 rwqe->rwqe_next = NULL; 3505 state->id_rx_list.dl_tail->w_next = RWQE_TO_WQE(rwqe); 3506 state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe); 3507 } 3508 mutex_exit(&state->id_rx_list.dl_mutex); 3509 } 3510 3511 mutex_enter(&state->id_rxpost_lock); 3512 if (state->id_rx_busy) { 3513 rwqe->w_post_link = NULL; 3514 if (state->id_rx_head) 3515 *(state->id_rx_tailp) = (ibd_wqe_t *)rwqe; 3516 else 3517 state->id_rx_head = rwqe; 3518 state->id_rx_tailp = &(rwqe->w_post_link); 3519 } else { 3520 state->id_rx_busy = 1; 3521 do { 3522 mutex_exit(&state->id_rxpost_lock); 3523 3524 /* 3525 * Here we should add dl_cnt before post recv, because 3526 * we would have to make sure dl_cnt is updated before 3527 * the corresponding ibd_process_rx() is called. 
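 * Otherwise the completion could race the increment: the HCA may
 * finish the receive and ibd_process_rx() may run before this thread
 * bumps dl_cnt, leaving the count momentarily understated.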
3528 */ 3529 atomic_add_32(&state->id_rx_list.dl_cnt, 1); 3530 3531 ibt_status = ibt_post_recv(state->id_chnl_hdl, 3532 &rwqe->w_rwr, 1, NULL); 3533 if (ibt_status != IBT_SUCCESS) { 3534 (void) atomic_add_32_nv( 3535 &state->id_rx_list.dl_cnt, -1); 3536 ibd_print_warn(state, "ibd_post_recv: " 3537 "posting failed, ret=%d", ibt_status); 3538 return (DDI_FAILURE); 3539 } 3540 3541 mutex_enter(&state->id_rxpost_lock); 3542 rwqe = state->id_rx_head; 3543 if (rwqe) { 3544 state->id_rx_head = 3545 (ibd_rwqe_t *)(rwqe->w_post_link); 3546 } 3547 } while (rwqe); 3548 state->id_rx_busy = 0; 3549 } 3550 mutex_exit(&state->id_rxpost_lock); 3551 3552 return (DDI_SUCCESS); 3553 } 3554 3555 /* 3556 * Allocate the statically allocated Rx buffer list. 3557 */ 3558 static int 3559 ibd_init_rxlist(ibd_state_t *state) 3560 { 3561 ibd_rwqe_t *rwqe; 3562 int i; 3563 3564 for (i = 0; i < state->id_num_rwqe; i++) { 3565 if (ibd_alloc_rwqe(state, &rwqe) != DDI_SUCCESS) { 3566 ibd_fini_rxlist(state); 3567 return (DDI_FAILURE); 3568 } 3569 3570 if (ibd_post_recv(state, rwqe, B_FALSE) == DDI_FAILURE) { 3571 ibd_free_rwqe(state, rwqe); 3572 ibd_fini_rxlist(state); 3573 return (DDI_FAILURE); 3574 } 3575 } 3576 3577 return (DDI_SUCCESS); 3578 } 3579 3580 /* 3581 * Free the statically allocated Rx buffer list. 3582 * 3583 */ 3584 static void 3585 ibd_fini_rxlist(ibd_state_t *state) 3586 { 3587 ibd_rwqe_t *node; 3588 3589 mutex_enter(&state->id_rx_list.dl_mutex); 3590 while (state->id_rx_list.dl_head != NULL) { 3591 node = WQE_TO_RWQE(state->id_rx_list.dl_head); 3592 state->id_rx_list.dl_head = state->id_rx_list.dl_head->w_next; 3593 ASSERT(state->id_rx_list.dl_cnt > 0); 3594 state->id_rx_list.dl_cnt--; 3595 3596 ibd_free_rwqe(state, node); 3597 } 3598 mutex_exit(&state->id_rx_list.dl_mutex); 3599 } 3600 3601 /* 3602 * Allocate a single recv wqe and register it so it is almost 3603 * ready to be posted to the hardware. 
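 * Each rwqe gets its own MTU+GRH sized copy buffer, a desballoc'd
 * mblk whose free routine (ibd_freemsg_cb) lets the buffer be
 * recycled, and a single-element SGL registered with local-write
 * permission so the HCA can deposit the incoming packet into it.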
3604 */ 3605 static int 3606 ibd_alloc_rwqe(ibd_state_t *state, ibd_rwqe_t **wqe) 3607 { 3608 ibt_mr_attr_t mem_attr; 3609 ibd_rwqe_t *rwqe; 3610 3611 if ((rwqe = kmem_zalloc(sizeof (ibd_rwqe_t), KM_NOSLEEP)) == NULL) { 3612 DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc"); 3613 return (DDI_FAILURE); 3614 } 3615 *wqe = rwqe; 3616 rwqe->rwqe_type = IBD_WQE_RECV; 3617 rwqe->w_state = state; 3618 rwqe->rwqe_next = NULL; 3619 rwqe->rwqe_prev = NULL; 3620 rwqe->w_freeing_wqe = B_FALSE; 3621 rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb; 3622 rwqe->w_freemsg_cb.free_arg = (char *)rwqe; 3623 3624 rwqe->rwqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu + 3625 IPOIB_GRH_SIZE, KM_NOSLEEP); 3626 if (rwqe->rwqe_copybuf.ic_bufaddr == NULL) { 3627 DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc"); 3628 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3629 return (DDI_FAILURE); 3630 } 3631 3632 if ((rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, 3633 state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb)) == 3634 NULL) { 3635 DPRINT(10, "ibd_alloc_rwqe : failed in desballoc()"); 3636 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3637 state->id_mtu + IPOIB_GRH_SIZE); 3638 rwqe->rwqe_copybuf.ic_bufaddr = NULL; 3639 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3640 return (DDI_FAILURE); 3641 } 3642 3643 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr; 3644 mem_attr.mr_len = state->id_mtu + IPOIB_GRH_SIZE; 3645 mem_attr.mr_as = NULL; 3646 mem_attr.mr_flags = IBT_MR_NOSLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3647 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3648 &rwqe->rwqe_copybuf.ic_mr_hdl, &rwqe->rwqe_copybuf.ic_mr_desc) != 3649 IBT_SUCCESS) { 3650 DPRINT(10, "ibd_alloc_rwqe : failed in ibt_register_mem()"); 3651 rwqe->w_freeing_wqe = B_TRUE; 3652 freemsg(rwqe->rwqe_im_mblk); 3653 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3654 state->id_mtu + IPOIB_GRH_SIZE); 3655 rwqe->rwqe_copybuf.ic_bufaddr = NULL; 3656 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3657 return (DDI_FAILURE); 3658 } 3659 3660 rwqe->rwqe_copybuf.ic_sgl.ds_va = 3661 (ib_vaddr_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr; 3662 rwqe->rwqe_copybuf.ic_sgl.ds_key = 3663 rwqe->rwqe_copybuf.ic_mr_desc.md_lkey; 3664 rwqe->rwqe_copybuf.ic_sgl.ds_len = state->id_mtu + IPOIB_GRH_SIZE; 3665 rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe; 3666 rwqe->w_rwr.wr_nds = 1; 3667 rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl; 3668 3669 return (DDI_SUCCESS); 3670 } 3671 3672 /* 3673 * Free an allocated recv wqe. 3674 */ 3675 static void 3676 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) 3677 { 3678 if (ibt_deregister_mr(state->id_hca_hdl, 3679 rwqe->rwqe_copybuf.ic_mr_hdl) != IBT_SUCCESS) { 3680 DPRINT(10, "ibd_free_rwqe: failed in ibt_deregister_mr()"); 3681 return; 3682 } 3683 3684 /* 3685 * Indicate to the callback function that this rwqe/mblk 3686 * should not be recycled. The freemsg() will invoke 3687 * ibd_freemsg_cb(). 3688 */ 3689 if (rwqe->rwqe_im_mblk != NULL) { 3690 rwqe->w_freeing_wqe = B_TRUE; 3691 freemsg(rwqe->rwqe_im_mblk); 3692 } 3693 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3694 state->id_mtu + IPOIB_GRH_SIZE); 3695 rwqe->rwqe_copybuf.ic_bufaddr = NULL; 3696 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3697 } 3698 3699 /* 3700 * Delete the rwqe being freed from the rx list. 
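 * The rwqe is simply unlinked from id_rx_list (fixing up the head, tail and neighbor pointers) under dl_mutex; actually freeing it is the caller's job.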
3701 */ 3702 static void 3703 ibd_delete_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) 3704 { 3705 mutex_enter(&state->id_rx_list.dl_mutex); 3706 if (state->id_rx_list.dl_head == RWQE_TO_WQE(rwqe)) 3707 state->id_rx_list.dl_head = rwqe->rwqe_next; 3708 else 3709 rwqe->rwqe_prev->w_next = rwqe->rwqe_next; 3710 if (state->id_rx_list.dl_tail == RWQE_TO_WQE(rwqe)) 3711 state->id_rx_list.dl_tail = rwqe->rwqe_prev; 3712 else 3713 rwqe->rwqe_next->w_prev = rwqe->rwqe_prev; 3714 mutex_exit(&state->id_rx_list.dl_mutex); 3715 } 3716 3717 /* 3718 * IBA Rx/Tx completion queue handler. Guaranteed to be single 3719 * threaded and nonreentrant for this CQ. When using combined CQ, 3720 * this handles Tx and Rx completions. With separate CQs, this handles 3721 * only Rx completions. 3722 */ 3723 /* ARGSUSED */ 3724 static void 3725 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 3726 { 3727 ibd_state_t *state = (ibd_state_t *)arg; 3728 3729 atomic_add_64(&state->id_num_intrs, 1); 3730 3731 if (ibd_rx_softintr == 1) 3732 ddi_trigger_softintr(state->id_rx); 3733 else 3734 (void) ibd_intr((char *)state); 3735 } 3736 3737 /* 3738 * Separate CQ handler for Tx completions, when the Tx CQ is in 3739 * interrupt driven mode. 3740 */ 3741 /* ARGSUSED */ 3742 static void 3743 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 3744 { 3745 ibd_state_t *state = (ibd_state_t *)arg; 3746 3747 atomic_add_64(&state->id_num_intrs, 1); 3748 3749 if (ibd_tx_softintr == 1) 3750 ddi_trigger_softintr(state->id_tx); 3751 else 3752 (void) ibd_tx_recycle((char *)state); 3753 } 3754 3755 /* 3756 * Multicast group create/delete trap handler. These will be delivered 3757 * on a kernel thread (handling can thus block) and can be invoked 3758 * concurrently. The handler can be invoked anytime after it is 3759 * registered and before ibt_detach(). 3760 */ 3761 /* ARGSUSED */ 3762 static void 3763 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code, 3764 ibt_subnet_event_t *event) 3765 { 3766 ibd_state_t *state = (ibd_state_t *)arg; 3767 ibd_req_t *req; 3768 3769 /* 3770 * The trap handler will get invoked once for every event for 3771 * every port. The input "gid" is the GID0 of the port the 3772 * trap came in on; we just need to act on traps that came 3773 * to our port, meaning the port on which the ipoib interface 3774 * resides. Since ipoib uses GID0 of the port, we just match 3775 * the gids to check whether we need to handle the trap. 3776 */ 3777 if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0) 3778 return; 3779 3780 DPRINT(10, "ibd_notices_handler : %d\n", code); 3781 3782 switch (code) { 3783 case IBT_SM_EVENT_UNAVAILABLE: 3784 /* 3785 * If we are in promiscuous mode or have 3786 * sendnonmembers, we need to print a warning 3787 * message right now. Else, just store the 3788 * information, print when we enter promiscuous 3789 * mode or attempt nonmember send. We might 3790 * also want to stop caching sendnonmember. 3791 */ 3792 ibd_print_warn(state, "IBA multicast support " 3793 "degraded due to unavailability of multicast " 3794 "traps"); 3795 break; 3796 case IBT_SM_EVENT_AVAILABLE: 3797 /* 3798 * If we printed a warning message above or 3799 * while trying to nonmember send or get into 3800 * promiscuous mode, print an okay message.
3801 */ 3802 ibd_print_warn(state, "IBA multicast support " 3803 "restored due to availability of multicast " 3804 "traps"); 3805 break; 3806 case IBT_SM_EVENT_MCG_CREATED: 3807 case IBT_SM_EVENT_MCG_DELETED: 3808 /* 3809 * Common processing of creation/deletion traps. 3810 * First check if the instance is being 3811 * [de]initialized; back off then, without doing 3812 * anything more, since we are not sure if the 3813 * async thread is around, or whether we might 3814 * be racing with the detach code in ibd_m_stop() 3815 * that scans the mcg list. 3816 */ 3817 if (!ibd_async_safe(state)) 3818 return; 3819 3820 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 3821 req->rq_gid = event->sm_notice_gid; 3822 req->rq_ptr = (void *)code; 3823 ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP); 3824 break; 3825 } 3826 } 3827 3828 static void 3829 ibd_async_trap(ibd_state_t *state, ibd_req_t *req) 3830 { 3831 ib_gid_t mgid = req->rq_gid; 3832 ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr; 3833 3834 DPRINT(10, "ibd_async_trap : %d\n", code); 3835 3836 /* 3837 * Atomically search the nonmember and sendonlymember lists and 3838 * delete. 3839 */ 3840 ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON); 3841 3842 if (state->id_prom_op == IBD_OP_COMPLETED) { 3843 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 3844 3845 /* 3846 * If in promiscuous mode, try to join/attach to the new 3847 * mcg. Given the unreliable out-of-order mode of trap 3848 * delivery, we can never be sure whether it is a problem 3849 * if the join fails. Thus, we warn the admin of a failure 3850 * if this was a creation trap. Note that the trap might 3851 * actually be reporting a long past event, and the mcg 3852 * might already have been deleted, thus we might be warning 3853 * in vain. 3854 */ 3855 if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == 3856 NULL) && (code == IBT_SM_EVENT_MCG_CREATED)) 3857 ibd_print_warn(state, "IBA promiscuous mode missed " 3858 "new multicast gid %016llx:%016llx", 3859 (u_longlong_t)mgid.gid_prefix, 3860 (u_longlong_t)mgid.gid_guid); 3861 } 3862 3863 /* 3864 * Free the request slot allocated by the subnet event thread. 3865 */ 3866 ibd_async_done(state); 3867 } 3868 3869 /* 3870 * GLDv3 entry point to get capabilities. 3871 */ 3872 static boolean_t 3873 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) 3874 { 3875 ibd_state_t *state = arg; 3876 3877 switch (cap) { 3878 case MAC_CAPAB_HCKSUM: { 3879 uint32_t *txflags = cap_data; 3880 3881 /* 3882 * We either do full checksum or not do it at all 3883 */ 3884 if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) 3885 *txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4; 3886 else 3887 return (B_FALSE); 3888 break; 3889 } 3890 3891 case MAC_CAPAB_LSO: { 3892 mac_capab_lso_t *cap_lso = cap_data; 3893 3894 /* 3895 * In addition to the capability and policy, since LSO 3896 * relies on hw checksum, we'll not enable LSO if we 3897 * don't have hw checksum. Of course, if the HCA doesn't 3898 * provide the reserved lkey capability, enabling LSO will 3899 * actually affect performance adversely, so we'll disable 3900 * LSO even for that case. 
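 * (Without the reserved lkey, large LSO payloads would have to be bcopy'd into the pre-mapped LSO buffers rather than mapped in place, which is what hurts performance.)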
3901 */ 3902 if (!state->id_lso_policy || !state->id_lso_capable) 3903 return (B_FALSE); 3904 3905 if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0) 3906 return (B_FALSE); 3907 3908 if (state->id_hca_res_lkey_capab == 0) { 3909 ibd_print_warn(state, "no reserved-lkey capability, " 3910 "disabling LSO"); 3911 return (B_FALSE); 3912 } 3913 3914 cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; 3915 cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1; 3916 break; 3917 } 3918 3919 default: 3920 return (B_FALSE); 3921 } 3922 3923 return (B_TRUE); 3924 } 3925 3926 static int 3927 ibd_get_port_details(ibd_state_t *state) 3928 { 3929 ibt_hca_portinfo_t *port_infop; 3930 ibt_status_t ret; 3931 uint_t psize, port_infosz; 3932 3933 mutex_enter(&state->id_link_mutex); 3934 3935 /* 3936 * Query for port information 3937 */ 3938 ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 3939 &port_infop, &psize, &port_infosz); 3940 if ((ret != IBT_SUCCESS) || (psize != 1)) { 3941 mutex_exit(&state->id_link_mutex); 3942 DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() " 3943 "failed, ret=%d", ret); 3944 return (DDI_FAILURE); 3945 } 3946 3947 /* 3948 * If the link already went down by the time we get here, 3949 * give up 3950 */ 3951 if (port_infop->p_linkstate != IBT_PORT_ACTIVE) { 3952 mutex_exit(&state->id_link_mutex); 3953 ibt_free_portinfo(port_infop, port_infosz); 3954 DPRINT(10, "ibd_get_port_details: port is not active"); 3955 return (DDI_FAILURE); 3956 } 3957 3958 /* 3959 * If the link is active, verify the pkey 3960 */ 3961 if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port, 3962 state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) { 3963 mutex_exit(&state->id_link_mutex); 3964 ibt_free_portinfo(port_infop, port_infosz); 3965 DPRINT(10, "ibd_get_port_details: ibt_pkey2index " 3966 "failed, ret=%d", ret); 3967 return (DDI_FAILURE); 3968 } 3969 3970 state->id_mtu = (128 << port_infop->p_mtu); 3971 state->id_sgid = *port_infop->p_sgid_tbl; 3972 state->id_link_state = LINK_STATE_UP; 3973 3974 mutex_exit(&state->id_link_mutex); 3975 ibt_free_portinfo(port_infop, port_infosz); 3976 3977 /* 3978 * Now that the port is active, record the port speed 3979 */ 3980 state->id_link_speed = ibd_get_portspeed(state); 3981 3982 return (DDI_SUCCESS); 3983 } 3984 3985 static int 3986 ibd_alloc_cqs(ibd_state_t *state) 3987 { 3988 ibt_hca_attr_t hca_attrs; 3989 ibt_cq_attr_t cq_attr; 3990 ibt_status_t ret; 3991 uint32_t real_size; 3992 3993 ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 3994 ASSERT(ret == IBT_SUCCESS); 3995 3996 /* 3997 * Allocate Rx/combined CQ: 3998 * Theoretically, there is no point in having more than #rwqe 3999 * plus #swqe cqe's, except that the CQ will be signalled for 4000 * overflow when the last wqe completes, if none of the previous 4001 * cqe's have been polled. Thus, we allocate just a few less wqe's 4002 * to make sure such overflow does not occur. 4003 */ 4004 cq_attr.cq_sched = NULL; 4005 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 4006 4007 if (ibd_separate_cqs == 1) { 4008 /* 4009 * Allocate Receive CQ. 
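 * Size it to hold one completion per rwqe plus one, clamped to the HCA's maximum CQ size (in which case id_num_rwqe is scaled down to fit).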
4010 */ 4011 if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) { 4012 cq_attr.cq_size = state->id_num_rwqe + 1; 4013 } else { 4014 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 4015 state->id_num_rwqe = cq_attr.cq_size - 1; 4016 } 4017 4018 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 4019 &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) { 4020 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) " 4021 "failed, ret=%d\n", ret); 4022 return (DDI_FAILURE); 4023 } 4024 4025 if ((ret = ibt_modify_cq(state->id_rcq_hdl, 4026 ibd_rxcomp_count, ibd_rxcomp_usec, 0)) != IBT_SUCCESS) { 4027 DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt " 4028 "moderation failed, ret=%d\n", ret); 4029 } 4030 4031 state->id_rxwcs_size = state->id_num_rwqe + 1; 4032 state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * 4033 state->id_rxwcs_size, KM_SLEEP); 4034 4035 /* 4036 * Allocate Send CQ. 4037 */ 4038 if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) { 4039 cq_attr.cq_size = state->id_num_swqe + 1; 4040 } else { 4041 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 4042 state->id_num_swqe = cq_attr.cq_size - 1; 4043 } 4044 4045 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 4046 &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) { 4047 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) " 4048 "failed, ret=%d\n", ret); 4049 kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * 4050 state->id_rxwcs_size); 4051 (void) ibt_free_cq(state->id_rcq_hdl); 4052 return (DDI_FAILURE); 4053 } 4054 if ((ret = ibt_modify_cq(state->id_scq_hdl, 4055 IBD_TXCOMP_COUNT, IBD_TXCOMP_USEC, 0)) != IBT_SUCCESS) { 4056 DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt " 4057 "moderation failed, ret=%d\n", ret); 4058 } 4059 4060 state->id_txwcs_size = state->id_num_swqe + 1; 4061 state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) * 4062 state->id_txwcs_size, KM_SLEEP); 4063 } else { 4064 /* 4065 * Allocate combined Send/Receive CQ. 4066 */ 4067 if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 4068 state->id_num_swqe + 1)) { 4069 cq_attr.cq_size = state->id_num_rwqe + 4070 state->id_num_swqe + 1; 4071 } else { 4072 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 4073 state->id_num_rwqe = ((cq_attr.cq_size - 1) * 4074 state->id_num_rwqe) / (state->id_num_rwqe + 4075 state->id_num_swqe); 4076 state->id_num_swqe = cq_attr.cq_size - 1 - 4077 state->id_num_rwqe; 4078 } 4079 4080 state->id_rxwcs_size = cq_attr.cq_size; 4081 state->id_txwcs_size = state->id_rxwcs_size; 4082 4083 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 4084 &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) { 4085 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rscq) " 4086 "failed, ret=%d\n", ret); 4087 return (DDI_FAILURE); 4088 } 4089 state->id_scq_hdl = state->id_rcq_hdl; 4090 state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * 4091 state->id_rxwcs_size, KM_SLEEP); 4092 state->id_txwcs = state->id_rxwcs; 4093 } 4094 4095 /* 4096 * Print message in case we could not allocate as many wqe's 4097 * as was requested. 
4098 */ 4099 if (state->id_num_rwqe != IBD_NUM_RWQE) { 4100 ibd_print_warn(state, "Setting #rwqe = %d instead of default " 4101 "%d", state->id_num_rwqe, IBD_NUM_RWQE); 4102 } 4103 if (state->id_num_swqe != IBD_NUM_SWQE) { 4104 ibd_print_warn(state, "Setting #swqe = %d instead of default " 4105 "%d", state->id_num_swqe, IBD_NUM_SWQE); 4106 } 4107 4108 return (DDI_SUCCESS); 4109 } 4110 4111 static int 4112 ibd_setup_ud_channel(ibd_state_t *state) 4113 { 4114 ibt_ud_chan_alloc_args_t ud_alloc_attr; 4115 ibt_ud_chan_query_attr_t ud_chan_attr; 4116 ibt_status_t ret; 4117 4118 ud_alloc_attr.ud_flags = IBT_WR_SIGNALED; 4119 if (state->id_hca_res_lkey_capab) 4120 ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY; 4121 if (state->id_lso_policy && state->id_lso_capable) 4122 ud_alloc_attr.ud_flags |= IBT_USES_LSO; 4123 4124 ud_alloc_attr.ud_hca_port_num = state->id_port; 4125 ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg; 4126 ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG; 4127 ud_alloc_attr.ud_sizes.cs_sq = state->id_num_swqe; 4128 ud_alloc_attr.ud_sizes.cs_rq = state->id_num_rwqe; 4129 ud_alloc_attr.ud_qkey = state->id_mcinfo->mc_qkey; 4130 ud_alloc_attr.ud_scq = state->id_scq_hdl; 4131 ud_alloc_attr.ud_rcq = state->id_rcq_hdl; 4132 ud_alloc_attr.ud_pd = state->id_pd_hdl; 4133 ud_alloc_attr.ud_pkey_ix = state->id_pkix; 4134 ud_alloc_attr.ud_clone_chan = NULL; 4135 4136 if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS, 4137 &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) { 4138 DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() " 4139 "failed, ret=%d\n", ret); 4140 return (DDI_FAILURE); 4141 } 4142 4143 if ((ret = ibt_query_ud_channel(state->id_chnl_hdl, 4144 &ud_chan_attr)) != IBT_SUCCESS) { 4145 DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() " 4146 "failed, ret=%d\n", ret); 4147 (void) ibt_free_channel(state->id_chnl_hdl); 4148 return (DDI_FAILURE); 4149 } 4150 4151 state->id_qpnum = ud_chan_attr.ud_qpn; 4152 4153 return (DDI_SUCCESS); 4154 } 4155 4156 static int 4157 ibd_undo_m_start(ibd_state_t *state) 4158 { 4159 uint32_t progress = state->id_mac_state; 4160 uint_t attempts; 4161 ibt_status_t ret; 4162 ib_gid_t mgid; 4163 ibd_mce_t *mce; 4164 uint8_t jstate; 4165 4166 /* 4167 * Before we try to stop/undo whatever we did in ibd_m_start(), 4168 * we need to mark the link state as unknown to prevent nw 4169 * layer from using this instance for any new transfers. 4170 */ 4171 if (progress & IBD_DRV_PORT_DETAILS_OBTAINED) { 4172 state->id_link_state = LINK_STATE_UNKNOWN; 4173 mac_link_update(state->id_mh, state->id_link_state); 4174 4175 state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED); 4176 } 4177 4178 if (progress & IBD_DRV_STARTED) { 4179 state->id_mac_state &= (~IBD_DRV_STARTED); 4180 } 4181 4182 /* 4183 * First, stop receive interrupts; this stops the driver from 4184 * handing up buffers to higher layers. Wait for receive buffers 4185 * to be returned and give up after 5 seconds. 4186 */ 4187 if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) { 4188 ibt_set_cq_handler(state->id_rcq_hdl, 0, 0); 4189 attempts = 50; 4190 while (state->id_rx_list.dl_bufs_outstanding > 0) { 4191 delay(drv_usectohz(100000)); 4192 if (--attempts == 0) { 4193 /* 4194 * There are pending bufs with the network 4195 * layer and we have no choice but to wait 4196 * for them to be done with. Reap all the 4197 * Tx/Rx completions that were posted since 4198 * we turned off the notification and 4199 * return failure. 
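 * Note that we reap any completions and reinstall the Rx CQ handler before returning failure, so the instance is left usable for a later stop attempt.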
4200 */ 4201 DPRINT(2, "ibd_undo_m_start: " 4202 "reclaiming failed"); 4203 ibd_poll_compq(state, state->id_rcq_hdl); 4204 ibt_set_cq_handler(state->id_rcq_hdl, 4205 ibd_rcq_handler, state); 4206 return (DDI_FAILURE); 4207 } 4208 } 4209 state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED); 4210 } 4211 4212 if (progress & IBD_DRV_SM_NOTICES_REGISTERED) { 4213 ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL); 4214 4215 mutex_enter(&state->id_trap_lock); 4216 state->id_trap_stop = B_TRUE; 4217 while (state->id_trap_inprog > 0) 4218 cv_wait(&state->id_trap_cv, &state->id_trap_lock); 4219 mutex_exit(&state->id_trap_lock); 4220 4221 state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED); 4222 } 4223 4224 if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) { 4225 /* 4226 * Flushing the channel ensures that all pending WQE's 4227 * are marked with flush_error and handed to the CQ. It 4228 * does not guarantee the invocation of the CQ handler. 4229 * This call is guaranteed to return successfully for 4230 * UD QPNs. 4231 */ 4232 ret = ibt_flush_channel(state->id_chnl_hdl); 4233 ASSERT(ret == IBT_SUCCESS); 4234 4235 /* 4236 * Turn off Tx interrupts and poll. By the time the polling 4237 * returns an empty indicator, we are sure we have seen all 4238 * pending Tx callbacks. Note that after the call to 4239 * ibt_set_cq_handler() returns, the old handler is 4240 * guaranteed not to be invoked anymore. 4241 */ 4242 if (ibd_separate_cqs == 1) 4243 ibt_set_cq_handler(state->id_scq_hdl, 0, 0); 4244 ibd_poll_compq(state, state->id_scq_hdl); 4245 4246 state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED); 4247 } 4248 4249 if (progress & IBD_DRV_ASYNC_THR_CREATED) { 4250 /* 4251 * No new async requests will be posted since the device 4252 * link state has been marked as unknown; completion handlers 4253 * have been turned off, so Tx handler will not cause any 4254 * more IBD_ASYNC_REAP requests. 4255 * 4256 * Queue a request for the async thread to exit, which will 4257 * be serviced after any pending ones. This can take a while, 4258 * specially if the SM is unreachable, since IBMF will slowly 4259 * timeout each SM request issued by the async thread. Reap 4260 * the thread before continuing on, we do not want it to be 4261 * lingering in modunloaded code (or we could move the reap 4262 * to ibd_detach(), provided we keep track of the current 4263 * id_async_thrid somewhere safe). 4264 */ 4265 ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT); 4266 thread_join(state->id_async_thrid); 4267 4268 state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED); 4269 } 4270 4271 if (progress & IBD_DRV_BCAST_GROUP_JOINED) { 4272 /* 4273 * Drop all residual full/non membership. This includes full 4274 * membership to the broadcast group, and any nonmembership 4275 * acquired during transmits. We do this after the Tx completion 4276 * handlers are done, since those might result in some late 4277 * leaves; this also eliminates a potential race with that 4278 * path wrt the mc full list insert/delete. Trap handling 4279 * has also been suppressed at this point. Thus, no locks 4280 * are required while traversing the mc full list. 
4281 */ 4282 DPRINT(2, "ibd_undo_m_start: clear full cache entries"); 4283 mce = list_head(&state->id_mc_full); 4284 while (mce != NULL) { 4285 mgid = mce->mc_info.mc_adds_vect.av_dgid; 4286 jstate = mce->mc_jstate; 4287 mce = list_next(&state->id_mc_full, mce); 4288 ibd_leave_group(state, mgid, jstate); 4289 } 4290 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED); 4291 } 4292 4293 if (progress & IBD_DRV_RXLIST_ALLOCD) { 4294 ibd_fini_rxlist(state); 4295 state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD); 4296 } 4297 4298 if (progress & IBD_DRV_TXLIST_ALLOCD) { 4299 ibd_fini_txlist(state); 4300 state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD); 4301 } 4302 4303 if (progress & IBD_DRV_UD_CHANNEL_SETUP) { 4304 (void) ibt_free_channel(state->id_chnl_hdl); 4305 state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP); 4306 } 4307 4308 if (progress & IBD_DRV_CQS_ALLOCD) { 4309 if (ibd_separate_cqs == 1) { 4310 kmem_free(state->id_txwcs, 4311 sizeof (ibt_wc_t) * state->id_txwcs_size); 4312 (void) ibt_free_cq(state->id_scq_hdl); 4313 } 4314 4315 kmem_free(state->id_rxwcs, 4316 sizeof (ibt_wc_t) * state->id_rxwcs_size); 4317 (void) ibt_free_cq(state->id_rcq_hdl); 4318 4319 state->id_txwcs = NULL; 4320 state->id_rxwcs = NULL; 4321 state->id_scq_hdl = NULL; 4322 state->id_rcq_hdl = NULL; 4323 4324 state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD); 4325 } 4326 4327 if (progress & IBD_DRV_ACACHE_INITIALIZED) { 4328 mod_hash_destroy_hash(state->id_ah_active_hash); 4329 ibd_acache_fini(state); 4330 4331 state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED); 4332 } 4333 4334 if (progress & IBD_DRV_BCAST_GROUP_FOUND) { 4335 ibt_free_mcg_info(state->id_mcinfo, 1); 4336 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND); 4337 } 4338 4339 return (DDI_SUCCESS); 4340 } 4341 4342 /* 4343 * GLDv3 entry point to start hardware. 
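 * The bringup sequence is: get port details, find the IPoIB broadcast group, initialize the AH cache, allocate CQs, set up the UD channel, initialize the Tx and Rx lists, join the broadcast group, start the async thread, and finally enable completion notification and the subnet trap handler.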
4344 */ 4345 /*ARGSUSED*/ 4346 static int 4347 ibd_m_start(void *arg) 4348 { 4349 ibd_state_t *state = arg; 4350 kthread_t *kht; 4351 int err; 4352 4353 if (state->id_mac_state & IBD_DRV_STARTED) 4354 return (DDI_SUCCESS); 4355 4356 /* 4357 * Get port details; if we fail here, very likely the port 4358 * state is inactive or the pkey can't be verified 4359 */ 4360 if (ibd_get_port_details(state) != DDI_SUCCESS) { 4361 DPRINT(10, "ibd_m_start: ibd_get_port_details() failed"); 4362 return (EAGAIN); 4363 } 4364 state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED; 4365 4366 /* 4367 * Find the IPoIB broadcast group 4368 */ 4369 if (ibd_find_bgroup(state) != IBT_SUCCESS) { 4370 DPRINT(10, "ibd_m_start: ibd_find_bgroup() failed"); 4371 err = ENOENT; 4372 goto m_start_fail; 4373 } 4374 state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND; 4375 4376 /* 4377 * Initialize per-interface caches and lists; if we fail here, 4378 * it is most likely due to a lack of resources 4379 */ 4380 if (ibd_acache_init(state) != DDI_SUCCESS) { 4381 DPRINT(10, "ibd_m_start: ibd_acache_init() failed"); 4382 err = ENOMEM; 4383 goto m_start_fail; 4384 } 4385 state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED; 4386 4387 /* 4388 * Allocate send and receive completion queues 4389 */ 4390 if (ibd_alloc_cqs(state) != DDI_SUCCESS) { 4391 DPRINT(10, "ibd_m_start: ibd_alloc_cqs() failed"); 4392 err = ENOMEM; 4393 goto m_start_fail; 4394 } 4395 state->id_mac_state |= IBD_DRV_CQS_ALLOCD; 4396 4397 /* 4398 * Setup a UD channel 4399 */ 4400 if (ibd_setup_ud_channel(state) != DDI_SUCCESS) { 4401 err = ENOMEM; 4402 DPRINT(10, "ibd_m_start: ibd_setup_ud_channel() failed"); 4403 goto m_start_fail; 4404 } 4405 state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP; 4406 4407 /* 4408 * Allocate and initialize the tx buffer list 4409 */ 4410 if (ibd_init_txlist(state) != DDI_SUCCESS) { 4411 DPRINT(10, "ibd_m_start: ibd_init_txlist() failed"); 4412 err = ENOMEM; 4413 goto m_start_fail; 4414 } 4415 state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD; 4416 4417 /* 4418 * If we have separate cqs, create the send cq handler here 4419 */ 4420 if ((ibd_separate_cqs == 1) && (ibd_txcomp_poll == 0)) { 4421 ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state); 4422 if (ibt_enable_cq_notify(state->id_scq_hdl, 4423 IBT_NEXT_COMPLETION) != IBT_SUCCESS) { 4424 DPRINT(10, 4425 "ibd_m_start: ibt_enable_cq_notify(scq) failed"); 4426 err = EINVAL; 4427 goto m_start_fail; 4428 } 4429 state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED; 4430 } 4431 4432 /* 4433 * Allocate and initialize the rx buffer list 4434 */ 4435 if (ibd_init_rxlist(state) != DDI_SUCCESS) { 4436 DPRINT(10, "ibd_m_start: ibd_init_rxlist() failed"); 4437 err = ENOMEM; 4438 goto m_start_fail; 4439 } 4440 state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD; 4441 4442 /* 4443 * Join IPoIB broadcast group 4444 */ 4445 if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) { 4446 DPRINT(10, "ibd_m_start: ibd_join_group() failed"); 4447 err = EINVAL; 4448 goto m_start_fail; 4449 } 4450 state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED; 4451 4452 /* 4453 * Create the async thread; thread_create never fails. 4454 */ 4455 kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0, 4456 TS_RUN, minclsyspri); 4457 state->id_async_thrid = kht->t_did; 4458 state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED; 4459 4460 /* 4461 * When we did mac_register() in ibd_attach(), we didn't register 4462 * the real macaddr and we didn't have the true port mtu. 
Now that 4463 * we're almost ready, set the local mac address and broadcast 4464 * addresses and update gldv3 about the real values of these 4465 * parameters. 4466 */ 4467 ibd_h2n_mac(&state->id_macaddr, state->id_qpnum, 4468 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 4469 ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK, 4470 state->id_mgid.gid_prefix, state->id_mgid.gid_guid); 4471 4472 mac_maxsdu_update(state->id_mh, state->id_mtu - IPOIB_HDRSIZE); 4473 mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr); 4474 4475 /* 4476 * Setup the receive cq handler 4477 */ 4478 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); 4479 if (ibt_enable_cq_notify(state->id_rcq_hdl, 4480 IBT_NEXT_COMPLETION) != IBT_SUCCESS) { 4481 DPRINT(10, "ibd_m_start: ibt_enable_cq_notify(rcq) failed"); 4482 err = EINVAL; 4483 goto m_start_fail; 4484 } 4485 state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED; 4486 4487 /* 4488 * Setup the subnet notices handler after we've initialized the acache/ 4489 * mcache and started the async thread, both of which are required for 4490 * the trap handler to function properly. 4491 * 4492 * Now that the async thread has been started (and we've already done 4493 * a mac_register() during attach so mac_tx_update() can be called 4494 * if necessary without any problem), we can enable the trap handler 4495 * to queue requests to the async thread. 4496 */ 4497 ibt_register_subnet_notices(state->id_ibt_hdl, 4498 ibd_snet_notices_handler, state); 4499 mutex_enter(&state->id_trap_lock); 4500 state->id_trap_stop = B_FALSE; 4501 mutex_exit(&state->id_trap_lock); 4502 state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED; 4503 4504 /* 4505 * Indicate link status to GLDv3 and higher layers. By default, 4506 * we assume we are in up state (which must have been true at 4507 * least at the time the broadcast mcg's were probed); if there 4508 * were any up/down transitions till the time we come here, the 4509 * async handler will have updated last known state, which we 4510 * use to tell GLDv3. The async handler will not send any 4511 * notifications to GLDv3 till we reach here in the initialization 4512 * sequence. 4513 */ 4514 state->id_mac_state |= IBD_DRV_STARTED; 4515 mac_link_update(state->id_mh, state->id_link_state); 4516 4517 return (DDI_SUCCESS); 4518 4519 m_start_fail: 4520 /* 4521 * If we ran into a problem during ibd_m_start() and ran into 4522 * some other problem during undoing our partial work, we can't 4523 * do anything about it. Ignore any errors we might get from 4524 * ibd_undo_m_start() and just return the original error we got. 4525 */ 4526 (void) ibd_undo_m_start(state); 4527 return (err); 4528 } 4529 4530 /* 4531 * GLDv3 entry point to stop hardware from receiving packets. 4532 */ 4533 /*ARGSUSED*/ 4534 static void 4535 ibd_m_stop(void *arg) 4536 { 4537 ibd_state_t *state = arg; 4538 4539 /* 4540 * Since ibd_m_stop() doesn't expect any return, we cannot 4541 * fail even if we run into some problem with ibd_undo_m_start(). 4542 * The best we can do is to leave it in a good state, so 4543 * perhaps a future unplumb will succeed. 4544 */ 4545 (void) ibd_undo_m_start(state); 4546 } 4547 4548 /* 4549 * GLDv3 entry point to modify device's mac address. We do not 4550 * allow address modifications. 4551 */ 4552 static int 4553 ibd_m_unicst(void *arg, const uint8_t *macaddr) 4554 { 4555 ibd_state_t *state = arg; 4556 4557 /* 4558 * Don't bother even comparing the macaddr if we haven't 4559 * completed ibd_m_start(). 
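 * Once started, all we do is verify that the requested address matches the one we constructed from the QPN and port GID in ibd_m_start().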
4560 */ 4561 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 4562 return (0); 4563 4564 if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0) 4565 return (0); 4566 else 4567 return (EINVAL); 4568 } 4569 4570 /* 4571 * The blocking part of the IBA join/leave operations is done out 4572 * of here on the async thread. 4573 */ 4574 static void 4575 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op) 4576 { 4577 DPRINT(3, "ibd_async_multicast : async_setmc op %d :" 4578 "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid); 4579 4580 if (op == IBD_ASYNC_JOIN) { 4581 if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) { 4582 ibd_print_warn(state, "Join multicast group failed :" 4583 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); 4584 } 4585 } else { 4586 /* 4587 * Here, we must search for the proper mcg_info and 4588 * use that to leave the group. 4589 */ 4590 ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL); 4591 } 4592 } 4593 4594 /* 4595 * GLDv3 entry point for multicast enable/disable requests. 4596 * This function queues the operation to the async thread and 4597 * returns success for a valid multicast address. 4598 */ 4599 static int 4600 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac) 4601 { 4602 ibd_state_t *state = (ibd_state_t *)arg; 4603 ipoib_mac_t maddr, *mcast; 4604 ib_gid_t mgid; 4605 ibd_req_t *req; 4606 4607 /* 4608 * If we haven't completed ibd_m_start(), the async thread wouldn't 4609 * have been started and id_bcaddr wouldn't be set, so there's 4610 * no point in continuing. 4611 */ 4612 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 4613 return (0); 4614 4615 /* 4616 * The incoming multicast address might not be aligned properly 4617 * on a 4 byte boundary to be considered an ipoib_mac_t. We force 4618 * it to look like one though, to get the offsets of the mc gid, 4619 * since we know we are not going to dereference any values with 4620 * the ipoib_mac_t pointer. 4621 */ 4622 bcopy(mcmac, &maddr, sizeof (ipoib_mac_t)); 4623 mcast = &maddr; 4624 4625 /* 4626 * Check validity of MCG address. We could additionally check 4627 * that an enable/disable is not being issued on the "broadcast" 4628 * mcg, but since this operation is only invokable by privileged 4629 * programs anyway, we allow the flexibility to those dlpi apps. 4630 * Note that we do not validate the "scope" of the IBA mcg. 4631 */ 4632 if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN) 4633 return (EINVAL); 4634 4635 /* 4636 * fill in multicast pkey and scope 4637 */ 4638 IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey); 4639 4640 /* 4641 * If someone is trying to JOIN/LEAVE the broadcast group, we do 4642 * nothing (i.e. we stay JOINed to the broadcast group, as done in 4643 * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically 4644 * requires us to be joined to broadcast groups at all times. 4645 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also 4646 * depends on this.
4647 */ 4648 if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0) 4649 return (0); 4650 4651 ibd_n2h_gid(mcast, &mgid); 4652 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 4653 if (req == NULL) 4654 return (ENOMEM); 4655 4656 req->rq_gid = mgid; 4657 4658 if (add) { 4659 DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n", 4660 mgid.gid_prefix, mgid.gid_guid); 4661 ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN); 4662 } else { 4663 DPRINT(1, "ibd_m_multicst : unset_multicast : " 4664 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); 4665 ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE); 4666 } 4667 return (0); 4668 } 4669 4670 /* 4671 * The blocking part of the IBA promiscuous operations is done 4672 * out of here on the async thread. This can be invoked either 4673 * due to a dlpi request or due to 4674 * a port up/down event. 4675 */ 4676 static void 4677 ibd_async_unsetprom(ibd_state_t *state) 4678 { 4679 ibd_mce_t *mce = list_head(&state->id_mc_non); 4680 ib_gid_t mgid; 4681 4682 DPRINT(2, "ibd_async_unsetprom : async_unset_promisc"); 4683 4684 while (mce != NULL) { 4685 mgid = mce->mc_info.mc_adds_vect.av_dgid; 4686 mce = list_next(&state->id_mc_non, mce); 4687 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 4688 } 4689 state->id_prom_op = IBD_OP_NOTSTARTED; 4690 } 4691 4692 /* 4693 * The blocking part of the IBA promiscuous operations is done 4694 * out of here on the async thread. This can be invoked either 4695 * due to a dlpi request or due to 4696 * a port up/down event. 4697 */ 4698 static void 4699 ibd_async_setprom(ibd_state_t *state) 4700 { 4701 ibt_mcg_attr_t mcg_attr; 4702 ibt_mcg_info_t *mcg_info; 4703 ib_gid_t mgid; 4704 uint_t numg; 4705 int i; 4706 char ret = IBD_OP_COMPLETED; 4707 4708 DPRINT(2, "ibd_async_setprom : async_set_promisc"); 4709 4710 /* 4711 * Obtain all active MC groups on the IB fabric with 4712 * specified criteria (scope + Pkey + Qkey + mtu). 4713 */ 4714 bzero(&mcg_attr, sizeof (mcg_attr)); 4715 mcg_attr.mc_pkey = state->id_pkey; 4716 mcg_attr.mc_scope = state->id_scope; 4717 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 4718 mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu; 4719 mcg_attr.mc_mtu_req.r_selector = IBT_EQU; 4720 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) != 4721 IBT_SUCCESS) { 4722 ibd_print_warn(state, "Could not get list of IBA multicast " 4723 "groups"); 4724 ret = IBD_OP_ERRORED; 4725 goto done; 4726 } 4727 4728 /* 4729 * Iterate over the returned mcg's and join as NonMember 4730 * to the IP mcg's. 4731 */ 4732 for (i = 0; i < numg; i++) { 4733 /* 4734 * Do a NonMember JOIN on the MC group. 4735 */ 4736 mgid = mcg_info[i].mc_adds_vect.av_dgid; 4737 if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL) 4738 ibd_print_warn(state, "IBA promiscuous mode missed " 4739 "multicast gid %016llx:%016llx", 4740 (u_longlong_t)mgid.gid_prefix, 4741 (u_longlong_t)mgid.gid_guid); 4742 } 4743 4744 ibt_free_mcg_info(mcg_info, numg); 4745 DPRINT(4, "ibd_async_setprom : async_set_promisc completes"); 4746 done: 4747 state->id_prom_op = ret; 4748 } 4749 4750 /* 4751 * GLDv3 entry point for multicast promiscuous enable/disable requests. 4752 * GLDv3 assumes phys state receives more packets than multi state, 4753 * which is not true for IPoIB. Thus, treat the multi and phys 4754 * promiscuous states the same way to work with GLDv3's assumption.
4755 */ 4756 static int 4757 ibd_m_promisc(void *arg, boolean_t on) 4758 { 4759 ibd_state_t *state = (ibd_state_t *)arg; 4760 ibd_req_t *req; 4761 4762 /* 4763 * Async thread wouldn't have been started if we haven't 4764 * passed ibd_m_start() 4765 */ 4766 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 4767 return (0); 4768 4769 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 4770 if (req == NULL) 4771 return (ENOMEM); 4772 if (on) { 4773 DPRINT(1, "ibd_m_promisc : set_promisc : %d", on); 4774 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON); 4775 } else { 4776 DPRINT(1, "ibd_m_promisc : unset_promisc"); 4777 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF); 4778 } 4779 4780 return (0); 4781 } 4782 4783 /* 4784 * GLDv3 entry point for gathering statistics. 4785 */ 4786 static int 4787 ibd_m_stat(void *arg, uint_t stat, uint64_t *val) 4788 { 4789 ibd_state_t *state = (ibd_state_t *)arg; 4790 4791 switch (stat) { 4792 case MAC_STAT_IFSPEED: 4793 *val = state->id_link_speed; 4794 break; 4795 case MAC_STAT_MULTIRCV: 4796 *val = state->id_multi_rcv; 4797 break; 4798 case MAC_STAT_BRDCSTRCV: 4799 *val = state->id_brd_rcv; 4800 break; 4801 case MAC_STAT_MULTIXMT: 4802 *val = state->id_multi_xmt; 4803 break; 4804 case MAC_STAT_BRDCSTXMT: 4805 *val = state->id_brd_xmt; 4806 break; 4807 case MAC_STAT_RBYTES: 4808 *val = state->id_rcv_bytes; 4809 break; 4810 case MAC_STAT_IPACKETS: 4811 *val = state->id_rcv_pkt; 4812 break; 4813 case MAC_STAT_OBYTES: 4814 *val = state->id_xmt_bytes; 4815 break; 4816 case MAC_STAT_OPACKETS: 4817 *val = state->id_xmt_pkt; 4818 break; 4819 case MAC_STAT_OERRORS: 4820 *val = state->id_ah_error; /* failed AH translation */ 4821 break; 4822 case MAC_STAT_IERRORS: 4823 *val = 0; 4824 break; 4825 case MAC_STAT_NOXMTBUF: 4826 *val = state->id_tx_short; 4827 break; 4828 case MAC_STAT_NORCVBUF: 4829 default: 4830 return (ENOTSUP); 4831 } 4832 4833 return (0); 4834 } 4835 4836 static void 4837 ibd_async_txsched(ibd_state_t *state) 4838 { 4839 ibd_req_t *req; 4840 int ret; 4841 4842 if (ibd_txcomp_poll) 4843 ibd_poll_compq(state, state->id_scq_hdl); 4844 4845 ret = ibd_resume_transmission(state); 4846 if (ret && ibd_txcomp_poll) { 4847 if (req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP)) 4848 ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED); 4849 else { 4850 ibd_print_warn(state, "ibd_async_txsched: " 4851 "no memory, can't schedule work slot"); 4852 } 4853 } 4854 } 4855 4856 static int 4857 ibd_resume_transmission(ibd_state_t *state) 4858 { 4859 int flag; 4860 int met_thresh = 0; 4861 int ret = -1; 4862 4863 mutex_enter(&state->id_sched_lock); 4864 if (state->id_sched_needed & IBD_RSRC_SWQE) { 4865 met_thresh = (state->id_tx_list.dl_cnt > 4866 IBD_FREE_SWQES_THRESH); 4867 flag = IBD_RSRC_SWQE; 4868 } else if (state->id_sched_needed & IBD_RSRC_LSOBUF) { 4869 ASSERT(state->id_lso != NULL); 4870 met_thresh = (state->id_lso->bkt_nfree > 4871 IBD_FREE_LSOS_THRESH); 4872 flag = IBD_RSRC_LSOBUF; 4873 } 4874 if (met_thresh) { 4875 state->id_sched_needed &= ~flag; 4876 ret = 0; 4877 } 4878 mutex_exit(&state->id_sched_lock); 4879 4880 if (ret == 0) 4881 mac_tx_update(state->id_mh); 4882 4883 return (ret); 4884 } 4885 4886 /* 4887 * Release the send wqe back into free list. 4888 */ 4889 static void 4890 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *swqe) 4891 { 4892 /* 4893 * Add back on Tx list for reuse. 
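 * dl_pending_sends is cleared since at least one swqe is available again.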
4894 */ 4895 swqe->swqe_next = NULL; 4896 mutex_enter(&state->id_tx_list.dl_mutex); 4897 if (state->id_tx_list.dl_pending_sends) { 4898 state->id_tx_list.dl_pending_sends = B_FALSE; 4899 } 4900 if (state->id_tx_list.dl_head == NULL) { 4901 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); 4902 } else { 4903 state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe); 4904 } 4905 state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); 4906 state->id_tx_list.dl_cnt++; 4907 mutex_exit(&state->id_tx_list.dl_mutex); 4908 } 4909 4910 /* 4911 * Acquire a send wqe from free list. 4912 * Returns error number and send wqe pointer. 4913 */ 4914 static int 4915 ibd_acquire_swqe(ibd_state_t *state, ibd_swqe_t **swqe) 4916 { 4917 int rc = 0; 4918 ibd_swqe_t *wqe; 4919 4920 /* 4921 * Check and reclaim some of the completed Tx requests. 4922 * If someone else is already in this code and pulling Tx 4923 * completions, no need to poll, since the current lock holder 4924 * will do the work anyway. Normally, we poll for completions 4925 * every few Tx attempts, but if we are short on Tx descriptors, 4926 * we always try to poll. 4927 */ 4928 if ((ibd_txcomp_poll == 1) && 4929 (state->id_tx_list.dl_cnt < IBD_TX_POLL_THRESH)) { 4930 ibd_poll_compq(state, state->id_scq_hdl); 4931 } 4932 4933 /* 4934 * Grab required transmit wqes. 4935 */ 4936 mutex_enter(&state->id_tx_list.dl_mutex); 4937 wqe = WQE_TO_SWQE(state->id_tx_list.dl_head); 4938 if (wqe != NULL) { 4939 state->id_tx_list.dl_cnt -= 1; 4940 state->id_tx_list.dl_head = wqe->swqe_next; 4941 if (state->id_tx_list.dl_tail == SWQE_TO_WQE(wqe)) 4942 state->id_tx_list.dl_tail = NULL; 4943 } else { 4944 /* 4945 * If we did not find the number we were looking for, flag 4946 * no resource. Adjust list appropriately in either case. 4947 */ 4948 rc = ENOENT; 4949 state->id_tx_list.dl_pending_sends = B_TRUE; 4950 DPRINT(5, "ibd_acquire_swqe: out of Tx wqe"); 4951 atomic_add_64(&state->id_tx_short, 1); 4952 } 4953 mutex_exit(&state->id_tx_list.dl_mutex); 4954 *swqe = wqe; 4955 4956 return (rc); 4957 } 4958 4959 static int 4960 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss, 4961 ibt_ud_dest_hdl_t ud_dest) 4962 { 4963 mblk_t *nmp; 4964 int iph_len, tcph_len; 4965 ibt_wr_lso_t *lso; 4966 uintptr_t ip_start, tcp_start; 4967 uint8_t *dst; 4968 uint_t pending, mblen; 4969 4970 /* 4971 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default; 4972 * we need to adjust it here for lso. 4973 */ 4974 lso = &(node->w_swr.wr.ud_lso); 4975 lso->lso_ud_dest = ud_dest; 4976 lso->lso_mss = mss; 4977 4978 /* 4979 * Calculate the LSO header size and set it in the UD LSO structure. 4980 * Note that the only assumption we make is that each of the IPoIB, 4981 * IP and TCP headers will be contained in a single mblk fragment; 4982 * together, the headers may span multiple mblk fragments. 
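 * For example, for a header chain with no IP or TCP options this works out to IPOIB_HDRSIZE + 20 (IP) + 20 (TCP) bytes.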
4983 */ 4984 nmp = mp; 4985 ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE; 4986 if (ip_start >= (uintptr_t)(nmp->b_wptr)) { 4987 ip_start = (uintptr_t)nmp->b_cont->b_rptr 4988 + (ip_start - (uintptr_t)(nmp->b_wptr)); 4989 nmp = nmp->b_cont; 4990 4991 } 4992 iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start); 4993 4994 tcp_start = ip_start + iph_len; 4995 if (tcp_start >= (uintptr_t)(nmp->b_wptr)) { 4996 tcp_start = (uintptr_t)nmp->b_cont->b_rptr 4997 + (tcp_start - (uintptr_t)(nmp->b_wptr)); 4998 nmp = nmp->b_cont; 4999 } 5000 tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start); 5001 lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len; 5002 5003 /* 5004 * If the lso header fits entirely within a single mblk fragment, 5005 * we'll avoid an additional copy of the lso header here and just 5006 * pass the b_rptr of the mblk directly. 5007 * 5008 * If this isn't true, we'd have to allocate for it explicitly. 5009 */ 5010 if (lso->lso_hdr_sz <= MBLKL(mp)) { 5011 lso->lso_hdr = mp->b_rptr; 5012 } else { 5013 /* On work completion, remember to free this allocated hdr */ 5014 lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP); 5015 if (lso->lso_hdr == NULL) { 5016 DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, " 5017 "sz = %d", lso->lso_hdr_sz); 5018 lso->lso_hdr_sz = 0; 5019 lso->lso_mss = 0; 5020 return (-1); 5021 } 5022 } 5023 5024 /* 5025 * Copy in the lso header only if we need to 5026 */ 5027 if (lso->lso_hdr != mp->b_rptr) { 5028 dst = lso->lso_hdr; 5029 pending = lso->lso_hdr_sz; 5030 5031 for (nmp = mp; nmp && pending; nmp = nmp->b_cont) { 5032 mblen = MBLKL(nmp); 5033 if (pending > mblen) { 5034 bcopy(nmp->b_rptr, dst, mblen); 5035 dst += mblen; 5036 pending -= mblen; 5037 } else { 5038 bcopy(nmp->b_rptr, dst, pending); 5039 break; 5040 } 5041 } 5042 } 5043 5044 return (0); 5045 } 5046 5047 static void 5048 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp) 5049 { 5050 ibt_wr_lso_t *lso; 5051 5052 if ((!node) || (!mp)) 5053 return; 5054 5055 /* 5056 * Free any header space that we might've allocated if we 5057 * did an LSO 5058 */ 5059 if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) { 5060 lso = &(node->w_swr.wr.ud_lso); 5061 if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) { 5062 kmem_free(lso->lso_hdr, lso->lso_hdr_sz); 5063 lso->lso_hdr = NULL; 5064 lso->lso_hdr_sz = 0; 5065 } 5066 } 5067 } 5068 5069 static void 5070 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node) 5071 { 5072 uint_t i; 5073 uint_t num_posted; 5074 uint_t n_wrs; 5075 ibt_status_t ibt_status; 5076 ibt_send_wr_t wrs[IBD_MAX_POST_MULTIPLE]; 5077 ibd_swqe_t *elem; 5078 ibd_swqe_t *nodes[IBD_MAX_POST_MULTIPLE]; 5079 5080 node->swqe_next = NULL; 5081 5082 mutex_enter(&state->id_txpost_lock); 5083 5084 /* 5085 * Enqueue the new node in chain of wqes to send 5086 */ 5087 if (state->id_tx_head) { 5088 *(state->id_tx_tailp) = (ibd_wqe_t *)node; 5089 } else { 5090 state->id_tx_head = node; 5091 } 5092 state->id_tx_tailp = &(node->swqe_next); 5093 5094 /* 5095 * If someone else is helping out with the sends, 5096 * just go back 5097 */ 5098 if (state->id_tx_busy) { 5099 mutex_exit(&state->id_txpost_lock); 5100 return; 5101 } 5102 5103 /* 5104 * Otherwise, mark the flag to indicate that we'll be 5105 * doing the dispatch of what's there in the wqe chain 5106 */ 5107 state->id_tx_busy = 1; 5108 5109 while (state->id_tx_head) { 5110 /* 5111 * Collect pending requests, IBD_MAX_POST_MULTIPLE wrs 5112 * at a time if possible, and keep posting them. 
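 * While id_tx_busy is held by this thread, other senders simply append their wqes to the tail of the chain and return; we keep draining until the chain is empty.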
5113 */ 5114 for (n_wrs = 0, elem = state->id_tx_head; 5115 (elem) && (n_wrs < IBD_MAX_POST_MULTIPLE); 5116 elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) { 5117 5118 nodes[n_wrs] = elem; 5119 wrs[n_wrs] = elem->w_swr; 5120 } 5121 state->id_tx_head = elem; 5122 5123 /* 5124 * Release the txpost lock before posting the 5125 * send request to the hca; if the posting fails 5126 * for some reason, we'll never receive completion 5127 * intimation, so we'll need to cleanup. 5128 */ 5129 mutex_exit(&state->id_txpost_lock); 5130 5131 ASSERT(n_wrs != 0); 5132 5133 /* 5134 * If posting fails for some reason, we'll never receive 5135 * completion intimation, so we'll need to cleanup. But 5136 * we need to make sure we don't clean up nodes whose 5137 * wrs have been successfully posted. We assume that the 5138 * hca driver returns on the first failure to post and 5139 * therefore the first 'num_posted' entries don't need 5140 * cleanup here. 5141 */ 5142 num_posted = 0; 5143 ibt_status = ibt_post_send(state->id_chnl_hdl, 5144 wrs, n_wrs, &num_posted); 5145 if (ibt_status != IBT_SUCCESS) { 5146 5147 ibd_print_warn(state, "ibd_post_send: " 5148 "posting multiple wrs failed: " 5149 "requested=%d, done=%d, ret=%d", 5150 n_wrs, num_posted, ibt_status); 5151 5152 for (i = num_posted; i < n_wrs; i++) 5153 ibd_tx_cleanup(state, nodes[i]); 5154 } 5155 5156 /* 5157 * Grab the mutex before we go and check the tx Q again 5158 */ 5159 mutex_enter(&state->id_txpost_lock); 5160 } 5161 5162 state->id_tx_busy = 0; 5163 mutex_exit(&state->id_txpost_lock); 5164 } 5165 5166 static int 5167 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node, 5168 uint_t lsohdr_sz) 5169 { 5170 ibt_wr_ds_t *sgl; 5171 ibt_status_t ibt_status; 5172 mblk_t *nmp; 5173 mblk_t *data_mp; 5174 uchar_t *bufp; 5175 size_t blksize; 5176 size_t skip; 5177 size_t avail; 5178 uint_t pktsize; 5179 uint_t frag_len; 5180 uint_t pending_hdr; 5181 uint_t hiwm; 5182 int nmblks; 5183 int i; 5184 5185 /* 5186 * Let's skip ahead to the data if this is LSO 5187 */ 5188 data_mp = mp; 5189 pending_hdr = 0; 5190 if (lsohdr_sz) { 5191 pending_hdr = lsohdr_sz; 5192 for (nmp = mp; nmp; nmp = nmp->b_cont) { 5193 frag_len = nmp->b_wptr - nmp->b_rptr; 5194 if (frag_len > pending_hdr) 5195 break; 5196 pending_hdr -= frag_len; 5197 } 5198 data_mp = nmp; /* start of data past lso header */ 5199 ASSERT(data_mp != NULL); 5200 } 5201 5202 /* 5203 * Calculate the size of message data and number of msg blocks 5204 */ 5205 pktsize = 0; 5206 for (nmblks = 0, nmp = data_mp; nmp != NULL; 5207 nmp = nmp->b_cont, nmblks++) { 5208 pktsize += MBLKL(nmp); 5209 } 5210 pktsize -= pending_hdr; 5211 5212 /* 5213 * Translating the virtual address regions into physical regions 5214 * for using the Reserved LKey feature results in a wr sgl that 5215 * is a little longer. Since failing ibt_map_mem_iov() is costly, 5216 * we'll fix a high-water mark (65%) for when we should stop. 5217 */ 5218 hiwm = (state->id_max_sqseg * 65) / 100; 5219 5220 /* 5221 * We only do ibt_map_mem_iov() if the pktsize is above the 5222 * "copy-threshold", and if the number of mp fragments is less than 5223 * the maximum acceptable. 
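 * Otherwise we fall through to the bcopy path below: the per-swqe Tx copy buffer for packets up to id_tx_buf_sz, or a chain of pre-mapped LSO buffers for anything larger.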
5224 */ 5225 if ((state->id_hca_res_lkey_capab) && 5226 (pktsize > IBD_TX_COPY_THRESH) && 5227 (nmblks < hiwm)) { 5228 ibt_iov_t iov_arr[IBD_MAX_SQSEG]; 5229 ibt_iov_attr_t iov_attr; 5230 5231 iov_attr.iov_as = NULL; 5232 iov_attr.iov = iov_arr; 5233 iov_attr.iov_buf = NULL; 5234 iov_attr.iov_list_len = nmblks; 5235 iov_attr.iov_wr_nds = state->id_max_sqseg; 5236 iov_attr.iov_lso_hdr_sz = lsohdr_sz; 5237 iov_attr.iov_flags = IBT_IOV_SLEEP; 5238 5239 for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) { 5240 iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr; 5241 iov_arr[i].iov_len = MBLKL(nmp); 5242 if (i == 0) { 5243 iov_arr[i].iov_addr += pending_hdr; 5244 iov_arr[i].iov_len -= pending_hdr; 5245 } 5246 } 5247 5248 node->w_buftype = IBD_WQE_MAPPED; 5249 node->w_swr.wr_sgl = node->w_sgl; 5250 5251 ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr, 5252 (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl); 5253 if (ibt_status != IBT_SUCCESS) { 5254 ibd_print_warn(state, "ibd_send: ibt_map_mem_iov " 5255 "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status); 5256 goto ibd_copy_path; 5257 } 5258 5259 return (0); 5260 } 5261 5262 ibd_copy_path: 5263 if (pktsize <= state->id_tx_buf_sz) { 5264 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 5265 node->w_swr.wr_nds = 1; 5266 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 5267 node->w_buftype = IBD_WQE_TXBUF; 5268 5269 /* 5270 * Even though this is the copy path for transfers less than 5271 * id_tx_buf_sz, it could still be an LSO packet. If so, it 5272 * is possible the first data mblk fragment (data_mp) still 5273 * contains part of the LSO header that we need to skip. 5274 */ 5275 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 5276 for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) { 5277 blksize = MBLKL(nmp) - pending_hdr; 5278 bcopy(nmp->b_rptr + pending_hdr, bufp, blksize); 5279 bufp += blksize; 5280 pending_hdr = 0; 5281 } 5282 5283 return (0); 5284 } 5285 5286 /* 5287 * Copy path for transfers greater than id_tx_buf_sz 5288 */ 5289 node->w_swr.wr_sgl = node->w_sgl; 5290 if (ibd_acquire_lsobufs(state, pktsize, 5291 node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) { 5292 DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed"); 5293 return (-1); 5294 } 5295 node->w_buftype = IBD_WQE_LSOBUF; 5296 5297 /* 5298 * Copy the larger-than-id_tx_buf_sz packet into a set of 5299 * fixed-sized, pre-mapped LSO buffers. Note that we might 5300 * need to skip part of the LSO header in the first fragment 5301 * as before. 5302 */ 5303 nmp = data_mp; 5304 skip = pending_hdr; 5305 for (i = 0; i < node->w_swr.wr_nds; i++) { 5306 sgl = node->w_swr.wr_sgl + i; 5307 bufp = (uchar_t *)(uintptr_t)sgl->ds_va; 5308 avail = IBD_LSO_BUFSZ; 5309 while (nmp && avail) { 5310 blksize = MBLKL(nmp) - skip; 5311 if (blksize > avail) { 5312 bcopy(nmp->b_rptr + skip, bufp, avail); 5313 skip += avail; 5314 avail = 0; 5315 } else { 5316 bcopy(nmp->b_rptr + skip, bufp, blksize); 5317 skip = 0; 5318 avail -= blksize; 5319 bufp += blksize; 5320 nmp = nmp->b_cont; 5321 } 5322 } 5323 } 5324 5325 return (0); 5326 } 5327 5328 /* 5329 * Schedule a completion queue polling to reap the resource we're 5330 * short on. If we implement the change to reap tx completions 5331 * in a separate thread, we'll need to wake up that thread here. 
5332 */ 5333 static int 5334 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag) 5335 { 5336 ibd_req_t *req; 5337 5338 mutex_enter(&state->id_sched_lock); 5339 state->id_sched_needed |= resource_type; 5340 mutex_exit(&state->id_sched_lock); 5341 5342 /* 5343 * If we are asked to queue a work entry, we need to do it 5344 */ 5345 if (q_flag) { 5346 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 5347 if (req == NULL) 5348 return (-1); 5349 5350 ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED); 5351 } 5352 5353 return (0); 5354 } 5355 5356 /* 5357 * The passed in packet has this format: 5358 * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data 5359 */ 5360 static boolean_t 5361 ibd_send(ibd_state_t *state, mblk_t *mp) 5362 { 5363 ibd_ace_t *ace; 5364 ibd_swqe_t *node; 5365 ipoib_mac_t *dest; 5366 ib_header_info_t *ipibp; 5367 ip6_t *ip6h; 5368 uint_t pktsize; 5369 uint32_t mss; 5370 uint32_t hckflags; 5371 uint32_t lsoflags = 0; 5372 uint_t lsohdr_sz = 0; 5373 int ret, len; 5374 boolean_t dofree = B_FALSE; 5375 boolean_t rc; 5376 5377 /* 5378 * If we aren't done with the device initialization and start, 5379 * we shouldn't be here. 5380 */ 5381 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 5382 return (B_FALSE); 5383 5384 node = NULL; 5385 if (ibd_acquire_swqe(state, &node) != 0) { 5386 /* 5387 * If we don't have an swqe available, schedule a transmit 5388 * completion queue cleanup and hold off on sending any 5389 * more packets until we have some free swqes 5390 */ 5391 if (ibd_sched_poll(state, IBD_RSRC_SWQE, ibd_txcomp_poll) == 0) 5392 return (B_FALSE); 5393 5394 /* 5395 * If a poll cannot be scheduled, we have no choice but 5396 * to drop this packet 5397 */ 5398 ibd_print_warn(state, "ibd_send: no swqe, pkt drop"); 5399 return (B_TRUE); 5400 } 5401 5402 /* 5403 * Initialize the commonly used fields in swqe to NULL to protect 5404 * against ibd_tx_cleanup accidentally misinterpreting these on a 5405 * failure. 5406 */ 5407 node->swqe_im_mblk = NULL; 5408 node->w_swr.wr_nds = 0; 5409 node->w_swr.wr_sgl = NULL; 5410 node->w_swr.wr_opcode = IBT_WRC_SEND; 5411 5412 /* 5413 * Obtain an address handle for the destination. 5414 */ 5415 ipibp = (ib_header_info_t *)mp->b_rptr; 5416 dest = (ipoib_mac_t *)&ipibp->ib_dst; 5417 if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 5418 IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey); 5419 5420 pktsize = msgsize(mp); 5421 5422 atomic_add_64(&state->id_xmt_bytes, pktsize); 5423 atomic_inc_64(&state->id_xmt_pkt); 5424 if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) 5425 atomic_inc_64(&state->id_brd_xmt); 5426 else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 5427 atomic_inc_64(&state->id_multi_xmt); 5428 5429 if ((ace = ibd_acache_lookup(state, dest, &ret, 1)) != NULL) { 5430 node->w_ahandle = ace; 5431 node->w_swr.wr.ud.udwr_dest = ace->ac_dest; 5432 } else { 5433 DPRINT(5, 5434 "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X", 5435 ((ret == EFAULT) ? "failed" : "queued"), 5436 htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]), 5437 htonl(dest->ipoib_gidpref[1]), 5438 htonl(dest->ipoib_gidsuff[0]), 5439 htonl(dest->ipoib_gidsuff[1])); 5440 node->w_ahandle = NULL; 5441 5442 /* 5443 * In poll mode, there are probably completions pending in the 5444 * cq, so ibd has to poll the cq here; otherwise the acache entry 5445 * may never be recycled.
		if (ibd_txcomp_poll == 1)
			ibd_poll_compq(state, state->id_scq_hdl);

		/*
		 * Here if ibd_acache_lookup() returns EFAULT, it means ibd
		 * cannot find a path for the specific dest address.  We
		 * should get rid of this kind of packet.  We also should get
		 * rid of the packet if we cannot schedule a poll via the
		 * async thread.  For the normal case, ibd will return the
		 * packet to the upper layer and wait for AH creation.
		 *
		 * Note that we always queue a work slot entry for the async
		 * thread when we fail AH lookup (even in intr mode); this is
		 * due to the convoluted way the code currently looks for AH.
		 */
		if (ret == EFAULT) {
			dofree = B_TRUE;
			rc = B_TRUE;
		} else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) {
			dofree = B_TRUE;
			rc = B_TRUE;
		} else {
			dofree = B_FALSE;
			rc = B_FALSE;
		}
		goto ibd_send_fail;
	}

	/*
	 * For ND6 packets, padding is at the front of the source lladdr;
	 * insert the padding here.
	 */
	if (ntohs(ipibp->ipib_rhdr.ipoib_type) == IP6_DL_SAP) {
		if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) {
			if (!pullupmsg(mp, IPV6_HDR_LEN +
			    sizeof (ib_header_info_t))) {
				DPRINT(10, "ibd_send: pullupmsg failure ");
				dofree = B_TRUE;
				rc = B_TRUE;
				goto ibd_send_fail;
			}
			ipibp = (ib_header_info_t *)mp->b_rptr;
		}
		ip6h = (ip6_t *)((uchar_t *)ipibp +
		    sizeof (ib_header_info_t));
		len = ntohs(ip6h->ip6_plen);
		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
			mblk_t *pad;

			pad = allocb(4, 0);
			if (pad == NULL) {
				DPRINT(10, "ibd_send: allocb failure ");
				dofree = B_TRUE;
				rc = B_TRUE;
				goto ibd_send_fail;
			}
			pad->b_wptr = (uchar_t *)pad->b_rptr + 4;
			linkb(mp, pad);
			if (MBLKL(mp) < sizeof (ib_header_info_t) +
			    IPV6_HDR_LEN + len + 4) {
				if (!pullupmsg(mp, sizeof (ib_header_info_t) +
				    IPV6_HDR_LEN + len + 4)) {
					DPRINT(10, "ibd_send: pullupmsg "
					    "failure ");
					dofree = B_TRUE;
					rc = B_TRUE;
					goto ibd_send_fail;
				}
				ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
				    sizeof (ib_header_info_t));
			}

			/* LINTED: E_CONSTANT_CONDITION */
			IBD_PAD_NSNA(ip6h, len, IBD_SEND);
		}
	}

	mp->b_rptr += sizeof (ib_addrs_t);

	/*
	 * Do LSO and checksum related work here.  For LSO send, set the
	 * ud destination, the opcode and the LSO header information in the
	 * work request.
	 */
	lso_info_get(mp, &mss, &lsoflags);
	if ((lsoflags & HW_LSO) != HW_LSO) {
		node->w_swr.wr_opcode = IBT_WRC_SEND;
		lsohdr_sz = 0;
	} else {
		if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) {
			/*
			 * The routine can only fail if there's no memory; we
			 * can only drop the packet if this happens
			 */
			ibd_print_warn(state,
			    "ibd_send: no memory, lso posting failed");
			dofree = B_TRUE;
			rc = B_TRUE;
			goto ibd_send_fail;
		}

		node->w_swr.wr_opcode = IBT_WRC_SEND_LSO;
		lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz;
	}

	hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &hckflags);
	if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM)
		node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM;
	else
		node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM;
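
	/*
	 * To recap the offload handling above: the mac-layer metadata on
	 * the mblk is translated into IBT work request settings.  HW_LSO
	 * and the mss select IBT_WRC_SEND_LSO and fill in the wr.ud_lso
	 * header via ibd_setup_lso(), while HCK_FULLCKSUM toggles
	 * IBT_WR_SEND_CKSUM so the HCA computes the checksum on transmit.
	 */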
	/*
	 * Prepare the sgl for posting; the routine can only fail if there's
	 * no lso buf available for posting.  If this is the case, we should
	 * probably reschedule for lso bufs to become available and then
	 * try again.
	 */
	if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) {
		if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) {
			dofree = B_TRUE;
			rc = B_TRUE;
		} else {
			dofree = B_FALSE;
			rc = B_FALSE;
		}
		goto ibd_send_fail;
	}
	node->swqe_im_mblk = mp;

	/*
	 * Queue the wqe to hardware; since we can now simply queue a
	 * post instead of doing it serially, we cannot assume anything
	 * about the 'node' after ibd_post_send() returns.
	 */
	ibd_post_send(state, node);

	return (B_TRUE);

ibd_send_fail:
	if (node && mp)
		ibd_free_lsohdr(node, mp);

	if (dofree)
		freemsg(mp);

	if (node != NULL)
		ibd_tx_cleanup(state, node);

	return (rc);
}

/*
 * GLDv3 entry point for transmitting a datagram.
 */
static mblk_t *
ibd_m_tx(void *arg, mblk_t *mp)
{
	ibd_state_t *state = (ibd_state_t *)arg;
	mblk_t *next;

	while (mp != NULL) {
		next = mp->b_next;
		mp->b_next = NULL;
		if (ibd_send(state, mp) == B_FALSE) {
			/* Send fail */
			mp->b_next = next;
			break;
		}
		mp = next;
	}

	return (mp);
}

/*
 * This handles Tx and Rx completions.  With separate CQs, this handles
 * only Rx completions.
 */
static uint_t
ibd_intr(char *arg)
{
	ibd_state_t *state = (ibd_state_t *)arg;

	ibd_poll_compq(state, state->id_rcq_hdl);

	return (DDI_INTR_CLAIMED);
}

/*
 * Poll and drain the cq
 */
static uint_t
ibd_drain_cq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl, ibt_wc_t *wcs,
    uint_t numwcs)
{
	ibd_wqe_t *wqe;
	ibt_wc_t *wc;
	uint_t total_polled = 0;
	uint_t num_polled;
	int i;

	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
		total_polled += num_polled;
		for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
			ASSERT((wqe->w_type == IBD_WQE_SEND) ||
			    (wqe->w_type == IBD_WQE_RECV));
			if (wc->wc_status != IBT_WC_SUCCESS) {
				/*
				 * Channel being torn down.
				 */
				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
					DPRINT(5, "ibd_drain_cq: flush error");
					/*
					 * Only invoke the Tx handler to
					 * release possibly held resources
					 * like AH refcount etc.  Cannot
					 * invoke the Rx handler because it
					 * might try adding buffers to the
					 * Rx pool when we are trying to
					 * deinitialize.
					 */
					if (wqe->w_type == IBD_WQE_RECV) {
						continue;
					} else {
						DPRINT(10, "ibd_drain_cq: Bad "
						    "status %d", wc->wc_status);
					}
				}
			}
			if (wqe->w_type == IBD_WQE_SEND) {
				ibd_tx_cleanup(state, WQE_TO_SWQE(wqe));
			} else {
				ibd_process_rx(state, WQE_TO_RWQE(wqe), wc);
			}
		}
	}

	return (total_polled);
}
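
/*
 * A note on ibd_drain_cq() above: ibt_poll_cq() is called repeatedly
 * until it stops returning IBT_SUCCESS (typically IBT_CQ_EMPTY once the
 * CQ has been drained).  Each work completion carries in wc_id the
 * ibd_wqe_t pointer that was stamped into the work request at post time,
 * which is how completions are matched back to their send or receive
 * wqes.
 */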
/*
 * Common code for interrupt handling as well as for polling
 * for all completed wqe's while detaching.
 */
static void
ibd_poll_compq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
{
	ibt_wc_t *wcs;
	uint_t numwcs;
	int flag, redo_flag;
	int redo = 1;
	uint_t num_polled = 0;

	if (ibd_separate_cqs == 1) {
		if (cq_hdl == state->id_rcq_hdl) {
			flag = IBD_RX_CQ_POLLING;
			redo_flag = IBD_REDO_RX_CQ_POLLING;
		} else {
			flag = IBD_TX_CQ_POLLING;
			redo_flag = IBD_REDO_TX_CQ_POLLING;
		}
	} else {
		flag = IBD_RX_CQ_POLLING | IBD_TX_CQ_POLLING;
		redo_flag = IBD_REDO_RX_CQ_POLLING | IBD_REDO_TX_CQ_POLLING;
	}

	mutex_enter(&state->id_cq_poll_lock);
	if (state->id_cq_poll_busy & flag) {
		state->id_cq_poll_busy |= redo_flag;
		mutex_exit(&state->id_cq_poll_lock);
		return;
	}
	state->id_cq_poll_busy |= flag;
	mutex_exit(&state->id_cq_poll_lock);

	/*
	 * In some cases (eg detaching), this code can be invoked on
	 * any cpu after disabling cq notification (thus no concurrency
	 * exists).  Apart from that, the following applies normally:
	 * The receive completion handling is always on the Rx interrupt
	 * cpu.  Transmit completion handling could be from any cpu if
	 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ
	 * is interrupt driven.  Combined completion handling is always
	 * on the interrupt cpu.  Thus, lock accordingly and use the
	 * proper completion array.
	 */
	if (ibd_separate_cqs == 1) {
		if (cq_hdl == state->id_rcq_hdl) {
			wcs = state->id_rxwcs;
			numwcs = state->id_rxwcs_size;
		} else {
			wcs = state->id_txwcs;
			numwcs = state->id_txwcs_size;
		}
	} else {
		wcs = state->id_rxwcs;
		numwcs = state->id_rxwcs_size;
	}

	/*
	 * Poll and drain the CQ
	 */
	num_polled = ibd_drain_cq(state, cq_hdl, wcs, numwcs);

	/*
	 * Enable CQ notifications and redrain the cq to catch any
	 * completions we might have missed after the ibd_drain_cq()
	 * above and before the ibt_enable_cq_notify() that follows.
	 * Finally, service any new requests to poll the cq that
	 * could've come in after the ibt_enable_cq_notify().
	 */
	do {
		if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) !=
		    IBT_SUCCESS) {
			DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
		}

		num_polled += ibd_drain_cq(state, cq_hdl, wcs, numwcs);

		mutex_enter(&state->id_cq_poll_lock);
		if (state->id_cq_poll_busy & redo_flag)
			state->id_cq_poll_busy &= ~redo_flag;
		else {
			state->id_cq_poll_busy &= ~flag;
			redo = 0;
		}
		mutex_exit(&state->id_cq_poll_lock);

	} while (redo);

	/*
	 * If we polled the receive cq and found anything, we need to flush
	 * it out to the nw layer here.
	 */
	if ((flag & IBD_RX_CQ_POLLING) && (num_polled > 0)) {
		ibd_flush_rx(state, NULL);
	}
}
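
/*
 * The id_cq_poll_busy handshake in ibd_poll_compq() amounts to the
 * following (illustrative pseudo-code only):
 *
 *	if (busy & flag)	set redo_flag; return (the owner redoes)
 *	else			set flag; drain; then loop {
 *		enable notify; drain again;
 *		if (redo_flag)	clear redo_flag; loop once more
 *		else		clear flag; done
 *	}
 *
 * A poll request that arrives while another thread is draining the same
 * CQ is thus never lost, and the caller never has to block.
 */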
/*
 * Unmap the memory area associated with a given swqe.
 */
static void
ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe)
{
	ibt_status_t stat;

	DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds);

	if (swqe->w_mi_hdl) {
		if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl,
		    swqe->w_mi_hdl)) != IBT_SUCCESS) {
			DPRINT(10,
			    "failed in ibt_unmap_mem_iov, ret=%d\n", stat);
		}
		swqe->w_mi_hdl = NULL;
	}
	swqe->w_swr.wr_nds = 0;
}

/*
 * Common code that deals with clean ups after a successful or
 * erroneous transmission attempt.
 */
static void
ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe)
{
	ibd_ace_t *ace = swqe->w_ahandle;

	DPRINT(20, "ibd_tx_cleanup %p\n", swqe);

	/*
	 * If this was a dynamic mapping in ibd_send(), we need to
	 * unmap here.  If this was an lso buffer we'd used for sending,
	 * we need to release the lso buf to the pool, since the resource
	 * is scarce.  However, if this was simply a normal send using
	 * the copybuf (present in each swqe), we don't need to release it.
	 */
	if (swqe->swqe_im_mblk != NULL) {
		if (swqe->w_buftype == IBD_WQE_MAPPED) {
			ibd_unmap_mem(state, swqe);
		} else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
			ibd_release_lsobufs(state,
			    swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
		}
		ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
		freemsg(swqe->swqe_im_mblk);
		swqe->swqe_im_mblk = NULL;
	}

	/*
	 * Drop the reference count on the AH; it can be reused
	 * now for a different destination if there are no more
	 * posted sends that will use it.  This can be eliminated
	 * if we can always associate each Tx buffer with an AH.
	 * The ace can be null if we are cleaning up from the
	 * ibd_send() error path.
	 */
	if (ace != NULL) {
		/*
		 * The recycling logic can be eliminated from here
		 * and put into the async thread if we create another
		 * list to hold ACE's for unjoined mcg's.
		 */
		if (DEC_REF_DO_CYCLE(ace)) {
			ibd_mce_t *mce;

			/*
			 * Check with the lock taken: we decremented the
			 * reference count without the lock, and some
			 * transmitter might already have bumped the
			 * reference count (possible in case of multicast
			 * disable when we leave the AH on the active
			 * list).  If not still 0, get out, leaving the
			 * recycle bit intact.
			 *
			 * Atomically transition the AH from active
			 * to free list, and queue a work request to
			 * leave the group and destroy the mce.  No
			 * transmitter can be looking at the AH or
			 * the MCE in between, since we have the
			 * ac_mutex lock.  In the SendOnly reap case,
			 * it is not necessary to hold the ac_mutex
			 * and recheck the ref count (since the AH was
			 * taken off the active list), we just do it
			 * to have uniform processing with the Full
			 * reap case.
			 */
			mutex_enter(&state->id_ac_mutex);
			mce = ace->ac_mce;
			if (GET_REF_CYCLE(ace) == 0) {
				CLEAR_REFCYCLE(ace);
				/*
				 * Identify the case of fullmember reap as
				 * opposed to mcg trap reap.  Also, port up
				 * might set ac_mce to NULL to indicate Tx
				 * cleanup should do no more than put the
				 * AH in the free list (see ibd_async_link).
				 */
				if (mce != NULL) {
					ace->ac_mce = NULL;
					IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
					/*
					 * mc_req was initialized at mce
					 * creation time.
					 */
					ibd_queue_work_slot(state,
					    &mce->mc_req, IBD_ASYNC_REAP);
				}
				IBD_ACACHE_INSERT_FREE(state, ace);
			}
			mutex_exit(&state->id_ac_mutex);
		}
	}

	/*
	 * Release the send wqe for reuse.
	 */
	ibd_release_swqe(state, swqe);
}

/*
 * Hand off the processed rx mp chain to mac_rx()
 */
static void
ibd_flush_rx(ibd_state_t *state, mblk_t *mpc)
{
	if (mpc == NULL) {
		mutex_enter(&state->id_rx_lock);

		mpc = state->id_rx_mp;

		state->id_rx_mp = NULL;
		state->id_rx_mp_tail = NULL;
		state->id_rx_mp_len = 0;

		mutex_exit(&state->id_rx_lock);
	}

	if (mpc) {
		mac_rx(state->id_mh, state->id_rh, mpc);
	}
}

/*
 * Processing to be done after receipt of a packet; hand off to GLD
 * in the format expected by GLD.  The received packet has this
 * format: 2b sap :: 00 :: data.
 */
static void
ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
{
	ib_header_info_t *phdr;
	mblk_t *mp;
	mblk_t *mpc = NULL;
	ipoib_hdr_t *ipibp;
	ipha_t *iphap;
	ip6_t *ip6h;
	int rxcnt, len;

	/*
	 * Track the number handed to the upper layer, and the number still
	 * available to receive packets.
	 */
	rxcnt = atomic_add_32_nv(&state->id_rx_list.dl_cnt, -1);
	ASSERT(rxcnt >= 0);
	atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, 1);

	/*
	 * Adjust write pointer depending on how much data came in.
	 */
	mp = rwqe->rwqe_im_mblk;
	mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer;

	/*
	 * Make sure this is NULL or we're in trouble.
	 */
	if (mp->b_next != NULL) {
		ibd_print_warn(state,
		    "ibd_process_rx: got duplicate mp from rcq?");
		mp->b_next = NULL;
	}

	/*
	 * The IB link layer may deliver a Global Routing Header (GRH)
	 * with the packet.  The ibd driver uses the information in the
	 * GRH to build the header_info structure that is passed up to
	 * GLDv3 along with the datagram.  If the GRH is not valid,
	 * indicate this to GLDv3 by setting the VerTcFlow field to 0.
	 */
	phdr = (ib_header_info_t *)mp->b_rptr;
	if (wc->wc_flags & IBT_WC_GRH_PRESENT) {
		phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn);

		/* if it is a loopback packet, just drop it. */
		if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr,
		    IPOIB_ADDRL) == 0) {
			freemsg(mp);
			return;
		}

		ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src,
		    sizeof (ipoib_mac_t));
		if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) {
			phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN);
			IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst);
		} else {
			phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn;
		}
	} else {
		/*
		 * It cannot be an IBA multicast packet.  Must have been
		 * unicast for us.  Just copy the interface address to dst.
		 */
		phdr->ib_grh.ipoib_vertcflow = 0;
		ovbcopy(&state->id_macaddr, &phdr->ib_dst,
		    sizeof (ipoib_mac_t));
	}
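
	/*
	 * At this point, the scratch area in front of the payload (where
	 * the GRH was, or would have been) has been rewritten as an
	 * ib_header_info_t pseudo header, so what goes up to GLDv3 looks
	 * roughly like:
	 *
	 *	ib_header_info_t (dst/src ipoib_mac_t) :: 2b sap :: 00 :: data
	 */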
	/*
	 * For ND6 packets, padding is at the front of the source/target
	 * lladdr.  However, the inet6 layer is not aware of it, hence
	 * remove the padding from such packets.
	 */
	ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t));
	if (ntohs(ipibp->ipoib_type) == IP6_DL_SAP) {
		if (MBLKL(mp) < sizeof (ipoib_hdr_t) + IPV6_HDR_LEN) {
			if (!pullupmsg(mp, IPV6_HDR_LEN +
			    sizeof (ipoib_hdr_t))) {
				DPRINT(10, "ibd_process_rx: pullupmsg failed");
				freemsg(mp);
				return;
			}
			ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr +
			    sizeof (ipoib_pgrh_t));
		}
		ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
		len = ntohs(ip6h->ip6_plen);
		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
			if (MBLKL(mp) < sizeof (ipoib_hdr_t) +
			    IPV6_HDR_LEN + len) {
				if (!pullupmsg(mp, sizeof (ipoib_hdr_t) +
				    IPV6_HDR_LEN + len)) {
					DPRINT(10, "ibd_process_rx: pullupmsg"
					    " failed");
					freemsg(mp);
					return;
				}
				ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
				    sizeof (ipoib_pgrh_t) +
				    sizeof (ipoib_hdr_t));
			}
			/* LINTED: E_CONSTANT_CONDITION */
			IBD_PAD_NSNA(ip6h, len, IBD_RECV);
		}
	}

	/*
	 * Update statistics
	 */
	atomic_add_64(&state->id_rcv_bytes, wc->wc_bytes_xfer);
	atomic_inc_64(&state->id_rcv_pkt);
	if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
		atomic_inc_64(&state->id_brd_rcv);
	else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
		atomic_inc_64(&state->id_multi_rcv);

	iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
	/*
	 * Set the receive checksum status in mp.
	 * Hardware checksumming can be considered valid only if:
	 * 1. CQE.IP_OK bit is set
	 * 2. CQE.CKSUM = 0xffff
	 * 3. IPv6 routing header is not present in the packet
	 * 4. There are no IP options in the IP header
	 */
	if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) &&
	    (wc->wc_cksum == 0xFFFF) &&
	    (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) {
		(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
		    HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);
	}

	/*
	 * Add this mp to the list of processed mp's to send to
	 * the nw layer
	 */
	mutex_enter(&state->id_rx_lock);
	if (state->id_rx_mp) {
		ASSERT(state->id_rx_mp_tail != NULL);
		state->id_rx_mp_tail->b_next = mp;
	} else {
		ASSERT(state->id_rx_mp_tail == NULL);
		state->id_rx_mp = mp;
	}

	state->id_rx_mp_tail = mp;
	state->id_rx_mp_len++;

	if (state->id_rx_mp_len >= IBD_MAX_RX_MP_LEN) {
		mpc = state->id_rx_mp;

		state->id_rx_mp = NULL;
		state->id_rx_mp_tail = NULL;
		state->id_rx_mp_len = 0;
	}

	mutex_exit(&state->id_rx_lock);

	if (mpc) {
		ibd_flush_rx(state, mpc);
	}
}
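
/*
 * Rx batching, to summarize the logic above: received mblks are chained
 * on id_rx_mp/id_rx_mp_tail and handed to mac_rx() in a single call once
 * IBD_MAX_RX_MP_LEN of them have accumulated; anything left over after a
 * poll pass is pushed up by the ibd_flush_rx(state, NULL) call made from
 * ibd_poll_compq().
 */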
/*
 * Callback code invoked from STREAMS when the receive data buffer is
 * free for recycling.
 */
static void
ibd_freemsg_cb(char *arg)
{
	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
	ibd_state_t *state = rwqe->w_state;

	/*
	 * If the wqe is being destroyed, do not attempt recycling.
	 */
	if (rwqe->w_freeing_wqe == B_TRUE) {
		DPRINT(6, "ibd_freemsg: wqe being freed");
		return;
	} else {
		/*
		 * The upper layer has released the held mblk, so we have
		 * no more use for keeping the old pointer in our rwqe.
		 */
		rwqe->rwqe_im_mblk = NULL;
	}

	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
	    state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
	if (rwqe->rwqe_im_mblk == NULL) {
		ibd_delete_rwqe(state, rwqe);
		ibd_free_rwqe(state, rwqe);
		DPRINT(6, "ibd_freemsg: desballoc failed");
		return;
	}

	if (ibd_post_recv(state, rwqe, B_TRUE) == DDI_FAILURE) {
		ibd_delete_rwqe(state, rwqe);
		ibd_free_rwqe(state, rwqe);
		return;
	}

	atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, -1);
}

static uint_t
ibd_tx_recycle(char *arg)
{
	ibd_state_t *state = (ibd_state_t *)arg;

	/*
	 * Poll for completed entries
	 */
	ibd_poll_compq(state, state->id_scq_hdl);

	/*
	 * Resume any blocked transmissions if possible
	 */
	(void) ibd_resume_transmission(state);

	return (DDI_INTR_CLAIMED);
}

#ifdef IBD_LOGGING
static void
ibd_log_init(void)
{
	ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP);
	ibd_lbuf_ndx = 0;
}

static void
ibd_log_fini(void)
{
	if (ibd_lbuf)
		kmem_free(ibd_lbuf, IBD_LOG_SZ);
	ibd_lbuf_ndx = 0;
	ibd_lbuf = NULL;
}

static void
ibd_log(const char *fmt, ...)
{
	va_list ap;
	uint32_t off;
	uint32_t msglen;
	char tmpbuf[IBD_DMAX_LINE];

	if (ibd_lbuf == NULL)
		return;

	va_start(ap, fmt);
	msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap);
	va_end(ap);

	if (msglen >= IBD_DMAX_LINE)
		msglen = IBD_DMAX_LINE - 1;

	mutex_enter(&ibd_lbuf_lock);

	off = ibd_lbuf_ndx;		/* current msg should go here */
	if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n'))
		ibd_lbuf[ibd_lbuf_ndx-1] = '\n';

	ibd_lbuf_ndx += msglen;		/* place where next msg should start */
	ibd_lbuf[ibd_lbuf_ndx] = 0;	/* current msg should terminate */

	if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE))
		ibd_lbuf_ndx = 0;

	mutex_exit(&ibd_lbuf_lock);

	bcopy(tmpbuf, ibd_lbuf+off, msglen);	/* no lock needed for this */
}
#endif
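
/*
 * When IBD_LOGGING is defined, trace messages are recorded in the
 * circular ibd_lbuf with calls along the lines of the (purely
 * hypothetical) example below:
 *
 *	ibd_log("ibd_send: posted swqe %p, pktsize %d", node, pktsize);
 *
 * The write index wraps back to the start of the buffer once it comes
 * within two lines (2 * IBD_DMAX_LINE) of IBD_LOG_SZ, so the buffer
 * always holds the most recent messages.
 */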