1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * An implementation of the IPoIB standard based on PSARC 2001/289. 29 */ 30 31 #include <sys/types.h> 32 #include <sys/conf.h> 33 #include <sys/ddi.h> 34 #include <sys/sunddi.h> 35 #include <sys/modctl.h> 36 #include <sys/stropts.h> 37 #include <sys/stream.h> 38 #include <sys/strsun.h> 39 #include <sys/strsubr.h> 40 #include <sys/dlpi.h> 41 #include <sys/mac_provider.h> 42 43 #include <sys/pattr.h> /* for HCK_FULLCKSUM */ 44 #include <sys/sysmacros.h> /* for offsetof */ 45 #include <sys/disp.h> /* for async thread pri */ 46 #include <sys/atomic.h> /* for atomic_add*() */ 47 #include <sys/ethernet.h> /* for ETHERTYPE_IP */ 48 #include <netinet/in.h> /* for netinet/ip.h below */ 49 #include <netinet/ip.h> /* for struct ip */ 50 #include <netinet/udp.h> /* for struct udphdr */ 51 #include <inet/common.h> /* for inet/ip.h below */ 52 #include <inet/ip.h> /* for ipha_t */ 53 #include <inet/ip_if.h> /* for IP6_DL_SAP */ 54 #include <inet/ip6.h> /* for ip6_t */ 55 #include <inet/tcp.h> /* for tcph_t */ 56 #include <netinet/icmp6.h> /* for icmp6_t */ 57 #include <sys/callb.h> 58 #include <sys/modhash.h> 59 60 #include <sys/ib/clients/ibd/ibd.h> 61 #include <sys/ib/mgt/sm_attr.h> /* for SM_INIT_TYPE_* */ 62 #include <sys/note.h> 63 #include <sys/multidata.h> 64 65 #include <sys/ib/mgt/ibmf/ibmf.h> /* for ibd_get_portspeed */ 66 67 /* 68 * Per-interface tunables 69 * 70 * ibd_tx_copy_thresh 71 * This sets the threshold at which ibd will attempt to do a bcopy of the 72 * outgoing data into a pre-mapped buffer. The IPoIB driver's send behavior 73 * is restricted by various parameters, so setting of this value must be 74 * made after careful considerations only. For instance, IB HCAs currently 75 * impose a relatively small limit (when compared to ethernet NICs) on the 76 * length of the SGL for transmit. On the other hand, the ip stack could 77 * send down mp chains that are quite long when LSO is enabled. 78 * 79 * ibd_num_swqe 80 * Number of "send WQE" elements that will be allocated and used by ibd. 81 * When tuning this parameter, the size of pre-allocated, pre-mapped copy 82 * buffer in each of these send wqes must be taken into account. This 83 * copy buffer size is determined by the value of IBD_TX_BUF_SZ (this is 84 * currently set to the same value of ibd_tx_copy_thresh, but may be 85 * changed independently if needed). 86 * 87 * ibd_num_rwqe 88 * Number of "receive WQE" elements that will be allocated and used by 89 * ibd. This parameter is limited by the maximum channel size of the HCA. 
90 * Each buffer in the receive wqe will be of MTU size. 91 * 92 * ibd_num_lso_bufs 93 * Number of "larger-than-MTU" copy buffers to use for cases when the 94 * outgoing mblk chain is too fragmented to be used with ibt_map_mem_iov() 95 * and too large to be used with regular MTU-sized copy buffers. It is 96 * not recommended to tune this variable without understanding the 97 * application environment and/or memory resources. The size of each of 98 * these lso buffers is determined by the value of IBD_LSO_BUFSZ. 99 * 100 * ibd_num_ah 101 * Number of AH cache entries to allocate 102 * 103 * ibd_hash_size 104 * Hash table size for the active AH list 105 * 106 * ibd_separate_cqs 107 * ibd_txcomp_poll 108 * These boolean variables (1 or 0) may be used to tune the behavior of 109 * ibd in managing the send and receive completion queues and in deciding 110 * whether or not transmit completions should be polled or interrupt 111 * driven (when the completion queues are separate). If both the completion 112 * queues are interrupt driven, it may not be possible for the handlers to 113 * be invoked concurrently, depending on how the interrupts are tied on 114 * the PCI intr line. Note that some combination of these two parameters 115 * may not be meaningful (and therefore not allowed). 116 * 117 * ibd_tx_softintr 118 * ibd_rx_softintr 119 * The softintr mechanism allows ibd to avoid event queue overflows if 120 * the receive/completion handlers are to be expensive. These are enabled 121 * by default. 122 * 123 * ibd_log_sz 124 * This specifies the size of the ibd log buffer in bytes. The buffer is 125 * allocated and logging is enabled only when IBD_LOGGING is defined. 126 * 127 */ 128 uint_t ibd_tx_copy_thresh = 0x1000; 129 uint_t ibd_num_swqe = 4000; 130 uint_t ibd_num_rwqe = 4000; 131 uint_t ibd_num_lso_bufs = 0x400; 132 uint_t ibd_num_ah = 64; 133 uint_t ibd_hash_size = 32; 134 uint_t ibd_separate_cqs = 1; 135 uint_t ibd_txcomp_poll = 0; 136 uint_t ibd_rx_softintr = 1; 137 uint_t ibd_tx_softintr = 1; 138 uint_t ibd_create_broadcast_group = 1; 139 #ifdef IBD_LOGGING 140 uint_t ibd_log_sz = 0x20000; 141 #endif 142 143 #define IBD_TX_COPY_THRESH ibd_tx_copy_thresh 144 #define IBD_TX_BUF_SZ ibd_tx_copy_thresh 145 #define IBD_NUM_SWQE ibd_num_swqe 146 #define IBD_NUM_RWQE ibd_num_rwqe 147 #define IBD_NUM_LSO_BUFS ibd_num_lso_bufs 148 #define IBD_NUM_AH ibd_num_ah 149 #define IBD_HASH_SIZE ibd_hash_size 150 #ifdef IBD_LOGGING 151 #define IBD_LOG_SZ ibd_log_sz 152 #endif 153 154 /* 155 * Receive CQ moderation parameters: NOT tunables 156 */ 157 static uint_t ibd_rxcomp_count = 4; 158 static uint_t ibd_rxcomp_usec = 10; 159 160 /* 161 * Send CQ moderation parameters: NOT tunables 162 */ 163 #define IBD_TXCOMP_COUNT 10 164 #define IBD_TXCOMP_USEC 300 165 166 /* 167 * Thresholds 168 * 169 * When waiting for resources (swqes or lso buffers) to become available, 170 * the first two thresholds below determine how long to wait before informing 171 * the network layer to start sending packets again. The IBD_TX_POLL_THRESH 172 * determines how low the available swqes should go before we start polling 173 * the completion queue. 174 */ 175 #define IBD_FREE_LSOS_THRESH 8 176 #define IBD_FREE_SWQES_THRESH 20 177 #define IBD_TX_POLL_THRESH 80 178 179 /* 180 * When doing multiple-send-wr or multiple-recv-wr posts, this value 181 * determines how many to do at a time (in a single ibt_post_send/recv). 
182 */ 183 #define IBD_MAX_POST_MULTIPLE 4 184 185 /* 186 * Maximum length for returning chained mps back to crossbow 187 */ 188 #define IBD_MAX_RX_MP_LEN 16 189 190 /* 191 * LSO parameters 192 */ 193 #define IBD_LSO_MAXLEN 65536 194 #define IBD_LSO_BUFSZ 8192 195 #define IBD_PROP_LSO_POLICY "lso-policy" 196 197 /* 198 * Completion queue polling control 199 */ 200 #define IBD_RX_CQ_POLLING 0x1 201 #define IBD_TX_CQ_POLLING 0x2 202 #define IBD_REDO_RX_CQ_POLLING 0x4 203 #define IBD_REDO_TX_CQ_POLLING 0x8 204 205 /* 206 * Flag bits for resources to reap 207 */ 208 #define IBD_RSRC_SWQE 0x1 209 #define IBD_RSRC_LSOBUF 0x2 210 211 /* 212 * Async operation types 213 */ 214 #define IBD_ASYNC_GETAH 1 215 #define IBD_ASYNC_JOIN 2 216 #define IBD_ASYNC_LEAVE 3 217 #define IBD_ASYNC_PROMON 4 218 #define IBD_ASYNC_PROMOFF 5 219 #define IBD_ASYNC_REAP 6 220 #define IBD_ASYNC_TRAP 7 221 #define IBD_ASYNC_SCHED 8 222 #define IBD_ASYNC_LINK 9 223 #define IBD_ASYNC_EXIT 10 224 225 /* 226 * Async operation states 227 */ 228 #define IBD_OP_NOTSTARTED 0 229 #define IBD_OP_ONGOING 1 230 #define IBD_OP_COMPLETED 2 231 #define IBD_OP_ERRORED 3 232 #define IBD_OP_ROUTERED 4 233 234 /* 235 * State of IBD driver initialization during attach/m_start 236 */ 237 #define IBD_DRV_STATE_INITIALIZED 0x00001 238 #define IBD_DRV_RXINTR_ADDED 0x00002 239 #define IBD_DRV_TXINTR_ADDED 0x00004 240 #define IBD_DRV_IBTL_ATTACH_DONE 0x00008 241 #define IBD_DRV_HCA_OPENED 0x00010 242 #define IBD_DRV_PD_ALLOCD 0x00020 243 #define IBD_DRV_MAC_REGISTERED 0x00040 244 #define IBD_DRV_PORT_DETAILS_OBTAINED 0x00080 245 #define IBD_DRV_BCAST_GROUP_FOUND 0x00100 246 #define IBD_DRV_ACACHE_INITIALIZED 0x00200 247 #define IBD_DRV_CQS_ALLOCD 0x00400 248 #define IBD_DRV_UD_CHANNEL_SETUP 0x00800 249 #define IBD_DRV_TXLIST_ALLOCD 0x01000 250 #define IBD_DRV_SCQ_NOTIFY_ENABLED 0x02000 251 #define IBD_DRV_RXLIST_ALLOCD 0x04000 252 #define IBD_DRV_BCAST_GROUP_JOINED 0x08000 253 #define IBD_DRV_ASYNC_THR_CREATED 0x10000 254 #define IBD_DRV_RCQ_NOTIFY_ENABLED 0x20000 255 #define IBD_DRV_SM_NOTICES_REGISTERED 0x40000 256 #define IBD_DRV_STARTED 0x80000 257 258 /* 259 * Miscellaneous constants 260 */ 261 #define IBD_SEND 0 262 #define IBD_RECV 1 263 #define IB_MGID_IPV4_LOWGRP_MASK 0xFFFFFFFF 264 #define IBD_DEF_MAX_SDU 2044 265 #define IBD_DEFAULT_QKEY 0xB1B 266 #ifdef IBD_LOGGING 267 #define IBD_DMAX_LINE 100 268 #endif 269 270 /* 271 * Enumerations for link states 272 */ 273 typedef enum { 274 IBD_LINK_DOWN, 275 IBD_LINK_UP, 276 IBD_LINK_UP_ABSENT 277 } ibd_link_op_t; 278 279 /* 280 * Driver State Pointer 281 */ 282 void *ibd_list; 283 284 /* 285 * Logging 286 */ 287 #ifdef IBD_LOGGING 288 kmutex_t ibd_lbuf_lock; 289 uint8_t *ibd_lbuf; 290 uint32_t ibd_lbuf_ndx; 291 #endif 292 293 /* 294 * Required system entry points 295 */ 296 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); 297 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); 298 299 /* 300 * Required driver entry points for GLDv3 301 */ 302 static int ibd_m_stat(void *, uint_t, uint64_t *); 303 static int ibd_m_start(void *); 304 static void ibd_m_stop(void *); 305 static int ibd_m_promisc(void *, boolean_t); 306 static int ibd_m_multicst(void *, boolean_t, const uint8_t *); 307 static int ibd_m_unicst(void *, const uint8_t *); 308 static mblk_t *ibd_m_tx(void *, mblk_t *); 309 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *); 310 311 /* 312 * Private driver entry points for GLDv3 313 */ 314 315 /* 316 * Initialization 317 */ 318 static int 
ibd_state_init(ibd_state_t *, dev_info_t *); 319 static int ibd_init_txlist(ibd_state_t *); 320 static int ibd_init_rxlist(ibd_state_t *); 321 static int ibd_acache_init(ibd_state_t *); 322 #ifdef IBD_LOGGING 323 static void ibd_log_init(void); 324 #endif 325 326 /* 327 * Termination/cleanup 328 */ 329 static void ibd_state_fini(ibd_state_t *); 330 static void ibd_fini_txlist(ibd_state_t *); 331 static void ibd_fini_rxlist(ibd_state_t *); 332 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *); 333 static void ibd_acache_fini(ibd_state_t *); 334 #ifdef IBD_LOGGING 335 static void ibd_log_fini(void); 336 #endif 337 338 /* 339 * Allocation/acquire/map routines 340 */ 341 static int ibd_alloc_swqe(ibd_state_t *, ibd_swqe_t **, int, ibt_lkey_t); 342 static int ibd_alloc_rwqe(ibd_state_t *, ibd_rwqe_t **); 343 static int ibd_alloc_tx_copybufs(ibd_state_t *); 344 static int ibd_alloc_tx_lsobufs(ibd_state_t *); 345 static int ibd_acquire_swqe(ibd_state_t *, ibd_swqe_t **); 346 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *, 347 uint32_t *); 348 349 /* 350 * Free/release/unmap routines 351 */ 352 static void ibd_free_swqe(ibd_state_t *, ibd_swqe_t *); 353 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *); 354 static void ibd_delete_rwqe(ibd_state_t *, ibd_rwqe_t *); 355 static void ibd_free_tx_copybufs(ibd_state_t *); 356 static void ibd_free_tx_lsobufs(ibd_state_t *); 357 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *); 358 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t); 359 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *); 360 static void ibd_unmap_mem(ibd_state_t *, ibd_swqe_t *); 361 362 /* 363 * Handlers/callback routines 364 */ 365 static uint_t ibd_intr(char *); 366 static uint_t ibd_tx_recycle(char *); 367 static void ibd_rcq_handler(ibt_cq_hdl_t, void *); 368 static void ibd_scq_handler(ibt_cq_hdl_t, void *); 369 static void ibd_poll_compq(ibd_state_t *, ibt_cq_hdl_t); 370 static uint_t ibd_drain_cq(ibd_state_t *, ibt_cq_hdl_t, ibt_wc_t *, uint_t); 371 static void ibd_freemsg_cb(char *); 372 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t, 373 ibt_async_event_t *); 374 static void ibd_snet_notices_handler(void *, ib_gid_t, 375 ibt_subnet_event_code_t, ibt_subnet_event_t *); 376 377 /* 378 * Send/receive routines 379 */ 380 static boolean_t ibd_send(ibd_state_t *, mblk_t *); 381 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *); 382 static int ibd_post_recv(ibd_state_t *, ibd_rwqe_t *, boolean_t); 383 static void ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *); 384 static void ibd_flush_rx(ibd_state_t *, mblk_t *); 385 386 /* 387 * Threads 388 */ 389 static void ibd_async_work(ibd_state_t *); 390 391 /* 392 * Async tasks 393 */ 394 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *); 395 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int); 396 static void ibd_async_setprom(ibd_state_t *); 397 static void ibd_async_unsetprom(ibd_state_t *); 398 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t); 399 static void ibd_async_trap(ibd_state_t *, ibd_req_t *); 400 static void ibd_async_txsched(ibd_state_t *); 401 static void ibd_async_link(ibd_state_t *, ibd_req_t *); 402 403 /* 404 * Async task helpers 405 */ 406 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *); 407 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t); 408 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *); 409 static boolean_t 
ibd_get_allroutergroup(ibd_state_t *, 410 ipoib_mac_t *, ipoib_mac_t *); 411 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t); 412 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *); 413 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *); 414 static ibt_status_t ibd_find_bgroup(ibd_state_t *); 415 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *); 416 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t); 417 static uint64_t ibd_get_portspeed(ibd_state_t *); 418 static boolean_t ibd_async_safe(ibd_state_t *); 419 static void ibd_async_done(ibd_state_t *); 420 static ibd_ace_t *ibd_acache_find(ibd_state_t *, ipoib_mac_t *, boolean_t, int); 421 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int); 422 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *); 423 static boolean_t ibd_acache_recycle(ibd_state_t *, ipoib_mac_t *, boolean_t); 424 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t); 425 static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *); 426 427 /* 428 * Helpers for attach/start routines 429 */ 430 static int ibd_register_mac(ibd_state_t *, dev_info_t *); 431 static int ibd_record_capab(ibd_state_t *, dev_info_t *); 432 static int ibd_unattach(ibd_state_t *, dev_info_t *); 433 static int ibd_get_port_details(ibd_state_t *); 434 static int ibd_alloc_cqs(ibd_state_t *); 435 static int ibd_setup_ud_channel(ibd_state_t *); 436 static int ibd_undo_m_start(ibd_state_t *); 437 438 439 /* 440 * Miscellaneous helpers 441 */ 442 static int ibd_sched_poll(ibd_state_t *, int, int); 443 static void ibd_queue_work_slot(ibd_state_t *, ibd_req_t *, int); 444 static int ibd_resume_transmission(ibd_state_t *); 445 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t); 446 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t); 447 static void *list_get_head(list_t *); 448 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t); 449 static uint_t ibd_hash_by_id(void *, mod_hash_key_t); 450 static void ibd_print_warn(ibd_state_t *, char *, ...); 451 #ifdef IBD_LOGGING 452 static void ibd_log(const char *, ...); 453 #endif 454 455 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach, 456 nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed); 457 458 /* Module Driver Info */ 459 static struct modldrv ibd_modldrv = { 460 &mod_driverops, /* This one is a driver */ 461 "InfiniBand GLDv3 Driver", /* short description */ 462 &ibd_dev_ops /* driver specific ops */ 463 }; 464 465 /* Module Linkage */ 466 static struct modlinkage ibd_modlinkage = { 467 MODREV_1, (void *)&ibd_modldrv, NULL 468 }; 469 470 /* 471 * Module (static) info passed to IBTL during ibt_attach 472 */ 473 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = { 474 IBTI_V_CURR, 475 IBT_NETWORK, 476 ibd_async_handler, 477 NULL, 478 "IPIB" 479 }; 480 481 /* 482 * GLDv3 entry points 483 */ 484 #define IBD_M_CALLBACK_FLAGS (MC_GETCAPAB) 485 static mac_callbacks_t ibd_m_callbacks = { 486 IBD_M_CALLBACK_FLAGS, 487 ibd_m_stat, 488 ibd_m_start, 489 ibd_m_stop, 490 ibd_m_promisc, 491 ibd_m_multicst, 492 ibd_m_unicst, 493 ibd_m_tx, 494 NULL, 495 ibd_m_getcapab 496 }; 497 498 /* 499 * Fill/clear <scope> and <p_key> in multicast/broadcast address 500 */ 501 #define IBD_FILL_SCOPE_PKEY(maddr, scope, pkey) \ 502 { \ 503 *(uint32_t *)((char *)(maddr) + 4) |= \ 504 htonl((uint32_t)(scope) << 16); \ 505 *(uint32_t *)((char *)(maddr) + 8) |= \ 506 htonl((uint32_t)(pkey) << 16); \ 
507 } 508 509 #define IBD_CLEAR_SCOPE_PKEY(maddr) \ 510 { \ 511 *(uint32_t *)((char *)(maddr) + 4) &= \ 512 htonl(~((uint32_t)0xF << 16)); \ 513 *(uint32_t *)((char *)(maddr) + 8) &= \ 514 htonl(~((uint32_t)0xFFFF << 16)); \ 515 } 516 517 /* 518 * Rudimentary debugging support 519 */ 520 #ifdef DEBUG 521 int ibd_debuglevel = 100; 522 static void 523 debug_print(int l, char *fmt, ...) 524 { 525 va_list ap; 526 527 if (l < ibd_debuglevel) 528 return; 529 va_start(ap, fmt); 530 vcmn_err(CE_CONT, fmt, ap); 531 va_end(ap); 532 } 533 #define DPRINT debug_print 534 #else 535 #define DPRINT 536 #endif 537 538 /* 539 * Common routine to print warning messages; adds in hca guid, port number 540 * and pkey to be able to identify the IBA interface. 541 */ 542 static void 543 ibd_print_warn(ibd_state_t *state, char *fmt, ...) 544 { 545 ib_guid_t hca_guid; 546 char ibd_print_buf[256]; 547 int len; 548 va_list ap; 549 550 hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip, 551 0, "hca-guid", 0); 552 len = snprintf(ibd_print_buf, sizeof (ibd_print_buf), 553 "%s%d: HCA GUID %016llx port %d PKEY %02x ", 554 ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip), 555 (u_longlong_t)hca_guid, state->id_port, state->id_pkey); 556 va_start(ap, fmt); 557 (void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len, 558 fmt, ap); 559 cmn_err(CE_NOTE, "!%s", ibd_print_buf); 560 va_end(ap); 561 } 562 563 /* 564 * Warlock directives 565 */ 566 567 /* 568 * id_lso_lock 569 * 570 * state->id_lso->bkt_nfree may be accessed without a lock to 571 * determine the threshold at which we have to ask the nw layer 572 * to resume transmission (see ibd_resume_transmission()). 573 */ 574 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock, 575 ibd_state_t::id_lso)) 576 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso)) 577 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree)) 578 579 /* 580 * id_cq_poll_lock 581 */ 582 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_cq_poll_lock, 583 ibd_state_t::id_cq_poll_busy)) 584 585 /* 586 * id_txpost_lock 587 */ 588 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 589 ibd_state_t::id_tx_head)) 590 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 591 ibd_state_t::id_tx_busy)) 592 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 593 ibd_state_t::id_tx_tailp)) 594 595 /* 596 * id_rxpost_lock 597 */ 598 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock, 599 ibd_state_t::id_rx_head)) 600 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock, 601 ibd_state_t::id_rx_busy)) 602 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rxpost_lock, 603 ibd_state_t::id_rx_tailp)) 604 605 /* 606 * id_acache_req_lock 607 */ 608 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 609 ibd_state_t::id_acache_req_cv)) 610 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 611 ibd_state_t::id_req_list)) 612 613 /* 614 * id_ac_mutex 615 * 616 * This mutex is actually supposed to protect id_ah_op as well, 617 * but this path of the code isn't clean (see update of id_ah_op 618 * in ibd_async_acache(), immediately after the call to 619 * ibd_async_mcache()). For now, we'll skip this check by 620 * declaring that id_ah_op is protected by some internal scheme 621 * that warlock isn't aware of. 
622 */ 623 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 624 ibd_state_t::id_ah_active)) 625 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 626 ibd_state_t::id_ah_free)) 627 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 628 ibd_state_t::id_ah_addr)) 629 _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this", 630 ibd_state_t::id_ah_op)) 631 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 632 ibd_state_t::id_ah_error)) 633 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error)) 634 635 /* 636 * id_mc_mutex 637 */ 638 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 639 ibd_state_t::id_mc_full)) 640 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 641 ibd_state_t::id_mc_non)) 642 643 /* 644 * id_trap_lock 645 */ 646 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 647 ibd_state_t::id_trap_cv)) 648 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 649 ibd_state_t::id_trap_stop)) 650 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 651 ibd_state_t::id_trap_inprog)) 652 653 /* 654 * id_prom_op 655 */ 656 _NOTE(SCHEME_PROTECTS_DATA("only by async thread", 657 ibd_state_t::id_prom_op)) 658 659 /* 660 * id_sched_lock 661 */ 662 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock, 663 ibd_state_t::id_sched_needed)) 664 665 /* 666 * id_link_mutex 667 */ 668 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex, 669 ibd_state_t::id_link_state)) 670 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state)) 671 _NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start", 672 ibd_state_t::id_link_speed)) 673 674 /* 675 * id_tx_list.dl_mutex 676 */ 677 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 678 ibd_state_t::id_tx_list.dl_head)) 679 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 680 ibd_state_t::id_tx_list.dl_tail)) 681 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 682 ibd_state_t::id_tx_list.dl_pending_sends)) 683 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 684 ibd_state_t::id_tx_list.dl_cnt)) 685 686 /* 687 * id_rx_list.dl_mutex 688 */ 689 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex, 690 ibd_state_t::id_rx_list.dl_head)) 691 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_rx_list.dl_mutex, 692 ibd_state_t::id_rx_list.dl_tail)) 693 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 694 ibd_state_t::id_rx_list.dl_bufs_outstanding)) 695 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 696 ibd_state_t::id_rx_list.dl_cnt)) 697 698 699 /* 700 * Items protected by atomic updates 701 */ 702 _NOTE(SCHEME_PROTECTS_DATA("atomic update only", 703 ibd_state_s::id_brd_rcv 704 ibd_state_s::id_brd_xmt 705 ibd_state_s::id_multi_rcv 706 ibd_state_s::id_multi_xmt 707 ibd_state_s::id_num_intrs 708 ibd_state_s::id_rcv_bytes 709 ibd_state_s::id_rcv_pkt 710 ibd_state_s::id_tx_short 711 ibd_state_s::id_xmt_bytes 712 ibd_state_s::id_xmt_pkt)) 713 714 /* 715 * Non-mutex protection schemes for data elements. Almost all of 716 * these are non-shared items. 717 */ 718 _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded", 719 callb_cpr 720 ib_gid_s 721 ib_header_info 722 ibd_acache_rq 723 ibd_acache_s::ac_mce 724 ibd_mcache::mc_fullreap 725 ibd_mcache::mc_jstate 726 ibd_mcache::mc_req 727 ibd_rwqe_s 728 ibd_swqe_s 729 ibd_wqe_s 730 ibt_wr_ds_s::ds_va 731 ibt_wr_lso_s 732 ipoib_mac::ipoib_qpn 733 mac_capab_lso_s 734 msgb::b_next 735 msgb::b_rptr 736 msgb::b_wptr)) 737 738 int 739 _init() 740 { 741 int status; 742 743 /* 744 * Sanity check some parameter settings. 
Tx completion polling 745 * only makes sense with separate CQs for Tx and Rx. 746 */ 747 if ((ibd_txcomp_poll == 1) && (ibd_separate_cqs == 0)) { 748 cmn_err(CE_NOTE, "!ibd: %s", 749 "Setting ibd_txcomp_poll = 0 for combined CQ"); 750 ibd_txcomp_poll = 0; 751 } 752 753 status = ddi_soft_state_init(&ibd_list, sizeof (ibd_state_t), 0); 754 if (status != 0) { 755 DPRINT(10, "_init:failed in ddi_soft_state_init()"); 756 return (status); 757 } 758 759 mac_init_ops(&ibd_dev_ops, "ibd"); 760 status = mod_install(&ibd_modlinkage); 761 if (status != 0) { 762 DPRINT(10, "_init:failed in mod_install()"); 763 ddi_soft_state_fini(&ibd_list); 764 mac_fini_ops(&ibd_dev_ops); 765 return (status); 766 } 767 768 #ifdef IBD_LOGGING 769 ibd_log_init(); 770 #endif 771 return (0); 772 } 773 774 int 775 _info(struct modinfo *modinfop) 776 { 777 return (mod_info(&ibd_modlinkage, modinfop)); 778 } 779 780 int 781 _fini() 782 { 783 int status; 784 785 status = mod_remove(&ibd_modlinkage); 786 if (status != 0) 787 return (status); 788 789 mac_fini_ops(&ibd_dev_ops); 790 ddi_soft_state_fini(&ibd_list); 791 #ifdef IBD_LOGGING 792 ibd_log_fini(); 793 #endif 794 return (0); 795 } 796 797 /* 798 * Convert the GID part of the mac address from network byte order 799 * to host order. 800 */ 801 static void 802 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid) 803 { 804 ib_sn_prefix_t nbopref; 805 ib_guid_t nboguid; 806 807 bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t)); 808 bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t)); 809 dgid->gid_prefix = b2h64(nbopref); 810 dgid->gid_guid = b2h64(nboguid); 811 } 812 813 /* 814 * Create the IPoIB address in network byte order from host order inputs. 815 */ 816 static void 817 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix, 818 ib_guid_t guid) 819 { 820 ib_sn_prefix_t nbopref; 821 ib_guid_t nboguid; 822 823 mac->ipoib_qpn = htonl(qpn); 824 nbopref = h2b64(prefix); 825 nboguid = h2b64(guid); 826 bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t)); 827 bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t)); 828 } 829 830 /* 831 * Send to the appropriate all-routers group when the IBA multicast group 832 * does not exist, based on whether the target group is v4 or v6. 833 */ 834 static boolean_t 835 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac, 836 ipoib_mac_t *rmac) 837 { 838 boolean_t retval = B_TRUE; 839 uint32_t adjscope = state->id_scope << 16; 840 uint32_t topword; 841 842 /* 843 * Copy the first 4 bytes in without assuming any alignment of 844 * input mac address; this will have IPoIB signature, flags and 845 * scope bits. 846 */ 847 bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t)); 848 topword = ntohl(topword); 849 850 /* 851 * Generate proper address for IPv4/v6, adding in the Pkey properly. 852 */ 853 if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) || 854 (topword == (IB_MCGID_IPV6_PREFIX | adjscope))) 855 ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) | 856 ((uint32_t)(state->id_pkey << 16))), 857 (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP)); 858 else 859 /* 860 * Does not have proper bits in the mgid address. 861 */ 862 retval = B_FALSE; 863 864 return (retval); 865 } 866 867 /* 868 * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at 869 * front of optional src/tgt link layer address. Right now Solaris inserts 870 * padding by default at the end. The routine which is doing is nce_xmit() 871 * in ip_ndp.c. It copies the nd_lla_addr after the nd_opt_hdr_t. 
So when 872 * the packet comes down from IP layer to the IBD driver, it is in the 873 * following format: [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T] 874 * This size is 2 bytes followed by [22 bytes of ipoib_machdr]. As a result 875 * machdr is not 4 byte aligned and had 2 bytes of padding at the end. 876 * 877 * The send routine at IBD driver changes this packet as follows: 878 * [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding] 879 * followed by [22 bytes of ipoib_machdr] resulting in machdr 4 byte 880 * aligned. 881 * 882 * At the receiving side again ibd_process_rx takes the above packet and 883 * removes the two bytes of front padding and inserts it at the end. This 884 * is since the IP layer does not understand padding at the front. 885 */ 886 #define IBD_PAD_NSNA(ip6h, len, type) { \ 887 uchar_t *nd_lla_ptr; \ 888 icmp6_t *icmp6; \ 889 nd_opt_hdr_t *opt; \ 890 int i; \ 891 \ 892 icmp6 = (icmp6_t *)&ip6h[1]; \ 893 len -= sizeof (nd_neighbor_advert_t); \ 894 if (((icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT) || \ 895 (icmp6->icmp6_type == ND_NEIGHBOR_ADVERT)) && \ 896 (len != 0)) { \ 897 opt = (nd_opt_hdr_t *)((uint8_t *)ip6h \ 898 + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t)); \ 899 ASSERT(opt != NULL); \ 900 nd_lla_ptr = (uchar_t *)&opt[1]; \ 901 if (type == IBD_SEND) { \ 902 for (i = IPOIB_ADDRL; i > 0; i--) \ 903 *(nd_lla_ptr + i + 1) = \ 904 *(nd_lla_ptr + i - 1); \ 905 } else { \ 906 for (i = 0; i < IPOIB_ADDRL; i++) \ 907 *(nd_lla_ptr + i) = \ 908 *(nd_lla_ptr + i + 2); \ 909 } \ 910 *(nd_lla_ptr + i) = 0; \ 911 *(nd_lla_ptr + i + 1) = 0; \ 912 } \ 913 } 914 915 /* 916 * Address handle entries maintained by the driver are kept in the 917 * free and active lists. Each entry starts out in the free list; 918 * it migrates to the active list when primed using ibt_get_paths() 919 * and ibt_modify_ud_dest() for transmission to a specific destination. 920 * In the active list, the entry has a reference count indicating the 921 * number of ongoing/uncompleted transmits that reference it. The 922 * entry is left in the active list even after the reference count 923 * goes to 0, since successive transmits can find it there and do 924 * not need to set up another entry (ie the path information is 925 * cached using the active list). Entries on the active list are 926 * also hashed using the destination link address as a key for faster 927 * lookups during transmits. 928 * 929 * For any destination address (unicast or multicast, whatever the 930 * join states), there will be at most one entry in the active list. 931 * Entries with a 0 reference count on the active list can be reused 932 * for a transmit to a new destination, if the free list is empty. 933 * 934 * The AH free list insertion/deletion is protected with the id_ac_mutex, 935 * since the async thread and Tx callback handlers insert/delete. The 936 * active list does not need a lock (all operations are done by the 937 * async thread) but updates to the reference count are atomically 938 * done (increments done by Tx path, decrements by the Tx callback handler). 
939 */ 940 #define IBD_ACACHE_INSERT_FREE(state, ce) \ 941 list_insert_head(&state->id_ah_free, ce) 942 #define IBD_ACACHE_GET_FREE(state) \ 943 list_get_head(&state->id_ah_free) 944 #define IBD_ACACHE_INSERT_ACTIVE(state, ce) { \ 945 int _ret_; \ 946 list_insert_head(&state->id_ah_active, ce); \ 947 _ret_ = mod_hash_insert(state->id_ah_active_hash, \ 948 (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce); \ 949 ASSERT(_ret_ == 0); \ 950 } 951 #define IBD_ACACHE_PULLOUT_ACTIVE(state, ce) { \ 952 list_remove(&state->id_ah_active, ce); \ 953 (void) mod_hash_remove(state->id_ah_active_hash, \ 954 (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce); \ 955 } 956 #define IBD_ACACHE_GET_ACTIVE(state) \ 957 list_get_head(&state->id_ah_active) 958 959 /* 960 * Membership states for different mcg's are tracked by two lists: 961 * the "non" list is used for promiscuous mode, when all mcg traffic 962 * needs to be inspected. This type of membership is never used for 963 * transmission, so there can not be an AH in the active list 964 * corresponding to a member in this list. This list does not need 965 * any protection, since all operations are performed by the async 966 * thread. 967 * 968 * "Full" and "SendOnly" membership is tracked using a single list, 969 * the "full" list. This is because this single list can then be 970 * searched during transmit to a multicast group (if an AH for the 971 * mcg is not found in the active list), since at least one type 972 * of membership must be present before initiating the transmit. 973 * This list is also emptied during driver detach, since sendonly 974 * membership acquired during transmit is dropped at detach time 975 * alongwith ipv4 broadcast full membership. Insert/deletes to 976 * this list are done only by the async thread, but it is also 977 * searched in program context (see multicast disable case), thus 978 * the id_mc_mutex protects the list. The driver detach path also 979 * deconstructs the "full" list, but it ensures that the async 980 * thread will not be accessing the list (by blocking out mcg 981 * trap handling and making sure no more Tx reaping will happen). 982 * 983 * Currently, an IBA attach is done in the SendOnly case too, 984 * although this is not required. 985 */ 986 #define IBD_MCACHE_INSERT_FULL(state, mce) \ 987 list_insert_head(&state->id_mc_full, mce) 988 #define IBD_MCACHE_INSERT_NON(state, mce) \ 989 list_insert_head(&state->id_mc_non, mce) 990 #define IBD_MCACHE_FIND_FULL(state, mgid) \ 991 ibd_mcache_find(mgid, &state->id_mc_full) 992 #define IBD_MCACHE_FIND_NON(state, mgid) \ 993 ibd_mcache_find(mgid, &state->id_mc_non) 994 #define IBD_MCACHE_PULLOUT_FULL(state, mce) \ 995 list_remove(&state->id_mc_full, mce) 996 #define IBD_MCACHE_PULLOUT_NON(state, mce) \ 997 list_remove(&state->id_mc_non, mce) 998 999 /* 1000 * AH and MCE active list manipulation: 1001 * 1002 * Multicast disable requests and MCG delete traps are two cases 1003 * where the active AH entry for the mcg (if any unreferenced one exists) 1004 * will be moved to the free list (to force the next Tx to the mcg to 1005 * join the MCG in SendOnly mode). Port up handling will also move AHs 1006 * from active to free list. 1007 * 1008 * In the case when some transmits are still pending on an entry 1009 * for an mcg, but a multicast disable has already been issued on the 1010 * mcg, there are some options to consider to preserve the join state 1011 * to ensure the emitted packet is properly routed on the IBA fabric. 1012 * For the AH, we can 1013 * 1. 
take out of active list at multicast disable time. 1014 * 2. take out of active list only when last pending Tx completes. 1015 * For the MCE, we can 1016 * 3. take out of active list at multicast disable time. 1017 * 4. take out of active list only when last pending Tx completes. 1018 * 5. move from active list to stale list at multicast disable time. 1019 * We choose to use 2,4. We use option 4 so that if a multicast enable 1020 * is tried before the pending Tx completes, the enable code finds the 1021 * mce in the active list and just has to make sure it will not be reaped 1022 * (ie the mcg leave done) when the pending Tx does complete. Alternatively, 1023 * a stale list (#5) that would be checked in the enable code would need 1024 * to be implemented. Option 2 is used, because otherwise, a Tx attempt 1025 * after the multicast disable would try to put an AH in the active list, 1026 * and associate the mce it finds in the active list to this new AH, 1027 * whereas the mce is already associated with the previous AH (taken off 1028 * the active list), and will be removed once the pending Tx's complete 1029 * (unless a reference count on mce's is implemented). One implication of 1030 * using 2,4 is that new Tx's posted before the pending Tx's complete will 1031 * grab new references on the AH, further delaying the leave. 1032 * 1033 * In the case of mcg delete (or create) trap when the port is sendonly 1034 * joined, the AH and MCE handling is different: the AH and MCE has to be 1035 * immediately taken off the active lists (forcing a join and path lookup 1036 * at the next Tx is the only guaranteed means of ensuring a proper Tx 1037 * to an mcg as it is repeatedly created and deleted and goes thru 1038 * reincarnations). 1039 * 1040 * When a port is already sendonly joined, and a multicast enable is 1041 * attempted, the same mce structure is promoted; this ensures only a 1042 * single mce on the active list tracks the most powerful join state. 1043 * 1044 * In the case of port up event handling, the MCE for sendonly membership 1045 * is freed up, and the ACE is put into the free list as soon as possible 1046 * (depending on whether posted Tx's have completed). For fullmembership 1047 * MCE's though, the ACE is similarly handled; but the MCE is kept around 1048 * (a re-JOIN is attempted) only if the DLPI leave has not already been 1049 * done; else the mce is deconstructed (mc_fullreap case). 1050 * 1051 * MCG creation and deletion trap handling: 1052 * 1053 * These traps are unreliable (meaning sometimes the trap might never 1054 * be delivered to the subscribed nodes) and may arrive out-of-order 1055 * since they use UD transport. An alternative to relying on these 1056 * unreliable traps is to poll for mcg presence every so often, but 1057 * instead of doing that, we try to be as conservative as possible 1058 * while handling the traps, and hope that the traps do arrive at 1059 * the subscribed nodes soon. Note that if a node is fullmember 1060 * joined to an mcg, it can not possibly receive a mcg create/delete 1061 * trap for that mcg (by fullmember definition); if it does, it is 1062 * an old trap from a previous incarnation of the mcg. 1063 * 1064 * Whenever a trap is received, the driver cleans up its sendonly 1065 * membership to the group; we choose to do a sendonly leave even 1066 * on a creation trap to handle the case of a prior deletion of the mcg 1067 * having gone unnoticed. Consider an example scenario: 1068 * T1: MCG M is deleted, and fires off deletion trap D1. 
1069 * T2: MCG M is recreated, fires off creation trap C1, which is lost. 1070 * T3: Node N tries to transmit to M, joining in sendonly mode. 1071 * T4: MCG M is deleted, and fires off deletion trap D2. 1072 * T5: N receives a deletion trap, but can not distinguish D1 from D2. 1073 * If the trap is D2, then a LEAVE is not required, since the mcg 1074 * is already deleted; but if it is D1, a LEAVE is required. A safe 1075 * approach is to always LEAVE, but the SM may be confused if it 1076 * receives a LEAVE without a prior JOIN. 1077 * 1078 * Management of the non-membership to an mcg is similar to the above, 1079 * except that if the interface is in promiscuous mode, it is required 1080 * to attempt to re-join the mcg after receiving a trap. Unfortunately, 1081 * if the re-join attempt fails (in which case a warning message needs 1082 * to be printed), it is not clear whether it failed due to the mcg not 1083 * existing, or some fabric/hca issues, due to the delayed nature of 1084 * trap delivery. Querying the SA to establish presence/absence of the 1085 * mcg is also racy at best. Thus, the driver just prints a warning 1086 * message when it can not rejoin after receiving a create trap, although 1087 * this might be (on rare occassions) a mis-warning if the create trap is 1088 * received after the mcg was deleted. 1089 */ 1090 1091 /* 1092 * Implementation of atomic "recycle" bits and reference count 1093 * on address handles. This utilizes the fact that max reference 1094 * count on any handle is limited by number of send wqes, thus 1095 * high bits in the ac_ref field can be used as the recycle bits, 1096 * and only the low bits hold the number of pending Tx requests. 1097 * This atomic AH reference counting allows the Tx completion 1098 * handler not to acquire the id_ac_mutex to process every completion, 1099 * thus reducing lock contention problems between completion and 1100 * the Tx path. 1101 */ 1102 #define CYCLEVAL 0x80000 1103 #define CLEAR_REFCYCLE(ace) (ace)->ac_ref = 0 1104 #define CYCLE_SET(ace) (((ace)->ac_ref & CYCLEVAL) == CYCLEVAL) 1105 #define GET_REF(ace) ((ace)->ac_ref) 1106 #define GET_REF_CYCLE(ace) ( \ 1107 /* \ 1108 * Make sure "cycle" bit is set. \ 1109 */ \ 1110 ASSERT(CYCLE_SET(ace)), \ 1111 ((ace)->ac_ref & ~(CYCLEVAL)) \ 1112 ) 1113 #define INC_REF(ace, num) { \ 1114 atomic_add_32(&(ace)->ac_ref, num); \ 1115 } 1116 #define SET_CYCLE_IF_REF(ace) ( \ 1117 CYCLE_SET(ace) ? B_TRUE : \ 1118 atomic_add_32_nv(&ace->ac_ref, CYCLEVAL) == \ 1119 CYCLEVAL ? \ 1120 /* \ 1121 * Clear the "cycle" bit we just set; \ 1122 * ref count known to be 0 from above. \ 1123 */ \ 1124 CLEAR_REFCYCLE(ace), B_FALSE : \ 1125 /* \ 1126 * We set "cycle" bit; let caller know. \ 1127 */ \ 1128 B_TRUE \ 1129 ) 1130 #define DEC_REF_DO_CYCLE(ace) ( \ 1131 atomic_add_32_nv(&ace->ac_ref, -1) == \ 1132 CYCLEVAL ? \ 1133 /* \ 1134 * Ref count known to be 0 from above. \ 1135 */ \ 1136 B_TRUE : \ 1137 B_FALSE \ 1138 ) 1139 1140 static void * 1141 list_get_head(list_t *list) 1142 { 1143 list_node_t *lhead = list_head(list); 1144 1145 if (lhead != NULL) 1146 list_remove(list, lhead); 1147 return (lhead); 1148 } 1149 1150 /* 1151 * This is always guaranteed to be able to queue the work. 1152 */ 1153 static void 1154 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op) 1155 { 1156 /* Initialize request */ 1157 DPRINT(1, "ibd_queue_work_slot : op: %d \n", op); 1158 ptr->rq_op = op; 1159 1160 /* 1161 * Queue provided slot onto request pool. 
1162 */ 1163 mutex_enter(&state->id_acache_req_lock); 1164 list_insert_tail(&state->id_req_list, ptr); 1165 1166 /* Go, fetch, async thread */ 1167 cv_signal(&state->id_acache_req_cv); 1168 mutex_exit(&state->id_acache_req_lock); 1169 } 1170 1171 /* 1172 * Main body of the per interface async thread. 1173 */ 1174 static void 1175 ibd_async_work(ibd_state_t *state) 1176 { 1177 ibd_req_t *ptr; 1178 callb_cpr_t cprinfo; 1179 1180 mutex_enter(&state->id_acache_req_lock); 1181 CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock, 1182 callb_generic_cpr, "ibd_async_work"); 1183 1184 for (;;) { 1185 ptr = list_get_head(&state->id_req_list); 1186 if (ptr != NULL) { 1187 mutex_exit(&state->id_acache_req_lock); 1188 1189 /* 1190 * Once we have done the operation, there is no 1191 * guarantee the request slot is going to be valid, 1192 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP, 1193 * TRAP). 1194 * 1195 * Perform the request. 1196 */ 1197 switch (ptr->rq_op) { 1198 case IBD_ASYNC_GETAH: 1199 ibd_async_acache(state, &ptr->rq_mac); 1200 break; 1201 case IBD_ASYNC_JOIN: 1202 case IBD_ASYNC_LEAVE: 1203 ibd_async_multicast(state, 1204 ptr->rq_gid, ptr->rq_op); 1205 break; 1206 case IBD_ASYNC_PROMON: 1207 ibd_async_setprom(state); 1208 break; 1209 case IBD_ASYNC_PROMOFF: 1210 ibd_async_unsetprom(state); 1211 break; 1212 case IBD_ASYNC_REAP: 1213 ibd_async_reap_group(state, 1214 ptr->rq_ptr, ptr->rq_gid, 1215 IB_MC_JSTATE_FULL); 1216 /* 1217 * the req buf contains in mce 1218 * structure, so we do not need 1219 * to free it here. 1220 */ 1221 ptr = NULL; 1222 break; 1223 case IBD_ASYNC_TRAP: 1224 ibd_async_trap(state, ptr); 1225 break; 1226 case IBD_ASYNC_SCHED: 1227 ibd_async_txsched(state); 1228 break; 1229 case IBD_ASYNC_LINK: 1230 ibd_async_link(state, ptr); 1231 break; 1232 case IBD_ASYNC_EXIT: 1233 mutex_enter(&state->id_acache_req_lock); 1234 #ifndef __lock_lint 1235 CALLB_CPR_EXIT(&cprinfo); 1236 #else 1237 mutex_exit(&state->id_acache_req_lock); 1238 #endif 1239 return; 1240 } 1241 if (ptr != NULL) 1242 kmem_cache_free(state->id_req_kmc, ptr); 1243 1244 mutex_enter(&state->id_acache_req_lock); 1245 } else { 1246 #ifndef __lock_lint 1247 /* 1248 * Nothing to do: wait till new request arrives. 1249 */ 1250 CALLB_CPR_SAFE_BEGIN(&cprinfo); 1251 cv_wait(&state->id_acache_req_cv, 1252 &state->id_acache_req_lock); 1253 CALLB_CPR_SAFE_END(&cprinfo, 1254 &state->id_acache_req_lock); 1255 #endif 1256 } 1257 } 1258 1259 /*NOTREACHED*/ 1260 _NOTE(NOT_REACHED) 1261 } 1262 1263 /* 1264 * Return when it is safe to queue requests to the async daemon; primarily 1265 * for subnet trap and async event handling. Disallow requests before the 1266 * daemon is created, and when interface deinitilization starts. 1267 */ 1268 static boolean_t 1269 ibd_async_safe(ibd_state_t *state) 1270 { 1271 mutex_enter(&state->id_trap_lock); 1272 if (state->id_trap_stop) { 1273 mutex_exit(&state->id_trap_lock); 1274 return (B_FALSE); 1275 } 1276 state->id_trap_inprog++; 1277 mutex_exit(&state->id_trap_lock); 1278 return (B_TRUE); 1279 } 1280 1281 /* 1282 * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet 1283 * trap or event handling to complete to kill the async thread and deconstruct 1284 * the mcg/ace list. 
1285 */ 1286 static void 1287 ibd_async_done(ibd_state_t *state) 1288 { 1289 mutex_enter(&state->id_trap_lock); 1290 if (--state->id_trap_inprog == 0) 1291 cv_signal(&state->id_trap_cv); 1292 mutex_exit(&state->id_trap_lock); 1293 } 1294 1295 /* 1296 * Hash functions: 1297 * ibd_hash_by_id: Returns the qpn as the hash entry into bucket. 1298 * ibd_hash_key_cmp: Compares two keys, return 0 on success or else 1. 1299 * These operate on mac addresses input into ibd_send, but there is no 1300 * guarantee on the alignment of the ipoib_mac_t structure. 1301 */ 1302 /*ARGSUSED*/ 1303 static uint_t 1304 ibd_hash_by_id(void *hash_data, mod_hash_key_t key) 1305 { 1306 ulong_t ptraddr = (ulong_t)key; 1307 uint_t hval; 1308 1309 /* 1310 * If the input address is 4 byte aligned, we can just dereference 1311 * it. This is most common, since IP will send in a 4 byte aligned 1312 * IP header, which implies the 24 byte IPoIB psuedo header will be 1313 * 4 byte aligned too. 1314 */ 1315 if ((ptraddr & 3) == 0) 1316 return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn); 1317 1318 bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t)); 1319 return (hval); 1320 } 1321 1322 static int 1323 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2) 1324 { 1325 if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0) 1326 return (0); 1327 else 1328 return (1); 1329 } 1330 1331 /* 1332 * Initialize all the per interface caches and lists; AH cache, 1333 * MCG list etc. 1334 */ 1335 static int 1336 ibd_acache_init(ibd_state_t *state) 1337 { 1338 ibd_ace_t *ce; 1339 int i; 1340 1341 mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL); 1342 cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL); 1343 1344 mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL); 1345 mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL); 1346 list_create(&state->id_ah_free, sizeof (ibd_ace_t), 1347 offsetof(ibd_ace_t, ac_list)); 1348 list_create(&state->id_ah_active, sizeof (ibd_ace_t), 1349 offsetof(ibd_ace_t, ac_list)); 1350 state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash", 1351 IBD_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor, 1352 ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP); 1353 list_create(&state->id_mc_full, sizeof (ibd_mce_t), 1354 offsetof(ibd_mce_t, mc_list)); 1355 list_create(&state->id_mc_non, sizeof (ibd_mce_t), 1356 offsetof(ibd_mce_t, mc_list)); 1357 list_create(&state->id_req_list, sizeof (ibd_req_t), 1358 offsetof(ibd_req_t, rq_list)); 1359 1360 state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) * 1361 IBD_NUM_AH, KM_SLEEP); 1362 for (i = 0; i < IBD_NUM_AH; i++, ce++) { 1363 if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS, 1364 state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) { 1365 ibd_acache_fini(state); 1366 return (DDI_FAILURE); 1367 } else { 1368 CLEAR_REFCYCLE(ce); 1369 ce->ac_mce = NULL; 1370 IBD_ACACHE_INSERT_FREE(state, ce); 1371 } 1372 } 1373 return (DDI_SUCCESS); 1374 } 1375 1376 static void 1377 ibd_acache_fini(ibd_state_t *state) 1378 { 1379 ibd_ace_t *ptr; 1380 1381 mutex_enter(&state->id_ac_mutex); 1382 1383 while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) { 1384 ASSERT(GET_REF(ptr) == 0); 1385 (void) ibt_free_ud_dest(ptr->ac_dest); 1386 } 1387 1388 while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) { 1389 ASSERT(GET_REF(ptr) == 0); 1390 (void) ibt_free_ud_dest(ptr->ac_dest); 1391 } 1392 1393 list_destroy(&state->id_ah_free); 1394 list_destroy(&state->id_ah_active); 1395 list_destroy(&state->id_mc_full); 1396 
list_destroy(&state->id_mc_non); 1397 list_destroy(&state->id_req_list); 1398 kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * IBD_NUM_AH); 1399 mutex_exit(&state->id_ac_mutex); 1400 mutex_destroy(&state->id_ac_mutex); 1401 mutex_destroy(&state->id_mc_mutex); 1402 mutex_destroy(&state->id_acache_req_lock); 1403 cv_destroy(&state->id_acache_req_cv); 1404 } 1405 1406 /* 1407 * Search AH active hash list for a cached path to input destination. 1408 * If we are "just looking", hold == F. When we are in the Tx path, 1409 * we set hold == T to grab a reference on the AH so that it can not 1410 * be recycled to a new destination while the Tx request is posted. 1411 */ 1412 static ibd_ace_t * 1413 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num) 1414 { 1415 ibd_ace_t *ptr; 1416 1417 ASSERT(mutex_owned(&state->id_ac_mutex)); 1418 1419 /* 1420 * Do hash search. 1421 */ 1422 if (mod_hash_find(state->id_ah_active_hash, 1423 (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) { 1424 if (hold) 1425 INC_REF(ptr, num); 1426 return (ptr); 1427 } 1428 return (NULL); 1429 } 1430 1431 /* 1432 * This is called by the tx side; if an initialized AH is found in 1433 * the active list, it is locked down and can be used; if no entry 1434 * is found, an async request is queued to do path resolution. 1435 */ 1436 static ibd_ace_t * 1437 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe) 1438 { 1439 ibd_ace_t *ptr; 1440 ibd_req_t *req; 1441 1442 /* 1443 * Only attempt to print when we can; in the mdt pattr case, the 1444 * address is not aligned properly. 1445 */ 1446 if (((ulong_t)mac & 3) == 0) { 1447 DPRINT(4, 1448 "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X", 1449 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1450 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1451 htonl(mac->ipoib_gidsuff[1])); 1452 } 1453 1454 mutex_enter(&state->id_ac_mutex); 1455 1456 if ((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL) { 1457 mutex_exit(&state->id_ac_mutex); 1458 return (ptr); 1459 } 1460 1461 /* 1462 * Implementation of a single outstanding async request; if 1463 * the operation is not started yet, queue a request and move 1464 * to ongoing state. Remember in id_ah_addr for which address 1465 * we are queueing the request, in case we need to flag an error; 1466 * Any further requests, for the same or different address, until 1467 * the operation completes, is sent back to GLDv3 to be retried. 1468 * The async thread will update id_ah_op with an error indication 1469 * or will set it to indicate the next look up can start; either 1470 * way, it will mac_tx_update() so that all blocked requests come 1471 * back here. 1472 */ 1473 *err = EAGAIN; 1474 if (state->id_ah_op == IBD_OP_NOTSTARTED) { 1475 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 1476 if (req != NULL) { 1477 /* 1478 * We did not even find the entry; queue a request 1479 * for it. 1480 */ 1481 bcopy(mac, &(req->rq_mac), IPOIB_ADDRL); 1482 ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH); 1483 state->id_ah_op = IBD_OP_ONGOING; 1484 bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL); 1485 } 1486 } else if ((state->id_ah_op != IBD_OP_ONGOING) && 1487 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) { 1488 /* 1489 * Check the status of the pathrecord lookup request 1490 * we had queued before. 
1491 */ 1492 if (state->id_ah_op == IBD_OP_ERRORED) { 1493 *err = EFAULT; 1494 state->id_ah_error++; 1495 } else { 1496 /* 1497 * IBD_OP_ROUTERED case: We need to send to the 1498 * all-router MCG. If we can find the AH for 1499 * the mcg, the Tx will be attempted. If we 1500 * do not find the AH, we return NORESOURCES 1501 * to retry. 1502 */ 1503 ipoib_mac_t routermac; 1504 1505 (void) ibd_get_allroutergroup(state, mac, &routermac); 1506 ptr = ibd_acache_find(state, &routermac, B_TRUE, 1507 numwqe); 1508 } 1509 state->id_ah_op = IBD_OP_NOTSTARTED; 1510 } else if ((state->id_ah_op != IBD_OP_ONGOING) && 1511 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) { 1512 /* 1513 * This case can happen when we get a higher band 1514 * packet. The easiest way is to reset the state machine 1515 * to accommodate the higher priority packet. 1516 */ 1517 state->id_ah_op = IBD_OP_NOTSTARTED; 1518 } 1519 mutex_exit(&state->id_ac_mutex); 1520 1521 return (ptr); 1522 } 1523 1524 /* 1525 * Grab a not-currently-in-use AH/PathRecord from the active 1526 * list to recycle to a new destination. Only the async thread 1527 * executes this code. 1528 */ 1529 static ibd_ace_t * 1530 ibd_acache_get_unref(ibd_state_t *state) 1531 { 1532 ibd_ace_t *ptr = list_head(&state->id_ah_active); 1533 1534 ASSERT(mutex_owned(&state->id_ac_mutex)); 1535 1536 /* 1537 * Do plain linear search. 1538 */ 1539 while (ptr != NULL) { 1540 /* 1541 * Note that it is possible that the "cycle" bit 1542 * is set on the AH w/o any reference count. The 1543 * mcg must have been deleted, and the tx cleanup 1544 * just decremented the reference count to 0, but 1545 * hasn't gotten around to grabbing the id_ac_mutex 1546 * to move the AH into the free list. 1547 */ 1548 if (GET_REF(ptr) == 0) { 1549 IBD_ACACHE_PULLOUT_ACTIVE(state, ptr); 1550 break; 1551 } 1552 ptr = list_next(&state->id_ah_active, ptr); 1553 } 1554 return (ptr); 1555 } 1556 1557 /* 1558 * Invoked to clean up AH from active list in case of multicast 1559 * disable and to handle sendonly memberships during mcg traps. 1560 * And for port up processing for multicast and unicast AHs. 1561 * Normally, the AH is taken off the active list, and put into 1562 * the free list to be recycled for a new destination. In case 1563 * Tx requests on the AH have not completed yet, the AH is marked 1564 * for reaping (which will put the AH on the free list) once the Tx's 1565 * complete; in this case, depending on the "force" input, we take 1566 * out the AH from the active list right now, or leave it also for 1567 * the reap operation. Returns TRUE if the AH is taken off the active 1568 * list (and either put into the free list right now, or arranged for 1569 * later), FALSE otherwise. 1570 */ 1571 static boolean_t 1572 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force) 1573 { 1574 ibd_ace_t *acactive; 1575 boolean_t ret = B_TRUE; 1576 1577 ASSERT(mutex_owned(&state->id_ac_mutex)); 1578 1579 if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) { 1580 1581 /* 1582 * Note that the AH might already have the cycle bit set 1583 * on it; this might happen if sequences of multicast 1584 * enables and disables are coming so fast, that posted 1585 * Tx's to the mcg have not completed yet, and the cycle 1586 * bit is set successively by each multicast disable. 
1587 */ 1588 if (SET_CYCLE_IF_REF(acactive)) { 1589 if (!force) { 1590 /* 1591 * The ace is kept on the active list, further 1592 * Tx's can still grab a reference on it; the 1593 * ace is reaped when all pending Tx's 1594 * referencing the AH complete. 1595 */ 1596 ret = B_FALSE; 1597 } else { 1598 /* 1599 * In the mcg trap case, we always pull the 1600 * AH from the active list. And also the port 1601 * up multi/unicast case. 1602 */ 1603 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1604 acactive->ac_mce = NULL; 1605 } 1606 } else { 1607 /* 1608 * Determined the ref count is 0, thus reclaim 1609 * immediately after pulling out the ace from 1610 * the active list. 1611 */ 1612 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1613 acactive->ac_mce = NULL; 1614 IBD_ACACHE_INSERT_FREE(state, acactive); 1615 } 1616 1617 } 1618 return (ret); 1619 } 1620 1621 /* 1622 * Helper function for async path record lookup. If we are trying to 1623 * Tx to a MCG, check our membership, possibly trying to join the 1624 * group if required. If that fails, try to send the packet to the 1625 * all router group (indicated by the redirect output), pointing 1626 * the input mac address to the router mcg address. 1627 */ 1628 static ibd_mce_t * 1629 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect) 1630 { 1631 ib_gid_t mgid; 1632 ibd_mce_t *mce; 1633 ipoib_mac_t routermac; 1634 1635 *redirect = B_FALSE; 1636 ibd_n2h_gid(mac, &mgid); 1637 1638 /* 1639 * Check the FullMember+SendOnlyNonMember list. 1640 * Since we are the only one who manipulates the 1641 * id_mc_full list, no locks are needed. 1642 */ 1643 mce = IBD_MCACHE_FIND_FULL(state, mgid); 1644 if (mce != NULL) { 1645 DPRINT(4, "ibd_async_mcache : already joined to group"); 1646 return (mce); 1647 } 1648 1649 /* 1650 * Not found; try to join(SendOnlyNonMember) and attach. 1651 */ 1652 DPRINT(4, "ibd_async_mcache : not joined to group"); 1653 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1654 NULL) { 1655 DPRINT(4, "ibd_async_mcache : nonmem joined to group"); 1656 return (mce); 1657 } 1658 1659 /* 1660 * MCGroup not present; try to join the all-router group. If 1661 * any of the following steps succeed, we will be redirecting 1662 * to the all router group. 1663 */ 1664 DPRINT(4, "ibd_async_mcache : nonmem join failed"); 1665 if (!ibd_get_allroutergroup(state, mac, &routermac)) 1666 return (NULL); 1667 *redirect = B_TRUE; 1668 ibd_n2h_gid(&routermac, &mgid); 1669 bcopy(&routermac, mac, IPOIB_ADDRL); 1670 DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n", 1671 mgid.gid_prefix, mgid.gid_guid); 1672 1673 /* 1674 * Are we already joined to the router group? 1675 */ 1676 if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) { 1677 DPRINT(4, "ibd_async_mcache : using already joined router" 1678 "group\n"); 1679 return (mce); 1680 } 1681 1682 /* 1683 * Can we join(SendOnlyNonMember) the router group? 1684 */ 1685 DPRINT(4, "ibd_async_mcache : attempting join to router grp"); 1686 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1687 NULL) { 1688 DPRINT(4, "ibd_async_mcache : joined to router grp"); 1689 return (mce); 1690 } 1691 1692 return (NULL); 1693 } 1694 1695 /* 1696 * Async path record lookup code. 
1697 */ 1698 static void 1699 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac) 1700 { 1701 ibd_ace_t *ce; 1702 ibd_mce_t *mce = NULL; 1703 ibt_path_attr_t path_attr; 1704 ibt_path_info_t path_info; 1705 ib_gid_t destgid; 1706 char ret = IBD_OP_NOTSTARTED; 1707 1708 DPRINT(4, "ibd_async_acache : %08X:%08X:%08X:%08X:%08X", 1709 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1710 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1711 htonl(mac->ipoib_gidsuff[1])); 1712 1713 /* 1714 * Check whether we are trying to transmit to a MCG. 1715 * In that case, we need to make sure we are a member of 1716 * the MCG. 1717 */ 1718 if (mac->ipoib_qpn == htonl(IB_MC_QPN)) { 1719 boolean_t redirected; 1720 1721 /* 1722 * If we can not find or join the group or even 1723 * redirect, error out. 1724 */ 1725 if ((mce = ibd_async_mcache(state, mac, &redirected)) == 1726 NULL) { 1727 state->id_ah_op = IBD_OP_ERRORED; 1728 return; 1729 } 1730 1731 /* 1732 * If we got redirected, we need to determine whether 1733 * the AH for the new mcg is in the cache already, and 1734 * not pull it in then; otherwise proceed to get the 1735 * path for the new mcg. There is no guarantee that 1736 * if the AH is currently in the cache, it will still be 1737 * there when we look in ibd_acache_lookup(), but that's 1738 * okay, we will come back here. 1739 */ 1740 if (redirected) { 1741 ret = IBD_OP_ROUTERED; 1742 DPRINT(4, "ibd_async_acache : redirected to " 1743 "%08X:%08X:%08X:%08X:%08X", 1744 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1745 htonl(mac->ipoib_gidpref[1]), 1746 htonl(mac->ipoib_gidsuff[0]), 1747 htonl(mac->ipoib_gidsuff[1])); 1748 1749 mutex_enter(&state->id_ac_mutex); 1750 if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) { 1751 state->id_ah_op = IBD_OP_ROUTERED; 1752 mutex_exit(&state->id_ac_mutex); 1753 DPRINT(4, "ibd_async_acache : router AH found"); 1754 return; 1755 } 1756 mutex_exit(&state->id_ac_mutex); 1757 } 1758 } 1759 1760 /* 1761 * Get an AH from the free list. 1762 */ 1763 mutex_enter(&state->id_ac_mutex); 1764 if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) { 1765 /* 1766 * No free ones; try to grab an unreferenced active 1767 * one. Maybe we need to make the active list LRU, 1768 * but that will create more work for Tx callbacks. 1769 * Is there a way of not having to pull out the 1770 * entry from the active list, but just indicate it 1771 * is being recycled? Yes, but that creates one more 1772 * check in the fast lookup path. 1773 */ 1774 if ((ce = ibd_acache_get_unref(state)) == NULL) { 1775 /* 1776 * Pretty serious shortage now. 1777 */ 1778 state->id_ah_op = IBD_OP_NOTSTARTED; 1779 mutex_exit(&state->id_ac_mutex); 1780 DPRINT(10, "ibd_async_acache : failed to find AH " 1781 "slot\n"); 1782 return; 1783 } 1784 /* 1785 * We could check whether ac_mce points to a SendOnly 1786 * member and drop that membership now. Or do it lazily 1787 * at detach time. 1788 */ 1789 ce->ac_mce = NULL; 1790 } 1791 mutex_exit(&state->id_ac_mutex); 1792 ASSERT(ce->ac_mce == NULL); 1793 1794 /* 1795 * Update the entry. 
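 * Copy the destination address into the ace, look up a single path record from our SGID to the destination GID, and program the UD destination handle (ac_dest) with the qkey from the broadcast group info and the returned address vector. (An ipoib_mac_t is the 20-byte IPoIB hardware address: a 4-byte QPN field followed by the 16-byte GID, which is what ibd_n2h_gid() extracts.)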
1796 */ 1797 bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL); 1798 1799 bzero(&path_info, sizeof (path_info)); 1800 bzero(&path_attr, sizeof (ibt_path_attr_t)); 1801 path_attr.pa_sgid = state->id_sgid; 1802 path_attr.pa_num_dgids = 1; 1803 ibd_n2h_gid(&ce->ac_mac, &destgid); 1804 path_attr.pa_dgids = &destgid; 1805 path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 1806 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 1807 &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) { 1808 DPRINT(10, "ibd_async_acache : failed in ibt_get_paths"); 1809 goto error; 1810 } 1811 if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey, 1812 ntohl(ce->ac_mac.ipoib_qpn), 1813 &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) { 1814 DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest"); 1815 goto error; 1816 } 1817 1818 /* 1819 * mce is set whenever an AH is being associated with a 1820 * MCG; this will come in handy when we leave the MCG. The 1821 * lock protects Tx fastpath from scanning the active list. 1822 */ 1823 if (mce != NULL) 1824 ce->ac_mce = mce; 1825 mutex_enter(&state->id_ac_mutex); 1826 IBD_ACACHE_INSERT_ACTIVE(state, ce); 1827 state->id_ah_op = ret; 1828 mutex_exit(&state->id_ac_mutex); 1829 return; 1830 error: 1831 /* 1832 * We might want to drop SendOnly membership here if we 1833 * joined above. The lock protects Tx callbacks inserting 1834 * into the free list. 1835 */ 1836 mutex_enter(&state->id_ac_mutex); 1837 state->id_ah_op = IBD_OP_ERRORED; 1838 IBD_ACACHE_INSERT_FREE(state, ce); 1839 mutex_exit(&state->id_ac_mutex); 1840 } 1841 1842 /* 1843 * While restoring port's presence on the subnet on a port up, it is possible 1844 * that the port goes down again. 1845 */ 1846 static void 1847 ibd_async_link(ibd_state_t *state, ibd_req_t *req) 1848 { 1849 ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr; 1850 link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN : 1851 LINK_STATE_UP; 1852 ibd_mce_t *mce, *pmce; 1853 ibd_ace_t *ace, *pace; 1854 1855 DPRINT(10, "ibd_async_link(): %d", opcode); 1856 1857 /* 1858 * On a link up, revalidate the link speed/width. No point doing 1859 * this on a link down, since we will be unable to do SA operations, 1860 * defaulting to the lowest speed. Also notice that we update our 1861 * notion of speed before calling mac_link_update(), which will do 1862 * necessary higher level notifications for speed changes. 1863 */ 1864 if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) { 1865 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) 1866 state->id_link_speed = ibd_get_portspeed(state); 1867 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 1868 } 1869 1870 /* 1871 * Do all the work required to establish our presence on 1872 * the subnet. 1873 */ 1874 if (opcode == IBD_LINK_UP_ABSENT) { 1875 /* 1876 * If in promiscuous mode ... 1877 */ 1878 if (state->id_prom_op == IBD_OP_COMPLETED) { 1879 /* 1880 * Drop all nonmembership. 1881 */ 1882 ibd_async_unsetprom(state); 1883 1884 /* 1885 * Then, try to regain nonmembership to all mcg's. 1886 */ 1887 ibd_async_setprom(state); 1888 1889 } 1890 1891 /* 1892 * Drop all sendonly membership (which also gets rid of the 1893 * AHs); try to reacquire all full membership.
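 * The id_mc_full walk below leaves send-only entries (their AHs are recycled further down) and rejoins full-member entries via ibd_reacquire_group().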
1894 */ 1895 mce = list_head(&state->id_mc_full); 1896 while ((pmce = mce) != NULL) { 1897 mce = list_next(&state->id_mc_full, mce); 1898 if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON) 1899 ibd_leave_group(state, 1900 pmce->mc_info.mc_adds_vect.av_dgid, 1901 IB_MC_JSTATE_SEND_ONLY_NON); 1902 else 1903 ibd_reacquire_group(state, pmce); 1904 } 1905 1906 /* 1907 * Recycle all active AHs to free list (and if there are 1908 * pending posts, make sure they will go into the free list 1909 * once the Tx's complete). Grab the lock to prevent 1910 * concurrent Tx's as well as Tx cleanups. 1911 */ 1912 mutex_enter(&state->id_ac_mutex); 1913 ace = list_head(&state->id_ah_active); 1914 while ((pace = ace) != NULL) { 1915 boolean_t cycled; 1916 1917 ace = list_next(&state->id_ah_active, ace); 1918 mce = pace->ac_mce; 1919 cycled = ibd_acache_recycle(state, &pace->ac_mac, 1920 B_TRUE); 1921 /* 1922 * If this is for an mcg, it must be for a fullmember, 1923 * since we got rid of send-only members above when 1924 * processing the mce list. 1925 */ 1926 ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate == 1927 IB_MC_JSTATE_FULL))); 1928 1929 /* 1930 * Check if the fullmember mce needs to be torn down, 1931 * ie whether the DLPI disable has already been done. 1932 * If so, do some of the work of tx_cleanup, namely 1933 * causing leave (which will fail), detach and 1934 * mce-freeing. tx_cleanup will put the AH into free 1935 * list. The reason to duplicate some of this 1936 * tx_cleanup work is because we want to delete the 1937 * AH right now instead of waiting for tx_cleanup, to 1938 * force subsequent Tx's to reacquire an AH. 1939 */ 1940 if ((mce != NULL) && (mce->mc_fullreap)) 1941 ibd_async_reap_group(state, mce, 1942 mce->mc_info.mc_adds_vect.av_dgid, 1943 mce->mc_jstate); 1944 } 1945 mutex_exit(&state->id_ac_mutex); 1946 } 1947 1948 /* 1949 * mac handle is guaranteed to exist since driver does ibt_close_hca() 1950 * (which stops further events from being delivered) before 1951 * mac_unregister(). At this point, it is guaranteed that mac_register 1952 * has already been done. 1953 */ 1954 mutex_enter(&state->id_link_mutex); 1955 state->id_link_state = lstate; 1956 mac_link_update(state->id_mh, lstate); 1957 mutex_exit(&state->id_link_mutex); 1958 1959 ibd_async_done(state); 1960 } 1961 1962 /* 1963 * Check the pkey table to see if we can find the pkey we're looking for. 1964 * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on 1965 * failure. 1966 */ 1967 static int 1968 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey, 1969 uint16_t *pkix) 1970 { 1971 uint16_t ndx; 1972 1973 ASSERT(pkix != NULL); 1974 1975 for (ndx = 0; ndx < pkey_tbl_sz; ndx++) { 1976 if (pkey_tbl[ndx] == pkey) { 1977 *pkix = ndx; 1978 return (0); 1979 } 1980 } 1981 return (-1); 1982 } 1983 1984 /* 1985 * When the link is notified up, we need to do a few things, based 1986 * on the port's current p_init_type_reply claiming a reinit has been 1987 * done or not. The reinit steps are: 1988 * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify 1989 * the old Pkey and GID0 are correct. 1990 * 2. Register for mcg traps (already done by ibmf). 1991 * 3. If PreservePresenceReply indicates the SM has restored port's presence 1992 * in subnet, nothing more to do. Else go to next steps (on async daemon). 1993 * 4. Give up all sendonly memberships. 1994 * 5. Acquire all full memberships. 1995 * 6. In promiscuous mode, acquire all non memberships. 1996 * 7. 
Recycle all AHs to free list. 1997 */ 1998 static void 1999 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code) 2000 { 2001 ibt_hca_portinfo_t *port_infop = NULL; 2002 ibt_status_t ibt_status; 2003 uint_t psize, port_infosz; 2004 ibd_link_op_t opcode; 2005 ibd_req_t *req; 2006 link_state_t new_link_state = LINK_STATE_UP; 2007 uint8_t itreply; 2008 uint16_t pkix; 2009 2010 /* 2011 * Do not send a request to the async daemon if it has not 2012 * yet been created or is being destroyed. If the async 2013 * daemon has not yet been created, we still need to track 2014 * last known state of the link. If this code races with the 2015 * detach path, then we are assured that the detach path has 2016 * not yet done the ibt_close_hca (which waits for all async 2017 * events to complete). If the code races with the attach path, 2018 * we need to validate the pkey/gid (in the link_up case) if 2019 * the initialization path has already set these up and created 2020 * IBTF resources based on the values. 2021 */ 2022 mutex_enter(&state->id_link_mutex); 2023 2024 /* 2025 * If the init code in ibd_m_start hasn't yet set up the 2026 * pkey/gid, nothing to do; that code will set the link state. 2027 */ 2028 if (state->id_link_state == LINK_STATE_UNKNOWN) { 2029 mutex_exit(&state->id_link_mutex); 2030 return; 2031 } 2032 2033 /* 2034 * If this routine was called in response to a port down event, 2035 * we just need to see if this should be informed. 2036 */ 2037 if (code == IBT_ERROR_PORT_DOWN) { 2038 new_link_state = LINK_STATE_DOWN; 2039 goto update_link_state; 2040 } 2041 2042 /* 2043 * If it's not a port down event we've received, try to get the port 2044 * attributes first. If we fail here, the port is as good as down. 2045 * Otherwise, if the link went down by the time the handler gets 2046 * here, give up - we cannot even validate the pkey/gid since those 2047 * are not valid and this is as bad as a port down anyway. 2048 */ 2049 ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 2050 &port_infop, &psize, &port_infosz); 2051 if ((ibt_status != IBT_SUCCESS) || (psize != 1) || 2052 (port_infop->p_linkstate != IBT_PORT_ACTIVE)) { 2053 new_link_state = LINK_STATE_DOWN; 2054 goto update_link_state; 2055 } 2056 2057 /* 2058 * Check the SM InitTypeReply flags. If both NoLoadReply and 2059 * PreserveContentReply are 0, we don't know anything about the 2060 * data loaded into the port attributes, so we need to verify 2061 * if gid0 and pkey are still valid. 2062 */ 2063 itreply = port_infop->p_init_type_reply; 2064 if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) && 2065 ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) { 2066 /* 2067 * Check to see if the subnet part of GID0 has changed. If 2068 * not, check the simple case first to see if the pkey 2069 * index is the same as before; finally check to see if the 2070 * pkey has been relocated to a different index in the table. 
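 * If the pkey did move to a new index, the code below restarts the interface (ibd_m_stop()/ibd_m_start()) so that the channel resources are re-created against the new pkey index.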
2071 */ 2072 if (bcmp(port_infop->p_sgid_tbl, 2073 &state->id_sgid, sizeof (ib_gid_t)) != 0) { 2074 2075 new_link_state = LINK_STATE_DOWN; 2076 2077 } else if (port_infop->p_pkey_tbl[state->id_pkix] == 2078 state->id_pkey) { 2079 2080 new_link_state = LINK_STATE_UP; 2081 2082 } else if (ibd_locate_pkey(port_infop->p_pkey_tbl, 2083 port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) { 2084 2085 ibt_free_portinfo(port_infop, port_infosz); 2086 mutex_exit(&state->id_link_mutex); 2087 2088 ibd_m_stop(state); 2089 if ((ibt_status = ibd_m_start(state)) != IBT_SUCCESS) { 2090 DPRINT(10, "link_mod: cannot " 2091 "restart, ret=%d", ibt_status); 2092 } 2093 return; 2094 } else { 2095 new_link_state = LINK_STATE_DOWN; 2096 } 2097 } 2098 2099 update_link_state: 2100 if (port_infop) { 2101 ibt_free_portinfo(port_infop, port_infosz); 2102 } 2103 2104 /* 2105 * If the old state is the same as the new state, nothing to do 2106 */ 2107 if (state->id_link_state == new_link_state) { 2108 mutex_exit(&state->id_link_mutex); 2109 return; 2110 } 2111 2112 /* 2113 * Ok, so there was a link state change; see if it's safe to ask 2114 * the async thread to do the work 2115 */ 2116 if (!ibd_async_safe(state)) { 2117 state->id_link_state = new_link_state; 2118 mutex_exit(&state->id_link_mutex); 2119 return; 2120 } 2121 2122 mutex_exit(&state->id_link_mutex); 2123 2124 /* 2125 * If we're reporting a link up, check InitTypeReply to see if 2126 * the SM has ensured that the port's presence in mcg, traps, 2127 * etc. is intact. 2128 */ 2129 if (new_link_state == LINK_STATE_DOWN) { 2130 opcode = IBD_LINK_DOWN; 2131 } else { 2132 if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == 2133 SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) { 2134 opcode = IBD_LINK_UP; 2135 } else { 2136 opcode = IBD_LINK_UP_ABSENT; 2137 } 2138 } 2139 2140 /* 2141 * Queue up a request for ibd_async_link() to handle this link 2142 * state change event 2143 */ 2144 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 2145 req->rq_ptr = (void *)opcode; 2146 ibd_queue_work_slot(state, req, IBD_ASYNC_LINK); 2147 } 2148 2149 /* 2150 * For the port up/down events, IBTL guarantees there will not be concurrent 2151 * invocations of the handler. IBTL might coalesce link transition events, 2152 * and not invoke the handler for _each_ up/down transition, but it will 2153 * invoke the handler with last known state 2154 */ 2155 static void 2156 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 2157 ibt_async_code_t code, ibt_async_event_t *event) 2158 { 2159 ibd_state_t *state = (ibd_state_t *)clnt_private; 2160 2161 switch (code) { 2162 case IBT_ERROR_CATASTROPHIC_CHAN: 2163 ibd_print_warn(state, "catastrophic channel error"); 2164 break; 2165 case IBT_ERROR_CQ: 2166 ibd_print_warn(state, "completion queue error"); 2167 break; 2168 case IBT_PORT_CHANGE_EVENT: 2169 /* 2170 * Events will be delivered to all instances that have 2171 * done ibt_open_hca() but not yet done ibt_close_hca(). 2172 * Only need to do work for our port; IBTF will deliver 2173 * events for other ports on the hca we have ibt_open_hca'ed 2174 * too. Note that id_port is initialized in ibd_attach() 2175 * before we do an ibt_open_hca() in ibd_attach(). 
2176 */ 2177 ASSERT(state->id_hca_hdl == hca_hdl); 2178 if (state->id_port != event->ev_port) 2179 break; 2180 2181 if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) == 2182 IBT_PORT_CHANGE_PKEY) { 2183 ibd_link_mod(state, code); 2184 } 2185 break; 2186 case IBT_ERROR_PORT_DOWN: 2187 case IBT_CLNT_REREG_EVENT: 2188 case IBT_EVENT_PORT_UP: 2189 /* 2190 * Events will be delivered to all instances that have 2191 * done ibt_open_hca() but not yet done ibt_close_hca(). 2192 * Only need to do work for our port; IBTF will deliver 2193 * events for other ports on the hca we have ibt_open_hca'ed 2194 * too. Note that id_port is initialized in ibd_attach() 2195 * before we do an ibt_open_hca() in ibd_attach(). 2196 */ 2197 ASSERT(state->id_hca_hdl == hca_hdl); 2198 if (state->id_port != event->ev_port) 2199 break; 2200 2201 ibd_link_mod(state, code); 2202 break; 2203 2204 case IBT_HCA_ATTACH_EVENT: 2205 case IBT_HCA_DETACH_EVENT: 2206 /* 2207 * When a new card is plugged to the system, attach_event is 2208 * invoked. Additionally, a cfgadm needs to be run to make the 2209 * card known to the system, and an ifconfig needs to be run to 2210 * plumb up any ibd interfaces on the card. In the case of card 2211 * unplug, a cfgadm is run that will trigger any RCM scripts to 2212 * unplumb the ibd interfaces on the card; when the card is 2213 * actually unplugged, the detach_event is invoked; 2214 * additionally, if any ibd instances are still active on the 2215 * card (eg there were no associated RCM scripts), driver's 2216 * detach routine is invoked. 2217 */ 2218 break; 2219 default: 2220 break; 2221 } 2222 } 2223 2224 static int 2225 ibd_register_mac(ibd_state_t *state, dev_info_t *dip) 2226 { 2227 mac_register_t *macp; 2228 int ret; 2229 2230 if ((macp = mac_alloc(MAC_VERSION)) == NULL) { 2231 DPRINT(10, "ibd_register_mac: mac_alloc() failed"); 2232 return (DDI_FAILURE); 2233 } 2234 2235 /* 2236 * Note that when we register with mac during attach, we don't 2237 * have the id_macaddr yet, so we'll simply be registering a 2238 * zero macaddr that we'll overwrite later during plumb (in 2239 * ibd_m_start()). Similar is the case with id_mtu - we'll 2240 * update the mac layer with the correct mtu during plumb. 2241 */ 2242 macp->m_type_ident = MAC_PLUGIN_IDENT_IB; 2243 macp->m_driver = state; 2244 macp->m_dip = dip; 2245 macp->m_src_addr = (uint8_t *)&state->id_macaddr; 2246 macp->m_callbacks = &ibd_m_callbacks; 2247 macp->m_min_sdu = 0; 2248 macp->m_max_sdu = IBD_DEF_MAX_SDU; 2249 2250 /* 2251 * Register ourselves with the GLDv3 interface 2252 */ 2253 if ((ret = mac_register(macp, &state->id_mh)) != 0) { 2254 mac_free(macp); 2255 DPRINT(10, 2256 "ibd_register_mac: mac_register() failed, ret=%d", ret); 2257 return (DDI_FAILURE); 2258 } 2259 2260 mac_free(macp); 2261 return (DDI_SUCCESS); 2262 } 2263 2264 static int 2265 ibd_record_capab(ibd_state_t *state, dev_info_t *dip) 2266 { 2267 ibt_hca_attr_t hca_attrs; 2268 ibt_status_t ibt_status; 2269 2270 /* 2271 * Query the HCA and fetch its attributes 2272 */ 2273 ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 2274 ASSERT(ibt_status == IBT_SUCCESS); 2275 2276 /* 2277 * 1. Set the Hardware Checksum capability. Currently we only consider 2278 * full checksum offload. 2279 */ 2280 if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL) == IBT_HCA_CKSUM_FULL) { 2281 state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL; 2282 } 2283 2284 /* 2285 * 2. 
Set LSO policy, capability and maximum length 2286 */ 2287 if (ddi_prop_get_int(DDI_DEV_T_ANY, dip, 2288 DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, IBD_PROP_LSO_POLICY, 1)) { 2289 state->id_lso_policy = B_TRUE; 2290 } else { 2291 state->id_lso_policy = B_FALSE; 2292 } 2293 if (hca_attrs.hca_max_lso_size > 0) { 2294 state->id_lso_capable = B_TRUE; 2295 if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN) 2296 state->id_lso_maxlen = IBD_LSO_MAXLEN; 2297 else 2298 state->id_lso_maxlen = hca_attrs.hca_max_lso_size; 2299 } else { 2300 state->id_lso_capable = B_FALSE; 2301 state->id_lso_maxlen = 0; 2302 } 2303 2304 /* 2305 * 3. Set Reserved L_Key capability 2306 */ 2307 if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) { 2308 state->id_hca_res_lkey_capab = 1; 2309 state->id_res_lkey = hca_attrs.hca_reserved_lkey; 2310 } 2311 2312 /* 2313 * 4. Set maximum sqseg value after checking to see if extended sgl 2314 * size information is provided by the hca 2315 */ 2316 if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) { 2317 state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz; 2318 } else { 2319 state->id_max_sqseg = hca_attrs.hca_max_sgl; 2320 } 2321 if (state->id_max_sqseg > IBD_MAX_SQSEG) { 2322 state->id_max_sqseg = IBD_MAX_SQSEG; 2323 } else if (state->id_max_sqseg < IBD_MAX_SQSEG) { 2324 ibd_print_warn(state, "Set #sgl = %d instead of default %d", 2325 state->id_max_sqseg, IBD_MAX_SQSEG); 2326 } 2327 2328 /* 2329 * 5. Set number of recv and send wqes after checking hca maximum 2330 * channel size 2331 */ 2332 if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE) { 2333 state->id_num_rwqe = hca_attrs.hca_max_chan_sz; 2334 } else { 2335 state->id_num_rwqe = IBD_NUM_RWQE; 2336 } 2337 if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE) { 2338 state->id_num_swqe = hca_attrs.hca_max_chan_sz; 2339 } else { 2340 state->id_num_swqe = IBD_NUM_SWQE; 2341 } 2342 2343 return (DDI_SUCCESS); 2344 } 2345 2346 static int 2347 ibd_unattach(ibd_state_t *state, dev_info_t *dip) 2348 { 2349 int instance; 2350 uint32_t progress = state->id_mac_state; 2351 ibt_status_t ret; 2352 2353 if (progress & IBD_DRV_MAC_REGISTERED) { 2354 (void) mac_unregister(state->id_mh); 2355 state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED); 2356 } 2357 2358 if (progress & IBD_DRV_PD_ALLOCD) { 2359 if ((ret = ibt_free_pd(state->id_hca_hdl, 2360 state->id_pd_hdl)) != IBT_SUCCESS) { 2361 ibd_print_warn(state, "failed to free " 2362 "protection domain, ret=%d", ret); 2363 } 2364 state->id_pd_hdl = NULL; 2365 state->id_mac_state &= (~IBD_DRV_PD_ALLOCD); 2366 } 2367 2368 if (progress & IBD_DRV_HCA_OPENED) { 2369 if ((ret = ibt_close_hca(state->id_hca_hdl)) != 2370 IBT_SUCCESS) { 2371 ibd_print_warn(state, "failed to close " 2372 "HCA device, ret=%d", ret); 2373 } 2374 state->id_hca_hdl = NULL; 2375 state->id_mac_state &= (~IBD_DRV_HCA_OPENED); 2376 } 2377 2378 if (progress & IBD_DRV_IBTL_ATTACH_DONE) { 2379 if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) { 2380 ibd_print_warn(state, 2381 "ibt_detach() failed, ret=%d", ret); 2382 } 2383 state->id_ibt_hdl = NULL; 2384 state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE); 2385 } 2386 2387 if (progress & IBD_DRV_TXINTR_ADDED) { 2388 ddi_remove_softintr(state->id_tx); 2389 state->id_tx = NULL; 2390 state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED); 2391 } 2392 2393 if (progress & IBD_DRV_RXINTR_ADDED) { 2394 ddi_remove_softintr(state->id_rx); 2395 state->id_rx = NULL; 2396 state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED); 2397 } 2398 2399 if (progress & IBD_DRV_STATE_INITIALIZED) { 2400 
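/* undo ibd_state_init(): destroy the mutexes, cv and req kmem cache */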
ibd_state_fini(state); 2401 state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED); 2402 } 2403 2404 instance = ddi_get_instance(dip); 2405 ddi_soft_state_free(ibd_list, instance); 2406 2407 return (DDI_SUCCESS); 2408 } 2409 2410 /* 2411 * Attach device to the IO framework. 2412 */ 2413 static int 2414 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2415 { 2416 ibd_state_t *state = NULL; 2417 ib_guid_t hca_guid; 2418 int instance; 2419 ibt_status_t ret; 2420 int rv; 2421 2422 /* 2423 * IBD doesn't support suspend/resume 2424 */ 2425 if (cmd != DDI_ATTACH) 2426 return (DDI_FAILURE); 2427 2428 /* 2429 * Allocate softstate structure 2430 */ 2431 instance = ddi_get_instance(dip); 2432 if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) 2433 return (DDI_FAILURE); 2434 state = ddi_get_soft_state(ibd_list, instance); 2435 2436 /* 2437 * Initialize mutexes and condition variables 2438 */ 2439 if (ibd_state_init(state, dip) != DDI_SUCCESS) { 2440 DPRINT(10, "ibd_attach: failed in ibd_state_init()"); 2441 goto attach_fail; 2442 } 2443 state->id_mac_state |= IBD_DRV_STATE_INITIALIZED; 2444 2445 /* 2446 * Allocate rx,tx softintr 2447 */ 2448 if (ibd_rx_softintr == 1) { 2449 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx, 2450 NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) { 2451 DPRINT(10, "ibd_attach: failed in " 2452 "ddi_add_softintr(id_rx), ret=%d", rv); 2453 goto attach_fail; 2454 } 2455 state->id_mac_state |= IBD_DRV_RXINTR_ADDED; 2456 } 2457 if (ibd_tx_softintr == 1) { 2458 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx, 2459 NULL, NULL, ibd_tx_recycle, 2460 (caddr_t)state)) != DDI_SUCCESS) { 2461 DPRINT(10, "ibd_attach: failed in " 2462 "ddi_add_softintr(id_tx), ret=%d", rv); 2463 goto attach_fail; 2464 } 2465 state->id_mac_state |= IBD_DRV_TXINTR_ADDED; 2466 } 2467 2468 /* 2469 * Obtain IBA P_Key, port number and HCA guid and validate 2470 * them (for P_Key, only full members are allowed as per 2471 * IPoIB specification; neither port number nor HCA guid 2472 * can be zero) 2473 */ 2474 if ((state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, 2475 "port-pkey", IB_PKEY_INVALID_LIMITED)) <= IB_PKEY_INVALID_FULL) { 2476 DPRINT(10, "ibd_attach: port device has wrong partition (0x%x)", 2477 state->id_pkey); 2478 goto attach_fail; 2479 } 2480 if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, 2481 "port-number", 0)) == 0) { 2482 DPRINT(10, "ibd_attach: invalid port number (%d)", 2483 state->id_port); 2484 goto attach_fail; 2485 } 2486 if ((hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0, 2487 "hca-guid", 0)) == 0) { 2488 DPRINT(10, "ibd_attach: port hca has invalid guid (0x%llx)", 2489 hca_guid); 2490 goto attach_fail; 2491 } 2492 2493 /* 2494 * Attach to IBTL 2495 */ 2496 if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state, 2497 &state->id_ibt_hdl)) != IBT_SUCCESS) { 2498 DPRINT(10, "ibd_attach: failed in ibt_attach(), ret=%d", ret); 2499 goto attach_fail; 2500 } 2501 state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE; 2502 2503 /* 2504 * Open the HCA 2505 */ 2506 if ((ret = ibt_open_hca(state->id_ibt_hdl, hca_guid, 2507 &state->id_hca_hdl)) != IBT_SUCCESS) { 2508 DPRINT(10, "ibd_attach: ibt_open_hca() failed, ret=%d", ret); 2509 goto attach_fail; 2510 } 2511 state->id_mac_state |= IBD_DRV_HCA_OPENED; 2512 2513 /* 2514 * Record capabilities 2515 */ 2516 (void) ibd_record_capab(state, dip); 2517 2518 /* 2519 * Allocate a protection domain on the HCA 2520 */ 2521 if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS, 2522 
&state->id_pd_hdl)) != IBT_SUCCESS) { 2523 DPRINT(10, "ibd_attach: ibt_alloc_pd() failed, ret=%d", ret); 2524 goto attach_fail; 2525 } 2526 state->id_mac_state |= IBD_DRV_PD_ALLOCD; 2527 2528 2529 /* 2530 * Register ibd interfaces with the Nemo framework 2531 */ 2532 if (ibd_register_mac(state, dip) != IBT_SUCCESS) { 2533 DPRINT(10, "ibd_attach: failed in ibd_register_mac()"); 2534 goto attach_fail; 2535 } 2536 state->id_mac_state |= IBD_DRV_MAC_REGISTERED; 2537 2538 /* 2539 * We're done with everything we could to make the attach 2540 * succeed. All the buffer allocations and IPoIB broadcast 2541 * group joins are deferred to when the interface instance 2542 * is actually plumbed to avoid wasting memory. 2543 */ 2544 return (DDI_SUCCESS); 2545 2546 attach_fail: 2547 ibd_unattach(state, dip); 2548 return (DDI_FAILURE); 2549 } 2550 2551 /* 2552 * Detach device from the IO framework. 2553 */ 2554 static int 2555 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2556 { 2557 ibd_state_t *state; 2558 int instance; 2559 2560 /* 2561 * IBD doesn't support suspend/resume 2562 */ 2563 if (cmd != DDI_DETACH) 2564 return (DDI_FAILURE); 2565 2566 /* 2567 * Get the instance softstate 2568 */ 2569 instance = ddi_get_instance(dip); 2570 state = ddi_get_soft_state(ibd_list, instance); 2571 2572 /* 2573 * Release all resources we're holding still. Note that if we'd 2574 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly 2575 * so far, we should find all the flags we need in id_mac_state. 2576 */ 2577 (void) ibd_unattach(state, dip); 2578 2579 return (DDI_SUCCESS); 2580 } 2581 2582 /* 2583 * Pre ibt_attach() driver initialization 2584 */ 2585 static int 2586 ibd_state_init(ibd_state_t *state, dev_info_t *dip) 2587 { 2588 char buf[64]; 2589 2590 mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL); 2591 state->id_link_state = LINK_STATE_UNKNOWN; 2592 2593 mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL); 2594 cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL); 2595 state->id_trap_stop = B_TRUE; 2596 state->id_trap_inprog = 0; 2597 2598 mutex_init(&state->id_cq_poll_lock, NULL, MUTEX_DRIVER, NULL); 2599 state->id_dip = dip; 2600 2601 mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL); 2602 2603 state->id_tx_list.dl_head = NULL; 2604 state->id_tx_list.dl_tail = NULL; 2605 state->id_tx_list.dl_pending_sends = B_FALSE; 2606 state->id_tx_list.dl_cnt = 0; 2607 mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2608 mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL); 2609 state->id_tx_busy = 0; 2610 2611 state->id_rx_list.dl_head = NULL; 2612 state->id_rx_list.dl_tail = NULL; 2613 state->id_rx_list.dl_bufs_outstanding = 0; 2614 state->id_rx_list.dl_cnt = 0; 2615 mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2616 mutex_init(&state->id_rxpost_lock, NULL, MUTEX_DRIVER, NULL); 2617 2618 (void) sprintf(buf, "ibd_req%d", ddi_get_instance(dip)); 2619 state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t), 2620 0, NULL, NULL, NULL, NULL, NULL, 0); 2621 2622 return (DDI_SUCCESS); 2623 } 2624 2625 /* 2626 * Post ibt_detach() driver deconstruction 2627 */ 2628 static void 2629 ibd_state_fini(ibd_state_t *state) 2630 { 2631 kmem_cache_destroy(state->id_req_kmc); 2632 2633 mutex_destroy(&state->id_rxpost_lock); 2634 mutex_destroy(&state->id_rx_list.dl_mutex); 2635 2636 mutex_destroy(&state->id_txpost_lock); 2637 mutex_destroy(&state->id_tx_list.dl_mutex); 2638 2639 mutex_destroy(&state->id_sched_lock); 2640 
mutex_destroy(&state->id_cq_poll_lock); 2641 2642 cv_destroy(&state->id_trap_cv); 2643 mutex_destroy(&state->id_trap_lock); 2644 mutex_destroy(&state->id_link_mutex); 2645 } 2646 2647 /* 2648 * Fetch link speed from SA for snmp ifspeed reporting. 2649 */ 2650 static uint64_t 2651 ibd_get_portspeed(ibd_state_t *state) 2652 { 2653 int ret; 2654 ibt_path_info_t path; 2655 ibt_path_attr_t path_attr; 2656 uint8_t num_paths; 2657 uint64_t ifspeed; 2658 2659 /* 2660 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire 2661 * translates to 2 Gbps data rate. Thus, 1X single data rate is 2662 * 2000000000. Start with that as default. 2663 */ 2664 ifspeed = 2000000000; 2665 2666 bzero(&path_attr, sizeof (path_attr)); 2667 2668 /* 2669 * Get the port speed from Loopback path information. 2670 */ 2671 path_attr.pa_dgids = &state->id_sgid; 2672 path_attr.pa_num_dgids = 1; 2673 path_attr.pa_sgid = state->id_sgid; 2674 2675 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 2676 &path_attr, 1, &path, &num_paths) != IBT_SUCCESS) 2677 goto earlydone; 2678 2679 if (num_paths < 1) 2680 goto earlydone; 2681 2682 /* 2683 * In case SA does not return an expected value, report the default 2684 * speed as 1X. 2685 */ 2686 ret = 1; 2687 switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) { 2688 case IBT_SRATE_2: /* 1X SDR i.e 2.5 Gbps */ 2689 ret = 1; 2690 break; 2691 case IBT_SRATE_10: /* 4X SDR or 1X QDR i.e 10 Gbps */ 2692 ret = 4; 2693 break; 2694 case IBT_SRATE_30: /* 12X SDR i.e 30 Gbps */ 2695 ret = 12; 2696 break; 2697 case IBT_SRATE_5: /* 1X DDR i.e 5 Gbps */ 2698 ret = 2; 2699 break; 2700 case IBT_SRATE_20: /* 4X DDR or 8X SDR i.e 20 Gbps */ 2701 ret = 8; 2702 break; 2703 case IBT_SRATE_40: /* 8X DDR or 4X QDR i.e 40 Gbps */ 2704 ret = 16; 2705 break; 2706 case IBT_SRATE_60: /* 12X DDR i.e 60 Gbps */ 2707 ret = 24; 2708 break; 2709 case IBT_SRATE_80: /* 8X QDR i.e 80 Gbps */ 2710 ret = 32; 2711 break; 2712 case IBT_SRATE_120: /* 12X QDR i.e 120 Gbps */ 2713 ret = 48; 2714 break; 2715 } 2716 2717 ifspeed *= ret; 2718 2719 earlydone: 2720 return (ifspeed); 2721 } 2722 2723 /* 2724 * Search input mcg list (id_mc_full or id_mc_non) for an entry 2725 * representing the input mcg mgid. 2726 */ 2727 static ibd_mce_t * 2728 ibd_mcache_find(ib_gid_t mgid, struct list *mlist) 2729 { 2730 ibd_mce_t *ptr = list_head(mlist); 2731 2732 /* 2733 * Do plain linear search. 2734 */ 2735 while (ptr != NULL) { 2736 if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid, 2737 sizeof (ib_gid_t)) == 0) 2738 return (ptr); 2739 ptr = list_next(mlist, ptr); 2740 } 2741 return (NULL); 2742 } 2743 2744 /* 2745 * Execute IBA JOIN. 2746 */ 2747 static ibt_status_t 2748 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce) 2749 { 2750 ibt_mcg_attr_t mcg_attr; 2751 2752 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 2753 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 2754 mcg_attr.mc_mgid = mgid; 2755 mcg_attr.mc_join_state = mce->mc_jstate; 2756 mcg_attr.mc_scope = state->id_scope; 2757 mcg_attr.mc_pkey = state->id_pkey; 2758 mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow; 2759 mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 2760 mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass; 2761 return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info, 2762 NULL, NULL)); 2763 } 2764 2765 /* 2766 * This code JOINs the port in the proper way (depending on the join 2767 * state) so that IBA fabric will forward mcg packets to/from the port. 
2768 * It also attaches the QPN to the mcg so it can receive those mcg 2769 * packets. This code makes sure not to attach the mcg to the QP if 2770 * that has been previously done due to the mcg being joined with a 2771 * different join state, even though this is not required by SWG_0216, 2772 * refid 3610. 2773 */ 2774 static ibd_mce_t * 2775 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 2776 { 2777 ibt_status_t ibt_status; 2778 ibd_mce_t *mce, *tmce, *omce = NULL; 2779 boolean_t do_attach = B_TRUE; 2780 2781 DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n", 2782 jstate, mgid.gid_prefix, mgid.gid_guid); 2783 2784 /* 2785 * For enable_multicast Full member joins, we need to do some 2786 * extra work. If there is already an mce on the list that 2787 * indicates full membership, that means the membership has 2788 * not yet been dropped (since the disable_multicast was issued) 2789 * because there are pending Tx's to the mcg; in that case, just 2790 * mark the mce not to be reaped when the Tx completion queues 2791 * an async reap operation. 2792 * 2793 * If there is already an mce on the list indicating sendonly 2794 * membership, try to promote to full membership. Be careful 2795 * not to deallocate the old mce, since there might be an AH 2796 * pointing to it; instead, update the old mce with new data 2797 * that tracks the full membership. 2798 */ 2799 if ((jstate == IB_MC_JSTATE_FULL) && ((omce = 2800 IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) { 2801 if (omce->mc_jstate == IB_MC_JSTATE_FULL) { 2802 ASSERT(omce->mc_fullreap); 2803 omce->mc_fullreap = B_FALSE; 2804 return (omce); 2805 } else { 2806 ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON); 2807 } 2808 } 2809 2810 /* 2811 * Allocate the ibd_mce_t to track this JOIN. 2812 */ 2813 mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP); 2814 mce->mc_fullreap = B_FALSE; 2815 mce->mc_jstate = jstate; 2816 2817 if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) { 2818 DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d", 2819 ibt_status); 2820 kmem_free(mce, sizeof (ibd_mce_t)); 2821 return (NULL); 2822 } 2823 2824 /* 2825 * Is an IBA attach required? Not if the interface is already joined 2826 * to the mcg in a different appropriate join state. 2827 */ 2828 if (jstate == IB_MC_JSTATE_NON) { 2829 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 2830 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 2831 do_attach = B_FALSE; 2832 } else if (jstate == IB_MC_JSTATE_FULL) { 2833 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 2834 do_attach = B_FALSE; 2835 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 2836 do_attach = B_FALSE; 2837 } 2838 2839 if (do_attach) { 2840 /* 2841 * Do the IBA attach. 2842 */ 2843 DPRINT(10, "ibd_join_group: ibt_attach_mcg \n"); 2844 if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl, 2845 &mce->mc_info)) != IBT_SUCCESS) { 2846 DPRINT(10, "ibd_join_group : failed qp attachment " 2847 "%d\n", ibt_status); 2848 /* 2849 * NOTE that we should probably preserve the join info 2850 * in the list and later try to leave again at detach 2851 * time. 2852 */ 2853 (void) ibt_leave_mcg(state->id_sgid, mgid, 2854 state->id_sgid, jstate); 2855 kmem_free(mce, sizeof (ibd_mce_t)); 2856 return (NULL); 2857 } 2858 } 2859 2860 /* 2861 * Insert the ibd_mce_t in the proper list. 
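 * Non-member joins go on the id_mc_non list; full-member and send-only joins both go on the id_mc_full list (inserted under id_mc_mutex below).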
2862 */ 2863 if (jstate == IB_MC_JSTATE_NON) { 2864 IBD_MCACHE_INSERT_NON(state, mce); 2865 } else { 2866 /* 2867 * Set up the mc_req fields used for reaping the 2868 * mcg in case of delayed tx completion (see 2869 * ibd_tx_cleanup()). Also done for sendonly join in 2870 * case we are promoted to fullmembership later and 2871 * keep using the same mce. 2872 */ 2873 mce->mc_req.rq_gid = mgid; 2874 mce->mc_req.rq_ptr = mce; 2875 /* 2876 * Check whether this is the case of trying to join 2877 * full member, and we were already joined send only. 2878 * We try to drop our SendOnly membership, but it is 2879 * possible that the mcg does not exist anymore (and 2880 * the subnet trap never reached us), so the leave 2881 * operation might fail. 2882 */ 2883 if (omce != NULL) { 2884 (void) ibt_leave_mcg(state->id_sgid, mgid, 2885 state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON); 2886 omce->mc_jstate = IB_MC_JSTATE_FULL; 2887 bcopy(&mce->mc_info, &omce->mc_info, 2888 sizeof (ibt_mcg_info_t)); 2889 kmem_free(mce, sizeof (ibd_mce_t)); 2890 return (omce); 2891 } 2892 mutex_enter(&state->id_mc_mutex); 2893 IBD_MCACHE_INSERT_FULL(state, mce); 2894 mutex_exit(&state->id_mc_mutex); 2895 } 2896 2897 return (mce); 2898 } 2899 2900 /* 2901 * Called during port up event handling to attempt to reacquire full 2902 * membership to an mcg. Stripped down version of ibd_join_group(). 2903 * Note that it is possible that the mcg might have gone away, and 2904 * gets recreated at this point. 2905 */ 2906 static void 2907 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce) 2908 { 2909 ib_gid_t mgid; 2910 2911 /* 2912 * If the mc_fullreap flag is set, or this join fails, a subsequent 2913 * reap/leave is going to try to leave the group. We could prevent 2914 * that by adding a boolean flag into ibd_mce_t, if required. 2915 */ 2916 if (mce->mc_fullreap) 2917 return; 2918 2919 mgid = mce->mc_info.mc_adds_vect.av_dgid; 2920 2921 DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix, 2922 mgid.gid_guid); 2923 2924 if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS) 2925 ibd_print_warn(state, "Failure on port up to rejoin " 2926 "multicast gid %016llx:%016llx", 2927 (u_longlong_t)mgid.gid_prefix, 2928 (u_longlong_t)mgid.gid_guid); 2929 } 2930 2931 /* 2932 * This code handles delayed Tx completion cleanups for mcg's to which 2933 * disable_multicast has been issued, regular mcg related cleanups during 2934 * disable_multicast, disable_promiscuous and mcg traps, as well as 2935 * cleanups during driver detach time. Depending on the join state, 2936 * it deletes the mce from the appropriate list and issues the IBA 2937 * leave/detach; except in the disable_multicast case when the mce 2938 * is left on the active list for a subsequent Tx completion cleanup. 2939 */ 2940 static void 2941 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid, 2942 uint8_t jstate) 2943 { 2944 ibd_mce_t *tmce; 2945 boolean_t do_detach = B_TRUE; 2946 2947 /* 2948 * Before detaching, we must check whether the other list 2949 * contains the mcg; if we detach blindly, the consumer 2950 * who set up the other list will also stop receiving 2951 * traffic. 2952 */ 2953 if (jstate == IB_MC_JSTATE_FULL) { 2954 /* 2955 * The following check is only relevant while coming 2956 * from the Tx completion path in the reap case.
2957 */ 2958 if (!mce->mc_fullreap) 2959 return; 2960 mutex_enter(&state->id_mc_mutex); 2961 IBD_MCACHE_PULLOUT_FULL(state, mce); 2962 mutex_exit(&state->id_mc_mutex); 2963 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 2964 do_detach = B_FALSE; 2965 } else if (jstate == IB_MC_JSTATE_NON) { 2966 IBD_MCACHE_PULLOUT_NON(state, mce); 2967 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 2968 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 2969 do_detach = B_FALSE; 2970 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 2971 mutex_enter(&state->id_mc_mutex); 2972 IBD_MCACHE_PULLOUT_FULL(state, mce); 2973 mutex_exit(&state->id_mc_mutex); 2974 do_detach = B_FALSE; 2975 } 2976 2977 /* 2978 * If we are reacting to a mcg trap and leaving our sendonly or 2979 * non membership, the mcg is possibly already gone, so attempting 2980 * to leave might fail. On the other hand, we must try to leave 2981 * anyway, since this might be a trap from long ago, and we could 2982 * have potentially sendonly joined to a recent incarnation of 2983 * the mcg and are about to lose track of this information. 2984 */ 2985 if (do_detach) { 2986 DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : " 2987 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 2988 (void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info); 2989 } 2990 2991 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate); 2992 kmem_free(mce, sizeof (ibd_mce_t)); 2993 } 2994 2995 /* 2996 * Async code executed due to multicast and promiscuous disable requests 2997 * and mcg trap handling; also executed during driver detach. Mostly, a 2998 * leave and detach is done; except for the fullmember case when Tx 2999 * requests are pending, whence arrangements are made for subsequent 3000 * cleanup on Tx completion. 3001 */ 3002 static void 3003 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 3004 { 3005 ipoib_mac_t mcmac; 3006 boolean_t recycled; 3007 ibd_mce_t *mce; 3008 3009 DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n", 3010 jstate, mgid.gid_prefix, mgid.gid_guid); 3011 3012 if (jstate == IB_MC_JSTATE_NON) { 3013 recycled = B_TRUE; 3014 mce = IBD_MCACHE_FIND_NON(state, mgid); 3015 /* 3016 * In case we are handling a mcg trap, we might not find 3017 * the mcg in the non list. 3018 */ 3019 if (mce == NULL) { 3020 return; 3021 } 3022 } else { 3023 mce = IBD_MCACHE_FIND_FULL(state, mgid); 3024 3025 /* 3026 * In case we are handling a mcg trap, make sure the trap 3027 * is not arriving late; if we have an mce that indicates 3028 * that we are already a fullmember, that would be a clear 3029 * indication that the trap arrived late (ie, is for a 3030 * previous incarnation of the mcg). 3031 */ 3032 if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) { 3033 if ((mce == NULL) || (mce->mc_jstate == 3034 IB_MC_JSTATE_FULL)) { 3035 return; 3036 } 3037 } else { 3038 ASSERT(jstate == IB_MC_JSTATE_FULL); 3039 3040 /* 3041 * If join group failed, mce will be NULL here. 3042 * This is because in the GLDv3 driver, set multicast 3043 * will always return success. 3044 */ 3045 if (mce == NULL) { 3046 return; 3047 } 3048 3049 mce->mc_fullreap = B_TRUE; 3050 } 3051 3052 /* 3053 * If no pending Tx's remain that reference the AH 3054 * for the mcg, recycle it from active to free list.
3055 * Else in the IB_MC_JSTATE_FULL case, just mark the AH, 3056 * so the last completing Tx will cause an async reap 3057 * operation to be invoked, at which time we will drop our 3058 * membership to the mcg so that the pending Tx's complete 3059 * successfully. Refer to comments on "AH and MCE active 3060 * list manipulation" at top of this file. The lock protects 3061 * against Tx fast path and Tx cleanup code. 3062 */ 3063 mutex_enter(&state->id_ac_mutex); 3064 ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid); 3065 recycled = ibd_acache_recycle(state, &mcmac, (jstate == 3066 IB_MC_JSTATE_SEND_ONLY_NON)); 3067 mutex_exit(&state->id_ac_mutex); 3068 } 3069 3070 if (recycled) { 3071 DPRINT(2, "ibd_leave_group : leave_group reaping : " 3072 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3073 ibd_async_reap_group(state, mce, mgid, jstate); 3074 } 3075 } 3076 3077 /* 3078 * Find the broadcast address as defined by IPoIB; implicitly 3079 * determines the IBA scope, mtu, tclass etc of the link the 3080 * interface is going to be a member of. 3081 */ 3082 static ibt_status_t 3083 ibd_find_bgroup(ibd_state_t *state) 3084 { 3085 ibt_mcg_attr_t mcg_attr; 3086 uint_t numg; 3087 uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL, 3088 IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL, 3089 IB_MC_SCOPE_GLOBAL }; 3090 int i, mcgmtu; 3091 boolean_t found = B_FALSE; 3092 int ret; 3093 ibt_mcg_info_t mcg_info; 3094 3095 state->id_bgroup_created = B_FALSE; 3096 3097 query_bcast_grp: 3098 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3099 mcg_attr.mc_pkey = state->id_pkey; 3100 state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK; 3101 3102 for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) { 3103 state->id_scope = mcg_attr.mc_scope = scopes[i]; 3104 3105 /* 3106 * Look for the IPoIB broadcast group. 3107 */ 3108 state->id_mgid.gid_prefix = 3109 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3110 ((uint64_t)state->id_scope << 48) | 3111 ((uint32_t)(state->id_pkey << 16))); 3112 mcg_attr.mc_mgid = state->id_mgid; 3113 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1, 3114 &state->id_mcinfo, &numg) == IBT_SUCCESS) { 3115 found = B_TRUE; 3116 break; 3117 } 3118 } 3119 3120 if (!found) { 3121 if (ibd_create_broadcast_group) { 3122 /* 3123 * If we created the broadcast group, but failed to 3124 * find it, we can't do anything except leave the 3125 * one we created and return failure. 3126 */ 3127 if (state->id_bgroup_created) { 3128 ibd_print_warn(state, "IPoIB broadcast group " 3129 "absent. 
Unable to query after create."); 3130 goto find_bgroup_fail; 3131 } 3132 3133 /* 3134 * Create the ipoib broadcast group if it didn't exist 3135 */ 3136 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3137 mcg_attr.mc_qkey = IBD_DEFAULT_QKEY; 3138 mcg_attr.mc_join_state = IB_MC_JSTATE_FULL; 3139 mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL; 3140 mcg_attr.mc_pkey = state->id_pkey; 3141 mcg_attr.mc_flow = 0; 3142 mcg_attr.mc_sl = 0; 3143 mcg_attr.mc_tclass = 0; 3144 state->id_mgid.gid_prefix = 3145 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3146 ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) | 3147 ((uint32_t)(state->id_pkey << 16))); 3148 mcg_attr.mc_mgid = state->id_mgid; 3149 3150 if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr, 3151 &mcg_info, NULL, NULL)) != IBT_SUCCESS) { 3152 ibd_print_warn(state, "IPoIB broadcast group " 3153 "absent, create failed: ret = %d\n", ret); 3154 state->id_bgroup_created = B_FALSE; 3155 return (IBT_FAILURE); 3156 } 3157 state->id_bgroup_created = B_TRUE; 3158 goto query_bcast_grp; 3159 } else { 3160 ibd_print_warn(state, "IPoIB broadcast group absent"); 3161 return (IBT_FAILURE); 3162 } 3163 } 3164 3165 /* 3166 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu. 3167 */ 3168 mcgmtu = (128 << state->id_mcinfo->mc_mtu); 3169 if (state->id_mtu < mcgmtu) { 3170 ibd_print_warn(state, "IPoIB broadcast group MTU %d " 3171 "greater than port's maximum MTU %d", mcgmtu, 3172 state->id_mtu); 3173 ibt_free_mcg_info(state->id_mcinfo, 1); 3174 goto find_bgroup_fail; 3175 } 3176 state->id_mtu = mcgmtu; 3177 3178 return (IBT_SUCCESS); 3179 3180 find_bgroup_fail: 3181 if (state->id_bgroup_created) { 3182 (void) ibt_leave_mcg(state->id_sgid, 3183 mcg_info.mc_adds_vect.av_dgid, state->id_sgid, 3184 IB_MC_JSTATE_FULL); 3185 } 3186 3187 return (IBT_FAILURE); 3188 } 3189 3190 static int 3191 ibd_alloc_tx_copybufs(ibd_state_t *state) 3192 { 3193 ibt_mr_attr_t mem_attr; 3194 3195 /* 3196 * Allocate one big chunk for all regular tx copy bufs 3197 */ 3198 state->id_tx_buf_sz = state->id_mtu; 3199 if (state->id_lso_policy && state->id_lso_capable && 3200 (IBD_TX_BUF_SZ > state->id_mtu)) { 3201 state->id_tx_buf_sz = IBD_TX_BUF_SZ; 3202 } 3203 3204 state->id_tx_bufs = kmem_zalloc(state->id_num_swqe * 3205 state->id_tx_buf_sz, KM_SLEEP); 3206 3207 /* 3208 * Do one memory registration on the entire txbuf area 3209 */ 3210 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs; 3211 mem_attr.mr_len = state->id_num_swqe * state->id_tx_buf_sz; 3212 mem_attr.mr_as = NULL; 3213 mem_attr.mr_flags = IBT_MR_SLEEP; 3214 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3215 &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) { 3216 DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed"); 3217 kmem_free(state->id_tx_bufs, 3218 state->id_num_swqe * state->id_tx_buf_sz); 3219 state->id_tx_bufs = NULL; 3220 return (DDI_FAILURE); 3221 } 3222 3223 return (DDI_SUCCESS); 3224 } 3225 3226 static int 3227 ibd_alloc_tx_lsobufs(ibd_state_t *state) 3228 { 3229 ibt_mr_attr_t mem_attr; 3230 ibd_lsobuf_t *buflist; 3231 ibd_lsobuf_t *lbufp; 3232 ibd_lsobuf_t *tail; 3233 ibd_lsobkt_t *bktp; 3234 uint8_t *membase; 3235 uint8_t *memp; 3236 uint_t memsz; 3237 int i; 3238 3239 /* 3240 * Allocate the lso bucket 3241 */ 3242 bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP); 3243 3244 /* 3245 * Allocate the entire lso memory and register it 3246 */ 3247 memsz = IBD_NUM_LSO_BUFS * IBD_LSO_BUFSZ; 3248 membase = kmem_zalloc(memsz, KM_SLEEP); 3249 3250 mem_attr.mr_vaddr = 
(uint64_t)(uintptr_t)membase; 3251 mem_attr.mr_len = memsz; 3252 mem_attr.mr_as = NULL; 3253 mem_attr.mr_flags = IBT_MR_SLEEP; 3254 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, 3255 &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) { 3256 DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed"); 3257 kmem_free(membase, memsz); 3258 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3259 return (DDI_FAILURE); 3260 } 3261 3262 /* 3263 * Now allocate the buflist. Note that the elements in the buflist and 3264 * the buffers in the lso memory have a permanent 1-1 relation, so we 3265 * can always derive the address of a buflist entry from the address of 3266 * an lso buffer. 3267 */ 3268 buflist = kmem_zalloc(IBD_NUM_LSO_BUFS * sizeof (ibd_lsobuf_t), 3269 KM_SLEEP); 3270 3271 /* 3272 * Set up the lso buf chain 3273 */ 3274 memp = membase; 3275 lbufp = buflist; 3276 for (i = 0; i < IBD_NUM_LSO_BUFS; i++) { 3277 lbufp->lb_isfree = 1; 3278 lbufp->lb_buf = memp; 3279 lbufp->lb_next = lbufp + 1; 3280 3281 tail = lbufp; 3282 3283 memp += IBD_LSO_BUFSZ; 3284 lbufp++; 3285 } 3286 tail->lb_next = NULL; 3287 3288 /* 3289 * Set up the LSO buffer information in ibd state 3290 */ 3291 bktp->bkt_bufl = buflist; 3292 bktp->bkt_free_head = buflist; 3293 bktp->bkt_mem = membase; 3294 bktp->bkt_nelem = IBD_NUM_LSO_BUFS; 3295 bktp->bkt_nfree = bktp->bkt_nelem; 3296 3297 state->id_lso = bktp; 3298 3299 return (DDI_SUCCESS); 3300 } 3301 3302 /* 3303 * Statically allocate Tx buffer list(s). 3304 */ 3305 static int 3306 ibd_init_txlist(ibd_state_t *state) 3307 { 3308 ibd_swqe_t *swqe; 3309 ibt_lkey_t lkey; 3310 int i; 3311 3312 if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS) 3313 return (DDI_FAILURE); 3314 3315 if (state->id_lso_policy && state->id_lso_capable) { 3316 if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS) 3317 state->id_lso_policy = B_FALSE; 3318 } 3319 3320 /* 3321 * Allocate and setup the swqe list 3322 */ 3323 lkey = state->id_tx_mr_desc.md_lkey; 3324 for (i = 0; i < state->id_num_swqe; i++) { 3325 if (ibd_alloc_swqe(state, &swqe, i, lkey) != DDI_SUCCESS) { 3326 DPRINT(10, "ibd_init_txlist: ibd_alloc_swqe failed"); 3327 ibd_fini_txlist(state); 3328 return (DDI_FAILURE); 3329 } 3330 3331 /* add to list */ 3332 state->id_tx_list.dl_cnt++; 3333 if (state->id_tx_list.dl_head == NULL) { 3334 swqe->swqe_prev = NULL; 3335 swqe->swqe_next = NULL; 3336 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); 3337 state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); 3338 } else { 3339 swqe->swqe_prev = state->id_tx_list.dl_tail; 3340 swqe->swqe_next = NULL; 3341 state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe); 3342 state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); 3343 } 3344 } 3345 3346 return (DDI_SUCCESS); 3347 } 3348 3349 static int 3350 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p, 3351 uint32_t *nds_p) 3352 { 3353 ibd_lsobkt_t *bktp; 3354 ibd_lsobuf_t *lbufp; 3355 ibd_lsobuf_t *nextp; 3356 ibt_lkey_t lso_lkey; 3357 uint_t frag_sz; 3358 uint_t num_needed; 3359 int i; 3360 3361 ASSERT(sgl_p != NULL); 3362 ASSERT(nds_p != NULL); 3363 ASSERT(req_sz != 0); 3364 3365 /* 3366 * Determine how many bufs we'd need for the size requested 3367 */ 3368 num_needed = req_sz / IBD_LSO_BUFSZ; 3369 if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0) 3370 num_needed++; 3371 3372 mutex_enter(&state->id_lso_lock); 3373 3374 /* 3375 * If we don't have enough lso bufs, return failure 3376 */ 3377 ASSERT(state->id_lso != NULL); 3378 bktp = state->id_lso; 3379 if (bktp->bkt_nfree < num_needed) { 3380 
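/* not enough free LSO buffers to cover req_sz; fail the acquire */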
mutex_exit(&state->id_lso_lock); 3381 return (-1); 3382 } 3383 3384 /* 3385 * Pick the first 'num_needed' bufs from the free list 3386 */ 3387 lso_lkey = bktp->bkt_mr_desc.md_lkey; 3388 lbufp = bktp->bkt_free_head; 3389 for (i = 0; i < num_needed; i++) { 3390 ASSERT(lbufp->lb_isfree != 0); 3391 ASSERT(lbufp->lb_buf != NULL); 3392 3393 nextp = lbufp->lb_next; 3394 3395 sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf; 3396 sgl_p[i].ds_key = lso_lkey; 3397 sgl_p[i].ds_len = IBD_LSO_BUFSZ; 3398 3399 lbufp->lb_isfree = 0; 3400 lbufp->lb_next = NULL; 3401 3402 lbufp = nextp; 3403 } 3404 bktp->bkt_free_head = lbufp; 3405 3406 /* 3407 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need 3408 * to adjust the last sgl entry's length. Since we know we need atleast 3409 * one, the i-1 use below is ok. 3410 */ 3411 if (frag_sz) { 3412 sgl_p[i-1].ds_len = frag_sz; 3413 } 3414 3415 /* 3416 * Update nfree count and return 3417 */ 3418 bktp->bkt_nfree -= num_needed; 3419 3420 mutex_exit(&state->id_lso_lock); 3421 3422 *nds_p = num_needed; 3423 3424 return (0); 3425 } 3426 3427 static void 3428 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds) 3429 { 3430 ibd_lsobkt_t *bktp; 3431 ibd_lsobuf_t *lbufp; 3432 uint8_t *lso_mem_end; 3433 uint_t ndx; 3434 int i; 3435 3436 mutex_enter(&state->id_lso_lock); 3437 3438 bktp = state->id_lso; 3439 ASSERT(bktp != NULL); 3440 3441 lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ; 3442 for (i = 0; i < nds; i++) { 3443 uint8_t *va; 3444 3445 va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va; 3446 ASSERT(va >= bktp->bkt_mem && va < lso_mem_end); 3447 3448 /* 3449 * Figure out the buflist element this sgl buffer corresponds 3450 * to and put it back at the head 3451 */ 3452 ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ; 3453 lbufp = bktp->bkt_bufl + ndx; 3454 3455 ASSERT(lbufp->lb_isfree == 0); 3456 ASSERT(lbufp->lb_buf == va); 3457 3458 lbufp->lb_isfree = 1; 3459 lbufp->lb_next = bktp->bkt_free_head; 3460 bktp->bkt_free_head = lbufp; 3461 } 3462 bktp->bkt_nfree += nds; 3463 3464 mutex_exit(&state->id_lso_lock); 3465 } 3466 3467 static void 3468 ibd_free_tx_copybufs(ibd_state_t *state) 3469 { 3470 /* 3471 * Unregister txbuf mr 3472 */ 3473 if (ibt_deregister_mr(state->id_hca_hdl, 3474 state->id_tx_mr_hdl) != IBT_SUCCESS) { 3475 DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed"); 3476 } 3477 state->id_tx_mr_hdl = NULL; 3478 3479 /* 3480 * Free txbuf memory 3481 */ 3482 kmem_free(state->id_tx_bufs, state->id_num_swqe * state->id_tx_buf_sz); 3483 state->id_tx_bufs = NULL; 3484 } 3485 3486 static void 3487 ibd_free_tx_lsobufs(ibd_state_t *state) 3488 { 3489 ibd_lsobkt_t *bktp; 3490 3491 mutex_enter(&state->id_lso_lock); 3492 3493 if ((bktp = state->id_lso) == NULL) { 3494 mutex_exit(&state->id_lso_lock); 3495 return; 3496 } 3497 3498 /* 3499 * First, free the buflist 3500 */ 3501 ASSERT(bktp->bkt_bufl != NULL); 3502 kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t)); 3503 3504 /* 3505 * Unregister the LSO memory and free it 3506 */ 3507 ASSERT(bktp->bkt_mr_hdl != NULL); 3508 if (ibt_deregister_mr(state->id_hca_hdl, 3509 bktp->bkt_mr_hdl) != IBT_SUCCESS) { 3510 DPRINT(10, 3511 "ibd_free_lsobufs: ibt_deregister_mr failed"); 3512 } 3513 ASSERT(bktp->bkt_mem); 3514 kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ); 3515 3516 /* 3517 * Finally free the bucket 3518 */ 3519 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3520 state->id_lso = NULL; 3521 3522 mutex_exit(&state->id_lso_lock); 3523 } 3524 3525 /* 3526 
* Free the statically allocated Tx buffer list. 3527 */ 3528 static void 3529 ibd_fini_txlist(ibd_state_t *state) 3530 { 3531 ibd_swqe_t *node; 3532 3533 /* 3534 * Free the allocated swqes 3535 */ 3536 mutex_enter(&state->id_tx_list.dl_mutex); 3537 while (state->id_tx_list.dl_head != NULL) { 3538 node = WQE_TO_SWQE(state->id_tx_list.dl_head); 3539 state->id_tx_list.dl_head = node->swqe_next; 3540 ASSERT(state->id_tx_list.dl_cnt > 0); 3541 state->id_tx_list.dl_cnt--; 3542 ibd_free_swqe(state, node); 3543 } 3544 mutex_exit(&state->id_tx_list.dl_mutex); 3545 3546 ibd_free_tx_lsobufs(state); 3547 ibd_free_tx_copybufs(state); 3548 } 3549 3550 /* 3551 * Allocate a single send wqe and register it so it is almost 3552 * ready to be posted to the hardware. 3553 */ 3554 static int 3555 ibd_alloc_swqe(ibd_state_t *state, ibd_swqe_t **wqe, int ndx, ibt_lkey_t lkey) 3556 { 3557 ibd_swqe_t *swqe; 3558 3559 swqe = kmem_zalloc(sizeof (ibd_swqe_t), KM_SLEEP); 3560 *wqe = swqe; 3561 3562 swqe->swqe_type = IBD_WQE_SEND; 3563 swqe->swqe_next = NULL; 3564 swqe->swqe_prev = NULL; 3565 swqe->swqe_im_mblk = NULL; 3566 3567 swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t) 3568 (state->id_tx_bufs + ndx * state->id_tx_buf_sz); 3569 swqe->swqe_copybuf.ic_sgl.ds_key = lkey; 3570 swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */ 3571 3572 swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe; 3573 swqe->w_swr.wr_flags = IBT_WR_SEND_SIGNAL; 3574 swqe->w_swr.wr_trans = IBT_UD_SRV; 3575 3576 /* These are set in send */ 3577 swqe->w_swr.wr_nds = 0; 3578 swqe->w_swr.wr_sgl = NULL; 3579 swqe->w_swr.wr_opcode = IBT_WRC_SEND; 3580 3581 return (DDI_SUCCESS); 3582 } 3583 3584 /* 3585 * Free an allocated send wqe. 3586 */ 3587 /*ARGSUSED*/ 3588 static void 3589 ibd_free_swqe(ibd_state_t *state, ibd_swqe_t *swqe) 3590 { 3591 kmem_free(swqe, sizeof (ibd_swqe_t)); 3592 } 3593 3594 /* 3595 * Post a rwqe to the hardware and add it to the Rx list. The 3596 * "recycle" parameter indicates whether an old rwqe is being 3597 * recycled, or this is a new one. 3598 */ 3599 static int 3600 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe, boolean_t recycle) 3601 { 3602 ibt_status_t ibt_status; 3603 3604 if (recycle == B_FALSE) { 3605 mutex_enter(&state->id_rx_list.dl_mutex); 3606 if (state->id_rx_list.dl_head == NULL) { 3607 rwqe->rwqe_prev = NULL; 3608 rwqe->rwqe_next = NULL; 3609 state->id_rx_list.dl_head = RWQE_TO_WQE(rwqe); 3610 state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe); 3611 } else { 3612 rwqe->rwqe_prev = state->id_rx_list.dl_tail; 3613 rwqe->rwqe_next = NULL; 3614 state->id_rx_list.dl_tail->w_next = RWQE_TO_WQE(rwqe); 3615 state->id_rx_list.dl_tail = RWQE_TO_WQE(rwqe); 3616 } 3617 mutex_exit(&state->id_rx_list.dl_mutex); 3618 } 3619 3620 mutex_enter(&state->id_rxpost_lock); 3621 if (state->id_rx_busy) { 3622 rwqe->w_post_link = NULL; 3623 if (state->id_rx_head) 3624 *(state->id_rx_tailp) = (ibd_wqe_t *)rwqe; 3625 else 3626 state->id_rx_head = rwqe; 3627 state->id_rx_tailp = &(rwqe->w_post_link); 3628 } else { 3629 state->id_rx_busy = 1; 3630 do { 3631 mutex_exit(&state->id_rxpost_lock); 3632 3633 /* 3634 * Here we should add dl_cnt before post recv, because 3635 * we would have to make sure dl_cnt is updated before 3636 * the corresponding ibd_process_rx() is called. 
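 * (Once ibt_post_recv() returns, the completion handler can run at any time, so the count must already account for this rwqe.)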
3637 */ 3638 atomic_add_32(&state->id_rx_list.dl_cnt, 1); 3639 3640 ibt_status = ibt_post_recv(state->id_chnl_hdl, 3641 &rwqe->w_rwr, 1, NULL); 3642 if (ibt_status != IBT_SUCCESS) { 3643 (void) atomic_add_32_nv( 3644 &state->id_rx_list.dl_cnt, -1); 3645 ibd_print_warn(state, "ibd_post_recv: " 3646 "posting failed, ret=%d", ibt_status); 3647 return (DDI_FAILURE); 3648 } 3649 3650 mutex_enter(&state->id_rxpost_lock); 3651 rwqe = state->id_rx_head; 3652 if (rwqe) { 3653 state->id_rx_head = 3654 (ibd_rwqe_t *)(rwqe->w_post_link); 3655 } 3656 } while (rwqe); 3657 state->id_rx_busy = 0; 3658 } 3659 mutex_exit(&state->id_rxpost_lock); 3660 3661 return (DDI_SUCCESS); 3662 } 3663 3664 /* 3665 * Allocate the statically allocated Rx buffer list. 3666 */ 3667 static int 3668 ibd_init_rxlist(ibd_state_t *state) 3669 { 3670 ibd_rwqe_t *rwqe; 3671 int i; 3672 3673 for (i = 0; i < state->id_num_rwqe; i++) { 3674 if (ibd_alloc_rwqe(state, &rwqe) != DDI_SUCCESS) { 3675 ibd_fini_rxlist(state); 3676 return (DDI_FAILURE); 3677 } 3678 3679 if (ibd_post_recv(state, rwqe, B_FALSE) == DDI_FAILURE) { 3680 ibd_free_rwqe(state, rwqe); 3681 ibd_fini_rxlist(state); 3682 return (DDI_FAILURE); 3683 } 3684 } 3685 3686 return (DDI_SUCCESS); 3687 } 3688 3689 /* 3690 * Free the statically allocated Rx buffer list. 3691 * 3692 */ 3693 static void 3694 ibd_fini_rxlist(ibd_state_t *state) 3695 { 3696 ibd_rwqe_t *node; 3697 3698 mutex_enter(&state->id_rx_list.dl_mutex); 3699 while (state->id_rx_list.dl_head != NULL) { 3700 node = WQE_TO_RWQE(state->id_rx_list.dl_head); 3701 state->id_rx_list.dl_head = state->id_rx_list.dl_head->w_next; 3702 ASSERT(state->id_rx_list.dl_cnt > 0); 3703 state->id_rx_list.dl_cnt--; 3704 3705 ibd_free_rwqe(state, node); 3706 } 3707 mutex_exit(&state->id_rx_list.dl_mutex); 3708 } 3709 3710 /* 3711 * Allocate a single recv wqe and register it so it is almost 3712 * ready to be posted to the hardware. 
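*
* Each rwqe carries its own copy buffer of id_mtu + IPOIB_GRH_SIZE
* bytes, registered with the HCA via ibt_register_mr(), along with a
* desballoc'ed mblk wrapping that buffer (with ibd_freemsg_cb as the
* free routine) so the buffer can be recycled once the upper layer is
* done with it.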
3713 */ 3714 static int 3715 ibd_alloc_rwqe(ibd_state_t *state, ibd_rwqe_t **wqe) 3716 { 3717 ibt_mr_attr_t mem_attr; 3718 ibd_rwqe_t *rwqe; 3719 3720 if ((rwqe = kmem_zalloc(sizeof (ibd_rwqe_t), KM_NOSLEEP)) == NULL) { 3721 DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc"); 3722 return (DDI_FAILURE); 3723 } 3724 *wqe = rwqe; 3725 rwqe->rwqe_type = IBD_WQE_RECV; 3726 rwqe->w_state = state; 3727 rwqe->rwqe_next = NULL; 3728 rwqe->rwqe_prev = NULL; 3729 rwqe->w_freeing_wqe = B_FALSE; 3730 rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb; 3731 rwqe->w_freemsg_cb.free_arg = (char *)rwqe; 3732 3733 rwqe->rwqe_copybuf.ic_bufaddr = kmem_alloc(state->id_mtu + 3734 IPOIB_GRH_SIZE, KM_NOSLEEP); 3735 if (rwqe->rwqe_copybuf.ic_bufaddr == NULL) { 3736 DPRINT(10, "ibd_alloc_rwqe: failed in kmem_alloc"); 3737 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3738 return (DDI_FAILURE); 3739 } 3740 3741 if ((rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, 3742 state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb)) == 3743 NULL) { 3744 DPRINT(10, "ibd_alloc_rwqe : failed in desballoc()"); 3745 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3746 state->id_mtu + IPOIB_GRH_SIZE); 3747 rwqe->rwqe_copybuf.ic_bufaddr = NULL; 3748 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3749 return (DDI_FAILURE); 3750 } 3751 3752 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr; 3753 mem_attr.mr_len = state->id_mtu + IPOIB_GRH_SIZE; 3754 mem_attr.mr_as = NULL; 3755 mem_attr.mr_flags = IBT_MR_NOSLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3756 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3757 &rwqe->rwqe_copybuf.ic_mr_hdl, &rwqe->rwqe_copybuf.ic_mr_desc) != 3758 IBT_SUCCESS) { 3759 DPRINT(10, "ibd_alloc_rwqe : failed in ibt_register_mem()"); 3760 rwqe->w_freeing_wqe = B_TRUE; 3761 freemsg(rwqe->rwqe_im_mblk); 3762 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3763 state->id_mtu + IPOIB_GRH_SIZE); 3764 rwqe->rwqe_copybuf.ic_bufaddr = NULL; 3765 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3766 return (DDI_FAILURE); 3767 } 3768 3769 rwqe->rwqe_copybuf.ic_sgl.ds_va = 3770 (ib_vaddr_t)(uintptr_t)rwqe->rwqe_copybuf.ic_bufaddr; 3771 rwqe->rwqe_copybuf.ic_sgl.ds_key = 3772 rwqe->rwqe_copybuf.ic_mr_desc.md_lkey; 3773 rwqe->rwqe_copybuf.ic_sgl.ds_len = state->id_mtu + IPOIB_GRH_SIZE; 3774 rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe; 3775 rwqe->w_rwr.wr_nds = 1; 3776 rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl; 3777 3778 return (DDI_SUCCESS); 3779 } 3780 3781 /* 3782 * Free an allocated recv wqe. 3783 */ 3784 static void 3785 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) 3786 { 3787 if (ibt_deregister_mr(state->id_hca_hdl, 3788 rwqe->rwqe_copybuf.ic_mr_hdl) != IBT_SUCCESS) { 3789 DPRINT(10, "ibd_free_rwqe: failed in ibt_deregister_mr()"); 3790 return; 3791 } 3792 3793 /* 3794 * Indicate to the callback function that this rwqe/mblk 3795 * should not be recycled. The freemsg() will invoke 3796 * ibd_freemsg_cb(). 3797 */ 3798 if (rwqe->rwqe_im_mblk != NULL) { 3799 rwqe->w_freeing_wqe = B_TRUE; 3800 freemsg(rwqe->rwqe_im_mblk); 3801 } 3802 kmem_free(rwqe->rwqe_copybuf.ic_bufaddr, 3803 state->id_mtu + IPOIB_GRH_SIZE); 3804 rwqe->rwqe_copybuf.ic_bufaddr = NULL; 3805 kmem_free(rwqe, sizeof (ibd_rwqe_t)); 3806 } 3807 3808 /* 3809 * Delete the rwqe being freed from the rx list. 
3810 */
3811 static void
3812 ibd_delete_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
3813 {
3814 mutex_enter(&state->id_rx_list.dl_mutex);
3815 if (state->id_rx_list.dl_head == RWQE_TO_WQE(rwqe))
3816 state->id_rx_list.dl_head = rwqe->rwqe_next;
3817 else
3818 rwqe->rwqe_prev->w_next = rwqe->rwqe_next;
3819 if (state->id_rx_list.dl_tail == RWQE_TO_WQE(rwqe))
3820 state->id_rx_list.dl_tail = rwqe->rwqe_prev;
3821 else
3822 rwqe->rwqe_next->w_prev = rwqe->rwqe_prev;
3823 mutex_exit(&state->id_rx_list.dl_mutex);
3824 }
3825
3826 /*
3827 * IBA Rx/Tx completion queue handler. Guaranteed to be single
3828 * threaded and nonreentrant for this CQ. When using combined CQ,
3829 * this handles Tx and Rx completions. With separate CQs, this handles
3830 * only Rx completions.
3831 */
3832 /* ARGSUSED */
3833 static void
3834 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
3835 {
3836 ibd_state_t *state = (ibd_state_t *)arg;
3837
3838 atomic_add_64(&state->id_num_intrs, 1);
3839
3840 if (ibd_rx_softintr == 1)
3841 ddi_trigger_softintr(state->id_rx);
3842 else
3843 (void) ibd_intr((char *)state);
3844 }
3845
3846 /*
3847 * Separate CQ handler for Tx completions, when the Tx CQ is in
3848 * interrupt driven mode.
3849 */
3850 /* ARGSUSED */
3851 static void
3852 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
3853 {
3854 ibd_state_t *state = (ibd_state_t *)arg;
3855
3856 atomic_add_64(&state->id_num_intrs, 1);
3857
3858 if (ibd_tx_softintr == 1)
3859 ddi_trigger_softintr(state->id_tx);
3860 else
3861 (void) ibd_tx_recycle((char *)state);
3862 }
3863
3864 /*
3865 * Multicast group create/delete trap handler. These will be delivered
3866 * on a kernel thread (handling can thus block) and can be invoked
3867 * concurrently. The handler can be invoked anytime after it is
3868 * registered and before ibt_detach().
3869 */
3870 /* ARGSUSED */
3871 static void
3872 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code,
3873 ibt_subnet_event_t *event)
3874 {
3875 ibd_state_t *state = (ibd_state_t *)arg;
3876 ibd_req_t *req;
3877
3878 /*
3879 * The trap handler will get invoked once for every event for
3880 * every port. The input "gid" is the GID0 of the port the
3881 * trap came in on; we just need to act on traps that came
3882 * to our port, meaning the port on which the ipoib interface
3883 * resides. Since ipoib uses GID0 of the port, we just match
3884 * the gids to check whether we need to handle the trap.
3885 */
3886 if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0)
3887 return;
3888
3889 DPRINT(10, "ibd_notices_handler : %d\n", code);
3890
3891 switch (code) {
3892 case IBT_SM_EVENT_UNAVAILABLE:
3893 /*
3894 * If we are in promiscuous mode or have
3895 * sendnonmembers, we need to print a warning
3896 * message right now. Else, just store the
3897 * information, print when we enter promiscuous
3898 * mode or attempt nonmember send. We might
3899 * also want to stop caching sendnonmember.
3900 */
3901 ibd_print_warn(state, "IBA multicast support "
3902 "degraded due to unavailability of multicast "
3903 "traps");
3904 break;
3905 case IBT_SM_EVENT_AVAILABLE:
3906 /*
3907 * If we printed a warning message above or
3908 * while trying to nonmember send or get into
3909 * promiscuous mode, print an okay message.
3910 */ 3911 ibd_print_warn(state, "IBA multicast support " 3912 "restored due to availability of multicast " 3913 "traps"); 3914 break; 3915 case IBT_SM_EVENT_MCG_CREATED: 3916 case IBT_SM_EVENT_MCG_DELETED: 3917 /* 3918 * Common processing of creation/deletion traps. 3919 * First check if the instance is being 3920 * [de]initialized; back off then, without doing 3921 * anything more, since we are not sure if the 3922 * async thread is around, or whether we might 3923 * be racing with the detach code in ibd_m_stop() 3924 * that scans the mcg list. 3925 */ 3926 if (!ibd_async_safe(state)) 3927 return; 3928 3929 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 3930 req->rq_gid = event->sm_notice_gid; 3931 req->rq_ptr = (void *)code; 3932 ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP); 3933 break; 3934 } 3935 } 3936 3937 static void 3938 ibd_async_trap(ibd_state_t *state, ibd_req_t *req) 3939 { 3940 ib_gid_t mgid = req->rq_gid; 3941 ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr; 3942 3943 DPRINT(10, "ibd_async_trap : %d\n", code); 3944 3945 /* 3946 * Atomically search the nonmember and sendonlymember lists and 3947 * delete. 3948 */ 3949 ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON); 3950 3951 if (state->id_prom_op == IBD_OP_COMPLETED) { 3952 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 3953 3954 /* 3955 * If in promiscuous mode, try to join/attach to the new 3956 * mcg. Given the unreliable out-of-order mode of trap 3957 * delivery, we can never be sure whether it is a problem 3958 * if the join fails. Thus, we warn the admin of a failure 3959 * if this was a creation trap. Note that the trap might 3960 * actually be reporting a long past event, and the mcg 3961 * might already have been deleted, thus we might be warning 3962 * in vain. 3963 */ 3964 if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == 3965 NULL) && (code == IBT_SM_EVENT_MCG_CREATED)) 3966 ibd_print_warn(state, "IBA promiscuous mode missed " 3967 "new multicast gid %016llx:%016llx", 3968 (u_longlong_t)mgid.gid_prefix, 3969 (u_longlong_t)mgid.gid_guid); 3970 } 3971 3972 /* 3973 * Free the request slot allocated by the subnet event thread. 3974 */ 3975 ibd_async_done(state); 3976 } 3977 3978 /* 3979 * GLDv3 entry point to get capabilities. 3980 */ 3981 static boolean_t 3982 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) 3983 { 3984 ibd_state_t *state = arg; 3985 3986 switch (cap) { 3987 case MAC_CAPAB_HCKSUM: { 3988 uint32_t *txflags = cap_data; 3989 3990 /* 3991 * We either do full checksum or not do it at all 3992 */ 3993 if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) 3994 *txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4; 3995 else 3996 return (B_FALSE); 3997 break; 3998 } 3999 4000 case MAC_CAPAB_LSO: { 4001 mac_capab_lso_t *cap_lso = cap_data; 4002 4003 /* 4004 * In addition to the capability and policy, since LSO 4005 * relies on hw checksum, we'll not enable LSO if we 4006 * don't have hw checksum. Of course, if the HCA doesn't 4007 * provide the reserved lkey capability, enabling LSO will 4008 * actually affect performance adversely, so we'll disable 4009 * LSO even for that case. 
4010 */ 4011 if (!state->id_lso_policy || !state->id_lso_capable) 4012 return (B_FALSE); 4013 4014 if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0) 4015 return (B_FALSE); 4016 4017 if (state->id_hca_res_lkey_capab == 0) { 4018 ibd_print_warn(state, "no reserved-lkey capability, " 4019 "disabling LSO"); 4020 return (B_FALSE); 4021 } 4022 4023 cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; 4024 cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1; 4025 break; 4026 } 4027 4028 default: 4029 return (B_FALSE); 4030 } 4031 4032 return (B_TRUE); 4033 } 4034 4035 static int 4036 ibd_get_port_details(ibd_state_t *state) 4037 { 4038 ibt_hca_portinfo_t *port_infop; 4039 ibt_status_t ret; 4040 uint_t psize, port_infosz; 4041 4042 mutex_enter(&state->id_link_mutex); 4043 4044 /* 4045 * Query for port information 4046 */ 4047 ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 4048 &port_infop, &psize, &port_infosz); 4049 if ((ret != IBT_SUCCESS) || (psize != 1)) { 4050 mutex_exit(&state->id_link_mutex); 4051 DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() " 4052 "failed, ret=%d", ret); 4053 return (DDI_FAILURE); 4054 } 4055 4056 /* 4057 * If the link already went down by the time we get here, 4058 * give up 4059 */ 4060 if (port_infop->p_linkstate != IBT_PORT_ACTIVE) { 4061 mutex_exit(&state->id_link_mutex); 4062 ibt_free_portinfo(port_infop, port_infosz); 4063 DPRINT(10, "ibd_get_port_details: port is not active"); 4064 return (DDI_FAILURE); 4065 } 4066 4067 /* 4068 * If the link is active, verify the pkey 4069 */ 4070 if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port, 4071 state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) { 4072 mutex_exit(&state->id_link_mutex); 4073 ibt_free_portinfo(port_infop, port_infosz); 4074 DPRINT(10, "ibd_get_port_details: ibt_pkey2index " 4075 "failed, ret=%d", ret); 4076 return (DDI_FAILURE); 4077 } 4078 4079 state->id_mtu = (128 << port_infop->p_mtu); 4080 state->id_sgid = *port_infop->p_sgid_tbl; 4081 state->id_link_state = LINK_STATE_UP; 4082 4083 mutex_exit(&state->id_link_mutex); 4084 ibt_free_portinfo(port_infop, port_infosz); 4085 4086 /* 4087 * Now that the port is active, record the port speed 4088 */ 4089 state->id_link_speed = ibd_get_portspeed(state); 4090 4091 return (DDI_SUCCESS); 4092 } 4093 4094 static int 4095 ibd_alloc_cqs(ibd_state_t *state) 4096 { 4097 ibt_hca_attr_t hca_attrs; 4098 ibt_cq_attr_t cq_attr; 4099 ibt_status_t ret; 4100 uint32_t real_size; 4101 4102 ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 4103 ASSERT(ret == IBT_SUCCESS); 4104 4105 /* 4106 * Allocate Rx/combined CQ: 4107 * Theoretically, there is no point in having more than #rwqe 4108 * plus #swqe cqe's, except that the CQ will be signalled for 4109 * overflow when the last wqe completes, if none of the previous 4110 * cqe's have been polled. Thus, we allocate just a few less wqe's 4111 * to make sure such overflow does not occur. 4112 */ 4113 cq_attr.cq_sched = NULL; 4114 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 4115 4116 if (ibd_separate_cqs == 1) { 4117 /* 4118 * Allocate Receive CQ. 
4119 */ 4120 if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) { 4121 cq_attr.cq_size = state->id_num_rwqe + 1; 4122 } else { 4123 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 4124 state->id_num_rwqe = cq_attr.cq_size - 1; 4125 } 4126 4127 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 4128 &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) { 4129 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) " 4130 "failed, ret=%d\n", ret); 4131 return (DDI_FAILURE); 4132 } 4133 4134 if ((ret = ibt_modify_cq(state->id_rcq_hdl, 4135 ibd_rxcomp_count, ibd_rxcomp_usec, 0)) != IBT_SUCCESS) { 4136 DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt " 4137 "moderation failed, ret=%d\n", ret); 4138 } 4139 4140 state->id_rxwcs_size = state->id_num_rwqe + 1; 4141 state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * 4142 state->id_rxwcs_size, KM_SLEEP); 4143 4144 /* 4145 * Allocate Send CQ. 4146 */ 4147 if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) { 4148 cq_attr.cq_size = state->id_num_swqe + 1; 4149 } else { 4150 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 4151 state->id_num_swqe = cq_attr.cq_size - 1; 4152 } 4153 4154 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 4155 &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) { 4156 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) " 4157 "failed, ret=%d\n", ret); 4158 kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * 4159 state->id_rxwcs_size); 4160 (void) ibt_free_cq(state->id_rcq_hdl); 4161 return (DDI_FAILURE); 4162 } 4163 if ((ret = ibt_modify_cq(state->id_scq_hdl, 4164 IBD_TXCOMP_COUNT, IBD_TXCOMP_USEC, 0)) != IBT_SUCCESS) { 4165 DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt " 4166 "moderation failed, ret=%d\n", ret); 4167 } 4168 4169 state->id_txwcs_size = state->id_num_swqe + 1; 4170 state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) * 4171 state->id_txwcs_size, KM_SLEEP); 4172 } else { 4173 /* 4174 * Allocate combined Send/Receive CQ. 4175 */ 4176 if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 4177 state->id_num_swqe + 1)) { 4178 cq_attr.cq_size = state->id_num_rwqe + 4179 state->id_num_swqe + 1; 4180 } else { 4181 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 4182 state->id_num_rwqe = ((cq_attr.cq_size - 1) * 4183 state->id_num_rwqe) / (state->id_num_rwqe + 4184 state->id_num_swqe); 4185 state->id_num_swqe = cq_attr.cq_size - 1 - 4186 state->id_num_rwqe; 4187 } 4188 4189 state->id_rxwcs_size = cq_attr.cq_size; 4190 state->id_txwcs_size = state->id_rxwcs_size; 4191 4192 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 4193 &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) { 4194 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rscq) " 4195 "failed, ret=%d\n", ret); 4196 return (DDI_FAILURE); 4197 } 4198 state->id_scq_hdl = state->id_rcq_hdl; 4199 state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * 4200 state->id_rxwcs_size, KM_SLEEP); 4201 state->id_txwcs = state->id_rxwcs; 4202 } 4203 4204 /* 4205 * Print message in case we could not allocate as many wqe's 4206 * as was requested. 
4207 */ 4208 if (state->id_num_rwqe != IBD_NUM_RWQE) { 4209 ibd_print_warn(state, "Setting #rwqe = %d instead of default " 4210 "%d", state->id_num_rwqe, IBD_NUM_RWQE); 4211 } 4212 if (state->id_num_swqe != IBD_NUM_SWQE) { 4213 ibd_print_warn(state, "Setting #swqe = %d instead of default " 4214 "%d", state->id_num_swqe, IBD_NUM_SWQE); 4215 } 4216 4217 return (DDI_SUCCESS); 4218 } 4219 4220 static int 4221 ibd_setup_ud_channel(ibd_state_t *state) 4222 { 4223 ibt_ud_chan_alloc_args_t ud_alloc_attr; 4224 ibt_ud_chan_query_attr_t ud_chan_attr; 4225 ibt_status_t ret; 4226 4227 ud_alloc_attr.ud_flags = IBT_WR_SIGNALED; 4228 if (state->id_hca_res_lkey_capab) 4229 ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY; 4230 if (state->id_lso_policy && state->id_lso_capable) 4231 ud_alloc_attr.ud_flags |= IBT_USES_LSO; 4232 4233 ud_alloc_attr.ud_hca_port_num = state->id_port; 4234 ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg; 4235 ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG; 4236 ud_alloc_attr.ud_sizes.cs_sq = state->id_num_swqe; 4237 ud_alloc_attr.ud_sizes.cs_rq = state->id_num_rwqe; 4238 ud_alloc_attr.ud_qkey = state->id_mcinfo->mc_qkey; 4239 ud_alloc_attr.ud_scq = state->id_scq_hdl; 4240 ud_alloc_attr.ud_rcq = state->id_rcq_hdl; 4241 ud_alloc_attr.ud_pd = state->id_pd_hdl; 4242 ud_alloc_attr.ud_pkey_ix = state->id_pkix; 4243 ud_alloc_attr.ud_clone_chan = NULL; 4244 4245 if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS, 4246 &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) { 4247 DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() " 4248 "failed, ret=%d\n", ret); 4249 return (DDI_FAILURE); 4250 } 4251 4252 if ((ret = ibt_query_ud_channel(state->id_chnl_hdl, 4253 &ud_chan_attr)) != IBT_SUCCESS) { 4254 DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() " 4255 "failed, ret=%d\n", ret); 4256 (void) ibt_free_channel(state->id_chnl_hdl); 4257 return (DDI_FAILURE); 4258 } 4259 4260 state->id_qpnum = ud_chan_attr.ud_qpn; 4261 4262 return (DDI_SUCCESS); 4263 } 4264 4265 static int 4266 ibd_undo_m_start(ibd_state_t *state) 4267 { 4268 uint32_t progress = state->id_mac_state; 4269 uint_t attempts; 4270 ibt_status_t ret; 4271 ib_gid_t mgid; 4272 ibd_mce_t *mce; 4273 uint8_t jstate; 4274 4275 /* 4276 * Before we try to stop/undo whatever we did in ibd_m_start(), 4277 * we need to mark the link state as unknown to prevent nw 4278 * layer from using this instance for any new transfers. 4279 */ 4280 if (progress & IBD_DRV_PORT_DETAILS_OBTAINED) { 4281 state->id_link_state = LINK_STATE_UNKNOWN; 4282 mac_link_update(state->id_mh, state->id_link_state); 4283 4284 state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED); 4285 } 4286 4287 if (progress & IBD_DRV_STARTED) { 4288 state->id_mac_state &= (~IBD_DRV_STARTED); 4289 } 4290 4291 /* 4292 * First, stop receive interrupts; this stops the driver from 4293 * handing up buffers to higher layers. Wait for receive buffers 4294 * to be returned and give up after 5 seconds. 4295 */ 4296 if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) { 4297 4298 ibt_set_cq_handler(state->id_rcq_hdl, 0, 0); 4299 4300 attempts = 50; 4301 while (state->id_rx_list.dl_bufs_outstanding > 0) { 4302 delay(drv_usectohz(100000)); 4303 if (--attempts == 0) { 4304 /* 4305 * There are pending bufs with the network 4306 * layer and we have no choice but to wait 4307 * for them to be done with. Reap all the 4308 * Tx/Rx completions that were posted since 4309 * we turned off the notification and 4310 * return failure. 
4311 */ 4312 DPRINT(2, "ibd_undo_m_start: " 4313 "reclaiming failed"); 4314 ibd_poll_compq(state, state->id_rcq_hdl); 4315 ibt_set_cq_handler(state->id_rcq_hdl, 4316 ibd_rcq_handler, state); 4317 return (DDI_FAILURE); 4318 } 4319 } 4320 state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED); 4321 } 4322 4323 if (progress & IBD_DRV_SM_NOTICES_REGISTERED) { 4324 ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL); 4325 4326 mutex_enter(&state->id_trap_lock); 4327 state->id_trap_stop = B_TRUE; 4328 while (state->id_trap_inprog > 0) 4329 cv_wait(&state->id_trap_cv, &state->id_trap_lock); 4330 mutex_exit(&state->id_trap_lock); 4331 4332 state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED); 4333 } 4334 4335 if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) { 4336 /* 4337 * Flushing the channel ensures that all pending WQE's 4338 * are marked with flush_error and handed to the CQ. It 4339 * does not guarantee the invocation of the CQ handler. 4340 * This call is guaranteed to return successfully for 4341 * UD QPNs. 4342 */ 4343 if ((ret = ibt_flush_channel(state->id_chnl_hdl)) != 4344 IBT_SUCCESS) { 4345 DPRINT(10, "undo_m_start: flush_channel " 4346 "failed, ret=%d", ret); 4347 } 4348 4349 /* 4350 * Turn off Tx interrupts and poll. By the time the polling 4351 * returns an empty indicator, we are sure we have seen all 4352 * pending Tx callbacks. Note that after the call to 4353 * ibt_set_cq_handler() returns, the old handler is 4354 * guaranteed not to be invoked anymore. 4355 */ 4356 if (ibd_separate_cqs == 1) { 4357 ibt_set_cq_handler(state->id_scq_hdl, 0, 0); 4358 } 4359 ibd_poll_compq(state, state->id_scq_hdl); 4360 4361 state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED); 4362 } 4363 4364 if (progress & IBD_DRV_ASYNC_THR_CREATED) { 4365 /* 4366 * No new async requests will be posted since the device 4367 * link state has been marked as unknown; completion handlers 4368 * have been turned off, so Tx handler will not cause any 4369 * more IBD_ASYNC_REAP requests. 4370 * 4371 * Queue a request for the async thread to exit, which will 4372 * be serviced after any pending ones. This can take a while, 4373 * specially if the SM is unreachable, since IBMF will slowly 4374 * timeout each SM request issued by the async thread. Reap 4375 * the thread before continuing on, we do not want it to be 4376 * lingering in modunloaded code (or we could move the reap 4377 * to ibd_detach(), provided we keep track of the current 4378 * id_async_thrid somewhere safe). 4379 */ 4380 ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT); 4381 thread_join(state->id_async_thrid); 4382 4383 state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED); 4384 } 4385 4386 if (progress & IBD_DRV_BCAST_GROUP_JOINED) { 4387 /* 4388 * Drop all residual full/non membership. This includes full 4389 * membership to the broadcast group, and any nonmembership 4390 * acquired during transmits. We do this after the Tx completion 4391 * handlers are done, since those might result in some late 4392 * leaves; this also eliminates a potential race with that 4393 * path wrt the mc full list insert/delete. Trap handling 4394 * has also been suppressed at this point. Thus, no locks 4395 * are required while traversing the mc full list. 
4396 */ 4397 DPRINT(2, "ibd_undo_m_start: clear full cache entries"); 4398 mce = list_head(&state->id_mc_full); 4399 while (mce != NULL) { 4400 mgid = mce->mc_info.mc_adds_vect.av_dgid; 4401 jstate = mce->mc_jstate; 4402 mce = list_next(&state->id_mc_full, mce); 4403 ibd_leave_group(state, mgid, jstate); 4404 } 4405 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED); 4406 } 4407 4408 if (progress & IBD_DRV_RXLIST_ALLOCD) { 4409 ibd_fini_rxlist(state); 4410 state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD); 4411 } 4412 4413 if (progress & IBD_DRV_TXLIST_ALLOCD) { 4414 ibd_fini_txlist(state); 4415 state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD); 4416 } 4417 4418 if (progress & IBD_DRV_UD_CHANNEL_SETUP) { 4419 if ((ret = ibt_free_channel(state->id_chnl_hdl)) != 4420 IBT_SUCCESS) { 4421 DPRINT(10, "undo_m_start: free_channel " 4422 "failed, ret=%d", ret); 4423 } 4424 4425 state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP); 4426 } 4427 4428 if (progress & IBD_DRV_CQS_ALLOCD) { 4429 if (ibd_separate_cqs == 1) { 4430 kmem_free(state->id_txwcs, 4431 sizeof (ibt_wc_t) * state->id_txwcs_size); 4432 if ((ret = ibt_free_cq(state->id_scq_hdl)) != 4433 IBT_SUCCESS) { 4434 DPRINT(10, "undo_m_start: free_cq(scq) " 4435 "failed, ret=%d", ret); 4436 } 4437 } 4438 4439 kmem_free(state->id_rxwcs, 4440 sizeof (ibt_wc_t) * state->id_rxwcs_size); 4441 if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) { 4442 DPRINT(10, "undo_m_start: free_cq(rcq) failed, " 4443 "ret=%d", ret); 4444 } 4445 4446 state->id_txwcs = NULL; 4447 state->id_rxwcs = NULL; 4448 state->id_scq_hdl = NULL; 4449 state->id_rcq_hdl = NULL; 4450 4451 state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD); 4452 } 4453 4454 if (progress & IBD_DRV_ACACHE_INITIALIZED) { 4455 mod_hash_destroy_hash(state->id_ah_active_hash); 4456 ibd_acache_fini(state); 4457 4458 state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED); 4459 } 4460 4461 if (progress & IBD_DRV_BCAST_GROUP_FOUND) { 4462 /* 4463 * If we'd created the ipoib broadcast group and had 4464 * successfully joined it, leave it now 4465 */ 4466 if (state->id_bgroup_created) { 4467 mgid = state->id_mcinfo->mc_adds_vect.av_dgid; 4468 jstate = IB_MC_JSTATE_FULL; 4469 (void) ibt_leave_mcg(state->id_sgid, mgid, 4470 state->id_sgid, jstate); 4471 } 4472 ibt_free_mcg_info(state->id_mcinfo, 1); 4473 4474 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND); 4475 } 4476 4477 return (DDI_SUCCESS); 4478 } 4479 4480 /* 4481 * GLDv3 entry point to start hardware. 
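* Bringup proceeds in stages, each recorded as a flag in id_mac_state
* (port details, broadcast group, acache, CQs, UD channel, tx/rx lists,
* async thread, CQ handlers, subnet notices); if any stage fails,
* ibd_undo_m_start() tears down only the stages completed so far.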
4482 */ 4483 /*ARGSUSED*/ 4484 static int 4485 ibd_m_start(void *arg) 4486 { 4487 ibd_state_t *state = arg; 4488 kthread_t *kht; 4489 int err; 4490 ibt_status_t ret; 4491 4492 if (state->id_mac_state & IBD_DRV_STARTED) 4493 return (DDI_SUCCESS); 4494 4495 /* 4496 * Get port details; if we fail here, very likely the port 4497 * state is inactive or the pkey can't be verified 4498 */ 4499 if (ibd_get_port_details(state) != DDI_SUCCESS) { 4500 DPRINT(10, "ibd_m_start: ibd_get_port_details() failed"); 4501 return (EAGAIN); 4502 } 4503 state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED; 4504 4505 /* 4506 * Find the IPoIB broadcast group 4507 */ 4508 if (ibd_find_bgroup(state) != IBT_SUCCESS) { 4509 DPRINT(10, "ibd_m_start: ibd_find_bgroup() failed"); 4510 err = ENOENT; 4511 goto m_start_fail; 4512 } 4513 state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND; 4514 4515 /* 4516 * Initialize per-interface caches and lists; if we fail here, 4517 * it is most likely due to a lack of resources 4518 */ 4519 if (ibd_acache_init(state) != DDI_SUCCESS) { 4520 DPRINT(10, "ibd_m_start: ibd_acache_init() failed"); 4521 err = ENOMEM; 4522 goto m_start_fail; 4523 } 4524 state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED; 4525 4526 /* 4527 * Allocate send and receive completion queues 4528 */ 4529 if (ibd_alloc_cqs(state) != DDI_SUCCESS) { 4530 DPRINT(10, "ibd_m_start: ibd_alloc_cqs() failed"); 4531 err = ENOMEM; 4532 goto m_start_fail; 4533 } 4534 state->id_mac_state |= IBD_DRV_CQS_ALLOCD; 4535 4536 /* 4537 * Setup a UD channel 4538 */ 4539 if (ibd_setup_ud_channel(state) != DDI_SUCCESS) { 4540 err = ENOMEM; 4541 DPRINT(10, "ibd_m_start: ibd_setup_ud_channel() failed"); 4542 goto m_start_fail; 4543 } 4544 state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP; 4545 4546 /* 4547 * Allocate and initialize the tx buffer list 4548 */ 4549 if (ibd_init_txlist(state) != DDI_SUCCESS) { 4550 DPRINT(10, "ibd_m_start: ibd_init_txlist() failed"); 4551 err = ENOMEM; 4552 goto m_start_fail; 4553 } 4554 state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD; 4555 4556 /* 4557 * If we have separate cqs, create the send cq handler here 4558 */ 4559 if ((ibd_separate_cqs == 1) && (ibd_txcomp_poll == 0)) { 4560 ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state); 4561 if ((ret = ibt_enable_cq_notify(state->id_scq_hdl, 4562 IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { 4563 DPRINT(10, "ibd_m_start: ibt_enable_cq_notify(scq) " 4564 "failed, ret=%d", ret); 4565 err = EINVAL; 4566 goto m_start_fail; 4567 } 4568 state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED; 4569 } 4570 4571 /* 4572 * Allocate and initialize the rx buffer list 4573 */ 4574 if (ibd_init_rxlist(state) != DDI_SUCCESS) { 4575 DPRINT(10, "ibd_m_start: ibd_init_rxlist() failed"); 4576 err = ENOMEM; 4577 goto m_start_fail; 4578 } 4579 state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD; 4580 4581 /* 4582 * Join IPoIB broadcast group 4583 */ 4584 if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) { 4585 DPRINT(10, "ibd_m_start: ibd_join_group() failed"); 4586 err = EINVAL; 4587 goto m_start_fail; 4588 } 4589 state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED; 4590 4591 /* 4592 * Create the async thread; thread_create never fails. 4593 */ 4594 kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0, 4595 TS_RUN, minclsyspri); 4596 state->id_async_thrid = kht->t_did; 4597 state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED; 4598 4599 /* 4600 * When we did mac_register() in ibd_attach(), we didn't register 4601 * the real macaddr and we didn't have the true port mtu. 
Now that 4602 * we're almost ready, set the local mac address and broadcast 4603 * addresses and update gldv3 about the real values of these 4604 * parameters. 4605 */ 4606 ibd_h2n_mac(&state->id_macaddr, state->id_qpnum, 4607 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 4608 ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK, 4609 state->id_mgid.gid_prefix, state->id_mgid.gid_guid); 4610 4611 mac_maxsdu_update(state->id_mh, state->id_mtu - IPOIB_HDRSIZE); 4612 mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr); 4613 4614 /* 4615 * Setup the receive cq handler 4616 */ 4617 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); 4618 if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl, 4619 IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { 4620 DPRINT(10, "ibd_m_start: ibt_enable_cq_notify(rcq) " 4621 "failed, ret=%d", ret); 4622 err = EINVAL; 4623 goto m_start_fail; 4624 } 4625 state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED; 4626 4627 /* 4628 * Setup the subnet notices handler after we've initialized the acache/ 4629 * mcache and started the async thread, both of which are required for 4630 * the trap handler to function properly. 4631 * 4632 * Now that the async thread has been started (and we've already done 4633 * a mac_register() during attach so mac_tx_update() can be called 4634 * if necessary without any problem), we can enable the trap handler 4635 * to queue requests to the async thread. 4636 */ 4637 ibt_register_subnet_notices(state->id_ibt_hdl, 4638 ibd_snet_notices_handler, state); 4639 mutex_enter(&state->id_trap_lock); 4640 state->id_trap_stop = B_FALSE; 4641 mutex_exit(&state->id_trap_lock); 4642 state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED; 4643 4644 /* 4645 * Indicate link status to GLDv3 and higher layers. By default, 4646 * we assume we are in up state (which must have been true at 4647 * least at the time the broadcast mcg's were probed); if there 4648 * were any up/down transitions till the time we come here, the 4649 * async handler will have updated last known state, which we 4650 * use to tell GLDv3. The async handler will not send any 4651 * notifications to GLDv3 till we reach here in the initialization 4652 * sequence. 4653 */ 4654 state->id_mac_state |= IBD_DRV_STARTED; 4655 mac_link_update(state->id_mh, state->id_link_state); 4656 4657 return (DDI_SUCCESS); 4658 4659 m_start_fail: 4660 /* 4661 * If we ran into a problem during ibd_m_start() and ran into 4662 * some other problem during undoing our partial work, we can't 4663 * do anything about it. Ignore any errors we might get from 4664 * ibd_undo_m_start() and just return the original error we got. 4665 */ 4666 (void) ibd_undo_m_start(state); 4667 return (err); 4668 } 4669 4670 /* 4671 * GLDv3 entry point to stop hardware from receiving packets. 4672 */ 4673 /*ARGSUSED*/ 4674 static void 4675 ibd_m_stop(void *arg) 4676 { 4677 ibd_state_t *state = arg; 4678 4679 /* 4680 * Since ibd_m_stop() doesn't expect any return, we cannot 4681 * fail even if we run into some problem with ibd_undo_m_start(). 4682 * The best we can do is to leave it in a good state, so 4683 * perhaps a future unplumb will succeed. 4684 */ 4685 (void) ibd_undo_m_start(state); 4686 } 4687 4688 /* 4689 * GLDv3 entry point to modify device's mac address. We do not 4690 * allow address modifications. 4691 */ 4692 static int 4693 ibd_m_unicst(void *arg, const uint8_t *macaddr) 4694 { 4695 ibd_state_t *state = arg; 4696 4697 /* 4698 * Don't bother even comparing the macaddr if we haven't 4699 * completed ibd_m_start(). 
4700 */
4701 if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
4702 return (0);
4703
4704 if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0)
4705 return (0);
4706 else
4707 return (EINVAL);
4708 }
4709
4710 /*
4711 * The blocking part of the IBA join/leave operations is done out
4712 * of here on the async thread.
4713 */
4714 static void
4715 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op)
4716 {
4717 DPRINT(3, "ibd_async_multicast : async_setmc op %d :"
4718 "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid);
4719
4720 if (op == IBD_ASYNC_JOIN) {
4721 if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) {
4722 ibd_print_warn(state, "Join multicast group failed :"
4723 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
4724 }
4725 } else {
4726 /*
4727 * Here, we must search for the proper mcg_info and
4728 * use that to leave the group.
4729 */
4730 ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL);
4731 }
4732 }
4733
4734 /*
4735 * GLDv3 entry point for multicast enable/disable requests.
4736 * This function queues the operation to the async thread and
4737 * returns success for a valid multicast address.
4738 */
4739 static int
4740 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac)
4741 {
4742 ibd_state_t *state = (ibd_state_t *)arg;
4743 ipoib_mac_t maddr, *mcast;
4744 ib_gid_t mgid;
4745 ibd_req_t *req;
4746
4747 /*
4748 * If we haven't completed ibd_m_start(), the async thread wouldn't
4749 * have been started and id_bcaddr wouldn't be set, so there's
4750 * no point in continuing.
4751 */
4752 if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
4753 return (0);
4754
4755 /*
4756 * The incoming multicast address might not be aligned properly
4757 * on a 4 byte boundary to be considered an ipoib_mac_t. We force
4758 * it to look like one though, to get the offsets of the mc gid,
4759 * since we know we are not going to dereference any values with
4760 * the ipoib_mac_t pointer.
4761 */
4762 bcopy(mcmac, &maddr, sizeof (ipoib_mac_t));
4763 mcast = &maddr;
4764
4765 /*
4766 * Check validity of MCG address. We could additionally check
4767 * that an enable/disable is not being issued on the "broadcast"
4768 * mcg, but since this operation is only invokable by privileged
4769 * programs anyway, we allow the flexibility to those dlpi apps.
4770 * Note that we do not validate the "scope" of the IBA mcg.
4771 */
4772 if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN)
4773 return (EINVAL);
4774
4775 /*
4776 * fill in multicast pkey and scope
4777 */
4778 IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey);
4779
4780 /*
4781 * If someone is trying to JOIN/LEAVE the broadcast group, we do
4782 * nothing (i.e. we stay JOINed to the broadcast group done in
4783 * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically
4784 * requires us to be joined to broadcast groups at all times.
4785 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also
4786 * depends on this.
4787 */ 4788 if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0) 4789 return (0); 4790 4791 ibd_n2h_gid(mcast, &mgid); 4792 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 4793 if (req == NULL) 4794 return (ENOMEM); 4795 4796 req->rq_gid = mgid; 4797 4798 if (add) { 4799 DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n", 4800 mgid.gid_prefix, mgid.gid_guid); 4801 ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN); 4802 } else { 4803 DPRINT(1, "ibd_m_multicst : unset_multicast : " 4804 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); 4805 ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE); 4806 } 4807 return (0); 4808 } 4809 4810 /* 4811 * The blocking part of the IBA promiscuous operations are done 4812 * out of here on the async thread. The dlpireq parameter indicates 4813 * whether this invocation is due to a dlpi request or due to 4814 * a port up/down event. 4815 */ 4816 static void 4817 ibd_async_unsetprom(ibd_state_t *state) 4818 { 4819 ibd_mce_t *mce = list_head(&state->id_mc_non); 4820 ib_gid_t mgid; 4821 4822 DPRINT(2, "ibd_async_unsetprom : async_unset_promisc"); 4823 4824 while (mce != NULL) { 4825 mgid = mce->mc_info.mc_adds_vect.av_dgid; 4826 mce = list_next(&state->id_mc_non, mce); 4827 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 4828 } 4829 state->id_prom_op = IBD_OP_NOTSTARTED; 4830 } 4831 4832 /* 4833 * The blocking part of the IBA promiscuous operations are done 4834 * out of here on the async thread. The dlpireq parameter indicates 4835 * whether this invocation is due to a dlpi request or due to 4836 * a port up/down event. 4837 */ 4838 static void 4839 ibd_async_setprom(ibd_state_t *state) 4840 { 4841 ibt_mcg_attr_t mcg_attr; 4842 ibt_mcg_info_t *mcg_info; 4843 ib_gid_t mgid; 4844 uint_t numg; 4845 int i; 4846 char ret = IBD_OP_COMPLETED; 4847 4848 DPRINT(2, "ibd_async_setprom : async_set_promisc"); 4849 4850 /* 4851 * Obtain all active MC groups on the IB fabric with 4852 * specified criteria (scope + Pkey + Qkey + mtu). 4853 */ 4854 bzero(&mcg_attr, sizeof (mcg_attr)); 4855 mcg_attr.mc_pkey = state->id_pkey; 4856 mcg_attr.mc_scope = state->id_scope; 4857 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 4858 mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu; 4859 mcg_attr.mc_mtu_req.r_selector = IBT_EQU; 4860 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) != 4861 IBT_SUCCESS) { 4862 ibd_print_warn(state, "Could not get list of IBA multicast " 4863 "groups"); 4864 ret = IBD_OP_ERRORED; 4865 goto done; 4866 } 4867 4868 /* 4869 * Iterate over the returned mcg's and join as NonMember 4870 * to the IP mcg's. 4871 */ 4872 for (i = 0; i < numg; i++) { 4873 /* 4874 * Do a NonMember JOIN on the MC group. 4875 */ 4876 mgid = mcg_info[i].mc_adds_vect.av_dgid; 4877 if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL) 4878 ibd_print_warn(state, "IBA promiscuous mode missed " 4879 "multicast gid %016llx:%016llx", 4880 (u_longlong_t)mgid.gid_prefix, 4881 (u_longlong_t)mgid.gid_guid); 4882 } 4883 4884 ibt_free_mcg_info(mcg_info, numg); 4885 DPRINT(4, "ibd_async_setprom : async_set_promisc completes"); 4886 done: 4887 state->id_prom_op = ret; 4888 } 4889 4890 /* 4891 * GLDv3 entry point for multicast promiscuous enable/disable requests. 4892 * GLDv3 assumes phys state receives more packets than multi state, 4893 * which is not true for IPoIB. Thus, treat the multi and phys 4894 * promiscuous states the same way to work with GLDv3's assumption. 
4895 */ 4896 static int 4897 ibd_m_promisc(void *arg, boolean_t on) 4898 { 4899 ibd_state_t *state = (ibd_state_t *)arg; 4900 ibd_req_t *req; 4901 4902 /* 4903 * Async thread wouldn't have been started if we haven't 4904 * passed ibd_m_start() 4905 */ 4906 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 4907 return (0); 4908 4909 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 4910 if (req == NULL) 4911 return (ENOMEM); 4912 if (on) { 4913 DPRINT(1, "ibd_m_promisc : set_promisc : %d", on); 4914 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON); 4915 } else { 4916 DPRINT(1, "ibd_m_promisc : unset_promisc"); 4917 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF); 4918 } 4919 4920 return (0); 4921 } 4922 4923 /* 4924 * GLDv3 entry point for gathering statistics. 4925 */ 4926 static int 4927 ibd_m_stat(void *arg, uint_t stat, uint64_t *val) 4928 { 4929 ibd_state_t *state = (ibd_state_t *)arg; 4930 4931 switch (stat) { 4932 case MAC_STAT_IFSPEED: 4933 *val = state->id_link_speed; 4934 break; 4935 case MAC_STAT_MULTIRCV: 4936 *val = state->id_multi_rcv; 4937 break; 4938 case MAC_STAT_BRDCSTRCV: 4939 *val = state->id_brd_rcv; 4940 break; 4941 case MAC_STAT_MULTIXMT: 4942 *val = state->id_multi_xmt; 4943 break; 4944 case MAC_STAT_BRDCSTXMT: 4945 *val = state->id_brd_xmt; 4946 break; 4947 case MAC_STAT_RBYTES: 4948 *val = state->id_rcv_bytes; 4949 break; 4950 case MAC_STAT_IPACKETS: 4951 *val = state->id_rcv_pkt; 4952 break; 4953 case MAC_STAT_OBYTES: 4954 *val = state->id_xmt_bytes; 4955 break; 4956 case MAC_STAT_OPACKETS: 4957 *val = state->id_xmt_pkt; 4958 break; 4959 case MAC_STAT_OERRORS: 4960 *val = state->id_ah_error; /* failed AH translation */ 4961 break; 4962 case MAC_STAT_IERRORS: 4963 *val = 0; 4964 break; 4965 case MAC_STAT_NOXMTBUF: 4966 *val = state->id_tx_short; 4967 break; 4968 case MAC_STAT_NORCVBUF: 4969 default: 4970 return (ENOTSUP); 4971 } 4972 4973 return (0); 4974 } 4975 4976 static void 4977 ibd_async_txsched(ibd_state_t *state) 4978 { 4979 ibd_req_t *req; 4980 int ret; 4981 4982 if (ibd_txcomp_poll) 4983 ibd_poll_compq(state, state->id_scq_hdl); 4984 4985 ret = ibd_resume_transmission(state); 4986 if (ret && ibd_txcomp_poll) { 4987 if (req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP)) 4988 ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED); 4989 else { 4990 ibd_print_warn(state, "ibd_async_txsched: " 4991 "no memory, can't schedule work slot"); 4992 } 4993 } 4994 } 4995 4996 static int 4997 ibd_resume_transmission(ibd_state_t *state) 4998 { 4999 int flag; 5000 int met_thresh = 0; 5001 int ret = -1; 5002 5003 mutex_enter(&state->id_sched_lock); 5004 if (state->id_sched_needed & IBD_RSRC_SWQE) { 5005 met_thresh = (state->id_tx_list.dl_cnt > 5006 IBD_FREE_SWQES_THRESH); 5007 flag = IBD_RSRC_SWQE; 5008 } else if (state->id_sched_needed & IBD_RSRC_LSOBUF) { 5009 ASSERT(state->id_lso != NULL); 5010 met_thresh = (state->id_lso->bkt_nfree > 5011 IBD_FREE_LSOS_THRESH); 5012 flag = IBD_RSRC_LSOBUF; 5013 } 5014 if (met_thresh) { 5015 state->id_sched_needed &= ~flag; 5016 ret = 0; 5017 } 5018 mutex_exit(&state->id_sched_lock); 5019 5020 if (ret == 0) 5021 mac_tx_update(state->id_mh); 5022 5023 return (ret); 5024 } 5025 5026 /* 5027 * Release the send wqe back into free list. 5028 */ 5029 static void 5030 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *swqe) 5031 { 5032 /* 5033 * Add back on Tx list for reuse. 
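* The swqe is appended at the tail of id_tx_list under dl_mutex, and
* dl_pending_sends is cleared since at least one free swqe is now
* available to ibd_acquire_swqe().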
5034 */ 5035 swqe->swqe_next = NULL; 5036 mutex_enter(&state->id_tx_list.dl_mutex); 5037 if (state->id_tx_list.dl_pending_sends) { 5038 state->id_tx_list.dl_pending_sends = B_FALSE; 5039 } 5040 if (state->id_tx_list.dl_head == NULL) { 5041 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); 5042 } else { 5043 state->id_tx_list.dl_tail->w_next = SWQE_TO_WQE(swqe); 5044 } 5045 state->id_tx_list.dl_tail = SWQE_TO_WQE(swqe); 5046 state->id_tx_list.dl_cnt++; 5047 mutex_exit(&state->id_tx_list.dl_mutex); 5048 } 5049 5050 /* 5051 * Acquire a send wqe from free list. 5052 * Returns error number and send wqe pointer. 5053 */ 5054 static int 5055 ibd_acquire_swqe(ibd_state_t *state, ibd_swqe_t **swqe) 5056 { 5057 int rc = 0; 5058 ibd_swqe_t *wqe; 5059 5060 /* 5061 * Check and reclaim some of the completed Tx requests. 5062 * If someone else is already in this code and pulling Tx 5063 * completions, no need to poll, since the current lock holder 5064 * will do the work anyway. Normally, we poll for completions 5065 * every few Tx attempts, but if we are short on Tx descriptors, 5066 * we always try to poll. 5067 */ 5068 if ((ibd_txcomp_poll == 1) && 5069 (state->id_tx_list.dl_cnt < IBD_TX_POLL_THRESH)) { 5070 ibd_poll_compq(state, state->id_scq_hdl); 5071 } 5072 5073 /* 5074 * Grab required transmit wqes. 5075 */ 5076 mutex_enter(&state->id_tx_list.dl_mutex); 5077 wqe = WQE_TO_SWQE(state->id_tx_list.dl_head); 5078 if (wqe != NULL) { 5079 state->id_tx_list.dl_cnt -= 1; 5080 state->id_tx_list.dl_head = wqe->swqe_next; 5081 if (state->id_tx_list.dl_tail == SWQE_TO_WQE(wqe)) 5082 state->id_tx_list.dl_tail = NULL; 5083 } else { 5084 /* 5085 * If we did not find the number we were looking for, flag 5086 * no resource. Adjust list appropriately in either case. 5087 */ 5088 rc = ENOENT; 5089 state->id_tx_list.dl_pending_sends = B_TRUE; 5090 DPRINT(5, "ibd_acquire_swqe: out of Tx wqe"); 5091 atomic_add_64(&state->id_tx_short, 1); 5092 } 5093 mutex_exit(&state->id_tx_list.dl_mutex); 5094 *swqe = wqe; 5095 5096 return (rc); 5097 } 5098 5099 static int 5100 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss, 5101 ibt_ud_dest_hdl_t ud_dest) 5102 { 5103 mblk_t *nmp; 5104 int iph_len, tcph_len; 5105 ibt_wr_lso_t *lso; 5106 uintptr_t ip_start, tcp_start; 5107 uint8_t *dst; 5108 uint_t pending, mblen; 5109 5110 /* 5111 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default; 5112 * we need to adjust it here for lso. 5113 */ 5114 lso = &(node->w_swr.wr.ud_lso); 5115 lso->lso_ud_dest = ud_dest; 5116 lso->lso_mss = mss; 5117 5118 /* 5119 * Calculate the LSO header size and set it in the UD LSO structure. 5120 * Note that the only assumption we make is that each of the IPoIB, 5121 * IP and TCP headers will be contained in a single mblk fragment; 5122 * together, the headers may span multiple mblk fragments. 
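*
* For illustration (assuming the usual 4-byte IPoIB header): with a
* minimal 20-byte IPv4 header and a minimal 20-byte TCP header,
* lso_hdr_sz below works out to 44 bytes.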
5123 */ 5124 nmp = mp; 5125 ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE; 5126 if (ip_start >= (uintptr_t)(nmp->b_wptr)) { 5127 ip_start = (uintptr_t)nmp->b_cont->b_rptr 5128 + (ip_start - (uintptr_t)(nmp->b_wptr)); 5129 nmp = nmp->b_cont; 5130 5131 } 5132 iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start); 5133 5134 tcp_start = ip_start + iph_len; 5135 if (tcp_start >= (uintptr_t)(nmp->b_wptr)) { 5136 tcp_start = (uintptr_t)nmp->b_cont->b_rptr 5137 + (tcp_start - (uintptr_t)(nmp->b_wptr)); 5138 nmp = nmp->b_cont; 5139 } 5140 tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start); 5141 lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len; 5142 5143 /* 5144 * If the lso header fits entirely within a single mblk fragment, 5145 * we'll avoid an additional copy of the lso header here and just 5146 * pass the b_rptr of the mblk directly. 5147 * 5148 * If this isn't true, we'd have to allocate for it explicitly. 5149 */ 5150 if (lso->lso_hdr_sz <= MBLKL(mp)) { 5151 lso->lso_hdr = mp->b_rptr; 5152 } else { 5153 /* On work completion, remember to free this allocated hdr */ 5154 lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP); 5155 if (lso->lso_hdr == NULL) { 5156 DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, " 5157 "sz = %d", lso->lso_hdr_sz); 5158 lso->lso_hdr_sz = 0; 5159 lso->lso_mss = 0; 5160 return (-1); 5161 } 5162 } 5163 5164 /* 5165 * Copy in the lso header only if we need to 5166 */ 5167 if (lso->lso_hdr != mp->b_rptr) { 5168 dst = lso->lso_hdr; 5169 pending = lso->lso_hdr_sz; 5170 5171 for (nmp = mp; nmp && pending; nmp = nmp->b_cont) { 5172 mblen = MBLKL(nmp); 5173 if (pending > mblen) { 5174 bcopy(nmp->b_rptr, dst, mblen); 5175 dst += mblen; 5176 pending -= mblen; 5177 } else { 5178 bcopy(nmp->b_rptr, dst, pending); 5179 break; 5180 } 5181 } 5182 } 5183 5184 return (0); 5185 } 5186 5187 static void 5188 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp) 5189 { 5190 ibt_wr_lso_t *lso; 5191 5192 if ((!node) || (!mp)) 5193 return; 5194 5195 /* 5196 * Free any header space that we might've allocated if we 5197 * did an LSO 5198 */ 5199 if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) { 5200 lso = &(node->w_swr.wr.ud_lso); 5201 if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) { 5202 kmem_free(lso->lso_hdr, lso->lso_hdr_sz); 5203 lso->lso_hdr = NULL; 5204 lso->lso_hdr_sz = 0; 5205 } 5206 } 5207 } 5208 5209 static void 5210 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node) 5211 { 5212 uint_t i; 5213 uint_t num_posted; 5214 uint_t n_wrs; 5215 ibt_status_t ibt_status; 5216 ibt_send_wr_t wrs[IBD_MAX_POST_MULTIPLE]; 5217 ibd_swqe_t *elem; 5218 ibd_swqe_t *nodes[IBD_MAX_POST_MULTIPLE]; 5219 5220 node->swqe_next = NULL; 5221 5222 mutex_enter(&state->id_txpost_lock); 5223 5224 /* 5225 * Enqueue the new node in chain of wqes to send 5226 */ 5227 if (state->id_tx_head) { 5228 *(state->id_tx_tailp) = (ibd_wqe_t *)node; 5229 } else { 5230 state->id_tx_head = node; 5231 } 5232 state->id_tx_tailp = &(node->swqe_next); 5233 5234 /* 5235 * If someone else is helping out with the sends, 5236 * just go back 5237 */ 5238 if (state->id_tx_busy) { 5239 mutex_exit(&state->id_txpost_lock); 5240 return; 5241 } 5242 5243 /* 5244 * Otherwise, mark the flag to indicate that we'll be 5245 * doing the dispatch of what's there in the wqe chain 5246 */ 5247 state->id_tx_busy = 1; 5248 5249 while (state->id_tx_head) { 5250 /* 5251 * Collect pending requests, IBD_MAX_POST_MULTIPLE wrs 5252 * at a time if possible, and keep posting them. 
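*
* Note that only the thread that set id_tx_busy above performs this
* dispatch; any other sender arriving meanwhile simply chains its wqe
* onto id_tx_head and returns, which keeps ibt_post_send() calls for
* this channel serialized.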
5253 */ 5254 for (n_wrs = 0, elem = state->id_tx_head; 5255 (elem) && (n_wrs < IBD_MAX_POST_MULTIPLE); 5256 elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) { 5257 5258 nodes[n_wrs] = elem; 5259 wrs[n_wrs] = elem->w_swr; 5260 } 5261 state->id_tx_head = elem; 5262 5263 /* 5264 * Release the txpost lock before posting the 5265 * send request to the hca; if the posting fails 5266 * for some reason, we'll never receive completion 5267 * intimation, so we'll need to cleanup. 5268 */ 5269 mutex_exit(&state->id_txpost_lock); 5270 5271 ASSERT(n_wrs != 0); 5272 5273 /* 5274 * If posting fails for some reason, we'll never receive 5275 * completion intimation, so we'll need to cleanup. But 5276 * we need to make sure we don't clean up nodes whose 5277 * wrs have been successfully posted. We assume that the 5278 * hca driver returns on the first failure to post and 5279 * therefore the first 'num_posted' entries don't need 5280 * cleanup here. 5281 */ 5282 num_posted = 0; 5283 ibt_status = ibt_post_send(state->id_chnl_hdl, 5284 wrs, n_wrs, &num_posted); 5285 if (ibt_status != IBT_SUCCESS) { 5286 5287 ibd_print_warn(state, "ibd_post_send: " 5288 "posting multiple wrs failed: " 5289 "requested=%d, done=%d, ret=%d", 5290 n_wrs, num_posted, ibt_status); 5291 5292 for (i = num_posted; i < n_wrs; i++) 5293 ibd_tx_cleanup(state, nodes[i]); 5294 } 5295 5296 /* 5297 * Grab the mutex before we go and check the tx Q again 5298 */ 5299 mutex_enter(&state->id_txpost_lock); 5300 } 5301 5302 state->id_tx_busy = 0; 5303 mutex_exit(&state->id_txpost_lock); 5304 } 5305 5306 static int 5307 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node, 5308 uint_t lsohdr_sz) 5309 { 5310 ibt_wr_ds_t *sgl; 5311 ibt_status_t ibt_status; 5312 mblk_t *nmp; 5313 mblk_t *data_mp; 5314 uchar_t *bufp; 5315 size_t blksize; 5316 size_t skip; 5317 size_t avail; 5318 uint_t pktsize; 5319 uint_t frag_len; 5320 uint_t pending_hdr; 5321 uint_t hiwm; 5322 int nmblks; 5323 int i; 5324 5325 /* 5326 * Let's skip ahead to the data if this is LSO 5327 */ 5328 data_mp = mp; 5329 pending_hdr = 0; 5330 if (lsohdr_sz) { 5331 pending_hdr = lsohdr_sz; 5332 for (nmp = mp; nmp; nmp = nmp->b_cont) { 5333 frag_len = nmp->b_wptr - nmp->b_rptr; 5334 if (frag_len > pending_hdr) 5335 break; 5336 pending_hdr -= frag_len; 5337 } 5338 data_mp = nmp; /* start of data past lso header */ 5339 ASSERT(data_mp != NULL); 5340 } 5341 5342 /* 5343 * Calculate the size of message data and number of msg blocks 5344 */ 5345 pktsize = 0; 5346 for (nmblks = 0, nmp = data_mp; nmp != NULL; 5347 nmp = nmp->b_cont, nmblks++) { 5348 pktsize += MBLKL(nmp); 5349 } 5350 pktsize -= pending_hdr; 5351 5352 /* 5353 * Translating the virtual address regions into physical regions 5354 * for using the Reserved LKey feature results in a wr sgl that 5355 * is a little longer. Since failing ibt_map_mem_iov() is costly, 5356 * we'll fix a high-water mark (65%) for when we should stop. 5357 */ 5358 hiwm = (state->id_max_sqseg * 65) / 100; 5359 5360 /* 5361 * We only do ibt_map_mem_iov() if the pktsize is above the 5362 * "copy-threshold", and if the number of mp fragments is less than 5363 * the maximum acceptable. 
5364 */ 5365 if ((state->id_hca_res_lkey_capab) && 5366 (pktsize > IBD_TX_COPY_THRESH) && 5367 (nmblks < hiwm)) { 5368 ibt_iov_t iov_arr[IBD_MAX_SQSEG]; 5369 ibt_iov_attr_t iov_attr; 5370 5371 iov_attr.iov_as = NULL; 5372 iov_attr.iov = iov_arr; 5373 iov_attr.iov_buf = NULL; 5374 iov_attr.iov_list_len = nmblks; 5375 iov_attr.iov_wr_nds = state->id_max_sqseg; 5376 iov_attr.iov_lso_hdr_sz = lsohdr_sz; 5377 iov_attr.iov_flags = IBT_IOV_SLEEP; 5378 5379 for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) { 5380 iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr; 5381 iov_arr[i].iov_len = MBLKL(nmp); 5382 if (i == 0) { 5383 iov_arr[i].iov_addr += pending_hdr; 5384 iov_arr[i].iov_len -= pending_hdr; 5385 } 5386 } 5387 5388 node->w_buftype = IBD_WQE_MAPPED; 5389 node->w_swr.wr_sgl = node->w_sgl; 5390 5391 ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr, 5392 (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl); 5393 if (ibt_status != IBT_SUCCESS) { 5394 ibd_print_warn(state, "ibd_send: ibt_map_mem_iov " 5395 "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status); 5396 goto ibd_copy_path; 5397 } 5398 5399 return (0); 5400 } 5401 5402 ibd_copy_path: 5403 if (pktsize <= state->id_tx_buf_sz) { 5404 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 5405 node->w_swr.wr_nds = 1; 5406 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 5407 node->w_buftype = IBD_WQE_TXBUF; 5408 5409 /* 5410 * Even though this is the copy path for transfers less than 5411 * id_tx_buf_sz, it could still be an LSO packet. If so, it 5412 * is possible the first data mblk fragment (data_mp) still 5413 * contains part of the LSO header that we need to skip. 5414 */ 5415 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 5416 for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) { 5417 blksize = MBLKL(nmp) - pending_hdr; 5418 bcopy(nmp->b_rptr + pending_hdr, bufp, blksize); 5419 bufp += blksize; 5420 pending_hdr = 0; 5421 } 5422 5423 return (0); 5424 } 5425 5426 /* 5427 * Copy path for transfers greater than id_tx_buf_sz 5428 */ 5429 node->w_swr.wr_sgl = node->w_sgl; 5430 if (ibd_acquire_lsobufs(state, pktsize, 5431 node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) { 5432 DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed"); 5433 return (-1); 5434 } 5435 node->w_buftype = IBD_WQE_LSOBUF; 5436 5437 /* 5438 * Copy the larger-than-id_tx_buf_sz packet into a set of 5439 * fixed-sized, pre-mapped LSO buffers. Note that we might 5440 * need to skip part of the LSO header in the first fragment 5441 * as before. 5442 */ 5443 nmp = data_mp; 5444 skip = pending_hdr; 5445 for (i = 0; i < node->w_swr.wr_nds; i++) { 5446 sgl = node->w_swr.wr_sgl + i; 5447 bufp = (uchar_t *)(uintptr_t)sgl->ds_va; 5448 avail = IBD_LSO_BUFSZ; 5449 while (nmp && avail) { 5450 blksize = MBLKL(nmp) - skip; 5451 if (blksize > avail) { 5452 bcopy(nmp->b_rptr + skip, bufp, avail); 5453 skip += avail; 5454 avail = 0; 5455 } else { 5456 bcopy(nmp->b_rptr + skip, bufp, blksize); 5457 skip = 0; 5458 avail -= blksize; 5459 bufp += blksize; 5460 nmp = nmp->b_cont; 5461 } 5462 } 5463 } 5464 5465 return (0); 5466 } 5467 5468 /* 5469 * Schedule a completion queue polling to reap the resource we're 5470 * short on. If we implement the change to reap tx completions 5471 * in a separate thread, we'll need to wake up that thread here. 
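*
* resource_type is IBD_RSRC_SWQE or IBD_RSRC_LSOBUF; it is recorded in
* id_sched_needed so ibd_resume_transmission() knows which free list to
* check against its threshold before calling mac_tx_update(). When
* q_flag is set, an IBD_ASYNC_SCHED request is also queued to the async
* thread.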
5472 */
5473 static int
5474 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag)
5475 {
5476 ibd_req_t *req;
5477
5478 mutex_enter(&state->id_sched_lock);
5479 state->id_sched_needed |= resource_type;
5480 mutex_exit(&state->id_sched_lock);
5481
5482 /*
5483 * If we are asked to queue a work entry, we need to do it
5484 */
5485 if (q_flag) {
5486 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
5487 if (req == NULL)
5488 return (-1);
5489
5490 ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED);
5491 }
5492
5493 return (0);
5494 }
5495
5496 /*
5497 * The passed in packet has this format:
5498 * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data
5499 */
5500 static boolean_t
5501 ibd_send(ibd_state_t *state, mblk_t *mp)
5502 {
5503 ibd_ace_t *ace;
5504 ibd_swqe_t *node;
5505 ipoib_mac_t *dest;
5506 ib_header_info_t *ipibp;
5507 ip6_t *ip6h;
5508 uint_t pktsize;
5509 uint32_t mss;
5510 uint32_t hckflags;
5511 uint32_t lsoflags = 0;
5512 uint_t lsohdr_sz = 0;
5513 int ret, len;
5514 boolean_t dofree = B_FALSE;
5515 boolean_t rc;
5516
5517 /*
5518 * If we aren't done with the device initialization and start,
5519 * we shouldn't be here.
5520 */
5521 if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
5522 return (B_FALSE);
5523
5524 node = NULL;
5525 if (ibd_acquire_swqe(state, &node) != 0) {
5526 /*
5527 * If we don't have an swqe available, schedule a transmit
5528 * completion queue cleanup and hold off on sending
5529 * more packets until we have some free swqes
5530 */
5531 if (ibd_sched_poll(state, IBD_RSRC_SWQE, ibd_txcomp_poll) == 0)
5532 return (B_FALSE);
5533
5534 /*
5535 * If a poll cannot be scheduled, we have no choice but
5536 * to drop this packet
5537 */
5538 ibd_print_warn(state, "ibd_send: no swqe, pkt drop");
5539 return (B_TRUE);
5540 }
5541
5542 /*
5543 * Initialize the commonly used fields in swqe to NULL to protect
5544 * against ibd_tx_cleanup accidentally misinterpreting these on a
5545 * failure.
5546 */
5547 node->swqe_im_mblk = NULL;
5548 node->w_swr.wr_nds = 0;
5549 node->w_swr.wr_sgl = NULL;
5550 node->w_swr.wr_opcode = IBT_WRC_SEND;
5551
5552 /*
5553 * Obtain an address handle for the destination.
5554 */
5555 ipibp = (ib_header_info_t *)mp->b_rptr;
5556 dest = (ipoib_mac_t *)&ipibp->ib_dst;
5557 if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
5558 IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey);
5559
5560 pktsize = msgsize(mp);
5561
5562 atomic_add_64(&state->id_xmt_bytes, pktsize);
5563 atomic_inc_64(&state->id_xmt_pkt);
5564 if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
5565 atomic_inc_64(&state->id_brd_xmt);
5566 else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
5567 atomic_inc_64(&state->id_multi_xmt);
5568
5569 if ((ace = ibd_acache_lookup(state, dest, &ret, 1)) != NULL) {
5570 node->w_ahandle = ace;
5571 node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
5572 } else {
5573 DPRINT(5,
5574 "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X",
5575 ((ret == EFAULT) ? "failed" : "queued"),
5576 htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]),
5577 htonl(dest->ipoib_gidpref[1]),
5578 htonl(dest->ipoib_gidsuff[0]),
5579 htonl(dest->ipoib_gidsuff[1]));
5580 node->w_ahandle = NULL;
5581
5582 /*
5583 * In poll mode, there are probably cqe's pending in the
5584 * cq, so poll the cq here; otherwise the acache entry
5585 * may never get recycled.
		if (ibd_txcomp_poll == 1)
			ibd_poll_compq(state, state->id_scq_hdl);

		/*
		 * If ibd_acache_lookup() returns EFAULT, it means ibd
		 * cannot find a path for the specific dest address.  We
		 * should get rid of this kind of packet.  We also should
		 * get rid of the packet if we cannot schedule a poll via
		 * the async thread.  For the normal case, ibd will return
		 * the packet to the upper layer and wait for AH creation.
		 *
		 * Note that we always queue a work slot entry for the async
		 * thread when we fail AH lookup (even in intr mode); this is
		 * due to the convoluted way the code currently looks for AH.
		 */
		if (ret == EFAULT) {
			dofree = B_TRUE;
			rc = B_TRUE;
		} else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) {
			dofree = B_TRUE;
			rc = B_TRUE;
		} else {
			dofree = B_FALSE;
			rc = B_FALSE;
		}
		goto ibd_send_fail;
	}

	/*
	 * For ND6 packets, padding is at the front of the source lladdr.
	 * Insert the padding at the front.
	 */
	if (ntohs(ipibp->ipib_rhdr.ipoib_type) == IP6_DL_SAP) {
		if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) {
			if (!pullupmsg(mp, IPV6_HDR_LEN +
			    sizeof (ib_header_info_t))) {
				DPRINT(10, "ibd_send: pullupmsg failure ");
				dofree = B_TRUE;
				rc = B_TRUE;
				goto ibd_send_fail;
			}
			ipibp = (ib_header_info_t *)mp->b_rptr;
		}
		ip6h = (ip6_t *)((uchar_t *)ipibp +
		    sizeof (ib_header_info_t));
		len = ntohs(ip6h->ip6_plen);
		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
			mblk_t *pad;

			pad = allocb(4, 0);
			pad->b_wptr = (uchar_t *)pad->b_rptr + 4;
			linkb(mp, pad);
			if (MBLKL(mp) < sizeof (ib_header_info_t) +
			    IPV6_HDR_LEN + len + 4) {
				if (!pullupmsg(mp, sizeof (ib_header_info_t) +
				    IPV6_HDR_LEN + len + 4)) {
					DPRINT(10, "ibd_send: pullupmsg "
					    "failure ");
					dofree = B_TRUE;
					rc = B_TRUE;
					goto ibd_send_fail;
				}
				ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
				    sizeof (ib_header_info_t));
			}

			/* LINTED: E_CONSTANT_CONDITION */
			IBD_PAD_NSNA(ip6h, len, IBD_SEND);
		}
	}

	mp->b_rptr += sizeof (ib_addrs_t);

	/*
	 * Do the LSO and checksum related work here.  For an LSO send, set
	 * the ud destination, the opcode and the LSO header information in
	 * the work request.
	 */
	lso_info_get(mp, &mss, &lsoflags);
	if ((lsoflags & HW_LSO) != HW_LSO) {
		node->w_swr.wr_opcode = IBT_WRC_SEND;
		lsohdr_sz = 0;
	} else {
		if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) {
			/*
			 * The routine can only fail if there's no memory; we
			 * can only drop the packet if this happens.
			 */
			ibd_print_warn(state,
			    "ibd_send: no memory, lso posting failed");
			dofree = B_TRUE;
			rc = B_TRUE;
			goto ibd_send_fail;
		}

		node->w_swr.wr_opcode = IBT_WRC_SEND_LSO;
		lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz;
	}

	hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &hckflags);
	if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM)
		node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM;
	else
		node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM;

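	/*
	 * Added commentary (not from the original source): the
	 * HCK_FULLCKSUM request retrieved from the mblk above is mapped
	 * onto IBT_WR_SEND_CKSUM, presumably so the HCA offloads the
	 * checksum computation on transmit; the receive-side counterpart
	 * is the IBT_WC_CKSUM_OK/wc_cksum check in ibd_process_rx()
	 * further below.
	 */
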
	/*
	 * Prepare the sgl for posting; the routine can only fail if there's
	 * no lso buf available for posting.  If this is the case, we should
	 * probably reschedule for lso bufs to become available and then
	 * try again.
	 */
	if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) {
		if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) {
			dofree = B_TRUE;
			rc = B_TRUE;
		} else {
			dofree = B_FALSE;
			rc = B_FALSE;
		}
		goto ibd_send_fail;
	}
	node->swqe_im_mblk = mp;

	/*
	 * Queue the wqe to hardware; since we can now simply queue a
	 * post instead of doing it serially, we cannot assume anything
	 * about the 'node' after ibd_post_send() returns.
	 */
	ibd_post_send(state, node);

	return (B_TRUE);

ibd_send_fail:
	if (node && mp)
		ibd_free_lsohdr(node, mp);

	if (dofree)
		freemsg(mp);

	if (node != NULL)
		ibd_tx_cleanup(state, node);

	return (rc);
}

/*
 * GLDv3 entry point for transmitting a datagram.
 */
static mblk_t *
ibd_m_tx(void *arg, mblk_t *mp)
{
	ibd_state_t *state = (ibd_state_t *)arg;
	mblk_t *next;

	if (state->id_link_state != LINK_STATE_UP) {
		freemsgchain(mp);
		mp = NULL;
	}

	while (mp != NULL) {
		next = mp->b_next;
		mp->b_next = NULL;
		if (ibd_send(state, mp) == B_FALSE) {
			/* Send fail */
			mp->b_next = next;
			break;
		}
		mp = next;
	}

	return (mp);
}

/*
 * This handles Tx and Rx completions.  With separate CQs, this handles
 * only Rx completions.
 */
static uint_t
ibd_intr(char *arg)
{
	ibd_state_t *state = (ibd_state_t *)arg;

	ibd_poll_compq(state, state->id_rcq_hdl);

	return (DDI_INTR_CLAIMED);
}

/*
 * Poll and drain the cq
 */
static uint_t
ibd_drain_cq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl, ibt_wc_t *wcs,
    uint_t numwcs)
{
	ibd_wqe_t *wqe;
	ibt_wc_t *wc;
	uint_t total_polled = 0;
	uint_t num_polled;
	int i;

	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
		total_polled += num_polled;
		for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
			ASSERT((wqe->w_type == IBD_WQE_SEND) ||
			    (wqe->w_type == IBD_WQE_RECV));
			if (wc->wc_status != IBT_WC_SUCCESS) {
				/*
				 * Channel being torn down.
				 */
				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
					DPRINT(5, "ibd_drain_cq: flush error");
					/*
					 * Only invoke the Tx handler to
					 * release possibly held resources
					 * like AH refcount etc.  Cannot
					 * invoke the Rx handler because it
					 * might try adding buffers to the
					 * Rx pool when we are trying to
					 * deinitialize.
					 */
					if (wqe->w_type == IBD_WQE_RECV) {
						continue;
					} else {
						DPRINT(10, "ibd_drain_cq: Bad "
						    "status %d", wc->wc_status);
					}
				}
			}
			if (wqe->w_type == IBD_WQE_SEND) {
				ibd_tx_cleanup(state, WQE_TO_SWQE(wqe));
			} else {
				ibd_process_rx(state, WQE_TO_RWQE(wqe), wc);
			}
		}
	}

	return (total_polled);
}

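/*
 * Illustrative note (added commentary, not from the original source):
 * ibd_drain_cq() returns the total number of work completions reaped.
 * Its caller below, ibd_poll_compq(), sizes the wcs array from
 * id_rxwcs/id_txwcs and treats a non-zero count on the receive CQ as the
 * cue to flush the accumulated rx chain up to the mac layer.  A
 * hypothetical call would look like:
 *
 *	num = ibd_drain_cq(state, state->id_rcq_hdl, state->id_rxwcs,
 *	    state->id_rxwcs_size);
 */
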
/*
 * Common code for interrupt handling as well as for polling
 * for all completed wqe's while detaching.
 */
static void
ibd_poll_compq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
{
	ibt_wc_t *wcs;
	uint_t numwcs;
	int flag, redo_flag;
	int redo = 1;
	uint_t num_polled = 0;

	if (ibd_separate_cqs == 1) {
		if (cq_hdl == state->id_rcq_hdl) {
			flag = IBD_RX_CQ_POLLING;
			redo_flag = IBD_REDO_RX_CQ_POLLING;
		} else {
			flag = IBD_TX_CQ_POLLING;
			redo_flag = IBD_REDO_TX_CQ_POLLING;
		}
	} else {
		flag = IBD_RX_CQ_POLLING | IBD_TX_CQ_POLLING;
		redo_flag = IBD_REDO_RX_CQ_POLLING | IBD_REDO_TX_CQ_POLLING;
	}

	mutex_enter(&state->id_cq_poll_lock);
	if (state->id_cq_poll_busy & flag) {
		state->id_cq_poll_busy |= redo_flag;
		mutex_exit(&state->id_cq_poll_lock);
		return;
	}
	state->id_cq_poll_busy |= flag;
	mutex_exit(&state->id_cq_poll_lock);

	/*
	 * In some cases (e.g., detaching), this code can be invoked on
	 * any cpu after disabling cq notification (thus no concurrency
	 * exists).  Apart from that, the following applies normally:
	 * receive completion handling is always on the Rx interrupt
	 * cpu.  Transmit completion handling could be from any cpu if
	 * the Tx CQ is poll driven, but is always on the Tx interrupt
	 * cpu if the Tx CQ is interrupt driven.  Combined completion
	 * handling is always on the interrupt cpu.  Thus, lock
	 * accordingly and use the proper completion array.
	 */
	if (ibd_separate_cqs == 1) {
		if (cq_hdl == state->id_rcq_hdl) {
			wcs = state->id_rxwcs;
			numwcs = state->id_rxwcs_size;
		} else {
			wcs = state->id_txwcs;
			numwcs = state->id_txwcs_size;
		}
	} else {
		wcs = state->id_rxwcs;
		numwcs = state->id_rxwcs_size;
	}

	/*
	 * Poll and drain the CQ
	 */
	num_polled = ibd_drain_cq(state, cq_hdl, wcs, numwcs);

	/*
	 * Enable CQ notifications and redrain the cq to catch any
	 * completions we might have missed after the ibd_drain_cq()
	 * above and before the ibt_enable_cq_notify() that follows.
	 * Finally, service any new requests to poll the cq that
	 * could've come in after the ibt_enable_cq_notify().
	 */
	do {
		if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) !=
		    IBT_SUCCESS) {
			DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
		}

		num_polled += ibd_drain_cq(state, cq_hdl, wcs, numwcs);

		mutex_enter(&state->id_cq_poll_lock);
		if (state->id_cq_poll_busy & redo_flag)
			state->id_cq_poll_busy &= ~redo_flag;
		else {
			state->id_cq_poll_busy &= ~flag;
			redo = 0;
		}
		mutex_exit(&state->id_cq_poll_lock);

	} while (redo);

	/*
	 * If we polled the receive cq and found anything, we need to flush
	 * it out to the nw layer here.
	 */
	if ((flag & IBD_RX_CQ_POLLING) && (num_polled > 0)) {
		ibd_flush_rx(state, NULL);
	}
}

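/*
 * Illustrative note (added commentary, not from the original source):
 * the id_cq_poll_busy flag/redo_flag pair above forms a small hand-off
 * protocol.  Sketch of the race it closes: thread A is draining a CQ
 * with IBD_RX_CQ_POLLING set when thread B is asked to poll the same CQ;
 * B only sets IBD_REDO_RX_CQ_POLLING and returns, while A, seeing the
 * redo bit at the bottom of its loop, clears it and drains once more
 * before finally dropping the busy bit.  This keeps at most one drainer
 * per CQ without losing B's request.
 */
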
/*
 * Unmap the memory area associated with a given swqe.
 */
static void
ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe)
{
	ibt_status_t stat;

	DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds);

	if (swqe->w_mi_hdl) {
		if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl,
		    swqe->w_mi_hdl)) != IBT_SUCCESS) {
			DPRINT(10,
			    "failed in ibt_unmap_mem_iov, ret=%d\n", stat);
		}
		swqe->w_mi_hdl = NULL;
	}
	swqe->w_swr.wr_nds = 0;
}

/*
 * Common code that deals with clean ups after a successful or
 * erroneous transmission attempt.
 */
static void
ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe)
{
	ibd_ace_t *ace = swqe->w_ahandle;

	DPRINT(20, "ibd_tx_cleanup %p\n", swqe);

	/*
	 * If this was a dynamic mapping in ibd_send(), we need to
	 * unmap here.  If this was an lso buffer we'd used for sending,
	 * we need to release the lso buf to the pool, since the resource
	 * is scarce.  However, if this was simply a normal send using
	 * the copybuf (present in each swqe), we don't need to release it.
	 */
	if (swqe->swqe_im_mblk != NULL) {
		if (swqe->w_buftype == IBD_WQE_MAPPED) {
			ibd_unmap_mem(state, swqe);
		} else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
			ibd_release_lsobufs(state,
			    swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
		}
		ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
		freemsg(swqe->swqe_im_mblk);
		swqe->swqe_im_mblk = NULL;
	}

	/*
	 * Drop the reference count on the AH; it can be reused
	 * now for a different destination if there are no more
	 * posted sends that will use it.  This can be eliminated
	 * if we can always associate each Tx buffer with an AH.
	 * The ace can be null if we are cleaning up from the
	 * ibd_send() error path.
	 */
	if (ace != NULL) {
		/*
		 * The recycling logic can be eliminated from here
		 * and put into the async thread if we create another
		 * list to hold ACE's for unjoined mcg's.
		 */
		if (DEC_REF_DO_CYCLE(ace)) {
			ibd_mce_t *mce;

			/*
			 * Check with the lock taken: we decremented the
			 * reference count without the lock, and some
			 * transmitter might already have bumped the
			 * reference count (possible in case of multicast
			 * disable when we leave the AH on the active
			 * list).  If not still 0, get out, leaving the
			 * recycle bit intact.
			 *
			 * Atomically transition the AH from active
			 * to free list, and queue a work request to
			 * leave the group and destroy the mce.  No
			 * transmitter can be looking at the AH or
			 * the MCE in between, since we have the
			 * ac_mutex lock.  In the SendOnly reap case,
			 * it is not necessary to hold the ac_mutex
			 * and recheck the ref count (since the AH was
			 * taken off the active list), we just do it
			 * to have uniform processing with the Full
			 * reap case.
			 */
			mutex_enter(&state->id_ac_mutex);
			mce = ace->ac_mce;
			if (GET_REF_CYCLE(ace) == 0) {
				CLEAR_REFCYCLE(ace);
				/*
				 * Identify the case of fullmember reap as
				 * opposed to mcg trap reap.  Also, port up
				 * might set ac_mce to NULL to indicate Tx
				 * cleanup should do no more than put the
				 * AH in the free list (see ibd_async_link).
				 */
				if (mce != NULL) {
					ace->ac_mce = NULL;
					IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
					/*
					 * mc_req was initialized at mce
					 * creation time.
					 */
					ibd_queue_work_slot(state,
					    &mce->mc_req, IBD_ASYNC_REAP);
				}
				IBD_ACACHE_INSERT_FREE(state, ace);
			}
			mutex_exit(&state->id_ac_mutex);
		}
	}

	/*
	 * Release the send wqe for reuse.
	 */
	ibd_release_swqe(state, swqe);
}

/*
 * Hand off the processed rx mp chain to mac_rx()
 */
static void
ibd_flush_rx(ibd_state_t *state, mblk_t *mpc)
{
	if (mpc == NULL) {
		mutex_enter(&state->id_rx_lock);

		mpc = state->id_rx_mp;

		state->id_rx_mp = NULL;
		state->id_rx_mp_tail = NULL;
		state->id_rx_mp_len = 0;

		mutex_exit(&state->id_rx_lock);
	}

	if (mpc) {
		mac_rx(state->id_mh, state->id_rh, mpc);
	}
}

/*
 * Processing to be done after receipt of a packet; hand off to GLD
 * in the format expected by GLD.  The received packet has this
 * format: 2b sap :: 00 :: data.
 */
static void
ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
{
	ib_header_info_t *phdr;
	mblk_t *mp;
	mblk_t *mpc = NULL;
	ipoib_hdr_t *ipibp;
	ipha_t *iphap;
	ip6_t *ip6h;
	int rxcnt, len;

	/*
	 * Track the number handed to the upper layer, and the number
	 * still available to receive packets.
	 */
	rxcnt = atomic_add_32_nv(&state->id_rx_list.dl_cnt, -1);
	ASSERT(rxcnt >= 0);
	atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, 1);

	/*
	 * Adjust the write pointer depending on how much data came in.
	 */
	mp = rwqe->rwqe_im_mblk;
	mp->b_wptr = mp->b_rptr + wc->wc_bytes_xfer;

	/*
	 * Make sure this is NULL or we're in trouble.
	 */
	if (mp->b_next != NULL) {
		ibd_print_warn(state,
		    "ibd_process_rx: got duplicate mp from rcq?");
		mp->b_next = NULL;
	}

	/*
	 * The IB link will deliver one of the IB link-layer headers,
	 * the Global Routing Header (GRH).  The ibd driver uses the
	 * information in the GRH to build the header_info structure
	 * and pass it with the datagram up to GLDv3.
	 * If the GRH is not valid, indicate this to GLDv3 by setting
	 * the VerTcFlow field to 0.
	 */
	phdr = (ib_header_info_t *)mp->b_rptr;
	if (wc->wc_flags & IBT_WC_GRH_PRESENT) {
		phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn);

		/* if it is a loopback packet, just drop it. */
		if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr,
		    IPOIB_ADDRL) == 0) {
			freemsg(mp);
			return;
		}

		ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src,
		    sizeof (ipoib_mac_t));
		if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) {
			phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN);
			IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst);
		} else {
			phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn;
		}
	} else {
		/*
		 * It cannot be an IBA multicast packet; it must have been
		 * unicast for us.  Just copy the interface address to dst.
		 */
		phdr->ib_grh.ipoib_vertcflow = 0;
		ovbcopy(&state->id_macaddr, &phdr->ib_dst,
		    sizeof (ipoib_mac_t));
	}

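	/*
	 * Added commentary (an assumption, for illustration only): an
	 * IPoIB hardware address (ipoib_mac_t, IPOIB_ADDRL bytes) is
	 * understood here to be a 4-byte flags/QPN word followed by the
	 * 16-byte GID, which is why the source address can be rebuilt
	 * above by copying ipoib_sqpn plus the source GID from the GRH
	 * over ib_src as a single sizeof (ipoib_mac_t) block.
	 */
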
	/*
	 * For ND6 packets, padding is at the front of the source/target
	 * lladdr.  However, the inet6 layer is not aware of it, hence
	 * remove the padding from such packets.
	 */
	ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t));
	if (ntohs(ipibp->ipoib_type) == IP6_DL_SAP) {
		if (MBLKL(mp) < sizeof (ipoib_hdr_t) + IPV6_HDR_LEN) {
			if (!pullupmsg(mp, IPV6_HDR_LEN +
			    sizeof (ipoib_hdr_t))) {
				DPRINT(10, "ibd_process_rx: pullupmsg failed");
				freemsg(mp);
				return;
			}
			ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr +
			    sizeof (ipoib_pgrh_t));
		}
		ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
		len = ntohs(ip6h->ip6_plen);
		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
			if (MBLKL(mp) < sizeof (ipoib_hdr_t) +
			    IPV6_HDR_LEN + len) {
				if (!pullupmsg(mp, sizeof (ipoib_hdr_t) +
				    IPV6_HDR_LEN + len)) {
					DPRINT(10, "ibd_process_rx: pullupmsg"
					    " failed");
					freemsg(mp);
					return;
				}
				ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
				    sizeof (ipoib_pgrh_t) +
				    sizeof (ipoib_hdr_t));
			}
			/* LINTED: E_CONSTANT_CONDITION */
			IBD_PAD_NSNA(ip6h, len, IBD_RECV);
		}
	}

	/*
	 * Update statistics
	 */
	atomic_add_64(&state->id_rcv_bytes, wc->wc_bytes_xfer);
	atomic_inc_64(&state->id_rcv_pkt);
	if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
		atomic_inc_64(&state->id_brd_rcv);
	else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
		atomic_inc_64(&state->id_multi_rcv);

	iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
	/*
	 * Set the receive checksum status in mp.
	 * Hardware checksumming can be considered valid only if:
	 * 1. CQE.IP_OK bit is set
	 * 2. CQE.CKSUM = 0xffff
	 * 3. IPv6 routing header is not present in the packet
	 * 4. There are no IP options in the IP header
	 */

	if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) &&
	    (wc->wc_cksum == 0xFFFF) &&
	    (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) {
		(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
		    HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);
	}

	/*
	 * Add this mp to the list of processed mp's to send to
	 * the nw layer
	 */
	mutex_enter(&state->id_rx_lock);
	if (state->id_rx_mp) {
		ASSERT(state->id_rx_mp_tail != NULL);
		state->id_rx_mp_tail->b_next = mp;
	} else {
		ASSERT(state->id_rx_mp_tail == NULL);
		state->id_rx_mp = mp;
	}

	state->id_rx_mp_tail = mp;
	state->id_rx_mp_len++;

	if (state->id_rx_mp_len >= IBD_MAX_RX_MP_LEN) {
		mpc = state->id_rx_mp;

		state->id_rx_mp = NULL;
		state->id_rx_mp_tail = NULL;
		state->id_rx_mp_len = 0;
	}

	mutex_exit(&state->id_rx_lock);

	if (mpc) {
		ibd_flush_rx(state, mpc);
	}
}

/*
 * Callback code invoked from STREAMS when the receive data buffer is
 * free for recycling.
 */
static void
ibd_freemsg_cb(char *arg)
{
	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
	ibd_state_t *state = rwqe->w_state;

	/*
	 * If the wqe is being destructed, do not attempt recycling.
	 */
	if (rwqe->w_freeing_wqe == B_TRUE) {
		DPRINT(6, "ibd_freemsg: wqe being freed");
		return;
	} else {
		/*
		 * The upper layer has released the held mblk, so we have
		 * no more use for keeping the old pointer in our rwqe.
		 */
		rwqe->rwqe_im_mblk = NULL;
	}

	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
	    state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
	if (rwqe->rwqe_im_mblk == NULL) {
		ibd_delete_rwqe(state, rwqe);
		ibd_free_rwqe(state, rwqe);
		DPRINT(6, "ibd_freemsg: desballoc failed");
		return;
	}

	if (ibd_post_recv(state, rwqe, B_TRUE) == DDI_FAILURE) {
		ibd_delete_rwqe(state, rwqe);
		ibd_free_rwqe(state, rwqe);
		return;
	}

	atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, -1);
}

static uint_t
ibd_tx_recycle(char *arg)
{
	ibd_state_t *state = (ibd_state_t *)arg;

	/*
	 * Poll for completed entries
	 */
	ibd_poll_compq(state, state->id_scq_hdl);

	/*
	 * Resume any blocked transmissions if possible
	 */
	(void) ibd_resume_transmission(state);

	return (DDI_INTR_CLAIMED);
}

#ifdef IBD_LOGGING
static void
ibd_log_init(void)
{
	ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP);
	ibd_lbuf_ndx = 0;

	mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL);
}

static void
ibd_log_fini(void)
{
	if (ibd_lbuf)
		kmem_free(ibd_lbuf, IBD_LOG_SZ);
	ibd_lbuf_ndx = 0;
	ibd_lbuf = NULL;

	mutex_destroy(&ibd_lbuf_lock);
}

static void
ibd_log(const char *fmt, ...)
{
	va_list ap;
	uint32_t off;
	uint32_t msglen;
	char tmpbuf[IBD_DMAX_LINE];

	if (ibd_lbuf == NULL)
		return;

	va_start(ap, fmt);
	msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap);
	va_end(ap);

	if (msglen >= IBD_DMAX_LINE)
		msglen = IBD_DMAX_LINE - 1;

	mutex_enter(&ibd_lbuf_lock);

	off = ibd_lbuf_ndx;		/* current msg should go here */
	if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n'))
		ibd_lbuf[ibd_lbuf_ndx-1] = '\n';

	ibd_lbuf_ndx += msglen;		/* place where next msg should start */
	ibd_lbuf[ibd_lbuf_ndx] = 0;	/* current msg should terminate */

	if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE))
		ibd_lbuf_ndx = 0;

	mutex_exit(&ibd_lbuf_lock);

	bcopy(tmpbuf, ibd_lbuf+off, msglen);	/* no lock needed for this */
}
#endif
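
/*
 * Illustrative note (added commentary, not from the original source):
 * ibd_log() exists only when IBD_LOGGING is defined, and a hypothetical
 * call site would look like
 *
 *	ibd_log("ibd_send: posted swqe %p, pktsize %d", (void *)node,
 *	    pktsize);
 *
 * The index bookkeeping above is done under ibd_lbuf_lock, the message
 * index wraps back to the start of the ibd_log_sz-byte buffer when it
 * nears the end, and the bcopy() of the formatted text itself
 * deliberately happens outside the lock.
 */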