/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * An implementation of the IPoIB standard based on PSARC 2001/289.
 */

#include <sys/types.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/modctl.h>
#include <sys/stropts.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/dlpi.h>
#include <sys/mac_provider.h>

#include <sys/pattr.h>		/* for HCK_FULLCKSUM */
#include <sys/sysmacros.h>	/* for offsetof */
#include <sys/disp.h>		/* for async thread pri */
#include <sys/atomic.h>		/* for atomic_add*() */
#include <sys/ethernet.h>	/* for ETHERTYPE_IPV6 */
#include <netinet/in.h>		/* for netinet/ip.h below */
#include <netinet/ip.h>		/* for struct ip */
#include <netinet/udp.h>	/* for struct udphdr */
#include <inet/common.h>	/* for inet/ip.h below */
#include <inet/ip.h>		/* for ipha_t */
#include <inet/ip6.h>		/* for ip6_t */
#include <inet/tcp.h>		/* for tcph_t */
#include <netinet/icmp6.h>	/* for icmp6_t */
#include <sys/callb.h>
#include <sys/modhash.h>

#include <sys/ib/clients/ibd/ibd.h>
#include <sys/ib/mgt/sm_attr.h>	/* for SM_INIT_TYPE_* */
#include <sys/note.h>
#include <sys/multidata.h>

#include <sys/ib/mgt/ibmf/ibmf.h>	/* for ibd_get_portspeed */

/*
 * Per-interface tunables (for developers)
 *
 * ibd_tx_copy_thresh
 *     This sets the threshold at which ibd will attempt to do a bcopy of the
 *     outgoing data into a pre-mapped buffer. The IPoIB driver's send behavior
 *     is restricted by various parameters, so this value should be changed
 *     only after careful consideration. For instance, IB HCAs currently
 *     impose a relatively small limit (when compared to ethernet NICs) on the
 *     length of the SGL for transmit. On the other hand, the ip stack could
 *     send down mp chains that are quite long when LSO is enabled.
 *
 * ibd_num_swqe
 *     Number of "send WQE" elements that will be allocated and used by ibd.
 *     When tuning this parameter, the size of the pre-allocated, pre-mapped
 *     copy buffer in each of these send wqes must be taken into account. This
 *     copy buffer size is determined by the value of IBD_TX_BUF_SZ (this is
 *     currently set to the same value as ibd_tx_copy_thresh, but may be
 *     changed independently if needed).
 *
 * ibd_num_rwqe
 *     Number of "receive WQE" elements that will be allocated and used by
 *     ibd. This parameter is limited by the maximum channel size of the HCA.
 *     Each buffer in the receive wqe will be of MTU size.
 *
 * ibd_num_lso_bufs
 *     Number of "larger-than-MTU" copy buffers to use for cases when the
 *     outgoing mblk chain is too fragmented to be used with ibt_map_mem_iov()
 *     and too large to be used with regular MTU-sized copy buffers. It is
 *     not recommended to tune this variable without understanding the
 *     application environment and/or memory resources. The size of each of
 *     these lso buffers is determined by the value of IBD_LSO_BUFSZ.
 *
 * ibd_num_ah
 *     Number of AH cache entries to allocate
 *
 * ibd_hash_size
 *     Hash table size for the active AH list
 *
 * ibd_tx_softintr
 * ibd_rx_softintr
 *     The softintr mechanism allows ibd to avoid event queue overflows if
 *     the receive/completion handlers are expected to be expensive. These
 *     are enabled by default.
 *
 * ibd_log_sz
 *     This specifies the size of the ibd log buffer in bytes. The buffer is
 *     allocated and logging is enabled only when IBD_LOGGING is defined.
 *
 */
uint_t ibd_tx_copy_thresh = 0x1000;
uint_t ibd_num_swqe = 4000;
uint_t ibd_num_rwqe = 4000;
uint_t ibd_num_lso_bufs = 0x400;
uint_t ibd_num_ah = 64;
uint_t ibd_hash_size = 32;
uint_t ibd_rx_softintr = 1;
uint_t ibd_tx_softintr = 1;
uint_t ibd_create_broadcast_group = 1;
#ifdef IBD_LOGGING
uint_t ibd_log_sz = 0x20000;
#endif

#define	IBD_TX_COPY_THRESH		ibd_tx_copy_thresh
#define	IBD_TX_BUF_SZ			ibd_tx_copy_thresh
#define	IBD_NUM_SWQE			ibd_num_swqe
#define	IBD_NUM_RWQE			ibd_num_rwqe
#define	IBD_NUM_LSO_BUFS		ibd_num_lso_bufs
#define	IBD_NUM_AH			ibd_num_ah
#define	IBD_HASH_SIZE			ibd_hash_size
#ifdef IBD_LOGGING
#define	IBD_LOG_SZ			ibd_log_sz
#endif

/*
 * Receive CQ moderation parameters: tunable (for developers)
 */
uint_t ibd_rxcomp_count = 4;
uint_t ibd_rxcomp_usec = 10;

/*
 * Send CQ moderation parameters: tunable (for developers)
 */
uint_t ibd_txcomp_count = 16;
uint_t ibd_txcomp_usec = 300;

/*
 * Thresholds
 *
 * When waiting for resources (swqes or lso buffers) to become available,
 * the first two thresholds below determine how many must be free before
 * informing the network layer that it can start sending packets again.
 * IBD_TX_POLL_THRESH determines how low the available swqes should go
 * before we start polling the completion queue.
 */
#define	IBD_FREE_LSOS_THRESH		8
#define	IBD_FREE_SWQES_THRESH		20
#define	IBD_TX_POLL_THRESH		80

/*
 * When doing multiple-send-wr, this value determines how many to do at
 * a time (in a single ibt_post_send).
 */
#define	IBD_MAX_TX_POST_MULTIPLE	4

/* Post IBD_RX_POST_CNT receive work requests at a time. */
#define	IBD_RX_POST_CNT			16

/* Hash into 1 << IBD_LOG_RX_POST number of rx post queues */
#define	IBD_LOG_RX_POST			3

/* Minimum number of receive work requests driver needs to always have */
#define	IBD_RWQE_MIN	((IBD_RX_POST_CNT << IBD_LOG_RX_POST) * 4)

/*
 * Maximum length for returning chained mps back to crossbow.
 * Also used as the maximum number of rx wc's polled at a time.
183 */ 184 #define IBD_MAX_RX_MP_LEN 16 185 186 /* 187 * LSO parameters 188 */ 189 #define IBD_LSO_MAXLEN 65536 190 #define IBD_LSO_BUFSZ 8192 191 #define IBD_PROP_LSO_POLICY "lso-policy" 192 193 /* 194 * Completion queue polling control 195 */ 196 #define IBD_CQ_POLLING 0x1 197 #define IBD_REDO_CQ_POLLING 0x2 198 199 /* 200 * Flag bits for resources to reap 201 */ 202 #define IBD_RSRC_SWQE 0x1 203 #define IBD_RSRC_LSOBUF 0x2 204 205 /* 206 * Async operation types 207 */ 208 #define IBD_ASYNC_GETAH 1 209 #define IBD_ASYNC_JOIN 2 210 #define IBD_ASYNC_LEAVE 3 211 #define IBD_ASYNC_PROMON 4 212 #define IBD_ASYNC_PROMOFF 5 213 #define IBD_ASYNC_REAP 6 214 #define IBD_ASYNC_TRAP 7 215 #define IBD_ASYNC_SCHED 8 216 #define IBD_ASYNC_LINK 9 217 #define IBD_ASYNC_EXIT 10 218 219 /* 220 * Async operation states 221 */ 222 #define IBD_OP_NOTSTARTED 0 223 #define IBD_OP_ONGOING 1 224 #define IBD_OP_COMPLETED 2 225 #define IBD_OP_ERRORED 3 226 #define IBD_OP_ROUTERED 4 227 228 /* 229 * State of IBD driver initialization during attach/m_start 230 */ 231 #define IBD_DRV_STATE_INITIALIZED 0x00001 232 #define IBD_DRV_RXINTR_ADDED 0x00002 233 #define IBD_DRV_TXINTR_ADDED 0x00004 234 #define IBD_DRV_IBTL_ATTACH_DONE 0x00008 235 #define IBD_DRV_HCA_OPENED 0x00010 236 #define IBD_DRV_PD_ALLOCD 0x00020 237 #define IBD_DRV_MAC_REGISTERED 0x00040 238 #define IBD_DRV_PORT_DETAILS_OBTAINED 0x00080 239 #define IBD_DRV_BCAST_GROUP_FOUND 0x00100 240 #define IBD_DRV_ACACHE_INITIALIZED 0x00200 241 #define IBD_DRV_CQS_ALLOCD 0x00400 242 #define IBD_DRV_UD_CHANNEL_SETUP 0x00800 243 #define IBD_DRV_TXLIST_ALLOCD 0x01000 244 #define IBD_DRV_SCQ_NOTIFY_ENABLED 0x02000 245 #define IBD_DRV_RXLIST_ALLOCD 0x04000 246 #define IBD_DRV_BCAST_GROUP_JOINED 0x08000 247 #define IBD_DRV_ASYNC_THR_CREATED 0x10000 248 #define IBD_DRV_RCQ_NOTIFY_ENABLED 0x20000 249 #define IBD_DRV_SM_NOTICES_REGISTERED 0x40000 250 #define IBD_DRV_STARTED 0x80000 251 252 /* 253 * Start/stop in-progress flags; note that restart must always remain 254 * the OR of start and stop flag values. 
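 *
 * A minimal sketch (assumed usage, not a verbatim excerpt) of how a restart
 * path would mark itself busy with these flags, using the
 * ibd_set_mac_progress()/ibd_clr_mac_progress() helpers declared further
 * below; note RESTART is simply START | STOP (0x10000000 | 0x20000000):
 *
 *	ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
 *	(void) ibd_undo_start(state, LINK_STATE_DOWN);
 *	(void) ibd_start(state);
 *	ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);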
255 */ 256 #define IBD_DRV_START_IN_PROGRESS 0x10000000 257 #define IBD_DRV_STOP_IN_PROGRESS 0x20000000 258 #define IBD_DRV_RESTART_IN_PROGRESS 0x30000000 259 260 /* 261 * Miscellaneous constants 262 */ 263 #define IBD_SEND 0 264 #define IBD_RECV 1 265 #define IB_MGID_IPV4_LOWGRP_MASK 0xFFFFFFFF 266 #define IBD_DEF_MAX_SDU 2044 267 #define IBD_DEFAULT_QKEY 0xB1B 268 #ifdef IBD_LOGGING 269 #define IBD_DMAX_LINE 100 270 #endif 271 272 /* 273 * Enumerations for link states 274 */ 275 typedef enum { 276 IBD_LINK_DOWN, 277 IBD_LINK_UP, 278 IBD_LINK_UP_ABSENT 279 } ibd_link_op_t; 280 281 /* 282 * Driver State Pointer 283 */ 284 void *ibd_list; 285 286 /* 287 * Logging 288 */ 289 #ifdef IBD_LOGGING 290 kmutex_t ibd_lbuf_lock; 291 uint8_t *ibd_lbuf; 292 uint32_t ibd_lbuf_ndx; 293 #endif 294 295 /* 296 * Required system entry points 297 */ 298 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd); 299 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd); 300 301 /* 302 * Required driver entry points for GLDv3 303 */ 304 static int ibd_m_stat(void *, uint_t, uint64_t *); 305 static int ibd_m_start(void *); 306 static void ibd_m_stop(void *); 307 static int ibd_m_promisc(void *, boolean_t); 308 static int ibd_m_multicst(void *, boolean_t, const uint8_t *); 309 static int ibd_m_unicst(void *, const uint8_t *); 310 static mblk_t *ibd_m_tx(void *, mblk_t *); 311 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *); 312 313 /* 314 * Private driver entry points for GLDv3 315 */ 316 317 /* 318 * Initialization 319 */ 320 static int ibd_state_init(ibd_state_t *, dev_info_t *); 321 static int ibd_init_txlist(ibd_state_t *); 322 static int ibd_init_rxlist(ibd_state_t *); 323 static int ibd_acache_init(ibd_state_t *); 324 #ifdef IBD_LOGGING 325 static void ibd_log_init(void); 326 #endif 327 328 /* 329 * Termination/cleanup 330 */ 331 static void ibd_state_fini(ibd_state_t *); 332 static void ibd_fini_txlist(ibd_state_t *); 333 static void ibd_fini_rxlist(ibd_state_t *); 334 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *); 335 static void ibd_tx_cleanup_list(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *); 336 static void ibd_acache_fini(ibd_state_t *); 337 #ifdef IBD_LOGGING 338 static void ibd_log_fini(void); 339 #endif 340 341 /* 342 * Allocation/acquire/map routines 343 */ 344 static int ibd_alloc_tx_copybufs(ibd_state_t *); 345 static int ibd_alloc_rx_copybufs(ibd_state_t *); 346 static int ibd_alloc_tx_lsobufs(ibd_state_t *); 347 static ibd_swqe_t *ibd_acquire_swqe(ibd_state_t *); 348 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *, 349 uint32_t *); 350 351 /* 352 * Free/release/unmap routines 353 */ 354 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *); 355 static void ibd_free_tx_copybufs(ibd_state_t *); 356 static void ibd_free_rx_copybufs(ibd_state_t *); 357 static void ibd_free_tx_lsobufs(ibd_state_t *); 358 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *, int); 359 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t); 360 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *); 361 static void ibd_unmap_mem(ibd_state_t *, ibd_swqe_t *); 362 363 /* 364 * Handlers/callback routines 365 */ 366 static uint_t ibd_intr(caddr_t); 367 static uint_t ibd_tx_recycle(caddr_t); 368 static void ibd_rcq_handler(ibt_cq_hdl_t, void *); 369 static void ibd_scq_handler(ibt_cq_hdl_t, void *); 370 static void ibd_poll_rcq(ibd_state_t *, ibt_cq_hdl_t); 371 static void ibd_poll_scq(ibd_state_t *, ibt_cq_hdl_t); 372 static void 
ibd_drain_rcq(ibd_state_t *, ibt_cq_hdl_t); 373 static void ibd_drain_scq(ibd_state_t *, ibt_cq_hdl_t); 374 static void ibd_freemsg_cb(char *); 375 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t, 376 ibt_async_event_t *); 377 static void ibd_snet_notices_handler(void *, ib_gid_t, 378 ibt_subnet_event_code_t, ibt_subnet_event_t *); 379 380 /* 381 * Send/receive routines 382 */ 383 static boolean_t ibd_send(ibd_state_t *, mblk_t *); 384 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *); 385 static void ibd_post_recv(ibd_state_t *, ibd_rwqe_t *); 386 static mblk_t *ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *); 387 388 /* 389 * Threads 390 */ 391 static void ibd_async_work(ibd_state_t *); 392 393 /* 394 * Async tasks 395 */ 396 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *); 397 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int); 398 static void ibd_async_setprom(ibd_state_t *); 399 static void ibd_async_unsetprom(ibd_state_t *); 400 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t); 401 static void ibd_async_trap(ibd_state_t *, ibd_req_t *); 402 static void ibd_async_txsched(ibd_state_t *); 403 static void ibd_async_link(ibd_state_t *, ibd_req_t *); 404 405 /* 406 * Async task helpers 407 */ 408 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *); 409 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t); 410 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *); 411 static boolean_t ibd_get_allroutergroup(ibd_state_t *, 412 ipoib_mac_t *, ipoib_mac_t *); 413 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t); 414 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *); 415 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *); 416 static ibt_status_t ibd_find_bgroup(ibd_state_t *); 417 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *); 418 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t); 419 static uint64_t ibd_get_portspeed(ibd_state_t *); 420 static boolean_t ibd_async_safe(ibd_state_t *); 421 static void ibd_async_done(ibd_state_t *); 422 static ibd_ace_t *ibd_acache_find(ibd_state_t *, ipoib_mac_t *, boolean_t, int); 423 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int); 424 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *); 425 static boolean_t ibd_acache_recycle(ibd_state_t *, ipoib_mac_t *, boolean_t); 426 static void ibd_dec_ref_ace(ibd_state_t *, ibd_ace_t *); 427 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t); 428 static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *); 429 430 /* 431 * Helpers for attach/start routines 432 */ 433 static int ibd_register_mac(ibd_state_t *, dev_info_t *); 434 static int ibd_record_capab(ibd_state_t *, dev_info_t *); 435 static int ibd_unattach(ibd_state_t *, dev_info_t *); 436 static int ibd_get_port_details(ibd_state_t *); 437 static int ibd_alloc_cqs(ibd_state_t *); 438 static int ibd_setup_ud_channel(ibd_state_t *); 439 static int ibd_start(ibd_state_t *); 440 static int ibd_undo_start(ibd_state_t *, link_state_t); 441 static void ibd_set_mac_progress(ibd_state_t *, uint_t); 442 static void ibd_clr_mac_progress(ibd_state_t *, uint_t); 443 444 445 /* 446 * Miscellaneous helpers 447 */ 448 static int ibd_sched_poll(ibd_state_t *, int, int); 449 static void ibd_queue_work_slot(ibd_state_t *, ibd_req_t *, int); 450 static void ibd_resume_transmission(ibd_state_t *); 451 static int 
ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t); 452 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t); 453 static void *list_get_head(list_t *); 454 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t); 455 static uint_t ibd_hash_by_id(void *, mod_hash_key_t); 456 static void ibd_print_warn(ibd_state_t *, char *, ...); 457 #ifdef IBD_LOGGING 458 static void ibd_log(const char *, ...); 459 #endif 460 461 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach, 462 nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed); 463 464 /* Module Driver Info */ 465 static struct modldrv ibd_modldrv = { 466 &mod_driverops, /* This one is a driver */ 467 "InfiniBand GLDv3 Driver", /* short description */ 468 &ibd_dev_ops /* driver specific ops */ 469 }; 470 471 /* Module Linkage */ 472 static struct modlinkage ibd_modlinkage = { 473 MODREV_1, (void *)&ibd_modldrv, NULL 474 }; 475 476 /* 477 * Module (static) info passed to IBTL during ibt_attach 478 */ 479 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = { 480 IBTI_V_CURR, 481 IBT_NETWORK, 482 ibd_async_handler, 483 NULL, 484 "IPIB" 485 }; 486 487 /* 488 * GLDv3 entry points 489 */ 490 #define IBD_M_CALLBACK_FLAGS (MC_GETCAPAB) 491 static mac_callbacks_t ibd_m_callbacks = { 492 IBD_M_CALLBACK_FLAGS, 493 ibd_m_stat, 494 ibd_m_start, 495 ibd_m_stop, 496 ibd_m_promisc, 497 ibd_m_multicst, 498 ibd_m_unicst, 499 ibd_m_tx, 500 NULL, 501 ibd_m_getcapab 502 }; 503 504 /* 505 * Fill/clear <scope> and <p_key> in multicast/broadcast address 506 */ 507 #define IBD_FILL_SCOPE_PKEY(maddr, scope, pkey) \ 508 { \ 509 *(uint32_t *)((char *)(maddr) + 4) |= \ 510 htonl((uint32_t)(scope) << 16); \ 511 *(uint32_t *)((char *)(maddr) + 8) |= \ 512 htonl((uint32_t)(pkey) << 16); \ 513 } 514 515 #define IBD_CLEAR_SCOPE_PKEY(maddr) \ 516 { \ 517 *(uint32_t *)((char *)(maddr) + 4) &= \ 518 htonl(~((uint32_t)0xF << 16)); \ 519 *(uint32_t *)((char *)(maddr) + 8) &= \ 520 htonl(~((uint32_t)0xFFFF << 16)); \ 521 } 522 523 /* 524 * Rudimentary debugging support 525 */ 526 #ifdef DEBUG 527 int ibd_debuglevel = 100; 528 static void 529 debug_print(int l, char *fmt, ...) 530 { 531 va_list ap; 532 533 if (l < ibd_debuglevel) 534 return; 535 va_start(ap, fmt); 536 vcmn_err(CE_CONT, fmt, ap); 537 va_end(ap); 538 } 539 #define DPRINT debug_print 540 #else 541 #define DPRINT 0 && 542 #endif 543 544 /* 545 * Common routine to print warning messages; adds in hca guid, port number 546 * and pkey to be able to identify the IBA interface. 547 */ 548 static void 549 ibd_print_warn(ibd_state_t *state, char *fmt, ...) 550 { 551 ib_guid_t hca_guid; 552 char ibd_print_buf[256]; 553 int len; 554 va_list ap; 555 556 hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip, 557 0, "hca-guid", 0); 558 len = snprintf(ibd_print_buf, sizeof (ibd_print_buf), 559 "%s%d: HCA GUID %016llx port %d PKEY %02x ", 560 ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip), 561 (u_longlong_t)hca_guid, state->id_port, state->id_pkey); 562 va_start(ap, fmt); 563 (void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len, 564 fmt, ap); 565 cmn_err(CE_NOTE, "!%s", ibd_print_buf); 566 va_end(ap); 567 } 568 569 /* 570 * Warlock directives 571 */ 572 573 /* 574 * id_lso_lock 575 * 576 * state->id_lso->bkt_nfree may be accessed without a lock to 577 * determine the threshold at which we have to ask the nw layer 578 * to resume transmission (see ibd_resume_transmission()). 
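 *
 * Illustrative sketch only (the real check lives in
 * ibd_resume_transmission()): the unlocked read of bkt_nfree is compared
 * against the wakeup threshold, roughly
 *
 *	if (state->id_lso->bkt_nfree >= IBD_FREE_LSOS_THRESH)
 *		mac_tx_update(state->id_mh);
 *
 * where id_mh is assumed here to be the mac handle kept in ibd_state_t.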
579 */ 580 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock, 581 ibd_state_t::id_lso)) 582 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso)) 583 _NOTE(SCHEME_PROTECTS_DATA("init", ibd_state_t::id_lso_policy)) 584 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree)) 585 586 /* 587 * id_scq_poll_lock 588 */ 589 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_scq_poll_lock, 590 ibd_state_t::id_scq_poll_busy)) 591 592 /* 593 * id_txpost_lock 594 */ 595 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 596 ibd_state_t::id_tx_head)) 597 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock, 598 ibd_state_t::id_tx_busy)) 599 600 /* 601 * id_acache_req_lock 602 */ 603 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 604 ibd_state_t::id_acache_req_cv)) 605 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock, 606 ibd_state_t::id_req_list)) 607 _NOTE(SCHEME_PROTECTS_DATA("atomic", 608 ibd_acache_s::ac_ref)) 609 610 /* 611 * id_ac_mutex 612 * 613 * This mutex is actually supposed to protect id_ah_op as well, 614 * but this path of the code isn't clean (see update of id_ah_op 615 * in ibd_async_acache(), immediately after the call to 616 * ibd_async_mcache()). For now, we'll skip this check by 617 * declaring that id_ah_op is protected by some internal scheme 618 * that warlock isn't aware of. 619 */ 620 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 621 ibd_state_t::id_ah_active)) 622 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 623 ibd_state_t::id_ah_free)) 624 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 625 ibd_state_t::id_ah_addr)) 626 _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this", 627 ibd_state_t::id_ah_op)) 628 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 629 ibd_state_t::id_ah_error)) 630 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex, 631 ibd_state_t::id_ac_hot_ace)) 632 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error)) 633 634 /* 635 * id_mc_mutex 636 */ 637 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 638 ibd_state_t::id_mc_full)) 639 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex, 640 ibd_state_t::id_mc_non)) 641 642 /* 643 * id_trap_lock 644 */ 645 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 646 ibd_state_t::id_trap_cv)) 647 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 648 ibd_state_t::id_trap_stop)) 649 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock, 650 ibd_state_t::id_trap_inprog)) 651 652 /* 653 * id_prom_op 654 */ 655 _NOTE(SCHEME_PROTECTS_DATA("only by async thread", 656 ibd_state_t::id_prom_op)) 657 658 /* 659 * id_sched_lock 660 */ 661 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock, 662 ibd_state_t::id_sched_needed)) 663 664 /* 665 * id_link_mutex 666 */ 667 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex, 668 ibd_state_t::id_link_state)) 669 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state)) 670 _NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start", 671 ibd_state_t::id_link_speed)) 672 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_sgid)) 673 674 /* 675 * id_tx_list.dl_mutex 676 */ 677 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 678 ibd_state_t::id_tx_list.dl_head)) 679 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 680 ibd_state_t::id_tx_list.dl_pending_sends)) 681 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex, 682 ibd_state_t::id_tx_list.dl_cnt)) 683 684 /* 685 * id_rx_list.dl_mutex 686 */ 687 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 688 
ibd_state_t::id_rx_list.dl_bufs_outstanding)) 689 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr", 690 ibd_state_t::id_rx_list.dl_cnt)) 691 692 693 /* 694 * Items protected by atomic updates 695 */ 696 _NOTE(SCHEME_PROTECTS_DATA("atomic update only", 697 ibd_state_s::id_brd_rcv 698 ibd_state_s::id_brd_xmt 699 ibd_state_s::id_multi_rcv 700 ibd_state_s::id_multi_xmt 701 ibd_state_s::id_num_intrs 702 ibd_state_s::id_rcv_bytes 703 ibd_state_s::id_rcv_pkt 704 ibd_state_s::id_tx_short 705 ibd_state_s::id_xmt_bytes 706 ibd_state_s::id_xmt_pkt)) 707 708 /* 709 * Non-mutex protection schemes for data elements. Almost all of 710 * these are non-shared items. 711 */ 712 _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded", 713 callb_cpr 714 ib_gid_s 715 ib_header_info 716 ibd_acache_rq 717 ibd_acache_s::ac_mce 718 ibd_mcache::mc_fullreap 719 ibd_mcache::mc_jstate 720 ibd_mcache::mc_req 721 ibd_rwqe_s 722 ibd_swqe_s 723 ibd_wqe_s 724 ibt_wr_ds_s::ds_va 725 ibt_wr_lso_s 726 ipoib_mac::ipoib_qpn 727 mac_capab_lso_s 728 msgb::b_next 729 msgb::b_rptr 730 msgb::b_wptr 731 ibd_state_s::id_bgroup_created 732 ibd_state_s::id_mac_state 733 ibd_state_s::id_mtu 734 ibd_state_s::id_num_rwqe 735 ibd_state_s::id_num_swqe 736 ibd_state_s::id_qpnum 737 ibd_state_s::id_rcq_hdl 738 ibd_state_s::id_rx_buf_sz 739 ibd_state_s::id_rx_bufs 740 ibd_state_s::id_rx_mr_hdl 741 ibd_state_s::id_rx_wqes 742 ibd_state_s::id_rxwcs 743 ibd_state_s::id_rxwcs_size 744 ibd_state_s::id_rx_nqueues 745 ibd_state_s::id_rx_queues 746 ibd_state_s::id_scope 747 ibd_state_s::id_scq_hdl 748 ibd_state_s::id_tx_buf_sz 749 ibd_state_s::id_tx_bufs 750 ibd_state_s::id_tx_mr_hdl 751 ibd_state_s::id_tx_rel_list.dl_cnt 752 ibd_state_s::id_tx_wqes 753 ibd_state_s::id_txwcs 754 ibd_state_s::id_txwcs_size)) 755 756 int 757 _init() 758 { 759 int status; 760 761 status = ddi_soft_state_init(&ibd_list, max(sizeof (ibd_state_t), 762 PAGESIZE), 0); 763 if (status != 0) { 764 DPRINT(10, "_init:failed in ddi_soft_state_init()"); 765 return (status); 766 } 767 768 mac_init_ops(&ibd_dev_ops, "ibd"); 769 status = mod_install(&ibd_modlinkage); 770 if (status != 0) { 771 DPRINT(10, "_init:failed in mod_install()"); 772 ddi_soft_state_fini(&ibd_list); 773 mac_fini_ops(&ibd_dev_ops); 774 return (status); 775 } 776 777 #ifdef IBD_LOGGING 778 ibd_log_init(); 779 #endif 780 return (0); 781 } 782 783 int 784 _info(struct modinfo *modinfop) 785 { 786 return (mod_info(&ibd_modlinkage, modinfop)); 787 } 788 789 int 790 _fini() 791 { 792 int status; 793 794 status = mod_remove(&ibd_modlinkage); 795 if (status != 0) 796 return (status); 797 798 mac_fini_ops(&ibd_dev_ops); 799 ddi_soft_state_fini(&ibd_list); 800 #ifdef IBD_LOGGING 801 ibd_log_fini(); 802 #endif 803 return (0); 804 } 805 806 /* 807 * Convert the GID part of the mac address from network byte order 808 * to host order. 809 */ 810 static void 811 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid) 812 { 813 ib_sn_prefix_t nbopref; 814 ib_guid_t nboguid; 815 816 bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t)); 817 bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t)); 818 dgid->gid_prefix = b2h64(nbopref); 819 dgid->gid_guid = b2h64(nboguid); 820 } 821 822 /* 823 * Create the IPoIB address in network byte order from host order inputs. 
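 *
 * For illustration: the 20-byte IPoIB address is the 4-byte QPN followed
 * by the 16-byte GID (subnet prefix + GUID), so a caller builds one
 * roughly as
 *
 *	ipoib_mac_t mac;
 *	ibd_h2n_mac(&mac, qpn, gid.gid_prefix, gid.gid_guid);
 *
 * (sketch only; see ibd_get_allroutergroup() below for an in-tree use).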
 */
static void
ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix,
    ib_guid_t guid)
{
	ib_sn_prefix_t nbopref;
	ib_guid_t nboguid;

	mac->ipoib_qpn = htonl(qpn);
	nbopref = h2b64(prefix);
	nboguid = h2b64(guid);
	bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t));
	bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t));
}

/*
 * Send to the appropriate all-routers group when the IBA multicast group
 * does not exist, based on whether the target group is v4 or v6.
 */
static boolean_t
ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac,
    ipoib_mac_t *rmac)
{
	boolean_t retval = B_TRUE;
	uint32_t adjscope = state->id_scope << 16;
	uint32_t topword;

	/*
	 * Copy the first 4 bytes in without assuming any alignment of
	 * input mac address; this will have IPoIB signature, flags and
	 * scope bits.
	 */
	bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t));
	topword = ntohl(topword);

	/*
	 * Generate proper address for IPv4/v6, adding in the Pkey properly.
	 */
	if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) ||
	    (topword == (IB_MCGID_IPV6_PREFIX | adjscope)))
		ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) |
		    ((uint32_t)(state->id_pkey << 16))),
		    (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP));
	else
		/*
		 * Does not have proper bits in the mgid address.
		 */
		retval = B_FALSE;

	return (retval);
}

/*
 * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at
 * the front of the optional src/tgt link layer address. Solaris currently
 * inserts the padding at the end by default; the routine doing this is
 * nce_xmit() in ip_ndp.c, which copies nd_lla_addr immediately after the
 * nd_opt_hdr_t. So when the packet comes down from the IP layer to the IBD
 * driver, it is in the following format:
 * [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T], where the 2-byte
 * option header is followed by [22 bytes of ipoib_machdr]. As a result the
 * machdr is not 4 byte aligned and has 2 bytes of padding at the end.
 *
 * The send routine in the IBD driver changes this packet as follows:
 * [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding]
 * followed by [22 bytes of ipoib_machdr], so that the machdr becomes
 * 4 byte aligned.
 *
 * On the receiving side, ibd_process_rx takes the above packet, removes
 * the two bytes of front padding and inserts them at the end, since the
 * IP layer does not understand padding at the front.
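 *
 * A byte-level sketch of what IBD_PAD_NSNA() below does to the 24-byte
 * link-layer address option (2-byte option header + 20-byte IPoIB address
 * + 2 bytes of padding):
 *
 *	from IP (send):	[type][len][20-byte ipoib addr][pad][pad]
 *	on the wire:	[type][len][pad][pad][20-byte ipoib addr]
 *
 * and the receive path performs the inverse shuffle.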
894 */ 895 #define IBD_PAD_NSNA(ip6h, len, type) { \ 896 uchar_t *nd_lla_ptr; \ 897 icmp6_t *icmp6; \ 898 nd_opt_hdr_t *opt; \ 899 int i; \ 900 \ 901 icmp6 = (icmp6_t *)&ip6h[1]; \ 902 len -= sizeof (nd_neighbor_advert_t); \ 903 if (((icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT) || \ 904 (icmp6->icmp6_type == ND_NEIGHBOR_ADVERT)) && \ 905 (len != 0)) { \ 906 opt = (nd_opt_hdr_t *)((uint8_t *)ip6h \ 907 + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t)); \ 908 ASSERT(opt != NULL); \ 909 nd_lla_ptr = (uchar_t *)&opt[1]; \ 910 if (type == IBD_SEND) { \ 911 for (i = IPOIB_ADDRL; i > 0; i--) \ 912 *(nd_lla_ptr + i + 1) = \ 913 *(nd_lla_ptr + i - 1); \ 914 } else { \ 915 for (i = 0; i < IPOIB_ADDRL; i++) \ 916 *(nd_lla_ptr + i) = \ 917 *(nd_lla_ptr + i + 2); \ 918 } \ 919 *(nd_lla_ptr + i) = 0; \ 920 *(nd_lla_ptr + i + 1) = 0; \ 921 } \ 922 } 923 924 /* 925 * Address handle entries maintained by the driver are kept in the 926 * free and active lists. Each entry starts out in the free list; 927 * it migrates to the active list when primed using ibt_get_paths() 928 * and ibt_modify_ud_dest() for transmission to a specific destination. 929 * In the active list, the entry has a reference count indicating the 930 * number of ongoing/uncompleted transmits that reference it. The 931 * entry is left in the active list even after the reference count 932 * goes to 0, since successive transmits can find it there and do 933 * not need to set up another entry (ie the path information is 934 * cached using the active list). Entries on the active list are 935 * also hashed using the destination link address as a key for faster 936 * lookups during transmits. 937 * 938 * For any destination address (unicast or multicast, whatever the 939 * join states), there will be at most one entry in the active list. 940 * Entries with a 0 reference count on the active list can be reused 941 * for a transmit to a new destination, if the free list is empty. 942 * 943 * The AH free list insertion/deletion is protected with the id_ac_mutex, 944 * since the async thread and Tx callback handlers insert/delete. The 945 * active list does not need a lock (all operations are done by the 946 * async thread) but updates to the reference count are atomically 947 * done (increments done by Tx path, decrements by the Tx callback handler). 948 */ 949 #define IBD_ACACHE_INSERT_FREE(state, ce) \ 950 list_insert_head(&state->id_ah_free, ce) 951 #define IBD_ACACHE_GET_FREE(state) \ 952 list_get_head(&state->id_ah_free) 953 #define IBD_ACACHE_INSERT_ACTIVE(state, ce) { \ 954 int _ret_; \ 955 list_insert_head(&state->id_ah_active, ce); \ 956 _ret_ = mod_hash_insert(state->id_ah_active_hash, \ 957 (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce); \ 958 ASSERT(_ret_ == 0); \ 959 state->id_ac_hot_ace = ce; \ 960 } 961 #define IBD_ACACHE_PULLOUT_ACTIVE(state, ce) { \ 962 list_remove(&state->id_ah_active, ce); \ 963 if (state->id_ac_hot_ace == ce) \ 964 state->id_ac_hot_ace = NULL; \ 965 (void) mod_hash_remove(state->id_ah_active_hash, \ 966 (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce); \ 967 } 968 #define IBD_ACACHE_GET_ACTIVE(state) \ 969 list_get_head(&state->id_ah_active) 970 971 /* 972 * Membership states for different mcg's are tracked by two lists: 973 * the "non" list is used for promiscuous mode, when all mcg traffic 974 * needs to be inspected. This type of membership is never used for 975 * transmission, so there can not be an AH in the active list 976 * corresponding to a member in this list. 
This list does not need 977 * any protection, since all operations are performed by the async 978 * thread. 979 * 980 * "Full" and "SendOnly" membership is tracked using a single list, 981 * the "full" list. This is because this single list can then be 982 * searched during transmit to a multicast group (if an AH for the 983 * mcg is not found in the active list), since at least one type 984 * of membership must be present before initiating the transmit. 985 * This list is also emptied during driver detach, since sendonly 986 * membership acquired during transmit is dropped at detach time 987 * along with ipv4 broadcast full membership. Insert/deletes to 988 * this list are done only by the async thread, but it is also 989 * searched in program context (see multicast disable case), thus 990 * the id_mc_mutex protects the list. The driver detach path also 991 * deconstructs the "full" list, but it ensures that the async 992 * thread will not be accessing the list (by blocking out mcg 993 * trap handling and making sure no more Tx reaping will happen). 994 * 995 * Currently, an IBA attach is done in the SendOnly case too, 996 * although this is not required. 997 */ 998 #define IBD_MCACHE_INSERT_FULL(state, mce) \ 999 list_insert_head(&state->id_mc_full, mce) 1000 #define IBD_MCACHE_INSERT_NON(state, mce) \ 1001 list_insert_head(&state->id_mc_non, mce) 1002 #define IBD_MCACHE_FIND_FULL(state, mgid) \ 1003 ibd_mcache_find(mgid, &state->id_mc_full) 1004 #define IBD_MCACHE_FIND_NON(state, mgid) \ 1005 ibd_mcache_find(mgid, &state->id_mc_non) 1006 #define IBD_MCACHE_PULLOUT_FULL(state, mce) \ 1007 list_remove(&state->id_mc_full, mce) 1008 #define IBD_MCACHE_PULLOUT_NON(state, mce) \ 1009 list_remove(&state->id_mc_non, mce) 1010 1011 /* 1012 * AH and MCE active list manipulation: 1013 * 1014 * Multicast disable requests and MCG delete traps are two cases 1015 * where the active AH entry for the mcg (if any unreferenced one exists) 1016 * will be moved to the free list (to force the next Tx to the mcg to 1017 * join the MCG in SendOnly mode). Port up handling will also move AHs 1018 * from active to free list. 1019 * 1020 * In the case when some transmits are still pending on an entry 1021 * for an mcg, but a multicast disable has already been issued on the 1022 * mcg, there are some options to consider to preserve the join state 1023 * to ensure the emitted packet is properly routed on the IBA fabric. 1024 * For the AH, we can 1025 * 1. take out of active list at multicast disable time. 1026 * 2. take out of active list only when last pending Tx completes. 1027 * For the MCE, we can 1028 * 3. take out of active list at multicast disable time. 1029 * 4. take out of active list only when last pending Tx completes. 1030 * 5. move from active list to stale list at multicast disable time. 1031 * We choose to use 2,4. We use option 4 so that if a multicast enable 1032 * is tried before the pending Tx completes, the enable code finds the 1033 * mce in the active list and just has to make sure it will not be reaped 1034 * (ie the mcg leave done) when the pending Tx does complete. Alternatively, 1035 * a stale list (#5) that would be checked in the enable code would need 1036 * to be implemented. 
Option 2 is used, because otherwise, a Tx attempt 1037 * after the multicast disable would try to put an AH in the active list, 1038 * and associate the mce it finds in the active list to this new AH, 1039 * whereas the mce is already associated with the previous AH (taken off 1040 * the active list), and will be removed once the pending Tx's complete 1041 * (unless a reference count on mce's is implemented). One implication of 1042 * using 2,4 is that new Tx's posted before the pending Tx's complete will 1043 * grab new references on the AH, further delaying the leave. 1044 * 1045 * In the case of mcg delete (or create) trap when the port is sendonly 1046 * joined, the AH and MCE handling is different: the AH and MCE has to be 1047 * immediately taken off the active lists (forcing a join and path lookup 1048 * at the next Tx is the only guaranteed means of ensuring a proper Tx 1049 * to an mcg as it is repeatedly created and deleted and goes thru 1050 * reincarnations). 1051 * 1052 * When a port is already sendonly joined, and a multicast enable is 1053 * attempted, the same mce structure is promoted; this ensures only a 1054 * single mce on the active list tracks the most powerful join state. 1055 * 1056 * In the case of port up event handling, the MCE for sendonly membership 1057 * is freed up, and the ACE is put into the free list as soon as possible 1058 * (depending on whether posted Tx's have completed). For fullmembership 1059 * MCE's though, the ACE is similarly handled; but the MCE is kept around 1060 * (a re-JOIN is attempted) only if the DLPI leave has not already been 1061 * done; else the mce is deconstructed (mc_fullreap case). 1062 * 1063 * MCG creation and deletion trap handling: 1064 * 1065 * These traps are unreliable (meaning sometimes the trap might never 1066 * be delivered to the subscribed nodes) and may arrive out-of-order 1067 * since they use UD transport. An alternative to relying on these 1068 * unreliable traps is to poll for mcg presence every so often, but 1069 * instead of doing that, we try to be as conservative as possible 1070 * while handling the traps, and hope that the traps do arrive at 1071 * the subscribed nodes soon. Note that if a node is fullmember 1072 * joined to an mcg, it can not possibly receive a mcg create/delete 1073 * trap for that mcg (by fullmember definition); if it does, it is 1074 * an old trap from a previous incarnation of the mcg. 1075 * 1076 * Whenever a trap is received, the driver cleans up its sendonly 1077 * membership to the group; we choose to do a sendonly leave even 1078 * on a creation trap to handle the case of a prior deletion of the mcg 1079 * having gone unnoticed. Consider an example scenario: 1080 * T1: MCG M is deleted, and fires off deletion trap D1. 1081 * T2: MCG M is recreated, fires off creation trap C1, which is lost. 1082 * T3: Node N tries to transmit to M, joining in sendonly mode. 1083 * T4: MCG M is deleted, and fires off deletion trap D2. 1084 * T5: N receives a deletion trap, but can not distinguish D1 from D2. 1085 * If the trap is D2, then a LEAVE is not required, since the mcg 1086 * is already deleted; but if it is D1, a LEAVE is required. A safe 1087 * approach is to always LEAVE, but the SM may be confused if it 1088 * receives a LEAVE without a prior JOIN. 1089 * 1090 * Management of the non-membership to an mcg is similar to the above, 1091 * except that if the interface is in promiscuous mode, it is required 1092 * to attempt to re-join the mcg after receiving a trap. 
Unfortunately, 1093 * if the re-join attempt fails (in which case a warning message needs 1094 * to be printed), it is not clear whether it failed due to the mcg not 1095 * existing, or some fabric/hca issues, due to the delayed nature of 1096 * trap delivery. Querying the SA to establish presence/absence of the 1097 * mcg is also racy at best. Thus, the driver just prints a warning 1098 * message when it can not rejoin after receiving a create trap, although 1099 * this might be (on rare occasions) a mis-warning if the create trap is 1100 * received after the mcg was deleted. 1101 */ 1102 1103 /* 1104 * Implementation of atomic "recycle" bits and reference count 1105 * on address handles. This utilizes the fact that max reference 1106 * count on any handle is limited by number of send wqes, thus 1107 * high bits in the ac_ref field can be used as the recycle bits, 1108 * and only the low bits hold the number of pending Tx requests. 1109 * This atomic AH reference counting allows the Tx completion 1110 * handler not to acquire the id_ac_mutex to process every completion, 1111 * thus reducing lock contention problems between completion and 1112 * the Tx path. 1113 */ 1114 #define CYCLEVAL 0x80000 1115 #define CLEAR_REFCYCLE(ace) (ace)->ac_ref = 0 1116 #define CYCLE_SET(ace) (((ace)->ac_ref & CYCLEVAL) == CYCLEVAL) 1117 #define GET_REF(ace) ((ace)->ac_ref) 1118 #define GET_REF_CYCLE(ace) ( \ 1119 /* \ 1120 * Make sure "cycle" bit is set. \ 1121 */ \ 1122 ASSERT(CYCLE_SET(ace)), \ 1123 ((ace)->ac_ref & ~(CYCLEVAL)) \ 1124 ) 1125 #define INC_REF(ace, num) { \ 1126 atomic_add_32(&(ace)->ac_ref, num); \ 1127 } 1128 #define SET_CYCLE_IF_REF(ace) ( \ 1129 CYCLE_SET(ace) ? B_TRUE : \ 1130 atomic_add_32_nv(&ace->ac_ref, CYCLEVAL) == \ 1131 CYCLEVAL ? \ 1132 /* \ 1133 * Clear the "cycle" bit we just set; \ 1134 * ref count known to be 0 from above. \ 1135 */ \ 1136 CLEAR_REFCYCLE(ace), B_FALSE : \ 1137 /* \ 1138 * We set "cycle" bit; let caller know. \ 1139 */ \ 1140 B_TRUE \ 1141 ) 1142 #define DEC_REF_DO_CYCLE(ace) ( \ 1143 atomic_add_32_nv(&ace->ac_ref, -1) == \ 1144 CYCLEVAL ? \ 1145 /* \ 1146 * Ref count known to be 0 from above. \ 1147 */ \ 1148 B_TRUE : \ 1149 B_FALSE \ 1150 ) 1151 1152 static void * 1153 list_get_head(list_t *list) 1154 { 1155 list_node_t *lhead = list_head(list); 1156 1157 if (lhead != NULL) 1158 list_remove(list, lhead); 1159 return (lhead); 1160 } 1161 1162 /* 1163 * This is always guaranteed to be able to queue the work. 1164 */ 1165 static void 1166 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op) 1167 { 1168 /* Initialize request */ 1169 DPRINT(1, "ibd_queue_work_slot : op: %d \n", op); 1170 ptr->rq_op = op; 1171 1172 /* 1173 * Queue provided slot onto request pool. 1174 */ 1175 mutex_enter(&state->id_acache_req_lock); 1176 list_insert_tail(&state->id_req_list, ptr); 1177 1178 /* Go, fetch, async thread */ 1179 cv_signal(&state->id_acache_req_cv); 1180 mutex_exit(&state->id_acache_req_lock); 1181 } 1182 1183 /* 1184 * Main body of the per interface async thread. 
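 *
 * For reference, a typical producer (see ibd_acache_lookup() below) hands
 * work to this thread roughly as follows; this is a sketch of existing
 * usage, not new code:
 *
 *	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
 *	if (req != NULL) {
 *		bcopy(mac, &req->rq_mac, IPOIB_ADDRL);
 *		ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);
 *	}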
 */
static void
ibd_async_work(ibd_state_t *state)
{
	ibd_req_t *ptr;
	callb_cpr_t cprinfo;

	mutex_enter(&state->id_acache_req_lock);
	CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock,
	    callb_generic_cpr, "ibd_async_work");

	for (;;) {
		ptr = list_get_head(&state->id_req_list);
		if (ptr != NULL) {
			mutex_exit(&state->id_acache_req_lock);

			/*
			 * Once we have done the operation, there is no
			 * guarantee the request slot is going to be valid,
			 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP,
			 * TRAP).
			 *
			 * Perform the request.
			 */
			switch (ptr->rq_op) {
			case IBD_ASYNC_GETAH:
				ibd_async_acache(state, &ptr->rq_mac);
				break;
			case IBD_ASYNC_JOIN:
			case IBD_ASYNC_LEAVE:
				ibd_async_multicast(state,
				    ptr->rq_gid, ptr->rq_op);
				break;
			case IBD_ASYNC_PROMON:
				ibd_async_setprom(state);
				break;
			case IBD_ASYNC_PROMOFF:
				ibd_async_unsetprom(state);
				break;
			case IBD_ASYNC_REAP:
				ibd_async_reap_group(state,
				    ptr->rq_ptr, ptr->rq_gid,
				    IB_MC_JSTATE_FULL);
				/*
				 * The req buf is contained in the mce
				 * structure, so we do not need to free
				 * it here.
				 */
				ptr = NULL;
				break;
			case IBD_ASYNC_TRAP:
				ibd_async_trap(state, ptr);
				break;
			case IBD_ASYNC_SCHED:
				ibd_async_txsched(state);
				break;
			case IBD_ASYNC_LINK:
				ibd_async_link(state, ptr);
				break;
			case IBD_ASYNC_EXIT:
				mutex_enter(&state->id_acache_req_lock);
#ifndef __lock_lint
				CALLB_CPR_EXIT(&cprinfo);
#else
				mutex_exit(&state->id_acache_req_lock);
#endif
				return;
			}
			if (ptr != NULL)
				kmem_cache_free(state->id_req_kmc, ptr);

			mutex_enter(&state->id_acache_req_lock);
		} else {
#ifndef __lock_lint
			/*
			 * Nothing to do: wait till new request arrives.
			 */
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(&state->id_acache_req_cv,
			    &state->id_acache_req_lock);
			CALLB_CPR_SAFE_END(&cprinfo,
			    &state->id_acache_req_lock);
#endif
		}
	}

	/*NOTREACHED*/
	_NOTE(NOT_REACHED)
}

/*
 * Return when it is safe to queue requests to the async daemon; primarily
 * for subnet trap and async event handling. Disallow requests before the
 * daemon is created, and when interface deinitialization starts.
 */
static boolean_t
ibd_async_safe(ibd_state_t *state)
{
	mutex_enter(&state->id_trap_lock);
	if (state->id_trap_stop) {
		mutex_exit(&state->id_trap_lock);
		return (B_FALSE);
	}
	state->id_trap_inprog++;
	mutex_exit(&state->id_trap_lock);
	return (B_TRUE);
}

/*
 * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet
 * trap or event handling to complete to kill the async thread and deconstruct
 * the mcg/ace list.
 */
static void
ibd_async_done(ibd_state_t *state)
{
	mutex_enter(&state->id_trap_lock);
	if (--state->id_trap_inprog == 0)
		cv_signal(&state->id_trap_cv);
	mutex_exit(&state->id_trap_lock);
}

/*
 * Hash functions:
 * ibd_hash_by_id: Returns the qpn as the hash entry into bucket.
 * ibd_hash_key_cmp: Compares two keys, returns 0 on a match, else 1.
 * These operate on mac addresses input into ibd_send, but there is no
 * guarantee on the alignment of the ipoib_mac_t structure.
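 *
 * For context (see ibd_acache_init() below), these two routines are the
 * ones registered with the mod_hash framework for the active AH hash:
 *
 *	state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash",
 *	    IBD_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor,
 *	    ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP);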
 */
/*ARGSUSED*/
static uint_t
ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
{
	ulong_t ptraddr = (ulong_t)key;
	uint_t hval;

	/*
	 * If the input address is 4 byte aligned, we can just dereference
	 * it. This is most common, since IP will send in a 4 byte aligned
	 * IP header, which implies the 24 byte IPoIB pseudo header will be
	 * 4 byte aligned too.
	 */
	if ((ptraddr & 3) == 0)
		return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn);

	bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t));
	return (hval);
}

static int
ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
{
	if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0)
		return (0);
	else
		return (1);
}

/*
 * Initialize all the per interface caches and lists; AH cache,
 * MCG list etc.
 */
static int
ibd_acache_init(ibd_state_t *state)
{
	ibd_ace_t *ce;
	int i;

	mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL);
	cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL);

	mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL);
	mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL);
	mutex_enter(&state->id_ac_mutex);
	list_create(&state->id_ah_free, sizeof (ibd_ace_t),
	    offsetof(ibd_ace_t, ac_list));
	list_create(&state->id_ah_active, sizeof (ibd_ace_t),
	    offsetof(ibd_ace_t, ac_list));
	state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash",
	    IBD_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor,
	    ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP);
	list_create(&state->id_mc_full, sizeof (ibd_mce_t),
	    offsetof(ibd_mce_t, mc_list));
	list_create(&state->id_mc_non, sizeof (ibd_mce_t),
	    offsetof(ibd_mce_t, mc_list));
	list_create(&state->id_req_list, sizeof (ibd_req_t),
	    offsetof(ibd_req_t, rq_list));
	state->id_ac_hot_ace = NULL;

	state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) *
	    IBD_NUM_AH, KM_SLEEP);
	for (i = 0; i < IBD_NUM_AH; i++, ce++) {
		if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS,
		    state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) {
			mutex_exit(&state->id_ac_mutex);
			ibd_acache_fini(state);
			return (DDI_FAILURE);
		} else {
			CLEAR_REFCYCLE(ce);
			ce->ac_mce = NULL;
			IBD_ACACHE_INSERT_FREE(state, ce);
		}
	}
	mutex_exit(&state->id_ac_mutex);
	return (DDI_SUCCESS);
}

static void
ibd_acache_fini(ibd_state_t *state)
{
	ibd_ace_t *ptr;

	mutex_enter(&state->id_ac_mutex);

	while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) {
		ASSERT(GET_REF(ptr) == 0);
		(void) ibt_free_ud_dest(ptr->ac_dest);
	}

	while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) {
		ASSERT(GET_REF(ptr) == 0);
		(void) ibt_free_ud_dest(ptr->ac_dest);
	}

	list_destroy(&state->id_ah_free);
	list_destroy(&state->id_ah_active);
	list_destroy(&state->id_mc_full);
	list_destroy(&state->id_mc_non);
	list_destroy(&state->id_req_list);
	kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * IBD_NUM_AH);
	mutex_exit(&state->id_ac_mutex);
	mutex_destroy(&state->id_ac_mutex);
	mutex_destroy(&state->id_mc_mutex);
	mutex_destroy(&state->id_acache_req_lock);
	cv_destroy(&state->id_acache_req_cv);
}

/*
 * Search the AH active hash list for a cached path to the input destination.
 * If we are "just looking", hold == F. When we are in the Tx path,
 * we set hold == T to grab a reference on the AH so that it can not
 * be recycled to a new destination while the Tx request is posted.
 */
static ibd_ace_t *
ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num)
{
	ibd_ace_t *ptr;

	ASSERT(mutex_owned(&state->id_ac_mutex));

	/*
	 * Do hash search.
	 */
	if (mod_hash_find(state->id_ah_active_hash,
	    (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) {
		if (hold)
			INC_REF(ptr, num);
		return (ptr);
	}
	return (NULL);
}

/*
 * This is called by the tx side; if an initialized AH is found in
 * the active list, it is locked down and can be used; if no entry
 * is found, an async request is queued to do path resolution.
 */
static ibd_ace_t *
ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe)
{
	ibd_ace_t *ptr;
	ibd_req_t *req;

	/*
	 * Only attempt to print when we can; in the mdt pattr case, the
	 * address is not aligned properly.
	 */
	if (((ulong_t)mac & 3) == 0) {
		DPRINT(4,
		    "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X",
		    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
		    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
		    htonl(mac->ipoib_gidsuff[1]));
	}

	mutex_enter(&state->id_ac_mutex);

	if (((ptr = state->id_ac_hot_ace) != NULL) &&
	    (memcmp(&ptr->ac_mac, mac, sizeof (*mac)) == 0)) {
		INC_REF(ptr, numwqe);
		mutex_exit(&state->id_ac_mutex);
		return (ptr);
	}
	if (((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL)) {
		state->id_ac_hot_ace = ptr;
		mutex_exit(&state->id_ac_mutex);
		return (ptr);
	}

	/*
	 * Implementation of a single outstanding async request; if
	 * the operation is not started yet, queue a request and move
	 * to ongoing state. Remember in id_ah_addr for which address
	 * we are queueing the request, in case we need to flag an error;
	 * any further requests, for the same or a different address, until
	 * the operation completes, are sent back to GLDv3 to be retried.
	 * The async thread will update id_ah_op with an error indication
	 * or will set it to indicate the next look up can start; either
	 * way, it will mac_tx_update() so that all blocked requests come
	 * back here.
	 */
	*err = EAGAIN;
	if (state->id_ah_op == IBD_OP_NOTSTARTED) {
		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
		if (req != NULL) {
			/*
			 * We did not even find the entry; queue a request
			 * for it.
			 */
			bcopy(mac, &(req->rq_mac), IPOIB_ADDRL);
			ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);
			state->id_ah_op = IBD_OP_ONGOING;
			bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL);
		}
	} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) {
		/*
		 * Check the status of the pathrecord lookup request
		 * we had queued before.
		 */
		if (state->id_ah_op == IBD_OP_ERRORED) {
			*err = EFAULT;
			state->id_ah_error++;
		} else {
			/*
			 * IBD_OP_ROUTERED case: We need to send to the
			 * all-router MCG. If we can find the AH for
			 * the mcg, the Tx will be attempted. If we
			 * do not find the AH, we return NORESOURCES
			 * to retry.
1525 */ 1526 ipoib_mac_t routermac; 1527 1528 (void) ibd_get_allroutergroup(state, mac, &routermac); 1529 ptr = ibd_acache_find(state, &routermac, B_TRUE, 1530 numwqe); 1531 } 1532 state->id_ah_op = IBD_OP_NOTSTARTED; 1533 } else if ((state->id_ah_op != IBD_OP_ONGOING) && 1534 (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) { 1535 /* 1536 * This case can happen when we get a higher band 1537 * packet. The easiest way is to reset the state machine 1538 * to accommodate the higher priority packet. 1539 */ 1540 state->id_ah_op = IBD_OP_NOTSTARTED; 1541 } 1542 mutex_exit(&state->id_ac_mutex); 1543 1544 return (ptr); 1545 } 1546 1547 /* 1548 * Grab a not-currently-in-use AH/PathRecord from the active 1549 * list to recycle to a new destination. Only the async thread 1550 * executes this code. 1551 */ 1552 static ibd_ace_t * 1553 ibd_acache_get_unref(ibd_state_t *state) 1554 { 1555 ibd_ace_t *ptr = list_head(&state->id_ah_active); 1556 1557 ASSERT(mutex_owned(&state->id_ac_mutex)); 1558 1559 /* 1560 * Do plain linear search. 1561 */ 1562 while (ptr != NULL) { 1563 /* 1564 * Note that it is possible that the "cycle" bit 1565 * is set on the AH w/o any reference count. The 1566 * mcg must have been deleted, and the tx cleanup 1567 * just decremented the reference count to 0, but 1568 * hasn't gotten around to grabbing the id_ac_mutex 1569 * to move the AH into the free list. 1570 */ 1571 if (GET_REF(ptr) == 0) { 1572 IBD_ACACHE_PULLOUT_ACTIVE(state, ptr); 1573 break; 1574 } 1575 ptr = list_next(&state->id_ah_active, ptr); 1576 } 1577 return (ptr); 1578 } 1579 1580 /* 1581 * Invoked to clean up AH from active list in case of multicast 1582 * disable and to handle sendonly memberships during mcg traps. 1583 * And for port up processing for multicast and unicast AHs. 1584 * Normally, the AH is taken off the active list, and put into 1585 * the free list to be recycled for a new destination. In case 1586 * Tx requests on the AH have not completed yet, the AH is marked 1587 * for reaping (which will put the AH on the free list) once the Tx's 1588 * complete; in this case, depending on the "force" input, we take 1589 * out the AH from the active list right now, or leave it also for 1590 * the reap operation. Returns TRUE if the AH is taken off the active 1591 * list (and either put into the free list right now, or arranged for 1592 * later), FALSE otherwise. 1593 */ 1594 static boolean_t 1595 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force) 1596 { 1597 ibd_ace_t *acactive; 1598 boolean_t ret = B_TRUE; 1599 1600 ASSERT(mutex_owned(&state->id_ac_mutex)); 1601 1602 if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) { 1603 1604 /* 1605 * Note that the AH might already have the cycle bit set 1606 * on it; this might happen if sequences of multicast 1607 * enables and disables are coming so fast, that posted 1608 * Tx's to the mcg have not completed yet, and the cycle 1609 * bit is set successively by each multicast disable. 1610 */ 1611 if (SET_CYCLE_IF_REF(acactive)) { 1612 if (!force) { 1613 /* 1614 * The ace is kept on the active list, further 1615 * Tx's can still grab a reference on it; the 1616 * ace is reaped when all pending Tx's 1617 * referencing the AH complete. 1618 */ 1619 ret = B_FALSE; 1620 } else { 1621 /* 1622 * In the mcg trap case, we always pull the 1623 * AH from the active list. And also the port 1624 * up multi/unicast case. 
1625 */ 1626 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1627 acactive->ac_mce = NULL; 1628 } 1629 } else { 1630 /* 1631 * Determined the ref count is 0, thus reclaim 1632 * immediately after pulling out the ace from 1633 * the active list. 1634 */ 1635 IBD_ACACHE_PULLOUT_ACTIVE(state, acactive); 1636 acactive->ac_mce = NULL; 1637 IBD_ACACHE_INSERT_FREE(state, acactive); 1638 } 1639 1640 } 1641 return (ret); 1642 } 1643 1644 /* 1645 * Helper function for async path record lookup. If we are trying to 1646 * Tx to a MCG, check our membership, possibly trying to join the 1647 * group if required. If that fails, try to send the packet to the 1648 * all router group (indicated by the redirect output), pointing 1649 * the input mac address to the router mcg address. 1650 */ 1651 static ibd_mce_t * 1652 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect) 1653 { 1654 ib_gid_t mgid; 1655 ibd_mce_t *mce; 1656 ipoib_mac_t routermac; 1657 1658 *redirect = B_FALSE; 1659 ibd_n2h_gid(mac, &mgid); 1660 1661 /* 1662 * Check the FullMember+SendOnlyNonMember list. 1663 * Since we are the only one who manipulates the 1664 * id_mc_full list, no locks are needed. 1665 */ 1666 mce = IBD_MCACHE_FIND_FULL(state, mgid); 1667 if (mce != NULL) { 1668 DPRINT(4, "ibd_async_mcache : already joined to group"); 1669 return (mce); 1670 } 1671 1672 /* 1673 * Not found; try to join(SendOnlyNonMember) and attach. 1674 */ 1675 DPRINT(4, "ibd_async_mcache : not joined to group"); 1676 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1677 NULL) { 1678 DPRINT(4, "ibd_async_mcache : nonmem joined to group"); 1679 return (mce); 1680 } 1681 1682 /* 1683 * MCGroup not present; try to join the all-router group. If 1684 * any of the following steps succeed, we will be redirecting 1685 * to the all router group. 1686 */ 1687 DPRINT(4, "ibd_async_mcache : nonmem join failed"); 1688 if (!ibd_get_allroutergroup(state, mac, &routermac)) 1689 return (NULL); 1690 *redirect = B_TRUE; 1691 ibd_n2h_gid(&routermac, &mgid); 1692 bcopy(&routermac, mac, IPOIB_ADDRL); 1693 DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n", 1694 mgid.gid_prefix, mgid.gid_guid); 1695 1696 /* 1697 * Are we already joined to the router group? 1698 */ 1699 if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) { 1700 DPRINT(4, "ibd_async_mcache : using already joined router" 1701 "group\n"); 1702 return (mce); 1703 } 1704 1705 /* 1706 * Can we join(SendOnlyNonMember) the router group? 1707 */ 1708 DPRINT(4, "ibd_async_mcache : attempting join to router grp"); 1709 if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) != 1710 NULL) { 1711 DPRINT(4, "ibd_async_mcache : joined to router grp"); 1712 return (mce); 1713 } 1714 1715 return (NULL); 1716 } 1717 1718 /* 1719 * Async path record lookup code. 1720 */ 1721 static void 1722 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac) 1723 { 1724 ibd_ace_t *ce; 1725 ibd_mce_t *mce = NULL; 1726 ibt_path_attr_t path_attr; 1727 ibt_path_info_t path_info; 1728 ib_gid_t destgid; 1729 char ret = IBD_OP_NOTSTARTED; 1730 1731 DPRINT(4, "ibd_async_acache : %08X:%08X:%08X:%08X:%08X", 1732 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1733 htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]), 1734 htonl(mac->ipoib_gidsuff[1])); 1735 1736 /* 1737 * Check whether we are trying to transmit to a MCG. 1738 * In that case, we need to make sure we are a member of 1739 * the MCG. 
1740 */ 1741 if (mac->ipoib_qpn == htonl(IB_MC_QPN)) { 1742 boolean_t redirected; 1743 1744 /* 1745 * If we can not find or join the group or even 1746 * redirect, error out. 1747 */ 1748 if ((mce = ibd_async_mcache(state, mac, &redirected)) == 1749 NULL) { 1750 state->id_ah_op = IBD_OP_ERRORED; 1751 return; 1752 } 1753 1754 /* 1755 * If we got redirected, we need to determine whether 1756 * the AH for the new mcg is in the cache already, and 1757 * not pull it in then; otherwise proceed to get the 1758 * path for the new mcg. There is no guarantee that 1759 * if the AH is currently in the cache, it will still be 1760 * there when we look in ibd_acache_lookup(), but that's 1761 * okay, we will come back here. 1762 */ 1763 if (redirected) { 1764 ret = IBD_OP_ROUTERED; 1765 DPRINT(4, "ibd_async_acache : redirected to " 1766 "%08X:%08X:%08X:%08X:%08X", 1767 htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]), 1768 htonl(mac->ipoib_gidpref[1]), 1769 htonl(mac->ipoib_gidsuff[0]), 1770 htonl(mac->ipoib_gidsuff[1])); 1771 1772 mutex_enter(&state->id_ac_mutex); 1773 if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) { 1774 state->id_ah_op = IBD_OP_ROUTERED; 1775 mutex_exit(&state->id_ac_mutex); 1776 DPRINT(4, "ibd_async_acache : router AH found"); 1777 return; 1778 } 1779 mutex_exit(&state->id_ac_mutex); 1780 } 1781 } 1782 1783 /* 1784 * Get an AH from the free list. 1785 */ 1786 mutex_enter(&state->id_ac_mutex); 1787 if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) { 1788 /* 1789 * No free ones; try to grab an unreferenced active 1790 * one. Maybe we need to make the active list LRU, 1791 * but that will create more work for Tx callbacks. 1792 * Is there a way of not having to pull out the 1793 * entry from the active list, but just indicate it 1794 * is being recycled? Yes, but that creates one more 1795 * check in the fast lookup path. 1796 */ 1797 if ((ce = ibd_acache_get_unref(state)) == NULL) { 1798 /* 1799 * Pretty serious shortage now. 1800 */ 1801 state->id_ah_op = IBD_OP_NOTSTARTED; 1802 mutex_exit(&state->id_ac_mutex); 1803 DPRINT(10, "ibd_async_acache : failed to find AH " 1804 "slot\n"); 1805 return; 1806 } 1807 /* 1808 * We could check whether ac_mce points to a SendOnly 1809 * member and drop that membership now. Or do it lazily 1810 * at detach time. 1811 */ 1812 ce->ac_mce = NULL; 1813 } 1814 mutex_exit(&state->id_ac_mutex); 1815 ASSERT(ce->ac_mce == NULL); 1816 1817 /* 1818 * Update the entry. 1819 */ 1820 bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL); 1821 1822 bzero(&path_info, sizeof (path_info)); 1823 bzero(&path_attr, sizeof (ibt_path_attr_t)); 1824 path_attr.pa_sgid = state->id_sgid; 1825 path_attr.pa_num_dgids = 1; 1826 ibd_n2h_gid(&ce->ac_mac, &destgid); 1827 path_attr.pa_dgids = &destgid; 1828 path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 1829 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 1830 &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) { 1831 DPRINT(10, "ibd_async_acache : failed in ibt_get_paths"); 1832 goto error; 1833 } 1834 if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey, 1835 ntohl(ce->ac_mac.ipoib_qpn), 1836 &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) { 1837 DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest"); 1838 goto error; 1839 } 1840 1841 /* 1842 * mce is set whenever an AH is being associated with a 1843 * MCG; this will come in handy when we leave the MCG. The 1844 * lock protects Tx fastpath from scanning the active list. 
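 *
 * In particular, the ac_mce back pointer set here is what allows
 * the recycle/Tx-cleanup path to locate the membership later and
 * queue an async reap for it once the last Tx referencing this AH
 * has completed.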
1845 */ 1846 if (mce != NULL) 1847 ce->ac_mce = mce; 1848 mutex_enter(&state->id_ac_mutex); 1849 IBD_ACACHE_INSERT_ACTIVE(state, ce); 1850 state->id_ah_op = ret; 1851 mutex_exit(&state->id_ac_mutex); 1852 return; 1853 error: 1854 /* 1855 * We might want to drop SendOnly membership here if we 1856 * joined above. The lock protects Tx callbacks inserting 1857 * into the free list. 1858 */ 1859 mutex_enter(&state->id_ac_mutex); 1860 state->id_ah_op = IBD_OP_ERRORED; 1861 IBD_ACACHE_INSERT_FREE(state, ce); 1862 mutex_exit(&state->id_ac_mutex); 1863 } 1864 1865 /* 1866 * While restoring port's presence on the subnet on a port up, it is possible 1867 * that the port goes down again. 1868 */ 1869 static void 1870 ibd_async_link(ibd_state_t *state, ibd_req_t *req) 1871 { 1872 ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr; 1873 link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN : 1874 LINK_STATE_UP; 1875 ibd_mce_t *mce, *pmce; 1876 ibd_ace_t *ace, *pace; 1877 1878 DPRINT(10, "ibd_async_link(): %d", opcode); 1879 1880 /* 1881 * On a link up, revalidate the link speed/width. No point doing 1882 * this on a link down, since we will be unable to do SA operations, 1883 * defaulting to the lowest speed. Also notice that we update our 1884 * notion of speed before calling mac_link_update(), which will do 1885 * necessary higher level notifications for speed changes. 1886 */ 1887 if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) { 1888 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) 1889 state->id_link_speed = ibd_get_portspeed(state); 1890 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 1891 } 1892 1893 /* 1894 * Do all the work required to establish our presence on 1895 * the subnet. 1896 */ 1897 if (opcode == IBD_LINK_UP_ABSENT) { 1898 /* 1899 * If in promiscuous mode ... 1900 */ 1901 if (state->id_prom_op == IBD_OP_COMPLETED) { 1902 /* 1903 * Drop all nonmembership. 1904 */ 1905 ibd_async_unsetprom(state); 1906 1907 /* 1908 * Then, try to regain nonmembership to all mcg's. 1909 */ 1910 ibd_async_setprom(state); 1911 1912 } 1913 1914 /* 1915 * Drop all sendonly membership (which also gets rid of the 1916 * AHs); try to reacquire all full membership. 1917 */ 1918 mce = list_head(&state->id_mc_full); 1919 while ((pmce = mce) != NULL) { 1920 mce = list_next(&state->id_mc_full, mce); 1921 if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON) 1922 ibd_leave_group(state, 1923 pmce->mc_info.mc_adds_vect.av_dgid, 1924 IB_MC_JSTATE_SEND_ONLY_NON); 1925 else 1926 ibd_reacquire_group(state, pmce); 1927 } 1928 1929 /* 1930 * Recycle all active AHs to free list (and if there are 1931 * pending posts, make sure they will go into the free list 1932 * once the Tx's complete). Grab the lock to prevent 1933 * concurrent Tx's as well as Tx cleanups. 1934 */ 1935 mutex_enter(&state->id_ac_mutex); 1936 ace = list_head(&state->id_ah_active); 1937 while ((pace = ace) != NULL) { 1938 boolean_t cycled; 1939 1940 ace = list_next(&state->id_ah_active, ace); 1941 mce = pace->ac_mce; 1942 cycled = ibd_acache_recycle(state, &pace->ac_mac, 1943 B_TRUE); 1944 /* 1945 * If this is for an mcg, it must be for a fullmember, 1946 * since we got rid of send-only members above when 1947 * processing the mce list. 1948 */ 1949 ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate == 1950 IB_MC_JSTATE_FULL))); 1951 1952 /* 1953 * Check if the fullmember mce needs to be torn down, 1954 * ie whether the DLPI disable has already been done. 
1955 * If so, do some of the work of tx_cleanup, namely 1956 * causing leave (which will fail), detach and 1957 * mce-freeing. tx_cleanup will put the AH into free 1958 * list. The reason to duplicate some of this 1959 * tx_cleanup work is because we want to delete the 1960 * AH right now instead of waiting for tx_cleanup, to 1961 * force subsequent Tx's to reacquire an AH. 1962 */ 1963 if ((mce != NULL) && (mce->mc_fullreap)) 1964 ibd_async_reap_group(state, mce, 1965 mce->mc_info.mc_adds_vect.av_dgid, 1966 mce->mc_jstate); 1967 } 1968 mutex_exit(&state->id_ac_mutex); 1969 } 1970 1971 /* 1972 * mac handle is guaranteed to exist since driver does ibt_close_hca() 1973 * (which stops further events from being delivered) before 1974 * mac_unregister(). At this point, it is guaranteed that mac_register 1975 * has already been done. 1976 */ 1977 mutex_enter(&state->id_link_mutex); 1978 state->id_link_state = lstate; 1979 mac_link_update(state->id_mh, lstate); 1980 mutex_exit(&state->id_link_mutex); 1981 1982 ibd_async_done(state); 1983 } 1984 1985 /* 1986 * Check the pkey table to see if we can find the pkey we're looking for. 1987 * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on 1988 * failure. 1989 */ 1990 static int 1991 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey, 1992 uint16_t *pkix) 1993 { 1994 uint16_t ndx; 1995 1996 ASSERT(pkix != NULL); 1997 1998 for (ndx = 0; ndx < pkey_tbl_sz; ndx++) { 1999 if (pkey_tbl[ndx] == pkey) { 2000 *pkix = ndx; 2001 return (0); 2002 } 2003 } 2004 return (-1); 2005 } 2006 2007 /* 2008 * When the link is notified up, we need to do a few things, based 2009 * on the port's current p_init_type_reply claiming a reinit has been 2010 * done or not. The reinit steps are: 2011 * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify 2012 * the old Pkey and GID0 are correct. 2013 * 2. Register for mcg traps (already done by ibmf). 2014 * 3. If PreservePresenceReply indicates the SM has restored port's presence 2015 * in subnet, nothing more to do. Else go to next steps (on async daemon). 2016 * 4. Give up all sendonly memberships. 2017 * 5. Acquire all full memberships. 2018 * 6. In promiscuous mode, acquire all non memberships. 2019 * 7. Recycle all AHs to free list. 2020 */ 2021 static void 2022 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code) 2023 { 2024 ibt_hca_portinfo_t *port_infop = NULL; 2025 ibt_status_t ibt_status; 2026 uint_t psize, port_infosz; 2027 ibd_link_op_t opcode; 2028 ibd_req_t *req; 2029 link_state_t new_link_state = LINK_STATE_UP; 2030 uint8_t itreply; 2031 uint16_t pkix; 2032 int ret; 2033 2034 /* 2035 * Let's not race with a plumb or an unplumb; if we detect a 2036 * pkey relocation event later on here, we may have to restart. 2037 */ 2038 ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 2039 2040 mutex_enter(&state->id_link_mutex); 2041 2042 /* 2043 * If the init code in ibd_m_start hasn't yet set up the 2044 * pkey/gid, nothing to do; that code will set the link state. 2045 */ 2046 if (state->id_link_state == LINK_STATE_UNKNOWN) { 2047 mutex_exit(&state->id_link_mutex); 2048 goto link_mod_return; 2049 } 2050 2051 /* 2052 * If this routine was called in response to a port down event, 2053 * we just need to see if this should be informed. 
2054 */ 2055 if (code == IBT_ERROR_PORT_DOWN) { 2056 new_link_state = LINK_STATE_DOWN; 2057 goto update_link_state; 2058 } 2059 2060 /* 2061 * If it's not a port down event we've received, try to get the port 2062 * attributes first. If we fail here, the port is as good as down. 2063 * Otherwise, if the link went down by the time the handler gets 2064 * here, give up - we cannot even validate the pkey/gid since those 2065 * are not valid and this is as bad as a port down anyway. 2066 */ 2067 ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 2068 &port_infop, &psize, &port_infosz); 2069 if ((ibt_status != IBT_SUCCESS) || (psize != 1) || 2070 (port_infop->p_linkstate != IBT_PORT_ACTIVE)) { 2071 new_link_state = LINK_STATE_DOWN; 2072 goto update_link_state; 2073 } 2074 2075 /* 2076 * Check the SM InitTypeReply flags. If both NoLoadReply and 2077 * PreserveContentReply are 0, we don't know anything about the 2078 * data loaded into the port attributes, so we need to verify 2079 * if gid0 and pkey are still valid. 2080 */ 2081 itreply = port_infop->p_init_type_reply; 2082 if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) && 2083 ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) { 2084 /* 2085 * Check to see if the subnet part of GID0 has changed. If 2086 * not, check the simple case first to see if the pkey 2087 * index is the same as before; finally check to see if the 2088 * pkey has been relocated to a different index in the table. 2089 */ 2090 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) 2091 if (bcmp(port_infop->p_sgid_tbl, 2092 &state->id_sgid, sizeof (ib_gid_t)) != 0) { 2093 2094 new_link_state = LINK_STATE_DOWN; 2095 2096 } else if (port_infop->p_pkey_tbl[state->id_pkix] == 2097 state->id_pkey) { 2098 2099 new_link_state = LINK_STATE_UP; 2100 2101 } else if (ibd_locate_pkey(port_infop->p_pkey_tbl, 2102 port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) { 2103 2104 ibt_free_portinfo(port_infop, port_infosz); 2105 mutex_exit(&state->id_link_mutex); 2106 2107 /* 2108 * Currently a restart is required if our pkey has moved 2109 * in the pkey table. If we get the ibt_recycle_ud() to 2110 * work as documented (expected), we may be able to 2111 * avoid a complete restart. Note that we've already 2112 * marked both the start and stop 'in-progress' flags, 2113 * so it is ok to go ahead and do this restart. 2114 */ 2115 (void) ibd_undo_start(state, LINK_STATE_DOWN); 2116 if ((ret = ibd_start(state)) != 0) { 2117 DPRINT(10, "ibd_restart: cannot restart, " 2118 "ret=%d", ret); 2119 } 2120 2121 goto link_mod_return; 2122 } else { 2123 new_link_state = LINK_STATE_DOWN; 2124 } 2125 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) 2126 } 2127 2128 update_link_state: 2129 if (port_infop) { 2130 ibt_free_portinfo(port_infop, port_infosz); 2131 } 2132 2133 /* 2134 * If the old state is the same as the new state, nothing to do 2135 */ 2136 if (state->id_link_state == new_link_state) { 2137 mutex_exit(&state->id_link_mutex); 2138 goto link_mod_return; 2139 } 2140 2141 /* 2142 * Ok, so there was a link state change; see if it's safe to ask 2143 * the async thread to do the work 2144 */ 2145 if (!ibd_async_safe(state)) { 2146 state->id_link_state = new_link_state; 2147 mutex_exit(&state->id_link_mutex); 2148 goto link_mod_return; 2149 } 2150 2151 mutex_exit(&state->id_link_mutex); 2152 2153 /* 2154 * If we're reporting a link up, check InitTypeReply to see if 2155 * the SM has ensured that the port's presence in mcg, traps, 2156 * etc. is intact. 
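 *
 * Illustrative summary of the opcode selection below (assuming the
 * SM_INIT_TYPE_* bit definitions from sm_attr.h):
 *
 *     new_link_state    PreservePresenceReply   opcode
 *     ----------------  ----------------------  ------------------
 *     LINK_STATE_DOWN   (ignored)               IBD_LINK_DOWN
 *     LINK_STATE_UP     set                     IBD_LINK_UP
 *     LINK_STATE_UP     clear                   IBD_LINK_UP_ABSENT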
2157 */ 2158 if (new_link_state == LINK_STATE_DOWN) { 2159 opcode = IBD_LINK_DOWN; 2160 } else { 2161 if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) == 2162 SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) { 2163 opcode = IBD_LINK_UP; 2164 } else { 2165 opcode = IBD_LINK_UP_ABSENT; 2166 } 2167 } 2168 2169 /* 2170 * Queue up a request for ibd_async_link() to handle this link 2171 * state change event 2172 */ 2173 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 2174 req->rq_ptr = (void *)opcode; 2175 ibd_queue_work_slot(state, req, IBD_ASYNC_LINK); 2176 2177 link_mod_return: 2178 ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS); 2179 } 2180 2181 /* 2182 * For the port up/down events, IBTL guarantees there will not be concurrent 2183 * invocations of the handler. IBTL might coalesce link transition events, 2184 * and not invoke the handler for _each_ up/down transition, but it will 2185 * invoke the handler with last known state 2186 */ 2187 static void 2188 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl, 2189 ibt_async_code_t code, ibt_async_event_t *event) 2190 { 2191 ibd_state_t *state = (ibd_state_t *)clnt_private; 2192 2193 switch (code) { 2194 case IBT_ERROR_CATASTROPHIC_CHAN: 2195 ibd_print_warn(state, "catastrophic channel error"); 2196 break; 2197 case IBT_ERROR_CQ: 2198 ibd_print_warn(state, "completion queue error"); 2199 break; 2200 case IBT_PORT_CHANGE_EVENT: 2201 /* 2202 * Events will be delivered to all instances that have 2203 * done ibt_open_hca() but not yet done ibt_close_hca(). 2204 * Only need to do work for our port; IBTF will deliver 2205 * events for other ports on the hca we have ibt_open_hca'ed 2206 * too. Note that id_port is initialized in ibd_attach() 2207 * before we do an ibt_open_hca() in ibd_attach(). 2208 */ 2209 ASSERT(state->id_hca_hdl == hca_hdl); 2210 if (state->id_port != event->ev_port) 2211 break; 2212 2213 if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) == 2214 IBT_PORT_CHANGE_PKEY) { 2215 ibd_link_mod(state, code); 2216 } 2217 break; 2218 case IBT_ERROR_PORT_DOWN: 2219 case IBT_CLNT_REREG_EVENT: 2220 case IBT_EVENT_PORT_UP: 2221 /* 2222 * Events will be delivered to all instances that have 2223 * done ibt_open_hca() but not yet done ibt_close_hca(). 2224 * Only need to do work for our port; IBTF will deliver 2225 * events for other ports on the hca we have ibt_open_hca'ed 2226 * too. Note that id_port is initialized in ibd_attach() 2227 * before we do an ibt_open_hca() in ibd_attach(). 2228 */ 2229 ASSERT(state->id_hca_hdl == hca_hdl); 2230 if (state->id_port != event->ev_port) 2231 break; 2232 2233 ibd_link_mod(state, code); 2234 break; 2235 2236 case IBT_HCA_ATTACH_EVENT: 2237 case IBT_HCA_DETACH_EVENT: 2238 /* 2239 * When a new card is plugged to the system, attach_event is 2240 * invoked. Additionally, a cfgadm needs to be run to make the 2241 * card known to the system, and an ifconfig needs to be run to 2242 * plumb up any ibd interfaces on the card. In the case of card 2243 * unplug, a cfgadm is run that will trigger any RCM scripts to 2244 * unplumb the ibd interfaces on the card; when the card is 2245 * actually unplugged, the detach_event is invoked; 2246 * additionally, if any ibd instances are still active on the 2247 * card (eg there were no associated RCM scripts), driver's 2248 * detach routine is invoked. 
2249 */ 2250 break; 2251 default: 2252 break; 2253 } 2254 } 2255 2256 static int 2257 ibd_register_mac(ibd_state_t *state, dev_info_t *dip) 2258 { 2259 mac_register_t *macp; 2260 int ret; 2261 2262 if ((macp = mac_alloc(MAC_VERSION)) == NULL) { 2263 DPRINT(10, "ibd_register_mac: mac_alloc() failed"); 2264 return (DDI_FAILURE); 2265 } 2266 2267 /* 2268 * Note that when we register with mac during attach, we don't 2269 * have the id_macaddr yet, so we'll simply be registering a 2270 * zero macaddr that we'll overwrite later during plumb (in 2271 * ibd_m_start()). Similar is the case with id_mtu - we'll 2272 * update the mac layer with the correct mtu during plumb. 2273 */ 2274 macp->m_type_ident = MAC_PLUGIN_IDENT_IB; 2275 macp->m_driver = state; 2276 macp->m_dip = dip; 2277 macp->m_src_addr = (uint8_t *)&state->id_macaddr; 2278 macp->m_callbacks = &ibd_m_callbacks; 2279 macp->m_min_sdu = 0; 2280 macp->m_max_sdu = IBD_DEF_MAX_SDU; 2281 2282 /* 2283 * Register ourselves with the GLDv3 interface 2284 */ 2285 if ((ret = mac_register(macp, &state->id_mh)) != 0) { 2286 mac_free(macp); 2287 DPRINT(10, 2288 "ibd_register_mac: mac_register() failed, ret=%d", ret); 2289 return (DDI_FAILURE); 2290 } 2291 2292 mac_free(macp); 2293 return (DDI_SUCCESS); 2294 } 2295 2296 static int 2297 ibd_record_capab(ibd_state_t *state, dev_info_t *dip) 2298 { 2299 ibt_hca_attr_t hca_attrs; 2300 ibt_status_t ibt_status; 2301 2302 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state)) 2303 2304 /* 2305 * Query the HCA and fetch its attributes 2306 */ 2307 ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 2308 ASSERT(ibt_status == IBT_SUCCESS); 2309 2310 /* 2311 * 1. Set the Hardware Checksum capability. Currently we only consider 2312 * full checksum offload. 2313 */ 2314 if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL) == IBT_HCA_CKSUM_FULL) { 2315 state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL; 2316 } 2317 2318 /* 2319 * 2. Set LSO policy, capability and maximum length 2320 */ 2321 if (ddi_prop_get_int(DDI_DEV_T_ANY, dip, 2322 DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, IBD_PROP_LSO_POLICY, 1)) { 2323 state->id_lso_policy = B_TRUE; 2324 } else { 2325 state->id_lso_policy = B_FALSE; 2326 } 2327 2328 if (hca_attrs.hca_max_lso_size > 0) { 2329 state->id_lso_capable = B_TRUE; 2330 if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN) 2331 state->id_lso_maxlen = IBD_LSO_MAXLEN; 2332 else 2333 state->id_lso_maxlen = hca_attrs.hca_max_lso_size; 2334 } else { 2335 state->id_lso_capable = B_FALSE; 2336 state->id_lso_maxlen = 0; 2337 } 2338 2339 /* 2340 * 3. Set Reserved L_Key capability 2341 */ 2342 if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) { 2343 state->id_hca_res_lkey_capab = 1; 2344 state->id_res_lkey = hca_attrs.hca_reserved_lkey; 2345 } 2346 2347 /* 2348 * 4. Set maximum sqseg value after checking to see if extended sgl 2349 * size information is provided by the hca 2350 */ 2351 if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) { 2352 state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz; 2353 } else { 2354 state->id_max_sqseg = hca_attrs.hca_max_sgl; 2355 } 2356 if (state->id_max_sqseg > IBD_MAX_SQSEG) { 2357 state->id_max_sqseg = IBD_MAX_SQSEG; 2358 } else if (state->id_max_sqseg < IBD_MAX_SQSEG) { 2359 ibd_print_warn(state, "Set #sgl = %d instead of default %d", 2360 state->id_max_sqseg, IBD_MAX_SQSEG); 2361 } 2362 2363 /* 2364 * Translating the virtual address regions into physical regions 2365 * for using the Reserved LKey feature results in a wr sgl that 2366 * is a little longer. 
Since failing ibt_map_mem_iov() is costly, 2367 * we'll fix a high-water mark (65%) for when we should stop. 2368 */ 2369 state->id_max_sqseg_hiwm = (state->id_max_sqseg * 65) / 100; 2370 2371 /* 2372 * 5. Set number of recv and send wqes after checking hca maximum 2373 * channel size 2374 */ 2375 if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE) { 2376 state->id_num_rwqe = hca_attrs.hca_max_chan_sz; 2377 } else { 2378 state->id_num_rwqe = IBD_NUM_RWQE; 2379 } 2380 state->id_rx_bufs_outstanding_limit = state->id_num_rwqe - IBD_RWQE_MIN; 2381 if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE) { 2382 state->id_num_swqe = hca_attrs.hca_max_chan_sz; 2383 } else { 2384 state->id_num_swqe = IBD_NUM_SWQE; 2385 } 2386 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 2387 2388 return (DDI_SUCCESS); 2389 } 2390 2391 static int 2392 ibd_unattach(ibd_state_t *state, dev_info_t *dip) 2393 { 2394 int instance; 2395 uint32_t progress = state->id_mac_state; 2396 ibt_status_t ret; 2397 2398 if (progress & IBD_DRV_MAC_REGISTERED) { 2399 (void) mac_unregister(state->id_mh); 2400 state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED); 2401 } 2402 2403 if (progress & IBD_DRV_PD_ALLOCD) { 2404 if ((ret = ibt_free_pd(state->id_hca_hdl, 2405 state->id_pd_hdl)) != IBT_SUCCESS) { 2406 ibd_print_warn(state, "failed to free " 2407 "protection domain, ret=%d", ret); 2408 } 2409 state->id_pd_hdl = NULL; 2410 state->id_mac_state &= (~IBD_DRV_PD_ALLOCD); 2411 } 2412 2413 if (progress & IBD_DRV_HCA_OPENED) { 2414 if ((ret = ibt_close_hca(state->id_hca_hdl)) != 2415 IBT_SUCCESS) { 2416 ibd_print_warn(state, "failed to close " 2417 "HCA device, ret=%d", ret); 2418 } 2419 state->id_hca_hdl = NULL; 2420 state->id_mac_state &= (~IBD_DRV_HCA_OPENED); 2421 } 2422 2423 if (progress & IBD_DRV_IBTL_ATTACH_DONE) { 2424 if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) { 2425 ibd_print_warn(state, 2426 "ibt_detach() failed, ret=%d", ret); 2427 } 2428 state->id_ibt_hdl = NULL; 2429 state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE); 2430 } 2431 2432 if (progress & IBD_DRV_TXINTR_ADDED) { 2433 ddi_remove_softintr(state->id_tx); 2434 state->id_tx = NULL; 2435 state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED); 2436 } 2437 2438 if (progress & IBD_DRV_RXINTR_ADDED) { 2439 ddi_remove_softintr(state->id_rx); 2440 state->id_rx = NULL; 2441 state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED); 2442 } 2443 2444 if (progress & IBD_DRV_STATE_INITIALIZED) { 2445 ibd_state_fini(state); 2446 state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED); 2447 } 2448 2449 instance = ddi_get_instance(dip); 2450 ddi_soft_state_free(ibd_list, instance); 2451 2452 return (DDI_SUCCESS); 2453 } 2454 2455 /* 2456 * Attach device to the IO framework. 
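 *
 * Rough outline of the sequence below; each successful step records
 * an IBD_DRV_* flag in id_mac_state so that ibd_unattach() can tear
 * down exactly what was set up:
 *
 *     ibd_state_init()        locks, cvs, req kmem cache
 *     ddi_add_softintr()      rx/tx soft interrupts (if enabled)
 *     ddi_prop_get_*()        port-pkey, port-number, hca-guid
 *     ibt_attach()            register with IBTL
 *     ibt_open_hca()          open the HCA by guid
 *     ibd_record_capab()      cksum/LSO/rlkey/sgl/channel limits
 *     ibt_alloc_pd()          allocate the protection domain
 *     ibd_register_mac()      GLDv3 registration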
2457 */ 2458 static int 2459 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 2460 { 2461 ibd_state_t *state = NULL; 2462 ib_guid_t hca_guid; 2463 int instance; 2464 ibt_status_t ret; 2465 int rv; 2466 2467 /* 2468 * IBD doesn't support suspend/resume 2469 */ 2470 if (cmd != DDI_ATTACH) 2471 return (DDI_FAILURE); 2472 2473 /* 2474 * Allocate softstate structure 2475 */ 2476 instance = ddi_get_instance(dip); 2477 if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE) 2478 return (DDI_FAILURE); 2479 state = ddi_get_soft_state(ibd_list, instance); 2480 2481 /* 2482 * Initialize mutexes and condition variables 2483 */ 2484 if (ibd_state_init(state, dip) != DDI_SUCCESS) { 2485 DPRINT(10, "ibd_attach: failed in ibd_state_init()"); 2486 goto attach_fail; 2487 } 2488 state->id_mac_state |= IBD_DRV_STATE_INITIALIZED; 2489 2490 /* 2491 * Allocate rx,tx softintr 2492 */ 2493 if (ibd_rx_softintr == 1) { 2494 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx, 2495 NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) { 2496 DPRINT(10, "ibd_attach: failed in " 2497 "ddi_add_softintr(id_rx), ret=%d", rv); 2498 goto attach_fail; 2499 } 2500 state->id_mac_state |= IBD_DRV_RXINTR_ADDED; 2501 } 2502 if (ibd_tx_softintr == 1) { 2503 if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx, 2504 NULL, NULL, ibd_tx_recycle, 2505 (caddr_t)state)) != DDI_SUCCESS) { 2506 DPRINT(10, "ibd_attach: failed in " 2507 "ddi_add_softintr(id_tx), ret=%d", rv); 2508 goto attach_fail; 2509 } 2510 state->id_mac_state |= IBD_DRV_TXINTR_ADDED; 2511 } 2512 2513 /* 2514 * Obtain IBA P_Key, port number and HCA guid and validate 2515 * them (for P_Key, only full members are allowed as per 2516 * IPoIB specification; neither port number nor HCA guid 2517 * can be zero) 2518 */ 2519 if ((state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, 2520 "port-pkey", IB_PKEY_INVALID_LIMITED)) <= IB_PKEY_INVALID_FULL) { 2521 DPRINT(10, "ibd_attach: port device has wrong partition (0x%x)", 2522 state->id_pkey); 2523 goto attach_fail; 2524 } 2525 if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0, 2526 "port-number", 0)) == 0) { 2527 DPRINT(10, "ibd_attach: invalid port number (%d)", 2528 state->id_port); 2529 goto attach_fail; 2530 } 2531 if ((hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0, 2532 "hca-guid", 0)) == 0) { 2533 DPRINT(10, "ibd_attach: port hca has invalid guid (0x%llx)", 2534 hca_guid); 2535 goto attach_fail; 2536 } 2537 2538 /* 2539 * Attach to IBTL 2540 */ 2541 if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state, 2542 &state->id_ibt_hdl)) != IBT_SUCCESS) { 2543 DPRINT(10, "ibd_attach: failed in ibt_attach(), ret=%d", ret); 2544 goto attach_fail; 2545 } 2546 state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE; 2547 2548 /* 2549 * Open the HCA 2550 */ 2551 if ((ret = ibt_open_hca(state->id_ibt_hdl, hca_guid, 2552 &state->id_hca_hdl)) != IBT_SUCCESS) { 2553 DPRINT(10, "ibd_attach: ibt_open_hca() failed, ret=%d", ret); 2554 goto attach_fail; 2555 } 2556 state->id_mac_state |= IBD_DRV_HCA_OPENED; 2557 2558 /* 2559 * Record capabilities 2560 */ 2561 (void) ibd_record_capab(state, dip); 2562 2563 /* 2564 * Allocate a protection domain on the HCA 2565 */ 2566 if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS, 2567 &state->id_pd_hdl)) != IBT_SUCCESS) { 2568 DPRINT(10, "ibd_attach: ibt_alloc_pd() failed, ret=%d", ret); 2569 goto attach_fail; 2570 } 2571 state->id_mac_state |= IBD_DRV_PD_ALLOCD; 2572 2573 2574 /* 2575 * Register ibd interfaces with the Nemo framework 2576 */ 2577 if 
(ibd_register_mac(state, dip) != IBT_SUCCESS) { 2578 DPRINT(10, "ibd_attach: failed in ibd_register_mac()"); 2579 goto attach_fail; 2580 } 2581 state->id_mac_state |= IBD_DRV_MAC_REGISTERED; 2582 2583 /* 2584 * We're done with everything we could to make the attach 2585 * succeed. All the buffer allocations and IPoIB broadcast 2586 * group joins are deferred to when the interface instance 2587 * is actually plumbed to avoid wasting memory. 2588 */ 2589 return (DDI_SUCCESS); 2590 2591 attach_fail: 2592 (void) ibd_unattach(state, dip); 2593 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state)) 2594 return (DDI_FAILURE); 2595 } 2596 2597 /* 2598 * Detach device from the IO framework. 2599 */ 2600 static int 2601 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 2602 { 2603 ibd_state_t *state; 2604 int instance; 2605 2606 /* 2607 * IBD doesn't support suspend/resume 2608 */ 2609 if (cmd != DDI_DETACH) 2610 return (DDI_FAILURE); 2611 2612 /* 2613 * Get the instance softstate 2614 */ 2615 instance = ddi_get_instance(dip); 2616 state = ddi_get_soft_state(ibd_list, instance); 2617 2618 /* 2619 * Release all resources we're holding still. Note that if we'd 2620 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly 2621 * so far, we should find all the flags we need in id_mac_state. 2622 */ 2623 (void) ibd_unattach(state, dip); 2624 2625 return (DDI_SUCCESS); 2626 } 2627 2628 /* 2629 * Pre ibt_attach() driver initialization 2630 */ 2631 static int 2632 ibd_state_init(ibd_state_t *state, dev_info_t *dip) 2633 { 2634 char buf[64]; 2635 2636 mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL); 2637 state->id_link_state = LINK_STATE_UNKNOWN; 2638 2639 mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL); 2640 cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL); 2641 state->id_trap_stop = B_TRUE; 2642 state->id_trap_inprog = 0; 2643 2644 mutex_init(&state->id_scq_poll_lock, NULL, MUTEX_DRIVER, NULL); 2645 mutex_init(&state->id_rcq_poll_lock, NULL, MUTEX_DRIVER, NULL); 2646 state->id_dip = dip; 2647 2648 mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL); 2649 2650 mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2651 mutex_enter(&state->id_tx_list.dl_mutex); 2652 state->id_tx_list.dl_head = NULL; 2653 state->id_tx_list.dl_pending_sends = B_FALSE; 2654 state->id_tx_list.dl_cnt = 0; 2655 mutex_exit(&state->id_tx_list.dl_mutex); 2656 mutex_init(&state->id_tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2657 mutex_enter(&state->id_tx_rel_list.dl_mutex); 2658 state->id_tx_rel_list.dl_head = NULL; 2659 state->id_tx_rel_list.dl_pending_sends = B_FALSE; 2660 state->id_tx_rel_list.dl_cnt = 0; 2661 mutex_exit(&state->id_tx_rel_list.dl_mutex); 2662 mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL); 2663 state->id_tx_busy = 0; 2664 mutex_init(&state->id_lso_lock, NULL, MUTEX_DRIVER, NULL); 2665 2666 state->id_rx_list.dl_bufs_outstanding = 0; 2667 state->id_rx_list.dl_cnt = 0; 2668 mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2669 mutex_init(&state->id_rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL); 2670 (void) sprintf(buf, "ibd_req%d", ddi_get_instance(dip)); 2671 state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t), 2672 0, NULL, NULL, NULL, NULL, NULL, 0); 2673 2674 mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL); 2675 cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL); 2676 2677 return (DDI_SUCCESS); 2678 } 2679 2680 /* 2681 * Post ibt_detach() driver deconstruction 2682 */ 2683 static void 2684 
ibd_state_fini(ibd_state_t *state) 2685 { 2686 cv_destroy(&state->id_macst_cv); 2687 mutex_destroy(&state->id_macst_lock); 2688 2689 kmem_cache_destroy(state->id_req_kmc); 2690 2691 mutex_destroy(&state->id_rx_list.dl_mutex); 2692 mutex_destroy(&state->id_rx_free_list.dl_mutex); 2693 2694 mutex_destroy(&state->id_txpost_lock); 2695 mutex_destroy(&state->id_tx_list.dl_mutex); 2696 mutex_destroy(&state->id_tx_rel_list.dl_mutex); 2697 mutex_destroy(&state->id_lso_lock); 2698 2699 mutex_destroy(&state->id_sched_lock); 2700 mutex_destroy(&state->id_scq_poll_lock); 2701 mutex_destroy(&state->id_rcq_poll_lock); 2702 2703 cv_destroy(&state->id_trap_cv); 2704 mutex_destroy(&state->id_trap_lock); 2705 mutex_destroy(&state->id_link_mutex); 2706 } 2707 2708 /* 2709 * Fetch link speed from SA for snmp ifspeed reporting. 2710 */ 2711 static uint64_t 2712 ibd_get_portspeed(ibd_state_t *state) 2713 { 2714 int ret; 2715 ibt_path_info_t path; 2716 ibt_path_attr_t path_attr; 2717 uint8_t num_paths; 2718 uint64_t ifspeed; 2719 2720 /* 2721 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire 2722 * translates to 2 Gbps data rate. Thus, 1X single data rate is 2723 * 2000000000. Start with that as default. 2724 */ 2725 ifspeed = 2000000000; 2726 2727 bzero(&path_attr, sizeof (path_attr)); 2728 2729 /* 2730 * Get the port speed from Loopback path information. 2731 */ 2732 path_attr.pa_dgids = &state->id_sgid; 2733 path_attr.pa_num_dgids = 1; 2734 path_attr.pa_sgid = state->id_sgid; 2735 2736 if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS, 2737 &path_attr, 1, &path, &num_paths) != IBT_SUCCESS) 2738 goto earlydone; 2739 2740 if (num_paths < 1) 2741 goto earlydone; 2742 2743 /* 2744 * In case SA does not return an expected value, report the default 2745 * speed as 1X. 2746 */ 2747 ret = 1; 2748 switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) { 2749 case IBT_SRATE_2: /* 1X SDR i.e 2.5 Gbps */ 2750 ret = 1; 2751 break; 2752 case IBT_SRATE_10: /* 4X SDR or 1X QDR i.e 10 Gbps */ 2753 ret = 4; 2754 break; 2755 case IBT_SRATE_30: /* 12X SDR i.e 30 Gbps */ 2756 ret = 12; 2757 break; 2758 case IBT_SRATE_5: /* 1X DDR i.e 5 Gbps */ 2759 ret = 2; 2760 break; 2761 case IBT_SRATE_20: /* 4X DDR or 8X SDR i.e 20 Gbps */ 2762 ret = 8; 2763 break; 2764 case IBT_SRATE_40: /* 8X DDR or 4X QDR i.e 40 Gbps */ 2765 ret = 16; 2766 break; 2767 case IBT_SRATE_60: /* 12X DDR i.e 60 Gbps */ 2768 ret = 24; 2769 break; 2770 case IBT_SRATE_80: /* 8X QDR i.e 80 Gbps */ 2771 ret = 32; 2772 break; 2773 case IBT_SRATE_120: /* 12X QDR i.e 120 Gbps */ 2774 ret = 48; 2775 break; 2776 } 2777 2778 ifspeed *= ret; 2779 2780 earlydone: 2781 return (ifspeed); 2782 } 2783 2784 /* 2785 * Search input mcg list (id_mc_full or id_mc_non) for an entry 2786 * representing the input mcg mgid. 2787 */ 2788 static ibd_mce_t * 2789 ibd_mcache_find(ib_gid_t mgid, struct list *mlist) 2790 { 2791 ibd_mce_t *ptr = list_head(mlist); 2792 2793 /* 2794 * Do plain linear search. 2795 */ 2796 while (ptr != NULL) { 2797 if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid, 2798 sizeof (ib_gid_t)) == 0) 2799 return (ptr); 2800 ptr = list_next(mlist, ptr); 2801 } 2802 return (NULL); 2803 } 2804 2805 /* 2806 * Execute IBA JOIN. 
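 *
 * This is a thin wrapper around ibt_join_mcg(): the qkey, SL, flow
 * label and tclass are taken from the broadcast group info cached
 * in id_mcinfo, the pkey and scope from the interface state, while
 * the mgid and join state come from the caller's mce.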
2807 */ 2808 static ibt_status_t 2809 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce) 2810 { 2811 ibt_mcg_attr_t mcg_attr; 2812 2813 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 2814 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 2815 mcg_attr.mc_mgid = mgid; 2816 mcg_attr.mc_join_state = mce->mc_jstate; 2817 mcg_attr.mc_scope = state->id_scope; 2818 mcg_attr.mc_pkey = state->id_pkey; 2819 mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow; 2820 mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl; 2821 mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass; 2822 return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info, 2823 NULL, NULL)); 2824 } 2825 2826 /* 2827 * This code JOINs the port in the proper way (depending on the join 2828 * state) so that IBA fabric will forward mcg packets to/from the port. 2829 * It also attaches the QPN to the mcg so it can receive those mcg 2830 * packets. This code makes sure not to attach the mcg to the QP if 2831 * that has been previously done due to the mcg being joined with a 2832 * different join state, even though this is not required by SWG_0216, 2833 * refid 3610. 2834 */ 2835 static ibd_mce_t * 2836 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 2837 { 2838 ibt_status_t ibt_status; 2839 ibd_mce_t *mce, *tmce, *omce = NULL; 2840 boolean_t do_attach = B_TRUE; 2841 2842 DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n", 2843 jstate, mgid.gid_prefix, mgid.gid_guid); 2844 2845 /* 2846 * For enable_multicast Full member joins, we need to do some 2847 * extra work. If there is already an mce on the list that 2848 * indicates full membership, that means the membership has 2849 * not yet been dropped (since the disable_multicast was issued) 2850 * because there are pending Tx's to the mcg; in that case, just 2851 * mark the mce not to be reaped when the Tx completion queues 2852 * an async reap operation. 2853 * 2854 * If there is already an mce on the list indicating sendonly 2855 * membership, try to promote to full membership. Be careful 2856 * not to deallocate the old mce, since there might be an AH 2857 * pointing to it; instead, update the old mce with new data 2858 * that tracks the full membership. 2859 */ 2860 if ((jstate == IB_MC_JSTATE_FULL) && ((omce = 2861 IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) { 2862 if (omce->mc_jstate == IB_MC_JSTATE_FULL) { 2863 ASSERT(omce->mc_fullreap); 2864 omce->mc_fullreap = B_FALSE; 2865 return (omce); 2866 } else { 2867 ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON); 2868 } 2869 } 2870 2871 /* 2872 * Allocate the ibd_mce_t to track this JOIN. 2873 */ 2874 mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP); 2875 mce->mc_fullreap = B_FALSE; 2876 mce->mc_jstate = jstate; 2877 2878 if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) { 2879 DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d", 2880 ibt_status); 2881 kmem_free(mce, sizeof (ibd_mce_t)); 2882 return (NULL); 2883 } 2884 2885 /* 2886 * Is an IBA attach required? Not if the interface is already joined 2887 * to the mcg in a different appropriate join state. 
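 *
 * Illustrative summary of the do_attach decision below:
 *
 *     requested jstate            attach the QP to the mcg?
 *     --------------------------  ---------------------------------
 *     IB_MC_JSTATE_NON            only if not already attached via
 *                                 a FULL join
 *     IB_MC_JSTATE_FULL           only if not already attached via
 *                                 a NON join
 *     IB_MC_JSTATE_SEND_ONLY_NON  never (Tx only, no Rx from the
 *                                 mcg is needed)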
2888 */ 2889 if (jstate == IB_MC_JSTATE_NON) { 2890 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 2891 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 2892 do_attach = B_FALSE; 2893 } else if (jstate == IB_MC_JSTATE_FULL) { 2894 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 2895 do_attach = B_FALSE; 2896 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 2897 do_attach = B_FALSE; 2898 } 2899 2900 if (do_attach) { 2901 /* 2902 * Do the IBA attach. 2903 */ 2904 DPRINT(10, "ibd_join_group: ibt_attach_mcg \n"); 2905 if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl, 2906 &mce->mc_info)) != IBT_SUCCESS) { 2907 DPRINT(10, "ibd_join_group : failed qp attachment " 2908 "%d\n", ibt_status); 2909 /* 2910 * NOTE that we should probably preserve the join info 2911 * in the list and later try to leave again at detach 2912 * time. 2913 */ 2914 (void) ibt_leave_mcg(state->id_sgid, mgid, 2915 state->id_sgid, jstate); 2916 kmem_free(mce, sizeof (ibd_mce_t)); 2917 return (NULL); 2918 } 2919 } 2920 2921 /* 2922 * Insert the ibd_mce_t in the proper list. 2923 */ 2924 if (jstate == IB_MC_JSTATE_NON) { 2925 IBD_MCACHE_INSERT_NON(state, mce); 2926 } else { 2927 /* 2928 * Set up the mc_req fields used for reaping the 2929 * mcg in case of delayed tx completion (see 2930 * ibd_tx_cleanup()). Also done for sendonly join in 2931 * case we are promoted to fullmembership later and 2932 * keep using the same mce. 2933 */ 2934 mce->mc_req.rq_gid = mgid; 2935 mce->mc_req.rq_ptr = mce; 2936 /* 2937 * Check whether this is the case of trying to join 2938 * full member, and we were already joined send only. 2939 * We try to drop our SendOnly membership, but it is 2940 * possible that the mcg does not exist anymore (and 2941 * the subnet trap never reached us), so the leave 2942 * operation might fail. 2943 */ 2944 if (omce != NULL) { 2945 (void) ibt_leave_mcg(state->id_sgid, mgid, 2946 state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON); 2947 omce->mc_jstate = IB_MC_JSTATE_FULL; 2948 bcopy(&mce->mc_info, &omce->mc_info, 2949 sizeof (ibt_mcg_info_t)); 2950 kmem_free(mce, sizeof (ibd_mce_t)); 2951 return (omce); 2952 } 2953 mutex_enter(&state->id_mc_mutex); 2954 IBD_MCACHE_INSERT_FULL(state, mce); 2955 mutex_exit(&state->id_mc_mutex); 2956 } 2957 2958 return (mce); 2959 } 2960 2961 /* 2962 * Called during port up event handling to attempt to reacquire full 2963 * membership to an mcg. Stripped down version of ibd_join_group(). 2964 * Note that it is possible that the mcg might have gone away, and 2965 * gets recreated at this point. 2966 */ 2967 static void 2968 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce) 2969 { 2970 ib_gid_t mgid; 2971 2972 /* 2973 * If the mc_fullreap flag is set, or this join fails, a subsequent 2974 * reap/leave is going to try to leave the group. We could prevent 2975 * that by adding a boolean flag into ibd_mce_t, if required. 
2976 */ 2977 if (mce->mc_fullreap) 2978 return; 2979 2980 mgid = mce->mc_info.mc_adds_vect.av_dgid; 2981 2982 DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix, 2983 mgid.gid_guid); 2984 2985 if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS) 2986 ibd_print_warn(state, "Failure on port up to rejoin " 2987 "multicast gid %016llx:%016llx", 2988 (u_longlong_t)mgid.gid_prefix, 2989 (u_longlong_t)mgid.gid_guid); 2990 } 2991 2992 /* 2993 * This code handles delayed Tx completion cleanups for mcg's to which 2994 * disable_multicast has been issued, regular mcg related cleanups during 2995 * disable_multicast, disable_promiscuous and mcg traps, as well as 2996 * cleanups during driver detach time. Depending on the join state, 2997 * it deletes the mce from the appropriate list and issues the IBA 2998 * leave/detach; except in the disable_multicast case when the mce 2999 * is left on the active list for a subsequent Tx completion cleanup. 3000 */ 3001 static void 3002 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid, 3003 uint8_t jstate) 3004 { 3005 ibd_mce_t *tmce; 3006 boolean_t do_detach = B_TRUE; 3007 3008 /* 3009 * Before detaching, we must check whether the other list 3010 * contains the mcg; if we detach blindly, the consumer 3011 * who set up the other list will also stop receiving 3012 * traffic. 3013 */ 3014 if (jstate == IB_MC_JSTATE_FULL) { 3015 /* 3016 * The following check is only relevant while coming 3017 * from the Tx completion path in the reap case. 3018 */ 3019 if (!mce->mc_fullreap) 3020 return; 3021 mutex_enter(&state->id_mc_mutex); 3022 IBD_MCACHE_PULLOUT_FULL(state, mce); 3023 mutex_exit(&state->id_mc_mutex); 3024 if (IBD_MCACHE_FIND_NON(state, mgid) != NULL) 3025 do_detach = B_FALSE; 3026 } else if (jstate == IB_MC_JSTATE_NON) { 3027 IBD_MCACHE_PULLOUT_NON(state, mce); 3028 tmce = IBD_MCACHE_FIND_FULL(state, mgid); 3029 if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL)) 3030 do_detach = B_FALSE; 3031 } else { /* jstate == IB_MC_JSTATE_SEND_ONLY_NON */ 3032 mutex_enter(&state->id_mc_mutex); 3033 IBD_MCACHE_PULLOUT_FULL(state, mce); 3034 mutex_exit(&state->id_mc_mutex); 3035 do_detach = B_FALSE; 3036 } 3037 3038 /* 3039 * If we are reacting to a mcg trap and leaving our sendonly or 3040 * non membership, the mcg is possibly already gone, so attempting 3041 * to leave might fail. On the other hand, we must try to leave 3042 * anyway, since this might be a trap from long ago, and we could 3043 * have potentially sendonly joined to a recent incarnation of 3044 * the mcg and are about to loose track of this information. 3045 */ 3046 if (do_detach) { 3047 DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : " 3048 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3049 (void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info); 3050 } 3051 3052 (void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate); 3053 kmem_free(mce, sizeof (ibd_mce_t)); 3054 } 3055 3056 /* 3057 * Async code executed due to multicast and promiscuous disable requests 3058 * and mcg trap handling; also executed during driver detach. Mostly, a 3059 * leave and detach is done; except for the fullmember case when Tx 3060 * requests are pending, whence arrangements are made for subsequent 3061 * cleanup on Tx completion. 
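 *
 * Roughly, the flow that follows is:
 *
 *     IB_MC_JSTATE_NON            look up the mce on the non list
 *                                 and reap (leave/detach) right away
 *     IB_MC_JSTATE_SEND_ONLY_NON  look up on the full list; the AH
 *                                 is force-recycled and the
 *                                 membership reaped now, while the
 *                                 AH itself is freed once any
 *                                 pending Tx completes
 *     IB_MC_JSTATE_FULL           look up on the full list, mark
 *                                 mc_fullreap and recycle the AH;
 *                                 reap now if no Tx reference
 *                                 remains, else on last completion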
3062 */ 3063 static void 3064 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate) 3065 { 3066 ipoib_mac_t mcmac; 3067 boolean_t recycled; 3068 ibd_mce_t *mce; 3069 3070 DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n", 3071 jstate, mgid.gid_prefix, mgid.gid_guid); 3072 3073 if (jstate == IB_MC_JSTATE_NON) { 3074 recycled = B_TRUE; 3075 mce = IBD_MCACHE_FIND_NON(state, mgid); 3076 /* 3077 * In case we are handling a mcg trap, we might not find 3078 * the mcg in the non list. 3079 */ 3080 if (mce == NULL) { 3081 return; 3082 } 3083 } else { 3084 mce = IBD_MCACHE_FIND_FULL(state, mgid); 3085 3086 /* 3087 * In case we are handling a mcg trap, make sure the trap 3088 * is not arriving late; if we have an mce that indicates 3089 * that we are already a fullmember, that would be a clear 3090 * indication that the trap arrived late (ie, is for a 3091 * previous incarnation of the mcg). 3092 */ 3093 if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) { 3094 if ((mce == NULL) || (mce->mc_jstate == 3095 IB_MC_JSTATE_FULL)) { 3096 return; 3097 } 3098 } else { 3099 ASSERT(jstate == IB_MC_JSTATE_FULL); 3100 3101 /* 3102 * If join group failed, mce will be NULL here. 3103 * This is because in GLDv3 driver, set multicast 3104 * will always return success. 3105 */ 3106 if (mce == NULL) { 3107 return; 3108 } 3109 3110 mce->mc_fullreap = B_TRUE; 3111 } 3112 3113 /* 3114 * If no pending Tx's remain that reference the AH 3115 * for the mcg, recycle it from active to free list. 3116 * Else in the IB_MC_JSTATE_FULL case, just mark the AH, 3117 * so the last completing Tx will cause an async reap 3118 * operation to be invoked, at which time we will drop our 3119 * membership to the mcg so that the pending Tx's complete 3120 * successfully. Refer to comments on "AH and MCE active 3121 * list manipulation" at top of this file. The lock protects 3122 * against Tx fast path and Tx cleanup code. 3123 */ 3124 mutex_enter(&state->id_ac_mutex); 3125 ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid); 3126 recycled = ibd_acache_recycle(state, &mcmac, (jstate == 3127 IB_MC_JSTATE_SEND_ONLY_NON)); 3128 mutex_exit(&state->id_ac_mutex); 3129 } 3130 3131 if (recycled) { 3132 DPRINT(2, "ibd_leave_group : leave_group reaping : " 3133 "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid); 3134 ibd_async_reap_group(state, mce, mgid, jstate); 3135 } 3136 } 3137 3138 /* 3139 * Find the broadcast address as defined by IPoIB; implicitly 3140 * determines the IBA scope, mtu, tclass etc of the link the 3141 * interface is going to be a member of. 3142 */ 3143 static ibt_status_t 3144 ibd_find_bgroup(ibd_state_t *state) 3145 { 3146 ibt_mcg_attr_t mcg_attr; 3147 uint_t numg; 3148 uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL, 3149 IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL, 3150 IB_MC_SCOPE_GLOBAL }; 3151 int i, mcgmtu; 3152 boolean_t found = B_FALSE; 3153 int ret; 3154 ibt_mcg_info_t mcg_info; 3155 3156 state->id_bgroup_created = B_FALSE; 3157 3158 query_bcast_grp: 3159 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3160 mcg_attr.mc_pkey = state->id_pkey; 3161 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3162 state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK; 3163 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3164 3165 for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) { 3166 state->id_scope = mcg_attr.mc_scope = scopes[i]; 3167 3168 /* 3169 * Look for the IPoIB broadcast group. 
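 *
 * As a worked example (assuming IB_MCGID_IPV4_PREFIX supplies the
 * ff1x:401b signature and IB_MGID_IPV4_LOWGRP_MASK the low 32 bits
 * of ones): with the default full-membership P_Key 0xFFFF and
 * subnet-local scope (0x2), the gid_prefix/gid_guid built below
 * correspond to the well-known IPoIB IPv4 broadcast GID
 * ff12:401b:ffff:0000:0000:0000:ffff:ffff.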
3170 */ 3171 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3172 state->id_mgid.gid_prefix = 3173 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3174 ((uint64_t)state->id_scope << 48) | 3175 ((uint32_t)(state->id_pkey << 16))); 3176 mcg_attr.mc_mgid = state->id_mgid; 3177 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3178 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1, 3179 &state->id_mcinfo, &numg) == IBT_SUCCESS) { 3180 found = B_TRUE; 3181 break; 3182 } 3183 } 3184 3185 if (!found) { 3186 if (ibd_create_broadcast_group) { 3187 /* 3188 * If we created the broadcast group, but failed to 3189 * find it, we can't do anything except leave the 3190 * one we created and return failure. 3191 */ 3192 if (state->id_bgroup_created) { 3193 ibd_print_warn(state, "IPoIB broadcast group " 3194 "absent. Unable to query after create."); 3195 goto find_bgroup_fail; 3196 } 3197 3198 /* 3199 * Create the ipoib broadcast group if it didn't exist 3200 */ 3201 bzero(&mcg_attr, sizeof (ibt_mcg_attr_t)); 3202 mcg_attr.mc_qkey = IBD_DEFAULT_QKEY; 3203 mcg_attr.mc_join_state = IB_MC_JSTATE_FULL; 3204 mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL; 3205 mcg_attr.mc_pkey = state->id_pkey; 3206 mcg_attr.mc_flow = 0; 3207 mcg_attr.mc_sl = 0; 3208 mcg_attr.mc_tclass = 0; 3209 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3210 state->id_mgid.gid_prefix = 3211 (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) | 3212 ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) | 3213 ((uint32_t)(state->id_pkey << 16))); 3214 mcg_attr.mc_mgid = state->id_mgid; 3215 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid)) 3216 3217 if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr, 3218 &mcg_info, NULL, NULL)) != IBT_SUCCESS) { 3219 ibd_print_warn(state, "IPoIB broadcast group " 3220 "absent, create failed: ret = %d\n", ret); 3221 state->id_bgroup_created = B_FALSE; 3222 return (IBT_FAILURE); 3223 } 3224 state->id_bgroup_created = B_TRUE; 3225 goto query_bcast_grp; 3226 } else { 3227 ibd_print_warn(state, "IPoIB broadcast group absent"); 3228 return (IBT_FAILURE); 3229 } 3230 } 3231 3232 /* 3233 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu. 
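 *
 * mc_mtu is the IBA MTU code and is decoded below as (128 << code);
 * for example, a code of 4 yields 128 << 4 = 2048 bytes, and a code
 * of 5 yields 4096 bytes.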
3234 */ 3235 mcgmtu = (128 << state->id_mcinfo->mc_mtu); 3236 if (state->id_mtu < mcgmtu) { 3237 ibd_print_warn(state, "IPoIB broadcast group MTU %d " 3238 "greater than port's maximum MTU %d", mcgmtu, 3239 state->id_mtu); 3240 ibt_free_mcg_info(state->id_mcinfo, 1); 3241 goto find_bgroup_fail; 3242 } 3243 state->id_mtu = mcgmtu; 3244 3245 return (IBT_SUCCESS); 3246 3247 find_bgroup_fail: 3248 if (state->id_bgroup_created) { 3249 (void) ibt_leave_mcg(state->id_sgid, 3250 mcg_info.mc_adds_vect.av_dgid, state->id_sgid, 3251 IB_MC_JSTATE_FULL); 3252 } 3253 3254 return (IBT_FAILURE); 3255 } 3256 3257 static int 3258 ibd_alloc_tx_copybufs(ibd_state_t *state) 3259 { 3260 ibt_mr_attr_t mem_attr; 3261 3262 /* 3263 * Allocate one big chunk for all regular tx copy bufs 3264 */ 3265 state->id_tx_buf_sz = state->id_mtu; 3266 if (state->id_lso_policy && state->id_lso_capable && 3267 (IBD_TX_BUF_SZ > state->id_mtu)) { 3268 state->id_tx_buf_sz = IBD_TX_BUF_SZ; 3269 } 3270 3271 state->id_tx_bufs = kmem_zalloc(state->id_num_swqe * 3272 state->id_tx_buf_sz, KM_SLEEP); 3273 3274 state->id_tx_wqes = kmem_zalloc(state->id_num_swqe * 3275 sizeof (ibd_swqe_t), KM_SLEEP); 3276 3277 /* 3278 * Do one memory registration on the entire txbuf area 3279 */ 3280 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs; 3281 mem_attr.mr_len = state->id_num_swqe * state->id_tx_buf_sz; 3282 mem_attr.mr_as = NULL; 3283 mem_attr.mr_flags = IBT_MR_SLEEP; 3284 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3285 &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) { 3286 DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed"); 3287 kmem_free(state->id_tx_wqes, 3288 state->id_num_swqe * sizeof (ibd_swqe_t)); 3289 kmem_free(state->id_tx_bufs, 3290 state->id_num_swqe * state->id_tx_buf_sz); 3291 state->id_tx_bufs = NULL; 3292 return (DDI_FAILURE); 3293 } 3294 3295 return (DDI_SUCCESS); 3296 } 3297 3298 static int 3299 ibd_alloc_tx_lsobufs(ibd_state_t *state) 3300 { 3301 ibt_mr_attr_t mem_attr; 3302 ibd_lsobuf_t *buflist; 3303 ibd_lsobuf_t *lbufp; 3304 ibd_lsobuf_t *tail; 3305 ibd_lsobkt_t *bktp; 3306 uint8_t *membase; 3307 uint8_t *memp; 3308 uint_t memsz; 3309 int i; 3310 3311 /* 3312 * Allocate the lso bucket 3313 */ 3314 bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP); 3315 3316 /* 3317 * Allocate the entire lso memory and register it 3318 */ 3319 memsz = IBD_NUM_LSO_BUFS * IBD_LSO_BUFSZ; 3320 membase = kmem_zalloc(memsz, KM_SLEEP); 3321 3322 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase; 3323 mem_attr.mr_len = memsz; 3324 mem_attr.mr_as = NULL; 3325 mem_attr.mr_flags = IBT_MR_SLEEP; 3326 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, 3327 &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) { 3328 DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed"); 3329 kmem_free(membase, memsz); 3330 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3331 return (DDI_FAILURE); 3332 } 3333 3334 mutex_enter(&state->id_lso_lock); 3335 3336 /* 3337 * Now allocate the buflist. Note that the elements in the buflist and 3338 * the buffers in the lso memory have a permanent 1-1 relation, so we 3339 * can always derive the address of a buflist entry from the address of 3340 * an lso buffer. 
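 *
 * That is, given an lso buffer address va, its buflist entry can be
 * recovered with the arithmetic the release path uses:
 *
 *     ndx   = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ;
 *     lbufp = bktp->bkt_bufl + ndx;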
3341 */ 3342 buflist = kmem_zalloc(IBD_NUM_LSO_BUFS * sizeof (ibd_lsobuf_t), 3343 KM_SLEEP); 3344 3345 /* 3346 * Set up the lso buf chain 3347 */ 3348 memp = membase; 3349 lbufp = buflist; 3350 for (i = 0; i < IBD_NUM_LSO_BUFS; i++) { 3351 lbufp->lb_isfree = 1; 3352 lbufp->lb_buf = memp; 3353 lbufp->lb_next = lbufp + 1; 3354 3355 tail = lbufp; 3356 3357 memp += IBD_LSO_BUFSZ; 3358 lbufp++; 3359 } 3360 tail->lb_next = NULL; 3361 3362 /* 3363 * Set up the LSO buffer information in ibd state 3364 */ 3365 bktp->bkt_bufl = buflist; 3366 bktp->bkt_free_head = buflist; 3367 bktp->bkt_mem = membase; 3368 bktp->bkt_nelem = IBD_NUM_LSO_BUFS; 3369 bktp->bkt_nfree = bktp->bkt_nelem; 3370 3371 state->id_lso = bktp; 3372 mutex_exit(&state->id_lso_lock); 3373 3374 return (DDI_SUCCESS); 3375 } 3376 3377 /* 3378 * Statically allocate Tx buffer list(s). 3379 */ 3380 static int 3381 ibd_init_txlist(ibd_state_t *state) 3382 { 3383 ibd_swqe_t *swqe; 3384 ibt_lkey_t lkey; 3385 int i; 3386 uint_t len; 3387 uint8_t *bufaddr; 3388 3389 if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS) 3390 return (DDI_FAILURE); 3391 3392 if (state->id_lso_policy && state->id_lso_capable) { 3393 if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS) 3394 state->id_lso_policy = B_FALSE; 3395 } 3396 3397 /* 3398 * Allocate and setup the swqe list 3399 */ 3400 lkey = state->id_tx_mr_desc.md_lkey; 3401 bufaddr = state->id_tx_bufs; 3402 len = state->id_tx_buf_sz; 3403 swqe = state->id_tx_wqes; 3404 mutex_enter(&state->id_tx_list.dl_mutex); 3405 for (i = 0; i < state->id_num_swqe; i++, swqe++, bufaddr += len) { 3406 swqe->swqe_type = IBD_WQE_SEND; 3407 swqe->swqe_next = NULL; 3408 swqe->swqe_im_mblk = NULL; 3409 3410 swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t) 3411 bufaddr; 3412 swqe->swqe_copybuf.ic_sgl.ds_key = lkey; 3413 swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */ 3414 3415 swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe; 3416 swqe->w_swr.wr_flags = IBT_WR_NO_FLAGS; 3417 swqe->w_swr.wr_trans = IBT_UD_SRV; 3418 3419 /* These are set in send */ 3420 swqe->w_swr.wr_nds = 0; 3421 swqe->w_swr.wr_sgl = NULL; 3422 swqe->w_swr.wr_opcode = IBT_WRC_SEND; 3423 3424 /* add to list */ 3425 state->id_tx_list.dl_cnt++; 3426 swqe->swqe_next = state->id_tx_list.dl_head; 3427 state->id_tx_list.dl_head = SWQE_TO_WQE(swqe); 3428 } 3429 mutex_exit(&state->id_tx_list.dl_mutex); 3430 3431 return (DDI_SUCCESS); 3432 } 3433 3434 static int 3435 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p, 3436 uint32_t *nds_p) 3437 { 3438 ibd_lsobkt_t *bktp; 3439 ibd_lsobuf_t *lbufp; 3440 ibd_lsobuf_t *nextp; 3441 ibt_lkey_t lso_lkey; 3442 uint_t frag_sz; 3443 uint_t num_needed; 3444 int i; 3445 3446 ASSERT(sgl_p != NULL); 3447 ASSERT(nds_p != NULL); 3448 ASSERT(req_sz != 0); 3449 3450 /* 3451 * Determine how many bufs we'd need for the size requested 3452 */ 3453 num_needed = req_sz / IBD_LSO_BUFSZ; 3454 if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0) 3455 num_needed++; 3456 3457 mutex_enter(&state->id_lso_lock); 3458 3459 /* 3460 * If we don't have enough lso bufs, return failure 3461 */ 3462 ASSERT(state->id_lso != NULL); 3463 bktp = state->id_lso; 3464 if (bktp->bkt_nfree < num_needed) { 3465 mutex_exit(&state->id_lso_lock); 3466 return (-1); 3467 } 3468 3469 /* 3470 * Pick the first 'num_needed' bufs from the free list 3471 */ 3472 lso_lkey = bktp->bkt_mr_desc.md_lkey; 3473 lbufp = bktp->bkt_free_head; 3474 for (i = 0; i < num_needed; i++) { 3475 ASSERT(lbufp->lb_isfree != 0); 3476 ASSERT(lbufp->lb_buf != NULL); 3477 3478 
nextp = lbufp->lb_next; 3479 3480 sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf; 3481 sgl_p[i].ds_key = lso_lkey; 3482 sgl_p[i].ds_len = IBD_LSO_BUFSZ; 3483 3484 lbufp->lb_isfree = 0; 3485 lbufp->lb_next = NULL; 3486 3487 lbufp = nextp; 3488 } 3489 bktp->bkt_free_head = lbufp; 3490 3491 /* 3492 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need 3493 * to adjust the last sgl entry's length. Since we know we need atleast 3494 * one, the i-1 use below is ok. 3495 */ 3496 if (frag_sz) { 3497 sgl_p[i-1].ds_len = frag_sz; 3498 } 3499 3500 /* 3501 * Update nfree count and return 3502 */ 3503 bktp->bkt_nfree -= num_needed; 3504 3505 mutex_exit(&state->id_lso_lock); 3506 3507 *nds_p = num_needed; 3508 3509 return (0); 3510 } 3511 3512 static void 3513 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds) 3514 { 3515 ibd_lsobkt_t *bktp; 3516 ibd_lsobuf_t *lbufp; 3517 uint8_t *lso_mem_end; 3518 uint_t ndx; 3519 int i; 3520 3521 mutex_enter(&state->id_lso_lock); 3522 3523 bktp = state->id_lso; 3524 ASSERT(bktp != NULL); 3525 3526 lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ; 3527 for (i = 0; i < nds; i++) { 3528 uint8_t *va; 3529 3530 va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va; 3531 ASSERT(va >= bktp->bkt_mem && va < lso_mem_end); 3532 3533 /* 3534 * Figure out the buflist element this sgl buffer corresponds 3535 * to and put it back at the head 3536 */ 3537 ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ; 3538 lbufp = bktp->bkt_bufl + ndx; 3539 3540 ASSERT(lbufp->lb_isfree == 0); 3541 ASSERT(lbufp->lb_buf == va); 3542 3543 lbufp->lb_isfree = 1; 3544 lbufp->lb_next = bktp->bkt_free_head; 3545 bktp->bkt_free_head = lbufp; 3546 } 3547 bktp->bkt_nfree += nds; 3548 3549 mutex_exit(&state->id_lso_lock); 3550 } 3551 3552 static void 3553 ibd_free_tx_copybufs(ibd_state_t *state) 3554 { 3555 /* 3556 * Unregister txbuf mr 3557 */ 3558 if (ibt_deregister_mr(state->id_hca_hdl, 3559 state->id_tx_mr_hdl) != IBT_SUCCESS) { 3560 DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed"); 3561 } 3562 state->id_tx_mr_hdl = NULL; 3563 3564 /* 3565 * Free txbuf memory 3566 */ 3567 kmem_free(state->id_tx_wqes, state->id_num_swqe * sizeof (ibd_swqe_t)); 3568 kmem_free(state->id_tx_bufs, state->id_num_swqe * state->id_tx_buf_sz); 3569 state->id_tx_wqes = NULL; 3570 state->id_tx_bufs = NULL; 3571 } 3572 3573 static void 3574 ibd_free_tx_lsobufs(ibd_state_t *state) 3575 { 3576 ibd_lsobkt_t *bktp; 3577 3578 mutex_enter(&state->id_lso_lock); 3579 3580 if ((bktp = state->id_lso) == NULL) { 3581 mutex_exit(&state->id_lso_lock); 3582 return; 3583 } 3584 3585 /* 3586 * First, free the buflist 3587 */ 3588 ASSERT(bktp->bkt_bufl != NULL); 3589 kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t)); 3590 3591 /* 3592 * Unregister the LSO memory and free it 3593 */ 3594 ASSERT(bktp->bkt_mr_hdl != NULL); 3595 if (ibt_deregister_mr(state->id_hca_hdl, 3596 bktp->bkt_mr_hdl) != IBT_SUCCESS) { 3597 DPRINT(10, 3598 "ibd_free_lsobufs: ibt_deregister_mr failed"); 3599 } 3600 ASSERT(bktp->bkt_mem); 3601 kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ); 3602 3603 /* 3604 * Finally free the bucket 3605 */ 3606 kmem_free(bktp, sizeof (ibd_lsobkt_t)); 3607 state->id_lso = NULL; 3608 3609 mutex_exit(&state->id_lso_lock); 3610 } 3611 3612 /* 3613 * Free the statically allocated Tx buffer list. 
3614 */ 3615 static void 3616 ibd_fini_txlist(ibd_state_t *state) 3617 { 3618 ibd_swqe_t *node; 3619 3620 /* 3621 * Free the allocated swqes 3622 */ 3623 mutex_enter(&state->id_tx_list.dl_mutex); 3624 while (state->id_tx_list.dl_head != NULL) { 3625 node = WQE_TO_SWQE(state->id_tx_list.dl_head); 3626 state->id_tx_list.dl_head = node->swqe_next; 3627 ASSERT(state->id_tx_list.dl_cnt > 0); 3628 state->id_tx_list.dl_cnt--; 3629 } 3630 ASSERT(state->id_tx_list.dl_cnt == 0); 3631 mutex_exit(&state->id_tx_list.dl_mutex); 3632 mutex_enter(&state->id_tx_rel_list.dl_mutex); 3633 while (state->id_tx_rel_list.dl_head != NULL) { 3634 node = WQE_TO_SWQE(state->id_tx_rel_list.dl_head); 3635 state->id_tx_rel_list.dl_head = node->swqe_next; 3636 ASSERT(state->id_tx_rel_list.dl_cnt > 0); 3637 state->id_tx_rel_list.dl_cnt--; 3638 } 3639 ASSERT(state->id_tx_rel_list.dl_cnt == 0); 3640 mutex_exit(&state->id_tx_rel_list.dl_mutex); 3641 3642 ibd_free_tx_lsobufs(state); 3643 ibd_free_tx_copybufs(state); 3644 } 3645 3646 static void 3647 ibd_post_recv_task(ibd_rwqe_t *rwqe, ibd_rwqe_t *tail) 3648 { 3649 uint_t i; 3650 uint_t num_posted; 3651 ibt_status_t ibt_status; 3652 ibt_recv_wr_t wrs[IBD_RX_POST_CNT]; 3653 ibd_state_t *state = rwqe->w_state; 3654 3655 mutex_enter(&state->id_rx_post_lock); 3656 if (state->id_rx_post_busy) { 3657 tail->rwqe_next = state->id_rx_post_head; 3658 state->id_rx_post_head = RWQE_TO_WQE(rwqe); 3659 mutex_exit(&state->id_rx_post_lock); 3660 return; 3661 } 3662 state->id_rx_post_busy = 1; 3663 mutex_exit(&state->id_rx_post_lock); 3664 3665 loop: 3666 /* Post the IBD_RX_POST_CNT receive work requests starting at rwqe. */ 3667 for (i = 0; i < IBD_RX_POST_CNT; i++) { 3668 wrs[i] = rwqe->w_rwr; 3669 rwqe = WQE_TO_RWQE(rwqe->rwqe_next); 3670 } 3671 3672 /* 3673 * If posting fails for some reason, we'll never receive 3674 * completion intimation, so we'll need to clean up. But 3675 * we need to make sure we don't clean up nodes whose 3676 * wrs have been successfully posted. We assume that the 3677 * hca driver returns on the first failure to post and 3678 * therefore the first 'num_posted' entries don't need 3679 * cleanup here. 3680 */ 3681 atomic_add_32(&state->id_rx_list.dl_cnt, IBD_RX_POST_CNT); 3682 3683 num_posted = 0; 3684 ibt_status = ibt_post_recv(state->id_chnl_hdl, 3685 wrs, IBD_RX_POST_CNT, &num_posted); 3686 if (ibt_status != IBT_SUCCESS) { 3687 ibd_print_warn(state, "ibd_post_recv: FATAL: " 3688 "posting multiple wrs failed: " 3689 "requested=%d, done=%d, ret=%d", 3690 IBD_RX_POST_CNT, num_posted, ibt_status); 3691 atomic_add_32(&state->id_rx_list.dl_cnt, 3692 -(IBD_RX_POST_CNT - num_posted)); 3693 /* This cannot happen! */ 3694 } 3695 if (rwqe != NULL) /* more rwqes on our list? */ 3696 goto loop; 3697 3698 /* check if we have a new list */ 3699 mutex_enter(&state->id_rx_post_lock); 3700 if ((rwqe = WQE_TO_RWQE(state->id_rx_post_head)) != NULL) { 3701 state->id_rx_post_head = NULL; 3702 mutex_exit(&state->id_rx_post_lock); 3703 goto loop; 3704 } 3705 state->id_rx_post_busy = 0; 3706 mutex_exit(&state->id_rx_post_lock); 3707 } 3708 3709 /* macro explained below */ 3710 #define RX_QUEUE_HASH(rwqe) \ 3711 (((uintptr_t)(rwqe) >> 8) & (state->id_rx_nqueues - 1)) 3712 3713 /* 3714 * Add an rwqe to one of the Rx lists. If the list is large enough 3715 * (exactly IBD_RX_POST_CNT), post the list to the hardware. 3716 * 3717 * Note: one of 2^N lists is chosen via a hash. This is done 3718 * because using one list is contentious.
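 * With IBD_LOG_RX_POST at its default of 3 there are 8 such queues,
 * so RX_QUEUE_HASH() above effectively uses bits 8..10 of the rwqe
 * pointer to pick one.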
If the first list is busy 3719 * (mutex_tryenter fails), use a second list (just call mutex_enter). 3720 * 3721 * The number 8 in RX_QUEUE_HASH is a random choice that provides 3722 * even distribution of mapping rwqes to the 2^N queues. 3723 */ 3724 static void 3725 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe) 3726 { 3727 ibd_rx_queue_t *rxp; 3728 ibd_rwqe_t *tail; 3729 3730 rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe); 3731 3732 if (!mutex_tryenter(&rxp->rx_post_lock)) { 3733 /* Failed. Try a different queue ("ptr + 16" ensures that). */ 3734 rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe + 16); 3735 mutex_enter(&rxp->rx_post_lock); 3736 } 3737 rwqe->rwqe_next = rxp->rx_head; 3738 if (rxp->rx_cnt == 0) 3739 rxp->rx_tail = RWQE_TO_WQE(rwqe); 3740 if (++rxp->rx_cnt == IBD_RX_POST_CNT) { 3741 rxp->rx_head = NULL; 3742 tail = WQE_TO_RWQE(rxp->rx_tail); 3743 rxp->rx_cnt = 0; 3744 } else { 3745 rxp->rx_head = RWQE_TO_WQE(rwqe); 3746 rwqe = NULL; 3747 } 3748 rxp->rx_stat++; 3749 mutex_exit(&rxp->rx_post_lock); 3750 if (rwqe) { 3751 ibd_post_recv_task(rwqe, tail); 3752 } 3753 } 3754 3755 static int 3756 ibd_alloc_rx_copybufs(ibd_state_t *state) 3757 { 3758 ibt_mr_attr_t mem_attr; 3759 int i; 3760 3761 /* 3762 * Allocate one big chunk for all regular rx copy bufs 3763 */ 3764 state->id_rx_buf_sz = state->id_mtu + IPOIB_GRH_SIZE; 3765 3766 state->id_rx_bufs = kmem_zalloc(state->id_num_rwqe * 3767 state->id_rx_buf_sz, KM_SLEEP); 3768 3769 state->id_rx_wqes = kmem_zalloc(state->id_num_rwqe * 3770 sizeof (ibd_rwqe_t), KM_SLEEP); 3771 3772 state->id_rx_nqueues = 1 << IBD_LOG_RX_POST; 3773 state->id_rx_queues = kmem_zalloc(state->id_rx_nqueues * 3774 sizeof (ibd_rx_queue_t), KM_SLEEP); 3775 for (i = 0; i < state->id_rx_nqueues; i++) { 3776 ibd_rx_queue_t *rxp = state->id_rx_queues + i; 3777 mutex_init(&rxp->rx_post_lock, NULL, MUTEX_DRIVER, NULL); 3778 } 3779 3780 /* 3781 * Do one memory registration on the entire rxbuf area 3782 */ 3783 mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_rx_bufs; 3784 mem_attr.mr_len = state->id_num_rwqe * state->id_rx_buf_sz; 3785 mem_attr.mr_as = NULL; 3786 mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE; 3787 if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr, 3788 &state->id_rx_mr_hdl, &state->id_rx_mr_desc) != IBT_SUCCESS) { 3789 DPRINT(10, "ibd_alloc_rx_copybufs: ibt_register_mr failed"); 3790 kmem_free(state->id_rx_wqes, 3791 state->id_num_rwqe * sizeof (ibd_rwqe_t)); 3792 kmem_free(state->id_rx_bufs, 3793 state->id_num_rwqe * state->id_rx_buf_sz); 3794 state->id_rx_bufs = NULL; 3795 state->id_rx_wqes = NULL; 3796 return (DDI_FAILURE); 3797 } 3798 3799 return (DDI_SUCCESS); 3800 } 3801 3802 /* 3803 * Allocate the statically allocated Rx buffer list. 
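 * Each rwqe is given an MTU + GRH sized slice of the one big copybuf
 * area, wrapped in a desballoc'd mblk, and is immediately handed to
 * ibd_post_recv() so the receive queue starts out fully populated.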
3804 */ 3805 static int 3806 ibd_init_rxlist(ibd_state_t *state) 3807 { 3808 ibd_rwqe_t *rwqe; 3809 ibt_lkey_t lkey; 3810 int i; 3811 uint_t len; 3812 uint8_t *bufaddr; 3813 3814 if (ibd_alloc_rx_copybufs(state) != DDI_SUCCESS) 3815 return (DDI_FAILURE); 3816 3817 /* 3818 * Allocate and setup the rwqe list 3819 */ 3820 lkey = state->id_rx_mr_desc.md_lkey; 3821 rwqe = state->id_rx_wqes; 3822 bufaddr = state->id_rx_bufs; 3823 len = state->id_rx_buf_sz; 3824 for (i = 0; i < state->id_num_rwqe; i++, rwqe++, bufaddr += len) { 3825 rwqe->rwqe_type = IBD_WQE_RECV; 3826 rwqe->w_state = state; 3827 rwqe->w_freeing_wqe = B_FALSE; 3828 rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb; 3829 rwqe->w_freemsg_cb.free_arg = (char *)rwqe; 3830 3831 rwqe->rwqe_copybuf.ic_bufaddr = bufaddr; 3832 3833 if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0, 3834 &rwqe->w_freemsg_cb)) == NULL) { 3835 DPRINT(10, "ibd_init_rxlist : failed in desballoc()"); 3836 rwqe->rwqe_copybuf.ic_bufaddr = NULL; 3837 ibd_fini_rxlist(state); 3838 return (DDI_FAILURE); 3839 } 3840 3841 rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey; 3842 rwqe->rwqe_copybuf.ic_sgl.ds_va = 3843 (ib_vaddr_t)(uintptr_t)bufaddr; 3844 rwqe->rwqe_copybuf.ic_sgl.ds_len = len; 3845 rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe; 3846 rwqe->w_rwr.wr_nds = 1; 3847 rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl; 3848 3849 ibd_post_recv(state, rwqe); 3850 } 3851 3852 return (DDI_SUCCESS); 3853 } 3854 3855 static void 3856 ibd_free_rx_copybufs(ibd_state_t *state) 3857 { 3858 int i; 3859 3860 /* 3861 * Unregister rxbuf mr 3862 */ 3863 if (ibt_deregister_mr(state->id_hca_hdl, 3864 state->id_rx_mr_hdl) != IBT_SUCCESS) { 3865 DPRINT(10, "ibd_free_rx_copybufs: ibt_deregister_mr failed"); 3866 } 3867 state->id_rx_mr_hdl = NULL; 3868 3869 /* 3870 * Free rxbuf memory 3871 */ 3872 for (i = 0; i < state->id_rx_nqueues; i++) { 3873 ibd_rx_queue_t *rxp = state->id_rx_queues + i; 3874 mutex_destroy(&rxp->rx_post_lock); 3875 } 3876 kmem_free(state->id_rx_queues, state->id_rx_nqueues * 3877 sizeof (ibd_rx_queue_t)); 3878 kmem_free(state->id_rx_wqes, state->id_num_rwqe * sizeof (ibd_rwqe_t)); 3879 kmem_free(state->id_rx_bufs, state->id_num_rwqe * state->id_rx_buf_sz); 3880 state->id_rx_queues = NULL; 3881 state->id_rx_wqes = NULL; 3882 state->id_rx_bufs = NULL; 3883 } 3884 3885 /* 3886 * Free the statically allocated Rx buffer list. 3887 * 3888 */ 3889 static void 3890 ibd_fini_rxlist(ibd_state_t *state) 3891 { 3892 ibd_rwqe_t *rwqe; 3893 int i; 3894 3895 mutex_enter(&state->id_rx_list.dl_mutex); 3896 rwqe = state->id_rx_wqes; 3897 for (i = 0; i < state->id_num_rwqe; i++, rwqe++) { 3898 if (rwqe->rwqe_im_mblk != NULL) { 3899 rwqe->w_freeing_wqe = B_TRUE; 3900 freemsg(rwqe->rwqe_im_mblk); 3901 } 3902 } 3903 mutex_exit(&state->id_rx_list.dl_mutex); 3904 3905 ibd_free_rx_copybufs(state); 3906 } 3907 3908 /* 3909 * Free an allocated recv wqe. 3910 */ 3911 /* ARGSUSED */ 3912 static void 3913 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe) 3914 { 3915 /* 3916 * desballoc() failed (no memory). 3917 * 3918 * This rwqe is placed on a free list so that it 3919 * can be reinstated when memory is available. 3920 * 3921 * NOTE: no code currently exists to reinstate 3922 * these "lost" rwqes. 
3923 */ 3924 mutex_enter(&state->id_rx_free_list.dl_mutex); 3925 state->id_rx_free_list.dl_cnt++; 3926 rwqe->rwqe_next = state->id_rx_free_list.dl_head; 3927 state->id_rx_free_list.dl_head = RWQE_TO_WQE(rwqe); 3928 mutex_exit(&state->id_rx_free_list.dl_mutex); 3929 } 3930 3931 /* 3932 * IBA Rx completion queue handler. Guaranteed to be single 3933 * threaded and nonreentrant for this CQ. 3934 */ 3935 /* ARGSUSED */ 3936 static void 3937 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 3938 { 3939 ibd_state_t *state = (ibd_state_t *)arg; 3940 3941 atomic_add_64(&state->id_num_intrs, 1); 3942 3943 if (ibd_rx_softintr == 1) { 3944 mutex_enter(&state->id_rcq_poll_lock); 3945 if (state->id_rcq_poll_busy & IBD_CQ_POLLING) { 3946 state->id_rcq_poll_busy |= IBD_REDO_CQ_POLLING; 3947 mutex_exit(&state->id_rcq_poll_lock); 3948 return; 3949 } else { 3950 mutex_exit(&state->id_rcq_poll_lock); 3951 ddi_trigger_softintr(state->id_rx); 3952 } 3953 } else 3954 (void) ibd_intr((caddr_t)state); 3955 } 3956 3957 /* 3958 * CQ handler for Tx completions, when the Tx CQ is in 3959 * interrupt driven mode. 3960 */ 3961 /* ARGSUSED */ 3962 static void 3963 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg) 3964 { 3965 ibd_state_t *state = (ibd_state_t *)arg; 3966 3967 atomic_add_64(&state->id_num_intrs, 1); 3968 3969 if (ibd_tx_softintr == 1) { 3970 mutex_enter(&state->id_scq_poll_lock); 3971 if (state->id_scq_poll_busy & IBD_CQ_POLLING) { 3972 state->id_scq_poll_busy |= IBD_REDO_CQ_POLLING; 3973 mutex_exit(&state->id_scq_poll_lock); 3974 return; 3975 } else { 3976 mutex_exit(&state->id_scq_poll_lock); 3977 ddi_trigger_softintr(state->id_tx); 3978 } 3979 } else 3980 (void) ibd_tx_recycle((caddr_t)state); 3981 } 3982 3983 /* 3984 * Multicast group create/delete trap handler. These will be delivered 3985 * on a kernel thread (handling can thus block) and can be invoked 3986 * concurrently. The handler can be invoked anytime after it is 3987 * registered and before ibt_detach(). 3988 */ 3989 /* ARGSUSED */ 3990 static void 3991 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code, 3992 ibt_subnet_event_t *event) 3993 { 3994 ibd_state_t *state = (ibd_state_t *)arg; 3995 ibd_req_t *req; 3996 3997 /* 3998 * The trap handler will get invoked once for every event for 3999 * every port. The input "gid" is the GID0 of the port the 4000 * trap came in on; we just need to act on traps that came 4001 * to our port, meaning the port on which the ipoib interface 4002 * resides. Since ipoib uses GID0 of the port, we just match 4003 * the gids to check whether we need to handle the trap. 4004 */ 4005 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) 4006 if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0) 4007 return; 4008 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) 4009 4010 DPRINT(10, "ibd_notices_handler : %d\n", code); 4011 4012 switch (code) { 4013 case IBT_SM_EVENT_UNAVAILABLE: 4014 /* 4015 * If we are in promiscuous mode or have 4016 * sendnonmembers, we need to print a warning 4017 * message right now. Else, just store the 4018 * information, print when we enter promiscuous 4019 * mode or attempt nonmember send. We might 4020 * also want to stop caching sendnonmember. 
4021 */ 4022 ibd_print_warn(state, "IBA multicast support " 4023 "degraded due to unavailability of multicast " 4024 "traps"); 4025 break; 4026 case IBT_SM_EVENT_AVAILABLE: 4027 /* 4028 * If we printed a warning message above or 4029 * while trying to nonmember send or get into 4030 * promiscuous mode, print an okay message. 4031 */ 4032 ibd_print_warn(state, "IBA multicast support " 4033 "restored due to availability of multicast " 4034 "traps"); 4035 break; 4036 case IBT_SM_EVENT_MCG_CREATED: 4037 case IBT_SM_EVENT_MCG_DELETED: 4038 /* 4039 * Common processing of creation/deletion traps. 4040 * First check if the instance is being 4041 * [de]initialized; back off then, without doing 4042 * anything more, since we are not sure if the 4043 * async thread is around, or whether we might 4044 * be racing with the detach code in ibd_m_stop() 4045 * that scans the mcg list. 4046 */ 4047 if (!ibd_async_safe(state)) 4048 return; 4049 4050 req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP); 4051 req->rq_gid = event->sm_notice_gid; 4052 req->rq_ptr = (void *)code; 4053 ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP); 4054 break; 4055 } 4056 } 4057 4058 static void 4059 ibd_async_trap(ibd_state_t *state, ibd_req_t *req) 4060 { 4061 ib_gid_t mgid = req->rq_gid; 4062 ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr; 4063 4064 DPRINT(10, "ibd_async_trap : %d\n", code); 4065 4066 /* 4067 * Atomically search the nonmember and sendonlymember lists and 4068 * delete. 4069 */ 4070 ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON); 4071 4072 if (state->id_prom_op == IBD_OP_COMPLETED) { 4073 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 4074 4075 /* 4076 * If in promiscuous mode, try to join/attach to the new 4077 * mcg. Given the unreliable out-of-order mode of trap 4078 * delivery, we can never be sure whether it is a problem 4079 * if the join fails. Thus, we warn the admin of a failure 4080 * if this was a creation trap. Note that the trap might 4081 * actually be reporting a long past event, and the mcg 4082 * might already have been deleted, thus we might be warning 4083 * in vain. 4084 */ 4085 if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == 4086 NULL) && (code == IBT_SM_EVENT_MCG_CREATED)) 4087 ibd_print_warn(state, "IBA promiscuous mode missed " 4088 "new multicast gid %016llx:%016llx", 4089 (u_longlong_t)mgid.gid_prefix, 4090 (u_longlong_t)mgid.gid_guid); 4091 } 4092 4093 /* 4094 * Free the request slot allocated by the subnet event thread. 4095 */ 4096 ibd_async_done(state); 4097 } 4098 4099 /* 4100 * GLDv3 entry point to get capabilities. 4101 */ 4102 static boolean_t 4103 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data) 4104 { 4105 ibd_state_t *state = arg; 4106 4107 switch (cap) { 4108 case MAC_CAPAB_HCKSUM: { 4109 uint32_t *txflags = cap_data; 4110 4111 /* 4112 * We either do full checksum or not do it at all 4113 */ 4114 if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) 4115 *txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4; 4116 else 4117 return (B_FALSE); 4118 break; 4119 } 4120 4121 case MAC_CAPAB_LSO: { 4122 mac_capab_lso_t *cap_lso = cap_data; 4123 4124 /* 4125 * In addition to the capability and policy, since LSO 4126 * relies on hw checksum, we'll not enable LSO if we 4127 * don't have hw checksum. Of course, if the HCA doesn't 4128 * provide the reserved lkey capability, enabling LSO will 4129 * actually affect performance adversely, so we'll disable 4130 * LSO even for that case. 
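 * In short, the checks below only advertise LSO to GLDv3 when (a) it
 * is enabled by policy and supported by the HCA, (b) full hardware
 * checksum is available, and (c) the HCA has the reserved-lkey
 * capability.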
4131 */ 4132 if (!state->id_lso_policy || !state->id_lso_capable) 4133 return (B_FALSE); 4134 4135 if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0) 4136 return (B_FALSE); 4137 4138 if (state->id_hca_res_lkey_capab == 0) { 4139 ibd_print_warn(state, "no reserved-lkey capability, " 4140 "disabling LSO"); 4141 return (B_FALSE); 4142 } 4143 4144 cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4; 4145 cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1; 4146 break; 4147 } 4148 4149 default: 4150 return (B_FALSE); 4151 } 4152 4153 return (B_TRUE); 4154 } 4155 4156 static int 4157 ibd_get_port_details(ibd_state_t *state) 4158 { 4159 ibt_hca_portinfo_t *port_infop; 4160 ibt_status_t ret; 4161 uint_t psize, port_infosz; 4162 4163 mutex_enter(&state->id_link_mutex); 4164 4165 /* 4166 * Query for port information 4167 */ 4168 ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port, 4169 &port_infop, &psize, &port_infosz); 4170 if ((ret != IBT_SUCCESS) || (psize != 1)) { 4171 mutex_exit(&state->id_link_mutex); 4172 DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() " 4173 "failed, ret=%d", ret); 4174 return (ENETDOWN); 4175 } 4176 4177 /* 4178 * If the link already went down by the time we get here, 4179 * give up 4180 */ 4181 if (port_infop->p_linkstate != IBT_PORT_ACTIVE) { 4182 mutex_exit(&state->id_link_mutex); 4183 ibt_free_portinfo(port_infop, port_infosz); 4184 DPRINT(10, "ibd_get_port_details: port is not active"); 4185 return (ENETDOWN); 4186 } 4187 4188 /* 4189 * If the link is active, verify the pkey 4190 */ 4191 if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port, 4192 state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) { 4193 mutex_exit(&state->id_link_mutex); 4194 ibt_free_portinfo(port_infop, port_infosz); 4195 DPRINT(10, "ibd_get_port_details: ibt_pkey2index " 4196 "failed, ret=%d", ret); 4197 return (ENONET); 4198 } 4199 4200 state->id_mtu = (128 << port_infop->p_mtu); 4201 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid)) 4202 state->id_sgid = *port_infop->p_sgid_tbl; 4203 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid)) 4204 state->id_link_state = LINK_STATE_UP; 4205 4206 mutex_exit(&state->id_link_mutex); 4207 ibt_free_portinfo(port_infop, port_infosz); 4208 4209 /* 4210 * Now that the port is active, record the port speed 4211 */ 4212 state->id_link_speed = ibd_get_portspeed(state); 4213 4214 return (0); 4215 } 4216 4217 static int 4218 ibd_alloc_cqs(ibd_state_t *state) 4219 { 4220 ibt_hca_attr_t hca_attrs; 4221 ibt_cq_attr_t cq_attr; 4222 ibt_status_t ret; 4223 uint32_t real_size; 4224 4225 ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs); 4226 ASSERT(ret == IBT_SUCCESS); 4227 4228 /* 4229 * Allocate Rx/combined CQ: 4230 * Theoretically, there is no point in having more than #rwqe 4231 * plus #swqe cqe's, except that the CQ will be signaled for 4232 * overflow when the last wqe completes, if none of the previous 4233 * cqe's have been polled. Thus, we allocate just a few less wqe's 4234 * to make sure such overflow does not occur. 4235 */ 4236 cq_attr.cq_sched = NULL; 4237 cq_attr.cq_flags = IBT_CQ_NO_FLAGS; 4238 4239 /* 4240 * Allocate Receive CQ. 
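 * The CQ is sized for one entry per rwqe (plus one), clamped to the
 * HCA's hca_max_cq_sz; if it is clamped, id_num_rwqe is scaled back
 * to match.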
4241 */ 4242 if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) { 4243 cq_attr.cq_size = state->id_num_rwqe + 1; 4244 } else { 4245 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 4246 state->id_num_rwqe = cq_attr.cq_size - 1; 4247 } 4248 4249 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 4250 &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) { 4251 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) " 4252 "failed, ret=%d\n", ret); 4253 return (DDI_FAILURE); 4254 } 4255 4256 if ((ret = ibt_modify_cq(state->id_rcq_hdl, 4257 ibd_rxcomp_count, ibd_rxcomp_usec, 0)) != IBT_SUCCESS) { 4258 DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt " 4259 "moderation failed, ret=%d\n", ret); 4260 } 4261 4262 /* make the #rx wc's the same as max rx chain size */ 4263 state->id_rxwcs_size = IBD_MAX_RX_MP_LEN; 4264 state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) * 4265 state->id_rxwcs_size, KM_SLEEP); 4266 4267 /* 4268 * Allocate Send CQ. 4269 */ 4270 if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) { 4271 cq_attr.cq_size = state->id_num_swqe + 1; 4272 } else { 4273 cq_attr.cq_size = hca_attrs.hca_max_cq_sz; 4274 state->id_num_swqe = cq_attr.cq_size - 1; 4275 } 4276 4277 if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr, 4278 &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) { 4279 DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) " 4280 "failed, ret=%d\n", ret); 4281 kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) * 4282 state->id_rxwcs_size); 4283 (void) ibt_free_cq(state->id_rcq_hdl); 4284 return (DDI_FAILURE); 4285 } 4286 if ((ret = ibt_modify_cq(state->id_scq_hdl, 4287 ibd_txcomp_count, ibd_txcomp_usec, 0)) != IBT_SUCCESS) { 4288 DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt " 4289 "moderation failed, ret=%d\n", ret); 4290 } 4291 4292 state->id_txwcs_size = IBD_TX_POLL_THRESH; 4293 state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) * 4294 state->id_txwcs_size, KM_SLEEP); 4295 4296 /* 4297 * Print message in case we could not allocate as many wqe's 4298 * as was requested. 
4299 */ 4300 if (state->id_num_rwqe != IBD_NUM_RWQE) { 4301 ibd_print_warn(state, "Setting #rwqe = %d instead of default " 4302 "%d", state->id_num_rwqe, IBD_NUM_RWQE); 4303 } 4304 if (state->id_num_swqe != IBD_NUM_SWQE) { 4305 ibd_print_warn(state, "Setting #swqe = %d instead of default " 4306 "%d", state->id_num_swqe, IBD_NUM_SWQE); 4307 } 4308 4309 return (DDI_SUCCESS); 4310 } 4311 4312 static int 4313 ibd_setup_ud_channel(ibd_state_t *state) 4314 { 4315 ibt_ud_chan_alloc_args_t ud_alloc_attr; 4316 ibt_ud_chan_query_attr_t ud_chan_attr; 4317 ibt_status_t ret; 4318 4319 ud_alloc_attr.ud_flags = IBT_ALL_SIGNALED; 4320 if (state->id_hca_res_lkey_capab) 4321 ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY; 4322 if (state->id_lso_policy && state->id_lso_capable) 4323 ud_alloc_attr.ud_flags |= IBT_USES_LSO; 4324 4325 ud_alloc_attr.ud_hca_port_num = state->id_port; 4326 ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg; 4327 ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG; 4328 ud_alloc_attr.ud_sizes.cs_sq = state->id_num_swqe; 4329 ud_alloc_attr.ud_sizes.cs_rq = state->id_num_rwqe; 4330 ud_alloc_attr.ud_qkey = state->id_mcinfo->mc_qkey; 4331 ud_alloc_attr.ud_scq = state->id_scq_hdl; 4332 ud_alloc_attr.ud_rcq = state->id_rcq_hdl; 4333 ud_alloc_attr.ud_pd = state->id_pd_hdl; 4334 ud_alloc_attr.ud_pkey_ix = state->id_pkix; 4335 ud_alloc_attr.ud_clone_chan = NULL; 4336 4337 if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS, 4338 &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) { 4339 DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() " 4340 "failed, ret=%d\n", ret); 4341 return (DDI_FAILURE); 4342 } 4343 4344 if ((ret = ibt_query_ud_channel(state->id_chnl_hdl, 4345 &ud_chan_attr)) != IBT_SUCCESS) { 4346 DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() " 4347 "failed, ret=%d\n", ret); 4348 (void) ibt_free_channel(state->id_chnl_hdl); 4349 return (DDI_FAILURE); 4350 } 4351 4352 state->id_qpnum = ud_chan_attr.ud_qpn; 4353 4354 return (DDI_SUCCESS); 4355 } 4356 4357 static int 4358 ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state) 4359 { 4360 uint32_t progress = state->id_mac_state; 4361 uint_t attempts; 4362 ibt_status_t ret; 4363 ib_gid_t mgid; 4364 ibd_mce_t *mce; 4365 uint8_t jstate; 4366 4367 /* 4368 * Before we try to stop/undo whatever we did in ibd_start(), 4369 * we need to mark the link state appropriately to prevent the 4370 * ip layer from using this instance for any new transfers. Note 4371 * that if the original state of the link was "up" when we're 4372 * here, we'll set the final link state to "unknown", to behave 4373 * in the same fashion as other ethernet drivers. 4374 */ 4375 mutex_enter(&state->id_link_mutex); 4376 if (cur_link_state == LINK_STATE_DOWN) { 4377 state->id_link_state = cur_link_state; 4378 } else { 4379 state->id_link_state = LINK_STATE_UNKNOWN; 4380 } 4381 mutex_exit(&state->id_link_mutex); 4382 mac_link_update(state->id_mh, state->id_link_state); 4383 4384 state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED); 4385 if (progress & IBD_DRV_STARTED) { 4386 state->id_mac_state &= (~IBD_DRV_STARTED); 4387 } 4388 4389 /* 4390 * First, stop receive interrupts; this stops the driver from 4391 * handing up buffers to higher layers. Wait for receive buffers 4392 * to be returned and give up after 5 seconds. 
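 * (The loop below checks dl_bufs_outstanding up to 50 times at 100ms
 * intervals, which is where the 5 second figure comes from.)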
4393 */ 4394 if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) { 4395 4396 ibt_set_cq_handler(state->id_rcq_hdl, 0, 0); 4397 4398 attempts = 50; 4399 while (state->id_rx_list.dl_bufs_outstanding > 0) { 4400 delay(drv_usectohz(100000)); 4401 if (--attempts == 0) { 4402 /* 4403 * There are pending bufs with the network 4404 * layer and we have no choice but to wait 4405 * for them to be done with. Reap all the 4406 * Tx/Rx completions that were posted since 4407 * we turned off the notification and 4408 * return failure. 4409 */ 4410 DPRINT(2, "ibd_undo_start: " 4411 "reclaiming failed"); 4412 ibd_poll_rcq(state, state->id_rcq_hdl); 4413 ibt_set_cq_handler(state->id_rcq_hdl, 4414 ibd_rcq_handler, state); 4415 return (DDI_FAILURE); 4416 } 4417 } 4418 state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED); 4419 } 4420 4421 if (progress & IBD_DRV_SM_NOTICES_REGISTERED) { 4422 ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL); 4423 4424 mutex_enter(&state->id_trap_lock); 4425 state->id_trap_stop = B_TRUE; 4426 while (state->id_trap_inprog > 0) 4427 cv_wait(&state->id_trap_cv, &state->id_trap_lock); 4428 mutex_exit(&state->id_trap_lock); 4429 4430 state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED); 4431 } 4432 4433 if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) { 4434 /* 4435 * Flushing the channel ensures that all pending WQE's 4436 * are marked with flush_error and handed to the CQ. It 4437 * does not guarantee the invocation of the CQ handler. 4438 * This call is guaranteed to return successfully for 4439 * UD QPNs. 4440 */ 4441 if ((ret = ibt_flush_channel(state->id_chnl_hdl)) != 4442 IBT_SUCCESS) { 4443 DPRINT(10, "ibd_undo_start: flush_channel " 4444 "failed, ret=%d", ret); 4445 } 4446 4447 /* 4448 * Turn off Tx interrupts and poll. By the time the polling 4449 * returns an empty indicator, we are sure we have seen all 4450 * pending Tx callbacks. Note that after the call to 4451 * ibt_set_cq_handler() returns, the old handler is 4452 * guaranteed not to be invoked anymore. 4453 */ 4454 ibt_set_cq_handler(state->id_scq_hdl, 0, 0); 4455 ibd_poll_scq(state, state->id_scq_hdl); 4456 4457 state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED); 4458 } 4459 4460 if (progress & IBD_DRV_ASYNC_THR_CREATED) { 4461 /* 4462 * No new async requests will be posted since the device 4463 * link state has been marked as unknown; completion handlers 4464 * have been turned off, so Tx handler will not cause any 4465 * more IBD_ASYNC_REAP requests. 4466 * 4467 * Queue a request for the async thread to exit, which will 4468 * be serviced after any pending ones. This can take a while, 4469 * specially if the SM is unreachable, since IBMF will slowly 4470 * timeout each SM request issued by the async thread. Reap 4471 * the thread before continuing on, we do not want it to be 4472 * lingering in modunloaded code (or we could move the reap 4473 * to ibd_detach(), provided we keep track of the current 4474 * id_async_thrid somewhere safe). 4475 */ 4476 ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT); 4477 thread_join(state->id_async_thrid); 4478 4479 state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED); 4480 } 4481 4482 if (progress & IBD_DRV_BCAST_GROUP_JOINED) { 4483 /* 4484 * Drop all residual full/non membership. This includes full 4485 * membership to the broadcast group, and any nonmembership 4486 * acquired during transmits. 
We do this after the Tx completion 4487 * handlers are done, since those might result in some late 4488 * leaves; this also eliminates a potential race with that 4489 * path wrt the mc full list insert/delete. Trap handling 4490 * has also been suppressed at this point. Thus, no locks 4491 * are required while traversing the mc full list. 4492 */ 4493 DPRINT(2, "ibd_undo_start: clear full cache entries"); 4494 mce = list_head(&state->id_mc_full); 4495 while (mce != NULL) { 4496 mgid = mce->mc_info.mc_adds_vect.av_dgid; 4497 jstate = mce->mc_jstate; 4498 mce = list_next(&state->id_mc_full, mce); 4499 ibd_leave_group(state, mgid, jstate); 4500 } 4501 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED); 4502 } 4503 4504 if (progress & IBD_DRV_RXLIST_ALLOCD) { 4505 ibd_fini_rxlist(state); 4506 state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD); 4507 } 4508 4509 if (progress & IBD_DRV_TXLIST_ALLOCD) { 4510 ibd_fini_txlist(state); 4511 state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD); 4512 } 4513 4514 if (progress & IBD_DRV_UD_CHANNEL_SETUP) { 4515 if ((ret = ibt_free_channel(state->id_chnl_hdl)) != 4516 IBT_SUCCESS) { 4517 DPRINT(10, "ibd_undo_start: free_channel " 4518 "failed, ret=%d", ret); 4519 } 4520 4521 state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP); 4522 } 4523 4524 if (progress & IBD_DRV_CQS_ALLOCD) { 4525 kmem_free(state->id_txwcs, 4526 sizeof (ibt_wc_t) * state->id_txwcs_size); 4527 if ((ret = ibt_free_cq(state->id_scq_hdl)) != 4528 IBT_SUCCESS) { 4529 DPRINT(10, "ibd_undo_start: free_cq(scq) " 4530 "failed, ret=%d", ret); 4531 } 4532 4533 kmem_free(state->id_rxwcs, 4534 sizeof (ibt_wc_t) * state->id_rxwcs_size); 4535 if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) { 4536 DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, " 4537 "ret=%d", ret); 4538 } 4539 4540 state->id_txwcs = NULL; 4541 state->id_rxwcs = NULL; 4542 state->id_scq_hdl = NULL; 4543 state->id_rcq_hdl = NULL; 4544 4545 state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD); 4546 } 4547 4548 if (progress & IBD_DRV_ACACHE_INITIALIZED) { 4549 mutex_enter(&state->id_ac_mutex); 4550 mod_hash_destroy_hash(state->id_ah_active_hash); 4551 mutex_exit(&state->id_ac_mutex); 4552 ibd_acache_fini(state); 4553 4554 state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED); 4555 } 4556 4557 if (progress & IBD_DRV_BCAST_GROUP_FOUND) { 4558 /* 4559 * If we'd created the ipoib broadcast group and had 4560 * successfully joined it, leave it now 4561 */ 4562 if (state->id_bgroup_created) { 4563 mgid = state->id_mcinfo->mc_adds_vect.av_dgid; 4564 jstate = IB_MC_JSTATE_FULL; 4565 (void) ibt_leave_mcg(state->id_sgid, mgid, 4566 state->id_sgid, jstate); 4567 } 4568 ibt_free_mcg_info(state->id_mcinfo, 1); 4569 4570 state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND); 4571 } 4572 4573 return (DDI_SUCCESS); 4574 } 4575 4576 /* 4577 * These pair of routines are used to set/clear the condition that 4578 * the caller is likely to do something to change the id_mac_state. 4579 * If there's already someone doing either a start or a stop (possibly 4580 * due to the async handler detecting a pkey relocation event, a plumb 4581 * or dlpi_open, or an unplumb or dlpi_close coming in), we wait until 4582 * that's done. 
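 * For example, ibd_m_start() brackets its work with
 * ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS) and
 * ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS), while
 * ibd_m_stop() does the same with IBD_DRV_STOP_IN_PROGRESS.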
4583 */ 4584 static void 4585 ibd_set_mac_progress(ibd_state_t *state, uint_t flag) 4586 { 4587 mutex_enter(&state->id_macst_lock); 4588 while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS) 4589 cv_wait(&state->id_macst_cv, &state->id_macst_lock); 4590 4591 state->id_mac_state |= flag; 4592 mutex_exit(&state->id_macst_lock); 4593 } 4594 4595 static void 4596 ibd_clr_mac_progress(ibd_state_t *state, uint_t flag) 4597 { 4598 mutex_enter(&state->id_macst_lock); 4599 state->id_mac_state &= (~flag); 4600 cv_signal(&state->id_macst_cv); 4601 mutex_exit(&state->id_macst_lock); 4602 } 4603 4604 /* 4605 * GLDv3 entry point to start hardware. 4606 */ 4607 /*ARGSUSED*/ 4608 static int 4609 ibd_m_start(void *arg) 4610 { 4611 ibd_state_t *state = arg; 4612 int ret; 4613 4614 ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 4615 4616 ret = ibd_start(state); 4617 4618 ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS); 4619 4620 return (ret); 4621 } 4622 4623 static int 4624 ibd_start(ibd_state_t *state) 4625 { 4626 kthread_t *kht; 4627 int err; 4628 ibt_status_t ret; 4629 4630 if (state->id_mac_state & IBD_DRV_STARTED) 4631 return (DDI_SUCCESS); 4632 4633 /* 4634 * Get port details; if we fail here, very likely the port 4635 * state is inactive or the pkey can't be verified. 4636 */ 4637 if ((err = ibd_get_port_details(state)) != 0) { 4638 DPRINT(10, "ibd_start: ibd_get_port_details() failed"); 4639 goto start_fail; 4640 } 4641 state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED; 4642 4643 /* 4644 * Find the IPoIB broadcast group 4645 */ 4646 if (ibd_find_bgroup(state) != IBT_SUCCESS) { 4647 DPRINT(10, "ibd_start: ibd_find_bgroup() failed"); 4648 err = ENOTACTIVE; 4649 goto start_fail; 4650 } 4651 state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND; 4652 4653 /* 4654 * Initialize per-interface caches and lists; if we fail here, 4655 * it is most likely due to a lack of resources 4656 */ 4657 if (ibd_acache_init(state) != DDI_SUCCESS) { 4658 DPRINT(10, "ibd_start: ibd_acache_init() failed"); 4659 err = ENOMEM; 4660 goto start_fail; 4661 } 4662 state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED; 4663 4664 /* 4665 * Allocate send and receive completion queues 4666 */ 4667 if (ibd_alloc_cqs(state) != DDI_SUCCESS) { 4668 DPRINT(10, "ibd_start: ibd_alloc_cqs() failed"); 4669 err = ENOMEM; 4670 goto start_fail; 4671 } 4672 state->id_mac_state |= IBD_DRV_CQS_ALLOCD; 4673 4674 /* 4675 * Setup a UD channel 4676 */ 4677 if (ibd_setup_ud_channel(state) != DDI_SUCCESS) { 4678 err = ENOMEM; 4679 DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed"); 4680 goto start_fail; 4681 } 4682 state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP; 4683 4684 /* 4685 * Allocate and initialize the tx buffer list 4686 */ 4687 if (ibd_init_txlist(state) != DDI_SUCCESS) { 4688 DPRINT(10, "ibd_start: ibd_init_txlist() failed"); 4689 err = ENOMEM; 4690 goto start_fail; 4691 } 4692 state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD; 4693 4694 /* 4695 * Create the send cq handler here 4696 */ 4697 ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state); 4698 if ((ret = ibt_enable_cq_notify(state->id_scq_hdl, 4699 IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { 4700 DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) " 4701 "failed, ret=%d", ret); 4702 err = EINVAL; 4703 goto start_fail; 4704 } 4705 state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED; 4706 4707 /* 4708 * Allocate and initialize the rx buffer list 4709 */ 4710 if (ibd_init_rxlist(state) != DDI_SUCCESS) { 4711 DPRINT(10, "ibd_start: ibd_init_rxlist() failed"); 4712 err = ENOMEM; 
4713 goto start_fail; 4714 } 4715 state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD; 4716 4717 /* 4718 * Join IPoIB broadcast group 4719 */ 4720 if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) { 4721 DPRINT(10, "ibd_start: ibd_join_group() failed"); 4722 err = ENOTACTIVE; 4723 goto start_fail; 4724 } 4725 state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED; 4726 4727 /* 4728 * Create the async thread; thread_create never fails. 4729 */ 4730 kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0, 4731 TS_RUN, minclsyspri); 4732 _NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_async_thrid)) 4733 state->id_async_thrid = kht->t_did; 4734 _NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_async_thrid)) 4735 state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED; 4736 4737 /* 4738 * When we did mac_register() in ibd_attach(), we didn't register 4739 * the real macaddr and we didn't have the true port mtu. Now that 4740 * we're almost ready, set the local mac address and broadcast 4741 * addresses and update gldv3 about the real values of these 4742 * parameters. 4743 */ 4744 ibd_h2n_mac(&state->id_macaddr, state->id_qpnum, 4745 state->id_sgid.gid_prefix, state->id_sgid.gid_guid); 4746 ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK, 4747 state->id_mgid.gid_prefix, state->id_mgid.gid_guid); 4748 4749 (void) mac_maxsdu_update(state->id_mh, state->id_mtu - IPOIB_HDRSIZE); 4750 mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr); 4751 4752 /* 4753 * Setup the receive cq handler 4754 */ 4755 ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state); 4756 if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl, 4757 IBT_NEXT_COMPLETION)) != IBT_SUCCESS) { 4758 DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) " 4759 "failed, ret=%d", ret); 4760 err = EINVAL; 4761 goto start_fail; 4762 } 4763 state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED; 4764 4765 /* 4766 * Setup the subnet notices handler after we've initialized the acache/ 4767 * mcache and started the async thread, both of which are required for 4768 * the trap handler to function properly. 4769 * 4770 * Now that the async thread has been started (and we've already done 4771 * a mac_register() during attach so mac_tx_update() can be called 4772 * if necessary without any problem), we can enable the trap handler 4773 * to queue requests to the async thread. 4774 */ 4775 ibt_register_subnet_notices(state->id_ibt_hdl, 4776 ibd_snet_notices_handler, state); 4777 mutex_enter(&state->id_trap_lock); 4778 state->id_trap_stop = B_FALSE; 4779 mutex_exit(&state->id_trap_lock); 4780 state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED; 4781 4782 /* 4783 * Indicate link status to GLDv3 and higher layers. By default, 4784 * we assume we are in up state (which must have been true at 4785 * least at the time the broadcast mcg's were probed); if there 4786 * were any up/down transitions till the time we come here, the 4787 * async handler will have updated last known state, which we 4788 * use to tell GLDv3. The async handler will not send any 4789 * notifications to GLDv3 till we reach here in the initialization 4790 * sequence. 4791 */ 4792 state->id_mac_state |= IBD_DRV_STARTED; 4793 mac_link_update(state->id_mh, state->id_link_state); 4794 4795 return (DDI_SUCCESS); 4796 4797 start_fail: 4798 /* 4799 * If we ran into a problem during ibd_start() and ran into 4800 * some other problem during undoing our partial work, we can't 4801 * do anything about it. 
Ignore any errors we might get from 4802 * ibd_undo_start() and just return the original error we got. 4803 */ 4804 (void) ibd_undo_start(state, LINK_STATE_DOWN); 4805 return (err); 4806 } 4807 4808 /* 4809 * GLDv3 entry point to stop hardware from receiving packets. 4810 */ 4811 /*ARGSUSED*/ 4812 static void 4813 ibd_m_stop(void *arg) 4814 { 4815 ibd_state_t *state = (ibd_state_t *)arg; 4816 4817 ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS); 4818 4819 (void) ibd_undo_start(state, state->id_link_state); 4820 4821 ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS); 4822 } 4823 4824 /* 4825 * GLDv3 entry point to modify the device's mac address. We do not 4826 * allow address modifications. 4827 */ 4828 static int 4829 ibd_m_unicst(void *arg, const uint8_t *macaddr) 4830 { 4831 ibd_state_t *state = arg; 4832 4833 /* 4834 * Don't bother even comparing the macaddr if we haven't 4835 * completed ibd_m_start(). 4836 */ 4837 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 4838 return (0); 4839 4840 if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0) 4841 return (0); 4842 else 4843 return (EINVAL); 4844 } 4845 4846 /* 4847 * The blocking part of the IBA join/leave operations is done out 4848 * of here on the async thread. 4849 */ 4850 static void 4851 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op) 4852 { 4853 DPRINT(3, "ibd_async_multicast : async_setmc op %d :" 4854 "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid); 4855 4856 if (op == IBD_ASYNC_JOIN) { 4857 if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) { 4858 ibd_print_warn(state, "Join multicast group failed :" 4859 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); 4860 } 4861 } else { 4862 /* 4863 * Here, we must search for the proper mcg_info and 4864 * use that to leave the group. 4865 */ 4866 ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL); 4867 } 4868 } 4869 4870 /* 4871 * GLDv3 entry point for multicast enable/disable requests. 4872 * This function queues the operation to the async thread and 4873 * returns success for a valid multicast address. 4874 */ 4875 static int 4876 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac) 4877 { 4878 ibd_state_t *state = (ibd_state_t *)arg; 4879 ipoib_mac_t maddr, *mcast; 4880 ib_gid_t mgid; 4881 ibd_req_t *req; 4882 4883 /* 4884 * If we haven't completed ibd_m_start(), async thread wouldn't 4885 * have been started and id_bcaddr wouldn't be set, so there's 4886 * no point in continuing. 4887 */ 4888 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 4889 return (0); 4890 4891 /* 4892 * The incoming multicast address might not be aligned properly 4893 * on a 4 byte boundary to be considered an ipoib_mac_t. We force 4894 * it to look like one though, to get the offsets of the mc gid, 4895 * since we know we are not going to dereference any values with 4896 * the ipoib_mac_t pointer. 4897 */ 4898 bcopy(mcmac, &maddr, sizeof (ipoib_mac_t)); 4899 mcast = &maddr; 4900 4901 /* 4902 * Check validity of MCG address. We could additionally check 4903 * that an enable/disable is not being issued on the "broadcast" 4904 * mcg, but since this operation is only invokable by privileged 4905 * programs anyway, we allow the flexibility to those dlpi apps. 4906 * Note that we do not validate the "scope" of the IBA mcg.
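 * The check below only verifies that the QPN field of the pseudo mac
 * address carries the reserved multicast QPN (IB_MC_QPN); anything
 * else is rejected with EINVAL.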
4907 */ 4908 if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN) 4909 return (EINVAL); 4910 4911 /* 4912 * fill in multicast pkey and scope 4913 */ 4914 IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey); 4915 4916 /* 4917 * If someone is trying to JOIN/LEAVE the broadcast group, we do 4918 * nothing (i.e. we stay JOINed to the broadcast group done in 4919 * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically 4920 * requires to be joined to broadcast groups at all times. 4921 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also 4922 * depends on this. 4923 */ 4924 if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0) 4925 return (0); 4926 4927 ibd_n2h_gid(mcast, &mgid); 4928 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 4929 if (req == NULL) 4930 return (ENOMEM); 4931 4932 req->rq_gid = mgid; 4933 4934 if (add) { 4935 DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n", 4936 mgid.gid_prefix, mgid.gid_guid); 4937 ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN); 4938 } else { 4939 DPRINT(1, "ibd_m_multicst : unset_multicast : " 4940 "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid); 4941 ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE); 4942 } 4943 return (0); 4944 } 4945 4946 /* 4947 * The blocking part of the IBA promiscuous operations are done 4948 * out of here on the async thread. The dlpireq parameter indicates 4949 * whether this invocation is due to a dlpi request or due to 4950 * a port up/down event. 4951 */ 4952 static void 4953 ibd_async_unsetprom(ibd_state_t *state) 4954 { 4955 ibd_mce_t *mce = list_head(&state->id_mc_non); 4956 ib_gid_t mgid; 4957 4958 DPRINT(2, "ibd_async_unsetprom : async_unset_promisc"); 4959 4960 while (mce != NULL) { 4961 mgid = mce->mc_info.mc_adds_vect.av_dgid; 4962 mce = list_next(&state->id_mc_non, mce); 4963 ibd_leave_group(state, mgid, IB_MC_JSTATE_NON); 4964 } 4965 state->id_prom_op = IBD_OP_NOTSTARTED; 4966 } 4967 4968 /* 4969 * The blocking part of the IBA promiscuous operations are done 4970 * out of here on the async thread. The dlpireq parameter indicates 4971 * whether this invocation is due to a dlpi request or due to 4972 * a port up/down event. 4973 */ 4974 static void 4975 ibd_async_setprom(ibd_state_t *state) 4976 { 4977 ibt_mcg_attr_t mcg_attr; 4978 ibt_mcg_info_t *mcg_info; 4979 ib_gid_t mgid; 4980 uint_t numg; 4981 int i; 4982 char ret = IBD_OP_COMPLETED; 4983 4984 DPRINT(2, "ibd_async_setprom : async_set_promisc"); 4985 4986 /* 4987 * Obtain all active MC groups on the IB fabric with 4988 * specified criteria (scope + Pkey + Qkey + mtu). 4989 */ 4990 bzero(&mcg_attr, sizeof (mcg_attr)); 4991 mcg_attr.mc_pkey = state->id_pkey; 4992 mcg_attr.mc_scope = state->id_scope; 4993 mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey; 4994 mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu; 4995 mcg_attr.mc_mtu_req.r_selector = IBT_EQU; 4996 if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) != 4997 IBT_SUCCESS) { 4998 ibd_print_warn(state, "Could not get list of IBA multicast " 4999 "groups"); 5000 ret = IBD_OP_ERRORED; 5001 goto done; 5002 } 5003 5004 /* 5005 * Iterate over the returned mcg's and join as NonMember 5006 * to the IP mcg's. 5007 */ 5008 for (i = 0; i < numg; i++) { 5009 /* 5010 * Do a NonMember JOIN on the MC group. 
5011 */ 5012 mgid = mcg_info[i].mc_adds_vect.av_dgid; 5013 if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL) 5014 ibd_print_warn(state, "IBA promiscuous mode missed " 5015 "multicast gid %016llx:%016llx", 5016 (u_longlong_t)mgid.gid_prefix, 5017 (u_longlong_t)mgid.gid_guid); 5018 } 5019 5020 ibt_free_mcg_info(mcg_info, numg); 5021 DPRINT(4, "ibd_async_setprom : async_set_promisc completes"); 5022 done: 5023 state->id_prom_op = ret; 5024 } 5025 5026 /* 5027 * GLDv3 entry point for multicast promiscuous enable/disable requests. 5028 * GLDv3 assumes phys state receives more packets than multi state, 5029 * which is not true for IPoIB. Thus, treat the multi and phys 5030 * promiscuous states the same way to work with GLDv3's assumption. 5031 */ 5032 static int 5033 ibd_m_promisc(void *arg, boolean_t on) 5034 { 5035 ibd_state_t *state = (ibd_state_t *)arg; 5036 ibd_req_t *req; 5037 5038 /* 5039 * Async thread wouldn't have been started if we haven't 5040 * passed ibd_m_start() 5041 */ 5042 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 5043 return (0); 5044 5045 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 5046 if (req == NULL) 5047 return (ENOMEM); 5048 if (on) { 5049 DPRINT(1, "ibd_m_promisc : set_promisc : %d", on); 5050 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON); 5051 } else { 5052 DPRINT(1, "ibd_m_promisc : unset_promisc"); 5053 ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF); 5054 } 5055 5056 return (0); 5057 } 5058 5059 /* 5060 * GLDv3 entry point for gathering statistics. 5061 */ 5062 static int 5063 ibd_m_stat(void *arg, uint_t stat, uint64_t *val) 5064 { 5065 ibd_state_t *state = (ibd_state_t *)arg; 5066 5067 switch (stat) { 5068 case MAC_STAT_IFSPEED: 5069 *val = state->id_link_speed; 5070 break; 5071 case MAC_STAT_MULTIRCV: 5072 *val = state->id_multi_rcv; 5073 break; 5074 case MAC_STAT_BRDCSTRCV: 5075 *val = state->id_brd_rcv; 5076 break; 5077 case MAC_STAT_MULTIXMT: 5078 *val = state->id_multi_xmt; 5079 break; 5080 case MAC_STAT_BRDCSTXMT: 5081 *val = state->id_brd_xmt; 5082 break; 5083 case MAC_STAT_RBYTES: 5084 *val = state->id_rcv_bytes; 5085 break; 5086 case MAC_STAT_IPACKETS: 5087 *val = state->id_rcv_pkt; 5088 break; 5089 case MAC_STAT_OBYTES: 5090 *val = state->id_xmt_bytes; 5091 break; 5092 case MAC_STAT_OPACKETS: 5093 *val = state->id_xmt_pkt; 5094 break; 5095 case MAC_STAT_OERRORS: 5096 *val = state->id_ah_error; /* failed AH translation */ 5097 break; 5098 case MAC_STAT_IERRORS: 5099 *val = 0; 5100 break; 5101 case MAC_STAT_NOXMTBUF: 5102 *val = state->id_tx_short; 5103 break; 5104 case MAC_STAT_NORCVBUF: 5105 default: 5106 return (ENOTSUP); 5107 } 5108 5109 return (0); 5110 } 5111 5112 static void 5113 ibd_async_txsched(ibd_state_t *state) 5114 { 5115 ibd_resume_transmission(state); 5116 } 5117 5118 static void 5119 ibd_resume_transmission(ibd_state_t *state) 5120 { 5121 int flag; 5122 int met_thresh = 0; 5123 int thresh = 0; 5124 int ret = -1; 5125 5126 mutex_enter(&state->id_sched_lock); 5127 if (state->id_sched_needed & IBD_RSRC_SWQE) { 5128 mutex_enter(&state->id_tx_list.dl_mutex); 5129 mutex_enter(&state->id_tx_rel_list.dl_mutex); 5130 met_thresh = state->id_tx_list.dl_cnt + 5131 state->id_tx_rel_list.dl_cnt; 5132 mutex_exit(&state->id_tx_rel_list.dl_mutex); 5133 mutex_exit(&state->id_tx_list.dl_mutex); 5134 thresh = IBD_FREE_SWQES_THRESH; 5135 flag = IBD_RSRC_SWQE; 5136 } else if (state->id_sched_needed & IBD_RSRC_LSOBUF) { 5137 ASSERT(state->id_lso != NULL); 5138 mutex_enter(&state->id_lso_lock); 5139 met_thresh = 
state->id_lso->bkt_nfree; 5140 thresh = IBD_FREE_LSOS_THRESH; 5141 mutex_exit(&state->id_lso_lock); 5142 flag = IBD_RSRC_LSOBUF; 5143 if (met_thresh > thresh) 5144 state->id_sched_lso_cnt++; 5145 } 5146 if (met_thresh > thresh) { 5147 state->id_sched_needed &= ~flag; 5148 state->id_sched_cnt++; 5149 ret = 0; 5150 } 5151 mutex_exit(&state->id_sched_lock); 5152 5153 if (ret == 0) 5154 mac_tx_update(state->id_mh); 5155 } 5156 5157 /* 5158 * Release the send wqe back into free list. 5159 */ 5160 static void 5161 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail, int n) 5162 { 5163 /* 5164 * Add back on Tx list for reuse. 5165 */ 5166 ASSERT(tail->swqe_next == NULL); 5167 mutex_enter(&state->id_tx_rel_list.dl_mutex); 5168 state->id_tx_rel_list.dl_pending_sends = B_FALSE; 5169 tail->swqe_next = state->id_tx_rel_list.dl_head; 5170 state->id_tx_rel_list.dl_head = SWQE_TO_WQE(head); 5171 state->id_tx_rel_list.dl_cnt += n; 5172 mutex_exit(&state->id_tx_rel_list.dl_mutex); 5173 } 5174 5175 /* 5176 * Acquire a send wqe from free list. 5177 * Returns error number and send wqe pointer. 5178 */ 5179 static ibd_swqe_t * 5180 ibd_acquire_swqe(ibd_state_t *state) 5181 { 5182 ibd_swqe_t *wqe; 5183 5184 mutex_enter(&state->id_tx_rel_list.dl_mutex); 5185 if (state->id_tx_rel_list.dl_head != NULL) { 5186 /* transfer id_tx_rel_list to id_tx_list */ 5187 state->id_tx_list.dl_head = 5188 state->id_tx_rel_list.dl_head; 5189 state->id_tx_list.dl_cnt = 5190 state->id_tx_rel_list.dl_cnt; 5191 state->id_tx_list.dl_pending_sends = B_FALSE; 5192 5193 /* clear id_tx_rel_list */ 5194 state->id_tx_rel_list.dl_head = NULL; 5195 state->id_tx_rel_list.dl_cnt = 0; 5196 mutex_exit(&state->id_tx_rel_list.dl_mutex); 5197 5198 wqe = WQE_TO_SWQE(state->id_tx_list.dl_head); 5199 state->id_tx_list.dl_cnt -= 1; 5200 state->id_tx_list.dl_head = wqe->swqe_next; 5201 } else { /* no free swqe */ 5202 mutex_exit(&state->id_tx_rel_list.dl_mutex); 5203 state->id_tx_list.dl_pending_sends = B_TRUE; 5204 DPRINT(5, "ibd_acquire_swqe: out of Tx wqe"); 5205 state->id_tx_short++; 5206 wqe = NULL; 5207 } 5208 return (wqe); 5209 } 5210 5211 static int 5212 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss, 5213 ibt_ud_dest_hdl_t ud_dest) 5214 { 5215 mblk_t *nmp; 5216 int iph_len, tcph_len; 5217 ibt_wr_lso_t *lso; 5218 uintptr_t ip_start, tcp_start; 5219 uint8_t *dst; 5220 uint_t pending, mblen; 5221 5222 /* 5223 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default; 5224 * we need to adjust it here for lso. 5225 */ 5226 lso = &(node->w_swr.wr.ud_lso); 5227 lso->lso_ud_dest = ud_dest; 5228 lso->lso_mss = mss; 5229 5230 /* 5231 * Calculate the LSO header size and set it in the UD LSO structure. 5232 * Note that the only assumption we make is that each of the IPoIB, 5233 * IP and TCP headers will be contained in a single mblk fragment; 5234 * together, the headers may span multiple mblk fragments. 
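 * For illustration, a plain IPv4/TCP packet with no IP or TCP options
 * yields lso_hdr_sz = IPOIB_HDRSIZE + 20 + 20 bytes.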
5235 */ 5236 nmp = mp; 5237 ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE; 5238 if (ip_start >= (uintptr_t)(nmp->b_wptr)) { 5239 ip_start = (uintptr_t)nmp->b_cont->b_rptr 5240 + (ip_start - (uintptr_t)(nmp->b_wptr)); 5241 nmp = nmp->b_cont; 5242 5243 } 5244 iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start); 5245 5246 tcp_start = ip_start + iph_len; 5247 if (tcp_start >= (uintptr_t)(nmp->b_wptr)) { 5248 tcp_start = (uintptr_t)nmp->b_cont->b_rptr 5249 + (tcp_start - (uintptr_t)(nmp->b_wptr)); 5250 nmp = nmp->b_cont; 5251 } 5252 tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start); 5253 lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len; 5254 5255 /* 5256 * If the lso header fits entirely within a single mblk fragment, 5257 * we'll avoid an additional copy of the lso header here and just 5258 * pass the b_rptr of the mblk directly. 5259 * 5260 * If this isn't true, we'd have to allocate for it explicitly. 5261 */ 5262 if (lso->lso_hdr_sz <= MBLKL(mp)) { 5263 lso->lso_hdr = mp->b_rptr; 5264 } else { 5265 /* On work completion, remember to free this allocated hdr */ 5266 lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP); 5267 if (lso->lso_hdr == NULL) { 5268 DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, " 5269 "sz = %d", lso->lso_hdr_sz); 5270 lso->lso_hdr_sz = 0; 5271 lso->lso_mss = 0; 5272 return (-1); 5273 } 5274 } 5275 5276 /* 5277 * Copy in the lso header only if we need to 5278 */ 5279 if (lso->lso_hdr != mp->b_rptr) { 5280 dst = lso->lso_hdr; 5281 pending = lso->lso_hdr_sz; 5282 5283 for (nmp = mp; nmp && pending; nmp = nmp->b_cont) { 5284 mblen = MBLKL(nmp); 5285 if (pending > mblen) { 5286 bcopy(nmp->b_rptr, dst, mblen); 5287 dst += mblen; 5288 pending -= mblen; 5289 } else { 5290 bcopy(nmp->b_rptr, dst, pending); 5291 break; 5292 } 5293 } 5294 } 5295 5296 return (0); 5297 } 5298 5299 static void 5300 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp) 5301 { 5302 ibt_wr_lso_t *lso; 5303 5304 if ((!node) || (!mp)) 5305 return; 5306 5307 /* 5308 * Free any header space that we might've allocated if we 5309 * did an LSO 5310 */ 5311 if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) { 5312 lso = &(node->w_swr.wr.ud_lso); 5313 if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) { 5314 kmem_free(lso->lso_hdr, lso->lso_hdr_sz); 5315 lso->lso_hdr = NULL; 5316 lso->lso_hdr_sz = 0; 5317 } 5318 } 5319 } 5320 5321 static void 5322 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node) 5323 { 5324 uint_t i; 5325 uint_t num_posted; 5326 uint_t n_wrs; 5327 ibt_status_t ibt_status; 5328 ibt_send_wr_t wrs[IBD_MAX_TX_POST_MULTIPLE]; 5329 ibd_swqe_t *tx_head, *elem; 5330 ibd_swqe_t *nodes[IBD_MAX_TX_POST_MULTIPLE]; 5331 5332 /* post the one request, then check for more */ 5333 ibt_status = ibt_post_send(state->id_chnl_hdl, 5334 &node->w_swr, 1, NULL); 5335 if (ibt_status != IBT_SUCCESS) { 5336 ibd_print_warn(state, "ibd_post_send: " 5337 "posting one wr failed: ret=%d", ibt_status); 5338 ibd_tx_cleanup(state, node); 5339 } 5340 5341 tx_head = NULL; 5342 for (;;) { 5343 if (tx_head == NULL) { 5344 mutex_enter(&state->id_txpost_lock); 5345 tx_head = state->id_tx_head; 5346 if (tx_head == NULL) { 5347 state->id_tx_busy = 0; 5348 mutex_exit(&state->id_txpost_lock); 5349 return; 5350 } 5351 state->id_tx_head = NULL; 5352 mutex_exit(&state->id_txpost_lock); 5353 } 5354 5355 /* 5356 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs 5357 * at a time if possible, and keep posting them. 
5358 */ 5359 for (n_wrs = 0, elem = tx_head; 5360 (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE); 5361 elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) { 5362 nodes[n_wrs] = elem; 5363 wrs[n_wrs] = elem->w_swr; 5364 } 5365 tx_head = elem; 5366 5367 ASSERT(n_wrs != 0); 5368 5369 /* 5370 * If posting fails for some reason, we'll never receive 5371 * completion intimation, so we'll need to cleanup. But 5372 * we need to make sure we don't clean up nodes whose 5373 * wrs have been successfully posted. We assume that the 5374 * hca driver returns on the first failure to post and 5375 * therefore the first 'num_posted' entries don't need 5376 * cleanup here. 5377 */ 5378 num_posted = 0; 5379 ibt_status = ibt_post_send(state->id_chnl_hdl, 5380 wrs, n_wrs, &num_posted); 5381 if (ibt_status != IBT_SUCCESS) { 5382 ibd_print_warn(state, "ibd_post_send: " 5383 "posting multiple wrs failed: " 5384 "requested=%d, done=%d, ret=%d", 5385 n_wrs, num_posted, ibt_status); 5386 5387 for (i = num_posted; i < n_wrs; i++) 5388 ibd_tx_cleanup(state, nodes[i]); 5389 } 5390 } 5391 } 5392 5393 static int 5394 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node, 5395 uint_t lsohdr_sz) 5396 { 5397 ibt_wr_ds_t *sgl; 5398 ibt_status_t ibt_status; 5399 mblk_t *nmp; 5400 mblk_t *data_mp; 5401 uchar_t *bufp; 5402 size_t blksize; 5403 size_t skip; 5404 size_t avail; 5405 uint_t pktsize; 5406 uint_t frag_len; 5407 uint_t pending_hdr; 5408 int nmblks; 5409 int i; 5410 5411 /* 5412 * Let's skip ahead to the data if this is LSO 5413 */ 5414 data_mp = mp; 5415 pending_hdr = 0; 5416 if (lsohdr_sz) { 5417 pending_hdr = lsohdr_sz; 5418 for (nmp = mp; nmp; nmp = nmp->b_cont) { 5419 frag_len = nmp->b_wptr - nmp->b_rptr; 5420 if (frag_len > pending_hdr) 5421 break; 5422 pending_hdr -= frag_len; 5423 } 5424 data_mp = nmp; /* start of data past lso header */ 5425 ASSERT(data_mp != NULL); 5426 } 5427 5428 /* 5429 * Calculate the size of message data and number of msg blocks 5430 */ 5431 pktsize = 0; 5432 for (nmblks = 0, nmp = data_mp; nmp != NULL; 5433 nmp = nmp->b_cont, nmblks++) { 5434 pktsize += MBLKL(nmp); 5435 } 5436 pktsize -= pending_hdr; 5437 5438 /* 5439 * We only do ibt_map_mem_iov() if the pktsize is above the 5440 * "copy-threshold", and if the number of mp fragments is less than 5441 * the maximum acceptable. 
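 * In other words, packets at or below IBD_TX_COPY_THRESH, heavily
 * fragmented chains, and HCAs without the reserved-lkey capability
 * all take one of the bcopy paths below; ibt_map_mem_iov() is used
 * only for large, reasonably contiguous sends.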
5442 */ 5443 if ((state->id_hca_res_lkey_capab) && 5444 (pktsize > IBD_TX_COPY_THRESH) && 5445 (nmblks < state->id_max_sqseg_hiwm)) { 5446 ibt_iov_t iov_arr[IBD_MAX_SQSEG]; 5447 ibt_iov_attr_t iov_attr; 5448 5449 iov_attr.iov_as = NULL; 5450 iov_attr.iov = iov_arr; 5451 iov_attr.iov_buf = NULL; 5452 iov_attr.iov_list_len = nmblks; 5453 iov_attr.iov_wr_nds = state->id_max_sqseg; 5454 iov_attr.iov_lso_hdr_sz = lsohdr_sz; 5455 iov_attr.iov_flags = IBT_IOV_SLEEP; 5456 5457 for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) { 5458 iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr; 5459 iov_arr[i].iov_len = MBLKL(nmp); 5460 if (i == 0) { 5461 iov_arr[i].iov_addr += pending_hdr; 5462 iov_arr[i].iov_len -= pending_hdr; 5463 } 5464 } 5465 5466 node->w_buftype = IBD_WQE_MAPPED; 5467 node->w_swr.wr_sgl = node->w_sgl; 5468 5469 ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr, 5470 (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl); 5471 if (ibt_status != IBT_SUCCESS) { 5472 ibd_print_warn(state, "ibd_send: ibt_map_mem_iov " 5473 "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status); 5474 goto ibd_copy_path; 5475 } 5476 5477 return (0); 5478 } 5479 5480 ibd_copy_path: 5481 if (pktsize <= state->id_tx_buf_sz) { 5482 node->swqe_copybuf.ic_sgl.ds_len = pktsize; 5483 node->w_swr.wr_nds = 1; 5484 node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl; 5485 node->w_buftype = IBD_WQE_TXBUF; 5486 5487 /* 5488 * Even though this is the copy path for transfers less than 5489 * id_tx_buf_sz, it could still be an LSO packet. If so, it 5490 * is possible the first data mblk fragment (data_mp) still 5491 * contains part of the LSO header that we need to skip. 5492 */ 5493 bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va; 5494 for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) { 5495 blksize = MBLKL(nmp) - pending_hdr; 5496 bcopy(nmp->b_rptr + pending_hdr, bufp, blksize); 5497 bufp += blksize; 5498 pending_hdr = 0; 5499 } 5500 5501 return (0); 5502 } 5503 5504 /* 5505 * Copy path for transfers greater than id_tx_buf_sz 5506 */ 5507 node->w_swr.wr_sgl = node->w_sgl; 5508 if (ibd_acquire_lsobufs(state, pktsize, 5509 node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) { 5510 DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed"); 5511 return (-1); 5512 } 5513 node->w_buftype = IBD_WQE_LSOBUF; 5514 5515 /* 5516 * Copy the larger-than-id_tx_buf_sz packet into a set of 5517 * fixed-sized, pre-mapped LSO buffers. Note that we might 5518 * need to skip part of the LSO header in the first fragment 5519 * as before. 5520 */ 5521 nmp = data_mp; 5522 skip = pending_hdr; 5523 for (i = 0; i < node->w_swr.wr_nds; i++) { 5524 sgl = node->w_swr.wr_sgl + i; 5525 bufp = (uchar_t *)(uintptr_t)sgl->ds_va; 5526 avail = IBD_LSO_BUFSZ; 5527 while (nmp && avail) { 5528 blksize = MBLKL(nmp) - skip; 5529 if (blksize > avail) { 5530 bcopy(nmp->b_rptr + skip, bufp, avail); 5531 skip += avail; 5532 avail = 0; 5533 } else { 5534 bcopy(nmp->b_rptr + skip, bufp, blksize); 5535 skip = 0; 5536 avail -= blksize; 5537 bufp += blksize; 5538 nmp = nmp->b_cont; 5539 } 5540 } 5541 } 5542 5543 return (0); 5544 } 5545 5546 /* 5547 * Schedule a completion queue polling to reap the resource we're 5548 * short on. If we implement the change to reap tx completions 5549 * in a separate thread, we'll need to wake up that thread here. 
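 *
 * resource_type is one of the IBD_RSRC_* flags (e.g. IBD_RSRC_SWQE
 * or IBD_RSRC_LSOBUF, as used in ibd_send below) and is OR'ed into
 * id_sched_needed; q_flag additionally queues an IBD_ASYNC_SCHED
 * work entry for the async thread.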
5550 */ 5551 static int 5552 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag) 5553 { 5554 ibd_req_t *req; 5555 5556 mutex_enter(&state->id_sched_lock); 5557 state->id_sched_needed |= resource_type; 5558 mutex_exit(&state->id_sched_lock); 5559 5560 /* 5561 * If we are asked to queue a work entry, we need to do it 5562 */ 5563 if (q_flag) { 5564 req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP); 5565 if (req == NULL) 5566 return (-1); 5567 5568 ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED); 5569 } 5570 5571 return (0); 5572 } 5573 5574 /* 5575 * The passed-in packet has this format: 5576 * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data 5577 */ 5578 static boolean_t 5579 ibd_send(ibd_state_t *state, mblk_t *mp) 5580 { 5581 ibd_ace_t *ace; 5582 ibd_swqe_t *node; 5583 ipoib_mac_t *dest; 5584 ib_header_info_t *ipibp; 5585 ip6_t *ip6h; 5586 uint_t pktsize; 5587 uint32_t mss; 5588 uint32_t hckflags; 5589 uint32_t lsoflags = 0; 5590 uint_t lsohdr_sz = 0; 5591 int ret, len; 5592 boolean_t dofree = B_FALSE; 5593 boolean_t rc; 5594 5595 /* 5596 * If we aren't done with the device initialization and start, 5597 * we shouldn't be here. 5598 */ 5599 if ((state->id_mac_state & IBD_DRV_STARTED) == 0) 5600 return (B_FALSE); 5601 5602 mutex_enter(&state->id_tx_list.dl_mutex); 5603 node = WQE_TO_SWQE(state->id_tx_list.dl_head); 5604 if (node != NULL) { 5605 state->id_tx_list.dl_cnt -= 1; 5606 state->id_tx_list.dl_head = node->swqe_next; 5607 } else { 5608 node = ibd_acquire_swqe(state); 5609 } 5610 mutex_exit(&state->id_tx_list.dl_mutex); 5611 if (node == NULL) { 5612 /* 5613 * If we don't have an swqe available, schedule a transmit 5614 * completion queue cleanup and hold off on sending more 5615 * packets until we have some free swqes 5616 */ 5617 if (ibd_sched_poll(state, IBD_RSRC_SWQE, 0) == 0) 5618 return (B_FALSE); 5619 5620 /* 5621 * If a poll cannot be scheduled, we have no choice but 5622 * to drop this packet 5623 */ 5624 ibd_print_warn(state, "ibd_send: no swqe, pkt drop"); 5625 return (B_TRUE); 5626 } 5627 5628 /* 5629 * Initialize the commonly used fields in swqe to NULL to protect 5630 * against ibd_tx_cleanup accidentally misinterpreting these on a 5631 * failure. 5632 */ 5633 node->swqe_im_mblk = NULL; 5634 node->w_swr.wr_nds = 0; 5635 node->w_swr.wr_sgl = NULL; 5636 node->w_swr.wr_opcode = IBT_WRC_SEND; 5637 5638 /* 5639 * Obtain an address handle for the destination. 5640 */ 5641 ipibp = (ib_header_info_t *)mp->b_rptr; 5642 dest = (ipoib_mac_t *)&ipibp->ib_dst; 5643 if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 5644 IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey); 5645 5646 pktsize = msgsize(mp); 5647 5648 atomic_add_64(&state->id_xmt_bytes, pktsize); 5649 atomic_inc_64(&state->id_xmt_pkt); 5650 if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) 5651 atomic_inc_64(&state->id_brd_xmt); 5652 else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 5653 atomic_inc_64(&state->id_multi_xmt); 5654 5655 if ((ace = ibd_acache_lookup(state, dest, &ret, 1)) != NULL) { 5656 node->w_ahandle = ace; 5657 node->w_swr.wr.ud.udwr_dest = ace->ac_dest; 5658 } else { 5659 DPRINT(5, 5660 "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X", 5661 ((ret == EFAULT) ?
"failed" : "queued"), 5662 htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]), 5663 htonl(dest->ipoib_gidpref[1]), 5664 htonl(dest->ipoib_gidsuff[0]), 5665 htonl(dest->ipoib_gidsuff[1])); 5666 node->w_ahandle = NULL; 5667 5668 /* 5669 * Here if ibd_acache_lookup() returns EFAULT, it means ibd 5670 * can not find a path for the specific dest address. We 5671 * should get rid of this kind of packet. We also should get 5672 * rid of the packet if we cannot schedule a poll via the 5673 * async thread. For the normal case, ibd will return the 5674 * packet to upper layer and wait for AH creating. 5675 * 5676 * Note that we always queue a work slot entry for the async 5677 * thread when we fail AH lookup (even in intr mode); this is 5678 * due to the convoluted way the code currently looks for AH. 5679 */ 5680 if (ret == EFAULT) { 5681 dofree = B_TRUE; 5682 rc = B_TRUE; 5683 } else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) { 5684 dofree = B_TRUE; 5685 rc = B_TRUE; 5686 } else { 5687 dofree = B_FALSE; 5688 rc = B_FALSE; 5689 } 5690 goto ibd_send_fail; 5691 } 5692 5693 /* 5694 * For ND6 packets, padding is at the front of the source lladdr. 5695 * Insert the padding at front. 5696 */ 5697 if (ntohs(ipibp->ipib_rhdr.ipoib_type) == ETHERTYPE_IPV6) { 5698 if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) { 5699 if (!pullupmsg(mp, IPV6_HDR_LEN + 5700 sizeof (ib_header_info_t))) { 5701 DPRINT(10, "ibd_send: pullupmsg failure "); 5702 dofree = B_TRUE; 5703 rc = B_TRUE; 5704 goto ibd_send_fail; 5705 } 5706 ipibp = (ib_header_info_t *)mp->b_rptr; 5707 } 5708 ip6h = (ip6_t *)((uchar_t *)ipibp + 5709 sizeof (ib_header_info_t)); 5710 len = ntohs(ip6h->ip6_plen); 5711 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 5712 mblk_t *pad; 5713 5714 pad = allocb(4, 0); 5715 pad->b_wptr = (uchar_t *)pad->b_rptr + 4; 5716 linkb(mp, pad); 5717 if (MBLKL(mp) < sizeof (ib_header_info_t) + 5718 IPV6_HDR_LEN + len + 4) { 5719 if (!pullupmsg(mp, sizeof (ib_header_info_t) + 5720 IPV6_HDR_LEN + len + 4)) { 5721 DPRINT(10, "ibd_send: pullupmsg " 5722 "failure "); 5723 dofree = B_TRUE; 5724 rc = B_TRUE; 5725 goto ibd_send_fail; 5726 } 5727 ip6h = (ip6_t *)((uchar_t *)mp->b_rptr + 5728 sizeof (ib_header_info_t)); 5729 } 5730 5731 /* LINTED: E_CONSTANT_CONDITION */ 5732 IBD_PAD_NSNA(ip6h, len, IBD_SEND); 5733 } 5734 } 5735 5736 mp->b_rptr += sizeof (ib_addrs_t); 5737 5738 /* 5739 * Do LSO and checksum related work here. For LSO send, adjust the 5740 * ud destination, the opcode and the LSO header information to the 5741 * work request. 5742 */ 5743 lso_info_get(mp, &mss, &lsoflags); 5744 if ((lsoflags & HW_LSO) != HW_LSO) { 5745 node->w_swr.wr_opcode = IBT_WRC_SEND; 5746 lsohdr_sz = 0; 5747 } else { 5748 if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) { 5749 /* 5750 * The routine can only fail if there's no memory; we 5751 * can only drop the packet if this happens 5752 */ 5753 ibd_print_warn(state, 5754 "ibd_send: no memory, lso posting failed"); 5755 dofree = B_TRUE; 5756 rc = B_TRUE; 5757 goto ibd_send_fail; 5758 } 5759 5760 node->w_swr.wr_opcode = IBT_WRC_SEND_LSO; 5761 lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz; 5762 } 5763 5764 hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &hckflags); 5765 if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM) 5766 node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM; 5767 else 5768 node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM; 5769 5770 /* 5771 * Prepare the sgl for posting; the routine can only fail if there's 5772 * no lso buf available for posting. 
If this is the case, we should 5773 * probably resched for lso bufs to become available and then try again. 5774 */ 5775 if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) { 5776 if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) { 5777 dofree = B_TRUE; 5778 rc = B_TRUE; 5779 } else { 5780 dofree = B_FALSE; 5781 rc = B_FALSE; 5782 } 5783 goto ibd_send_fail; 5784 } 5785 node->swqe_im_mblk = mp; 5786 5787 /* 5788 * Queue the wqe to hardware; since we can now simply queue a 5789 * post instead of doing it serially, we cannot assume anything 5790 * about the 'node' after ibd_post_send() returns. 5791 */ 5792 node->swqe_next = NULL; 5793 5794 mutex_enter(&state->id_txpost_lock); 5795 if (state->id_tx_busy) { 5796 if (state->id_tx_head) { 5797 state->id_tx_tail->swqe_next = 5798 SWQE_TO_WQE(node); 5799 } else { 5800 state->id_tx_head = node; 5801 } 5802 state->id_tx_tail = node; 5803 mutex_exit(&state->id_txpost_lock); 5804 } else { 5805 state->id_tx_busy = 1; 5806 mutex_exit(&state->id_txpost_lock); 5807 ibd_post_send(state, node); 5808 } 5809 5810 return (B_TRUE); 5811 5812 ibd_send_fail: 5813 if (node && mp) 5814 ibd_free_lsohdr(node, mp); 5815 5816 if (dofree) 5817 freemsg(mp); 5818 5819 if (node != NULL) 5820 ibd_tx_cleanup(state, node); 5821 5822 return (rc); 5823 } 5824 5825 /* 5826 * GLDv3 entry point for transmitting datagram. 5827 */ 5828 static mblk_t * 5829 ibd_m_tx(void *arg, mblk_t *mp) 5830 { 5831 ibd_state_t *state = (ibd_state_t *)arg; 5832 mblk_t *next; 5833 5834 if (state->id_link_state != LINK_STATE_UP) { 5835 freemsgchain(mp); 5836 mp = NULL; 5837 } 5838 5839 while (mp != NULL) { 5840 next = mp->b_next; 5841 mp->b_next = NULL; 5842 if (ibd_send(state, mp) == B_FALSE) { 5843 /* Send fail */ 5844 mp->b_next = next; 5845 break; 5846 } 5847 mp = next; 5848 } 5849 5850 return (mp); 5851 } 5852 5853 /* 5854 * this handles Tx and Rx completions. With separate CQs, this handles 5855 * only Rx completions. 5856 */ 5857 static uint_t 5858 ibd_intr(caddr_t arg) 5859 { 5860 ibd_state_t *state = (ibd_state_t *)arg; 5861 5862 ibd_poll_rcq(state, state->id_rcq_hdl); 5863 5864 return (DDI_INTR_CLAIMED); 5865 } 5866 5867 /* 5868 * Poll and fully drain the send cq 5869 */ 5870 static void 5871 ibd_drain_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 5872 { 5873 ibt_wc_t *wcs = state->id_txwcs; 5874 uint_t numwcs = state->id_txwcs_size; 5875 ibd_wqe_t *wqe; 5876 ibd_swqe_t *head, *tail; 5877 ibt_wc_t *wc; 5878 uint_t num_polled; 5879 int i; 5880 5881 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) { 5882 head = tail = NULL; 5883 for (i = 0, wc = wcs; i < num_polled; i++, wc++) { 5884 wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id; 5885 ASSERT(wqe->w_type == IBD_WQE_SEND); 5886 if (wc->wc_status != IBT_WC_SUCCESS) { 5887 /* 5888 * Channel being torn down. 5889 */ 5890 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { 5891 DPRINT(5, "ibd_drain_scq: flush error"); 5892 /* 5893 * Only invoke the Tx handler to 5894 * release possibly held resources 5895 * like AH refcount etc. 5896 */ 5897 DPRINT(10, "ibd_drain_scq: Bad " 5898 "status %d", wc->wc_status); 5899 } 5900 return; /* give up. no need to clean up */ 5901 } 5902 /* 5903 * Add this swqe to the list to be cleaned up. 
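 *
 * The swqes are chained through swqe_next into a single head/tail
 * list so that, once this batch of completions has been walked,
 * they can all be released with one ibd_tx_cleanup_list() call
 * rather than one cleanup per completion.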
5904 */ 5905 if (head) 5906 tail->swqe_next = wqe; 5907 else 5908 head = WQE_TO_SWQE(wqe); 5909 tail = WQE_TO_SWQE(wqe); 5910 } 5911 tail->swqe_next = NULL; 5912 ibd_tx_cleanup_list(state, head, tail); 5913 5914 /* 5915 * Resume any blocked transmissions if possible 5916 */ 5917 ibd_resume_transmission(state); 5918 } 5919 } 5920 5921 /* 5922 * Poll and fully drain the receive cq 5923 */ 5924 static void 5925 ibd_drain_rcq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 5926 { 5927 ibt_wc_t *wcs = state->id_rxwcs; 5928 uint_t numwcs = state->id_rxwcs_size; 5929 ibd_wqe_t *wqe; 5930 ibt_wc_t *wc; 5931 uint_t num_polled; 5932 int i; 5933 mblk_t *head, *tail, *mp; 5934 5935 while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) { 5936 head = tail = NULL; 5937 for (i = 0, wc = wcs; i < num_polled; i++, wc++) { 5938 wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id; 5939 ASSERT(wqe->w_type == IBD_WQE_RECV); 5940 if (wc->wc_status != IBT_WC_SUCCESS) { 5941 /* 5942 * Channel being torn down. 5943 */ 5944 if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) { 5945 DPRINT(5, "ibd_drain_rcq: flush error"); 5946 /* 5947 * Do not invoke Rx handler because 5948 * it might add buffers to the Rx pool 5949 * when we are trying to deinitialize. 5950 */ 5951 continue; 5952 } 5953 } 5954 mp = ibd_process_rx(state, WQE_TO_RWQE(wqe), wc); 5955 if (mp == NULL) 5956 continue; 5957 5958 /* 5959 * Add this mp to the list to send to the nw layer. 5960 */ 5961 if (head) 5962 tail->b_next = mp; 5963 else 5964 head = mp; 5965 tail = mp; 5966 } 5967 if (head) 5968 mac_rx(state->id_mh, state->id_rh, head); 5969 } 5970 } 5971 5972 /* 5973 * Common code for interrupt handling as well as for polling 5974 * for all completed wqe's while detaching. 5975 */ 5976 static void 5977 ibd_poll_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl) 5978 { 5979 int flag, redo_flag; 5980 int redo = 1; 5981 5982 flag = IBD_CQ_POLLING; 5983 redo_flag = IBD_REDO_CQ_POLLING; 5984 5985 mutex_enter(&state->id_scq_poll_lock); 5986 if (state->id_scq_poll_busy & flag) { 5987 ibd_print_warn(state, "ibd_poll_scq: multiple polling threads"); 5988 state->id_scq_poll_busy |= redo_flag; 5989 mutex_exit(&state->id_scq_poll_lock); 5990 return; 5991 } 5992 state->id_scq_poll_busy |= flag; 5993 mutex_exit(&state->id_scq_poll_lock); 5994 5995 /* 5996 * In some cases (eg detaching), this code can be invoked on 5997 * any cpu after disabling cq notification (thus no concurrency 5998 * exists). Apart from that, the following applies normally: 5999 * Transmit completion handling could be from any cpu if 6000 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ 6001 * is interrupt driven. 6002 */ 6003 6004 /* 6005 * Poll and drain the CQ 6006 */ 6007 ibd_drain_scq(state, cq_hdl); 6008 6009 /* 6010 * Enable CQ notifications and redrain the cq to catch any 6011 * completions we might have missed after the ibd_drain_scq() 6012 * above and before the ibt_enable_cq_notify() that follows. 6013 * Finally, service any new requests to poll the cq that 6014 * could've come in after the ibt_enable_cq_notify(). 
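 *
 * The redo/redo_flag handshake below handles the last part: if
 * another thread requested polling while we were busy (it set
 * IBD_REDO_CQ_POLLING and returned early), we clear that flag and
 * go around the loop once more instead of dropping its request.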
6015 */ 6016 do { 6017 if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) != 6018 IBT_SUCCESS) { 6019 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); 6020 } 6021 6022 ibd_drain_scq(state, cq_hdl); 6023 6024 mutex_enter(&state->id_scq_poll_lock); 6025 if (state->id_scq_poll_busy & redo_flag) 6026 state->id_scq_poll_busy &= ~redo_flag; 6027 else { 6028 state->id_scq_poll_busy &= ~flag; 6029 redo = 0; 6030 } 6031 mutex_exit(&state->id_scq_poll_lock); 6032 6033 } while (redo); 6034 } 6035 6036 /* 6037 * Common code for interrupt handling as well as for polling 6038 * for all completed wqe's while detaching. 6039 */ 6040 static void 6041 ibd_poll_rcq(ibd_state_t *state, ibt_cq_hdl_t rcq) 6042 { 6043 int flag, redo_flag; 6044 int redo = 1; 6045 6046 flag = IBD_CQ_POLLING; 6047 redo_flag = IBD_REDO_CQ_POLLING; 6048 6049 mutex_enter(&state->id_rcq_poll_lock); 6050 if (state->id_rcq_poll_busy & flag) { 6051 ibd_print_warn(state, "ibd_poll_rcq: multiple polling threads"); 6052 state->id_rcq_poll_busy |= redo_flag; 6053 mutex_exit(&state->id_rcq_poll_lock); 6054 return; 6055 } 6056 state->id_rcq_poll_busy |= flag; 6057 mutex_exit(&state->id_rcq_poll_lock); 6058 6059 /* 6060 * Poll and drain the CQ 6061 */ 6062 ibd_drain_rcq(state, rcq); 6063 6064 /* 6065 * Enable CQ notifications and redrain the cq to catch any 6066 * completions we might have missed after the ibd_drain_cq() 6067 * above and before the ibt_enable_cq_notify() that follows. 6068 * Finally, service any new requests to poll the cq that 6069 * could've come in after the ibt_enable_cq_notify(). 6070 */ 6071 do { 6072 if (ibt_enable_cq_notify(rcq, IBT_NEXT_COMPLETION) != 6073 IBT_SUCCESS) { 6074 DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed"); 6075 } 6076 6077 ibd_drain_rcq(state, rcq); 6078 6079 mutex_enter(&state->id_rcq_poll_lock); 6080 if (state->id_rcq_poll_busy & redo_flag) 6081 state->id_rcq_poll_busy &= ~redo_flag; 6082 else { 6083 state->id_rcq_poll_busy &= ~flag; 6084 redo = 0; 6085 } 6086 mutex_exit(&state->id_rcq_poll_lock); 6087 6088 } while (redo); 6089 } 6090 6091 /* 6092 * Unmap the memory area associated with a given swqe. 6093 */ 6094 static void 6095 ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe) 6096 { 6097 ibt_status_t stat; 6098 6099 DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds); 6100 6101 if (swqe->w_mi_hdl) { 6102 if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl, 6103 swqe->w_mi_hdl)) != IBT_SUCCESS) { 6104 DPRINT(10, 6105 "failed in ibt_unmap_mem_iov, ret=%d\n", stat); 6106 } 6107 swqe->w_mi_hdl = NULL; 6108 } 6109 swqe->w_swr.wr_nds = 0; 6110 } 6111 6112 static void 6113 ibd_dec_ref_ace(ibd_state_t *state, ibd_ace_t *ace) 6114 { 6115 /* 6116 * The recycling logic can be eliminated from here 6117 * and put into the async thread if we create another 6118 * list to hold ACE's for unjoined mcg's. 6119 */ 6120 if (DEC_REF_DO_CYCLE(ace)) { 6121 ibd_mce_t *mce; 6122 6123 /* 6124 * Check with the lock taken: we decremented 6125 * reference count without the lock, and some 6126 * transmitter might already have bumped the 6127 * reference count (possible in case of multicast 6128 * disable when we leave the AH on the active 6129 * list). If not still 0, get out, leaving the 6130 * recycle bit intact. 6131 * 6132 * Atomically transition the AH from active 6133 * to free list, and queue a work request to 6134 * leave the group and destroy the mce. No 6135 * transmitter can be looking at the AH or 6136 * the MCE in between, since we have the 6137 * ac_mutex lock. 
In the SendOnly reap case, 6138 * it is not necessary to hold the ac_mutex 6139 * and recheck the ref count (since the AH was 6140 * taken off the active list), we just do it 6141 * to have uniform processing with the Full 6142 * reap case. 6143 */ 6144 mutex_enter(&state->id_ac_mutex); 6145 mce = ace->ac_mce; 6146 if (GET_REF_CYCLE(ace) == 0) { 6147 CLEAR_REFCYCLE(ace); 6148 /* 6149 * Identify the case of fullmember reap as 6150 * opposed to mcg trap reap. Also, port up 6151 * might set ac_mce to NULL to indicate Tx 6152 * cleanup should do no more than put the 6153 * AH in the free list (see ibd_async_link). 6154 */ 6155 if (mce != NULL) { 6156 ace->ac_mce = NULL; 6157 IBD_ACACHE_PULLOUT_ACTIVE(state, ace); 6158 /* 6159 * mc_req was initialized at mce 6160 * creation time. 6161 */ 6162 ibd_queue_work_slot(state, 6163 &mce->mc_req, IBD_ASYNC_REAP); 6164 } 6165 IBD_ACACHE_INSERT_FREE(state, ace); 6166 } 6167 mutex_exit(&state->id_ac_mutex); 6168 } 6169 } 6170 6171 /* 6172 * Common code that deals with clean ups after a successful or 6173 * erroneous transmission attempt. 6174 */ 6175 static void 6176 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe) 6177 { 6178 ibd_ace_t *ace = swqe->w_ahandle; 6179 6180 DPRINT(20, "ibd_tx_cleanup %p\n", swqe); 6181 6182 /* 6183 * If this was a dynamic mapping in ibd_send(), we need to 6184 * unmap here. If this was an lso buffer we'd used for sending, 6185 * we need to release the lso buf to the pool, since the resource 6186 * is scarce. However, if this was simply a normal send using 6187 * the copybuf (present in each swqe), we don't need to release it. 6188 */ 6189 if (swqe->swqe_im_mblk != NULL) { 6190 if (swqe->w_buftype == IBD_WQE_MAPPED) { 6191 ibd_unmap_mem(state, swqe); 6192 } else if (swqe->w_buftype == IBD_WQE_LSOBUF) { 6193 ibd_release_lsobufs(state, 6194 swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds); 6195 } 6196 ibd_free_lsohdr(swqe, swqe->swqe_im_mblk); 6197 freemsg(swqe->swqe_im_mblk); 6198 swqe->swqe_im_mblk = NULL; 6199 } 6200 6201 /* 6202 * Drop the reference count on the AH; it can be reused 6203 * now for a different destination if there are no more 6204 * posted sends that will use it. This can be eliminated 6205 * if we can always associate each Tx buffer with an AH. 6206 * The ace can be null if we are cleaning up from the 6207 * ibd_send() error path. 6208 */ 6209 if (ace != NULL) { 6210 ibd_dec_ref_ace(state, ace); 6211 } 6212 6213 /* 6214 * Release the send wqe for reuse. 6215 */ 6216 swqe->swqe_next = NULL; 6217 ibd_release_swqe(state, swqe, swqe, 1); 6218 } 6219 6220 static void 6221 ibd_tx_cleanup_list(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail) 6222 { 6223 ibd_ace_t *ace; 6224 ibd_swqe_t *swqe; 6225 int n = 0; 6226 6227 DPRINT(20, "ibd_tx_cleanup_list %p %p\n", head, tail); 6228 6229 for (swqe = head; swqe != NULL; swqe = WQE_TO_SWQE(swqe->swqe_next)) { 6230 6231 /* 6232 * If this was a dynamic mapping in ibd_send(), we need to 6233 * unmap here. If this was an lso buffer we'd used for sending, 6234 * we need to release the lso buf to the pool, since the 6235 * resource is scarce. However, if this was simply a normal 6236 * send using the copybuf (present in each swqe), we don't need 6237 * to release it. 
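 *
 * That is, the w_buftype chosen in ibd_prepare_sgl() dictates the
 * cleanup: IBD_WQE_MAPPED -> ibt_unmap_mem_iov() via ibd_unmap_mem(),
 * IBD_WQE_LSOBUF -> ibd_release_lsobufs(), and IBD_WQE_TXBUF ->
 * nothing beyond freeing the mblk, since the copybuf stays attached
 * to the swqe.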
6238 */ 6239 if (swqe->swqe_im_mblk != NULL) { 6240 if (swqe->w_buftype == IBD_WQE_MAPPED) { 6241 ibd_unmap_mem(state, swqe); 6242 } else if (swqe->w_buftype == IBD_WQE_LSOBUF) { 6243 ibd_release_lsobufs(state, 6244 swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds); 6245 } 6246 ibd_free_lsohdr(swqe, swqe->swqe_im_mblk); 6247 freemsg(swqe->swqe_im_mblk); 6248 swqe->swqe_im_mblk = NULL; 6249 } 6250 6251 /* 6252 * Drop the reference count on the AH; it can be reused 6253 * now for a different destination if there are no more 6254 * posted sends that will use it. This can be eliminated 6255 * if we can always associate each Tx buffer with an AH. 6256 * The ace can be null if we are cleaning up from the 6257 * ibd_send() error path. 6258 */ 6259 ace = swqe->w_ahandle; 6260 if (ace != NULL) { 6261 ibd_dec_ref_ace(state, ace); 6262 } 6263 n++; 6264 } 6265 6266 /* 6267 * Release the send wqes for reuse. 6268 */ 6269 ibd_release_swqe(state, head, tail, n); 6270 } 6271 6272 /* 6273 * Processing to be done after receipt of a packet; hand off to GLD 6274 * in the format expected by GLD. The received packet has this 6275 * format: 2b sap :: 00 :: data. 6276 */ 6277 static mblk_t * 6278 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc) 6279 { 6280 ib_header_info_t *phdr; 6281 mblk_t *mp; 6282 ipoib_hdr_t *ipibp; 6283 ipha_t *iphap; 6284 ip6_t *ip6h; 6285 int len; 6286 ib_msglen_t pkt_len = wc->wc_bytes_xfer; 6287 uint32_t bufs; 6288 6289 atomic_add_32(&state->id_rx_list.dl_cnt, -1); 6290 6291 /* 6292 * Track number handed to upper layer, and number still 6293 * available to receive packets. 6294 */ 6295 bufs = atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 1); 6296 6297 /* Never run out of rwqes, use allocb when running low */ 6298 if (bufs >= state->id_rx_bufs_outstanding_limit) { 6299 atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, -1); 6300 atomic_inc_32(&state->id_rx_allocb); 6301 mp = allocb(pkt_len, BPRI_HI); 6302 if (mp) { 6303 bcopy(rwqe->rwqe_im_mblk->b_rptr, mp->b_rptr, pkt_len); 6304 ibd_post_recv(state, rwqe); 6305 } else { /* no memory */ 6306 atomic_inc_32(&state->id_rx_allocb_failed); 6307 ibd_post_recv(state, rwqe); 6308 return (NULL); 6309 } 6310 } else { 6311 mp = rwqe->rwqe_im_mblk; 6312 } 6313 6314 6315 /* 6316 * Adjust write pointer depending on how much data came in. 6317 */ 6318 mp->b_wptr = mp->b_rptr + pkt_len; 6319 6320 /* 6321 * Make sure this is NULL or we're in trouble. 6322 */ 6323 if (mp->b_next != NULL) { 6324 ibd_print_warn(state, 6325 "ibd_process_rx: got duplicate mp from rcq?"); 6326 mp->b_next = NULL; 6327 } 6328 6329 /* 6330 * the IB link will deliver one of the IB link layer 6331 * headers called, the Global Routing Header (GRH). 6332 * ibd driver uses the information in GRH to build the 6333 * Header_info structure and pass it with the datagram up 6334 * to GLDv3. 6335 * If the GRH is not valid, indicate to GLDv3 by setting 6336 * the VerTcFlow field to 0. 6337 */ 6338 phdr = (ib_header_info_t *)mp->b_rptr; 6339 if (wc->wc_flags & IBT_WC_GRH_PRESENT) { 6340 phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn); 6341 6342 /* if it is loop back packet, just drop it. 
*/ 6343 if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr, 6344 IPOIB_ADDRL) == 0) { 6345 freemsg(mp); 6346 return (NULL); 6347 } 6348 6349 ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src, 6350 sizeof (ipoib_mac_t)); 6351 if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) { 6352 phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN); 6353 IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst); 6354 } else { 6355 phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn; 6356 } 6357 } else { 6358 /* 6359 * It cannot be an IBA multicast packet; it must have been 6360 * unicast to us. Just copy the interface address to dst. 6361 */ 6362 phdr->ib_grh.ipoib_vertcflow = 0; 6363 ovbcopy(&state->id_macaddr, &phdr->ib_dst, 6364 sizeof (ipoib_mac_t)); 6365 } 6366 6367 /* 6368 * For ND6 packets, padding is at the front of the source/target 6369 * lladdr. However, the inet6 layer is not aware of it, so remove 6370 * the padding from such packets. 6371 */ 6372 ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t)); 6373 if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) { 6374 ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t)); 6375 len = ntohs(ip6h->ip6_plen); 6376 if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { 6377 /* LINTED: E_CONSTANT_CONDITION */ 6378 IBD_PAD_NSNA(ip6h, len, IBD_RECV); 6379 } 6380 } 6381 6382 /* 6383 * Update statistics 6384 */ 6385 atomic_add_64(&state->id_rcv_bytes, pkt_len); 6386 atomic_inc_64(&state->id_rcv_pkt); 6387 if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0) 6388 atomic_inc_64(&state->id_brd_rcv); 6389 else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN) 6390 atomic_inc_64(&state->id_multi_rcv); 6391 6392 iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t)); 6393 /* 6394 * Set receive checksum status in mp. 6395 * Hardware checksumming can be considered valid only if: 6396 * 1. CQE.IP_OK bit is set 6397 * 2. CQE.CKSUM = 0xffff 6398 * 3. IPv6 routing header is not present in the packet 6399 * 4. There are no IP options in the IP header 6400 */ 6401 6402 if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) && 6403 (wc->wc_cksum == 0xFFFF) && 6404 (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) { 6405 (void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0, 6406 HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0); 6407 } 6408 6409 return (mp); 6410 } 6411 6412 /* 6413 * Callback code invoked from STREAMS when the receive data buffer is 6414 * free for recycling. 6415 */ 6416 static void 6417 ibd_freemsg_cb(char *arg) 6418 { 6419 ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg; 6420 ibd_state_t *state = rwqe->w_state; 6421 6422 /* 6423 * If the wqe is being destroyed, do not attempt recycling.
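 *
 * Otherwise the rwqe is re-armed below: desballoc() wraps the same
 * pre-mapped copy buffer in a fresh mblk (registering this callback
 * again via w_freemsg_cb) and the rwqe is posted back to the receive
 * queue, or freed if desballoc() fails.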
6424 */ 6425 if (rwqe->w_freeing_wqe == B_TRUE) { 6426 DPRINT(6, "ibd_freemsg: wqe being freed"); 6427 return; 6428 } 6429 6430 rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr, 6431 state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb); 6432 if (rwqe->rwqe_im_mblk == NULL) { 6433 ibd_free_rwqe(state, rwqe); 6434 DPRINT(6, "ibd_freemsg: desballoc failed"); 6435 return; 6436 } 6437 6438 ibd_post_recv(state, rwqe); 6439 6440 atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, -1); 6441 } 6442 6443 static uint_t 6444 ibd_tx_recycle(caddr_t arg) 6445 { 6446 ibd_state_t *state = (ibd_state_t *)arg; 6447 6448 /* 6449 * Poll for completed entries 6450 */ 6451 ibd_poll_scq(state, state->id_scq_hdl); 6452 6453 return (DDI_INTR_CLAIMED); 6454 } 6455 6456 #ifdef IBD_LOGGING 6457 static void 6458 ibd_log_init(void) 6459 { 6460 ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP); 6461 ibd_lbuf_ndx = 0; 6462 6463 mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL); 6464 } 6465 6466 static void 6467 ibd_log_fini(void) 6468 { 6469 if (ibd_lbuf) 6470 kmem_free(ibd_lbuf, IBD_LOG_SZ); 6471 ibd_lbuf_ndx = 0; 6472 ibd_lbuf = NULL; 6473 6474 mutex_destroy(&ibd_lbuf_lock); 6475 } 6476 6477 static void 6478 ibd_log(const char *fmt, ...) 6479 { 6480 va_list ap; 6481 uint32_t off; 6482 uint32_t msglen; 6483 char tmpbuf[IBD_DMAX_LINE]; 6484 6485 if (ibd_lbuf == NULL) 6486 return; 6487 6488 va_start(ap, fmt); 6489 msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap); 6490 va_end(ap); 6491 6492 if (msglen >= IBD_DMAX_LINE) 6493 msglen = IBD_DMAX_LINE - 1; 6494 6495 mutex_enter(&ibd_lbuf_lock); 6496 6497 off = ibd_lbuf_ndx; /* current msg should go here */ 6498 if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n')) 6499 ibd_lbuf[ibd_lbuf_ndx-1] = '\n'; 6500 6501 ibd_lbuf_ndx += msglen; /* place where next msg should start */ 6502 ibd_lbuf[ibd_lbuf_ndx] = 0; /* current msg should terminate */ 6503 6504 if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE)) 6505 ibd_lbuf_ndx = 0; 6506 6507 mutex_exit(&ibd_lbuf_lock); 6508 6509 bcopy(tmpbuf, ibd_lbuf+off, msglen); /* no lock needed for this */ 6510 } 6511 #endif 6512
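/*
 * Usage note (illustrative): with IBD_LOGGING defined, ibd_log()
 * acts as a bounded, in-memory trace facility, e.g.
 *
 *	ibd_log("ibd_post_send: posted %d wrs", n_wrs);
 *
 * Each message is formatted into a local buffer (truncated to at most
 * IBD_DMAX_LINE - 1 bytes) and appended to the circular ibd_lbuf; the
 * write index resets to zero once it crosses
 * IBD_LOG_SZ - 2 * IBD_DMAX_LINE.
 */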