xref: /titanic_51/usr/src/uts/common/io/ib/clients/eoib/eib_ibt.c (revision b494511a9cf72b1fc4eb13a0e593f55c624ab829)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/kmem.h>
28 #include <sys/conf.h>
29 #include <sys/ddi.h>
30 #include <sys/sunddi.h>
31 #include <sys/ksynch.h>
32 #include <sys/dlpi.h>			/* HCKSUM_INET_FULL_V4 */
33 #include <sys/pattr.h>			/* HCK_FULLCKSUM */
34 #include <sys/ib/mgt/sm_attr.h>		/* SM_INIT_TYPE_REPLY_... */
35 
36 #include <sys/ib/clients/eoib/eib_impl.h>
37 
38 /*
39  * Declarations private to this file
40  */
41 static void eib_ibt_reset_partitions(eib_t *);
42 static void eib_ibt_wakeup_sqd_waiters(eib_t *, ibt_channel_hdl_t);
43 static int eib_ibt_chan_pkey(eib_t *, eib_chan_t *, ib_pkey_t, boolean_t,
44     boolean_t *);
45 static boolean_t eib_ibt_has_chan_pkey_changed(eib_t *, eib_chan_t *);
46 static boolean_t eib_ibt_has_any_pkey_changed(eib_t *);
47 static int eib_ibt_fill_avect(eib_t *, eib_avect_t *, ib_lid_t);
48 static void eib_ibt_record_srate(eib_t *);
49 
50 /*
51  * Definitions private to this file
52  */
53 
54 /*
55  * SM's init type reply flags
56  */
/*
 * Each predicate is true when the corresponding bit is clear in the
 * init type reply value (itr) passed in.
 */
#define	EIB_PORT_ATTR_LOADED(itr)				\
	(!((itr) & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY))
#define	EIB_PORT_ATTR_NOT_PRESERVED(itr)			\
	(!((itr) & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY))
#define	EIB_PORT_PRES_NOT_PRESERVED(itr)			\
	(!((itr) & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY))
63 
64 /*
65  * eib_ibt_hca_init() initialization progress flags
66  */
/* One bit per completed step; the bits are OR'ed into a progress mask */
#define	EIB_HCAINIT_HCA_OPENED		(1 << 0)
#define	EIB_HCAINIT_ATTRS_ALLOCD	(1 << 1)
#define	EIB_HCAINIT_HCA_PORTS_QUERIED	(1 << 2)
#define	EIB_HCAINIT_PD_ALLOCD		(1 << 3)
#define	EIB_HCAINIT_CAPAB_RECORDED	(1 << 4)
72 
73 int
74 eib_ibt_hca_init(eib_t *ss)
75 {
76 	ibt_status_t ret;
77 	ibt_hca_portinfo_t *pi;
78 	uint_t num_pi;
79 	uint_t sz_pi;
80 	uint_t progress = 0;
81 
82 	if (ss->ei_hca_hdl)
83 		return (EIB_E_SUCCESS);
84 
85 	/*
86 	 * Open the HCA
87 	 */
88 	ret = ibt_open_hca(ss->ei_ibt_hdl, ss->ei_props->ep_hca_guid,
89 	    &ss->ei_hca_hdl);
90 	if (ret != IBT_SUCCESS) {
91 		EIB_DPRINTF_ERR(ss->ei_instance,
92 		    "ibt_open_hca(hca_guid=0x%llx) "
93 		    "failed, ret=%d", ss->ei_props->ep_hca_guid, ret);
94 		goto ibt_hca_init_fail;
95 	}
96 	progress |= EIB_HCAINIT_HCA_OPENED;
97 
98 	/*
99 	 * Query and store HCA attributes
100 	 */
101 	ss->ei_hca_attrs = kmem_zalloc(sizeof (ibt_hca_attr_t), KM_SLEEP);
102 	progress |= EIB_HCAINIT_ATTRS_ALLOCD;
103 
104 	ret = ibt_query_hca(ss->ei_hca_hdl, ss->ei_hca_attrs);
105 	if (ret != IBT_SUCCESS) {
106 		EIB_DPRINTF_ERR(ss->ei_instance,
107 		    "ibt_query_hca(hca_hdl=0x%llx, "
108 		    "hca_guid=0x%llx) failed, ret=%d",
109 		    ss->ei_hca_hdl, ss->ei_props->ep_hca_guid, ret);
110 		goto ibt_hca_init_fail;
111 	}
112 
113 	/*
114 	 * At this point, we don't even care about the linkstate, we only want
115 	 * to record our invariant base port guid and mtu
116 	 */
117 	ret = ibt_query_hca_ports(ss->ei_hca_hdl, ss->ei_props->ep_port_num,
118 	    &pi, &num_pi, &sz_pi);
119 	if (ret != IBT_SUCCESS) {
120 		EIB_DPRINTF_ERR(ss->ei_instance,
121 		    "ibt_query_hca_ports(hca_hdl=0x%llx, "
122 		    "port=0x%x) failed, ret=%d", ss->ei_hca_hdl,
123 		    ss->ei_props->ep_port_num, ret);
124 		goto ibt_hca_init_fail;
125 	}
126 	if (num_pi != 1) {
127 		EIB_DPRINTF_ERR(ss->ei_instance,
128 		    "ibt_query_hca_ports(hca_hdl=0x%llx, "
129 		    "port=0x%x) returned num_pi=%d", ss->ei_hca_hdl,
130 		    ss->ei_props->ep_port_num, num_pi);
131 		ibt_free_portinfo(pi, sz_pi);
132 		goto ibt_hca_init_fail;
133 	}
134 
135 	ss->ei_props->ep_sgid = pi->p_sgid_tbl[0];
136 	ss->ei_props->ep_mtu = (128 << pi->p_mtu);
137 	ibt_free_portinfo(pi, sz_pi);
138 
139 	progress |= EIB_HCAINIT_HCA_PORTS_QUERIED;
140 
141 	/*
142 	 * Allocate a protection domain for all our transactions
143 	 */
144 	ret = ibt_alloc_pd(ss->ei_hca_hdl, IBT_PD_NO_FLAGS, &ss->ei_pd_hdl);
145 	if (ret != IBT_SUCCESS) {
146 		EIB_DPRINTF_ERR(ss->ei_instance,
147 		    "ibt_alloc_pd(hca_hdl=0x%llx, "
148 		    "hca_guid=0x%llx) failed, ret=%d",
149 		    ss->ei_hca_hdl, ss->ei_props->ep_hca_guid, ret);
150 		goto ibt_hca_init_fail;
151 	}
152 	progress |= EIB_HCAINIT_PD_ALLOCD;
153 
154 	/*
155 	 * Finally, record the capabilities
156 	 */
157 	ss->ei_caps = kmem_zalloc(sizeof (eib_caps_t), KM_SLEEP);
158 	eib_ibt_record_capab(ss, ss->ei_hca_attrs, ss->ei_caps);
159 	eib_ibt_record_srate(ss);
160 
161 	progress |= EIB_HCAINIT_CAPAB_RECORDED;
162 
163 	return (EIB_E_SUCCESS);
164 
165 ibt_hca_init_fail:
166 	eib_rb_ibt_hca_init(ss, progress);
167 	return (EIB_E_FAILURE);
168 }
169 
170 void
171 eib_ibt_link_mod(eib_t *ss)
172 {
173 	eib_node_state_t *ns = ss->ei_node_state;
174 	ibt_hca_portinfo_t *pi;
175 	ibt_status_t ret;
176 	uint8_t vn0_mac[ETHERADDRL];
177 	boolean_t all_zombies = B_FALSE;
178 	boolean_t all_need_rejoin = B_FALSE;
179 	uint_t num_pi;
180 	uint_t sz_pi;
181 	uint8_t itr;
182 
183 	if (ns->ns_link_state == LINK_STATE_UNKNOWN)
184 		return;
185 
186 	/*
187 	 * See if we can get the port attributes or we're as good as down.
188 	 */
189 	ret = ibt_query_hca_ports(ss->ei_hca_hdl, ss->ei_props->ep_port_num,
190 	    &pi, &num_pi, &sz_pi);
191 	if ((ret != IBT_SUCCESS) || (pi->p_linkstate != IBT_PORT_ACTIVE)) {
192 		ibt_free_portinfo(pi, sz_pi);
193 		eib_mac_link_down(ss, B_FALSE);
194 		return;
195 	}
196 
197 	/*
198 	 * If the SM re-initialized the port attributes, but did not preserve
199 	 * the old attributes, we need to check more.
200 	 */
201 	itr = pi->p_init_type_reply;
202 	if (EIB_PORT_ATTR_LOADED(itr) && EIB_PORT_ATTR_NOT_PRESERVED(itr)) {
203 		/*
204 		 * We're just coming back up; if we see that our base lid
205 		 * or sgid table has changed, we'll update these and try to
206 		 * restart all active vnics. If any of the vnic pkeys have
207 		 * changed, we'll reset the affected channels to the new pkey.
208 		 */
209 		if (bcmp(pi->p_sgid_tbl, &ss->ei_props->ep_sgid,
210 		    sizeof (ib_gid_t)) != 0) {
211 			EIB_DPRINTF_VERBOSE(ss->ei_instance,
212 			    "eib_ibt_link_mod: port sgid table changed "
213 			    "(old %llx.%llx != new %llx.%llx), "
214 			    "all vnics are zombies now.",
215 			    ss->ei_props->ep_sgid.gid_prefix,
216 			    ss->ei_props->ep_sgid.gid_guid,
217 			    pi->p_sgid_tbl[0].gid_prefix,
218 			    pi->p_sgid_tbl[0].gid_guid);
219 
220 			ss->ei_props->ep_sgid = pi->p_sgid_tbl[0];
221 			all_zombies = B_TRUE;
222 
223 		} else if (ss->ei_props->ep_blid != pi->p_base_lid) {
224 			EIB_DPRINTF_VERBOSE(ss->ei_instance,
225 			    "eib_ibt_link_mod: port base lid changed "
226 			    "(old 0x%x != new 0x%x), "
227 			    "all vnics are zombies now.",
228 			    ss->ei_props->ep_blid, pi->p_base_lid);
229 
230 			ss->ei_props->ep_blid = pi->p_base_lid;
231 			all_zombies = B_TRUE;
232 
233 		} else if (eib_ibt_has_any_pkey_changed(ss)) {
234 			EIB_DPRINTF_VERBOSE(ss->ei_instance,
235 			    "eib_ibt_link_mod: pkey has changed for vnic(s), "
236 			    "resetting all partitions");
237 
238 			eib_ibt_reset_partitions(ss);
239 		}
240 	}
241 
242 	if (pi) {
243 		ibt_free_portinfo(pi, sz_pi);
244 	}
245 
246 	/*
247 	 * If the SM hasn't preserved our presence in MCGs, we need to
248 	 * rejoin all of them.
249 	 */
250 	if (EIB_PORT_PRES_NOT_PRESERVED(itr)) {
251 		EIB_DPRINTF_VERBOSE(ss->ei_instance, "eib_ibt_link_mod: "
252 		    "hca_guid=0x%llx, port=0x%x presence not preserved in SM, "
253 		    "rejoining all mcgs", ss->ei_props->ep_hca_guid,
254 		    ss->ei_props->ep_port_num);
255 
256 		all_need_rejoin = B_TRUE;
257 	}
258 
259 	/*
260 	 * Before we do the actual work of restarting/rejoining, we need to
261 	 * see if the GW is reachable at this point of time.  If not, we
262 	 * still continue to keep our link "down."  Whenever the GW becomes
263 	 * reachable again, we'll restart/rejoin all the vnics that we've
264 	 * just marked.
265 	 */
266 	mutex_enter(&ss->ei_vnic_lock);
267 	if (all_zombies) {
268 		ss->ei_zombie_vnics = ss->ei_active_vnics;
269 	}
270 	if (all_need_rejoin) {
271 		ss->ei_rejoin_vnics = ss->ei_active_vnics;
272 	}
273 	if (ss->ei_gw_unreachable) {
274 		mutex_exit(&ss->ei_vnic_lock);
275 
276 		EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_link_mod: "
277 		    "gateway (gw_port=0x%x) unreachable for "
278 		    "hca_guid=0x%llx, port=0x%x, link state down",
279 		    ss->ei_gw_props->pp_gw_portid, ss->ei_props->ep_hca_guid,
280 		    ss->ei_props->ep_port_num);
281 
282 		eib_mac_link_down(ss, B_FALSE);
283 		return;
284 	}
285 	mutex_exit(&ss->ei_vnic_lock);
286 
287 	/*
288 	 * Try to awaken the dead if possible
289 	 */
290 	bcopy(eib_zero_mac, vn0_mac, ETHERADDRL);
291 	if (all_zombies) {
292 		EIB_DPRINTF_VERBOSE(ss->ei_instance, "eib_ibt_link_mod: "
293 		    "hca_guid=0x%llx, hca_port=0x%x, gw_port=0x%x, "
294 		    "attempting to resurrect zombies",
295 		    ss->ei_props->ep_hca_guid, ss->ei_props->ep_port_num,
296 		    ss->ei_gw_props->pp_gw_portid);
297 
298 		eib_vnic_resurrect_zombies(ss, vn0_mac);
299 	}
300 
301 	/*
302 	 * Re-join the mcgs if we need to
303 	 */
304 	if (all_need_rejoin) {
305 		EIB_DPRINTF_VERBOSE(ss->ei_instance, "eib_ibt_link_mod: "
306 		    "hca_guid=0x%llx, hca_port=0x%x, gw_port=0x%x, "
307 		    "attempting to rejoin mcgs",
308 		    ss->ei_props->ep_hca_guid, ss->ei_props->ep_port_num,
309 		    ss->ei_gw_props->pp_gw_portid);
310 
311 		eib_vnic_rejoin_mcgs(ss);
312 	}
313 
314 	/*
315 	 * If we've restarted the zombies because the gateway went down and
316 	 * came back, it is possible our unicast mac address changed from
317 	 * what it was earlier. If so, we need to update our unicast address
318 	 * with the mac layer before marking the link up.
319 	 */
320 	if (bcmp(vn0_mac, eib_zero_mac, ETHERADDRL) != 0)
321 		mac_unicst_update(ss->ei_mac_hdl, vn0_mac);
322 
323 	/*
324 	 * Notify the link state up if required
325 	 */
326 	eib_mac_link_up(ss, B_FALSE);
327 }
328 
329 int
330 eib_ibt_modify_chan_pkey(eib_t *ss, eib_chan_t *chan, ib_pkey_t pkey)
331 {
332 	/*
333 	 * Make sure the channel pkey and index are set to what we need
334 	 */
335 	return (eib_ibt_chan_pkey(ss, chan, pkey, B_TRUE, NULL));
336 }
337 
338 eib_avect_t *
339 eib_ibt_hold_avect(eib_t *ss, ib_lid_t dlid, uint8_t sl)
340 {
341 	uint_t ndx = dlid % EIB_AV_NBUCKETS;	/* simple hashing */
342 	eib_avect_t *av;
343 	eib_avect_t *prev;
344 	int ret;
345 
346 	mutex_enter(&ss->ei_av_lock);
347 
348 	/*
349 	 * See if we have the address vector
350 	 */
351 	prev = NULL;
352 	for (av = ss->ei_av[ndx]; av; av = av->av_next) {
353 		prev = av;
354 		if ((av->av_vect).av_dlid == dlid)
355 			break;
356 	}
357 
358 	/*
359 	 * If we don't have it, create a new one and chain it to
360 	 * the same bucket
361 	 */
362 	if (av == NULL) {
363 		av = kmem_zalloc(sizeof (eib_avect_t), KM_NOSLEEP);
364 		if (av == NULL) {
365 			mutex_exit(&ss->ei_av_lock);
366 			EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_hold_avect: "
367 			    "no memory, could not allocate address vector");
368 			return (NULL);
369 		}
370 
371 		ret = EIB_E_FAILURE;
372 		if (!eib_wa_no_av_discover)
373 			ret = eib_ibt_fill_avect(ss, av, dlid);
374 
375 		if (ret != EIB_E_SUCCESS) {
376 			(av->av_vect).av_srate = IBT_SRATE_10;
377 			(av->av_vect).av_srvl = sl;
378 			(av->av_vect).av_port_num = ss->ei_props->ep_port_num;
379 			(av->av_vect).av_send_grh = B_FALSE;
380 			(av->av_vect).av_dlid = dlid;
381 			(av->av_vect).av_src_path = 0;	/* we use base lid */
382 		}
383 
384 		if (prev)
385 			prev->av_next = av;
386 		else
387 			ss->ei_av[ndx] = av;
388 	}
389 
390 	/*
391 	 * Increment the address vector reference count before returning
392 	 */
393 	(av->av_ref)++;
394 
395 	mutex_exit(&ss->ei_av_lock);
396 
397 	return (av);
398 }
399 
400 static int
401 eib_ibt_fill_avect(eib_t *ss, eib_avect_t *av, ib_lid_t dlid)
402 {
403 	ibt_node_info_t ni;
404 	ibt_path_attr_t attr;
405 	ibt_path_info_t path;
406 	ibt_status_t ret;
407 	ib_gid_t dgid;
408 
409 	if ((ret = ibt_lid_to_node_info(dlid, &ni)) != IBT_SUCCESS) {
410 		EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_fill_avect: "
411 		    "ibt_lid_to_node_info(dlid=0x%x) failed, ret=%d",
412 		    dlid, ret);
413 		return (EIB_E_FAILURE);
414 	}
415 	dgid.gid_prefix = ss->ei_gw_props->pp_gw_sn_prefix;
416 	dgid.gid_guid = ni.n_port_guid;
417 
418 	/*
419 	 * Get the reversible path information for this destination
420 	 */
421 	bzero(&attr, sizeof (ibt_path_info_t));
422 	attr.pa_sgid = ss->ei_props->ep_sgid;
423 	attr.pa_dgids = &dgid;
424 	attr.pa_num_dgids = 1;
425 
426 	bzero(&path, sizeof (ibt_path_info_t));
427 	ret = ibt_get_paths(ss->ei_ibt_hdl, IBT_PATH_NO_FLAGS,
428 	    &attr, 1, &path, NULL);
429 	if ((ret != IBT_SUCCESS) || (path.pi_hca_guid == 0)) {
430 		EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_fill_avect: "
431 		    "ibt_get_paths(dgid=%llx.%llx) failed, ret=%d",
432 		    dgid.gid_prefix, dgid.gid_guid);
433 		return (EIB_E_FAILURE);
434 	}
435 
436 	/*
437 	 * Fill in the address vector
438 	 */
439 	bcopy(&path.pi_prim_cep_path.cep_adds_vect, &av->av_vect,
440 	    sizeof (ibt_adds_vect_t));
441 
442 	return (EIB_E_SUCCESS);
443 }
444 
445 void
446 eib_ibt_release_avect(eib_t *ss, eib_avect_t *av)
447 {
448 	mutex_enter(&ss->ei_av_lock);
449 
450 	ASSERT(av->av_ref > 0);
451 	(av->av_ref)--;
452 
453 	mutex_exit(&ss->ei_av_lock);
454 }
455 
456 void
457 eib_ibt_free_avects(eib_t *ss)
458 {
459 	eib_avect_t *av;
460 	eib_avect_t *av_next;
461 	int ndx;
462 
463 	mutex_enter(&ss->ei_av_lock);
464 	for (ndx = 0; ndx < EIB_AV_NBUCKETS; ndx++) {
465 		for (av = ss->ei_av[ndx]; av; av = av_next) {
466 			av_next = av->av_next;
467 
468 			ASSERT(av->av_ref == 0);
469 			kmem_free(av, sizeof (eib_avect_t));
470 		}
471 		ss->ei_av[ndx] = NULL;
472 	}
473 	mutex_exit(&ss->ei_av_lock);
474 }
475 
476 /*ARGSUSED*/
477 void
478 eib_ibt_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
479     ibt_async_code_t code, ibt_async_event_t *event)
480 {
481 	eib_t *ss = (eib_t *)clnt_private;
482 	eib_event_t *evi;
483 	uint_t ev_code;
484 
485 	ev_code = EIB_EV_NONE;
486 
487 	switch (code) {
488 	case IBT_EVENT_SQD:
489 		EIB_DPRINTF_VERBOSE(ss->ei_instance,
490 		    "eib_ibt_async_handler: got IBT_EVENT_SQD");
491 		eib_ibt_wakeup_sqd_waiters(ss, event->ev_chan_hdl);
492 		break;
493 
494 	case IBT_EVENT_PORT_UP:
495 		if (event->ev_port == ss->ei_props->ep_port_num) {
496 			EIB_DPRINTF_VERBOSE(ss->ei_instance,
497 			    "eib_ibt_async_handler: got IBT_EVENT_PORT_UP");
498 			ev_code = EIB_EV_PORT_UP;
499 		}
500 		break;
501 
502 	case IBT_ERROR_PORT_DOWN:
503 		if (event->ev_port == ss->ei_props->ep_port_num) {
504 			EIB_DPRINTF_VERBOSE(ss->ei_instance,
505 			    "eib_ibt_async_handler: got IBT_ERROR_PORT_DOWN");
506 			ev_code = EIB_EV_PORT_DOWN;
507 		}
508 		break;
509 
510 	case IBT_CLNT_REREG_EVENT:
511 		if (event->ev_port == ss->ei_props->ep_port_num) {
512 			EIB_DPRINTF_VERBOSE(ss->ei_instance,
513 			    "eib_ibt_async_handler: got IBT_CLNT_REREG_EVENT");
514 			ev_code = EIB_EV_CLNT_REREG;
515 		}
516 		break;
517 
518 	case IBT_PORT_CHANGE_EVENT:
519 		if ((event->ev_port == ss->ei_props->ep_port_num) &&
520 		    (event->ev_port_flags & IBT_PORT_CHANGE_PKEY)) {
521 			EIB_DPRINTF_VERBOSE(ss->ei_instance,
522 			    "eib_ibt_async_handler: "
523 			    "got IBT_PORT_CHANGE_EVENT(PKEY_CHANGE)");
524 			ev_code = EIB_EV_PKEY_CHANGE;
525 		} else if ((event->ev_port == ss->ei_props->ep_port_num) &&
526 		    (event->ev_port_flags & IBT_PORT_CHANGE_SGID)) {
527 			EIB_DPRINTF_VERBOSE(ss->ei_instance,
528 			    "eib_ibt_async_handler: "
529 			    "got IBT_PORT_CHANGE_EVENT(SGID_CHANGE)");
530 			ev_code = EIB_EV_SGID_CHANGE;
531 		}
532 		break;
533 
534 	case IBT_HCA_ATTACH_EVENT:
535 		/*
536 		 * For HCA attach, after a new HCA is plugged in and
537 		 * configured using cfgadm, an explicit plumb will need
538 		 * to be run, so we don't need to do anything here.
539 		 */
540 		EIB_DPRINTF_VERBOSE(ss->ei_instance, "eib_ibt_async_handler: "
541 		    "got IBT_HCA_ATTACH_EVENT");
542 		break;
543 
544 	case IBT_HCA_DETACH_EVENT:
545 		/*
546 		 * Before an HCA unplug, cfgadm is expected to trigger
547 		 * any rcm scripts to unplumb the EoIB instances on the
548 		 * card. If so, we should not be holding any hca resource,
549 		 * since we don't do ibt_open_hca() until plumb time. However,
550 		 * if an earlier unplumb hadn't cleaned up the hca resources
551 		 * properly because the network layer hadn't returned the
552 		 * buffers at that time, we could be holding hca resources.
553 		 * We'll try to release them here, and protect the code from
554 		 * racing with some other plumb/unplumb operation.
555 		 */
556 		EIB_DPRINTF_VERBOSE(ss->ei_instance, "eib_ibt_async_handler: "
557 		    "got IBT_HCA_DETACH_EVENT");
558 
559 		eib_mac_set_nic_state(ss, EIB_NIC_STOPPING);
560 		eib_rb_rsrc_setup_bufs(ss, B_FALSE);
561 		if (ss->ei_tx || ss->ei_rx || ss->ei_lso) {
562 			EIB_DPRINTF_WARN(ss->ei_instance,
563 			    "eib_events_handler: nw layer still holding "
564 			    "hca resources, could not detach HCA");
565 		} else if (ss->ei_hca_hdl) {
566 			eib_rb_ibt_hca_init(ss, ~0);
567 		}
568 		eib_mac_clr_nic_state(ss, EIB_NIC_STOPPING);
569 
570 		break;
571 	}
572 
573 	if (ev_code != EIB_EV_NONE) {
574 		evi = kmem_zalloc(sizeof (eib_event_t), KM_NOSLEEP);
575 		if (evi == NULL) {
576 			EIB_DPRINTF_WARN(ss->ei_instance,
577 			    "eib_ibt_async_handler: "
578 			    "no memory, could not handle event 0x%lx", ev_code);
579 		} else {
580 			evi->ev_code = ev_code;
581 			evi->ev_arg = NULL;
582 			eib_svc_enqueue_event(ss, evi);
583 		}
584 	}
585 }
586 
587 /*ARGSUSED*/
588 void
589 eib_ibt_record_capab(eib_t *ss, ibt_hca_attr_t *hca_attrs, eib_caps_t *caps)
590 {
591 	uint_t max_swqe = EIB_DATA_MAX_SWQE;
592 	uint_t max_rwqe = EIB_DATA_MAX_RWQE;
593 
594 	/*
595 	 * Checksum
596 	 */
597 	caps->cp_cksum_flags = 0;
598 	if ((!eib_wa_no_cksum_offload) &&
599 	    (hca_attrs->hca_flags & IBT_HCA_CKSUM_FULL)) {
600 		caps->cp_cksum_flags =
601 		    HCK_FULLCKSUM | HCKSUM_INET_FULL_V4;
602 		    /* HCKSUM_INET_FULL_V4 | HCKSUM_IPHDRCKSUM; */
603 	}
604 
605 	/*
606 	 * Reserved L-Key
607 	 */
608 	if (hca_attrs->hca_flags2 & IBT_HCA2_RES_LKEY) {
609 		caps->cp_resv_lkey_capab = 1;
610 		caps->cp_resv_lkey = hca_attrs->hca_reserved_lkey;
611 	}
612 
613 	/*
614 	 * LSO
615 	 */
616 	caps->cp_lso_maxlen = 0;
617 	if (!eib_wa_no_lso) {
618 		if (hca_attrs->hca_max_lso_size > EIB_LSO_MAXLEN) {
619 			caps->cp_lso_maxlen = EIB_LSO_MAXLEN;
620 		} else {
621 			caps->cp_lso_maxlen = hca_attrs->hca_max_lso_size;
622 		}
623 	}
624 
625 	/*
626 	 * SGL
627 	 *
628 	 * Translating virtual address regions into physical regions
629 	 * for using the Reserved LKey feature results in a wr sgl that
630 	 * is a little longer. Since failing ibt_map_mem_iov() is costly,
631 	 * we'll record a high-water mark (65%) when we should stop
632 	 * trying to use Reserved LKey
633 	 */
634 	if (hca_attrs->hca_flags & IBT_HCA_WQE_SIZE_INFO) {
635 		caps->cp_max_sgl = hca_attrs->hca_ud_send_sgl_sz;
636 	} else {
637 		caps->cp_max_sgl = hca_attrs->hca_max_sgl;
638 	}
639 	if (caps->cp_max_sgl > EIB_MAX_SGL) {
640 		caps->cp_max_sgl = EIB_MAX_SGL;
641 	}
642 	caps->cp_hiwm_sgl = (caps->cp_max_sgl * 65) / 100;
643 
644 	/*
645 	 * SWQE/RWQE: meet max chan size and max cq size limits (leave room
646 	 * to avoid cq overflow event)
647 	 */
648 	if (max_swqe > hca_attrs->hca_max_chan_sz)
649 		max_swqe = hca_attrs->hca_max_chan_sz;
650 	if (max_swqe > (hca_attrs->hca_max_cq_sz - 1))
651 		max_swqe = hca_attrs->hca_max_cq_sz - 1;
652 	caps->cp_max_swqe = max_swqe;
653 
654 	if (max_rwqe > hca_attrs->hca_max_chan_sz)
655 		max_rwqe = hca_attrs->hca_max_chan_sz;
656 	if (max_rwqe > (hca_attrs->hca_max_cq_sz - 1))
657 		max_rwqe = hca_attrs->hca_max_cq_sz - 1;
658 	caps->cp_max_rwqe = max_rwqe;
659 }
660 
661 void
662 eib_rb_ibt_hca_init(eib_t *ss, uint_t progress)
663 {
664 	ibt_status_t ret;
665 
666 	if (progress & EIB_HCAINIT_CAPAB_RECORDED) {
667 		if (ss->ei_caps) {
668 			kmem_free(ss->ei_caps, sizeof (eib_caps_t));
669 			ss->ei_caps = NULL;
670 		}
671 	}
672 
673 	if (progress & EIB_HCAINIT_PD_ALLOCD) {
674 		if (ss->ei_pd_hdl) {
675 			ret = ibt_free_pd(ss->ei_hca_hdl, ss->ei_pd_hdl);
676 			if (ret != IBT_SUCCESS) {
677 				EIB_DPRINTF_WARN(ss->ei_instance,
678 				    "eib_rb_ibt_hca_init: "
679 				    "ibt_free_pd(hca_hdl=0x%lx, pd_hdl=0x%lx) "
680 				    "failed, ret=%d", ss->ei_hca_hdl,
681 				    ss->ei_pd_hdl, ret);
682 			}
683 			ss->ei_pd_hdl = NULL;
684 		}
685 	}
686 
687 	if (progress & EIB_HCAINIT_HCA_PORTS_QUERIED) {
688 		ss->ei_props->ep_mtu = 0;
689 		bzero(&ss->ei_props->ep_sgid, sizeof (ib_gid_t));
690 	}
691 
692 	if (progress & EIB_HCAINIT_ATTRS_ALLOCD) {
693 		kmem_free(ss->ei_hca_attrs, sizeof (ibt_hca_attr_t));
694 		ss->ei_hca_attrs = NULL;
695 	}
696 
697 	if (progress & EIB_HCAINIT_HCA_OPENED) {
698 		ret = ibt_close_hca(ss->ei_hca_hdl);
699 		if (ret != IBT_SUCCESS) {
700 			EIB_DPRINTF_WARN(ss->ei_instance,
701 			    "ibt_close_hca(hca_hdl=0x%lx) failed, "
702 			    "ret=%d", ss->ei_hca_hdl, ret);
703 		}
704 		ss->ei_hca_hdl = NULL;
705 	}
706 }
707 
708 static void
709 eib_ibt_reset_partitions(eib_t *ss)
710 {
711 	eib_vnic_t *vnic;
712 	eib_chan_t *chan = NULL;
713 	uint64_t av;
714 	int inst = 0;
715 
716 	/*
717 	 * We already have the vhub pkey recorded in our eib_chan_t.
718 	 * We only need to make sure our pkey index still matches it.
719 	 * If not, modify the channel appropriately and update our
720 	 * records.
721 	 */
722 	if ((chan = ss->ei_admin_chan) != NULL)
723 		(void) eib_ibt_modify_chan_pkey(ss, chan, chan->ch_pkey);
724 
725 	mutex_enter(&ss->ei_vnic_lock);
726 	av = ss->ei_active_vnics;
727 	while ((inst = EIB_FIND_LSB_SET(av)) != -1) {
728 		if ((vnic = ss->ei_vnic[inst]) != NULL) {
729 			if ((chan = vnic->vn_ctl_chan) != NULL) {
730 				(void) eib_ibt_modify_chan_pkey(ss, chan,
731 				    chan->ch_pkey);
732 			}
733 			if ((chan = vnic->vn_data_chan) != NULL) {
734 				(void) eib_ibt_modify_chan_pkey(ss, chan,
735 				    chan->ch_pkey);
736 			}
737 		}
738 		av &= (~((uint64_t)1 << inst));
739 	}
740 	mutex_exit(&ss->ei_vnic_lock);
741 }
742 
743 static void
744 eib_ibt_wakeup_sqd_waiters(eib_t *ss, ibt_channel_hdl_t ev_chan_hdl)
745 {
746 	eib_vnic_t *vnic;
747 	eib_chan_t *chan = NULL;
748 	uint64_t av;
749 	int inst = 0;
750 
751 	/*
752 	 * See if this channel has been waiting for its queue to drain.
753 	 *
754 	 * Note that since this is especially likely to be called during
755 	 * logging in to the gateway, we also need to check the vnic
756 	 * currently being created.
757 	 */
758 	mutex_enter(&ss->ei_vnic_lock);
759 
760 	if ((vnic = ss->ei_vnic_pending) != NULL) {
761 		chan = vnic->vn_ctl_chan;
762 		if ((chan) && (chan->ch_chan == ev_chan_hdl))
763 			goto wakeup_sqd_waiters;
764 
765 		chan = vnic->vn_data_chan;
766 		if ((chan) && (chan->ch_chan == ev_chan_hdl))
767 			goto wakeup_sqd_waiters;
768 	}
769 
770 	av = ss->ei_active_vnics;
771 	while ((inst = EIB_FIND_LSB_SET(av)) != -1) {
772 		if ((vnic = ss->ei_vnic[inst]) != NULL) {
773 			chan = vnic->vn_ctl_chan;
774 			if (chan->ch_chan == ev_chan_hdl)
775 				break;
776 
777 			chan = vnic->vn_data_chan;
778 			if (chan->ch_chan == ev_chan_hdl)
779 				break;
780 		}
781 		av &= (~((uint64_t)1 << inst));
782 	}
783 
784 wakeup_sqd_waiters:
785 	if (chan) {
786 		mutex_enter(&chan->ch_cep_lock);
787 		chan->ch_cep_state = IBT_STATE_SQD;
788 		cv_broadcast(&chan->ch_cep_cv);
789 		mutex_exit(&chan->ch_cep_lock);
790 	}
791 
792 	mutex_exit(&ss->ei_vnic_lock);
793 }
794 
795 static int
796 eib_ibt_chan_pkey(eib_t *ss, eib_chan_t *chan, ib_pkey_t new_pkey,
797     boolean_t set, boolean_t *pkey_changed)
798 {
799 	ibt_qp_info_t qp_attr;
800 	ibt_status_t ret;
801 	uint16_t new_pkey_ix;
802 
803 	ret = ibt_pkey2index(ss->ei_hca_hdl, ss->ei_props->ep_port_num,
804 	    new_pkey, &new_pkey_ix);
805 	if (ret != IBT_SUCCESS) {
806 		EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_chan_pkey: "
807 		    "ibt_pkey2index(hca_hdl=0x%llx, port_num=0x%x, "
808 		    "pkey=0x%x) failed, ret=%d",
809 		    ss->ei_hca_hdl, ss->ei_props->ep_port_num, new_pkey, ret);
810 		return (EIB_E_FAILURE);
811 	}
812 
813 	/*
814 	 * If the pkey and the pkey index we have already matches the
815 	 * new one, nothing to do.
816 	 */
817 	mutex_enter(&chan->ch_pkey_lock);
818 	if ((chan->ch_pkey == new_pkey) && (chan->ch_pkey_ix == new_pkey_ix)) {
819 		if (pkey_changed) {
820 			*pkey_changed = B_FALSE;
821 		}
822 		mutex_exit(&chan->ch_pkey_lock);
823 		return (EIB_E_SUCCESS);
824 	}
825 	if (pkey_changed) {
826 		*pkey_changed = B_TRUE;
827 	}
828 	mutex_exit(&chan->ch_pkey_lock);
829 
830 	/*
831 	 * Otherwise, if we're asked only to test if the pkey index
832 	 * supplied matches the one recorded in the channel, return
833 	 * success, but don't set the pkey.
834 	 */
835 	if (!set) {
836 		return (EIB_E_SUCCESS);
837 	}
838 
839 	/*
840 	 * Otherwise, we need to change channel pkey.  Pause the
841 	 * channel sendq first.
842 	 */
843 	ret = ibt_pause_sendq(chan->ch_chan, IBT_CEP_SET_SQD_EVENT);
844 	if (ret != IBT_SUCCESS) {
845 		EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_chan_pkey: "
846 		    "ibt_pause_sendq(chan_hdl=0x%llx) failed, ret=%d",
847 		    chan->ch_chan, ret);
848 		return (EIB_E_FAILURE);
849 	}
850 
851 	/*
852 	 * Wait for the channel to enter the IBT_STATE_SQD state
853 	 */
854 	mutex_enter(&chan->ch_cep_lock);
855 	while (chan->ch_cep_state != IBT_STATE_SQD)
856 		cv_wait(&chan->ch_cep_cv, &chan->ch_cep_lock);
857 	mutex_exit(&chan->ch_cep_lock);
858 
859 	/*
860 	 * Modify the qp with the supplied pkey index and unpause the channel
861 	 * If either of these operations fail, we'll leave the channel in
862 	 * the paused state and fail.
863 	 */
864 	bzero(&qp_attr, sizeof (ibt_qp_info_t));
865 
866 	qp_attr.qp_trans = IBT_UD_SRV;
867 	qp_attr.qp_current_state = IBT_STATE_SQD;
868 	qp_attr.qp_state = IBT_STATE_SQD;
869 	qp_attr.qp_transport.ud.ud_pkey_ix = new_pkey_ix;
870 
871 	/*
872 	 * Modify the qp to set the new pkey index, then unpause the
873 	 * channel and put it in RTS state and update the new values
874 	 * in our records
875 	 */
876 	mutex_enter(&chan->ch_pkey_lock);
877 
878 	ret = ibt_modify_qp(chan->ch_chan,
879 	    IBT_CEP_SET_STATE | IBT_CEP_SET_PKEY_IX, &qp_attr, NULL);
880 	if (ret != IBT_SUCCESS) {
881 		mutex_exit(&chan->ch_pkey_lock);
882 		EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_chan_pkey: "
883 		    "ibt_modify_qp(chan_hdl=0x%llx, IBT_CEP_SET_PKEY_IX) "
884 		    "failed for new_pkey_ix=0x%x, ret=%d",
885 		    chan->ch_chan, new_pkey_ix, ret);
886 		return (EIB_E_FAILURE);
887 	}
888 
889 	if ((ret = ibt_unpause_sendq(chan->ch_chan)) != IBT_SUCCESS) {
890 		mutex_exit(&chan->ch_pkey_lock);
891 		EIB_DPRINTF_WARN(ss->ei_instance, "eib_ibt_chan_pkey: "
892 		    "ibt_unpause_sendq(chan_hdl=0x%llx) failed, ret=%d",
893 		    chan->ch_chan, ret);
894 		return (EIB_E_FAILURE);
895 	}
896 
897 	chan->ch_pkey = new_pkey;
898 	chan->ch_pkey_ix = new_pkey_ix;
899 	mutex_exit(&chan->ch_pkey_lock);
900 
901 	return (EIB_E_SUCCESS);
902 }
903 
904 static boolean_t
905 eib_ibt_has_chan_pkey_changed(eib_t *ss, eib_chan_t *chan)
906 {
907 	boolean_t changed;
908 	int ret;
909 
910 	/*
911 	 * Don't modify the pkey, just ask if the pkey index for the channel's
912 	 * pkey has changed for any reason.  If we fail, assume that the pkey
913 	 * has changed.
914 	 */
915 	ret = eib_ibt_chan_pkey(ss, chan, chan->ch_pkey, B_FALSE, &changed);
916 	if (ret != EIB_E_SUCCESS)
917 		changed = B_TRUE;
918 
919 	return (changed);
920 }
921 
922 static boolean_t
923 eib_ibt_has_any_pkey_changed(eib_t *ss)
924 {
925 	eib_vnic_t *vnic;
926 	eib_chan_t *chan = NULL;
927 	uint64_t av;
928 	int inst = 0;
929 
930 	/*
931 	 * Return true if the pkey index of any our pkeys (of the channels
932 	 * of all active vnics) has changed.
933 	 */
934 
935 	chan = ss->ei_admin_chan;
936 	if ((chan) && (eib_ibt_has_chan_pkey_changed(ss, chan)))
937 		return (B_TRUE);
938 
939 	mutex_enter(&ss->ei_vnic_lock);
940 	av = ss->ei_active_vnics;
941 	while ((inst = EIB_FIND_LSB_SET(av)) != -1) {
942 		if ((vnic = ss->ei_vnic[inst]) != NULL) {
943 			chan = vnic->vn_ctl_chan;
944 			if ((chan) && (eib_ibt_has_chan_pkey_changed(ss, chan)))
945 				return (B_TRUE);
946 
947 			chan = vnic->vn_data_chan;
948 			if ((chan) && (eib_ibt_has_chan_pkey_changed(ss, chan)))
949 				return (B_TRUE);
950 		}
951 		av &= (~((uint64_t)1 << inst));
952 	}
953 	mutex_exit(&ss->ei_vnic_lock);
954 
955 	return (B_FALSE);
956 }
957 
958 /*
959  * This routine is currently used simply to derive and record the port
960  * speed from the loopback path information (for debug purposes).  For
961  * EoIB, currently the srate used in address vectors to IB neighbors
962  * and the gateway is fixed at IBT_SRATE_10. Eventually though, this
963  * information (and sl) has to come from the gateway for all destinations
964  * in the vhub table.
965  */
966 static void
967 eib_ibt_record_srate(eib_t *ss)
968 {
969 	ib_gid_t sgid = ss->ei_props->ep_sgid;
970 	ibt_srate_t srate = IBT_SRATE_10;
971 	ibt_path_info_t path;
972 	ibt_path_attr_t path_attr;
973 	ibt_status_t ret;
974 	uint8_t num_paths;
975 
976 	bzero(&path_attr, sizeof (path_attr));
977 	path_attr.pa_dgids = &sgid;
978 	path_attr.pa_num_dgids = 1;
979 	path_attr.pa_sgid = sgid;
980 
981 	ret = ibt_get_paths(ss->ei_ibt_hdl, IBT_PATH_NO_FLAGS,
982 	    &path_attr, 1, &path, &num_paths);
983 	if (ret == IBT_SUCCESS && num_paths >= 1) {
984 		switch (srate = path.pi_prim_cep_path.cep_adds_vect.av_srate) {
985 		case IBT_SRATE_2:
986 		case IBT_SRATE_10:
987 		case IBT_SRATE_30:
988 		case IBT_SRATE_5:
989 		case IBT_SRATE_20:
990 		case IBT_SRATE_40:
991 		case IBT_SRATE_60:
992 		case IBT_SRATE_80:
993 		case IBT_SRATE_120:
994 			break;
995 		default:
996 			srate = IBT_SRATE_10;
997 		}
998 	}
999 
1000 	ss->ei_props->ep_srate = srate;
1001 
1002 	EIB_DPRINTF_DEBUG(ss->ei_instance, "eib_ibt_record_srate: "
1003 	    "srate = %d", srate);
1004 }
1005