xref: /freebsd/sys/dev/iser/iser_verbs.c (revision 35c0a8c449fd2b7f75029ebed5e10852240f0865)
1 /*-
2  * Copyright (c) 2015, Mellanox Technologies, Inc. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  */
25 
26 #include "icl_iser.h"
27 
28 static MALLOC_DEFINE(M_ISER_VERBS, "iser_verbs", "iser verbs backend");
29 static int iser_cq_poll_limit = 512;
30 
31 static void
32 iser_cq_event_callback(struct ib_event *cause, void *context)
33 {
34 	ISER_ERR("got cq event %d", cause->event);
35 }
36 
37 static void
38 iser_qp_event_callback(struct ib_event *cause, void *context)
39 {
40 	ISER_ERR("got qp event %d", cause->event);
41 }
42 
43 static void
44 iser_event_handler(struct ib_event_handler *handler,
45 				struct ib_event *event)
46 {
47 	ISER_ERR("async event %d on device %s port %d",
48 		 event->event, event->device->name,
49 		 event->element.port_num);
50 }
51 
52 /**
53  * is_iser_tx_desc - Indicate if the completion wr_id
54  *     is a TX descriptor or not.
55  * @iser_conn: iser connection
56  * @wr_id: completion WR identifier
57  *
58  * Since we cannot rely on wc opcode in FLUSH errors
59  * we must work around it by checking if the wr_id address
60  * falls in the iser connection rx_descs buffer. If so
61  * it is an RX descriptor, otherwize it is a TX.
62  */
63 static inline bool
64 is_iser_tx_desc(struct iser_conn *iser_conn, void *wr_id)
65 {
66 	void *start = iser_conn->rx_descs;
67 	u64 len = iser_conn->num_rx_descs * sizeof(*iser_conn->rx_descs);
68 	void *end = (void *)((uintptr_t)start + (uintptr_t)len);
69 
70 	if (start) {
71 		if (wr_id >= start && wr_id < end)
72 			return false;
73 	} else {
74 		return ((uintptr_t)wr_id != (uintptr_t)iser_conn->login_resp_buf);
75 	}
76 
77 	return true;
78 }
79 
80 /**
81  * iser_handle_comp_error() - Handle error completion
82  * @ib_conn:   connection RDMA resources
83  * @wc:        work completion
84  *
85  * Notes: Update post_recv_buf_count in case of recv error completion.
86  *        For non-FLUSH error completion we should also notify iscsi layer that
87  *        connection is failed (in case we passed bind stage).
88  */
89 static void
90 iser_handle_comp_error(struct ib_conn *ib_conn,
91 		       struct ib_wc *wc)
92 {
93 	void *wr_id = (void *)(uintptr_t)wc->wr_id;
94 	struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
95 						   ib_conn);
96 
97 	if (is_iser_tx_desc(iser_conn, wr_id)) {
98 		ISER_DBG("conn %p got send comp error", iser_conn);
99 	} else {
100 		ISER_DBG("conn %p got recv comp error", iser_conn);
101 		ib_conn->post_recv_buf_count--;
102 	}
103 	if (wc->status != IB_WC_WR_FLUSH_ERR)
104 		iser_conn->icl_conn.ic_error(&iser_conn->icl_conn);
105 }
106 
107 /**
108  * iser_handle_wc - handle a single work completion
109  * @wc: work completion
110  *
111  * Soft-IRQ context, work completion can be either
112  * SEND or RECV, and can turn out successful or
113  * with error (or flush error).
114  */
115 static void iser_handle_wc(struct ib_wc *wc)
116 {
117 	struct ib_conn *ib_conn;
118 	struct iser_tx_desc *tx_desc;
119 	struct iser_rx_desc *rx_desc;
120 
121 	ib_conn = wc->qp->qp_context;
122 	if (likely(wc->status == IB_WC_SUCCESS)) {
123 		if (wc->opcode == IB_WC_RECV) {
124 			rx_desc = (struct iser_rx_desc *)(uintptr_t)wc->wr_id;
125 			iser_rcv_completion(rx_desc, wc->byte_len,
126 					    ib_conn);
127 		} else
128 		if (wc->opcode == IB_WC_SEND) {
129 			tx_desc = (struct iser_tx_desc *)(uintptr_t)wc->wr_id;
130 			iser_snd_completion(tx_desc, ib_conn);
131 		} else {
132 			ISER_ERR("Unknown wc opcode %d", wc->opcode);
133 		}
134 	} else {
135 		struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
136 					ib_conn);
137 		if (wc->status != IB_WC_WR_FLUSH_ERR) {
138 			ISER_ERR("conn %p wr id %llx status %d vend_err %x",
139 				 iser_conn, (unsigned long long)wc->wr_id,
140 				 wc->status, wc->vendor_err);
141 		} else {
142 			ISER_DBG("flush error: conn %p wr id %llx",
143 				 iser_conn, (unsigned long long)wc->wr_id);
144 		}
145 
146 		if (wc->wr_id == ISER_BEACON_WRID) {
147 			/* all flush errors were consumed */
148 			mtx_lock(&ib_conn->beacon.flush_lock);
149 			ISER_DBG("conn %p got ISER_BEACON_WRID", iser_conn);
150 			cv_signal(&ib_conn->beacon.flush_cv);
151 			mtx_unlock(&ib_conn->beacon.flush_lock);
152 		} else {
153 			iser_handle_comp_error(ib_conn, wc);
154 		}
155 	}
156 }
157 
158 static void
159 iser_cq_tasklet_fn(void *data, int pending)
160 {
161 	struct iser_comp *comp = (struct iser_comp *)data;
162 	struct ib_cq *cq = comp->cq;
163 	struct ib_wc *const wcs = comp->wcs;
164 	int completed = 0;
165 	int i;
166 	int n;
167 
168 	while ((n = ib_poll_cq(cq, ARRAY_SIZE(comp->wcs), wcs)) > 0) {
169 		for (i = 0; i < n; i++)
170 			iser_handle_wc(&wcs[i]);
171 
172 		completed += n;
173 		if (completed >= iser_cq_poll_limit)
174 			break;
175 	}
176 
177 	/*
178 	 * It is assumed here that arming CQ only once its empty
179 	 * would not cause interrupts to be missed.
180 	 */
181 	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
182 }
183 
184 static void
185 iser_cq_callback(struct ib_cq *cq, void *cq_context)
186 {
187 	struct iser_comp *comp = cq_context;
188 
189 	taskqueue_enqueue(comp->tq, &comp->task);
190 }
191 
192 /**
193  * iser_create_device_ib_res - creates Protection Domain (PD), Completion
194  * Queue (CQ), DMA Memory Region (DMA MR) with the device associated with
195  * the adapator.
196  *
197  * returns 0 on success, -1 on failure
198  */
199 static int
200 iser_create_device_ib_res(struct iser_device *device)
201 {
202 	struct ib_device *ib_dev = device->ib_device;
203 	int i, max_cqe;
204 
205 	if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) {
206 		ISER_ERR("device %s doesn't support Fastreg, "
207 			 "can't register memory", device->ib_device->name);
208 		return (1);
209 	}
210 
211 	device->comps_used = min(mp_ncpus, device->ib_device->num_comp_vectors);
212 
213 	device->comps = malloc(device->comps_used * sizeof(*device->comps),
214 		M_ISER_VERBS, M_WAITOK | M_ZERO);
215 
216 	max_cqe = min(ISER_MAX_CQ_LEN, ib_dev->attrs.max_cqe);
217 
218 	ISER_DBG("using %d CQs, device %s supports %d vectors max_cqe %d",
219 		 device->comps_used, device->ib_device->name,
220 		 device->ib_device->num_comp_vectors, max_cqe);
221 
222 	device->pd = ib_alloc_pd(device->ib_device, IB_PD_UNSAFE_GLOBAL_RKEY);
223 	if (IS_ERR(device->pd))
224 		goto pd_err;
225 
226 	for (i = 0; i < device->comps_used; i++) {
227 		struct iser_comp *comp = &device->comps[i];
228 		struct ib_cq_init_attr cq_attr = {
229 			.cqe		= max_cqe,
230 			.comp_vector	= i,
231 		};
232 
233 		comp->device = device;
234 		comp->cq = ib_create_cq(device->ib_device,
235 					iser_cq_callback,
236 					iser_cq_event_callback,
237 					(void *)comp,
238 					&cq_attr);
239 		if (IS_ERR(comp->cq)) {
240 			comp->cq = NULL;
241 			goto cq_err;
242 		}
243 
244 		if (ib_req_notify_cq(comp->cq, IB_CQ_NEXT_COMP))
245 			goto cq_err;
246 
247 		TASK_INIT(&comp->task, 0, iser_cq_tasklet_fn, comp);
248 		comp->tq = taskqueue_create_fast("iser_taskq", M_NOWAIT,
249 				taskqueue_thread_enqueue, &comp->tq);
250 		if (!comp->tq)
251 			goto tq_err;
252 		taskqueue_start_threads(&comp->tq, 1, PI_NET, "iser taskq");
253 	}
254 
255 	device->mr = device->pd->__internal_mr;
256 	if (IS_ERR(device->mr))
257 		goto tq_err;
258 
259 	INIT_IB_EVENT_HANDLER(&device->event_handler, device->ib_device,
260 				iser_event_handler);
261 	if (ib_register_event_handler(&device->event_handler))
262 		goto tq_err;
263 
264 	return (0);
265 
266 tq_err:
267 	for (i = 0; i < device->comps_used; i++) {
268 		struct iser_comp *comp = &device->comps[i];
269 		if (comp->tq)
270 			taskqueue_free(comp->tq);
271 	}
272 cq_err:
273 	for (i = 0; i < device->comps_used; i++) {
274 		struct iser_comp *comp = &device->comps[i];
275 		if (comp->cq)
276 			ib_destroy_cq(comp->cq);
277 	}
278 	ib_dealloc_pd(device->pd);
279 pd_err:
280 	free(device->comps, M_ISER_VERBS);
281 	ISER_ERR("failed to allocate an IB resource");
282 	return (1);
283 }
284 
285 /**
286  * iser_free_device_ib_res - destroy/dealloc/dereg the DMA MR,
287  * CQ and PD created with the device associated with the adapator.
288  */
289 static void
290 iser_free_device_ib_res(struct iser_device *device)
291 {
292 	int i;
293 
294 	for (i = 0; i < device->comps_used; i++) {
295 		struct iser_comp *comp = &device->comps[i];
296 
297 		taskqueue_free(comp->tq);
298 		ib_destroy_cq(comp->cq);
299 		comp->cq = NULL;
300 	}
301 
302 	(void)ib_unregister_event_handler(&device->event_handler);
303 	(void)ib_dealloc_pd(device->pd);
304 
305 	free(device->comps, M_ISER_VERBS);
306 	device->comps = NULL;
307 
308 	device->mr = NULL;
309 	device->pd = NULL;
310 }
311 
312 static int
313 iser_alloc_reg_res(struct ib_device *ib_device,
314 		   struct ib_pd *pd,
315 		   struct iser_reg_resources *res)
316 {
317 	int ret;
318 
319 	res->mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, ISCSI_ISER_SG_TABLESIZE + 1);
320 	if (IS_ERR(res->mr)) {
321 		ret = -PTR_ERR(res->mr);
322 		ISER_ERR("Failed to allocate  fast reg mr err=%d", ret);
323 		return (ret);
324 	}
325 	res->mr_valid = 1;
326 
327 	return (0);
328 }
329 
330 static void
331 iser_free_reg_res(struct iser_reg_resources *rsc)
332 {
333 	ib_dereg_mr(rsc->mr);
334 }
335 
336 static struct fast_reg_descriptor *
337 iser_create_fastreg_desc(struct ib_device *ib_device, struct ib_pd *pd)
338 {
339 	struct fast_reg_descriptor *desc;
340 	int ret;
341 
342 	desc = malloc(sizeof(*desc), M_ISER_VERBS, M_WAITOK | M_ZERO);
343 	ret = iser_alloc_reg_res(ib_device, pd, &desc->rsc);
344 	if (ret) {
345 		ISER_ERR("failed to allocate reg_resources");
346 		goto err;
347 	}
348 
349 	return (desc);
350 err:
351 	free(desc, M_ISER_VERBS);
352 	return (NULL);
353 }
354 
355 /**
356  * iser_create_fmr_pool - Creates FMR pool and page_vector
357  *
358  * returns 0 on success, or errno code on failure
359  */
360 int
361 iser_create_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max)
362 {
363 	struct iser_device *device = ib_conn->device;
364 	struct fast_reg_descriptor *desc;
365 	int i;
366 
367 	INIT_LIST_HEAD(&ib_conn->fastreg.pool);
368 	ib_conn->fastreg.pool_size = 0;
369 	for (i = 0; i < cmds_max; i++) {
370 		desc = iser_create_fastreg_desc(device->ib_device, device->pd);
371 		if (!desc) {
372 			ISER_ERR("Failed to create fastreg descriptor");
373 			goto err;
374 		}
375 
376 		list_add_tail(&desc->list, &ib_conn->fastreg.pool);
377 		ib_conn->fastreg.pool_size++;
378 	}
379 
380 	return (0);
381 
382 err:
383 	iser_free_fastreg_pool(ib_conn);
384 	return (ENOMEM);
385 }
386 
387 /**
388  * iser_free_fmr_pool - releases the FMR pool and page vec
389  */
390 void
391 iser_free_fastreg_pool(struct ib_conn *ib_conn)
392 {
393 	struct fast_reg_descriptor *desc, *tmp;
394 	int i = 0;
395 
396 	if (list_empty(&ib_conn->fastreg.pool))
397 		return;
398 
399 	ISER_DBG("freeing conn %p fr pool", ib_conn);
400 
401 	list_for_each_entry_safe(desc, tmp, &ib_conn->fastreg.pool, list) {
402 		list_del(&desc->list);
403 		iser_free_reg_res(&desc->rsc);
404 		free(desc, M_ISER_VERBS);
405 		++i;
406 	}
407 
408 	if (i < ib_conn->fastreg.pool_size)
409 		ISER_WARN("pool still has %d regions registered",
410 			  ib_conn->fastreg.pool_size - i);
411 }
412 
413 /**
414  * iser_create_ib_conn_res - Queue-Pair (QP)
415  *
416  * returns 0 on success, 1 on failure
417  */
418 static int
419 iser_create_ib_conn_res(struct ib_conn *ib_conn)
420 {
421 	struct iser_conn *iser_conn;
422 	struct iser_device *device;
423 	struct ib_device_attr *dev_attr;
424 	struct ib_qp_init_attr init_attr;
425 	int index, min_index = 0;
426 	int ret = -ENOMEM;
427 
428 	iser_conn = container_of(ib_conn, struct iser_conn, ib_conn);
429 	device = ib_conn->device;
430 	dev_attr = &device->dev_attr;
431 
432 	mtx_lock(&ig.connlist_mutex);
433 	/* select the CQ with the minimal number of usages */
434 	for (index = 0; index < device->comps_used; index++) {
435 		if (device->comps[index].active_qps <
436 		    device->comps[min_index].active_qps)
437 			min_index = index;
438 	}
439 	ib_conn->comp = &device->comps[min_index];
440 	ib_conn->comp->active_qps++;
441 	mtx_unlock(&ig.connlist_mutex);
442 	ISER_INFO("cq index %d used for ib_conn %p", min_index, ib_conn);
443 
444 	memset(&init_attr, 0, sizeof init_attr);
445 	init_attr.event_handler = iser_qp_event_callback;
446 	init_attr.qp_context	= (void *)ib_conn;
447 	init_attr.send_cq	= ib_conn->comp->cq;
448 	init_attr.recv_cq	= ib_conn->comp->cq;
449 	init_attr.cap.max_recv_wr  = ISER_QP_MAX_RECV_DTOS;
450 	init_attr.cap.max_send_sge = 2;
451 	init_attr.cap.max_recv_sge = 1;
452 	init_attr.sq_sig_type	= IB_SIGNAL_REQ_WR;
453 	init_attr.qp_type	= IB_QPT_RC;
454 
455 	if (dev_attr->max_qp_wr > ISER_QP_MAX_REQ_DTOS) {
456 		init_attr.cap.max_send_wr  = ISER_QP_MAX_REQ_DTOS;
457 		iser_conn->max_cmds =
458 			ISER_GET_MAX_XMIT_CMDS(ISER_QP_MAX_REQ_DTOS);
459 	} else {
460 		init_attr.cap.max_send_wr = dev_attr->max_qp_wr;
461 		iser_conn->max_cmds =
462 			ISER_GET_MAX_XMIT_CMDS(dev_attr->max_qp_wr);
463 	}
464 	ISER_DBG("device %s supports max_send_wr %d",
465 	         device->ib_device->name, dev_attr->max_qp_wr);
466 
467 	ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr);
468 	if (ret)
469 		goto out_err;
470 
471 	ib_conn->qp = ib_conn->cma_id->qp;
472 	ISER_DBG("setting conn %p cma_id %p qp %p",
473 		 ib_conn, ib_conn->cma_id,
474 		 ib_conn->cma_id->qp);
475 
476 	return (ret);
477 
478 out_err:
479 	mtx_lock(&ig.connlist_mutex);
480 	ib_conn->comp->active_qps--;
481 	mtx_unlock(&ig.connlist_mutex);
482 	ISER_ERR("unable to alloc mem or create resource, err %d", ret);
483 
484 	return (ret);
485 }
486 
487 /**
488  * based on the resolved device node GUID see if there already allocated
489  * device for this device. If there's no such, create one.
490  */
491 static struct iser_device *
492 iser_device_find_by_ib_device(struct rdma_cm_id *cma_id)
493 {
494 	struct iser_device *device;
495 
496 	sx_xlock(&ig.device_list_mutex);
497 
498 	list_for_each_entry(device, &ig.device_list, ig_list)
499 		/* find if there's a match using the node GUID */
500 		if (device->ib_device->node_guid == cma_id->device->node_guid)
501 			goto inc_refcnt;
502 
503 	device = malloc(sizeof *device, M_ISER_VERBS, M_WAITOK | M_ZERO);
504 	/* assign this device to the device */
505 	device->ib_device = cma_id->device;
506 	/* init the device and link it into ig device list */
507 	if (iser_create_device_ib_res(device)) {
508 		free(device, M_ISER_VERBS);
509 		device = NULL;
510 		goto out;
511 	}
512 	list_add(&device->ig_list, &ig.device_list);
513 
514 inc_refcnt:
515 	device->refcount++;
516 	ISER_INFO("device %p refcount %d", device, device->refcount);
517 out:
518 	sx_xunlock(&ig.device_list_mutex);
519 	return (device);
520 }
521 
522 /* if there's no demand for this device, release it */
523 static void
524 iser_device_try_release(struct iser_device *device)
525 {
526 	sx_xlock(&ig.device_list_mutex);
527 	device->refcount--;
528 	ISER_INFO("device %p refcount %d", device, device->refcount);
529 	if (!device->refcount) {
530 		iser_free_device_ib_res(device);
531 		list_del(&device->ig_list);
532 		free(device, M_ISER_VERBS);
533 		device = NULL;
534 	}
535 	sx_xunlock(&ig.device_list_mutex);
536 }
537 
538 /**
539  * Called with state mutex held
540  **/
541 static int iser_conn_state_comp_exch(struct iser_conn *iser_conn,
542 				     enum iser_conn_state comp,
543 				     enum iser_conn_state exch)
544 {
545 	int ret;
546 
547 	ret = (iser_conn->state == comp);
548 	if (ret)
549 		iser_conn->state = exch;
550 
551 	return ret;
552 }
553 
554 /**
555  * iser_free_ib_conn_res - release IB related resources
556  * @iser_conn: iser connection struct
557  * @destroy: indicator if we need to try to release the
558  *     iser device and memory regoins pool (only iscsi
559  *     shutdown and DEVICE_REMOVAL will use this).
560  *
561  * This routine is called with the iser state mutex held
562  * so the cm_id removal is out of here. It is Safe to
563  * be invoked multiple times.
564  */
565 void
566 iser_free_ib_conn_res(struct iser_conn *iser_conn,
567 				  bool destroy)
568 {
569 	struct ib_conn *ib_conn = &iser_conn->ib_conn;
570 	struct iser_device *device = ib_conn->device;
571 
572 	ISER_INFO("freeing conn %p cma_id %p qp %p",
573 		  iser_conn, ib_conn->cma_id, ib_conn->qp);
574 
575 	if (ib_conn->qp != NULL) {
576 		mtx_lock(&ig.connlist_mutex);
577 		ib_conn->comp->active_qps--;
578 		mtx_unlock(&ig.connlist_mutex);
579 		rdma_destroy_qp(ib_conn->cma_id);
580 		ib_conn->qp = NULL;
581 	}
582 
583 	if (destroy) {
584 		if (iser_conn->login_buf)
585 			iser_free_login_buf(iser_conn);
586 
587 		if (iser_conn->rx_descs)
588 			iser_free_rx_descriptors(iser_conn);
589 
590 		if (device != NULL) {
591 			iser_device_try_release(device);
592 			ib_conn->device = NULL;
593 		}
594 	}
595 }
596 
597 /**
598  * triggers start of the disconnect procedures and wait for them to be done
599  * Called with state mutex held
600  */
601 int
602 iser_conn_terminate(struct iser_conn *iser_conn)
603 {
604 	struct ib_conn *ib_conn = &iser_conn->ib_conn;
605 	const struct ib_send_wr *bad_send_wr;
606 	const struct ib_recv_wr *bad_recv_wr;
607 	int err = 0;
608 
609 	/* terminate the iser conn only if the conn state is UP */
610 	if (!iser_conn_state_comp_exch(iser_conn, ISER_CONN_UP,
611 					   ISER_CONN_TERMINATING))
612 		return (0);
613 
614 	ISER_INFO("iser_conn %p state %d\n", iser_conn, iser_conn->state);
615 
616 	if (ib_conn->qp == NULL) {
617 		/* HOW can this be??? */
618 		ISER_WARN("qp wasn't created");
619 		return (1);
620 	}
621 
622 	/*
623 	 * Todo: This is a temporary workaround.
624 	 * We serialize the connection closure using global lock in order to
625 	 * receive all posted beacons completions.
626 	 * Without Serialization, in case we open many connections (QPs) on
627 	 * the same CQ, we might miss beacons because of missing interrupts.
628 	 */
629 	sx_xlock(&ig.close_conns_mutex);
630 
631 	/*
632 	 * In case we didn't already clean up the cma_id (peer initiated
633 	 * a disconnection), we need to Cause the CMA to change the QP
634 	 * state to ERROR.
635 	 */
636 	if (ib_conn->cma_id) {
637 		err = rdma_disconnect(ib_conn->cma_id);
638 		if (err)
639 			ISER_ERR("Failed to disconnect, conn: 0x%p err %d",
640 				iser_conn, err);
641 
642 		mtx_lock(&ib_conn->beacon.flush_lock);
643 		memset(&ib_conn->beacon.send, 0, sizeof(struct ib_send_wr));
644 		ib_conn->beacon.send.wr_id = ISER_BEACON_WRID;
645 		ib_conn->beacon.send.opcode = IB_WR_SEND;
646 		/* post an indication that all send flush errors were consumed */
647 		err = ib_post_send(ib_conn->qp, &ib_conn->beacon.send, &bad_send_wr);
648 		if (err) {
649 			ISER_ERR("conn %p failed to post send_beacon", ib_conn);
650 			mtx_unlock(&ib_conn->beacon.flush_lock);
651 			goto out;
652 		}
653 
654 		ISER_DBG("before send cv_wait: %p", iser_conn);
655 		cv_wait(&ib_conn->beacon.flush_cv, &ib_conn->beacon.flush_lock);
656 		ISER_DBG("after send cv_wait: %p", iser_conn);
657 
658 		memset(&ib_conn->beacon.recv, 0, sizeof(struct ib_recv_wr));
659 		ib_conn->beacon.recv.wr_id = ISER_BEACON_WRID;
660 		/* post an indication that all recv flush errors were consumed */
661 		err = ib_post_recv(ib_conn->qp, &ib_conn->beacon.recv, &bad_recv_wr);
662 		if (err) {
663 			ISER_ERR("conn %p failed to post recv_beacon", ib_conn);
664 			mtx_unlock(&ib_conn->beacon.flush_lock);
665 			goto out;
666 		}
667 
668 		ISER_DBG("before recv cv_wait: %p", iser_conn);
669 		cv_wait(&ib_conn->beacon.flush_cv, &ib_conn->beacon.flush_lock);
670 		mtx_unlock(&ib_conn->beacon.flush_lock);
671 		ISER_DBG("after recv cv_wait: %p", iser_conn);
672 	}
673 out:
674 	sx_xunlock(&ig.close_conns_mutex);
675 	return (1);
676 }
677 
678 /**
679  * Called with state mutex held
680  **/
681 static void
682 iser_connect_error(struct rdma_cm_id *cma_id)
683 {
684 	struct iser_conn *iser_conn;
685 
686 	iser_conn = cma_id->context;
687 
688 	ISER_ERR("conn %p", iser_conn);
689 
690 	iser_conn->state = ISER_CONN_TERMINATING;
691 
692 	cv_signal(&iser_conn->up_cv);
693 }
694 
695 /**
696  * Called with state mutex held
697  **/
698 static void
699 iser_addr_handler(struct rdma_cm_id *cma_id)
700 {
701 	struct iser_device *device;
702 	struct iser_conn   *iser_conn;
703 	struct ib_conn   *ib_conn;
704 	int    ret;
705 
706 	iser_conn = cma_id->context;
707 
708 	ib_conn = &iser_conn->ib_conn;
709 	device = iser_device_find_by_ib_device(cma_id);
710 	if (!device) {
711 		ISER_ERR("conn %p device lookup/creation failed",
712 			 iser_conn);
713 		iser_connect_error(cma_id);
714 		return;
715 	}
716 
717 	ib_conn->device = device;
718 
719 	ret = rdma_resolve_route(cma_id, 1000);
720 	if (ret) {
721 		ISER_ERR("conn %p resolve route failed: %d", iser_conn, ret);
722 		iser_connect_error(cma_id);
723 		return;
724 	}
725 }
726 
727 /**
728  * Called with state mutex held
729  **/
730 static void
731 iser_route_handler(struct rdma_cm_id *cma_id)
732 {
733 	struct rdma_conn_param conn_param;
734 	int    ret;
735 	struct iser_cm_hdr req_hdr;
736 	struct iser_conn *iser_conn = cma_id->context;
737 	struct ib_conn *ib_conn = &iser_conn->ib_conn;
738 	struct iser_device *device = ib_conn->device;
739 
740 	ret = iser_create_ib_conn_res(ib_conn);
741 	if (ret)
742 		goto failure;
743 
744 	memset(&conn_param, 0, sizeof conn_param);
745 	conn_param.responder_resources = device->dev_attr.max_qp_rd_atom;
746 	conn_param.retry_count	       = 7;
747 	conn_param.rnr_retry_count     = 6;
748 	/*
749 	 * Initiaotr depth should not be set, but in order to compat
750 	 * with old targets, we keep this value set.
751 	 */
752 	conn_param.initiator_depth     = 1;
753 
754 	memset(&req_hdr, 0, sizeof(req_hdr));
755 	req_hdr.flags = (ISER_ZBVA_NOT_SUPPORTED |
756 			ISER_SEND_W_INV_NOT_SUPPORTED);
757 	conn_param.private_data		= (void *)&req_hdr;
758 	conn_param.private_data_len	= sizeof(struct iser_cm_hdr);
759 
760 	ret = rdma_connect(cma_id, &conn_param);
761 	if (ret) {
762 		ISER_ERR("conn %p failure connecting: %d", iser_conn, ret);
763 		goto failure;
764 	}
765 
766 	return;
767 failure:
768 	iser_connect_error(cma_id);
769 }
770 
771 /**
772  * Called with state mutex held
773  **/
774 static void
775 iser_connected_handler(struct rdma_cm_id *cma_id)
776 {
777 	struct iser_conn *iser_conn;
778 	struct ib_qp_attr attr;
779 	struct ib_qp_init_attr init_attr;
780 
781 	iser_conn = cma_id->context;
782 
783 	(void)ib_query_qp(cma_id->qp, &attr, ~0, &init_attr);
784 
785 	ISER_INFO("remote qpn:%x my qpn:%x",
786 		  attr.dest_qp_num, cma_id->qp->qp_num);
787 
788 	iser_conn->state = ISER_CONN_UP;
789 
790 	cv_signal(&iser_conn->up_cv);
791 }
792 
793 /**
794  * Called with state mutex held
795  **/
796 static void
797 iser_cleanup_handler(struct rdma_cm_id *cma_id, bool destroy)
798 {
799 	struct iser_conn *iser_conn = cma_id->context;
800 
801 	if (iser_conn_terminate(iser_conn))
802 		iser_conn->icl_conn.ic_error(&iser_conn->icl_conn);
803 
804 }
805 
806 int
807 iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
808 {
809 	struct iser_conn *iser_conn;
810 	int ret = 0;
811 
812 	iser_conn = cma_id->context;
813 	ISER_INFO("event %d status %d conn %p id %p",
814 		  event->event, event->status, cma_id->context, cma_id);
815 
816 	sx_xlock(&iser_conn->state_mutex);
817 	switch (event->event) {
818 	case RDMA_CM_EVENT_ADDR_RESOLVED:
819 		iser_addr_handler(cma_id);
820 		break;
821 	case RDMA_CM_EVENT_ROUTE_RESOLVED:
822 		iser_route_handler(cma_id);
823 		break;
824 	case RDMA_CM_EVENT_ESTABLISHED:
825 		iser_connected_handler(cma_id);
826 		break;
827 	case RDMA_CM_EVENT_ADDR_ERROR:
828 	case RDMA_CM_EVENT_ROUTE_ERROR:
829 	case RDMA_CM_EVENT_CONNECT_ERROR:
830 	case RDMA_CM_EVENT_UNREACHABLE:
831 	case RDMA_CM_EVENT_REJECTED:
832 		iser_connect_error(cma_id);
833 		break;
834 	case RDMA_CM_EVENT_DISCONNECTED:
835 	case RDMA_CM_EVENT_ADDR_CHANGE:
836 	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
837 		iser_cleanup_handler(cma_id, false);
838 		break;
839 	default:
840 		ISER_ERR("Unexpected RDMA CM event (%d)", event->event);
841 		break;
842 	}
843 	sx_xunlock(&iser_conn->state_mutex);
844 
845 	return (ret);
846 }
847 
848 int
849 iser_post_recvl(struct iser_conn *iser_conn)
850 {
851 	const struct ib_recv_wr *rx_wr_failed;
852 	struct ib_recv_wr rx_wr;
853 	struct ib_conn *ib_conn = &iser_conn->ib_conn;
854 	struct ib_sge	  sge;
855 	int ib_ret;
856 
857 	sge.addr   = iser_conn->login_resp_dma;
858 	sge.length = ISER_RX_LOGIN_SIZE;
859 	sge.lkey   = ib_conn->device->mr->lkey;
860 
861 	rx_wr.wr_id   = (uintptr_t)iser_conn->login_resp_buf;
862 	rx_wr.sg_list = &sge;
863 	rx_wr.num_sge = 1;
864 	rx_wr.next    = NULL;
865 
866 	ib_conn->post_recv_buf_count++;
867 	ib_ret	= ib_post_recv(ib_conn->qp, &rx_wr, &rx_wr_failed);
868 	if (ib_ret) {
869 		ISER_ERR("ib_post_recv failed ret=%d", ib_ret);
870 		ib_conn->post_recv_buf_count--;
871 	}
872 
873 	return (ib_ret);
874 }
875 
876 int
877 iser_post_recvm(struct iser_conn *iser_conn, int count)
878 {
879 	const struct ib_recv_wr *rx_wr_failed;
880 	struct ib_recv_wr *rx_wr;
881 	int i, ib_ret;
882 	struct ib_conn *ib_conn = &iser_conn->ib_conn;
883 	unsigned int my_rx_head = iser_conn->rx_desc_head;
884 	struct iser_rx_desc *rx_desc;
885 
886 	for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) {
887 		rx_desc		= &iser_conn->rx_descs[my_rx_head];
888 		rx_wr->wr_id	= (uintptr_t)rx_desc;
889 		rx_wr->sg_list	= &rx_desc->rx_sg;
890 		rx_wr->num_sge	= 1;
891 		rx_wr->next	= rx_wr + 1;
892 		my_rx_head = (my_rx_head + 1) % iser_conn->qp_max_recv_dtos;
893 	}
894 
895 	rx_wr--;
896 	rx_wr->next = NULL; /* mark end of work requests list */
897 
898 	ib_conn->post_recv_buf_count += count;
899 	ib_ret	= ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &rx_wr_failed);
900 	if (ib_ret) {
901 		ISER_ERR("ib_post_recv failed ret=%d", ib_ret);
902 		ib_conn->post_recv_buf_count -= count;
903 	} else
904 		iser_conn->rx_desc_head = my_rx_head;
905 
906 	return (ib_ret);
907 }
908 
909 /**
910  * iser_start_send - Initiate a Send DTO operation
911  *
912  * returns 0 on success, -1 on failure
913  */
914 int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
915 		   bool signal)
916 {
917 	int		  ib_ret;
918 	const struct ib_send_wr *send_wr_failed;
919 	struct ib_send_wr send_wr;
920 
921 	ib_dma_sync_single_for_device(ib_conn->device->ib_device,
922 				      tx_desc->dma_addr, ISER_HEADERS_LEN,
923 				      DMA_TO_DEVICE);
924 
925 	send_wr.next	   = NULL;
926 	send_wr.wr_id	   = (uintptr_t)tx_desc;
927 	send_wr.sg_list	   = tx_desc->tx_sg;
928 	send_wr.num_sge	   = tx_desc->num_sge;
929 	send_wr.opcode	   = IB_WR_SEND;
930 	send_wr.send_flags = signal ? IB_SEND_SIGNALED : 0;
931 
932 	ib_ret = ib_post_send(ib_conn->qp, &send_wr, &send_wr_failed);
933 	if (ib_ret)
934 		ISER_ERR("ib_post_send failed, ret:%d", ib_ret);
935 
936 	return (ib_ret);
937 }
938