xref: /freebsd/sys/dev/iser/iser_verbs.c (revision 128e3872b90b0da81dc8340f720c87d18955895e)
1 /* $FreeBSD$ */
2 /*-
3  * Copyright (c) 2015, Mellanox Technologies, Inc. All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  *
14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24  * SUCH DAMAGE.
25  */
26 
27 #include "icl_iser.h"
28 
/* Malloc type used for the allocations made by the verbs code below. */
static MALLOC_DEFINE(M_ISER_VERBS, "iser_verbs", "iser verbs backend");
/*
 * Soft budget of work completions handled by one run of
 * iser_cq_tasklet_fn(); it is checked after each polled batch.
 */
static int iser_cq_poll_limit = 512;
31 
32 static void
33 iser_cq_event_callback(struct ib_event *cause, void *context)
34 {
35 	ISER_ERR("got cq event %d", cause->event);
36 }
37 
38 static void
39 iser_qp_event_callback(struct ib_event *cause, void *context)
40 {
41 	ISER_ERR("got qp event %d", cause->event);
42 }
43 
44 static void
45 iser_event_handler(struct ib_event_handler *handler,
46 				struct ib_event *event)
47 {
48 	ISER_ERR("async event %d on device %s port %d",
49 		 event->event, event->device->name,
50 		 event->element.port_num);
51 }
52 
53 /**
54  * is_iser_tx_desc - Indicate if the completion wr_id
55  *     is a TX descriptor or not.
56  * @iser_conn: iser connection
57  * @wr_id: completion WR identifier
58  *
59  * Since we cannot rely on wc opcode in FLUSH errors
60  * we must work around it by checking if the wr_id address
61  * falls in the iser connection rx_descs buffer. If so
62  * it is an RX descriptor, otherwize it is a TX.
63  */
64 static inline bool
65 is_iser_tx_desc(struct iser_conn *iser_conn, void *wr_id)
66 {
67 	void *start = iser_conn->rx_descs;
68 	u64 len = iser_conn->num_rx_descs * sizeof(*iser_conn->rx_descs);
69 	void *end = (void *)((uintptr_t)start + (uintptr_t)len);
70 
71 	if (start) {
72 		if (wr_id >= start && wr_id < end)
73 			return false;
74 	} else {
75 		return ((uintptr_t)wr_id != (uintptr_t)iser_conn->login_resp_buf);
76 	}
77 
78 	return true;
79 }
80 
81 /**
82  * iser_handle_comp_error() - Handle error completion
83  * @ib_conn:   connection RDMA resources
84  * @wc:        work completion
85  *
86  * Notes: Update post_recv_buf_count in case of recv error completion.
87  *        For non-FLUSH error completion we should also notify iscsi layer that
88  *        connection is failed (in case we passed bind stage).
89  */
90 static void
91 iser_handle_comp_error(struct ib_conn *ib_conn,
92 		       struct ib_wc *wc)
93 {
94 	void *wr_id = (void *)(uintptr_t)wc->wr_id;
95 	struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
96 						   ib_conn);
97 
98 	if (is_iser_tx_desc(iser_conn, wr_id)) {
99 		ISER_DBG("conn %p got send comp error", iser_conn);
100 	} else {
101 		ISER_DBG("conn %p got recv comp error", iser_conn);
102 		ib_conn->post_recv_buf_count--;
103 	}
104 	if (wc->status != IB_WC_WR_FLUSH_ERR)
105 		iser_conn->icl_conn.ic_error(&iser_conn->icl_conn);
106 }
107 
108 /**
109  * iser_handle_wc - handle a single work completion
110  * @wc: work completion
111  *
112  * Soft-IRQ context, work completion can be either
113  * SEND or RECV, and can turn out successful or
114  * with error (or flush error).
115  */
116 static void iser_handle_wc(struct ib_wc *wc)
117 {
118 	struct ib_conn *ib_conn;
119 	struct iser_tx_desc *tx_desc;
120 	struct iser_rx_desc *rx_desc;
121 
122 	ib_conn = wc->qp->qp_context;
123 	if (likely(wc->status == IB_WC_SUCCESS)) {
124 		if (wc->opcode == IB_WC_RECV) {
125 			rx_desc = (struct iser_rx_desc *)(uintptr_t)wc->wr_id;
126 			iser_rcv_completion(rx_desc, wc->byte_len,
127 					    ib_conn);
128 		} else
129 		if (wc->opcode == IB_WC_SEND) {
130 			tx_desc = (struct iser_tx_desc *)(uintptr_t)wc->wr_id;
131 			iser_snd_completion(tx_desc, ib_conn);
132 		} else {
133 			ISER_ERR("Unknown wc opcode %d", wc->opcode);
134 		}
135 	} else {
136 		struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn,
137 					ib_conn);
138 		if (wc->status != IB_WC_WR_FLUSH_ERR) {
139 			ISER_ERR("conn %p wr id %lx status %d vend_err %x",
140 				 iser_conn, wc->wr_id, wc->status, wc->vendor_err);
141 		} else {
142 			ISER_DBG("flush error: conn %p wr id %lx", iser_conn, wc->wr_id);
143 		}
144 
145 		if (wc->wr_id == ISER_BEACON_WRID) {
146 			/* all flush errors were consumed */
147 			mtx_lock(&ib_conn->beacon.flush_lock);
148 			ISER_DBG("conn %p got ISER_BEACON_WRID", iser_conn);
149 			cv_signal(&ib_conn->beacon.flush_cv);
150 			mtx_unlock(&ib_conn->beacon.flush_lock);
151 		} else {
152 			iser_handle_comp_error(ib_conn, wc);
153 		}
154 	}
155 }
156 
157 static void
158 iser_cq_tasklet_fn(void *data, int pending)
159 {
160 	struct iser_comp *comp = (struct iser_comp *)data;
161 	struct ib_cq *cq = comp->cq;
162 	struct ib_wc *const wcs = comp->wcs;
163 	int completed = 0;
164 	int i;
165 	int n;
166 
167 	while ((n = ib_poll_cq(cq, ARRAY_SIZE(comp->wcs), wcs)) > 0) {
168 		for (i = 0; i < n; i++)
169 			iser_handle_wc(&wcs[i]);
170 
171 		completed += n;
172 		if (completed >= iser_cq_poll_limit)
173 			break;
174 	}
175 
176 	/*
177 	 * It is assumed here that arming CQ only once its empty
178 	 * would not cause interrupts to be missed.
179 	 */
180 	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
181 }
182 
183 static void
184 iser_cq_callback(struct ib_cq *cq, void *cq_context)
185 {
186 	struct iser_comp *comp = cq_context;
187 
188 	taskqueue_enqueue(comp->tq, &comp->task);
189 }
190 
191 /**
192  * iser_create_device_ib_res - creates Protection Domain (PD), Completion
193  * Queue (CQ), DMA Memory Region (DMA MR) with the device associated with
194  * the adapator.
195  *
196  * returns 0 on success, -1 on failure
197  */
198 static int
199 iser_create_device_ib_res(struct iser_device *device)
200 {
201 	struct ib_device_attr *dev_attr = &device->dev_attr;
202 	int ret, i, max_cqe;
203 
204 	ret = ib_query_device(device->ib_device, dev_attr);
205 	if (ret) {
206 		ISER_ERR("Query device failed for %s", device->ib_device->name);
207 		return (ret);
208 	}
209 
210 	if (!(dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS)) {
211 		ISER_ERR("device %s doesn't support Fastreg, "
212 			 "can't register memory", device->ib_device->name);
213 		return (1);
214 	}
215 
216 	device->comps_used = min(mp_ncpus, device->ib_device->num_comp_vectors);
217 
218 	device->comps = malloc(device->comps_used * sizeof(*device->comps),
219 		M_ISER_VERBS, M_WAITOK | M_ZERO);
220 	if (!device->comps)
221 		goto comps_err;
222 
223 	max_cqe = min(ISER_MAX_CQ_LEN, dev_attr->max_cqe);
224 
225 	ISER_DBG("using %d CQs, device %s supports %d vectors max_cqe %d",
226 		 device->comps_used, device->ib_device->name,
227 		 device->ib_device->num_comp_vectors, max_cqe);
228 
229 	device->pd = ib_alloc_pd(device->ib_device);
230 	if (IS_ERR(device->pd))
231 		goto pd_err;
232 
233 	for (i = 0; i < device->comps_used; i++) {
234 		struct iser_comp *comp = &device->comps[i];
235 
236 		comp->device = device;
237 		comp->cq = ib_create_cq(device->ib_device,
238 					iser_cq_callback,
239 					iser_cq_event_callback,
240 					(void *)comp,
241 					max_cqe, i);
242 		if (IS_ERR(comp->cq)) {
243 			comp->cq = NULL;
244 			goto cq_err;
245 		}
246 
247 		if (ib_req_notify_cq(comp->cq, IB_CQ_NEXT_COMP))
248 			goto cq_err;
249 
250 		TASK_INIT(&comp->task, 0, iser_cq_tasklet_fn, comp);
251 		comp->tq = taskqueue_create_fast("iser_taskq", M_NOWAIT,
252 				taskqueue_thread_enqueue, &comp->tq);
253 		if (!comp->tq)
254 			goto tq_err;
255 		taskqueue_start_threads(&comp->tq, 1, PI_NET, "iser taskq");
256 	}
257 
258 	device->mr = ib_get_dma_mr(device->pd, IB_ACCESS_LOCAL_WRITE |
259 				   IB_ACCESS_REMOTE_WRITE |
260 				   IB_ACCESS_REMOTE_READ);
261 	if (IS_ERR(device->mr))
262 		goto tq_err;
263 
264 	INIT_IB_EVENT_HANDLER(&device->event_handler, device->ib_device,
265 				iser_event_handler);
266 	if (ib_register_event_handler(&device->event_handler))
267 		goto handler_err;
268 
269 	return (0);
270 
271 handler_err:
272 	ib_dereg_mr(device->mr);
273 tq_err:
274 	for (i = 0; i < device->comps_used; i++) {
275 		struct iser_comp *comp = &device->comps[i];
276 		if (comp->tq)
277 			taskqueue_free(comp->tq);
278 	}
279 cq_err:
280 	for (i = 0; i < device->comps_used; i++) {
281 		struct iser_comp *comp = &device->comps[i];
282 		if (comp->cq)
283 			ib_destroy_cq(comp->cq);
284 	}
285 	ib_dealloc_pd(device->pd);
286 pd_err:
287 	free(device->comps, M_ISER_VERBS);
288 comps_err:
289 	ISER_ERR("failed to allocate an IB resource");
290 	return (1);
291 }
292 
293 /**
294  * iser_free_device_ib_res - destroy/dealloc/dereg the DMA MR,
295  * CQ and PD created with the device associated with the adapator.
296  */
297 static void
298 iser_free_device_ib_res(struct iser_device *device)
299 {
300 	int i;
301 
302 	for (i = 0; i < device->comps_used; i++) {
303 		struct iser_comp *comp = &device->comps[i];
304 
305 		taskqueue_free(comp->tq);
306 		ib_destroy_cq(comp->cq);
307 		comp->cq = NULL;
308 	}
309 
310 	(void)ib_unregister_event_handler(&device->event_handler);
311 	(void)ib_dereg_mr(device->mr);
312 	(void)ib_dealloc_pd(device->pd);
313 
314 	free(device->comps, M_ISER_VERBS);
315 	device->comps = NULL;
316 
317 	device->mr = NULL;
318 	device->pd = NULL;
319 }
320 
321 static int
322 iser_alloc_reg_res(struct ib_device *ib_device,
323 		   struct ib_pd *pd,
324 		   struct iser_reg_resources *res)
325 {
326 	int ret;
327 
328 	res->frpl = ib_alloc_fast_reg_page_list(ib_device,
329 						ISCSI_ISER_SG_TABLESIZE + 1);
330 	if (IS_ERR(res->frpl)) {
331 		ret = -PTR_ERR(res->frpl);
332 		ISER_ERR("Failed to allocate fast reg page list err=%d", ret);
333 		return (ret);
334 	}
335 
336 	res->mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE + 1);
337 	if (IS_ERR(res->mr)) {
338 		ret = -PTR_ERR(res->mr);
339 		ISER_ERR("Failed to allocate  fast reg mr err=%d", ret);
340 		goto fast_reg_mr_failure;
341 	}
342 	res->mr_valid = 1;
343 
344 	return (0);
345 
346 fast_reg_mr_failure:
347 	ib_free_fast_reg_page_list(res->frpl);
348 
349 	return (ret);
350 }
351 
352 static void
353 iser_free_reg_res(struct iser_reg_resources *rsc)
354 {
355 	ib_dereg_mr(rsc->mr);
356 	ib_free_fast_reg_page_list(rsc->frpl);
357 }
358 
359 static struct fast_reg_descriptor *
360 iser_create_fastreg_desc(struct ib_device *ib_device, struct ib_pd *pd)
361 {
362 	struct fast_reg_descriptor *desc;
363 	int ret;
364 
365 	desc = malloc(sizeof(*desc), M_ISER_VERBS, M_WAITOK | M_ZERO);
366 	if (!desc) {
367 		ISER_ERR("Failed to allocate a new fastreg descriptor");
368 		return (NULL);
369 	}
370 
371 	ret = iser_alloc_reg_res(ib_device, pd, &desc->rsc);
372 	if (ret) {
373 		ISER_ERR("failed to allocate reg_resources");
374 		goto err;
375 	}
376 
377 	return (desc);
378 err:
379 	free(desc, M_ISER_VERBS);
380 	return (NULL);
381 }
382 
383 /**
384  * iser_create_fmr_pool - Creates FMR pool and page_vector
385  *
386  * returns 0 on success, or errno code on failure
387  */
388 int
389 iser_create_fastreg_pool(struct ib_conn *ib_conn, unsigned cmds_max)
390 {
391 	struct iser_device *device = ib_conn->device;
392 	struct fast_reg_descriptor *desc;
393 	int i;
394 
395 	INIT_LIST_HEAD(&ib_conn->fastreg.pool);
396 	ib_conn->fastreg.pool_size = 0;
397 	for (i = 0; i < cmds_max; i++) {
398 		desc = iser_create_fastreg_desc(device->ib_device, device->pd);
399 		if (!desc) {
400 			ISER_ERR("Failed to create fastreg descriptor");
401 			goto err;
402 		}
403 
404 		list_add_tail(&desc->list, &ib_conn->fastreg.pool);
405 		ib_conn->fastreg.pool_size++;
406 	}
407 
408 	return (0);
409 
410 err:
411 	iser_free_fastreg_pool(ib_conn);
412 	return (ENOMEM);
413 }
414 
415 /**
416  * iser_free_fmr_pool - releases the FMR pool and page vec
417  */
418 void
419 iser_free_fastreg_pool(struct ib_conn *ib_conn)
420 {
421 	struct fast_reg_descriptor *desc, *tmp;
422 	int i = 0;
423 
424 	if (list_empty(&ib_conn->fastreg.pool))
425 		return;
426 
427 	ISER_DBG("freeing conn %p fr pool", ib_conn);
428 
429 	list_for_each_entry_safe(desc, tmp, &ib_conn->fastreg.pool, list) {
430 		list_del(&desc->list);
431 		iser_free_reg_res(&desc->rsc);
432 		free(desc, M_ISER_VERBS);
433 		++i;
434 	}
435 
436 	if (i < ib_conn->fastreg.pool_size)
437 		ISER_WARN("pool still has %d regions registered",
438 			  ib_conn->fastreg.pool_size - i);
439 }
440 
441 /**
442  * iser_create_ib_conn_res - Queue-Pair (QP)
443  *
444  * returns 0 on success, 1 on failure
445  */
446 static int
447 iser_create_ib_conn_res(struct ib_conn *ib_conn)
448 {
449 	struct iser_conn *iser_conn;
450 	struct iser_device *device;
451 	struct ib_device_attr *dev_attr;
452 	struct ib_qp_init_attr init_attr;
453 	int index, min_index = 0;
454 	int ret = -ENOMEM;
455 
456 	iser_conn = container_of(ib_conn, struct iser_conn, ib_conn);
457 	device = ib_conn->device;
458 	dev_attr = &device->dev_attr;
459 
460 	mtx_lock(&ig.connlist_mutex);
461 	/* select the CQ with the minimal number of usages */
462 	for (index = 0; index < device->comps_used; index++) {
463 		if (device->comps[index].active_qps <
464 		    device->comps[min_index].active_qps)
465 			min_index = index;
466 	}
467 	ib_conn->comp = &device->comps[min_index];
468 	ib_conn->comp->active_qps++;
469 	mtx_unlock(&ig.connlist_mutex);
470 	ISER_INFO("cq index %d used for ib_conn %p", min_index, ib_conn);
471 
472 	memset(&init_attr, 0, sizeof init_attr);
473 	init_attr.event_handler = iser_qp_event_callback;
474 	init_attr.qp_context	= (void *)ib_conn;
475 	init_attr.send_cq	= ib_conn->comp->cq;
476 	init_attr.recv_cq	= ib_conn->comp->cq;
477 	init_attr.cap.max_recv_wr  = ISER_QP_MAX_RECV_DTOS;
478 	init_attr.cap.max_send_sge = 2;
479 	init_attr.cap.max_recv_sge = 1;
480 	init_attr.sq_sig_type	= IB_SIGNAL_REQ_WR;
481 	init_attr.qp_type	= IB_QPT_RC;
482 
483 	if (dev_attr->max_qp_wr > ISER_QP_MAX_REQ_DTOS) {
484 		init_attr.cap.max_send_wr  = ISER_QP_MAX_REQ_DTOS;
485 		iser_conn->max_cmds =
486 			ISER_GET_MAX_XMIT_CMDS(ISER_QP_MAX_REQ_DTOS);
487 	} else {
488 		init_attr.cap.max_send_wr = dev_attr->max_qp_wr;
489 		iser_conn->max_cmds =
490 			ISER_GET_MAX_XMIT_CMDS(dev_attr->max_qp_wr);
491 	}
492 	ISER_DBG("device %s supports max_send_wr %d",
493 	         device->ib_device->name, dev_attr->max_qp_wr);
494 
495 	ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr);
496 	if (ret)
497 		goto out_err;
498 
499 	ib_conn->qp = ib_conn->cma_id->qp;
500 	ISER_DBG("setting conn %p cma_id %p qp %p",
501 		 ib_conn, ib_conn->cma_id,
502 		 ib_conn->cma_id->qp);
503 
504 	return (ret);
505 
506 out_err:
507 	mtx_lock(&ig.connlist_mutex);
508 	ib_conn->comp->active_qps--;
509 	mtx_unlock(&ig.connlist_mutex);
510 	ISER_ERR("unable to alloc mem or create resource, err %d", ret);
511 
512 	return (ret);
513 }
514 
515 /**
516  * based on the resolved device node GUID see if there already allocated
517  * device for this device. If there's no such, create one.
518  */
519 static struct iser_device *
520 iser_device_find_by_ib_device(struct rdma_cm_id *cma_id)
521 {
522 	struct iser_device *device;
523 
524 	sx_xlock(&ig.device_list_mutex);
525 
526 	list_for_each_entry(device, &ig.device_list, ig_list)
527 		/* find if there's a match using the node GUID */
528 		if (device->ib_device->node_guid == cma_id->device->node_guid)
529 			goto inc_refcnt;
530 
531 	device = malloc(sizeof *device, M_ISER_VERBS, M_WAITOK | M_ZERO);
532 	if (device == NULL)
533 		goto out;
534 
535 	/* assign this device to the device */
536 	device->ib_device = cma_id->device;
537 	/* init the device and link it into ig device list */
538 	if (iser_create_device_ib_res(device)) {
539 		free(device, M_ISER_VERBS);
540 		device = NULL;
541 		goto out;
542 	}
543 	list_add(&device->ig_list, &ig.device_list);
544 
545 inc_refcnt:
546 	device->refcount++;
547 	ISER_INFO("device %p refcount %d", device, device->refcount);
548 out:
549 	sx_xunlock(&ig.device_list_mutex);
550 	return (device);
551 }
552 
553 /* if there's no demand for this device, release it */
554 static void
555 iser_device_try_release(struct iser_device *device)
556 {
557 	sx_xlock(&ig.device_list_mutex);
558 	device->refcount--;
559 	ISER_INFO("device %p refcount %d", device, device->refcount);
560 	if (!device->refcount) {
561 		iser_free_device_ib_res(device);
562 		list_del(&device->ig_list);
563 		free(device, M_ISER_VERBS);
564 		device = NULL;
565 	}
566 	sx_xunlock(&ig.device_list_mutex);
567 }
568 
569 /**
570  * Called with state mutex held
571  **/
572 static int iser_conn_state_comp_exch(struct iser_conn *iser_conn,
573 				     enum iser_conn_state comp,
574 				     enum iser_conn_state exch)
575 {
576 	int ret;
577 
578 	ret = (iser_conn->state == comp);
579 	if (ret)
580 		iser_conn->state = exch;
581 
582 	return ret;
583 }
584 
585 /**
586  * iser_free_ib_conn_res - release IB related resources
587  * @iser_conn: iser connection struct
588  * @destroy: indicator if we need to try to release the
589  *     iser device and memory regoins pool (only iscsi
590  *     shutdown and DEVICE_REMOVAL will use this).
591  *
592  * This routine is called with the iser state mutex held
593  * so the cm_id removal is out of here. It is Safe to
594  * be invoked multiple times.
595  */
596 void
597 iser_free_ib_conn_res(struct iser_conn *iser_conn,
598 				  bool destroy)
599 {
600 	struct ib_conn *ib_conn = &iser_conn->ib_conn;
601 	struct iser_device *device = ib_conn->device;
602 
603 	ISER_INFO("freeing conn %p cma_id %p qp %p",
604 		  iser_conn, ib_conn->cma_id, ib_conn->qp);
605 
606 	if (ib_conn->qp != NULL) {
607 		mtx_lock(&ig.connlist_mutex);
608 		ib_conn->comp->active_qps--;
609 		mtx_unlock(&ig.connlist_mutex);
610 		rdma_destroy_qp(ib_conn->cma_id);
611 		ib_conn->qp = NULL;
612 	}
613 
614 	if (destroy) {
615 		if (iser_conn->login_buf)
616 			iser_free_login_buf(iser_conn);
617 
618 		if (iser_conn->rx_descs)
619 			iser_free_rx_descriptors(iser_conn);
620 
621 		if (device != NULL) {
622 			iser_device_try_release(device);
623 			ib_conn->device = NULL;
624 		}
625 	}
626 }
627 
628 /**
629  * triggers start of the disconnect procedures and wait for them to be done
630  * Called with state mutex held
631  */
632 int
633 iser_conn_terminate(struct iser_conn *iser_conn)
634 {
635 	struct ib_conn *ib_conn = &iser_conn->ib_conn;
636 	struct ib_send_wr *bad_send_wr;
637 	struct ib_recv_wr *bad_recv_wr;
638 	int err = 0;
639 
640 	/* terminate the iser conn only if the conn state is UP */
641 	if (!iser_conn_state_comp_exch(iser_conn, ISER_CONN_UP,
642 					   ISER_CONN_TERMINATING))
643 		return (0);
644 
645 	ISER_INFO("iser_conn %p state %d\n", iser_conn, iser_conn->state);
646 
647 	if (ib_conn->qp == NULL) {
648 		/* HOW can this be??? */
649 		ISER_WARN("qp wasn't created");
650 		return (1);
651 	}
652 
653 	/*
654 	 * Todo: This is a temporary workaround.
655 	 * We serialize the connection closure using global lock in order to
656 	 * receive all posted beacons completions.
657 	 * Without Serialization, in case we open many connections (QPs) on
658 	 * the same CQ, we might miss beacons because of missing interrupts.
659 	 */
660 	sx_xlock(&ig.close_conns_mutex);
661 
662 	/*
663 	 * In case we didn't already clean up the cma_id (peer initiated
664 	 * a disconnection), we need to Cause the CMA to change the QP
665 	 * state to ERROR.
666 	 */
667 	if (ib_conn->cma_id) {
668 		err = rdma_disconnect(ib_conn->cma_id);
669 		if (err)
670 			ISER_ERR("Failed to disconnect, conn: 0x%p err %d",
671 				iser_conn, err);
672 
673 		mtx_lock(&ib_conn->beacon.flush_lock);
674 		memset(&ib_conn->beacon.send, 0, sizeof(struct ib_send_wr));
675 		ib_conn->beacon.send.wr_id = ISER_BEACON_WRID;
676 		ib_conn->beacon.send.opcode = IB_WR_SEND;
677 		/* post an indication that all send flush errors were consumed */
678 		err = ib_post_send(ib_conn->qp, &ib_conn->beacon.send, &bad_send_wr);
679 		if (err) {
680 			ISER_ERR("conn %p failed to post send_beacon", ib_conn);
681 			mtx_unlock(&ib_conn->beacon.flush_lock);
682 			goto out;
683 		}
684 
685 		ISER_DBG("before send cv_wait: %p", iser_conn);
686 		cv_wait(&ib_conn->beacon.flush_cv, &ib_conn->beacon.flush_lock);
687 		ISER_DBG("after send cv_wait: %p", iser_conn);
688 
689 		memset(&ib_conn->beacon.recv, 0, sizeof(struct ib_recv_wr));
690 		ib_conn->beacon.recv.wr_id = ISER_BEACON_WRID;
691 		/* post an indication that all recv flush errors were consumed */
692 		err = ib_post_recv(ib_conn->qp, &ib_conn->beacon.recv, &bad_recv_wr);
693 		if (err) {
694 			ISER_ERR("conn %p failed to post recv_beacon", ib_conn);
695 			mtx_unlock(&ib_conn->beacon.flush_lock);
696 			goto out;
697 		}
698 
699 		ISER_DBG("before recv cv_wait: %p", iser_conn);
700 		cv_wait(&ib_conn->beacon.flush_cv, &ib_conn->beacon.flush_lock);
701 		mtx_unlock(&ib_conn->beacon.flush_lock);
702 		ISER_DBG("after recv cv_wait: %p", iser_conn);
703 	}
704 out:
705 	sx_xunlock(&ig.close_conns_mutex);
706 	return (1);
707 }
708 
709 /**
710  * Called with state mutex held
711  **/
712 static void
713 iser_connect_error(struct rdma_cm_id *cma_id)
714 {
715 	struct iser_conn *iser_conn;
716 
717 	iser_conn = cma_id->context;
718 
719 	ISER_ERR("conn %p", iser_conn);
720 
721 	iser_conn->state = ISER_CONN_TERMINATING;
722 
723 	cv_signal(&iser_conn->up_cv);
724 }
725 
726 /**
727  * Called with state mutex held
728  **/
729 static void
730 iser_addr_handler(struct rdma_cm_id *cma_id)
731 {
732 	struct iser_device *device;
733 	struct iser_conn   *iser_conn;
734 	struct ib_conn   *ib_conn;
735 	int    ret;
736 
737 	iser_conn = cma_id->context;
738 
739 	ib_conn = &iser_conn->ib_conn;
740 	device = iser_device_find_by_ib_device(cma_id);
741 	if (!device) {
742 		ISER_ERR("conn %p device lookup/creation failed",
743 			 iser_conn);
744 		iser_connect_error(cma_id);
745 		return;
746 	}
747 
748 	ib_conn->device = device;
749 
750 	ret = rdma_resolve_route(cma_id, 1000);
751 	if (ret) {
752 		ISER_ERR("conn %p resolve route failed: %d", iser_conn, ret);
753 		iser_connect_error(cma_id);
754 		return;
755 	}
756 }
757 
758 /**
759  * Called with state mutex held
760  **/
761 static void
762 iser_route_handler(struct rdma_cm_id *cma_id)
763 {
764 	struct rdma_conn_param conn_param;
765 	int    ret;
766 	struct iser_cm_hdr req_hdr;
767 	struct iser_conn *iser_conn = cma_id->context;
768 	struct ib_conn *ib_conn = &iser_conn->ib_conn;
769 	struct iser_device *device = ib_conn->device;
770 
771 	ret = iser_create_ib_conn_res(ib_conn);
772 	if (ret)
773 		goto failure;
774 
775 	memset(&conn_param, 0, sizeof conn_param);
776 	conn_param.responder_resources = device->dev_attr.max_qp_rd_atom;
777 	conn_param.retry_count	       = 7;
778 	conn_param.rnr_retry_count     = 6;
779 	/*
780 	 * Initiaotr depth should not be set, but in order to compat
781 	 * with old targets, we keep this value set.
782 	 */
783 	conn_param.initiator_depth     = 1;
784 
785 	memset(&req_hdr, 0, sizeof(req_hdr));
786 	req_hdr.flags = (ISER_ZBVA_NOT_SUPPORTED |
787 			ISER_SEND_W_INV_NOT_SUPPORTED);
788 	conn_param.private_data		= (void *)&req_hdr;
789 	conn_param.private_data_len	= sizeof(struct iser_cm_hdr);
790 
791 	ret = rdma_connect(cma_id, &conn_param);
792 	if (ret) {
793 		ISER_ERR("conn %p failure connecting: %d", iser_conn, ret);
794 		goto failure;
795 	}
796 
797 	return;
798 failure:
799 	iser_connect_error(cma_id);
800 }
801 
802 /**
803  * Called with state mutex held
804  **/
805 static void
806 iser_connected_handler(struct rdma_cm_id *cma_id)
807 {
808 	struct iser_conn *iser_conn;
809 	struct ib_qp_attr attr;
810 	struct ib_qp_init_attr init_attr;
811 
812 	iser_conn = cma_id->context;
813 
814 	(void)ib_query_qp(cma_id->qp, &attr, ~0, &init_attr);
815 
816 	ISER_INFO("remote qpn:%x my qpn:%x",
817 		  attr.dest_qp_num, cma_id->qp->qp_num);
818 
819 	iser_conn->state = ISER_CONN_UP;
820 
821 	cv_signal(&iser_conn->up_cv);
822 }
823 
824 /**
825  * Called with state mutex held
826  **/
827 static void
828 iser_cleanup_handler(struct rdma_cm_id *cma_id, bool destroy)
829 {
830 	struct iser_conn *iser_conn = cma_id->context;
831 
832 	if (iser_conn_terminate(iser_conn))
833 		iser_conn->icl_conn.ic_error(&iser_conn->icl_conn);
834 
835 }
836 
837 int
838 iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
839 {
840 	struct iser_conn *iser_conn;
841 	int ret = 0;
842 
843 	iser_conn = cma_id->context;
844 	ISER_INFO("event %d status %d conn %p id %p",
845 		  event->event, event->status, cma_id->context, cma_id);
846 
847 	sx_xlock(&iser_conn->state_mutex);
848 	switch (event->event) {
849 	case RDMA_CM_EVENT_ADDR_RESOLVED:
850 		iser_addr_handler(cma_id);
851 		break;
852 	case RDMA_CM_EVENT_ROUTE_RESOLVED:
853 		iser_route_handler(cma_id);
854 		break;
855 	case RDMA_CM_EVENT_ESTABLISHED:
856 		iser_connected_handler(cma_id);
857 		break;
858 	case RDMA_CM_EVENT_ADDR_ERROR:
859 	case RDMA_CM_EVENT_ROUTE_ERROR:
860 	case RDMA_CM_EVENT_CONNECT_ERROR:
861 	case RDMA_CM_EVENT_UNREACHABLE:
862 	case RDMA_CM_EVENT_REJECTED:
863 		iser_connect_error(cma_id);
864 		break;
865 	case RDMA_CM_EVENT_DISCONNECTED:
866 	case RDMA_CM_EVENT_ADDR_CHANGE:
867 	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
868 		iser_cleanup_handler(cma_id, false);
869 		break;
870 	default:
871 		ISER_ERR("Unexpected RDMA CM event (%d)", event->event);
872 		break;
873 	}
874 	sx_xunlock(&iser_conn->state_mutex);
875 
876 	return (ret);
877 }
878 
879 int
880 iser_post_recvl(struct iser_conn *iser_conn)
881 {
882 	struct ib_recv_wr rx_wr, *rx_wr_failed;
883 	struct ib_conn *ib_conn = &iser_conn->ib_conn;
884 	struct ib_sge	  sge;
885 	int ib_ret;
886 
887 	sge.addr   = iser_conn->login_resp_dma;
888 	sge.length = ISER_RX_LOGIN_SIZE;
889 	sge.lkey   = ib_conn->device->mr->lkey;
890 
891 	rx_wr.wr_id   = (uintptr_t)iser_conn->login_resp_buf;
892 	rx_wr.sg_list = &sge;
893 	rx_wr.num_sge = 1;
894 	rx_wr.next    = NULL;
895 
896 	ib_conn->post_recv_buf_count++;
897 	ib_ret	= ib_post_recv(ib_conn->qp, &rx_wr, &rx_wr_failed);
898 	if (ib_ret) {
899 		ISER_ERR("ib_post_recv failed ret=%d", ib_ret);
900 		ib_conn->post_recv_buf_count--;
901 	}
902 
903 	return (ib_ret);
904 }
905 
906 int
907 iser_post_recvm(struct iser_conn *iser_conn, int count)
908 {
909 	struct ib_recv_wr *rx_wr, *rx_wr_failed;
910 	int i, ib_ret;
911 	struct ib_conn *ib_conn = &iser_conn->ib_conn;
912 	unsigned int my_rx_head = iser_conn->rx_desc_head;
913 	struct iser_rx_desc *rx_desc;
914 
915 	for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) {
916 		rx_desc		= &iser_conn->rx_descs[my_rx_head];
917 		rx_wr->wr_id	= (uintptr_t)rx_desc;
918 		rx_wr->sg_list	= &rx_desc->rx_sg;
919 		rx_wr->num_sge	= 1;
920 		rx_wr->next	= rx_wr + 1;
921 		my_rx_head = (my_rx_head + 1) % iser_conn->qp_max_recv_dtos;
922 	}
923 
924 	rx_wr--;
925 	rx_wr->next = NULL; /* mark end of work requests list */
926 
927 	ib_conn->post_recv_buf_count += count;
928 	ib_ret	= ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &rx_wr_failed);
929 	if (ib_ret) {
930 		ISER_ERR("ib_post_recv failed ret=%d", ib_ret);
931 		ib_conn->post_recv_buf_count -= count;
932 	} else
933 		iser_conn->rx_desc_head = my_rx_head;
934 
935 	return (ib_ret);
936 }
937 
938 /**
939  * iser_start_send - Initiate a Send DTO operation
940  *
941  * returns 0 on success, -1 on failure
942  */
943 int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc,
944 		   bool signal)
945 {
946 	int		  ib_ret;
947 	struct ib_send_wr send_wr, *send_wr_failed;
948 
949 	ib_dma_sync_single_for_device(ib_conn->device->ib_device,
950 				      tx_desc->dma_addr, ISER_HEADERS_LEN,
951 				      DMA_TO_DEVICE);
952 
953 	send_wr.next	   = NULL;
954 	send_wr.wr_id	   = (uintptr_t)tx_desc;
955 	send_wr.sg_list	   = tx_desc->tx_sg;
956 	send_wr.num_sge	   = tx_desc->num_sge;
957 	send_wr.opcode	   = IB_WR_SEND;
958 	send_wr.send_flags = signal ? IB_SEND_SIGNALED : 0;
959 
960 	ib_ret = ib_post_send(ib_conn->qp, &send_wr, &send_wr_failed);
961 	if (ib_ret)
962 		ISER_ERR("ib_post_send failed, ret:%d", ib_ret);
963 
964 	return (ib_ret);
965 }
966