xref: /linux/drivers/infiniband/sw/siw/siw_cm.c (revision ea4f6f6c53577fb3f05dbd78b15e586772d49831)
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 
3 /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
4 /*          Fredy Neeser */
5 /*          Greg Joyce <greg@opengridcomputing.com> */
6 /* Copyright (c) 2008-2019, IBM Corporation */
7 /* Copyright (c) 2017, Open Grid Computing, Inc. */
8 
9 #include <linux/errno.h>
10 #include <linux/types.h>
11 #include <linux/net.h>
12 #include <linux/inetdevice.h>
13 #include <net/addrconf.h>
14 #include <linux/workqueue.h>
15 #include <net/sock.h>
16 #include <net/tcp.h>
17 #include <linux/inet.h>
18 #include <linux/tcp.h>
19 #include <trace/events/sock.h>
20 
21 #include <rdma/iw_cm.h>
22 #include <rdma/ib_verbs.h>
23 #include <rdma/ib_user_verbs.h>
24 
25 #include "siw.h"
26 #include "siw_cm.h"
27 
28 /*
29  * Set to any combination of
30  * MPA_V2_RDMA_NO_RTR, MPA_V2_RDMA_READ_RTR, MPA_V2_RDMA_WRITE_RTR
31  */
32 static __be16 rtr_type = MPA_V2_RDMA_READ_RTR | MPA_V2_RDMA_WRITE_RTR;
33 static const bool relaxed_ird_negotiation = true;
34 
35 static void siw_cm_llp_state_change(struct sock *s);
36 static void siw_cm_llp_data_ready(struct sock *s);
37 static void siw_cm_llp_write_space(struct sock *s);
38 static void siw_cm_llp_error_report(struct sock *s);
39 static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason,
40 			 int status);
41 
42 
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * Private lockdep classes for siw sockets, index 0 for AF_INET and
 * index 1 for AF_INET6.
 *
 * By default sockets are classified by family only, not by IP
 * protocol. With user-space socket API users, or in-kernel users
 * switching between a tcp and an rdma transport (maybe also between
 * siw and rxe), lockdep can then report false positive circular
 * dependencies, since the application and the low level sockets may
 * use different locks.
 *
 * Such problems were seen with ksmbd.ko and cifs.ko when switching
 * transports; use git blame to find more details.
 */
static struct lock_class_key siw_sk_key[2];
static struct lock_class_key siw_slock_key[2];
#endif /* CONFIG_DEBUG_LOCK_ALLOC */

/*
 * siw_reclassify_socket()
 *
 * Move the locks of @sock into the siw specific lockdep classes.
 * Compiles to an empty function without CONFIG_DEBUG_LOCK_ALLOC.
 * Warns once and leaves the socket untouched if reclassification is
 * not allowed or the address family is neither AF_INET nor AF_INET6.
 */
static inline void siw_reclassify_socket(struct socket *sock)
{
#ifdef CONFIG_DEBUG_LOCK_ALLOC
	struct sock *sk = sock->sk;

	if (WARN_ON_ONCE(!sock_allow_reclassification(sk)))
		return;

	if (sk->sk_family == AF_INET) {
		sock_lock_init_class_and_name(sk,
					      "slock-AF_INET-RDMA-SIW",
					      &siw_slock_key[0],
					      "sk_lock-AF_INET-RDMA-SIW",
					      &siw_sk_key[0]);
	} else if (sk->sk_family == AF_INET6) {
		sock_lock_init_class_and_name(sk,
					      "slock-AF_INET6-RDMA-SIW",
					      &siw_slock_key[1],
					      "sk_lock-AF_INET6-RDMA-SIW",
					      &siw_sk_key[1]);
	} else {
		WARN_ON_ONCE(1);
	}
#endif /* CONFIG_DEBUG_LOCK_ALLOC */
}
90 
91 static void siw_sk_assign_cm_upcalls(struct sock *sk)
92 {
93 	struct siw_cep *cep = sk_to_cep(sk);
94 
95 	write_lock_bh(&sk->sk_callback_lock);
96 	cep->sk_state_change = sk->sk_state_change;
97 	cep->sk_data_ready = sk->sk_data_ready;
98 	cep->sk_write_space = sk->sk_write_space;
99 	cep->sk_error_report = sk->sk_error_report;
100 
101 	sk->sk_state_change = siw_cm_llp_state_change;
102 	sk->sk_data_ready = siw_cm_llp_data_ready;
103 	sk->sk_write_space = siw_cm_llp_write_space;
104 	sk->sk_error_report = siw_cm_llp_error_report;
105 	write_unlock_bh(&sk->sk_callback_lock);
106 }
107 
108 static void siw_sk_restore_upcalls(struct sock *sk, struct siw_cep *cep)
109 {
110 	sk->sk_state_change = cep->sk_state_change;
111 	sk->sk_data_ready = cep->sk_data_ready;
112 	sk->sk_write_space = cep->sk_write_space;
113 	sk->sk_error_report = cep->sk_error_report;
114 	sk->sk_user_data = NULL;
115 }
116 
117 static void siw_qp_socket_assoc(struct siw_cep *cep, struct siw_qp *qp)
118 {
119 	struct socket *s = cep->sock;
120 	struct sock *sk = s->sk;
121 
122 	write_lock_bh(&sk->sk_callback_lock);
123 
124 	qp->attrs.sk = s;
125 	sk->sk_data_ready = siw_qp_llp_data_ready;
126 	sk->sk_write_space = siw_qp_llp_write_space;
127 
128 	write_unlock_bh(&sk->sk_callback_lock);
129 }
130 
131 static void siw_socket_disassoc(struct socket *s)
132 {
133 	struct sock *sk = s->sk;
134 	struct siw_cep *cep;
135 
136 	if (sk) {
137 		write_lock_bh(&sk->sk_callback_lock);
138 		cep = sk_to_cep(sk);
139 		if (cep) {
140 			siw_sk_restore_upcalls(sk, cep);
141 			cep->sock = NULL;
142 			siw_cep_put(cep);
143 		} else {
144 			pr_warn("siw: cannot restore sk callbacks: no ep\n");
145 		}
146 		write_unlock_bh(&sk->sk_callback_lock);
147 	} else {
148 		pr_warn("siw: cannot restore sk callbacks: no sk\n");
149 	}
150 }
151 
152 static void siw_rtr_data_ready(struct sock *sk)
153 {
154 	struct siw_cep *cep;
155 	struct siw_qp *qp = NULL;
156 	read_descriptor_t rd_desc;
157 
158 	trace_sk_data_ready(sk);
159 
160 	read_lock(&sk->sk_callback_lock);
161 
162 	cep = sk_to_cep(sk);
163 	if (!cep) {
164 		WARN(1, "No connection endpoint\n");
165 		goto out;
166 	}
167 	qp = sk_to_qp(sk);
168 
169 	memset(&rd_desc, 0, sizeof(rd_desc));
170 	rd_desc.arg.data = qp;
171 	rd_desc.count = 1;
172 
173 	tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data);
174 	/*
175 	 * Check if first frame was successfully processed.
176 	 * Signal connection full establishment if yes.
177 	 * Failed data processing would have already scheduled
178 	 * connection drop.
179 	 */
180 	if (!qp->rx_stream.rx_suspend)
181 		siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0);
182 out:
183 	read_unlock(&sk->sk_callback_lock);
184 	if (qp)
185 		siw_qp_socket_assoc(cep, qp);
186 }
187 
188 static void siw_sk_assign_rtr_upcalls(struct siw_cep *cep)
189 {
190 	struct sock *sk = cep->sock->sk;
191 
192 	write_lock_bh(&sk->sk_callback_lock);
193 	sk->sk_data_ready = siw_rtr_data_ready;
194 	sk->sk_write_space = siw_qp_llp_write_space;
195 	write_unlock_bh(&sk->sk_callback_lock);
196 }
197 
198 static void siw_cep_socket_assoc(struct siw_cep *cep, struct socket *s)
199 {
200 	cep->sock = s;
201 	siw_cep_get(cep);
202 	s->sk->sk_user_data = cep;
203 
204 	siw_sk_assign_cm_upcalls(s->sk);
205 }
206 
207 static struct siw_cep *siw_cep_alloc(struct siw_device *sdev)
208 {
209 	struct siw_cep *cep = kzalloc_obj(*cep);
210 	unsigned long flags;
211 
212 	if (!cep)
213 		return NULL;
214 
215 	INIT_LIST_HEAD(&cep->listenq);
216 	INIT_LIST_HEAD(&cep->devq);
217 	INIT_LIST_HEAD(&cep->work_freelist);
218 
219 	kref_init(&cep->ref);
220 	cep->state = SIW_EPSTATE_IDLE;
221 	init_waitqueue_head(&cep->waitq);
222 	spin_lock_init(&cep->lock);
223 	cep->sdev = sdev;
224 	cep->enhanced_rdma_conn_est = false;
225 
226 	spin_lock_irqsave(&sdev->lock, flags);
227 	list_add_tail(&cep->devq, &sdev->cep_list);
228 	spin_unlock_irqrestore(&sdev->lock, flags);
229 
230 	siw_dbg_cep(cep, "new endpoint\n");
231 	return cep;
232 }
233 
234 static void siw_cm_free_work(struct siw_cep *cep)
235 {
236 	struct list_head *w, *tmp;
237 	struct siw_cm_work *work;
238 
239 	list_for_each_safe(w, tmp, &cep->work_freelist) {
240 		work = list_entry(w, struct siw_cm_work, list);
241 		list_del(&work->list);
242 		kfree(work);
243 	}
244 }
245 
246 static void siw_cancel_mpatimer(struct siw_cep *cep)
247 {
248 	spin_lock_bh(&cep->lock);
249 	if (cep->mpa_timer) {
250 		if (cancel_delayed_work(&cep->mpa_timer->work)) {
251 			siw_cep_put(cep);
252 			kfree(cep->mpa_timer); /* not needed again */
253 		}
254 		cep->mpa_timer = NULL;
255 	}
256 	spin_unlock_bh(&cep->lock);
257 }
258 
259 static void siw_put_work(struct siw_cm_work *work)
260 {
261 	INIT_LIST_HEAD(&work->list);
262 	spin_lock_bh(&work->cep->lock);
263 	list_add(&work->list, &work->cep->work_freelist);
264 	spin_unlock_bh(&work->cep->lock);
265 }
266 
267 static void siw_cep_set_inuse(struct siw_cep *cep)
268 {
269 	unsigned long flags;
270 retry:
271 	spin_lock_irqsave(&cep->lock, flags);
272 
273 	if (cep->in_use) {
274 		spin_unlock_irqrestore(&cep->lock, flags);
275 		wait_event_interruptible(cep->waitq, !cep->in_use);
276 		if (signal_pending(current))
277 			flush_signals(current);
278 		goto retry;
279 	} else {
280 		cep->in_use = 1;
281 		spin_unlock_irqrestore(&cep->lock, flags);
282 	}
283 }
284 
285 static void siw_cep_set_free(struct siw_cep *cep)
286 {
287 	unsigned long flags;
288 
289 	spin_lock_irqsave(&cep->lock, flags);
290 	cep->in_use = 0;
291 	spin_unlock_irqrestore(&cep->lock, flags);
292 
293 	wake_up(&cep->waitq);
294 }
295 
296 static void __siw_cep_dealloc(struct kref *ref)
297 {
298 	struct siw_cep *cep = container_of(ref, struct siw_cep, ref);
299 	struct siw_device *sdev = cep->sdev;
300 	unsigned long flags;
301 
302 	WARN_ON(cep->listen_cep);
303 
304 	/* kfree(NULL) is safe */
305 	kfree(cep->mpa.pdata);
306 	spin_lock_bh(&cep->lock);
307 	if (!list_empty(&cep->work_freelist))
308 		siw_cm_free_work(cep);
309 	spin_unlock_bh(&cep->lock);
310 
311 	spin_lock_irqsave(&sdev->lock, flags);
312 	list_del(&cep->devq);
313 	spin_unlock_irqrestore(&sdev->lock, flags);
314 
315 	siw_dbg_cep(cep, "free endpoint\n");
316 	kfree(cep);
317 }
318 
319 static struct siw_cm_work *siw_get_work(struct siw_cep *cep)
320 {
321 	struct siw_cm_work *work = NULL;
322 
323 	spin_lock_bh(&cep->lock);
324 	if (!list_empty(&cep->work_freelist)) {
325 		work = list_entry(cep->work_freelist.next, struct siw_cm_work,
326 				  list);
327 		list_del_init(&work->list);
328 	}
329 	spin_unlock_bh(&cep->lock);
330 	return work;
331 }
332 
333 static int siw_cm_alloc_work(struct siw_cep *cep, int num)
334 {
335 	struct siw_cm_work *work;
336 
337 	while (num--) {
338 		work = kmalloc_obj(*work);
339 		if (!work) {
340 			if (!(list_empty(&cep->work_freelist)))
341 				siw_cm_free_work(cep);
342 			return -ENOMEM;
343 		}
344 		work->cep = cep;
345 		INIT_LIST_HEAD(&work->list);
346 		list_add(&work->list, &cep->work_freelist);
347 	}
348 	return 0;
349 }
350 
351 /*
352  * siw_cm_upcall()
353  *
354  * Upcall to IWCM to inform about async connection events
355  */
356 static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason,
357 			 int status)
358 {
359 	struct iw_cm_event event;
360 	struct iw_cm_id *id;
361 
362 	memset(&event, 0, sizeof(event));
363 	event.status = status;
364 	event.event = reason;
365 
366 	if (reason == IW_CM_EVENT_CONNECT_REQUEST) {
367 		event.provider_data = cep;
368 		id = cep->listen_cep->cm_id;
369 	} else {
370 		id = cep->cm_id;
371 	}
372 	/* Signal IRD and ORD */
373 	if (reason == IW_CM_EVENT_ESTABLISHED ||
374 	    reason == IW_CM_EVENT_CONNECT_REPLY) {
375 		/* Signal negotiated IRD/ORD values we will use */
376 		event.ird = cep->ird;
377 		event.ord = cep->ord;
378 	} else if (reason == IW_CM_EVENT_CONNECT_REQUEST) {
379 		event.ird = cep->ord;
380 		event.ord = cep->ird;
381 	}
382 	/* Signal private data and address information */
383 	if (reason == IW_CM_EVENT_CONNECT_REQUEST ||
384 	    reason == IW_CM_EVENT_CONNECT_REPLY) {
385 		u16 pd_len = be16_to_cpu(cep->mpa.hdr.params.pd_len);
386 
387 		if (pd_len) {
388 			/*
389 			 * hand over MPA private data
390 			 */
391 			event.private_data_len = pd_len;
392 			event.private_data = cep->mpa.pdata;
393 
394 			/* Hide MPA V2 IRD/ORD control */
395 			if (cep->enhanced_rdma_conn_est) {
396 				event.private_data_len -=
397 					sizeof(struct mpa_v2_data);
398 				event.private_data +=
399 					sizeof(struct mpa_v2_data);
400 			}
401 		}
402 		getname_local(cep->sock, &event.local_addr);
403 		getname_peer(cep->sock, &event.remote_addr);
404 	}
405 	siw_dbg_cep(cep, "[QP %u]: reason=%d, status=%d\n",
406 		    cep->qp ? qp_id(cep->qp) : UINT_MAX, reason, status);
407 
408 	return id->event_handler(id, &event);
409 }
410 
411 static void siw_free_cm_id(struct siw_cep *cep)
412 {
413 	if (!cep->cm_id)
414 		return;
415 
416 	cep->cm_id->rem_ref(cep->cm_id);
417 	cep->cm_id = NULL;
418 }
419 
420 static void siw_destroy_cep_sock(struct siw_cep *cep)
421 {
422 	struct socket *s = cep->sock;
423 
424 	if (s) {
425 		siw_socket_disassoc(s);
426 		sock_release(s);
427 	}
428 }
429 
430 /*
431  * siw_qp_cm_drop()
432  *
433  * Drops established LLP connection if present and not already
434  * scheduled for dropping. Called from user context, SQ workqueue
435  * or receive IRQ. Caller signals if socket can be immediately
436  * closed (basically, if not in IRQ).
437  */
438 void siw_qp_cm_drop(struct siw_qp *qp, int schedule)
439 {
440 	struct siw_cep *cep = qp->cep;
441 
442 	qp->rx_stream.rx_suspend = 1;
443 	qp->tx_ctx.tx_suspend = 1;
444 
445 	if (!qp->cep)
446 		return;
447 
448 	if (schedule) {
449 		siw_cm_queue_work(cep, SIW_CM_WORK_CLOSE_LLP);
450 	} else {
451 		siw_cep_set_inuse(cep);
452 
453 		if (cep->state == SIW_EPSTATE_CLOSED) {
454 			siw_dbg_cep(cep, "already closed\n");
455 			goto out;
456 		}
457 		siw_dbg_cep(cep, "immediate close, state %d\n", cep->state);
458 
459 		siw_send_terminate(qp);
460 
461 		if (cep->cm_id) {
462 			switch (cep->state) {
463 			case SIW_EPSTATE_AWAIT_MPAREP:
464 				siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
465 					      -EINVAL);
466 				break;
467 
468 			case SIW_EPSTATE_RDMA_MODE:
469 				siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
470 				break;
471 
472 			case SIW_EPSTATE_IDLE:
473 			case SIW_EPSTATE_LISTENING:
474 			case SIW_EPSTATE_CONNECTING:
475 			case SIW_EPSTATE_AWAIT_MPAREQ:
476 			case SIW_EPSTATE_RECVD_MPAREQ:
477 			case SIW_EPSTATE_CLOSED:
478 			default:
479 				break;
480 			}
481 			siw_free_cm_id(cep);
482 			siw_cep_put(cep);
483 		}
484 		cep->state = SIW_EPSTATE_CLOSED;
485 
486 		siw_destroy_cep_sock(cep);
487 		if (cep->qp) {
488 			cep->qp = NULL;
489 			siw_qp_put(qp);
490 		}
491 out:
492 		siw_cep_set_free(cep);
493 	}
494 }
495 
496 void siw_cep_put(struct siw_cep *cep)
497 {
498 	WARN_ON(kref_read(&cep->ref) < 1);
499 	kref_put(&cep->ref, __siw_cep_dealloc);
500 }
501 
/*
 * Release exclusive use of the endpoint first, then drop one
 * reference on it.
 */
static void siw_cep_set_free_and_put(struct siw_cep *cep)
{
	siw_cep_set_free(cep);
	siw_cep_put(cep);
}
507 
508 void siw_cep_get(struct siw_cep *cep)
509 {
510 	kref_get(&cep->ref);
511 }
512 
513 /*
514  * Expects params->pd_len in host byte order
515  */
516 static int siw_send_mpareqrep(struct siw_cep *cep, const void *pdata, u8 pd_len)
517 {
518 	struct socket *s = cep->sock;
519 	struct mpa_rr *rr = &cep->mpa.hdr;
520 	struct kvec iov[3];
521 	struct msghdr msg;
522 	int rv;
523 	int iovec_num = 0;
524 	int mpa_len;
525 
526 	memset(&msg, 0, sizeof(msg));
527 
528 	iov[iovec_num].iov_base = rr;
529 	iov[iovec_num].iov_len = sizeof(*rr);
530 	mpa_len = sizeof(*rr);
531 
532 	if (cep->enhanced_rdma_conn_est) {
533 		iovec_num++;
534 		iov[iovec_num].iov_base = &cep->mpa.v2_ctrl;
535 		iov[iovec_num].iov_len = sizeof(cep->mpa.v2_ctrl);
536 		mpa_len += sizeof(cep->mpa.v2_ctrl);
537 	}
538 	if (pd_len) {
539 		iovec_num++;
540 		iov[iovec_num].iov_base = (char *)pdata;
541 		iov[iovec_num].iov_len = pd_len;
542 		mpa_len += pd_len;
543 	}
544 	if (cep->enhanced_rdma_conn_est)
545 		pd_len += sizeof(cep->mpa.v2_ctrl);
546 
547 	rr->params.pd_len = cpu_to_be16(pd_len);
548 
549 	rv = kernel_sendmsg(s, &msg, iov, iovec_num + 1, mpa_len);
550 
551 	return rv < 0 ? rv : 0;
552 }
553 
554 /*
555  * Receive MPA Request/Reply header.
556  *
557  * Returns 0 if complete MPA Request/Reply header including
558  * eventual private data was received. Returns -EAGAIN if
559  * header was partially received or negative error code otherwise.
560  *
561  * Context: May be called in process context only
562  */
563 static int siw_recv_mpa_rr(struct siw_cep *cep)
564 {
565 	struct mpa_rr *hdr = &cep->mpa.hdr;
566 	struct socket *s = cep->sock;
567 	u16 pd_len;
568 	int rcvd, to_rcv;
569 
570 	if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) {
571 		rcvd = ksock_recv(s, (char *)hdr + cep->mpa.bytes_rcvd,
572 				  sizeof(struct mpa_rr) - cep->mpa.bytes_rcvd,
573 				  0);
574 		if (rcvd <= 0)
575 			return -ECONNABORTED;
576 
577 		cep->mpa.bytes_rcvd += rcvd;
578 
579 		if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr))
580 			return -EAGAIN;
581 
582 		if (be16_to_cpu(hdr->params.pd_len) > MPA_MAX_PRIVDATA)
583 			return -EPROTO;
584 	}
585 	pd_len = be16_to_cpu(hdr->params.pd_len);
586 
587 	/*
588 	 * At least the MPA Request/Reply header (frame not including
589 	 * private data) has been received.
590 	 * Receive (or continue receiving) any private data.
591 	 */
592 	to_rcv = pd_len - (cep->mpa.bytes_rcvd - sizeof(struct mpa_rr));
593 
594 	if (!to_rcv) {
595 		/*
596 		 * We must have hdr->params.pd_len == 0 and thus received a
597 		 * complete MPA Request/Reply frame.
598 		 * Check against peer protocol violation.
599 		 */
600 		u32 word;
601 
602 		rcvd = ksock_recv(s, (char *)&word, sizeof(word), MSG_DONTWAIT);
603 		if (rcvd == -EAGAIN)
604 			return 0;
605 
606 		if (rcvd == 0) {
607 			siw_dbg_cep(cep, "peer EOF\n");
608 			return -EPIPE;
609 		}
610 		if (rcvd < 0) {
611 			siw_dbg_cep(cep, "error: %d\n", rcvd);
612 			return rcvd;
613 		}
614 		siw_dbg_cep(cep, "peer sent extra data: %d\n", rcvd);
615 
616 		return -EPROTO;
617 	}
618 
619 	/*
620 	 * At this point, we must have hdr->params.pd_len != 0.
621 	 * A private data buffer gets allocated if hdr->params.pd_len != 0.
622 	 */
623 	if (!cep->mpa.pdata) {
624 		cep->mpa.pdata = kmalloc(pd_len + 4, GFP_KERNEL);
625 		if (!cep->mpa.pdata)
626 			return -ENOMEM;
627 	}
628 	rcvd = ksock_recv(
629 		s, cep->mpa.pdata + cep->mpa.bytes_rcvd - sizeof(struct mpa_rr),
630 		to_rcv + 4, MSG_DONTWAIT);
631 
632 	if (rcvd < 0)
633 		return rcvd;
634 
635 	if (rcvd > to_rcv)
636 		return -EPROTO;
637 
638 	cep->mpa.bytes_rcvd += rcvd;
639 
640 	if (to_rcv == rcvd) {
641 		siw_dbg_cep(cep, "%d bytes private data received\n", pd_len);
642 		return 0;
643 	}
644 	return -EAGAIN;
645 }
646 
647 /*
648  * siw_proc_mpareq()
649  *
650  * Read MPA Request from socket and signal new connection to IWCM
651  * if success. Caller must hold lock on corresponding listening CEP.
652  */
653 static int siw_proc_mpareq(struct siw_cep *cep)
654 {
655 	struct mpa_rr *req;
656 	int version, rv;
657 	u16 pd_len;
658 
659 	rv = siw_recv_mpa_rr(cep);
660 	if (rv)
661 		return rv;
662 
663 	req = &cep->mpa.hdr;
664 
665 	version = __mpa_rr_revision(req->params.bits);
666 	pd_len = be16_to_cpu(req->params.pd_len);
667 
668 	if (version > MPA_REVISION_2)
669 		/* allow for 0, 1, and 2 only */
670 		return -EPROTO;
671 
672 	if (memcmp(req->key, MPA_KEY_REQ, 16))
673 		return -EPROTO;
674 
675 	/* Prepare for sending MPA reply */
676 	memcpy(req->key, MPA_KEY_REP, 16);
677 
678 	if (version == MPA_REVISION_2 &&
679 	    (req->params.bits & MPA_RR_FLAG_ENHANCED)) {
680 		/*
681 		 * MPA version 2 must signal IRD/ORD values and P2P mode
682 		 * in private data if header flag MPA_RR_FLAG_ENHANCED
683 		 * is set.
684 		 */
685 		if (pd_len < sizeof(struct mpa_v2_data))
686 			goto reject_conn;
687 
688 		cep->enhanced_rdma_conn_est = true;
689 	}
690 
691 	/* MPA Markers: currently not supported. Marker TX to be added. */
692 	if (req->params.bits & MPA_RR_FLAG_MARKERS)
693 		goto reject_conn;
694 
695 	if (req->params.bits & MPA_RR_FLAG_CRC) {
696 		/*
697 		 * RFC 5044, page 27: CRC MUST be used if peer requests it.
698 		 * siw specific: 'mpa_crc_strict' parameter to reject
699 		 * connection with CRC if local CRC off enforced by
700 		 * 'mpa_crc_strict' module parameter.
701 		 */
702 		if (!mpa_crc_required && mpa_crc_strict)
703 			goto reject_conn;
704 
705 		/* Enable CRC if requested by module parameter */
706 		if (mpa_crc_required)
707 			req->params.bits |= MPA_RR_FLAG_CRC;
708 	}
709 	if (cep->enhanced_rdma_conn_est) {
710 		struct mpa_v2_data *v2 = (struct mpa_v2_data *)cep->mpa.pdata;
711 
712 		/*
713 		 * Peer requested ORD becomes requested local IRD,
714 		 * peer requested IRD becomes requested local ORD.
715 		 * IRD and ORD get limited by global maximum values.
716 		 */
717 		cep->ord = ntohs(v2->ird) & MPA_IRD_ORD_MASK;
718 		cep->ord = min(cep->ord, SIW_MAX_ORD_QP);
719 		cep->ird = ntohs(v2->ord) & MPA_IRD_ORD_MASK;
720 		cep->ird = min(cep->ird, SIW_MAX_IRD_QP);
721 
722 		/* May get overwritten by locally negotiated values */
723 		cep->mpa.v2_ctrl.ird = htons(cep->ird);
724 		cep->mpa.v2_ctrl.ord = htons(cep->ord);
725 
726 		/*
727 		 * Support for peer sent zero length Write or Read to
728 		 * let local side enter RTS. Writes are preferred.
729 		 * Sends would require pre-posting a Receive and are
730 		 * not supported.
731 		 * Propose zero length Write if none of Read and Write
732 		 * is indicated.
733 		 */
734 		if (v2->ird & MPA_V2_PEER_TO_PEER) {
735 			cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER;
736 
737 			if (v2->ord & MPA_V2_RDMA_WRITE_RTR)
738 				cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR;
739 			else if (v2->ord & MPA_V2_RDMA_READ_RTR)
740 				cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_READ_RTR;
741 			else
742 				cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR;
743 		}
744 	}
745 
746 	cep->state = SIW_EPSTATE_RECVD_MPAREQ;
747 
748 	/* Keep reference until IWCM accepts/rejects */
749 	siw_cep_get(cep);
750 	rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REQUEST, 0);
751 	if (rv)
752 		siw_cep_put(cep);
753 
754 	return rv;
755 
756 reject_conn:
757 	siw_dbg_cep(cep, "reject: crc %d:%d:%d, m %d:%d\n",
758 		    req->params.bits & MPA_RR_FLAG_CRC ? 1 : 0,
759 		    mpa_crc_required, mpa_crc_strict,
760 		    req->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0);
761 
762 	req->params.bits &= ~MPA_RR_FLAG_MARKERS;
763 	req->params.bits |= MPA_RR_FLAG_REJECT;
764 
765 	if (!mpa_crc_required && mpa_crc_strict)
766 		req->params.bits &= ~MPA_RR_FLAG_CRC;
767 
768 	if (pd_len)
769 		kfree(cep->mpa.pdata);
770 
771 	cep->mpa.pdata = NULL;
772 
773 	siw_send_mpareqrep(cep, NULL, 0);
774 
775 	return -EOPNOTSUPP;
776 }
777 
778 static int siw_proc_mpareply(struct siw_cep *cep)
779 {
780 	struct siw_qp_attrs qp_attrs;
781 	enum siw_qp_attr_mask qp_attr_mask;
782 	struct siw_qp *qp = cep->qp;
783 	struct mpa_rr *rep;
784 	int rv;
785 	u16 rep_ord;
786 	u16 rep_ird;
787 	bool ird_insufficient = false;
788 	enum mpa_v2_ctrl mpa_p2p_mode = MPA_V2_RDMA_NO_RTR;
789 
790 	rv = siw_recv_mpa_rr(cep);
791 	if (rv)
792 		goto out_err;
793 
794 	siw_cancel_mpatimer(cep);
795 
796 	rep = &cep->mpa.hdr;
797 
798 	if (__mpa_rr_revision(rep->params.bits) > MPA_REVISION_2) {
799 		/* allow for 0, 1,  and 2 only */
800 		rv = -EPROTO;
801 		goto out_err;
802 	}
803 	if (memcmp(rep->key, MPA_KEY_REP, 16)) {
804 		siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, LLP_ETYPE_MPA,
805 				   LLP_ECODE_INVALID_REQ_RESP, 0);
806 		siw_send_terminate(qp);
807 		rv = -EPROTO;
808 		goto out_err;
809 	}
810 	if (rep->params.bits & MPA_RR_FLAG_REJECT) {
811 		siw_dbg_cep(cep, "got mpa reject\n");
812 		siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET);
813 
814 		return -ECONNRESET;
815 	}
816 	if (try_gso && rep->params.bits & MPA_RR_FLAG_GSO_EXP) {
817 		siw_dbg_cep(cep, "peer allows GSO on TX\n");
818 		qp->tx_ctx.gso_seg_limit = 0;
819 	}
820 	if ((rep->params.bits & MPA_RR_FLAG_MARKERS) ||
821 	    (mpa_crc_required && !(rep->params.bits & MPA_RR_FLAG_CRC)) ||
822 	    (mpa_crc_strict && !mpa_crc_required &&
823 	     (rep->params.bits & MPA_RR_FLAG_CRC))) {
824 		siw_dbg_cep(cep, "reply unsupp: crc %d:%d:%d, m %d:%d\n",
825 			    rep->params.bits & MPA_RR_FLAG_CRC ? 1 : 0,
826 			    mpa_crc_required, mpa_crc_strict,
827 			    rep->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0);
828 
829 		siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNREFUSED);
830 
831 		return -EINVAL;
832 	}
833 	if (cep->enhanced_rdma_conn_est) {
834 		struct mpa_v2_data *v2;
835 
836 		if (__mpa_rr_revision(rep->params.bits) < MPA_REVISION_2 ||
837 		    !(rep->params.bits & MPA_RR_FLAG_ENHANCED)) {
838 			/*
839 			 * Protocol failure: The responder MUST reply with
840 			 * MPA version 2 and MUST set MPA_RR_FLAG_ENHANCED.
841 			 */
842 			siw_dbg_cep(cep, "mpa reply error: vers %d, enhcd %d\n",
843 				    __mpa_rr_revision(rep->params.bits),
844 				    rep->params.bits & MPA_RR_FLAG_ENHANCED ?
845 					    1 :
846 					    0);
847 
848 			siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
849 				      -ECONNRESET);
850 			return -EINVAL;
851 		}
852 		v2 = (struct mpa_v2_data *)cep->mpa.pdata;
853 		rep_ird = ntohs(v2->ird) & MPA_IRD_ORD_MASK;
854 		rep_ord = ntohs(v2->ord) & MPA_IRD_ORD_MASK;
855 
856 		if (cep->ird < rep_ord &&
857 		    (relaxed_ird_negotiation == false ||
858 		     rep_ord > cep->sdev->attrs.max_ird)) {
859 			siw_dbg_cep(cep, "ird %d, rep_ord %d, max_ord %d\n",
860 				    cep->ird, rep_ord,
861 				    cep->sdev->attrs.max_ord);
862 			ird_insufficient = true;
863 		}
864 		if (cep->ord > rep_ird && relaxed_ird_negotiation == false) {
865 			siw_dbg_cep(cep, "ord %d, rep_ird %d\n", cep->ord,
866 				    rep_ird);
867 			ird_insufficient = true;
868 		}
869 		/*
870 		 * Always report negotiated peer values to user,
871 		 * even if IRD/ORD negotiation failed
872 		 */
873 		cep->ird = rep_ord;
874 		cep->ord = rep_ird;
875 
876 		if (ird_insufficient) {
877 			/*
878 			 * If the initiator IRD is insuffient for the
879 			 * responder ORD, send a TERM.
880 			 */
881 			siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
882 					   LLP_ETYPE_MPA,
883 					   LLP_ECODE_INSUFFICIENT_IRD, 0);
884 			siw_send_terminate(qp);
885 			rv = -ENOMEM;
886 			goto out_err;
887 		}
888 		if (cep->mpa.v2_ctrl_req.ird & MPA_V2_PEER_TO_PEER)
889 			mpa_p2p_mode =
890 				cep->mpa.v2_ctrl_req.ord &
891 				(MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR);
892 
893 		/*
894 		 * Check if we requested P2P mode, and if peer agrees
895 		 */
896 		if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) {
897 			if ((mpa_p2p_mode & v2->ord) == 0) {
898 				/*
899 				 * We requested RTR mode(s), but the peer
900 				 * did not pick any mode we support.
901 				 */
902 				siw_dbg_cep(cep,
903 					    "rtr mode:  req %2x, got %2x\n",
904 					    mpa_p2p_mode,
905 					    v2->ord & (MPA_V2_RDMA_WRITE_RTR |
906 						       MPA_V2_RDMA_READ_RTR));
907 
908 				siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
909 						   LLP_ETYPE_MPA,
910 						   LLP_ECODE_NO_MATCHING_RTR,
911 						   0);
912 				siw_send_terminate(qp);
913 				rv = -EPROTO;
914 				goto out_err;
915 			}
916 			mpa_p2p_mode = v2->ord & (MPA_V2_RDMA_WRITE_RTR |
917 						  MPA_V2_RDMA_READ_RTR);
918 		}
919 	}
920 	memset(&qp_attrs, 0, sizeof(qp_attrs));
921 
922 	if (rep->params.bits & MPA_RR_FLAG_CRC)
923 		qp_attrs.flags = SIW_MPA_CRC;
924 
925 	qp_attrs.irq_size = cep->ird;
926 	qp_attrs.orq_size = cep->ord;
927 	qp_attrs.sk = cep->sock;
928 	qp_attrs.state = SIW_QP_STATE_RTS;
929 
930 	qp_attr_mask = SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE |
931 		       SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD | SIW_QP_ATTR_MPA;
932 
933 	/* Move socket RX/TX under QP control */
934 	down_write(&qp->state_lock);
935 	if (qp->attrs.state > SIW_QP_STATE_RTR) {
936 		rv = -EINVAL;
937 		up_write(&qp->state_lock);
938 		goto out_err;
939 	}
940 	rv = siw_qp_modify(qp, &qp_attrs, qp_attr_mask);
941 
942 	siw_qp_socket_assoc(cep, qp);
943 
944 	up_write(&qp->state_lock);
945 
946 	/* Send extra RDMA frame to trigger peer RTS if negotiated */
947 	if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) {
948 		rv = siw_qp_mpa_rts(qp, mpa_p2p_mode);
949 		if (rv)
950 			goto out_err;
951 	}
952 	if (!rv) {
953 		rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 0);
954 		if (!rv)
955 			cep->state = SIW_EPSTATE_RDMA_MODE;
956 
957 		return 0;
958 	}
959 
960 out_err:
961 	if (rv != -EAGAIN)
962 		siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL);
963 
964 	return rv;
965 }
966 
967 /*
968  * siw_accept_newconn - accept an incoming pending connection
969  *
970  */
971 static void siw_accept_newconn(struct siw_cep *cep)
972 {
973 	struct socket *s = cep->sock;
974 	struct socket *new_s = NULL;
975 	struct siw_cep *new_cep = NULL;
976 	int rv = 0; /* debug only. should disappear */
977 
978 	if (cep->state != SIW_EPSTATE_LISTENING)
979 		goto error;
980 
981 	new_cep = siw_cep_alloc(cep->sdev);
982 	if (!new_cep)
983 		goto error;
984 
985 	/*
986 	 * 4: Allocate a sufficient number of work elements
987 	 * to allow concurrent handling of local + peer close
988 	 * events, MPA header processing + MPA timeout.
989 	 */
990 	if (siw_cm_alloc_work(new_cep, 4) != 0)
991 		goto error;
992 
993 	/*
994 	 * Copy saved socket callbacks from listening CEP
995 	 * and assign new socket with new CEP
996 	 */
997 	new_cep->sk_state_change = cep->sk_state_change;
998 	new_cep->sk_data_ready = cep->sk_data_ready;
999 	new_cep->sk_write_space = cep->sk_write_space;
1000 	new_cep->sk_error_report = cep->sk_error_report;
1001 
1002 	rv = kernel_accept(s, &new_s, O_NONBLOCK);
1003 	if (rv != 0) {
1004 		/*
1005 		 * Connection already aborted by peer..?
1006 		 */
1007 		siw_dbg_cep(cep, "kernel_accept() error: %d\n", rv);
1008 		goto error;
1009 	}
1010 	new_cep->sock = new_s;
1011 	siw_cep_get(new_cep);
1012 	new_s->sk->sk_user_data = new_cep;
1013 
1014 	if (siw_tcp_nagle == false)
1015 		tcp_sock_set_nodelay(new_s->sk);
1016 	new_cep->state = SIW_EPSTATE_AWAIT_MPAREQ;
1017 
1018 	rv = siw_cm_queue_work(new_cep, SIW_CM_WORK_MPATIMEOUT);
1019 	if (rv)
1020 		goto error;
1021 	/*
1022 	 * See siw_proc_mpareq() etc. for the use of new_cep->listen_cep.
1023 	 */
1024 	new_cep->listen_cep = cep;
1025 	siw_cep_get(cep);
1026 
1027 	if (atomic_read(&new_s->sk->sk_rmem_alloc)) {
1028 		/*
1029 		 * MPA REQ already queued
1030 		 */
1031 		siw_dbg_cep(cep, "immediate mpa request\n");
1032 
1033 		siw_cep_set_inuse(new_cep);
1034 		rv = siw_proc_mpareq(new_cep);
1035 		if (rv != -EAGAIN) {
1036 			siw_cep_put(cep);
1037 			new_cep->listen_cep = NULL;
1038 			if (rv) {
1039 				siw_cancel_mpatimer(new_cep);
1040 				siw_cep_set_free(new_cep);
1041 				goto error;
1042 			}
1043 		}
1044 		siw_cep_set_free(new_cep);
1045 	}
1046 	return;
1047 
1048 error:
1049 	if (new_cep)
1050 		siw_cep_put(new_cep);
1051 
1052 	if (new_s) {
1053 		siw_socket_disassoc(new_s);
1054 		sock_release(new_s);
1055 	}
1056 	siw_dbg_cep(cep, "error %d\n", rv);
1057 }
1058 
1059 static void siw_cm_work_handler(struct work_struct *w)
1060 {
1061 	struct siw_cm_work *work;
1062 	struct siw_cep *cep;
1063 	int release_cep = 0, rv = 0;
1064 
1065 	work = container_of(w, struct siw_cm_work, work.work);
1066 	cep = work->cep;
1067 
1068 	siw_dbg_cep(cep, "[QP %u]: work type: %d, state %d\n",
1069 		    cep->qp ? qp_id(cep->qp) : UINT_MAX,
1070 		    work->type, cep->state);
1071 
1072 	siw_cep_set_inuse(cep);
1073 
1074 	switch (work->type) {
1075 	case SIW_CM_WORK_ACCEPT:
1076 		siw_accept_newconn(cep);
1077 		break;
1078 
1079 	case SIW_CM_WORK_READ_MPAHDR:
1080 		if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) {
1081 			if (cep->listen_cep) {
1082 				siw_cep_set_inuse(cep->listen_cep);
1083 
1084 				if (cep->listen_cep->state ==
1085 				    SIW_EPSTATE_LISTENING)
1086 					rv = siw_proc_mpareq(cep);
1087 				else
1088 					rv = -EFAULT;
1089 
1090 				siw_cep_set_free(cep->listen_cep);
1091 
1092 				if (rv != -EAGAIN) {
1093 					siw_cep_put(cep->listen_cep);
1094 					cep->listen_cep = NULL;
1095 					if (rv)
1096 						siw_cep_put(cep);
1097 				}
1098 			}
1099 		} else if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) {
1100 			rv = siw_proc_mpareply(cep);
1101 		} else {
1102 			/*
1103 			 * CEP already moved out of MPA handshake.
1104 			 * any connection management already done.
1105 			 * silently ignore the mpa packet.
1106 			 */
1107 			if (cep->state == SIW_EPSTATE_RDMA_MODE) {
1108 				cep->sock->sk->sk_data_ready(cep->sock->sk);
1109 				siw_dbg_cep(cep, "already in RDMA mode");
1110 			} else {
1111 				siw_dbg_cep(cep, "out of state: %d\n",
1112 					    cep->state);
1113 			}
1114 		}
1115 		if (rv && rv != -EAGAIN)
1116 			release_cep = 1;
1117 		break;
1118 
1119 	case SIW_CM_WORK_CLOSE_LLP:
1120 		/*
1121 		 * QP scheduled LLP close
1122 		 */
1123 		if (cep->qp)
1124 			siw_send_terminate(cep->qp);
1125 
1126 		if (cep->cm_id)
1127 			siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
1128 
1129 		release_cep = 1;
1130 		break;
1131 
1132 	case SIW_CM_WORK_PEER_CLOSE:
1133 		if (cep->cm_id) {
1134 			if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) {
1135 				/*
1136 				 * MPA reply not received, but connection drop
1137 				 */
1138 				siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
1139 					      -ECONNRESET);
1140 			} else if (cep->state == SIW_EPSTATE_RDMA_MODE) {
1141 				/*
1142 				 * NOTE: IW_CM_EVENT_DISCONNECT is given just
1143 				 *       to transition IWCM into CLOSING.
1144 				 */
1145 				siw_cm_upcall(cep, IW_CM_EVENT_DISCONNECT, 0);
1146 				siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
1147 			}
1148 			/*
1149 			 * for other states there is no connection
1150 			 * known to the IWCM.
1151 			 */
1152 		} else {
1153 			if (cep->state == SIW_EPSTATE_RECVD_MPAREQ) {
1154 				/*
1155 				 * Wait for the ulp/CM to call accept/reject
1156 				 */
1157 				siw_dbg_cep(cep,
1158 					    "mpa req recvd, wait for ULP\n");
1159 			} else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) {
1160 				/*
1161 				 * Socket close before MPA request received.
1162 				 */
1163 				if (cep->listen_cep) {
1164 					siw_dbg_cep(cep,
1165 						"no mpareq: drop listener\n");
1166 					siw_cep_put(cep->listen_cep);
1167 					cep->listen_cep = NULL;
1168 				}
1169 			}
1170 		}
1171 		release_cep = 1;
1172 		break;
1173 
1174 	case SIW_CM_WORK_MPATIMEOUT:
1175 		cep->mpa_timer = NULL;
1176 
1177 		if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) {
1178 			/*
1179 			 * MPA request timed out:
1180 			 * Hide any partially received private data and signal
1181 			 * timeout
1182 			 */
1183 			cep->mpa.hdr.params.pd_len = 0;
1184 
1185 			if (cep->cm_id)
1186 				siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
1187 					      -ETIMEDOUT);
1188 			release_cep = 1;
1189 
1190 		} else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) {
1191 			/*
1192 			 * No MPA request received after peer TCP stream setup.
1193 			 */
1194 			if (cep->listen_cep) {
1195 				siw_cep_put(cep->listen_cep);
1196 				cep->listen_cep = NULL;
1197 			}
1198 			release_cep = 1;
1199 		}
1200 		break;
1201 
1202 	default:
1203 		WARN(1, "Undefined CM work type: %d\n", work->type);
1204 	}
1205 	if (release_cep) {
1206 		struct socket *s = cep->sock;
1207 
1208 		siw_dbg_cep(cep,
1209 			    "release: timer=%s, QP[%u]\n",
1210 			    cep->mpa_timer ? "y" : "n",
1211 			    cep->qp ? qp_id(cep->qp) : UINT_MAX);
1212 
1213 		siw_cancel_mpatimer(cep);
1214 
1215 		cep->state = SIW_EPSTATE_CLOSED;
1216 
1217 		if (cep->qp) {
1218 			struct siw_qp *qp = cep->qp;
1219 			/*
1220 			 * Serialize a potential race with application
1221 			 * closing the QP and calling siw_qp_cm_drop()
1222 			 */
1223 			siw_qp_get(qp);
1224 			siw_cep_set_free(cep);
1225 
1226 			siw_qp_llp_close(qp);
1227 			siw_qp_put(qp);
1228 
1229 			siw_cep_set_inuse(cep);
1230 			cep->qp = NULL;
1231 			siw_qp_put(qp);
1232 		}
1233 		if (s) {
1234 			siw_socket_disassoc(s);
1235 			sock_release(s);
1236 		}
1237 		if (cep->cm_id) {
1238 			siw_free_cm_id(cep);
1239 			siw_cep_put(cep);
1240 		}
1241 	}
1242 	siw_cep_set_free(cep);
1243 	siw_put_work(work);
1244 	siw_cep_put(cep);
1245 }
1246 
/*
 * Workqueue on which siw_cm_queue_work() queues all CM work items.
 * Created single-threaded by siw_cm_init(), destroyed by siw_cm_exit().
 */
static struct workqueue_struct *siw_cm_wq;
1248 
1249 int siw_cm_queue_work(struct siw_cep *cep, enum siw_work_type type)
1250 {
1251 	struct siw_cm_work *work = siw_get_work(cep);
1252 	unsigned long delay = 0;
1253 
1254 	if (!work) {
1255 		siw_dbg_cep(cep, "failed with no work available\n");
1256 		return -ENOMEM;
1257 	}
1258 	work->type = type;
1259 	work->cep = cep;
1260 
1261 	siw_cep_get(cep);
1262 
1263 	INIT_DELAYED_WORK(&work->work, siw_cm_work_handler);
1264 
1265 	if (type == SIW_CM_WORK_MPATIMEOUT) {
1266 		cep->mpa_timer = work;
1267 
1268 		if (cep->state == SIW_EPSTATE_AWAIT_MPAREP)
1269 			delay = MPAREQ_TIMEOUT;
1270 		else
1271 			delay = MPAREP_TIMEOUT;
1272 	}
1273 	siw_dbg_cep(cep, "[QP %u]: work type: %d, timeout %lu\n",
1274 		    cep->qp ? qp_id(cep->qp) : -1, type, delay);
1275 
1276 	queue_delayed_work(siw_cm_wq, &work->work, delay);
1277 
1278 	return 0;
1279 }
1280 
1281 static void siw_cm_llp_data_ready(struct sock *sk)
1282 {
1283 	struct siw_cep *cep;
1284 
1285 	trace_sk_data_ready(sk);
1286 
1287 	read_lock(&sk->sk_callback_lock);
1288 
1289 	cep = sk_to_cep(sk);
1290 	if (!cep)
1291 		goto out;
1292 
1293 	siw_dbg_cep(cep, "cep state: %d, socket state %d\n",
1294 		    cep->state, sk->sk_state);
1295 
1296 	if (sk->sk_state != TCP_ESTABLISHED)
1297 		goto out;
1298 
1299 	switch (cep->state) {
1300 	case SIW_EPSTATE_RDMA_MODE:
1301 	case SIW_EPSTATE_LISTENING:
1302 		break;
1303 
1304 	case SIW_EPSTATE_AWAIT_MPAREQ:
1305 	case SIW_EPSTATE_AWAIT_MPAREP:
1306 		siw_cm_queue_work(cep, SIW_CM_WORK_READ_MPAHDR);
1307 		break;
1308 
1309 	default:
1310 		siw_dbg_cep(cep, "unexpected data, state %d\n", cep->state);
1311 		break;
1312 	}
1313 out:
1314 	read_unlock(&sk->sk_callback_lock);
1315 }
1316 
1317 static void siw_cm_llp_write_space(struct sock *sk)
1318 {
1319 	struct siw_cep *cep = sk_to_cep(sk);
1320 
1321 	if (cep)
1322 		siw_dbg_cep(cep, "state: %d\n", cep->state);
1323 }
1324 
1325 static void siw_cm_llp_error_report(struct sock *sk)
1326 {
1327 	struct siw_cep *cep = sk_to_cep(sk);
1328 
1329 	if (cep) {
1330 		siw_dbg_cep(cep, "error %d, socket state: %d, cep state: %d\n",
1331 			    sk->sk_err, sk->sk_state, cep->state);
1332 		cep->sk_error_report(sk);
1333 	}
1334 }
1335 
1336 static void siw_cm_llp_state_change(struct sock *sk)
1337 {
1338 	struct siw_cep *cep;
1339 	void (*orig_state_change)(struct sock *s);
1340 
1341 	read_lock(&sk->sk_callback_lock);
1342 
1343 	cep = sk_to_cep(sk);
1344 	if (!cep) {
1345 		/* endpoint already disassociated */
1346 		read_unlock(&sk->sk_callback_lock);
1347 		return;
1348 	}
1349 	orig_state_change = cep->sk_state_change;
1350 
1351 	siw_dbg_cep(cep, "state: %d\n", cep->state);
1352 
1353 	switch (sk->sk_state) {
1354 	case TCP_ESTABLISHED:
1355 		/*
1356 		 * handle accepting socket as special case where only
1357 		 * new connection is possible
1358 		 */
1359 		siw_cm_queue_work(cep, SIW_CM_WORK_ACCEPT);
1360 		break;
1361 
1362 	case TCP_CLOSE:
1363 	case TCP_CLOSE_WAIT:
1364 		if (cep->qp)
1365 			cep->qp->tx_ctx.tx_suspend = 1;
1366 		siw_cm_queue_work(cep, SIW_CM_WORK_PEER_CLOSE);
1367 		break;
1368 
1369 	default:
1370 		siw_dbg_cep(cep, "unexpected socket state %d\n", sk->sk_state);
1371 	}
1372 	read_unlock(&sk->sk_callback_lock);
1373 	orig_state_change(sk);
1374 }
1375 
1376 static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr,
1377 			      struct sockaddr *raddr, bool afonly)
1378 {
1379 	int rv, flags = 0;
1380 	size_t size = laddr->sa_family == AF_INET ?
1381 		sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
1382 
1383 	/*
1384 	 * Make address available again asap.
1385 	 */
1386 	sock_set_reuseaddr(s->sk);
1387 
1388 	if (afonly) {
1389 		rv = ip6_sock_set_v6only(s->sk);
1390 		if (rv)
1391 			return rv;
1392 	}
1393 
1394 	rv = s->ops->bind(s, (struct sockaddr_unsized *)laddr, size);
1395 	if (rv < 0)
1396 		return rv;
1397 
1398 	rv = s->ops->connect(s, (struct sockaddr_unsized *)raddr, size, flags);
1399 
1400 	return rv < 0 ? rv : 0;
1401 }
1402 
1403 int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params)
1404 {
1405 	struct siw_device *sdev = to_siw_dev(id->device);
1406 	struct siw_qp *qp;
1407 	struct siw_cep *cep = NULL;
1408 	struct socket *s = NULL;
1409 	struct sockaddr *laddr = (struct sockaddr *)&id->local_addr,
1410 			*raddr = (struct sockaddr *)&id->remote_addr;
1411 	bool p2p_mode = peer_to_peer, v4 = true;
1412 	u16 pd_len = params->private_data_len;
1413 	int version = mpa_version, rv;
1414 
1415 	if (pd_len > MPA_MAX_PRIVDATA)
1416 		return -EINVAL;
1417 
1418 	if (params->ird > sdev->attrs.max_ird ||
1419 	    params->ord > sdev->attrs.max_ord)
1420 		return -ENOMEM;
1421 
1422 	if (laddr->sa_family == AF_INET6)
1423 		v4 = false;
1424 	else if (laddr->sa_family != AF_INET)
1425 		return -EAFNOSUPPORT;
1426 
1427 	/*
1428 	 * Respect any iwarp port mapping: Use mapped remote address
1429 	 * if valid. Local address must not be mapped, since siw
1430 	 * uses kernel TCP stack.
1431 	 */
1432 	if ((v4 && to_sockaddr_in(id->remote_addr).sin_port != 0) ||
1433 	     to_sockaddr_in6(id->remote_addr).sin6_port != 0)
1434 		raddr = (struct sockaddr *)&id->m_remote_addr;
1435 
1436 	qp = siw_qp_id2obj(sdev, params->qpn);
1437 	if (!qp) {
1438 		WARN(1, "[QP %u] does not exist\n", params->qpn);
1439 		rv = -EINVAL;
1440 		goto error;
1441 	}
1442 	siw_dbg_qp(qp, "pd_len %d, laddr %pISp, raddr %pISp\n", pd_len, laddr,
1443 		   raddr);
1444 
1445 	rv = sock_create(v4 ? AF_INET : AF_INET6, SOCK_STREAM, IPPROTO_TCP, &s);
1446 	if (rv < 0)
1447 		goto error;
1448 	siw_reclassify_socket(s);
1449 
1450 	/*
1451 	 * NOTE: For simplification, connect() is called in blocking
1452 	 * mode. Might be reconsidered for async connection setup at
1453 	 * TCP level.
1454 	 */
1455 	rv = kernel_bindconnect(s, laddr, raddr, id->afonly);
1456 	if (rv != 0) {
1457 		siw_dbg_qp(qp, "kernel_bindconnect: error %d\n", rv);
1458 		goto error;
1459 	}
1460 	if (siw_tcp_nagle == false)
1461 		tcp_sock_set_nodelay(s->sk);
1462 	cep = siw_cep_alloc(sdev);
1463 	if (!cep) {
1464 		rv = -ENOMEM;
1465 		goto error;
1466 	}
1467 	siw_cep_set_inuse(cep);
1468 
1469 	/* Associate QP with CEP */
1470 	siw_cep_get(cep);
1471 	qp->cep = cep;
1472 
1473 	/* siw_qp_get(qp) already done by QP lookup */
1474 	cep->qp = qp;
1475 
1476 	id->add_ref(id);
1477 	cep->cm_id = id;
1478 
1479 	/*
1480 	 * 4: Allocate a sufficient number of work elements
1481 	 * to allow concurrent handling of local + peer close
1482 	 * events, MPA header processing + MPA timeout.
1483 	 */
1484 	rv = siw_cm_alloc_work(cep, 4);
1485 	if (rv != 0) {
1486 		rv = -ENOMEM;
1487 		goto error;
1488 	}
1489 	cep->ird = params->ird;
1490 	cep->ord = params->ord;
1491 
1492 	if (p2p_mode && cep->ord == 0)
1493 		cep->ord = 1;
1494 
1495 	cep->state = SIW_EPSTATE_CONNECTING;
1496 
1497 	/*
1498 	 * Associate CEP with socket
1499 	 */
1500 	siw_cep_socket_assoc(cep, s);
1501 
1502 	cep->state = SIW_EPSTATE_AWAIT_MPAREP;
1503 
1504 	/*
1505 	 * Set MPA Request bits: CRC if required, no MPA Markers,
1506 	 * MPA Rev. according to module parameter 'mpa_version', Key 'Request'.
1507 	 */
1508 	cep->mpa.hdr.params.bits = 0;
1509 	if (version > MPA_REVISION_2) {
1510 		pr_warn("Setting MPA version to %u\n", MPA_REVISION_2);
1511 		version = MPA_REVISION_2;
1512 		/* Adjust also module parameter */
1513 		mpa_version = MPA_REVISION_2;
1514 	}
1515 	__mpa_rr_set_revision(&cep->mpa.hdr.params.bits, version);
1516 
1517 	if (try_gso)
1518 		cep->mpa.hdr.params.bits |= MPA_RR_FLAG_GSO_EXP;
1519 
1520 	if (mpa_crc_required)
1521 		cep->mpa.hdr.params.bits |= MPA_RR_FLAG_CRC;
1522 
1523 	/*
1524 	 * If MPA version == 2:
1525 	 * o Include ORD and IRD.
1526 	 * o Indicate peer-to-peer mode, if required by module
1527 	 *   parameter 'peer_to_peer'.
1528 	 */
1529 	if (version == MPA_REVISION_2) {
1530 		cep->enhanced_rdma_conn_est = true;
1531 		cep->mpa.hdr.params.bits |= MPA_RR_FLAG_ENHANCED;
1532 
1533 		cep->mpa.v2_ctrl.ird = htons(cep->ird);
1534 		cep->mpa.v2_ctrl.ord = htons(cep->ord);
1535 
1536 		if (p2p_mode) {
1537 			cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER;
1538 			cep->mpa.v2_ctrl.ord |= rtr_type;
1539 		}
1540 		/* Remember own P2P mode requested */
1541 		cep->mpa.v2_ctrl_req.ird = cep->mpa.v2_ctrl.ird;
1542 		cep->mpa.v2_ctrl_req.ord = cep->mpa.v2_ctrl.ord;
1543 	}
1544 	memcpy(cep->mpa.hdr.key, MPA_KEY_REQ, 16);
1545 
1546 	rv = siw_send_mpareqrep(cep, params->private_data, pd_len);
1547 	/*
1548 	 * Reset private data.
1549 	 */
1550 	cep->mpa.hdr.params.pd_len = 0;
1551 
1552 	if (rv >= 0) {
1553 		rv = siw_cm_queue_work(cep, SIW_CM_WORK_MPATIMEOUT);
1554 		if (!rv) {
1555 			siw_dbg_cep(cep, "[QP %u]: exit\n", qp_id(qp));
1556 			siw_cep_set_free(cep);
1557 			return 0;
1558 		}
1559 	}
1560 error:
1561 	siw_dbg(id->device, "failed: %d\n", rv);
1562 
1563 	if (cep) {
1564 		siw_socket_disassoc(s);
1565 		sock_release(s);
1566 
1567 		cep->qp = NULL;
1568 
1569 		cep->cm_id = NULL;
1570 		id->rem_ref(id);
1571 
1572 		qp->cep = NULL;
1573 		siw_cep_put(cep);
1574 
1575 		cep->state = SIW_EPSTATE_CLOSED;
1576 
1577 		siw_cep_set_free_and_put(cep);
1578 
1579 	} else if (s) {
1580 		sock_release(s);
1581 	}
1582 	if (qp)
1583 		siw_qp_put(qp);
1584 
1585 	return rv;
1586 }
1587 
1588 /*
1589  * siw_accept - Let SoftiWARP accept an RDMA connection request
1590  *
1591  * @id:		New connection management id to be used for accepted
1592  *		connection request
1593  * @params:	Connection parameters provided by ULP for accepting connection
1594  *
1595  * Transition QP to RTS state, associate new CM id @id with accepted CEP
1596  * and get prepared for TCP input by installing socket callbacks.
1597  * Then send MPA Reply and generate the "connection established" event.
1598  * Socket callbacks must be installed before sending MPA Reply, because
1599  * the latter may cause a first RDMA message to arrive from the RDMA Initiator
1600  * side very quickly, at which time the socket callbacks must be ready.
1601  */
1602 int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params)
1603 {
1604 	struct siw_device *sdev = to_siw_dev(id->device);
1605 	struct siw_cep *cep = (struct siw_cep *)id->provider_data;
1606 	struct siw_qp *qp;
1607 	struct siw_qp_attrs qp_attrs;
1608 	int rv = -EINVAL, max_priv_data = MPA_MAX_PRIVDATA;
1609 	bool wait_for_peer_rts = false;
1610 
1611 	siw_cep_set_inuse(cep);
1612 	siw_cep_put(cep);
1613 
1614 	/* Free lingering inbound private data */
1615 	if (cep->mpa.hdr.params.pd_len) {
1616 		cep->mpa.hdr.params.pd_len = 0;
1617 		kfree(cep->mpa.pdata);
1618 		cep->mpa.pdata = NULL;
1619 	}
1620 	siw_cancel_mpatimer(cep);
1621 
1622 	if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) {
1623 		siw_dbg_cep(cep, "out of state\n");
1624 		rv = -ECONNRESET;
1625 		goto free_cep;
1626 	}
1627 	qp = siw_qp_id2obj(sdev, params->qpn);
1628 	if (!qp) {
1629 		WARN(1, "[QP %d] does not exist\n", params->qpn);
1630 		goto free_cep;
1631 	}
1632 	down_write(&qp->state_lock);
1633 	if (qp->attrs.state > SIW_QP_STATE_RTR)
1634 		goto error_unlock;
1635 	siw_dbg_cep(cep, "[QP %d]\n", params->qpn);
1636 
1637 	if (try_gso && cep->mpa.hdr.params.bits & MPA_RR_FLAG_GSO_EXP) {
1638 		siw_dbg_cep(cep, "peer allows GSO on TX\n");
1639 		qp->tx_ctx.gso_seg_limit = 0;
1640 	}
1641 	if (params->ord > sdev->attrs.max_ord ||
1642 	    params->ird > sdev->attrs.max_ird) {
1643 		siw_dbg_cep(
1644 			cep,
1645 			"[QP %u]: ord %d (max %d), ird %d (max %d)\n",
1646 			qp_id(qp), params->ord, sdev->attrs.max_ord,
1647 			params->ird, sdev->attrs.max_ird);
1648 		goto error_unlock;
1649 	}
1650 	if (cep->enhanced_rdma_conn_est)
1651 		max_priv_data -= sizeof(struct mpa_v2_data);
1652 
1653 	if (params->private_data_len > max_priv_data) {
1654 		siw_dbg_cep(
1655 			cep,
1656 			"[QP %u]: private data length: %d (max %d)\n",
1657 			qp_id(qp), params->private_data_len, max_priv_data);
1658 		goto error_unlock;
1659 	}
1660 	if (cep->enhanced_rdma_conn_est) {
1661 		if (params->ord > cep->ord) {
1662 			if (relaxed_ird_negotiation) {
1663 				params->ord = cep->ord;
1664 			} else {
1665 				cep->ird = params->ird;
1666 				cep->ord = params->ord;
1667 				goto error_unlock;
1668 			}
1669 		}
1670 		if (params->ird < cep->ird) {
1671 			if (relaxed_ird_negotiation &&
1672 			    cep->ird <= sdev->attrs.max_ird)
1673 				params->ird = cep->ird;
1674 			else {
1675 				rv = -ENOMEM;
1676 				goto error_unlock;
1677 			}
1678 		}
1679 		if (cep->mpa.v2_ctrl.ord &
1680 		    (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR))
1681 			wait_for_peer_rts = true;
1682 		/*
1683 		 * Signal back negotiated IRD and ORD values
1684 		 */
1685 		cep->mpa.v2_ctrl.ord =
1686 			htons(params->ord & MPA_IRD_ORD_MASK) |
1687 			(cep->mpa.v2_ctrl.ord & ~MPA_V2_MASK_IRD_ORD);
1688 		cep->mpa.v2_ctrl.ird =
1689 			htons(params->ird & MPA_IRD_ORD_MASK) |
1690 			(cep->mpa.v2_ctrl.ird & ~MPA_V2_MASK_IRD_ORD);
1691 	}
1692 	cep->ird = params->ird;
1693 	cep->ord = params->ord;
1694 
1695 	cep->cm_id = id;
1696 	id->add_ref(id);
1697 
1698 	memset(&qp_attrs, 0, sizeof(qp_attrs));
1699 	qp_attrs.orq_size = cep->ord;
1700 	qp_attrs.irq_size = cep->ird;
1701 	qp_attrs.sk = cep->sock;
1702 	if (cep->mpa.hdr.params.bits & MPA_RR_FLAG_CRC)
1703 		qp_attrs.flags = SIW_MPA_CRC;
1704 	qp_attrs.state = SIW_QP_STATE_RTS;
1705 
1706 	siw_dbg_cep(cep, "[QP%u]: moving to rts\n", qp_id(qp));
1707 
1708 	/* Associate QP with CEP */
1709 	siw_cep_get(cep);
1710 	qp->cep = cep;
1711 
1712 	/* siw_qp_get(qp) already done by QP lookup */
1713 	cep->qp = qp;
1714 
1715 	cep->state = SIW_EPSTATE_RDMA_MODE;
1716 
1717 	/* Move socket RX/TX under QP control */
1718 	rv = siw_qp_modify(qp, &qp_attrs,
1719 			   SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE |
1720 				   SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD |
1721 				   SIW_QP_ATTR_MPA);
1722 	up_write(&qp->state_lock);
1723 	if (rv)
1724 		goto error;
1725 
1726 	siw_dbg_cep(cep, "[QP %u]: send mpa reply, %d byte pdata\n",
1727 		    qp_id(qp), params->private_data_len);
1728 
1729 	rv = siw_send_mpareqrep(cep, params->private_data,
1730 				params->private_data_len);
1731 	if (rv != 0)
1732 		goto error;
1733 
1734 	if (wait_for_peer_rts) {
1735 		siw_sk_assign_rtr_upcalls(cep);
1736 	} else {
1737 		siw_qp_socket_assoc(cep, qp);
1738 		rv = siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0);
1739 		if (rv)
1740 			goto error;
1741 	}
1742 	siw_cep_set_free(cep);
1743 
1744 	return 0;
1745 
1746 error_unlock:
1747 	up_write(&qp->state_lock);
1748 error:
1749 	siw_destroy_cep_sock(cep);
1750 
1751 	cep->state = SIW_EPSTATE_CLOSED;
1752 
1753 	siw_free_cm_id(cep);
1754 	if (qp->cep) {
1755 		siw_cep_put(cep);
1756 		qp->cep = NULL;
1757 	}
1758 	cep->qp = NULL;
1759 	siw_qp_put(qp);
1760 free_cep:
1761 	siw_cep_set_free_and_put(cep);
1762 	return rv;
1763 }
1764 
1765 /*
1766  * siw_reject()
1767  *
1768  * Local connection reject case. Send private data back to peer,
1769  * close connection and dereference connection id.
1770  */
1771 int siw_reject(struct iw_cm_id *id, const void *pdata, u8 pd_len)
1772 {
1773 	struct siw_cep *cep = (struct siw_cep *)id->provider_data;
1774 
1775 	siw_cep_set_inuse(cep);
1776 	siw_cep_put(cep);
1777 
1778 	siw_cancel_mpatimer(cep);
1779 
1780 	if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) {
1781 		siw_dbg_cep(cep, "out of state\n");
1782 
1783 		siw_cep_set_free_and_put(cep); /* put last reference */
1784 
1785 		return -ECONNRESET;
1786 	}
1787 	siw_dbg_cep(cep, "cep->state %d, pd_len %d\n", cep->state,
1788 		    pd_len);
1789 
1790 	if (__mpa_rr_revision(cep->mpa.hdr.params.bits) >= MPA_REVISION_1) {
1791 		cep->mpa.hdr.params.bits |= MPA_RR_FLAG_REJECT; /* reject */
1792 		siw_send_mpareqrep(cep, pdata, pd_len);
1793 	}
1794 	siw_destroy_cep_sock(cep);
1795 
1796 	cep->state = SIW_EPSTATE_CLOSED;
1797 
1798 	siw_cep_set_free_and_put(cep);
1799 
1800 	return 0;
1801 }
1802 
1803 /*
1804  * siw_create_listen - Create resources for a listener's IWCM ID @id
1805  *
1806  * Starts listen on the socket address id->local_addr.
1807  *
1808  */
1809 int siw_create_listen(struct iw_cm_id *id, int backlog)
1810 {
1811 	struct socket *s;
1812 	struct siw_cep *cep = NULL;
1813 	struct net_device *ndev = NULL;
1814 	struct siw_device *sdev = to_siw_dev(id->device);
1815 	int addr_family = id->local_addr.ss_family;
1816 	int rv = 0;
1817 
1818 	if (addr_family != AF_INET && addr_family != AF_INET6)
1819 		return -EAFNOSUPPORT;
1820 
1821 	rv = sock_create(addr_family, SOCK_STREAM, IPPROTO_TCP, &s);
1822 	if (rv < 0)
1823 		return rv;
1824 	siw_reclassify_socket(s);
1825 
1826 	/*
1827 	 * Allow binding local port when still in TIME_WAIT from last close.
1828 	 */
1829 	sock_set_reuseaddr(s->sk);
1830 
1831 	if (addr_family == AF_INET) {
1832 		struct sockaddr_in *laddr = &to_sockaddr_in(id->local_addr);
1833 
1834 		/* For wildcard addr, limit binding to current device only */
1835 		if (ipv4_is_zeronet(laddr->sin_addr.s_addr)) {
1836 			ndev = ib_device_get_netdev(id->device, SIW_PORT);
1837 			if (ndev) {
1838 				s->sk->sk_bound_dev_if = ndev->ifindex;
1839 			} else {
1840 				rv = -ENODEV;
1841 				goto error;
1842 			}
1843 		}
1844 		rv = s->ops->bind(s, (struct sockaddr_unsized *)laddr,
1845 				  sizeof(struct sockaddr_in));
1846 	} else {
1847 		struct sockaddr_in6 *laddr = &to_sockaddr_in6(id->local_addr);
1848 
1849 		if (id->afonly) {
1850 			rv = ip6_sock_set_v6only(s->sk);
1851 			if (rv) {
1852 				siw_dbg(id->device,
1853 					"ip6_sock_set_v6only erro: %d\n", rv);
1854 				goto error;
1855 			}
1856 		}
1857 
1858 		/* For wildcard addr, limit binding to current device only */
1859 		if (ipv6_addr_any(&laddr->sin6_addr)) {
1860 			ndev = ib_device_get_netdev(id->device, SIW_PORT);
1861 			if (ndev) {
1862 				s->sk->sk_bound_dev_if = ndev->ifindex;
1863 			} else {
1864 				rv = -ENODEV;
1865 				goto error;
1866 			}
1867 		}
1868 		rv = s->ops->bind(s, (struct sockaddr_unsized *)laddr,
1869 				  sizeof(struct sockaddr_in6));
1870 	}
1871 	if (rv) {
1872 		siw_dbg(id->device, "socket bind error: %d\n", rv);
1873 		goto error;
1874 	}
1875 	cep = siw_cep_alloc(sdev);
1876 	if (!cep) {
1877 		rv = -ENOMEM;
1878 		goto error;
1879 	}
1880 	siw_cep_socket_assoc(cep, s);
1881 
1882 	rv = siw_cm_alloc_work(cep, backlog);
1883 	if (rv) {
1884 		siw_dbg(id->device,
1885 			"alloc_work error %d, backlog %d\n",
1886 			rv, backlog);
1887 		goto error;
1888 	}
1889 	rv = s->ops->listen(s, backlog);
1890 	if (rv) {
1891 		siw_dbg(id->device, "listen error %d\n", rv);
1892 		goto error;
1893 	}
1894 	cep->cm_id = id;
1895 	id->add_ref(id);
1896 
1897 	/*
1898 	 * In case of a wildcard rdma_listen on a multi-homed device,
1899 	 * a listener's IWCM id is associated with more than one listening CEP.
1900 	 *
1901 	 * We currently use id->provider_data in three different ways:
1902 	 *
1903 	 * o For a listener's IWCM id, id->provider_data points to
1904 	 *   the list_head of the list of listening CEPs.
1905 	 *   Uses: siw_create_listen(), siw_destroy_listen()
1906 	 *
1907 	 * o For each accepted passive-side IWCM id, id->provider_data
1908 	 *   points to the CEP itself. This is a consequence of
1909 	 *   - siw_cm_upcall() setting event.provider_data = cep and
1910 	 *   - the IWCM's cm_conn_req_handler() setting provider_data of the
1911 	 *     new passive-side IWCM id equal to event.provider_data
1912 	 *   Uses: siw_accept(), siw_reject()
1913 	 *
1914 	 * o For an active-side IWCM id, id->provider_data is not used at all.
1915 	 *
1916 	 */
1917 	if (!id->provider_data) {
1918 		id->provider_data =
1919 			kmalloc_obj(struct list_head);
1920 		if (!id->provider_data) {
1921 			rv = -ENOMEM;
1922 			goto error;
1923 		}
1924 		INIT_LIST_HEAD((struct list_head *)id->provider_data);
1925 	}
1926 	list_add_tail(&cep->listenq, (struct list_head *)id->provider_data);
1927 	cep->state = SIW_EPSTATE_LISTENING;
1928 	dev_put(ndev);
1929 
1930 	siw_dbg(id->device, "Listen at laddr %pISp\n", &id->local_addr);
1931 
1932 	return 0;
1933 
1934 error:
1935 	siw_dbg(id->device, "failed: %d\n", rv);
1936 
1937 	if (cep) {
1938 		siw_cep_set_inuse(cep);
1939 
1940 		siw_free_cm_id(cep);
1941 		siw_socket_disassoc(s);
1942 		cep->state = SIW_EPSTATE_CLOSED;
1943 
1944 		siw_cep_set_free_and_put(cep);
1945 	}
1946 	sock_release(s);
1947 	dev_put(ndev);
1948 
1949 	return rv;
1950 }
1951 
1952 static void siw_drop_listeners(struct iw_cm_id *id)
1953 {
1954 	struct list_head *p, *tmp;
1955 
1956 	/*
1957 	 * In case of a wildcard rdma_listen on a multi-homed device,
1958 	 * a listener's IWCM id is associated with more than one listening CEP.
1959 	 */
1960 	list_for_each_safe(p, tmp, (struct list_head *)id->provider_data) {
1961 		struct siw_cep *cep = list_entry(p, struct siw_cep, listenq);
1962 		struct socket *s = cep->sock;
1963 
1964 		list_del(p);
1965 
1966 		siw_dbg_cep(cep, "drop cep, state %d\n", cep->state);
1967 
1968 		siw_cep_set_inuse(cep);
1969 
1970 		siw_free_cm_id(cep);
1971 		if (s) {
1972 			siw_socket_disassoc(s);
1973 			sock_release(s);
1974 		}
1975 		cep->state = SIW_EPSTATE_CLOSED;
1976 		siw_cep_set_free_and_put(cep);
1977 	}
1978 }
1979 
1980 int siw_destroy_listen(struct iw_cm_id *id)
1981 {
1982 	if (!id->provider_data) {
1983 		siw_dbg(id->device, "no cep(s)\n");
1984 		return 0;
1985 	}
1986 	siw_drop_listeners(id);
1987 	kfree(id->provider_data);
1988 	id->provider_data = NULL;
1989 
1990 	return 0;
1991 }
1992 
1993 int siw_cm_init(void)
1994 {
1995 	/*
1996 	 * create_single_workqueue for strict ordering
1997 	 */
1998 	siw_cm_wq = create_singlethread_workqueue("siw_cm_wq");
1999 	if (!siw_cm_wq)
2000 		return -ENOMEM;
2001 
2002 	return 0;
2003 }
2004 
2005 void siw_cm_exit(void)
2006 {
2007 	if (siw_cm_wq)
2008 		destroy_workqueue(siw_cm_wq);
2009 }
2010