1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 
3 /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
4 /*          Fredy Neeser */
5 /*          Greg Joyce <greg@opengridcomputing.com> */
6 /* Copyright (c) 2008-2019, IBM Corporation */
7 /* Copyright (c) 2017, Open Grid Computing, Inc. */
8 
9 #include <linux/errno.h>
10 #include <linux/types.h>
11 #include <linux/net.h>
12 #include <linux/inetdevice.h>
13 #include <net/addrconf.h>
14 #include <linux/workqueue.h>
15 #include <net/sock.h>
16 #include <net/tcp.h>
17 #include <linux/inet.h>
18 #include <linux/tcp.h>
19 #include <trace/events/sock.h>
20 
21 #include <rdma/iw_cm.h>
22 #include <rdma/ib_verbs.h>
23 #include <rdma/ib_user_verbs.h>
24 
25 #include "siw.h"
26 #include "siw_cm.h"
27 
28 /*
29  * Set to any combination of
30  * MPA_V2_RDMA_NO_RTR, MPA_V2_RDMA_READ_RTR, MPA_V2_RDMA_WRITE_RTR
31  */
32 static __be16 rtr_type = MPA_V2_RDMA_READ_RTR | MPA_V2_RDMA_WRITE_RTR;
33 static const bool relaxed_ird_negotiation = true;
34 
35 static void siw_cm_llp_state_change(struct sock *s);
36 static void siw_cm_llp_data_ready(struct sock *s);
37 static void siw_cm_llp_write_space(struct sock *s);
38 static void siw_cm_llp_error_report(struct sock *s);
39 static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason,
40 			 int status);
41 
42 
43 #ifdef CONFIG_DEBUG_LOCK_ALLOC
44 /*
45  * lockdep can report false positive circular dependencies
46  * when there are user-space socket API users, or in-kernel
47  * users switching between a TCP and an RDMA transport.
48  * Switching between siw and rxe may also cause problems,
49  * since by default sockets are classified only by family
50  * and not by IP protocol, and there may be different
51  * locks used between the application and the
52  * low-level sockets.
53  *
54  * Problems were seen with ksmbd.ko and cifs.ko when
55  * switching transports; use git blame to find
56  * more details.
57  */
58 static struct lock_class_key siw_sk_key[2];
59 static struct lock_class_key siw_slock_key[2];
60 #endif /* CONFIG_DEBUG_LOCK_ALLOC */
61 
62 static inline void siw_reclassify_socket(struct socket *sock)
63 {
64 #ifdef CONFIG_DEBUG_LOCK_ALLOC
65 	struct sock *sk = sock->sk;
66 
67 	if (WARN_ON_ONCE(!sock_allow_reclassification(sk)))
68 		return;
69 
70 	switch (sk->sk_family) {
71 	case AF_INET:
72 		sock_lock_init_class_and_name(sk,
73 					      "slock-AF_INET-RDMA-SIW",
74 					      &siw_slock_key[0],
75 					      "sk_lock-AF_INET-RDMA-SIW",
76 					      &siw_sk_key[0]);
77 		break;
78 	case AF_INET6:
79 		sock_lock_init_class_and_name(sk,
80 					      "slock-AF_INET6-RDMA-SIW",
81 					      &siw_slock_key[1],
82 					      "sk_lock-AF_INET6-RDMA-SIW",
83 					      &siw_sk_key[1]);
84 		break;
85 	default:
86 		WARN_ON_ONCE(1);
87 	}
88 #endif /* CONFIG_DEBUG_LOCK_ALLOC */
89 }
90 
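/*
 * Save the socket's original upcalls in the CEP and install the
 * CM upcall handlers, serialized via sk_callback_lock.
 */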
91 static void siw_sk_assign_cm_upcalls(struct sock *sk)
92 {
93 	struct siw_cep *cep = sk_to_cep(sk);
94 
95 	write_lock_bh(&sk->sk_callback_lock);
96 	cep->sk_state_change = sk->sk_state_change;
97 	cep->sk_data_ready = sk->sk_data_ready;
98 	cep->sk_write_space = sk->sk_write_space;
99 	cep->sk_error_report = sk->sk_error_report;
100 
101 	sk->sk_state_change = siw_cm_llp_state_change;
102 	sk->sk_data_ready = siw_cm_llp_data_ready;
103 	sk->sk_write_space = siw_cm_llp_write_space;
104 	sk->sk_error_report = siw_cm_llp_error_report;
105 	write_unlock_bh(&sk->sk_callback_lock);
106 }
107 
108 static void siw_sk_restore_upcalls(struct sock *sk, struct siw_cep *cep)
109 {
110 	sk->sk_state_change = cep->sk_state_change;
111 	sk->sk_data_ready = cep->sk_data_ready;
112 	sk->sk_write_space = cep->sk_write_space;
113 	sk->sk_error_report = cep->sk_error_report;
114 	sk->sk_user_data = NULL;
115 }
116 
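/*
 * Hand the socket's receive and write space upcalls over to the
 * QP once the connection is established.
 */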
117 static void siw_qp_socket_assoc(struct siw_cep *cep, struct siw_qp *qp)
118 {
119 	struct socket *s = cep->sock;
120 	struct sock *sk = s->sk;
121 
122 	write_lock_bh(&sk->sk_callback_lock);
123 
124 	qp->attrs.sk = s;
125 	sk->sk_data_ready = siw_qp_llp_data_ready;
126 	sk->sk_write_space = siw_qp_llp_write_space;
127 
128 	write_unlock_bh(&sk->sk_callback_lock);
129 }
130 
131 static void siw_socket_disassoc(struct socket *s)
132 {
133 	struct sock *sk = s->sk;
134 	struct siw_cep *cep;
135 
136 	if (sk) {
137 		write_lock_bh(&sk->sk_callback_lock);
138 		cep = sk_to_cep(sk);
139 		if (cep) {
140 			siw_sk_restore_upcalls(sk, cep);
141 			siw_cep_put(cep);
142 		} else {
143 			pr_warn("siw: cannot restore sk callbacks: no ep\n");
144 		}
145 		write_unlock_bh(&sk->sk_callback_lock);
146 	} else {
147 		pr_warn("siw: cannot restore sk callbacks: no sk\n");
148 	}
149 }
150 
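/*
 * Data ready upcall used while waiting for the peer's zero length
 * RTR frame: feed the first frame to siw_tcp_rx_data() via
 * tcp_read_sock() and, if it was accepted, signal full connection
 * establishment to the IWCM.
 */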
151 static void siw_rtr_data_ready(struct sock *sk)
152 {
153 	struct siw_cep *cep;
154 	struct siw_qp *qp = NULL;
155 	read_descriptor_t rd_desc;
156 
157 	trace_sk_data_ready(sk);
158 
159 	read_lock(&sk->sk_callback_lock);
160 
161 	cep = sk_to_cep(sk);
162 	if (!cep) {
163 		WARN(1, "No connection endpoint\n");
164 		goto out;
165 	}
166 	qp = sk_to_qp(sk);
167 
168 	memset(&rd_desc, 0, sizeof(rd_desc));
169 	rd_desc.arg.data = qp;
170 	rd_desc.count = 1;
171 
172 	tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data);
173 	/*
174 	 * Check if first frame was successfully processed.
175 	 * Signal connection full establishment if yes.
176 	 * Failed data processing would have already scheduled
177 	 * connection drop.
178 	 */
179 	if (!qp->rx_stream.rx_suspend)
180 		siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0);
181 out:
182 	read_unlock(&sk->sk_callback_lock);
183 	if (qp)
184 		siw_qp_socket_assoc(cep, qp);
185 }
186 
187 static void siw_sk_assign_rtr_upcalls(struct siw_cep *cep)
188 {
189 	struct sock *sk = cep->sock->sk;
190 
191 	write_lock_bh(&sk->sk_callback_lock);
192 	sk->sk_data_ready = siw_rtr_data_ready;
193 	sk->sk_write_space = siw_qp_llp_write_space;
194 	write_unlock_bh(&sk->sk_callback_lock);
195 }
196 
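/*
 * Associate a CEP with its socket: take an extra CEP reference,
 * link the socket back to the CEP via sk_user_data and install
 * the CM socket upcalls.
 */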
197 static void siw_cep_socket_assoc(struct siw_cep *cep, struct socket *s)
198 {
199 	cep->sock = s;
200 	siw_cep_get(cep);
201 	s->sk->sk_user_data = cep;
202 
203 	siw_sk_assign_cm_upcalls(s->sk);
204 }
205 
206 static struct siw_cep *siw_cep_alloc(struct siw_device *sdev)
207 {
208 	struct siw_cep *cep = kzalloc(sizeof(*cep), GFP_KERNEL);
209 	unsigned long flags;
210 
211 	if (!cep)
212 		return NULL;
213 
214 	INIT_LIST_HEAD(&cep->listenq);
215 	INIT_LIST_HEAD(&cep->devq);
216 	INIT_LIST_HEAD(&cep->work_freelist);
217 
218 	kref_init(&cep->ref);
219 	cep->state = SIW_EPSTATE_IDLE;
220 	init_waitqueue_head(&cep->waitq);
221 	spin_lock_init(&cep->lock);
222 	cep->sdev = sdev;
223 	cep->enhanced_rdma_conn_est = false;
224 
225 	spin_lock_irqsave(&sdev->lock, flags);
226 	list_add_tail(&cep->devq, &sdev->cep_list);
227 	spin_unlock_irqrestore(&sdev->lock, flags);
228 
229 	siw_dbg_cep(cep, "new endpoint\n");
230 	return cep;
231 }
232 
233 static void siw_cm_free_work(struct siw_cep *cep)
234 {
235 	struct list_head *w, *tmp;
236 	struct siw_cm_work *work;
237 
238 	list_for_each_safe(w, tmp, &cep->work_freelist) {
239 		work = list_entry(w, struct siw_cm_work, list);
240 		list_del(&work->list);
241 		kfree(work);
242 	}
243 }
244 
245 static void siw_cancel_mpatimer(struct siw_cep *cep)
246 {
247 	spin_lock_bh(&cep->lock);
248 	if (cep->mpa_timer) {
249 		if (cancel_delayed_work(&cep->mpa_timer->work)) {
250 			siw_cep_put(cep);
251 			kfree(cep->mpa_timer); /* not needed again */
252 		}
253 		cep->mpa_timer = NULL;
254 	}
255 	spin_unlock_bh(&cep->lock);
256 }
257 
258 static void siw_put_work(struct siw_cm_work *work)
259 {
260 	INIT_LIST_HEAD(&work->list);
261 	spin_lock_bh(&work->cep->lock);
262 	list_add(&work->list, &work->cep->work_freelist);
263 	spin_unlock_bh(&work->cep->lock);
264 }
265 
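/*
 * Mark a CEP as exclusively in use. If the CEP is already busy,
 * wait until the current user releases it via siw_cep_set_free().
 */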
266 static void siw_cep_set_inuse(struct siw_cep *cep)
267 {
268 	unsigned long flags;
269 retry:
270 	spin_lock_irqsave(&cep->lock, flags);
271 
272 	if (cep->in_use) {
273 		spin_unlock_irqrestore(&cep->lock, flags);
274 		wait_event_interruptible(cep->waitq, !cep->in_use);
275 		if (signal_pending(current))
276 			flush_signals(current);
277 		goto retry;
278 	} else {
279 		cep->in_use = 1;
280 		spin_unlock_irqrestore(&cep->lock, flags);
281 	}
282 }
283 
284 static void siw_cep_set_free(struct siw_cep *cep)
285 {
286 	unsigned long flags;
287 
288 	spin_lock_irqsave(&cep->lock, flags);
289 	cep->in_use = 0;
290 	spin_unlock_irqrestore(&cep->lock, flags);
291 
292 	wake_up(&cep->waitq);
293 }
294 
295 static void __siw_cep_dealloc(struct kref *ref)
296 {
297 	struct siw_cep *cep = container_of(ref, struct siw_cep, ref);
298 	struct siw_device *sdev = cep->sdev;
299 	unsigned long flags;
300 
301 	WARN_ON(cep->listen_cep);
302 
303 	/* kfree(NULL) is safe */
304 	kfree(cep->mpa.pdata);
305 	spin_lock_bh(&cep->lock);
306 	if (!list_empty(&cep->work_freelist))
307 		siw_cm_free_work(cep);
308 	spin_unlock_bh(&cep->lock);
309 
310 	spin_lock_irqsave(&sdev->lock, flags);
311 	list_del(&cep->devq);
312 	spin_unlock_irqrestore(&sdev->lock, flags);
313 
314 	siw_dbg_cep(cep, "free endpoint\n");
315 	kfree(cep);
316 }
317 
318 static struct siw_cm_work *siw_get_work(struct siw_cep *cep)
319 {
320 	struct siw_cm_work *work = NULL;
321 
322 	spin_lock_bh(&cep->lock);
323 	if (!list_empty(&cep->work_freelist)) {
324 		work = list_entry(cep->work_freelist.next, struct siw_cm_work,
325 				  list);
326 		list_del_init(&work->list);
327 	}
328 	spin_unlock_bh(&cep->lock);
329 	return work;
330 }
331 
332 static int siw_cm_alloc_work(struct siw_cep *cep, int num)
333 {
334 	struct siw_cm_work *work;
335 
336 	while (num--) {
337 		work = kmalloc(sizeof(*work), GFP_KERNEL);
338 		if (!work) {
339 			if (!(list_empty(&cep->work_freelist)))
340 				siw_cm_free_work(cep);
341 			return -ENOMEM;
342 		}
343 		work->cep = cep;
344 		INIT_LIST_HEAD(&work->list);
345 		list_add(&work->list, &cep->work_freelist);
346 	}
347 	return 0;
348 }
349 
350 /*
351  * siw_cm_upcall()
352  *
353  * Upcall to IWCM to inform about async connection events
354  */
355 static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason,
356 			 int status)
357 {
358 	struct iw_cm_event event;
359 	struct iw_cm_id *id;
360 
361 	memset(&event, 0, sizeof(event));
362 	event.status = status;
363 	event.event = reason;
364 
365 	if (reason == IW_CM_EVENT_CONNECT_REQUEST) {
366 		event.provider_data = cep;
367 		id = cep->listen_cep->cm_id;
368 	} else {
369 		id = cep->cm_id;
370 	}
371 	/* Signal IRD and ORD */
372 	if (reason == IW_CM_EVENT_ESTABLISHED ||
373 	    reason == IW_CM_EVENT_CONNECT_REPLY) {
374 		/* Signal negotiated IRD/ORD values we will use */
375 		event.ird = cep->ird;
376 		event.ord = cep->ord;
377 	} else if (reason == IW_CM_EVENT_CONNECT_REQUEST) {
378 		event.ird = cep->ord;
379 		event.ord = cep->ird;
380 	}
381 	/* Signal private data and address information */
382 	if (reason == IW_CM_EVENT_CONNECT_REQUEST ||
383 	    reason == IW_CM_EVENT_CONNECT_REPLY) {
384 		u16 pd_len = be16_to_cpu(cep->mpa.hdr.params.pd_len);
385 
386 		if (pd_len) {
387 			/*
388 			 * hand over MPA private data
389 			 */
390 			event.private_data_len = pd_len;
391 			event.private_data = cep->mpa.pdata;
392 
393 			/* Hide MPA V2 IRD/ORD control */
394 			if (cep->enhanced_rdma_conn_est) {
395 				event.private_data_len -=
396 					sizeof(struct mpa_v2_data);
397 				event.private_data +=
398 					sizeof(struct mpa_v2_data);
399 			}
400 		}
401 		getname_local(cep->sock, &event.local_addr);
402 		getname_peer(cep->sock, &event.remote_addr);
403 	}
404 	siw_dbg_cep(cep, "[QP %u]: reason=%d, status=%d\n",
405 		    cep->qp ? qp_id(cep->qp) : UINT_MAX, reason, status);
406 
407 	return id->event_handler(id, &event);
408 }
409 
410 static void siw_free_cm_id(struct siw_cep *cep)
411 {
412 	if (!cep->cm_id)
413 		return;
414 
415 	cep->cm_id->rem_ref(cep->cm_id);
416 	cep->cm_id = NULL;
417 }
418 
419 static void siw_destroy_cep_sock(struct siw_cep *cep)
420 {
421 	if (cep->sock) {
422 		siw_socket_disassoc(cep->sock);
423 		sock_release(cep->sock);
424 		cep->sock = NULL;
425 	}
426 }
427 
428 /*
429  * siw_qp_cm_drop()
430  *
431  * Drops established LLP connection if present and not already
432  * scheduled for dropping. Called from user context, SQ workqueue
433  * or receive IRQ. Caller signals if socket can be immediately
434  * closed (basically, if not in IRQ).
435  */
436 void siw_qp_cm_drop(struct siw_qp *qp, int schedule)
437 {
438 	struct siw_cep *cep = qp->cep;
439 
440 	qp->rx_stream.rx_suspend = 1;
441 	qp->tx_ctx.tx_suspend = 1;
442 
443 	if (!qp->cep)
444 		return;
445 
446 	if (schedule) {
447 		siw_cm_queue_work(cep, SIW_CM_WORK_CLOSE_LLP);
448 	} else {
449 		siw_cep_set_inuse(cep);
450 
451 		if (cep->state == SIW_EPSTATE_CLOSED) {
452 			siw_dbg_cep(cep, "already closed\n");
453 			goto out;
454 		}
455 		siw_dbg_cep(cep, "immediate close, state %d\n", cep->state);
456 
457 		siw_send_terminate(qp);
458 
459 		if (cep->cm_id) {
460 			switch (cep->state) {
461 			case SIW_EPSTATE_AWAIT_MPAREP:
462 				siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
463 					      -EINVAL);
464 				break;
465 
466 			case SIW_EPSTATE_RDMA_MODE:
467 				siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
468 				break;
469 
470 			case SIW_EPSTATE_IDLE:
471 			case SIW_EPSTATE_LISTENING:
472 			case SIW_EPSTATE_CONNECTING:
473 			case SIW_EPSTATE_AWAIT_MPAREQ:
474 			case SIW_EPSTATE_RECVD_MPAREQ:
475 			case SIW_EPSTATE_CLOSED:
476 			default:
477 				break;
478 			}
479 			siw_free_cm_id(cep);
480 			siw_cep_put(cep);
481 		}
482 		cep->state = SIW_EPSTATE_CLOSED;
483 
484 		siw_destroy_cep_sock(cep);
485 		if (cep->qp) {
486 			cep->qp = NULL;
487 			siw_qp_put(qp);
488 		}
489 out:
490 		siw_cep_set_free(cep);
491 	}
492 }
493 
494 void siw_cep_put(struct siw_cep *cep)
495 {
496 	WARN_ON(kref_read(&cep->ref) < 1);
497 	kref_put(&cep->ref, __siw_cep_dealloc);
498 }
499 
500 static void siw_cep_set_free_and_put(struct siw_cep *cep)
501 {
502 	siw_cep_set_free(cep);
503 	siw_cep_put(cep);
504 }
505 
506 void siw_cep_get(struct siw_cep *cep)
507 {
508 	kref_get(&cep->ref);
509 }
510 
511 /*
512  * Expects params->pd_len in host byte order
513  */
514 static int siw_send_mpareqrep(struct siw_cep *cep, const void *pdata, u8 pd_len)
515 {
516 	struct socket *s = cep->sock;
517 	struct mpa_rr *rr = &cep->mpa.hdr;
518 	struct kvec iov[3];
519 	struct msghdr msg;
520 	int rv;
521 	int iovec_num = 0;
522 	int mpa_len;
523 
524 	memset(&msg, 0, sizeof(msg));
525 
526 	iov[iovec_num].iov_base = rr;
527 	iov[iovec_num].iov_len = sizeof(*rr);
528 	mpa_len = sizeof(*rr);
529 
530 	if (cep->enhanced_rdma_conn_est) {
531 		iovec_num++;
532 		iov[iovec_num].iov_base = &cep->mpa.v2_ctrl;
533 		iov[iovec_num].iov_len = sizeof(cep->mpa.v2_ctrl);
534 		mpa_len += sizeof(cep->mpa.v2_ctrl);
535 	}
536 	if (pd_len) {
537 		iovec_num++;
538 		iov[iovec_num].iov_base = (char *)pdata;
539 		iov[iovec_num].iov_len = pd_len;
540 		mpa_len += pd_len;
541 	}
542 	if (cep->enhanced_rdma_conn_est)
543 		pd_len += sizeof(cep->mpa.v2_ctrl);
544 
545 	rr->params.pd_len = cpu_to_be16(pd_len);
546 
547 	rv = kernel_sendmsg(s, &msg, iov, iovec_num + 1, mpa_len);
548 
549 	return rv < 0 ? rv : 0;
550 }
551 
552 /*
553  * Receive MPA Request/Reply header.
554  *
555  * Returns 0 if complete MPA Request/Reply header including
556  * eventual private data was received. Returns -EAGAIN if
557  * header was partially received or negative error code otherwise.
558  *
559  * Context: May be called in process context only
560  */
561 static int siw_recv_mpa_rr(struct siw_cep *cep)
562 {
563 	struct mpa_rr *hdr = &cep->mpa.hdr;
564 	struct socket *s = cep->sock;
565 	u16 pd_len;
566 	int rcvd, to_rcv;
567 
568 	if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) {
569 		rcvd = ksock_recv(s, (char *)hdr + cep->mpa.bytes_rcvd,
570 				  sizeof(struct mpa_rr) - cep->mpa.bytes_rcvd,
571 				  0);
572 		if (rcvd <= 0)
573 			return -ECONNABORTED;
574 
575 		cep->mpa.bytes_rcvd += rcvd;
576 
577 		if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr))
578 			return -EAGAIN;
579 
580 		if (be16_to_cpu(hdr->params.pd_len) > MPA_MAX_PRIVDATA)
581 			return -EPROTO;
582 	}
583 	pd_len = be16_to_cpu(hdr->params.pd_len);
584 
585 	/*
586 	 * At least the MPA Request/Reply header (frame not including
587 	 * private data) has been received.
588 	 * Receive (or continue receiving) any private data.
589 	 */
590 	to_rcv = pd_len - (cep->mpa.bytes_rcvd - sizeof(struct mpa_rr));
591 
592 	if (!to_rcv) {
593 		/*
594 		 * We must have hdr->params.pd_len == 0 and thus received a
595 		 * complete MPA Request/Reply frame.
596 		 * Check against peer protocol violation.
597 		 */
598 		u32 word;
599 
600 		rcvd = ksock_recv(s, (char *)&word, sizeof(word), MSG_DONTWAIT);
601 		if (rcvd == -EAGAIN)
602 			return 0;
603 
604 		if (rcvd == 0) {
605 			siw_dbg_cep(cep, "peer EOF\n");
606 			return -EPIPE;
607 		}
608 		if (rcvd < 0) {
609 			siw_dbg_cep(cep, "error: %d\n", rcvd);
610 			return rcvd;
611 		}
612 		siw_dbg_cep(cep, "peer sent extra data: %d\n", rcvd);
613 
614 		return -EPROTO;
615 	}
616 
617 	/*
618 	 * At this point, hdr->params.pd_len != 0 must hold. Allocate
619 	 * a private data buffer if not already done.
620 	 */
621 	if (!cep->mpa.pdata) {
622 		cep->mpa.pdata = kmalloc(pd_len + 4, GFP_KERNEL);
623 		if (!cep->mpa.pdata)
624 			return -ENOMEM;
625 	}
626 	rcvd = ksock_recv(
627 		s, cep->mpa.pdata + cep->mpa.bytes_rcvd - sizeof(struct mpa_rr),
628 		to_rcv + 4, MSG_DONTWAIT);
629 
630 	if (rcvd < 0)
631 		return rcvd;
632 
633 	if (rcvd > to_rcv)
634 		return -EPROTO;
635 
636 	cep->mpa.bytes_rcvd += rcvd;
637 
638 	if (to_rcv == rcvd) {
639 		siw_dbg_cep(cep, "%d bytes private data received\n", pd_len);
640 		return 0;
641 	}
642 	return -EAGAIN;
643 }
644 
645 /*
646  * siw_proc_mpareq()
647  *
648  * Read MPA Request from socket and signal new connection to IWCM
649  * if success. Caller must hold lock on corresponding listening CEP.
650  */
651 static int siw_proc_mpareq(struct siw_cep *cep)
652 {
653 	struct mpa_rr *req;
654 	int version, rv;
655 	u16 pd_len;
656 
657 	rv = siw_recv_mpa_rr(cep);
658 	if (rv)
659 		return rv;
660 
661 	req = &cep->mpa.hdr;
662 
663 	version = __mpa_rr_revision(req->params.bits);
664 	pd_len = be16_to_cpu(req->params.pd_len);
665 
666 	if (version > MPA_REVISION_2)
667 		/* allow for 0, 1, and 2 only */
668 		return -EPROTO;
669 
670 	if (memcmp(req->key, MPA_KEY_REQ, 16))
671 		return -EPROTO;
672 
673 	/* Prepare for sending MPA reply */
674 	memcpy(req->key, MPA_KEY_REP, 16);
675 
676 	if (version == MPA_REVISION_2 &&
677 	    (req->params.bits & MPA_RR_FLAG_ENHANCED)) {
678 		/*
679 		 * MPA version 2 must signal IRD/ORD values and P2P mode
680 		 * in private data if header flag MPA_RR_FLAG_ENHANCED
681 		 * is set.
682 		 */
683 		if (pd_len < sizeof(struct mpa_v2_data))
684 			goto reject_conn;
685 
686 		cep->enhanced_rdma_conn_est = true;
687 	}
688 
689 	/* MPA Markers: currently not supported. Marker TX to be added. */
690 	if (req->params.bits & MPA_RR_FLAG_MARKERS)
691 		goto reject_conn;
692 
693 	if (req->params.bits & MPA_RR_FLAG_CRC) {
694 		/*
695 		 * RFC 5044, page 27: CRC MUST be used if peer requests it.
696 		 * siw specific: reject a connection requesting CRC if
697 		 * CRC is locally disabled and the 'mpa_crc_strict'
698 		 * module parameter enforces strict behavior.
699 		 */
700 		if (!mpa_crc_required && mpa_crc_strict)
701 			goto reject_conn;
702 
703 		/* Enable CRC if requested by module parameter */
704 		if (mpa_crc_required)
705 			req->params.bits |= MPA_RR_FLAG_CRC;
706 	}
707 	if (cep->enhanced_rdma_conn_est) {
708 		struct mpa_v2_data *v2 = (struct mpa_v2_data *)cep->mpa.pdata;
709 
710 		/*
711 		 * Peer requested ORD becomes requested local IRD,
712 		 * peer requested IRD becomes requested local ORD.
713 		 * IRD and ORD get limited by global maximum values.
714 		 */
715 		cep->ord = ntohs(v2->ird) & MPA_IRD_ORD_MASK;
716 		cep->ord = min(cep->ord, SIW_MAX_ORD_QP);
717 		cep->ird = ntohs(v2->ord) & MPA_IRD_ORD_MASK;
718 		cep->ird = min(cep->ird, SIW_MAX_IRD_QP);
719 
720 		/* May get overwritten by locally negotiated values */
721 		cep->mpa.v2_ctrl.ird = htons(cep->ird);
722 		cep->mpa.v2_ctrl.ord = htons(cep->ord);
723 
724 		/*
725 		 * Support peer-sent zero length Write or Read to let
726 		 * the local side enter RTS. Writes are preferred.
727 		 * Sends would require pre-posting a Receive and are
728 		 * not supported.
729 		 * Propose a zero length Write if neither Read nor
730 		 * Write is indicated.
731 		 */
732 		if (v2->ird & MPA_V2_PEER_TO_PEER) {
733 			cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER;
734 
735 			if (v2->ord & MPA_V2_RDMA_WRITE_RTR)
736 				cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR;
737 			else if (v2->ord & MPA_V2_RDMA_READ_RTR)
738 				cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_READ_RTR;
739 			else
740 				cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR;
741 		}
742 	}
743 
744 	cep->state = SIW_EPSTATE_RECVD_MPAREQ;
745 
746 	/* Keep reference until IWCM accepts/rejects */
747 	siw_cep_get(cep);
748 	rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REQUEST, 0);
749 	if (rv)
750 		siw_cep_put(cep);
751 
752 	return rv;
753 
754 reject_conn:
755 	siw_dbg_cep(cep, "reject: crc %d:%d:%d, m %d:%d\n",
756 		    req->params.bits & MPA_RR_FLAG_CRC ? 1 : 0,
757 		    mpa_crc_required, mpa_crc_strict,
758 		    req->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0);
759 
760 	req->params.bits &= ~MPA_RR_FLAG_MARKERS;
761 	req->params.bits |= MPA_RR_FLAG_REJECT;
762 
763 	if (!mpa_crc_required && mpa_crc_strict)
764 		req->params.bits &= ~MPA_RR_FLAG_CRC;
765 
766 	if (pd_len)
767 		kfree(cep->mpa.pdata);
768 
769 	cep->mpa.pdata = NULL;
770 
771 	siw_send_mpareqrep(cep, NULL, 0);
772 
773 	return -EOPNOTSUPP;
774 }
775 
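/*
 * siw_proc_mpareply()
 *
 * Process the peer's MPA Reply on the active side: validate revision,
 * key and negotiated options, move the QP to RTS and, if a peer-to-peer
 * mode was negotiated, send the extra RTR frame.
 */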
776 static int siw_proc_mpareply(struct siw_cep *cep)
777 {
778 	struct siw_qp_attrs qp_attrs;
779 	enum siw_qp_attr_mask qp_attr_mask;
780 	struct siw_qp *qp = cep->qp;
781 	struct mpa_rr *rep;
782 	int rv;
783 	u16 rep_ord;
784 	u16 rep_ird;
785 	bool ird_insufficient = false;
786 	enum mpa_v2_ctrl mpa_p2p_mode = MPA_V2_RDMA_NO_RTR;
787 
788 	rv = siw_recv_mpa_rr(cep);
789 	if (rv)
790 		goto out_err;
791 
792 	siw_cancel_mpatimer(cep);
793 
794 	rep = &cep->mpa.hdr;
795 
796 	if (__mpa_rr_revision(rep->params.bits) > MPA_REVISION_2) {
797 		/* allow for 0, 1, and 2 only */
798 		rv = -EPROTO;
799 		goto out_err;
800 	}
801 	if (memcmp(rep->key, MPA_KEY_REP, 16)) {
802 		siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, LLP_ETYPE_MPA,
803 				   LLP_ECODE_INVALID_REQ_RESP, 0);
804 		siw_send_terminate(qp);
805 		rv = -EPROTO;
806 		goto out_err;
807 	}
808 	if (rep->params.bits & MPA_RR_FLAG_REJECT) {
809 		siw_dbg_cep(cep, "got mpa reject\n");
810 		siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET);
811 
812 		return -ECONNRESET;
813 	}
814 	if (try_gso && rep->params.bits & MPA_RR_FLAG_GSO_EXP) {
815 		siw_dbg_cep(cep, "peer allows GSO on TX\n");
816 		qp->tx_ctx.gso_seg_limit = 0;
817 	}
818 	if ((rep->params.bits & MPA_RR_FLAG_MARKERS) ||
819 	    (mpa_crc_required && !(rep->params.bits & MPA_RR_FLAG_CRC)) ||
820 	    (mpa_crc_strict && !mpa_crc_required &&
821 	     (rep->params.bits & MPA_RR_FLAG_CRC))) {
822 		siw_dbg_cep(cep, "reply unsupp: crc %d:%d:%d, m %d:%d\n",
823 			    rep->params.bits & MPA_RR_FLAG_CRC ? 1 : 0,
824 			    mpa_crc_required, mpa_crc_strict,
825 			    rep->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0);
826 
827 		siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNREFUSED);
828 
829 		return -EINVAL;
830 	}
831 	if (cep->enhanced_rdma_conn_est) {
832 		struct mpa_v2_data *v2;
833 
834 		if (__mpa_rr_revision(rep->params.bits) < MPA_REVISION_2 ||
835 		    !(rep->params.bits & MPA_RR_FLAG_ENHANCED)) {
836 			/*
837 			 * Protocol failure: The responder MUST reply with
838 			 * MPA version 2 and MUST set MPA_RR_FLAG_ENHANCED.
839 			 */
840 			siw_dbg_cep(cep, "mpa reply error: vers %d, enhcd %d\n",
841 				    __mpa_rr_revision(rep->params.bits),
842 				    rep->params.bits & MPA_RR_FLAG_ENHANCED ?
843 					    1 :
844 					    0);
845 
846 			siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
847 				      -ECONNRESET);
848 			return -EINVAL;
849 		}
850 		v2 = (struct mpa_v2_data *)cep->mpa.pdata;
851 		rep_ird = ntohs(v2->ird) & MPA_IRD_ORD_MASK;
852 		rep_ord = ntohs(v2->ord) & MPA_IRD_ORD_MASK;
853 
854 		if (cep->ird < rep_ord &&
855 		    (relaxed_ird_negotiation == false ||
856 		     rep_ord > cep->sdev->attrs.max_ird)) {
857 			siw_dbg_cep(cep, "ird %d, rep_ord %d, max_ord %d\n",
858 				    cep->ird, rep_ord,
859 				    cep->sdev->attrs.max_ord);
860 			ird_insufficient = true;
861 		}
862 		if (cep->ord > rep_ird && relaxed_ird_negotiation == false) {
863 			siw_dbg_cep(cep, "ord %d, rep_ird %d\n", cep->ord,
864 				    rep_ird);
865 			ird_insufficient = true;
866 		}
867 		/*
868 		 * Always report negotiated peer values to user,
869 		 * even if IRD/ORD negotiation failed
870 		 */
871 		cep->ird = rep_ord;
872 		cep->ord = rep_ird;
873 
874 		if (ird_insufficient) {
875 			/*
876 			 * If the initiator IRD is insufficient for the
877 			 * responder ORD, send a TERM.
878 			 */
879 			siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
880 					   LLP_ETYPE_MPA,
881 					   LLP_ECODE_INSUFFICIENT_IRD, 0);
882 			siw_send_terminate(qp);
883 			rv = -ENOMEM;
884 			goto out_err;
885 		}
886 		if (cep->mpa.v2_ctrl_req.ird & MPA_V2_PEER_TO_PEER)
887 			mpa_p2p_mode =
888 				cep->mpa.v2_ctrl_req.ord &
889 				(MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR);
890 
891 		/*
892 		 * Check if we requested P2P mode, and if peer agrees
893 		 */
894 		if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) {
895 			if ((mpa_p2p_mode & v2->ord) == 0) {
896 				/*
897 				 * We requested RTR mode(s), but the peer
898 				 * did not pick any mode we support.
899 				 */
900 				siw_dbg_cep(cep,
901 					    "rtr mode:  req %2x, got %2x\n",
902 					    mpa_p2p_mode,
903 					    v2->ord & (MPA_V2_RDMA_WRITE_RTR |
904 						       MPA_V2_RDMA_READ_RTR));
905 
906 				siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
907 						   LLP_ETYPE_MPA,
908 						   LLP_ECODE_NO_MATCHING_RTR,
909 						   0);
910 				siw_send_terminate(qp);
911 				rv = -EPROTO;
912 				goto out_err;
913 			}
914 			mpa_p2p_mode = v2->ord & (MPA_V2_RDMA_WRITE_RTR |
915 						  MPA_V2_RDMA_READ_RTR);
916 		}
917 	}
918 	memset(&qp_attrs, 0, sizeof(qp_attrs));
919 
920 	if (rep->params.bits & MPA_RR_FLAG_CRC)
921 		qp_attrs.flags = SIW_MPA_CRC;
922 
923 	qp_attrs.irq_size = cep->ird;
924 	qp_attrs.orq_size = cep->ord;
925 	qp_attrs.sk = cep->sock;
926 	qp_attrs.state = SIW_QP_STATE_RTS;
927 
928 	qp_attr_mask = SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE |
929 		       SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD | SIW_QP_ATTR_MPA;
930 
931 	/* Move socket RX/TX under QP control */
932 	down_write(&qp->state_lock);
933 	if (qp->attrs.state > SIW_QP_STATE_RTR) {
934 		rv = -EINVAL;
935 		up_write(&qp->state_lock);
936 		goto out_err;
937 	}
938 	rv = siw_qp_modify(qp, &qp_attrs, qp_attr_mask);
939 
940 	siw_qp_socket_assoc(cep, qp);
941 
942 	up_write(&qp->state_lock);
943 
944 	/* Send extra RDMA frame to trigger peer RTS if negotiated */
945 	if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) {
946 		rv = siw_qp_mpa_rts(qp, mpa_p2p_mode);
947 		if (rv)
948 			goto out_err;
949 	}
950 	if (!rv) {
951 		rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 0);
952 		if (!rv)
953 			cep->state = SIW_EPSTATE_RDMA_MODE;
954 
955 		return 0;
956 	}
957 
958 out_err:
959 	if (rv != -EAGAIN)
960 		siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL);
961 
962 	return rv;
963 }
964 
965 /*
966  * siw_accept_newconn - accept an incoming pending connection
967  *
968  */
969 static void siw_accept_newconn(struct siw_cep *cep)
970 {
971 	struct socket *s = cep->sock;
972 	struct socket *new_s = NULL;
973 	struct siw_cep *new_cep = NULL;
974 	int rv = 0; /* debug only. should disappear */
975 
976 	if (cep->state != SIW_EPSTATE_LISTENING)
977 		goto error;
978 
979 	new_cep = siw_cep_alloc(cep->sdev);
980 	if (!new_cep)
981 		goto error;
982 
983 	/*
984 	 * 4: Allocate a sufficient number of work elements
985 	 * to allow concurrent handling of local + peer close
986 	 * events, MPA header processing + MPA timeout.
987 	 */
988 	if (siw_cm_alloc_work(new_cep, 4) != 0)
989 		goto error;
990 
991 	/*
992 	 * Copy saved socket callbacks from listening CEP
993 	 * and assign new socket with new CEP
994 	 */
995 	new_cep->sk_state_change = cep->sk_state_change;
996 	new_cep->sk_data_ready = cep->sk_data_ready;
997 	new_cep->sk_write_space = cep->sk_write_space;
998 	new_cep->sk_error_report = cep->sk_error_report;
999 
1000 	rv = kernel_accept(s, &new_s, O_NONBLOCK);
1001 	if (rv != 0) {
1002 		/*
1003 		 * Connection already aborted by peer..?
1004 		 */
1005 		siw_dbg_cep(cep, "kernel_accept() error: %d\n", rv);
1006 		goto error;
1007 	}
1008 	new_cep->sock = new_s;
1009 	siw_cep_get(new_cep);
1010 	new_s->sk->sk_user_data = new_cep;
1011 
1012 	if (siw_tcp_nagle == false)
1013 		tcp_sock_set_nodelay(new_s->sk);
1014 	new_cep->state = SIW_EPSTATE_AWAIT_MPAREQ;
1015 
1016 	rv = siw_cm_queue_work(new_cep, SIW_CM_WORK_MPATIMEOUT);
1017 	if (rv)
1018 		goto error;
1019 	/*
1020 	 * See siw_proc_mpareq() etc. for the use of new_cep->listen_cep.
1021 	 */
1022 	new_cep->listen_cep = cep;
1023 	siw_cep_get(cep);
1024 
1025 	if (atomic_read(&new_s->sk->sk_rmem_alloc)) {
1026 		/*
1027 		 * MPA REQ already queued
1028 		 */
1029 		siw_dbg_cep(cep, "immediate mpa request\n");
1030 
1031 		siw_cep_set_inuse(new_cep);
1032 		rv = siw_proc_mpareq(new_cep);
1033 		if (rv != -EAGAIN) {
1034 			siw_cep_put(cep);
1035 			new_cep->listen_cep = NULL;
1036 			if (rv) {
1037 				siw_cancel_mpatimer(new_cep);
1038 				siw_cep_set_free(new_cep);
1039 				goto error;
1040 			}
1041 		}
1042 		siw_cep_set_free(new_cep);
1043 	}
1044 	return;
1045 
1046 error:
1047 	if (new_cep)
1048 		siw_cep_put(new_cep);
1049 
1050 	if (new_s) {
1051 		siw_socket_disassoc(new_s);
1052 		sock_release(new_s);
1053 		new_cep->sock = NULL;
1054 	}
1055 	siw_dbg_cep(cep, "error %d\n", rv);
1056 }
1057 
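/*
 * Central CM work handler: processes accept, MPA header, close and
 * MPA timeout work items with the CEP marked in use, and releases
 * all endpoint resources if the connection is gone.
 */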
1058 static void siw_cm_work_handler(struct work_struct *w)
1059 {
1060 	struct siw_cm_work *work;
1061 	struct siw_cep *cep;
1062 	int release_cep = 0, rv = 0;
1063 
1064 	work = container_of(w, struct siw_cm_work, work.work);
1065 	cep = work->cep;
1066 
1067 	siw_dbg_cep(cep, "[QP %u]: work type: %d, state %d\n",
1068 		    cep->qp ? qp_id(cep->qp) : UINT_MAX,
1069 		    work->type, cep->state);
1070 
1071 	siw_cep_set_inuse(cep);
1072 
1073 	switch (work->type) {
1074 	case SIW_CM_WORK_ACCEPT:
1075 		siw_accept_newconn(cep);
1076 		break;
1077 
1078 	case SIW_CM_WORK_READ_MPAHDR:
1079 		if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) {
1080 			if (cep->listen_cep) {
1081 				siw_cep_set_inuse(cep->listen_cep);
1082 
1083 				if (cep->listen_cep->state ==
1084 				    SIW_EPSTATE_LISTENING)
1085 					rv = siw_proc_mpareq(cep);
1086 				else
1087 					rv = -EFAULT;
1088 
1089 				siw_cep_set_free(cep->listen_cep);
1090 
1091 				if (rv != -EAGAIN) {
1092 					siw_cep_put(cep->listen_cep);
1093 					cep->listen_cep = NULL;
1094 					if (rv)
1095 						siw_cep_put(cep);
1096 				}
1097 			}
1098 		} else if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) {
1099 			rv = siw_proc_mpareply(cep);
1100 		} else {
1101 			/*
1102 			 * CEP already moved out of MPA handshake.
1103 			 * Any connection management is already done.
1104 			 * Silently ignore the MPA packet.
1105 			 */
1106 			if (cep->state == SIW_EPSTATE_RDMA_MODE) {
1107 				cep->sock->sk->sk_data_ready(cep->sock->sk);
1108 				siw_dbg_cep(cep, "already in RDMA mode");
1109 			} else {
1110 				siw_dbg_cep(cep, "out of state: %d\n",
1111 					    cep->state);
1112 			}
1113 		}
1114 		if (rv && rv != -EAGAIN)
1115 			release_cep = 1;
1116 		break;
1117 
1118 	case SIW_CM_WORK_CLOSE_LLP:
1119 		/*
1120 		 * QP scheduled LLP close
1121 		 */
1122 		if (cep->qp)
1123 			siw_send_terminate(cep->qp);
1124 
1125 		if (cep->cm_id)
1126 			siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
1127 
1128 		release_cep = 1;
1129 		break;
1130 
1131 	case SIW_CM_WORK_PEER_CLOSE:
1132 		if (cep->cm_id) {
1133 			if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) {
1134 				/*
1135 				 * MPA reply not received, but connection drop
1136 				 */
1137 				siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
1138 					      -ECONNRESET);
1139 			} else if (cep->state == SIW_EPSTATE_RDMA_MODE) {
1140 				/*
1141 				 * NOTE: IW_CM_EVENT_DISCONNECT is given just
1142 				 *       to transition IWCM into CLOSING.
1143 				 */
1144 				siw_cm_upcall(cep, IW_CM_EVENT_DISCONNECT, 0);
1145 				siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
1146 			}
1147 			/*
1148 			 * for other states there is no connection
1149 			 * known to the IWCM.
1150 			 */
1151 		} else {
1152 			if (cep->state == SIW_EPSTATE_RECVD_MPAREQ) {
1153 				/*
1154 				 * Wait for the ulp/CM to call accept/reject
1155 				 */
1156 				siw_dbg_cep(cep,
1157 					    "mpa req recvd, wait for ULP\n");
1158 			} else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) {
1159 				/*
1160 				 * Socket close before MPA request received.
1161 				 */
1162 				if (cep->listen_cep) {
1163 					siw_dbg_cep(cep,
1164 						"no mpareq: drop listener\n");
1165 					siw_cep_put(cep->listen_cep);
1166 					cep->listen_cep = NULL;
1167 				}
1168 			}
1169 		}
1170 		release_cep = 1;
1171 		break;
1172 
1173 	case SIW_CM_WORK_MPATIMEOUT:
1174 		cep->mpa_timer = NULL;
1175 
1176 		if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) {
1177 			/*
1178 			 * MPA request timed out:
1179 			 * Hide any partially received private data and signal
1180 			 * timeout
1181 			 */
1182 			cep->mpa.hdr.params.pd_len = 0;
1183 
1184 			if (cep->cm_id)
1185 				siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
1186 					      -ETIMEDOUT);
1187 			release_cep = 1;
1188 
1189 		} else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) {
1190 			/*
1191 			 * No MPA request received after peer TCP stream setup.
1192 			 */
1193 			if (cep->listen_cep) {
1194 				siw_cep_put(cep->listen_cep);
1195 				cep->listen_cep = NULL;
1196 			}
1197 			release_cep = 1;
1198 		}
1199 		break;
1200 
1201 	default:
1202 		WARN(1, "Undefined CM work type: %d\n", work->type);
1203 	}
1204 	if (release_cep) {
1205 		siw_dbg_cep(cep,
1206 			    "release: timer=%s, QP[%u]\n",
1207 			    cep->mpa_timer ? "y" : "n",
1208 			    cep->qp ? qp_id(cep->qp) : UINT_MAX);
1209 
1210 		siw_cancel_mpatimer(cep);
1211 
1212 		cep->state = SIW_EPSTATE_CLOSED;
1213 
1214 		if (cep->qp) {
1215 			struct siw_qp *qp = cep->qp;
1216 			/*
1217 			 * Serialize a potential race with application
1218 			 * closing the QP and calling siw_qp_cm_drop()
1219 			 */
1220 			siw_qp_get(qp);
1221 			siw_cep_set_free(cep);
1222 
1223 			siw_qp_llp_close(qp);
1224 			siw_qp_put(qp);
1225 
1226 			siw_cep_set_inuse(cep);
1227 			cep->qp = NULL;
1228 			siw_qp_put(qp);
1229 		}
1230 		if (cep->sock) {
1231 			siw_socket_disassoc(cep->sock);
1232 			sock_release(cep->sock);
1233 			cep->sock = NULL;
1234 		}
1235 		if (cep->cm_id) {
1236 			siw_free_cm_id(cep);
1237 			siw_cep_put(cep);
1238 		}
1239 	}
1240 	siw_cep_set_free(cep);
1241 	siw_put_work(work);
1242 	siw_cep_put(cep);
1243 }
1244 
1245 static struct workqueue_struct *siw_cm_wq;
1246 
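/*
 * Take a work element from the CEP's free list and schedule it on
 * the CM workqueue. MPA timeout work is armed with a delay and
 * remembered in cep->mpa_timer for later cancellation.
 */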
1247 int siw_cm_queue_work(struct siw_cep *cep, enum siw_work_type type)
1248 {
1249 	struct siw_cm_work *work = siw_get_work(cep);
1250 	unsigned long delay = 0;
1251 
1252 	if (!work) {
1253 		siw_dbg_cep(cep, "failed with no work available\n");
1254 		return -ENOMEM;
1255 	}
1256 	work->type = type;
1257 	work->cep = cep;
1258 
1259 	siw_cep_get(cep);
1260 
1261 	INIT_DELAYED_WORK(&work->work, siw_cm_work_handler);
1262 
1263 	if (type == SIW_CM_WORK_MPATIMEOUT) {
1264 		cep->mpa_timer = work;
1265 
1266 		if (cep->state == SIW_EPSTATE_AWAIT_MPAREP)
1267 			delay = MPAREQ_TIMEOUT;
1268 		else
1269 			delay = MPAREP_TIMEOUT;
1270 	}
1271 	siw_dbg_cep(cep, "[QP %u]: work type: %d, timeout %lu\n",
1272 		    cep->qp ? qp_id(cep->qp) : -1, type, delay);
1273 
1274 	queue_delayed_work(siw_cm_wq, &work->work, delay);
1275 
1276 	return 0;
1277 }
1278 
1279 static void siw_cm_llp_data_ready(struct sock *sk)
1280 {
1281 	struct siw_cep *cep;
1282 
1283 	trace_sk_data_ready(sk);
1284 
1285 	read_lock(&sk->sk_callback_lock);
1286 
1287 	cep = sk_to_cep(sk);
1288 	if (!cep)
1289 		goto out;
1290 
1291 	siw_dbg_cep(cep, "cep state: %d, socket state %d\n",
1292 		    cep->state, sk->sk_state);
1293 
1294 	if (sk->sk_state != TCP_ESTABLISHED)
1295 		goto out;
1296 
1297 	switch (cep->state) {
1298 	case SIW_EPSTATE_RDMA_MODE:
1299 	case SIW_EPSTATE_LISTENING:
1300 		break;
1301 
1302 	case SIW_EPSTATE_AWAIT_MPAREQ:
1303 	case SIW_EPSTATE_AWAIT_MPAREP:
1304 		siw_cm_queue_work(cep, SIW_CM_WORK_READ_MPAHDR);
1305 		break;
1306 
1307 	default:
1308 		siw_dbg_cep(cep, "unexpected data, state %d\n", cep->state);
1309 		break;
1310 	}
1311 out:
1312 	read_unlock(&sk->sk_callback_lock);
1313 }
1314 
1315 static void siw_cm_llp_write_space(struct sock *sk)
1316 {
1317 	struct siw_cep *cep = sk_to_cep(sk);
1318 
1319 	if (cep)
1320 		siw_dbg_cep(cep, "state: %d\n", cep->state);
1321 }
1322 
1323 static void siw_cm_llp_error_report(struct sock *sk)
1324 {
1325 	struct siw_cep *cep = sk_to_cep(sk);
1326 
1327 	if (cep) {
1328 		siw_dbg_cep(cep, "error %d, socket state: %d, cep state: %d\n",
1329 			    sk->sk_err, sk->sk_state, cep->state);
1330 		cep->sk_error_report(sk);
1331 	}
1332 }
1333 
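/*
 * TCP state change upcall: ESTABLISHED on a listening socket
 * schedules accept work, CLOSE/CLOSE_WAIT schedules peer close
 * handling.
 */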
1334 static void siw_cm_llp_state_change(struct sock *sk)
1335 {
1336 	struct siw_cep *cep;
1337 	void (*orig_state_change)(struct sock *s);
1338 
1339 	read_lock(&sk->sk_callback_lock);
1340 
1341 	cep = sk_to_cep(sk);
1342 	if (!cep) {
1343 		/* endpoint already disassociated */
1344 		read_unlock(&sk->sk_callback_lock);
1345 		return;
1346 	}
1347 	orig_state_change = cep->sk_state_change;
1348 
1349 	siw_dbg_cep(cep, "state: %d\n", cep->state);
1350 
1351 	switch (sk->sk_state) {
1352 	case TCP_ESTABLISHED:
1353 		/*
1354 		 * Handle the accepting socket as a special case where
1355 		 * only a new connection is possible.
1356 		 */
1357 		siw_cm_queue_work(cep, SIW_CM_WORK_ACCEPT);
1358 		break;
1359 
1360 	case TCP_CLOSE:
1361 	case TCP_CLOSE_WAIT:
1362 		if (cep->qp)
1363 			cep->qp->tx_ctx.tx_suspend = 1;
1364 		siw_cm_queue_work(cep, SIW_CM_WORK_PEER_CLOSE);
1365 		break;
1366 
1367 	default:
1368 		siw_dbg_cep(cep, "unexpected socket state %d\n", sk->sk_state);
1369 	}
1370 	read_unlock(&sk->sk_callback_lock);
1371 	orig_state_change(sk);
1372 }
1373 
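/*
 * Bind the fresh TCP socket to the local address and connect it to
 * the peer. SO_REUSEADDR is set to make the local address available
 * again quickly after a close.
 */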
1374 static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr,
1375 			      struct sockaddr *raddr, bool afonly)
1376 {
1377 	int rv, flags = 0;
1378 	size_t size = laddr->sa_family == AF_INET ?
1379 		sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
1380 
1381 	/*
1382 	 * Make address available again asap.
1383 	 */
1384 	sock_set_reuseaddr(s->sk);
1385 
1386 	if (afonly) {
1387 		rv = ip6_sock_set_v6only(s->sk);
1388 		if (rv)
1389 			return rv;
1390 	}
1391 
1392 	rv = s->ops->bind(s, (struct sockaddr_unsized *)laddr, size);
1393 	if (rv < 0)
1394 		return rv;
1395 
1396 	rv = s->ops->connect(s, (struct sockaddr_unsized *)raddr, size, flags);
1397 
1398 	return rv < 0 ? rv : 0;
1399 }
1400 
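/*
 * siw_connect()
 *
 * Active side connection setup: create and connect a TCP socket,
 * associate it with a new CEP and the given QP, send the MPA Request
 * and arm the MPA reply timeout.
 */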
1401 int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params)
1402 {
1403 	struct siw_device *sdev = to_siw_dev(id->device);
1404 	struct siw_qp *qp;
1405 	struct siw_cep *cep = NULL;
1406 	struct socket *s = NULL;
1407 	struct sockaddr *laddr = (struct sockaddr *)&id->local_addr,
1408 			*raddr = (struct sockaddr *)&id->remote_addr;
1409 	bool p2p_mode = peer_to_peer, v4 = true;
1410 	u16 pd_len = params->private_data_len;
1411 	int version = mpa_version, rv;
1412 
1413 	if (pd_len > MPA_MAX_PRIVDATA)
1414 		return -EINVAL;
1415 
1416 	if (params->ird > sdev->attrs.max_ird ||
1417 	    params->ord > sdev->attrs.max_ord)
1418 		return -ENOMEM;
1419 
1420 	if (laddr->sa_family == AF_INET6)
1421 		v4 = false;
1422 	else if (laddr->sa_family != AF_INET)
1423 		return -EAFNOSUPPORT;
1424 
1425 	/*
1426 	 * Respect any iwarp port mapping: Use mapped remote address
1427 	 * if valid. Local address must not be mapped, since siw
1428 	 * uses kernel TCP stack.
1429 	 */
1430 	if ((v4 && to_sockaddr_in(id->remote_addr).sin_port != 0) ||
1431 	     to_sockaddr_in6(id->remote_addr).sin6_port != 0)
1432 		raddr = (struct sockaddr *)&id->m_remote_addr;
1433 
1434 	qp = siw_qp_id2obj(sdev, params->qpn);
1435 	if (!qp) {
1436 		WARN(1, "[QP %u] does not exist\n", params->qpn);
1437 		rv = -EINVAL;
1438 		goto error;
1439 	}
1440 	siw_dbg_qp(qp, "pd_len %d, laddr %pISp, raddr %pISp\n", pd_len, laddr,
1441 		   raddr);
1442 
1443 	rv = sock_create(v4 ? AF_INET : AF_INET6, SOCK_STREAM, IPPROTO_TCP, &s);
1444 	if (rv < 0)
1445 		goto error;
1446 	siw_reclassify_socket(s);
1447 
1448 	/*
1449 	 * NOTE: For simplification, connect() is called in blocking
1450 	 * mode. Might be reconsidered for async connection setup at
1451 	 * TCP level.
1452 	 */
1453 	rv = kernel_bindconnect(s, laddr, raddr, id->afonly);
1454 	if (rv != 0) {
1455 		siw_dbg_qp(qp, "kernel_bindconnect: error %d\n", rv);
1456 		goto error;
1457 	}
1458 	if (siw_tcp_nagle == false)
1459 		tcp_sock_set_nodelay(s->sk);
1460 	cep = siw_cep_alloc(sdev);
1461 	if (!cep) {
1462 		rv = -ENOMEM;
1463 		goto error;
1464 	}
1465 	siw_cep_set_inuse(cep);
1466 
1467 	/* Associate QP with CEP */
1468 	siw_cep_get(cep);
1469 	qp->cep = cep;
1470 
1471 	/* siw_qp_get(qp) already done by QP lookup */
1472 	cep->qp = qp;
1473 
1474 	id->add_ref(id);
1475 	cep->cm_id = id;
1476 
1477 	/*
1478 	 * 4: Allocate a sufficient number of work elements
1479 	 * to allow concurrent handling of local + peer close
1480 	 * events, MPA header processing + MPA timeout.
1481 	 */
1482 	rv = siw_cm_alloc_work(cep, 4);
1483 	if (rv != 0) {
1484 		rv = -ENOMEM;
1485 		goto error;
1486 	}
1487 	cep->ird = params->ird;
1488 	cep->ord = params->ord;
1489 
1490 	if (p2p_mode && cep->ord == 0)
1491 		cep->ord = 1;
1492 
1493 	cep->state = SIW_EPSTATE_CONNECTING;
1494 
1495 	/*
1496 	 * Associate CEP with socket
1497 	 */
1498 	siw_cep_socket_assoc(cep, s);
1499 
1500 	cep->state = SIW_EPSTATE_AWAIT_MPAREP;
1501 
1502 	/*
1503 	 * Set MPA Request bits: CRC if required, no MPA Markers,
1504 	 * MPA Rev. according to module parameter 'mpa_version', Key 'Request'.
1505 	 */
1506 	cep->mpa.hdr.params.bits = 0;
1507 	if (version > MPA_REVISION_2) {
1508 		pr_warn("Setting MPA version to %u\n", MPA_REVISION_2);
1509 		version = MPA_REVISION_2;
1510 		/* Adjust also module parameter */
1511 		mpa_version = MPA_REVISION_2;
1512 	}
1513 	__mpa_rr_set_revision(&cep->mpa.hdr.params.bits, version);
1514 
1515 	if (try_gso)
1516 		cep->mpa.hdr.params.bits |= MPA_RR_FLAG_GSO_EXP;
1517 
1518 	if (mpa_crc_required)
1519 		cep->mpa.hdr.params.bits |= MPA_RR_FLAG_CRC;
1520 
1521 	/*
1522 	 * If MPA version == 2:
1523 	 * o Include ORD and IRD.
1524 	 * o Indicate peer-to-peer mode, if required by module
1525 	 *   parameter 'peer_to_peer'.
1526 	 */
1527 	if (version == MPA_REVISION_2) {
1528 		cep->enhanced_rdma_conn_est = true;
1529 		cep->mpa.hdr.params.bits |= MPA_RR_FLAG_ENHANCED;
1530 
1531 		cep->mpa.v2_ctrl.ird = htons(cep->ird);
1532 		cep->mpa.v2_ctrl.ord = htons(cep->ord);
1533 
1534 		if (p2p_mode) {
1535 			cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER;
1536 			cep->mpa.v2_ctrl.ord |= rtr_type;
1537 		}
1538 		/* Remember own P2P mode requested */
1539 		cep->mpa.v2_ctrl_req.ird = cep->mpa.v2_ctrl.ird;
1540 		cep->mpa.v2_ctrl_req.ord = cep->mpa.v2_ctrl.ord;
1541 	}
1542 	memcpy(cep->mpa.hdr.key, MPA_KEY_REQ, 16);
1543 
1544 	rv = siw_send_mpareqrep(cep, params->private_data, pd_len);
1545 	/*
1546 	 * Reset private data.
1547 	 */
1548 	cep->mpa.hdr.params.pd_len = 0;
1549 
1550 	if (rv >= 0) {
1551 		rv = siw_cm_queue_work(cep, SIW_CM_WORK_MPATIMEOUT);
1552 		if (!rv) {
1553 			siw_dbg_cep(cep, "[QP %u]: exit\n", qp_id(qp));
1554 			siw_cep_set_free(cep);
1555 			return 0;
1556 		}
1557 	}
1558 error:
1559 	siw_dbg(id->device, "failed: %d\n", rv);
1560 
1561 	if (cep) {
1562 		siw_socket_disassoc(s);
1563 		sock_release(s);
1564 		cep->sock = NULL;
1565 
1566 		cep->qp = NULL;
1567 
1568 		cep->cm_id = NULL;
1569 		id->rem_ref(id);
1570 
1571 		qp->cep = NULL;
1572 		siw_cep_put(cep);
1573 
1574 		cep->state = SIW_EPSTATE_CLOSED;
1575 
1576 		siw_cep_set_free_and_put(cep);
1577 
1578 	} else if (s) {
1579 		sock_release(s);
1580 	}
1581 	if (qp)
1582 		siw_qp_put(qp);
1583 
1584 	return rv;
1585 }
1586 
1587 /*
1588  * siw_accept - Let SoftiWARP accept an RDMA connection request
1589  *
1590  * @id:		New connection management id to be used for accepted
1591  *		connection request
1592  * @params:	Connection parameters provided by ULP for accepting connection
1593  *
1594  * Transition QP to RTS state, associate new CM id @id with accepted CEP
1595  * and get prepared for TCP input by installing socket callbacks.
1596  * Then send MPA Reply and generate the "connection established" event.
1597  * Socket callbacks must be installed before sending MPA Reply, because
1598  * the latter may cause a first RDMA message to arrive from the RDMA Initiator
1599  * side very quickly, at which time the socket callbacks must be ready.
1600  */
1601 int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params)
1602 {
1603 	struct siw_device *sdev = to_siw_dev(id->device);
1604 	struct siw_cep *cep = (struct siw_cep *)id->provider_data;
1605 	struct siw_qp *qp;
1606 	struct siw_qp_attrs qp_attrs;
1607 	int rv = -EINVAL, max_priv_data = MPA_MAX_PRIVDATA;
1608 	bool wait_for_peer_rts = false;
1609 
1610 	siw_cep_set_inuse(cep);
1611 	siw_cep_put(cep);
1612 
1613 	/* Free lingering inbound private data */
1614 	if (cep->mpa.hdr.params.pd_len) {
1615 		cep->mpa.hdr.params.pd_len = 0;
1616 		kfree(cep->mpa.pdata);
1617 		cep->mpa.pdata = NULL;
1618 	}
1619 	siw_cancel_mpatimer(cep);
1620 
1621 	if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) {
1622 		siw_dbg_cep(cep, "out of state\n");
1623 		rv = -ECONNRESET;
1624 		goto free_cep;
1625 	}
1626 	qp = siw_qp_id2obj(sdev, params->qpn);
1627 	if (!qp) {
1628 		WARN(1, "[QP %d] does not exist\n", params->qpn);
1629 		goto free_cep;
1630 	}
1631 	down_write(&qp->state_lock);
1632 	if (qp->attrs.state > SIW_QP_STATE_RTR)
1633 		goto error_unlock;
1634 	siw_dbg_cep(cep, "[QP %d]\n", params->qpn);
1635 
1636 	if (try_gso && cep->mpa.hdr.params.bits & MPA_RR_FLAG_GSO_EXP) {
1637 		siw_dbg_cep(cep, "peer allows GSO on TX\n");
1638 		qp->tx_ctx.gso_seg_limit = 0;
1639 	}
1640 	if (params->ord > sdev->attrs.max_ord ||
1641 	    params->ird > sdev->attrs.max_ird) {
1642 		siw_dbg_cep(
1643 			cep,
1644 			"[QP %u]: ord %d (max %d), ird %d (max %d)\n",
1645 			qp_id(qp), params->ord, sdev->attrs.max_ord,
1646 			params->ird, sdev->attrs.max_ird);
1647 		goto error_unlock;
1648 	}
1649 	if (cep->enhanced_rdma_conn_est)
1650 		max_priv_data -= sizeof(struct mpa_v2_data);
1651 
1652 	if (params->private_data_len > max_priv_data) {
1653 		siw_dbg_cep(
1654 			cep,
1655 			"[QP %u]: private data length: %d (max %d)\n",
1656 			qp_id(qp), params->private_data_len, max_priv_data);
1657 		goto error_unlock;
1658 	}
1659 	if (cep->enhanced_rdma_conn_est) {
1660 		if (params->ord > cep->ord) {
1661 			if (relaxed_ird_negotiation) {
1662 				params->ord = cep->ord;
1663 			} else {
1664 				cep->ird = params->ird;
1665 				cep->ord = params->ord;
1666 				goto error_unlock;
1667 			}
1668 		}
1669 		if (params->ird < cep->ird) {
1670 			if (relaxed_ird_negotiation &&
1671 			    cep->ird <= sdev->attrs.max_ird)
1672 				params->ird = cep->ird;
1673 			else {
1674 				rv = -ENOMEM;
1675 				goto error_unlock;
1676 			}
1677 		}
1678 		if (cep->mpa.v2_ctrl.ord &
1679 		    (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR))
1680 			wait_for_peer_rts = true;
1681 		/*
1682 		 * Signal back negotiated IRD and ORD values
1683 		 */
1684 		cep->mpa.v2_ctrl.ord =
1685 			htons(params->ord & MPA_IRD_ORD_MASK) |
1686 			(cep->mpa.v2_ctrl.ord & ~MPA_V2_MASK_IRD_ORD);
1687 		cep->mpa.v2_ctrl.ird =
1688 			htons(params->ird & MPA_IRD_ORD_MASK) |
1689 			(cep->mpa.v2_ctrl.ird & ~MPA_V2_MASK_IRD_ORD);
1690 	}
1691 	cep->ird = params->ird;
1692 	cep->ord = params->ord;
1693 
1694 	cep->cm_id = id;
1695 	id->add_ref(id);
1696 
1697 	memset(&qp_attrs, 0, sizeof(qp_attrs));
1698 	qp_attrs.orq_size = cep->ord;
1699 	qp_attrs.irq_size = cep->ird;
1700 	qp_attrs.sk = cep->sock;
1701 	if (cep->mpa.hdr.params.bits & MPA_RR_FLAG_CRC)
1702 		qp_attrs.flags = SIW_MPA_CRC;
1703 	qp_attrs.state = SIW_QP_STATE_RTS;
1704 
1705 	siw_dbg_cep(cep, "[QP%u]: moving to rts\n", qp_id(qp));
1706 
1707 	/* Associate QP with CEP */
1708 	siw_cep_get(cep);
1709 	qp->cep = cep;
1710 
1711 	/* siw_qp_get(qp) already done by QP lookup */
1712 	cep->qp = qp;
1713 
1714 	cep->state = SIW_EPSTATE_RDMA_MODE;
1715 
1716 	/* Move socket RX/TX under QP control */
1717 	rv = siw_qp_modify(qp, &qp_attrs,
1718 			   SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE |
1719 				   SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD |
1720 				   SIW_QP_ATTR_MPA);
1721 	up_write(&qp->state_lock);
1722 	if (rv)
1723 		goto error;
1724 
1725 	siw_dbg_cep(cep, "[QP %u]: send mpa reply, %d byte pdata\n",
1726 		    qp_id(qp), params->private_data_len);
1727 
1728 	rv = siw_send_mpareqrep(cep, params->private_data,
1729 				params->private_data_len);
1730 	if (rv != 0)
1731 		goto error;
1732 
1733 	if (wait_for_peer_rts) {
1734 		siw_sk_assign_rtr_upcalls(cep);
1735 	} else {
1736 		siw_qp_socket_assoc(cep, qp);
1737 		rv = siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0);
1738 		if (rv)
1739 			goto error;
1740 	}
1741 	siw_cep_set_free(cep);
1742 
1743 	return 0;
1744 
1745 error_unlock:
1746 	up_write(&qp->state_lock);
1747 error:
1748 	siw_destroy_cep_sock(cep);
1749 
1750 	cep->state = SIW_EPSTATE_CLOSED;
1751 
1752 	siw_free_cm_id(cep);
1753 	if (qp->cep) {
1754 		siw_cep_put(cep);
1755 		qp->cep = NULL;
1756 	}
1757 	cep->qp = NULL;
1758 	siw_qp_put(qp);
1759 free_cep:
1760 	siw_cep_set_free_and_put(cep);
1761 	return rv;
1762 }
1763 
1764 /*
1765  * siw_reject()
1766  *
1767  * Local connection reject case. Send private data back to peer,
1768  * close connection and dereference connection id.
1769  */
1770 int siw_reject(struct iw_cm_id *id, const void *pdata, u8 pd_len)
1771 {
1772 	struct siw_cep *cep = (struct siw_cep *)id->provider_data;
1773 
1774 	siw_cep_set_inuse(cep);
1775 	siw_cep_put(cep);
1776 
1777 	siw_cancel_mpatimer(cep);
1778 
1779 	if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) {
1780 		siw_dbg_cep(cep, "out of state\n");
1781 
1782 		siw_cep_set_free_and_put(cep); /* put last reference */
1783 
1784 		return -ECONNRESET;
1785 	}
1786 	siw_dbg_cep(cep, "cep->state %d, pd_len %d\n", cep->state,
1787 		    pd_len);
1788 
1789 	if (__mpa_rr_revision(cep->mpa.hdr.params.bits) >= MPA_REVISION_1) {
1790 		cep->mpa.hdr.params.bits |= MPA_RR_FLAG_REJECT; /* reject */
1791 		siw_send_mpareqrep(cep, pdata, pd_len);
1792 	}
1793 	siw_destroy_cep_sock(cep);
1794 
1795 	cep->state = SIW_EPSTATE_CLOSED;
1796 
1797 	siw_cep_set_free_and_put(cep);
1798 
1799 	return 0;
1800 }
1801 
1802 /*
1803  * siw_create_listen - Create resources for a listener's IWCM ID @id
1804  *
1805  * Starts listening on the socket address id->local_addr.
1806  *
1807  */
1808 int siw_create_listen(struct iw_cm_id *id, int backlog)
1809 {
1810 	struct socket *s;
1811 	struct siw_cep *cep = NULL;
1812 	struct net_device *ndev = NULL;
1813 	struct siw_device *sdev = to_siw_dev(id->device);
1814 	int addr_family = id->local_addr.ss_family;
1815 	int rv = 0;
1816 
1817 	if (addr_family != AF_INET && addr_family != AF_INET6)
1818 		return -EAFNOSUPPORT;
1819 
1820 	rv = sock_create(addr_family, SOCK_STREAM, IPPROTO_TCP, &s);
1821 	if (rv < 0)
1822 		return rv;
1823 	siw_reclassify_socket(s);
1824 
1825 	/*
1826 	 * Allow binding local port when still in TIME_WAIT from last close.
1827 	 */
1828 	sock_set_reuseaddr(s->sk);
1829 
1830 	if (addr_family == AF_INET) {
1831 		struct sockaddr_in *laddr = &to_sockaddr_in(id->local_addr);
1832 
1833 		/* For wildcard addr, limit binding to current device only */
1834 		if (ipv4_is_zeronet(laddr->sin_addr.s_addr)) {
1835 			ndev = ib_device_get_netdev(id->device, SIW_PORT);
1836 			if (ndev) {
1837 				s->sk->sk_bound_dev_if = ndev->ifindex;
1838 			} else {
1839 				rv = -ENODEV;
1840 				goto error;
1841 			}
1842 		}
1843 		rv = s->ops->bind(s, (struct sockaddr_unsized *)laddr,
1844 				  sizeof(struct sockaddr_in));
1845 	} else {
1846 		struct sockaddr_in6 *laddr = &to_sockaddr_in6(id->local_addr);
1847 
1848 		if (id->afonly) {
1849 			rv = ip6_sock_set_v6only(s->sk);
1850 			if (rv) {
1851 				siw_dbg(id->device,
1852 					"ip6_sock_set_v6only error: %d\n", rv);
1853 				goto error;
1854 			}
1855 		}
1856 
1857 		/* For wildcard addr, limit binding to current device only */
1858 		if (ipv6_addr_any(&laddr->sin6_addr)) {
1859 			ndev = ib_device_get_netdev(id->device, SIW_PORT);
1860 			if (ndev) {
1861 				s->sk->sk_bound_dev_if = ndev->ifindex;
1862 			} else {
1863 				rv = -ENODEV;
1864 				goto error;
1865 			}
1866 		}
1867 		rv = s->ops->bind(s, (struct sockaddr_unsized *)laddr,
1868 				  sizeof(struct sockaddr_in6));
1869 	}
1870 	if (rv) {
1871 		siw_dbg(id->device, "socket bind error: %d\n", rv);
1872 		goto error;
1873 	}
1874 	cep = siw_cep_alloc(sdev);
1875 	if (!cep) {
1876 		rv = -ENOMEM;
1877 		goto error;
1878 	}
1879 	siw_cep_socket_assoc(cep, s);
1880 
1881 	rv = siw_cm_alloc_work(cep, backlog);
1882 	if (rv) {
1883 		siw_dbg(id->device,
1884 			"alloc_work error %d, backlog %d\n",
1885 			rv, backlog);
1886 		goto error;
1887 	}
1888 	rv = s->ops->listen(s, backlog);
1889 	if (rv) {
1890 		siw_dbg(id->device, "listen error %d\n", rv);
1891 		goto error;
1892 	}
1893 	cep->cm_id = id;
1894 	id->add_ref(id);
1895 
1896 	/*
1897 	 * In case of a wildcard rdma_listen on a multi-homed device,
1898 	 * a listener's IWCM id is associated with more than one listening CEP.
1899 	 *
1900 	 * We currently use id->provider_data in three different ways:
1901 	 *
1902 	 * o For a listener's IWCM id, id->provider_data points to
1903 	 *   the list_head of the list of listening CEPs.
1904 	 *   Uses: siw_create_listen(), siw_destroy_listen()
1905 	 *
1906 	 * o For each accepted passive-side IWCM id, id->provider_data
1907 	 *   points to the CEP itself. This is a consequence of
1908 	 *   - siw_cm_upcall() setting event.provider_data = cep and
1909 	 *   - the IWCM's cm_conn_req_handler() setting provider_data of the
1910 	 *     new passive-side IWCM id equal to event.provider_data
1911 	 *   Uses: siw_accept(), siw_reject()
1912 	 *
1913 	 * o For an active-side IWCM id, id->provider_data is not used at all.
1914 	 *
1915 	 */
1916 	if (!id->provider_data) {
1917 		id->provider_data =
1918 			kmalloc(sizeof(struct list_head), GFP_KERNEL);
1919 		if (!id->provider_data) {
1920 			rv = -ENOMEM;
1921 			goto error;
1922 		}
1923 		INIT_LIST_HEAD((struct list_head *)id->provider_data);
1924 	}
1925 	list_add_tail(&cep->listenq, (struct list_head *)id->provider_data);
1926 	cep->state = SIW_EPSTATE_LISTENING;
1927 	dev_put(ndev);
1928 
1929 	siw_dbg(id->device, "Listen at laddr %pISp\n", &id->local_addr);
1930 
1931 	return 0;
1932 
1933 error:
1934 	siw_dbg(id->device, "failed: %d\n", rv);
1935 
1936 	if (cep) {
1937 		siw_cep_set_inuse(cep);
1938 
1939 		siw_free_cm_id(cep);
1940 		cep->sock = NULL;
1941 		siw_socket_disassoc(s);
1942 		cep->state = SIW_EPSTATE_CLOSED;
1943 
1944 		siw_cep_set_free_and_put(cep);
1945 	}
1946 	sock_release(s);
1947 	dev_put(ndev);
1948 
1949 	return rv;
1950 }
1951 
1952 static void siw_drop_listeners(struct iw_cm_id *id)
1953 {
1954 	struct list_head *p, *tmp;
1955 
1956 	/*
1957 	 * In case of a wildcard rdma_listen on a multi-homed device,
1958 	 * a listener's IWCM id is associated with more than one listening CEP.
1959 	 */
1960 	list_for_each_safe(p, tmp, (struct list_head *)id->provider_data) {
1961 		struct siw_cep *cep = list_entry(p, struct siw_cep, listenq);
1962 
1963 		list_del(p);
1964 
1965 		siw_dbg_cep(cep, "drop cep, state %d\n", cep->state);
1966 
1967 		siw_cep_set_inuse(cep);
1968 
1969 		siw_free_cm_id(cep);
1970 		if (cep->sock) {
1971 			siw_socket_disassoc(cep->sock);
1972 			sock_release(cep->sock);
1973 			cep->sock = NULL;
1974 		}
1975 		cep->state = SIW_EPSTATE_CLOSED;
1976 		siw_cep_set_free_and_put(cep);
1977 	}
1978 }
1979 
1980 int siw_destroy_listen(struct iw_cm_id *id)
1981 {
1982 	if (!id->provider_data) {
1983 		siw_dbg(id->device, "no cep(s)\n");
1984 		return 0;
1985 	}
1986 	siw_drop_listeners(id);
1987 	kfree(id->provider_data);
1988 	id->provider_data = NULL;
1989 
1990 	return 0;
1991 }
1992 
1993 int siw_cm_init(void)
1994 {
1995 	/*
1996 	 * create_singlethread_workqueue for strict ordering
1997 	 */
1998 	siw_cm_wq = create_singlethread_workqueue("siw_cm_wq");
1999 	if (!siw_cm_wq)
2000 		return -ENOMEM;
2001 
2002 	return 0;
2003 }
2004 
2005 void siw_cm_exit(void)
2006 {
2007 	if (siw_cm_wq)
2008 		destroy_workqueue(siw_cm_wq);
2009 }
2010