1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2
3 /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
4 /* Fredy Neeser */
5 /* Greg Joyce <greg@opengridcomputing.com> */
6 /* Copyright (c) 2008-2019, IBM Corporation */
7 /* Copyright (c) 2017, Open Grid Computing, Inc. */
8
9 #include <linux/errno.h>
10 #include <linux/types.h>
11 #include <linux/net.h>
12 #include <linux/inetdevice.h>
13 #include <net/addrconf.h>
14 #include <linux/workqueue.h>
15 #include <net/sock.h>
16 #include <net/tcp.h>
17 #include <linux/inet.h>
18 #include <linux/tcp.h>
19 #include <trace/events/sock.h>
20
21 #include <rdma/iw_cm.h>
22 #include <rdma/ib_verbs.h>
23 #include <rdma/ib_user_verbs.h>
24
25 #include "siw.h"
26 #include "siw_cm.h"
27
28 /*
29 * Set to any combination of
30 * MPA_V2_RDMA_NO_RTR, MPA_V2_RDMA_READ_RTR, MPA_V2_RDMA_WRITE_RTR
31 */
32 static __be16 rtr_type = MPA_V2_RDMA_READ_RTR | MPA_V2_RDMA_WRITE_RTR;
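/*
 * If true, tolerate IRD/ORD mismatches during MPA negotiation
 * instead of failing the connection setup.
 */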
33 static const bool relaxed_ird_negotiation = true;
34
35 static void siw_cm_llp_state_change(struct sock *s);
36 static void siw_cm_llp_data_ready(struct sock *s);
37 static void siw_cm_llp_write_space(struct sock *s);
38 static void siw_cm_llp_error_report(struct sock *s);
39 static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason,
40 int status);
41
42
43 #ifdef CONFIG_DEBUG_LOCK_ALLOC
44 /*
45 * lockdep can detect false positive circular dependencies
46 * when there are user-space socket API users or in kernel
47 * users switching between a tcp and rdma transport.
48  * Switching between siw and rxe may also cause
49  * problems, since by default sockets are only classified
50  * by family and not by IP protocol. And there might
51  * be different locks used between the application
52  * and the low-level sockets.
53 *
54 * Problems were seen with ksmbd.ko and cifs.ko,
55 * switching transports, use git blame to find
56 * more details.
57 */
58 static struct lock_class_key siw_sk_key[2];
59 static struct lock_class_key siw_slock_key[2];
60 #endif /* CONFIG_DEBUG_LOCK_ALLOC */
61
62 static inline void siw_reclassify_socket(struct socket *sock)
63 {
64 #ifdef CONFIG_DEBUG_LOCK_ALLOC
65 struct sock *sk = sock->sk;
66
67 if (WARN_ON_ONCE(!sock_allow_reclassification(sk)))
68 return;
69
70 switch (sk->sk_family) {
71 case AF_INET:
72 sock_lock_init_class_and_name(sk,
73 "slock-AF_INET-RDMA-SIW",
74 &siw_slock_key[0],
75 "sk_lock-AF_INET-RDMA-SIW",
76 &siw_sk_key[0]);
77 break;
78 case AF_INET6:
79 sock_lock_init_class_and_name(sk,
80 "slock-AF_INET6-RDMA-SIW",
81 &siw_slock_key[1],
82 "sk_lock-AF_INET6-RDMA-SIW",
83 &siw_sk_key[1]);
84 break;
85 default:
86 WARN_ON_ONCE(1);
87 }
88 #endif /* CONFIG_DEBUG_LOCK_ALLOC */
89 }
90
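/*
 * siw_sk_assign_cm_upcalls()
 *
 * Save the socket's original upcalls in the attached CEP and
 * redirect them to the connection manager's handlers.
 */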
91 static void siw_sk_assign_cm_upcalls(struct sock *sk)
92 {
93 struct siw_cep *cep = sk_to_cep(sk);
94
95 write_lock_bh(&sk->sk_callback_lock);
96 cep->sk_state_change = sk->sk_state_change;
97 cep->sk_data_ready = sk->sk_data_ready;
98 cep->sk_write_space = sk->sk_write_space;
99 cep->sk_error_report = sk->sk_error_report;
100
101 sk->sk_state_change = siw_cm_llp_state_change;
102 sk->sk_data_ready = siw_cm_llp_data_ready;
103 sk->sk_write_space = siw_cm_llp_write_space;
104 sk->sk_error_report = siw_cm_llp_error_report;
105 write_unlock_bh(&sk->sk_callback_lock);
106 }
107
108 static void siw_sk_restore_upcalls(struct sock *sk, struct siw_cep *cep)
109 {
110 sk->sk_state_change = cep->sk_state_change;
111 sk->sk_data_ready = cep->sk_data_ready;
112 sk->sk_write_space = cep->sk_write_space;
113 sk->sk_error_report = cep->sk_error_report;
114 sk->sk_user_data = NULL;
115 }
116
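/*
 * siw_qp_socket_assoc()
 *
 * Move socket receive and write-space handling under QP control
 * once the connection enters RDMA mode.
 */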
117 static void siw_qp_socket_assoc(struct siw_cep *cep, struct siw_qp *qp)
118 {
119 struct socket *s = cep->sock;
120 struct sock *sk = s->sk;
121
122 write_lock_bh(&sk->sk_callback_lock);
123
124 qp->attrs.sk = s;
125 sk->sk_data_ready = siw_qp_llp_data_ready;
126 sk->sk_write_space = siw_qp_llp_write_space;
127
128 write_unlock_bh(&sk->sk_callback_lock);
129 }
130
131 static void siw_socket_disassoc(struct socket *s)
132 {
133 struct sock *sk = s->sk;
134 struct siw_cep *cep;
135
136 if (sk) {
137 write_lock_bh(&sk->sk_callback_lock);
138 cep = sk_to_cep(sk);
139 if (cep) {
140 siw_sk_restore_upcalls(sk, cep);
141 siw_cep_put(cep);
142 } else {
143 pr_warn("siw: cannot restore sk callbacks: no ep\n");
144 }
145 write_unlock_bh(&sk->sk_callback_lock);
146 } else {
147 pr_warn("siw: cannot restore sk callbacks: no sk\n");
148 }
149 }
150
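/*
 * siw_rtr_data_ready()
 *
 * Temporary data_ready upcall while waiting for the peer's first
 * RDMA frame in MPA peer-to-peer mode. Signals full connection
 * establishment and hands the socket over to the QP.
 */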
151 static void siw_rtr_data_ready(struct sock *sk)
152 {
153 struct siw_cep *cep;
154 struct siw_qp *qp = NULL;
155 read_descriptor_t rd_desc;
156
157 trace_sk_data_ready(sk);
158
159 read_lock(&sk->sk_callback_lock);
160
161 cep = sk_to_cep(sk);
162 if (!cep) {
163 WARN(1, "No connection endpoint\n");
164 goto out;
165 }
166 qp = sk_to_qp(sk);
167
168 memset(&rd_desc, 0, sizeof(rd_desc));
169 rd_desc.arg.data = qp;
170 rd_desc.count = 1;
171
172 tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data);
173 /*
174 * Check if first frame was successfully processed.
175 * Signal connection full establishment if yes.
176 * Failed data processing would have already scheduled
177 * connection drop.
178 */
179 if (!qp->rx_stream.rx_suspend)
180 siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0);
181 out:
182 read_unlock(&sk->sk_callback_lock);
183 if (qp)
184 siw_qp_socket_assoc(cep, qp);
185 }
186
187 static void siw_sk_assign_rtr_upcalls(struct siw_cep *cep)
188 {
189 struct sock *sk = cep->sock->sk;
190
191 write_lock_bh(&sk->sk_callback_lock);
192 sk->sk_data_ready = siw_rtr_data_ready;
193 sk->sk_write_space = siw_qp_llp_write_space;
194 write_unlock_bh(&sk->sk_callback_lock);
195 }
196
197 static void siw_cep_socket_assoc(struct siw_cep *cep, struct socket *s)
198 {
199 cep->sock = s;
200 siw_cep_get(cep);
201 s->sk->sk_user_data = cep;
202
203 siw_sk_assign_cm_upcalls(s->sk);
204 }
205
206 static struct siw_cep *siw_cep_alloc(struct siw_device *sdev)
207 {
208 struct siw_cep *cep = kzalloc(sizeof(*cep), GFP_KERNEL);
209 unsigned long flags;
210
211 if (!cep)
212 return NULL;
213
214 INIT_LIST_HEAD(&cep->listenq);
215 INIT_LIST_HEAD(&cep->devq);
216 INIT_LIST_HEAD(&cep->work_freelist);
217
218 kref_init(&cep->ref);
219 cep->state = SIW_EPSTATE_IDLE;
220 init_waitqueue_head(&cep->waitq);
221 spin_lock_init(&cep->lock);
222 cep->sdev = sdev;
223 cep->enhanced_rdma_conn_est = false;
224
225 spin_lock_irqsave(&sdev->lock, flags);
226 list_add_tail(&cep->devq, &sdev->cep_list);
227 spin_unlock_irqrestore(&sdev->lock, flags);
228
229 siw_dbg_cep(cep, "new endpoint\n");
230 return cep;
231 }
232
233 static void siw_cm_free_work(struct siw_cep *cep)
234 {
235 struct list_head *w, *tmp;
236 struct siw_cm_work *work;
237
238 list_for_each_safe(w, tmp, &cep->work_freelist) {
239 work = list_entry(w, struct siw_cm_work, list);
240 list_del(&work->list);
241 kfree(work);
242 }
243 }
244
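/*
 * siw_cancel_mpatimer()
 *
 * Cancel a pending MPA timeout. The CEP reference and the work
 * element are only released if the delayed work had not yet
 * started executing.
 */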
245 static void siw_cancel_mpatimer(struct siw_cep *cep)
246 {
247 spin_lock_bh(&cep->lock);
248 if (cep->mpa_timer) {
249 if (cancel_delayed_work(&cep->mpa_timer->work)) {
250 siw_cep_put(cep);
251 kfree(cep->mpa_timer); /* not needed again */
252 }
253 cep->mpa_timer = NULL;
254 }
255 spin_unlock_bh(&cep->lock);
256 }
257
258 static void siw_put_work(struct siw_cm_work *work)
259 {
260 INIT_LIST_HEAD(&work->list);
261 spin_lock_bh(&work->cep->lock);
262 list_add(&work->list, &work->cep->work_freelist);
263 spin_unlock_bh(&work->cep->lock);
264 }
265
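/*
 * siw_cep_set_inuse()
 *
 * Serialize CEP access: wait until the CEP is no longer marked
 * in use by another context, then claim it.
 */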
266 static void siw_cep_set_inuse(struct siw_cep *cep)
267 {
268 unsigned long flags;
269 retry:
270 spin_lock_irqsave(&cep->lock, flags);
271
272 if (cep->in_use) {
273 spin_unlock_irqrestore(&cep->lock, flags);
274 wait_event_interruptible(cep->waitq, !cep->in_use);
275 if (signal_pending(current))
276 flush_signals(current);
277 goto retry;
278 } else {
279 cep->in_use = 1;
280 spin_unlock_irqrestore(&cep->lock, flags);
281 }
282 }
283
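/*
 * siw_cep_set_free()
 *
 * Release the CEP and wake up waiters in siw_cep_set_inuse().
 */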
284 static void siw_cep_set_free(struct siw_cep *cep)
285 {
286 unsigned long flags;
287
288 spin_lock_irqsave(&cep->lock, flags);
289 cep->in_use = 0;
290 spin_unlock_irqrestore(&cep->lock, flags);
291
292 wake_up(&cep->waitq);
293 }
294
295 static void __siw_cep_dealloc(struct kref *ref)
296 {
297 struct siw_cep *cep = container_of(ref, struct siw_cep, ref);
298 struct siw_device *sdev = cep->sdev;
299 unsigned long flags;
300
301 WARN_ON(cep->listen_cep);
302
303 /* kfree(NULL) is safe */
304 kfree(cep->mpa.pdata);
305 spin_lock_bh(&cep->lock);
306 if (!list_empty(&cep->work_freelist))
307 siw_cm_free_work(cep);
308 spin_unlock_bh(&cep->lock);
309
310 spin_lock_irqsave(&sdev->lock, flags);
311 list_del(&cep->devq);
312 spin_unlock_irqrestore(&sdev->lock, flags);
313
314 siw_dbg_cep(cep, "free endpoint\n");
315 kfree(cep);
316 }
317
318 static struct siw_cm_work *siw_get_work(struct siw_cep *cep)
319 {
320 struct siw_cm_work *work = NULL;
321
322 spin_lock_bh(&cep->lock);
323 if (!list_empty(&cep->work_freelist)) {
324 work = list_entry(cep->work_freelist.next, struct siw_cm_work,
325 list);
326 list_del_init(&work->list);
327 }
328 spin_unlock_bh(&cep->lock);
329 return work;
330 }
331
332 static int siw_cm_alloc_work(struct siw_cep *cep, int num)
333 {
334 struct siw_cm_work *work;
335
336 while (num--) {
337 work = kmalloc(sizeof(*work), GFP_KERNEL);
338 if (!work) {
339 if (!(list_empty(&cep->work_freelist)))
340 siw_cm_free_work(cep);
341 return -ENOMEM;
342 }
343 work->cep = cep;
344 INIT_LIST_HEAD(&work->list);
345 list_add(&work->list, &cep->work_freelist);
346 }
347 return 0;
348 }
349
350 /*
351 * siw_cm_upcall()
352 *
353 * Upcall to IWCM to inform about async connection events
354 */
355 static int siw_cm_upcall(struct siw_cep *cep, enum iw_cm_event_type reason,
356 int status)
357 {
358 struct iw_cm_event event;
359 struct iw_cm_id *id;
360
361 memset(&event, 0, sizeof(event));
362 event.status = status;
363 event.event = reason;
364
365 if (reason == IW_CM_EVENT_CONNECT_REQUEST) {
366 event.provider_data = cep;
367 id = cep->listen_cep->cm_id;
368 } else {
369 id = cep->cm_id;
370 }
371 /* Signal IRD and ORD */
372 if (reason == IW_CM_EVENT_ESTABLISHED ||
373 reason == IW_CM_EVENT_CONNECT_REPLY) {
374 /* Signal negotiated IRD/ORD values we will use */
375 event.ird = cep->ird;
376 event.ord = cep->ord;
377 } else if (reason == IW_CM_EVENT_CONNECT_REQUEST) {
378 event.ird = cep->ord;
379 event.ord = cep->ird;
380 }
381 /* Signal private data and address information */
382 if (reason == IW_CM_EVENT_CONNECT_REQUEST ||
383 reason == IW_CM_EVENT_CONNECT_REPLY) {
384 u16 pd_len = be16_to_cpu(cep->mpa.hdr.params.pd_len);
385
386 if (pd_len) {
387 /*
388 * hand over MPA private data
389 */
390 event.private_data_len = pd_len;
391 event.private_data = cep->mpa.pdata;
392
393 /* Hide MPA V2 IRD/ORD control */
394 if (cep->enhanced_rdma_conn_est) {
395 event.private_data_len -=
396 sizeof(struct mpa_v2_data);
397 event.private_data +=
398 sizeof(struct mpa_v2_data);
399 }
400 }
401 getname_local(cep->sock, &event.local_addr);
402 getname_peer(cep->sock, &event.remote_addr);
403 }
404 siw_dbg_cep(cep, "[QP %u]: reason=%d, status=%d\n",
405 cep->qp ? qp_id(cep->qp) : UINT_MAX, reason, status);
406
407 return id->event_handler(id, &event);
408 }
409
410 static void siw_free_cm_id(struct siw_cep *cep)
411 {
412 if (!cep->cm_id)
413 return;
414
415 cep->cm_id->rem_ref(cep->cm_id);
416 cep->cm_id = NULL;
417 }
418
419 static void siw_destroy_cep_sock(struct siw_cep *cep)
420 {
421 if (cep->sock) {
422 siw_socket_disassoc(cep->sock);
423 sock_release(cep->sock);
424 cep->sock = NULL;
425 }
426 }
427
428 /*
429 * siw_qp_cm_drop()
430 *
431 * Drops established LLP connection if present and not already
432 * scheduled for dropping. Called from user context, SQ workqueue
433 * or receive IRQ. Caller signals if socket can be immediately
434 * closed (basically, if not in IRQ).
435 */
436 void siw_qp_cm_drop(struct siw_qp *qp, int schedule)
437 {
438 struct siw_cep *cep = qp->cep;
439
440 qp->rx_stream.rx_suspend = 1;
441 qp->tx_ctx.tx_suspend = 1;
442
443 if (!qp->cep)
444 return;
445
446 if (schedule) {
447 siw_cm_queue_work(cep, SIW_CM_WORK_CLOSE_LLP);
448 } else {
449 siw_cep_set_inuse(cep);
450
451 if (cep->state == SIW_EPSTATE_CLOSED) {
452 siw_dbg_cep(cep, "already closed\n");
453 goto out;
454 }
455 siw_dbg_cep(cep, "immediate close, state %d\n", cep->state);
456
457 siw_send_terminate(qp);
458
459 if (cep->cm_id) {
460 switch (cep->state) {
461 case SIW_EPSTATE_AWAIT_MPAREP:
462 siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
463 -EINVAL);
464 break;
465
466 case SIW_EPSTATE_RDMA_MODE:
467 siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
468 break;
469
470 case SIW_EPSTATE_IDLE:
471 case SIW_EPSTATE_LISTENING:
472 case SIW_EPSTATE_CONNECTING:
473 case SIW_EPSTATE_AWAIT_MPAREQ:
474 case SIW_EPSTATE_RECVD_MPAREQ:
475 case SIW_EPSTATE_CLOSED:
476 default:
477 break;
478 }
479 siw_free_cm_id(cep);
480 siw_cep_put(cep);
481 }
482 cep->state = SIW_EPSTATE_CLOSED;
483
484 siw_destroy_cep_sock(cep);
485 if (cep->qp) {
486 cep->qp = NULL;
487 siw_qp_put(qp);
488 }
489 out:
490 siw_cep_set_free(cep);
491 }
492 }
493
494 void siw_cep_put(struct siw_cep *cep)
495 {
496 WARN_ON(kref_read(&cep->ref) < 1);
497 kref_put(&cep->ref, __siw_cep_dealloc);
498 }
499
500 static void siw_cep_set_free_and_put(struct siw_cep *cep)
501 {
502 siw_cep_set_free(cep);
503 siw_cep_put(cep);
504 }
505
506 void siw_cep_get(struct siw_cep *cep)
507 {
508 kref_get(&cep->ref);
509 }
510
511 /*
512 * Expects params->pd_len in host byte order
513 */
514 static int siw_send_mpareqrep(struct siw_cep *cep, const void *pdata, u8 pd_len)
515 {
516 struct socket *s = cep->sock;
517 struct mpa_rr *rr = &cep->mpa.hdr;
518 struct kvec iov[3];
519 struct msghdr msg;
520 int rv;
521 int iovec_num = 0;
522 int mpa_len;
523
524 memset(&msg, 0, sizeof(msg));
525
526 iov[iovec_num].iov_base = rr;
527 iov[iovec_num].iov_len = sizeof(*rr);
528 mpa_len = sizeof(*rr);
529
530 if (cep->enhanced_rdma_conn_est) {
531 iovec_num++;
532 iov[iovec_num].iov_base = &cep->mpa.v2_ctrl;
533 iov[iovec_num].iov_len = sizeof(cep->mpa.v2_ctrl);
534 mpa_len += sizeof(cep->mpa.v2_ctrl);
535 }
536 if (pd_len) {
537 iovec_num++;
538 iov[iovec_num].iov_base = (char *)pdata;
539 iov[iovec_num].iov_len = pd_len;
540 mpa_len += pd_len;
541 }
542 if (cep->enhanced_rdma_conn_est)
543 pd_len += sizeof(cep->mpa.v2_ctrl);
544
545 rr->params.pd_len = cpu_to_be16(pd_len);
546
547 rv = kernel_sendmsg(s, &msg, iov, iovec_num + 1, mpa_len);
548
549 return rv < 0 ? rv : 0;
550 }
551
552 /*
553 * Receive MPA Request/Reply header.
554 *
555 * Returns 0 if complete MPA Request/Reply header including
556 * eventual private data was received. Returns -EAGAIN if
557 * header was partially received or negative error code otherwise.
558 *
559 * Context: May be called in process context only
560 */
561 static int siw_recv_mpa_rr(struct siw_cep *cep)
562 {
563 struct mpa_rr *hdr = &cep->mpa.hdr;
564 struct socket *s = cep->sock;
565 u16 pd_len;
566 int rcvd, to_rcv;
567
568 if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr)) {
569 rcvd = ksock_recv(s, (char *)hdr + cep->mpa.bytes_rcvd,
570 sizeof(struct mpa_rr) - cep->mpa.bytes_rcvd,
571 0);
572 if (rcvd <= 0)
573 return -ECONNABORTED;
574
575 cep->mpa.bytes_rcvd += rcvd;
576
577 if (cep->mpa.bytes_rcvd < sizeof(struct mpa_rr))
578 return -EAGAIN;
579
580 if (be16_to_cpu(hdr->params.pd_len) > MPA_MAX_PRIVDATA)
581 return -EPROTO;
582 }
583 pd_len = be16_to_cpu(hdr->params.pd_len);
584
585 /*
586 * At least the MPA Request/Reply header (frame not including
587 * private data) has been received.
588 * Receive (or continue receiving) any private data.
589 */
590 to_rcv = pd_len - (cep->mpa.bytes_rcvd - sizeof(struct mpa_rr));
591
592 if (!to_rcv) {
593 /*
594 * We must have hdr->params.pd_len == 0 and thus received a
595 * complete MPA Request/Reply frame.
596 * Check against peer protocol violation.
597 */
598 u32 word;
599
600 rcvd = ksock_recv(s, (char *)&word, sizeof(word), MSG_DONTWAIT);
601 if (rcvd == -EAGAIN)
602 return 0;
603
604 if (rcvd == 0) {
605 siw_dbg_cep(cep, "peer EOF\n");
606 return -EPIPE;
607 }
608 if (rcvd < 0) {
609 siw_dbg_cep(cep, "error: %d\n", rcvd);
610 return rcvd;
611 }
612 siw_dbg_cep(cep, "peer sent extra data: %d\n", rcvd);
613
614 return -EPROTO;
615 }
616
617 /*
618 * At this point, we must have hdr->params.pd_len != 0.
619 * A private data buffer gets allocated if hdr->params.pd_len != 0.
620 */
621 if (!cep->mpa.pdata) {
622 cep->mpa.pdata = kmalloc(pd_len + 4, GFP_KERNEL);
623 if (!cep->mpa.pdata)
624 return -ENOMEM;
625 }
626 rcvd = ksock_recv(
627 s, cep->mpa.pdata + cep->mpa.bytes_rcvd - sizeof(struct mpa_rr),
628 to_rcv + 4, MSG_DONTWAIT);
629
630 if (rcvd < 0)
631 return rcvd;
632
633 if (rcvd > to_rcv)
634 return -EPROTO;
635
636 cep->mpa.bytes_rcvd += rcvd;
637
638 if (to_rcv == rcvd) {
639 siw_dbg_cep(cep, "%d bytes private data received\n", pd_len);
640 return 0;
641 }
642 return -EAGAIN;
643 }
644
645 /*
646 * siw_proc_mpareq()
647 *
648 * Read MPA Request from socket and signal new connection to IWCM
649 * if success. Caller must hold lock on corresponding listening CEP.
650 */
651 static int siw_proc_mpareq(struct siw_cep *cep)
652 {
653 struct mpa_rr *req;
654 int version, rv;
655 u16 pd_len;
656
657 rv = siw_recv_mpa_rr(cep);
658 if (rv)
659 return rv;
660
661 req = &cep->mpa.hdr;
662
663 version = __mpa_rr_revision(req->params.bits);
664 pd_len = be16_to_cpu(req->params.pd_len);
665
666 if (version > MPA_REVISION_2)
667 /* allow for 0, 1, and 2 only */
668 return -EPROTO;
669
670 if (memcmp(req->key, MPA_KEY_REQ, 16))
671 return -EPROTO;
672
673 /* Prepare for sending MPA reply */
674 memcpy(req->key, MPA_KEY_REP, 16);
675
676 if (version == MPA_REVISION_2 &&
677 (req->params.bits & MPA_RR_FLAG_ENHANCED)) {
678 /*
679 * MPA version 2 must signal IRD/ORD values and P2P mode
680 * in private data if header flag MPA_RR_FLAG_ENHANCED
681 * is set.
682 */
683 if (pd_len < sizeof(struct mpa_v2_data))
684 goto reject_conn;
685
686 cep->enhanced_rdma_conn_est = true;
687 }
688
689 /* MPA Markers: currently not supported. Marker TX to be added. */
690 if (req->params.bits & MPA_RR_FLAG_MARKERS)
691 goto reject_conn;
692
693 if (req->params.bits & MPA_RR_FLAG_CRC) {
694 /*
695 * RFC 5044, page 27: CRC MUST be used if peer requests it.
696  * siw specific: reject the connection if the peer requests
697  * CRC, CRC is locally switched off, and the 'mpa_crc_strict'
698  * module parameter is set.
699 */
700 if (!mpa_crc_required && mpa_crc_strict)
701 goto reject_conn;
702
703 /* Enable CRC if requested by module parameter */
704 if (mpa_crc_required)
705 req->params.bits |= MPA_RR_FLAG_CRC;
706 }
707 if (cep->enhanced_rdma_conn_est) {
708 struct mpa_v2_data *v2 = (struct mpa_v2_data *)cep->mpa.pdata;
709
710 /*
711 * Peer requested ORD becomes requested local IRD,
712 * peer requested IRD becomes requested local ORD.
713 * IRD and ORD get limited by global maximum values.
714 */
715 cep->ord = ntohs(v2->ird) & MPA_IRD_ORD_MASK;
716 cep->ord = min(cep->ord, SIW_MAX_ORD_QP);
717 cep->ird = ntohs(v2->ord) & MPA_IRD_ORD_MASK;
718 cep->ird = min(cep->ird, SIW_MAX_IRD_QP);
719
720 /* May get overwritten by locally negotiated values */
721 cep->mpa.v2_ctrl.ird = htons(cep->ird);
722 cep->mpa.v2_ctrl.ord = htons(cep->ord);
723
724 /*
725 * Support for peer sent zero length Write or Read to
726 * let local side enter RTS. Writes are preferred.
727 * Sends would require pre-posting a Receive and are
728 * not supported.
729 * Propose zero length Write if none of Read and Write
730 * is indicated.
731 */
732 if (v2->ird & MPA_V2_PEER_TO_PEER) {
733 cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER;
734
735 if (v2->ord & MPA_V2_RDMA_WRITE_RTR)
736 cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR;
737 else if (v2->ord & MPA_V2_RDMA_READ_RTR)
738 cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_READ_RTR;
739 else
740 cep->mpa.v2_ctrl.ord |= MPA_V2_RDMA_WRITE_RTR;
741 }
742 }
743
744 cep->state = SIW_EPSTATE_RECVD_MPAREQ;
745
746 /* Keep reference until IWCM accepts/rejects */
747 siw_cep_get(cep);
748 rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REQUEST, 0);
749 if (rv)
750 siw_cep_put(cep);
751
752 return rv;
753
754 reject_conn:
755 siw_dbg_cep(cep, "reject: crc %d:%d:%d, m %d:%d\n",
756 req->params.bits & MPA_RR_FLAG_CRC ? 1 : 0,
757 mpa_crc_required, mpa_crc_strict,
758 req->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0);
759
760 req->params.bits &= ~MPA_RR_FLAG_MARKERS;
761 req->params.bits |= MPA_RR_FLAG_REJECT;
762
763 if (!mpa_crc_required && mpa_crc_strict)
764 req->params.bits &= ~MPA_RR_FLAG_CRC;
765
766 if (pd_len)
767 kfree(cep->mpa.pdata);
768
769 cep->mpa.pdata = NULL;
770
771 siw_send_mpareqrep(cep, NULL, 0);
772
773 return -EOPNOTSUPP;
774 }
775
776 static int siw_proc_mpareply(struct siw_cep *cep)
777 {
778 struct siw_qp_attrs qp_attrs;
779 enum siw_qp_attr_mask qp_attr_mask;
780 struct siw_qp *qp = cep->qp;
781 struct mpa_rr *rep;
782 int rv;
783 u16 rep_ord;
784 u16 rep_ird;
785 bool ird_insufficient = false;
786 enum mpa_v2_ctrl mpa_p2p_mode = MPA_V2_RDMA_NO_RTR;
787
788 rv = siw_recv_mpa_rr(cep);
789 if (rv)
790 goto out_err;
791
792 siw_cancel_mpatimer(cep);
793
794 rep = &cep->mpa.hdr;
795
796 if (__mpa_rr_revision(rep->params.bits) > MPA_REVISION_2) {
797 /* allow for 0, 1, and 2 only */
798 rv = -EPROTO;
799 goto out_err;
800 }
801 if (memcmp(rep->key, MPA_KEY_REP, 16)) {
802 siw_init_terminate(qp, TERM_ERROR_LAYER_LLP, LLP_ETYPE_MPA,
803 LLP_ECODE_INVALID_REQ_RESP, 0);
804 siw_send_terminate(qp);
805 rv = -EPROTO;
806 goto out_err;
807 }
808 if (rep->params.bits & MPA_RR_FLAG_REJECT) {
809 siw_dbg_cep(cep, "got mpa reject\n");
810 siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNRESET);
811
812 return -ECONNRESET;
813 }
814 if (try_gso && rep->params.bits & MPA_RR_FLAG_GSO_EXP) {
815 siw_dbg_cep(cep, "peer allows GSO on TX\n");
816 qp->tx_ctx.gso_seg_limit = 0;
817 }
818 if ((rep->params.bits & MPA_RR_FLAG_MARKERS) ||
819 (mpa_crc_required && !(rep->params.bits & MPA_RR_FLAG_CRC)) ||
820 (mpa_crc_strict && !mpa_crc_required &&
821 (rep->params.bits & MPA_RR_FLAG_CRC))) {
822 siw_dbg_cep(cep, "reply unsupp: crc %d:%d:%d, m %d:%d\n",
823 rep->params.bits & MPA_RR_FLAG_CRC ? 1 : 0,
824 mpa_crc_required, mpa_crc_strict,
825 rep->params.bits & MPA_RR_FLAG_MARKERS ? 1 : 0, 0);
826
827 siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -ECONNREFUSED);
828
829 return -EINVAL;
830 }
831 if (cep->enhanced_rdma_conn_est) {
832 struct mpa_v2_data *v2;
833
834 if (__mpa_rr_revision(rep->params.bits) < MPA_REVISION_2 ||
835 !(rep->params.bits & MPA_RR_FLAG_ENHANCED)) {
836 /*
837 * Protocol failure: The responder MUST reply with
838 * MPA version 2 and MUST set MPA_RR_FLAG_ENHANCED.
839 */
840 siw_dbg_cep(cep, "mpa reply error: vers %d, enhcd %d\n",
841 __mpa_rr_revision(rep->params.bits),
842 rep->params.bits & MPA_RR_FLAG_ENHANCED ?
843 1 :
844 0);
845
846 siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
847 -ECONNRESET);
848 return -EINVAL;
849 }
850 v2 = (struct mpa_v2_data *)cep->mpa.pdata;
851 rep_ird = ntohs(v2->ird) & MPA_IRD_ORD_MASK;
852 rep_ord = ntohs(v2->ord) & MPA_IRD_ORD_MASK;
853
854 if (cep->ird < rep_ord &&
855 (relaxed_ird_negotiation == false ||
856 rep_ord > cep->sdev->attrs.max_ird)) {
857 siw_dbg_cep(cep, "ird %d, rep_ord %d, max_ord %d\n",
858 cep->ird, rep_ord,
859 cep->sdev->attrs.max_ord);
860 ird_insufficient = true;
861 }
862 if (cep->ord > rep_ird && relaxed_ird_negotiation == false) {
863 siw_dbg_cep(cep, "ord %d, rep_ird %d\n", cep->ord,
864 rep_ird);
865 ird_insufficient = true;
866 }
867 /*
868 * Always report negotiated peer values to user,
869 * even if IRD/ORD negotiation failed
870 */
871 cep->ird = rep_ord;
872 cep->ord = rep_ird;
873
874 if (ird_insufficient) {
875 /*
876  * If the initiator IRD is insufficient for the
877 * responder ORD, send a TERM.
878 */
879 siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
880 LLP_ETYPE_MPA,
881 LLP_ECODE_INSUFFICIENT_IRD, 0);
882 siw_send_terminate(qp);
883 rv = -ENOMEM;
884 goto out_err;
885 }
886 if (cep->mpa.v2_ctrl_req.ird & MPA_V2_PEER_TO_PEER)
887 mpa_p2p_mode =
888 cep->mpa.v2_ctrl_req.ord &
889 (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR);
890
891 /*
892 * Check if we requested P2P mode, and if peer agrees
893 */
894 if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) {
895 if ((mpa_p2p_mode & v2->ord) == 0) {
896 /*
897 * We requested RTR mode(s), but the peer
898 * did not pick any mode we support.
899 */
900 siw_dbg_cep(cep,
901 "rtr mode: req %2x, got %2x\n",
902 mpa_p2p_mode,
903 v2->ord & (MPA_V2_RDMA_WRITE_RTR |
904 MPA_V2_RDMA_READ_RTR));
905
906 siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
907 LLP_ETYPE_MPA,
908 LLP_ECODE_NO_MATCHING_RTR,
909 0);
910 siw_send_terminate(qp);
911 rv = -EPROTO;
912 goto out_err;
913 }
914 mpa_p2p_mode = v2->ord & (MPA_V2_RDMA_WRITE_RTR |
915 MPA_V2_RDMA_READ_RTR);
916 }
917 }
918 memset(&qp_attrs, 0, sizeof(qp_attrs));
919
920 if (rep->params.bits & MPA_RR_FLAG_CRC)
921 qp_attrs.flags = SIW_MPA_CRC;
922
923 qp_attrs.irq_size = cep->ird;
924 qp_attrs.orq_size = cep->ord;
925 qp_attrs.sk = cep->sock;
926 qp_attrs.state = SIW_QP_STATE_RTS;
927
928 qp_attr_mask = SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE |
929 SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD | SIW_QP_ATTR_MPA;
930
931 /* Move socket RX/TX under QP control */
932 down_write(&qp->state_lock);
933 if (qp->attrs.state > SIW_QP_STATE_RTR) {
934 rv = -EINVAL;
935 up_write(&qp->state_lock);
936 goto out_err;
937 }
938 rv = siw_qp_modify(qp, &qp_attrs, qp_attr_mask);
939
940 siw_qp_socket_assoc(cep, qp);
941
942 up_write(&qp->state_lock);
943
944 /* Send extra RDMA frame to trigger peer RTS if negotiated */
945 if (mpa_p2p_mode != MPA_V2_RDMA_NO_RTR) {
946 rv = siw_qp_mpa_rts(qp, mpa_p2p_mode);
947 if (rv)
948 goto out_err;
949 }
950 if (!rv) {
951 rv = siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, 0);
952 if (!rv)
953 cep->state = SIW_EPSTATE_RDMA_MODE;
954
955 return 0;
956 }
957
958 out_err:
959 if (rv != -EAGAIN)
960 siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY, -EINVAL);
961
962 return rv;
963 }
964
965 /*
966 * siw_accept_newconn - accept an incoming pending connection
967 *
968 */
969 static void siw_accept_newconn(struct siw_cep *cep)
970 {
971 struct socket *s = cep->sock;
972 struct socket *new_s = NULL;
973 struct siw_cep *new_cep = NULL;
974 int rv = 0; /* debug only. should disappear */
975
976 if (cep->state != SIW_EPSTATE_LISTENING)
977 goto error;
978
979 new_cep = siw_cep_alloc(cep->sdev);
980 if (!new_cep)
981 goto error;
982
983 /*
984 * 4: Allocate a sufficient number of work elements
985 * to allow concurrent handling of local + peer close
986 * events, MPA header processing + MPA timeout.
987 */
988 if (siw_cm_alloc_work(new_cep, 4) != 0)
989 goto error;
990
991 /*
992 * Copy saved socket callbacks from listening CEP
993 * and assign new socket with new CEP
994 */
995 new_cep->sk_state_change = cep->sk_state_change;
996 new_cep->sk_data_ready = cep->sk_data_ready;
997 new_cep->sk_write_space = cep->sk_write_space;
998 new_cep->sk_error_report = cep->sk_error_report;
999
1000 rv = kernel_accept(s, &new_s, O_NONBLOCK);
1001 if (rv != 0) {
1002 /*
1003 * Connection already aborted by peer..?
1004 */
1005 siw_dbg_cep(cep, "kernel_accept() error: %d\n", rv);
1006 goto error;
1007 }
1008 new_cep->sock = new_s;
1009 siw_cep_get(new_cep);
1010 new_s->sk->sk_user_data = new_cep;
1011
1012 if (siw_tcp_nagle == false)
1013 tcp_sock_set_nodelay(new_s->sk);
1014 new_cep->state = SIW_EPSTATE_AWAIT_MPAREQ;
1015
1016 rv = siw_cm_queue_work(new_cep, SIW_CM_WORK_MPATIMEOUT);
1017 if (rv)
1018 goto error;
1019 /*
1020 * See siw_proc_mpareq() etc. for the use of new_cep->listen_cep.
1021 */
1022 new_cep->listen_cep = cep;
1023 siw_cep_get(cep);
1024
1025 if (atomic_read(&new_s->sk->sk_rmem_alloc)) {
1026 /*
1027 * MPA REQ already queued
1028 */
1029 siw_dbg_cep(cep, "immediate mpa request\n");
1030
1031 siw_cep_set_inuse(new_cep);
1032 rv = siw_proc_mpareq(new_cep);
1033 if (rv != -EAGAIN) {
1034 siw_cep_put(cep);
1035 new_cep->listen_cep = NULL;
1036 if (rv) {
1037 siw_cancel_mpatimer(new_cep);
1038 siw_cep_set_free(new_cep);
1039 goto error;
1040 }
1041 }
1042 siw_cep_set_free(new_cep);
1043 }
1044 return;
1045
1046 error:
1047 if (new_cep)
1048 siw_cep_put(new_cep);
1049
1050 if (new_s) {
1051 siw_socket_disassoc(new_s);
1052 sock_release(new_s);
1053 new_cep->sock = NULL;
1054 }
1055 siw_dbg_cep(cep, "error %d\n", rv);
1056 }
1057
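/*
 * siw_cm_work_handler()
 *
 * Central CM work processing: handles connection accept, MPA
 * header processing, local and peer close, and MPA timeout.
 * Releases CEP resources once the connection is finally closed.
 */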
1058 static void siw_cm_work_handler(struct work_struct *w)
1059 {
1060 struct siw_cm_work *work;
1061 struct siw_cep *cep;
1062 int release_cep = 0, rv = 0;
1063
1064 work = container_of(w, struct siw_cm_work, work.work);
1065 cep = work->cep;
1066
1067 siw_dbg_cep(cep, "[QP %u]: work type: %d, state %d\n",
1068 cep->qp ? qp_id(cep->qp) : UINT_MAX,
1069 work->type, cep->state);
1070
1071 siw_cep_set_inuse(cep);
1072
1073 switch (work->type) {
1074 case SIW_CM_WORK_ACCEPT:
1075 siw_accept_newconn(cep);
1076 break;
1077
1078 case SIW_CM_WORK_READ_MPAHDR:
1079 if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) {
1080 if (cep->listen_cep) {
1081 siw_cep_set_inuse(cep->listen_cep);
1082
1083 if (cep->listen_cep->state ==
1084 SIW_EPSTATE_LISTENING)
1085 rv = siw_proc_mpareq(cep);
1086 else
1087 rv = -EFAULT;
1088
1089 siw_cep_set_free(cep->listen_cep);
1090
1091 if (rv != -EAGAIN) {
1092 siw_cep_put(cep->listen_cep);
1093 cep->listen_cep = NULL;
1094 if (rv)
1095 siw_cep_put(cep);
1096 }
1097 }
1098 } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) {
1099 rv = siw_proc_mpareply(cep);
1100 } else {
1101 /*
1102 * CEP already moved out of MPA handshake.
1103  * Any connection management is already done.
1104  * Silently ignore the MPA packet.
1105 */
1106 if (cep->state == SIW_EPSTATE_RDMA_MODE) {
1107 cep->sock->sk->sk_data_ready(cep->sock->sk);
1108 siw_dbg_cep(cep, "already in RDMA mode");
1109 } else {
1110 siw_dbg_cep(cep, "out of state: %d\n",
1111 cep->state);
1112 }
1113 }
1114 if (rv && rv != -EAGAIN)
1115 release_cep = 1;
1116 break;
1117
1118 case SIW_CM_WORK_CLOSE_LLP:
1119 /*
1120 * QP scheduled LLP close
1121 */
1122 if (cep->qp)
1123 siw_send_terminate(cep->qp);
1124
1125 if (cep->cm_id)
1126 siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
1127
1128 release_cep = 1;
1129 break;
1130
1131 case SIW_CM_WORK_PEER_CLOSE:
1132 if (cep->cm_id) {
1133 if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) {
1134 /*
1135 * MPA reply not received, but connection drop
1136 */
1137 siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
1138 -ECONNRESET);
1139 } else if (cep->state == SIW_EPSTATE_RDMA_MODE) {
1140 /*
1141 * NOTE: IW_CM_EVENT_DISCONNECT is given just
1142 * to transition IWCM into CLOSING.
1143 */
1144 siw_cm_upcall(cep, IW_CM_EVENT_DISCONNECT, 0);
1145 siw_cm_upcall(cep, IW_CM_EVENT_CLOSE, 0);
1146 }
1147 /*
1148 * for other states there is no connection
1149 * known to the IWCM.
1150 */
1151 } else {
1152 if (cep->state == SIW_EPSTATE_RECVD_MPAREQ) {
1153 /*
1154 * Wait for the ulp/CM to call accept/reject
1155 */
1156 siw_dbg_cep(cep,
1157 "mpa req recvd, wait for ULP\n");
1158 } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) {
1159 /*
1160 * Socket close before MPA request received.
1161 */
1162 if (cep->listen_cep) {
1163 siw_dbg_cep(cep,
1164 "no mpareq: drop listener\n");
1165 siw_cep_put(cep->listen_cep);
1166 cep->listen_cep = NULL;
1167 }
1168 }
1169 }
1170 release_cep = 1;
1171 break;
1172
1173 case SIW_CM_WORK_MPATIMEOUT:
1174 cep->mpa_timer = NULL;
1175
1176 if (cep->state == SIW_EPSTATE_AWAIT_MPAREP) {
1177 /*
1178 * MPA request timed out:
1179 * Hide any partially received private data and signal
1180 * timeout
1181 */
1182 cep->mpa.hdr.params.pd_len = 0;
1183
1184 if (cep->cm_id)
1185 siw_cm_upcall(cep, IW_CM_EVENT_CONNECT_REPLY,
1186 -ETIMEDOUT);
1187 release_cep = 1;
1188
1189 } else if (cep->state == SIW_EPSTATE_AWAIT_MPAREQ) {
1190 /*
1191 * No MPA request received after peer TCP stream setup.
1192 */
1193 if (cep->listen_cep) {
1194 siw_cep_put(cep->listen_cep);
1195 cep->listen_cep = NULL;
1196 }
1197 release_cep = 1;
1198 }
1199 break;
1200
1201 default:
1202 WARN(1, "Undefined CM work type: %d\n", work->type);
1203 }
1204 if (release_cep) {
1205 siw_dbg_cep(cep,
1206 "release: timer=%s, QP[%u]\n",
1207 cep->mpa_timer ? "y" : "n",
1208 cep->qp ? qp_id(cep->qp) : UINT_MAX);
1209
1210 siw_cancel_mpatimer(cep);
1211
1212 cep->state = SIW_EPSTATE_CLOSED;
1213
1214 if (cep->qp) {
1215 struct siw_qp *qp = cep->qp;
1216 /*
1217 * Serialize a potential race with application
1218 * closing the QP and calling siw_qp_cm_drop()
1219 */
1220 siw_qp_get(qp);
1221 siw_cep_set_free(cep);
1222
1223 siw_qp_llp_close(qp);
1224 siw_qp_put(qp);
1225
1226 siw_cep_set_inuse(cep);
1227 cep->qp = NULL;
1228 siw_qp_put(qp);
1229 }
1230 if (cep->sock) {
1231 siw_socket_disassoc(cep->sock);
1232 sock_release(cep->sock);
1233 cep->sock = NULL;
1234 }
1235 if (cep->cm_id) {
1236 siw_free_cm_id(cep);
1237 siw_cep_put(cep);
1238 }
1239 }
1240 siw_cep_set_free(cep);
1241 siw_put_work(work);
1242 siw_cep_put(cep);
1243 }
1244
1245 static struct workqueue_struct *siw_cm_wq;
1246
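/*
 * siw_cm_queue_work()
 *
 * Schedule CM work on the single-threaded CM workqueue. For
 * SIW_CM_WORK_MPATIMEOUT, the work element also serves as the
 * MPA timer and is queued with the corresponding timeout delay.
 */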
1247 int siw_cm_queue_work(struct siw_cep *cep, enum siw_work_type type)
1248 {
1249 struct siw_cm_work *work = siw_get_work(cep);
1250 unsigned long delay = 0;
1251
1252 if (!work) {
1253 siw_dbg_cep(cep, "failed with no work available\n");
1254 return -ENOMEM;
1255 }
1256 work->type = type;
1257 work->cep = cep;
1258
1259 siw_cep_get(cep);
1260
1261 INIT_DELAYED_WORK(&work->work, siw_cm_work_handler);
1262
1263 if (type == SIW_CM_WORK_MPATIMEOUT) {
1264 cep->mpa_timer = work;
1265
1266 if (cep->state == SIW_EPSTATE_AWAIT_MPAREP)
1267 delay = MPAREQ_TIMEOUT;
1268 else
1269 delay = MPAREP_TIMEOUT;
1270 }
1271 siw_dbg_cep(cep, "[QP %u]: work type: %d, timeout %lu\n",
1272 cep->qp ? qp_id(cep->qp) : -1, type, delay);
1273
1274 queue_delayed_work(siw_cm_wq, &work->work, delay);
1275
1276 return 0;
1277 }
1278
1279 static void siw_cm_llp_data_ready(struct sock *sk)
1280 {
1281 struct siw_cep *cep;
1282
1283 trace_sk_data_ready(sk);
1284
1285 read_lock(&sk->sk_callback_lock);
1286
1287 cep = sk_to_cep(sk);
1288 if (!cep)
1289 goto out;
1290
1291 siw_dbg_cep(cep, "cep state: %d, socket state %d\n",
1292 cep->state, sk->sk_state);
1293
1294 if (sk->sk_state != TCP_ESTABLISHED)
1295 goto out;
1296
1297 switch (cep->state) {
1298 case SIW_EPSTATE_RDMA_MODE:
1299 case SIW_EPSTATE_LISTENING:
1300 break;
1301
1302 case SIW_EPSTATE_AWAIT_MPAREQ:
1303 case SIW_EPSTATE_AWAIT_MPAREP:
1304 siw_cm_queue_work(cep, SIW_CM_WORK_READ_MPAHDR);
1305 break;
1306
1307 default:
1308 siw_dbg_cep(cep, "unexpected data, state %d\n", cep->state);
1309 break;
1310 }
1311 out:
1312 read_unlock(&sk->sk_callback_lock);
1313 }
1314
1315 static void siw_cm_llp_write_space(struct sock *sk)
1316 {
1317 struct siw_cep *cep = sk_to_cep(sk);
1318
1319 if (cep)
1320 siw_dbg_cep(cep, "state: %d\n", cep->state);
1321 }
1322
1323 static void siw_cm_llp_error_report(struct sock *sk)
1324 {
1325 struct siw_cep *cep = sk_to_cep(sk);
1326
1327 if (cep) {
1328 siw_dbg_cep(cep, "error %d, socket state: %d, cep state: %d\n",
1329 sk->sk_err, sk->sk_state, cep->state);
1330 cep->sk_error_report(sk);
1331 }
1332 }
1333
1334 static void siw_cm_llp_state_change(struct sock *sk)
1335 {
1336 struct siw_cep *cep;
1337 void (*orig_state_change)(struct sock *s);
1338
1339 read_lock(&sk->sk_callback_lock);
1340
1341 cep = sk_to_cep(sk);
1342 if (!cep) {
1343 /* endpoint already disassociated */
1344 read_unlock(&sk->sk_callback_lock);
1345 return;
1346 }
1347 orig_state_change = cep->sk_state_change;
1348
1349 siw_dbg_cep(cep, "state: %d\n", cep->state);
1350
1351 switch (sk->sk_state) {
1352 case TCP_ESTABLISHED:
1353 /*
1354 * handle accepting socket as special case where only
1355 * new connection is possible
1356 */
1357 siw_cm_queue_work(cep, SIW_CM_WORK_ACCEPT);
1358 break;
1359
1360 case TCP_CLOSE:
1361 case TCP_CLOSE_WAIT:
1362 if (cep->qp)
1363 cep->qp->tx_ctx.tx_suspend = 1;
1364 siw_cm_queue_work(cep, SIW_CM_WORK_PEER_CLOSE);
1365 break;
1366
1367 default:
1368 siw_dbg_cep(cep, "unexpected socket state %d\n", sk->sk_state);
1369 }
1370 read_unlock(&sk->sk_callback_lock);
1371 orig_state_change(sk);
1372 }
1373
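/*
 * kernel_bindconnect()
 *
 * Bind the socket to the local address and connect it to the
 * remote address, enabling address reuse and, if requested,
 * IPV6_V6ONLY.
 */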
1374 static int kernel_bindconnect(struct socket *s, struct sockaddr *laddr,
1375 struct sockaddr *raddr, bool afonly)
1376 {
1377 int rv, flags = 0;
1378 size_t size = laddr->sa_family == AF_INET ?
1379 sizeof(struct sockaddr_in) : sizeof(struct sockaddr_in6);
1380
1381 /*
1382 * Make address available again asap.
1383 */
1384 sock_set_reuseaddr(s->sk);
1385
1386 if (afonly) {
1387 rv = ip6_sock_set_v6only(s->sk);
1388 if (rv)
1389 return rv;
1390 }
1391
1392 rv = s->ops->bind(s, (struct sockaddr_unsized *)laddr, size);
1393 if (rv < 0)
1394 return rv;
1395
1396 rv = s->ops->connect(s, (struct sockaddr_unsized *)raddr, size, flags);
1397
1398 return rv < 0 ? rv : 0;
1399 }
1400
1401 int siw_connect(struct iw_cm_id *id, struct iw_cm_conn_param *params)
1402 {
1403 struct siw_device *sdev = to_siw_dev(id->device);
1404 struct siw_qp *qp;
1405 struct siw_cep *cep = NULL;
1406 struct socket *s = NULL;
1407 struct sockaddr *laddr = (struct sockaddr *)&id->local_addr,
1408 *raddr = (struct sockaddr *)&id->remote_addr;
1409 bool p2p_mode = peer_to_peer, v4 = true;
1410 u16 pd_len = params->private_data_len;
1411 int version = mpa_version, rv;
1412
1413 if (pd_len > MPA_MAX_PRIVDATA)
1414 return -EINVAL;
1415
1416 if (params->ird > sdev->attrs.max_ird ||
1417 params->ord > sdev->attrs.max_ord)
1418 return -ENOMEM;
1419
1420 if (laddr->sa_family == AF_INET6)
1421 v4 = false;
1422 else if (laddr->sa_family != AF_INET)
1423 return -EAFNOSUPPORT;
1424
1425 /*
1426 * Respect any iwarp port mapping: Use mapped remote address
1427 * if valid. Local address must not be mapped, since siw
1428 * uses kernel TCP stack.
1429 */
1430 if ((v4 && to_sockaddr_in(id->remote_addr).sin_port != 0) ||
1431 to_sockaddr_in6(id->remote_addr).sin6_port != 0)
1432 raddr = (struct sockaddr *)&id->m_remote_addr;
1433
1434 qp = siw_qp_id2obj(sdev, params->qpn);
1435 if (!qp) {
1436 WARN(1, "[QP %u] does not exist\n", params->qpn);
1437 rv = -EINVAL;
1438 goto error;
1439 }
1440 siw_dbg_qp(qp, "pd_len %d, laddr %pISp, raddr %pISp\n", pd_len, laddr,
1441 raddr);
1442
1443 rv = sock_create(v4 ? AF_INET : AF_INET6, SOCK_STREAM, IPPROTO_TCP, &s);
1444 if (rv < 0)
1445 goto error;
1446 siw_reclassify_socket(s);
1447
1448 /*
1449 * NOTE: For simplification, connect() is called in blocking
1450 * mode. Might be reconsidered for async connection setup at
1451 * TCP level.
1452 */
1453 rv = kernel_bindconnect(s, laddr, raddr, id->afonly);
1454 if (rv != 0) {
1455 siw_dbg_qp(qp, "kernel_bindconnect: error %d\n", rv);
1456 goto error;
1457 }
1458 if (siw_tcp_nagle == false)
1459 tcp_sock_set_nodelay(s->sk);
1460 cep = siw_cep_alloc(sdev);
1461 if (!cep) {
1462 rv = -ENOMEM;
1463 goto error;
1464 }
1465 siw_cep_set_inuse(cep);
1466
1467 /* Associate QP with CEP */
1468 siw_cep_get(cep);
1469 qp->cep = cep;
1470
1471 /* siw_qp_get(qp) already done by QP lookup */
1472 cep->qp = qp;
1473
1474 id->add_ref(id);
1475 cep->cm_id = id;
1476
1477 /*
1478 * 4: Allocate a sufficient number of work elements
1479 * to allow concurrent handling of local + peer close
1480 * events, MPA header processing + MPA timeout.
1481 */
1482 rv = siw_cm_alloc_work(cep, 4);
1483 if (rv != 0) {
1484 rv = -ENOMEM;
1485 goto error;
1486 }
1487 cep->ird = params->ird;
1488 cep->ord = params->ord;
1489
1490 if (p2p_mode && cep->ord == 0)
1491 cep->ord = 1;
1492
1493 cep->state = SIW_EPSTATE_CONNECTING;
1494
1495 /*
1496 * Associate CEP with socket
1497 */
1498 siw_cep_socket_assoc(cep, s);
1499
1500 cep->state = SIW_EPSTATE_AWAIT_MPAREP;
1501
1502 /*
1503 * Set MPA Request bits: CRC if required, no MPA Markers,
1504 * MPA Rev. according to module parameter 'mpa_version', Key 'Request'.
1505 */
1506 cep->mpa.hdr.params.bits = 0;
1507 if (version > MPA_REVISION_2) {
1508 pr_warn("Setting MPA version to %u\n", MPA_REVISION_2);
1509 version = MPA_REVISION_2;
1510 /* Adjust also module parameter */
1511 mpa_version = MPA_REVISION_2;
1512 }
1513 __mpa_rr_set_revision(&cep->mpa.hdr.params.bits, version);
1514
1515 if (try_gso)
1516 cep->mpa.hdr.params.bits |= MPA_RR_FLAG_GSO_EXP;
1517
1518 if (mpa_crc_required)
1519 cep->mpa.hdr.params.bits |= MPA_RR_FLAG_CRC;
1520
1521 /*
1522 * If MPA version == 2:
1523 * o Include ORD and IRD.
1524 * o Indicate peer-to-peer mode, if required by module
1525 * parameter 'peer_to_peer'.
1526 */
1527 if (version == MPA_REVISION_2) {
1528 cep->enhanced_rdma_conn_est = true;
1529 cep->mpa.hdr.params.bits |= MPA_RR_FLAG_ENHANCED;
1530
1531 cep->mpa.v2_ctrl.ird = htons(cep->ird);
1532 cep->mpa.v2_ctrl.ord = htons(cep->ord);
1533
1534 if (p2p_mode) {
1535 cep->mpa.v2_ctrl.ird |= MPA_V2_PEER_TO_PEER;
1536 cep->mpa.v2_ctrl.ord |= rtr_type;
1537 }
1538 /* Remember own P2P mode requested */
1539 cep->mpa.v2_ctrl_req.ird = cep->mpa.v2_ctrl.ird;
1540 cep->mpa.v2_ctrl_req.ord = cep->mpa.v2_ctrl.ord;
1541 }
1542 memcpy(cep->mpa.hdr.key, MPA_KEY_REQ, 16);
1543
1544 rv = siw_send_mpareqrep(cep, params->private_data, pd_len);
1545 /*
1546 * Reset private data.
1547 */
1548 cep->mpa.hdr.params.pd_len = 0;
1549
1550 if (rv >= 0) {
1551 rv = siw_cm_queue_work(cep, SIW_CM_WORK_MPATIMEOUT);
1552 if (!rv) {
1553 siw_dbg_cep(cep, "[QP %u]: exit\n", qp_id(qp));
1554 siw_cep_set_free(cep);
1555 return 0;
1556 }
1557 }
1558 error:
1559 siw_dbg(id->device, "failed: %d\n", rv);
1560
1561 if (cep) {
1562 siw_socket_disassoc(s);
1563 sock_release(s);
1564 cep->sock = NULL;
1565
1566 cep->qp = NULL;
1567
1568 cep->cm_id = NULL;
1569 id->rem_ref(id);
1570
1571 qp->cep = NULL;
1572 siw_cep_put(cep);
1573
1574 cep->state = SIW_EPSTATE_CLOSED;
1575
1576 siw_cep_set_free_and_put(cep);
1577
1578 } else if (s) {
1579 sock_release(s);
1580 }
1581 if (qp)
1582 siw_qp_put(qp);
1583
1584 return rv;
1585 }
1586
1587 /*
1588 * siw_accept - Let SoftiWARP accept an RDMA connection request
1589 *
1590 * @id: New connection management id to be used for accepted
1591 * connection request
1592 * @params: Connection parameters provided by ULP for accepting connection
1593 *
1594 * Transition QP to RTS state, associate new CM id @id with accepted CEP
1595 * and get prepared for TCP input by installing socket callbacks.
1596 * Then send MPA Reply and generate the "connection established" event.
1597 * Socket callbacks must be installed before sending MPA Reply, because
1598 * the latter may cause a first RDMA message to arrive from the RDMA Initiator
1599 * side very quickly, at which time the socket callbacks must be ready.
1600 */
1601 int siw_accept(struct iw_cm_id *id, struct iw_cm_conn_param *params)
1602 {
1603 struct siw_device *sdev = to_siw_dev(id->device);
1604 struct siw_cep *cep = (struct siw_cep *)id->provider_data;
1605 struct siw_qp *qp;
1606 struct siw_qp_attrs qp_attrs;
1607 int rv = -EINVAL, max_priv_data = MPA_MAX_PRIVDATA;
1608 bool wait_for_peer_rts = false;
1609
1610 siw_cep_set_inuse(cep);
1611 siw_cep_put(cep);
1612
1613 /* Free lingering inbound private data */
1614 if (cep->mpa.hdr.params.pd_len) {
1615 cep->mpa.hdr.params.pd_len = 0;
1616 kfree(cep->mpa.pdata);
1617 cep->mpa.pdata = NULL;
1618 }
1619 siw_cancel_mpatimer(cep);
1620
1621 if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) {
1622 siw_dbg_cep(cep, "out of state\n");
1623 rv = -ECONNRESET;
1624 goto free_cep;
1625 }
1626 qp = siw_qp_id2obj(sdev, params->qpn);
1627 if (!qp) {
1628 WARN(1, "[QP %d] does not exist\n", params->qpn);
1629 goto free_cep;
1630 }
1631 down_write(&qp->state_lock);
1632 if (qp->attrs.state > SIW_QP_STATE_RTR)
1633 goto error_unlock;
1634 siw_dbg_cep(cep, "[QP %d]\n", params->qpn);
1635
1636 if (try_gso && cep->mpa.hdr.params.bits & MPA_RR_FLAG_GSO_EXP) {
1637 siw_dbg_cep(cep, "peer allows GSO on TX\n");
1638 qp->tx_ctx.gso_seg_limit = 0;
1639 }
1640 if (params->ord > sdev->attrs.max_ord ||
1641 params->ird > sdev->attrs.max_ird) {
1642 siw_dbg_cep(
1643 cep,
1644 "[QP %u]: ord %d (max %d), ird %d (max %d)\n",
1645 qp_id(qp), params->ord, sdev->attrs.max_ord,
1646 params->ird, sdev->attrs.max_ird);
1647 goto error_unlock;
1648 }
1649 if (cep->enhanced_rdma_conn_est)
1650 max_priv_data -= sizeof(struct mpa_v2_data);
1651
1652 if (params->private_data_len > max_priv_data) {
1653 siw_dbg_cep(
1654 cep,
1655 "[QP %u]: private data length: %d (max %d)\n",
1656 qp_id(qp), params->private_data_len, max_priv_data);
1657 goto error_unlock;
1658 }
1659 if (cep->enhanced_rdma_conn_est) {
1660 if (params->ord > cep->ord) {
1661 if (relaxed_ird_negotiation) {
1662 params->ord = cep->ord;
1663 } else {
1664 cep->ird = params->ird;
1665 cep->ord = params->ord;
1666 goto error_unlock;
1667 }
1668 }
1669 if (params->ird < cep->ird) {
1670 if (relaxed_ird_negotiation &&
1671 cep->ird <= sdev->attrs.max_ird)
1672 params->ird = cep->ird;
1673 else {
1674 rv = -ENOMEM;
1675 goto error_unlock;
1676 }
1677 }
1678 if (cep->mpa.v2_ctrl.ord &
1679 (MPA_V2_RDMA_WRITE_RTR | MPA_V2_RDMA_READ_RTR))
1680 wait_for_peer_rts = true;
1681 /*
1682 * Signal back negotiated IRD and ORD values
1683 */
1684 cep->mpa.v2_ctrl.ord =
1685 htons(params->ord & MPA_IRD_ORD_MASK) |
1686 (cep->mpa.v2_ctrl.ord & ~MPA_V2_MASK_IRD_ORD);
1687 cep->mpa.v2_ctrl.ird =
1688 htons(params->ird & MPA_IRD_ORD_MASK) |
1689 (cep->mpa.v2_ctrl.ird & ~MPA_V2_MASK_IRD_ORD);
1690 }
1691 cep->ird = params->ird;
1692 cep->ord = params->ord;
1693
1694 cep->cm_id = id;
1695 id->add_ref(id);
1696
1697 memset(&qp_attrs, 0, sizeof(qp_attrs));
1698 qp_attrs.orq_size = cep->ord;
1699 qp_attrs.irq_size = cep->ird;
1700 qp_attrs.sk = cep->sock;
1701 if (cep->mpa.hdr.params.bits & MPA_RR_FLAG_CRC)
1702 qp_attrs.flags = SIW_MPA_CRC;
1703 qp_attrs.state = SIW_QP_STATE_RTS;
1704
1705 siw_dbg_cep(cep, "[QP%u]: moving to rts\n", qp_id(qp));
1706
1707 /* Associate QP with CEP */
1708 siw_cep_get(cep);
1709 qp->cep = cep;
1710
1711 /* siw_qp_get(qp) already done by QP lookup */
1712 cep->qp = qp;
1713
1714 cep->state = SIW_EPSTATE_RDMA_MODE;
1715
1716 /* Move socket RX/TX under QP control */
1717 rv = siw_qp_modify(qp, &qp_attrs,
1718 SIW_QP_ATTR_STATE | SIW_QP_ATTR_LLP_HANDLE |
1719 SIW_QP_ATTR_ORD | SIW_QP_ATTR_IRD |
1720 SIW_QP_ATTR_MPA);
1721 up_write(&qp->state_lock);
1722 if (rv)
1723 goto error;
1724
1725 siw_dbg_cep(cep, "[QP %u]: send mpa reply, %d byte pdata\n",
1726 qp_id(qp), params->private_data_len);
1727
1728 rv = siw_send_mpareqrep(cep, params->private_data,
1729 params->private_data_len);
1730 if (rv != 0)
1731 goto error;
1732
1733 if (wait_for_peer_rts) {
1734 siw_sk_assign_rtr_upcalls(cep);
1735 } else {
1736 siw_qp_socket_assoc(cep, qp);
1737 rv = siw_cm_upcall(cep, IW_CM_EVENT_ESTABLISHED, 0);
1738 if (rv)
1739 goto error;
1740 }
1741 siw_cep_set_free(cep);
1742
1743 return 0;
1744
1745 error_unlock:
1746 up_write(&qp->state_lock);
1747 error:
1748 siw_destroy_cep_sock(cep);
1749
1750 cep->state = SIW_EPSTATE_CLOSED;
1751
1752 siw_free_cm_id(cep);
1753 if (qp->cep) {
1754 siw_cep_put(cep);
1755 qp->cep = NULL;
1756 }
1757 cep->qp = NULL;
1758 siw_qp_put(qp);
1759 free_cep:
1760 siw_cep_set_free_and_put(cep);
1761 return rv;
1762 }
1763
1764 /*
1765 * siw_reject()
1766 *
1767 * Local connection reject case. Send private data back to peer,
1768 * close connection and dereference connection id.
1769 */
1770 int siw_reject(struct iw_cm_id *id, const void *pdata, u8 pd_len)
1771 {
1772 struct siw_cep *cep = (struct siw_cep *)id->provider_data;
1773
1774 siw_cep_set_inuse(cep);
1775 siw_cep_put(cep);
1776
1777 siw_cancel_mpatimer(cep);
1778
1779 if (cep->state != SIW_EPSTATE_RECVD_MPAREQ) {
1780 siw_dbg_cep(cep, "out of state\n");
1781
1782 siw_cep_set_free_and_put(cep); /* put last reference */
1783
1784 return -ECONNRESET;
1785 }
1786 siw_dbg_cep(cep, "cep->state %d, pd_len %d\n", cep->state,
1787 pd_len);
1788
1789 if (__mpa_rr_revision(cep->mpa.hdr.params.bits) >= MPA_REVISION_1) {
1790 cep->mpa.hdr.params.bits |= MPA_RR_FLAG_REJECT; /* reject */
1791 siw_send_mpareqrep(cep, pdata, pd_len);
1792 }
1793 siw_destroy_cep_sock(cep);
1794
1795 cep->state = SIW_EPSTATE_CLOSED;
1796
1797 siw_cep_set_free_and_put(cep);
1798
1799 return 0;
1800 }
1801
1802 /*
1803 * siw_create_listen - Create resources for a listener's IWCM ID @id
1804 *
1805 * Starts listen on the socket address id->local_addr.
1806 *
1807 */
1808 int siw_create_listen(struct iw_cm_id *id, int backlog)
1809 {
1810 struct socket *s;
1811 struct siw_cep *cep = NULL;
1812 struct net_device *ndev = NULL;
1813 struct siw_device *sdev = to_siw_dev(id->device);
1814 int addr_family = id->local_addr.ss_family;
1815 int rv = 0;
1816
1817 if (addr_family != AF_INET && addr_family != AF_INET6)
1818 return -EAFNOSUPPORT;
1819
1820 rv = sock_create(addr_family, SOCK_STREAM, IPPROTO_TCP, &s);
1821 if (rv < 0)
1822 return rv;
1823 siw_reclassify_socket(s);
1824
1825 /*
1826 * Allow binding local port when still in TIME_WAIT from last close.
1827 */
1828 sock_set_reuseaddr(s->sk);
1829
1830 if (addr_family == AF_INET) {
1831 struct sockaddr_in *laddr = &to_sockaddr_in(id->local_addr);
1832
1833 /* For wildcard addr, limit binding to current device only */
1834 if (ipv4_is_zeronet(laddr->sin_addr.s_addr)) {
1835 ndev = ib_device_get_netdev(id->device, SIW_PORT);
1836 if (ndev) {
1837 s->sk->sk_bound_dev_if = ndev->ifindex;
1838 } else {
1839 rv = -ENODEV;
1840 goto error;
1841 }
1842 }
1843 rv = s->ops->bind(s, (struct sockaddr_unsized *)laddr,
1844 sizeof(struct sockaddr_in));
1845 } else {
1846 struct sockaddr_in6 *laddr = &to_sockaddr_in6(id->local_addr);
1847
1848 if (id->afonly) {
1849 rv = ip6_sock_set_v6only(s->sk);
1850 if (rv) {
1851 siw_dbg(id->device,
1852 "ip6_sock_set_v6only error: %d\n", rv);
1853 goto error;
1854 }
1855 }
1856
1857 /* For wildcard addr, limit binding to current device only */
1858 if (ipv6_addr_any(&laddr->sin6_addr)) {
1859 ndev = ib_device_get_netdev(id->device, SIW_PORT);
1860 if (ndev) {
1861 s->sk->sk_bound_dev_if = ndev->ifindex;
1862 } else {
1863 rv = -ENODEV;
1864 goto error;
1865 }
1866 }
1867 rv = s->ops->bind(s, (struct sockaddr_unsized *)laddr,
1868 sizeof(struct sockaddr_in6));
1869 }
1870 if (rv) {
1871 siw_dbg(id->device, "socket bind error: %d\n", rv);
1872 goto error;
1873 }
1874 cep = siw_cep_alloc(sdev);
1875 if (!cep) {
1876 rv = -ENOMEM;
1877 goto error;
1878 }
1879 siw_cep_socket_assoc(cep, s);
1880
1881 rv = siw_cm_alloc_work(cep, backlog);
1882 if (rv) {
1883 siw_dbg(id->device,
1884 "alloc_work error %d, backlog %d\n",
1885 rv, backlog);
1886 goto error;
1887 }
1888 rv = s->ops->listen(s, backlog);
1889 if (rv) {
1890 siw_dbg(id->device, "listen error %d\n", rv);
1891 goto error;
1892 }
1893 cep->cm_id = id;
1894 id->add_ref(id);
1895
1896 /*
1897 * In case of a wildcard rdma_listen on a multi-homed device,
1898 * a listener's IWCM id is associated with more than one listening CEP.
1899 *
1900 * We currently use id->provider_data in three different ways:
1901 *
1902 * o For a listener's IWCM id, id->provider_data points to
1903 * the list_head of the list of listening CEPs.
1904 * Uses: siw_create_listen(), siw_destroy_listen()
1905 *
1906 * o For each accepted passive-side IWCM id, id->provider_data
1907 * points to the CEP itself. This is a consequence of
1908 * - siw_cm_upcall() setting event.provider_data = cep and
1909 * - the IWCM's cm_conn_req_handler() setting provider_data of the
1910 * new passive-side IWCM id equal to event.provider_data
1911 * Uses: siw_accept(), siw_reject()
1912 *
1913 * o For an active-side IWCM id, id->provider_data is not used at all.
1914 *
1915 */
1916 if (!id->provider_data) {
1917 id->provider_data =
1918 kmalloc(sizeof(struct list_head), GFP_KERNEL);
1919 if (!id->provider_data) {
1920 rv = -ENOMEM;
1921 goto error;
1922 }
1923 INIT_LIST_HEAD((struct list_head *)id->provider_data);
1924 }
1925 list_add_tail(&cep->listenq, (struct list_head *)id->provider_data);
1926 cep->state = SIW_EPSTATE_LISTENING;
1927 dev_put(ndev);
1928
1929 siw_dbg(id->device, "Listen at laddr %pISp\n", &id->local_addr);
1930
1931 return 0;
1932
1933 error:
1934 siw_dbg(id->device, "failed: %d\n", rv);
1935
1936 if (cep) {
1937 siw_cep_set_inuse(cep);
1938
1939 siw_free_cm_id(cep);
1940 cep->sock = NULL;
1941 siw_socket_disassoc(s);
1942 cep->state = SIW_EPSTATE_CLOSED;
1943
1944 siw_cep_set_free_and_put(cep);
1945 }
1946 sock_release(s);
1947 dev_put(ndev);
1948
1949 return rv;
1950 }
1951
1952 static void siw_drop_listeners(struct iw_cm_id *id)
1953 {
1954 struct list_head *p, *tmp;
1955
1956 /*
1957 * In case of a wildcard rdma_listen on a multi-homed device,
1958 * a listener's IWCM id is associated with more than one listening CEP.
1959 */
1960 list_for_each_safe(p, tmp, (struct list_head *)id->provider_data) {
1961 struct siw_cep *cep = list_entry(p, struct siw_cep, listenq);
1962
1963 list_del(p);
1964
1965 siw_dbg_cep(cep, "drop cep, state %d\n", cep->state);
1966
1967 siw_cep_set_inuse(cep);
1968
1969 siw_free_cm_id(cep);
1970 if (cep->sock) {
1971 siw_socket_disassoc(cep->sock);
1972 sock_release(cep->sock);
1973 cep->sock = NULL;
1974 }
1975 cep->state = SIW_EPSTATE_CLOSED;
1976 siw_cep_set_free_and_put(cep);
1977 }
1978 }
1979
1980 int siw_destroy_listen(struct iw_cm_id *id)
1981 {
1982 if (!id->provider_data) {
1983 siw_dbg(id->device, "no cep(s)\n");
1984 return 0;
1985 }
1986 siw_drop_listeners(id);
1987 kfree(id->provider_data);
1988 id->provider_data = NULL;
1989
1990 return 0;
1991 }
1992
1993 int siw_cm_init(void)
1994 {
1995 /*
1996  * create_singlethread_workqueue for strict ordering
1997 */
1998 siw_cm_wq = create_singlethread_workqueue("siw_cm_wq");
1999 if (!siw_cm_wq)
2000 return -ENOMEM;
2001
2002 return 0;
2003 }
2004
2005 void siw_cm_exit(void)
2006 {
2007 if (siw_cm_wq)
2008 destroy_workqueue(siw_cm_wq);
2009 }
2010