xref: /linux/net/smc/af_smc.c (revision b0d5c81e872ed21de1e56feb0fa6e4161da7be61)
1 /*
2  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
3  *
4  *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
5  *  applies to SOCK_STREAM sockets only
6  *  offers an alternative communication option for TCP-protocol sockets
7  *  applicable with RoCE-cards only
8  *
9  *  Initial restrictions:
10  *    - non-blocking connect postponed
11  *    - IPv6 support postponed
12  *    - support for alternate links postponed
13  *    - partial support for non-blocking sockets only
14  *    - support for urgent data postponed
15  *
16  *  Copyright IBM Corp. 2016
17  *
18  *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
19  *              based on prototype from Frank Blaschka
20  */
21 
22 #define KMSG_COMPONENT "smc"
23 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
24 
25 #include <linux/module.h>
26 #include <linux/socket.h>
27 #include <linux/inetdevice.h>
28 #include <linux/workqueue.h>
29 #include <linux/in.h>
30 #include <linux/sched/signal.h>
31 
32 #include <net/sock.h>
33 #include <net/tcp.h>
34 #include <net/smc.h>
35 
36 #include "smc.h"
37 #include "smc_clc.h"
38 #include "smc_llc.h"
39 #include "smc_cdc.h"
40 #include "smc_core.h"
41 #include "smc_ib.h"
42 #include "smc_pnet.h"
43 #include "smc_tx.h"
44 #include "smc_rx.h"
45 #include "smc_close.h"
46 
static DEFINE_MUTEX(smc_create_lgr_pending);	/* serialize link group
						 * creation
						 */

/* list of all established link groups; protected by its own spinlock */
struct smc_lgr_list smc_lgr_list = {		/* established link groups */
	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
	.list = LIST_HEAD_INIT(smc_lgr_list.list),
};
55 
56 static void smc_tcp_listen_work(struct work_struct *);
57 
58 static void smc_set_keepalive(struct sock *sk, int val)
59 {
60 	struct smc_sock *smc = smc_sk(sk);
61 
62 	smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
63 }
64 
/* hash table holding all AF_SMC sockets; guarded by its rwlock */
static struct smc_hashinfo smc_v4_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
};
68 
69 int smc_hash_sk(struct sock *sk)
70 {
71 	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
72 	struct hlist_head *head;
73 
74 	head = &h->ht;
75 
76 	write_lock_bh(&h->lock);
77 	sk_add_node(sk, head);
78 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
79 	write_unlock_bh(&h->lock);
80 
81 	return 0;
82 }
83 EXPORT_SYMBOL_GPL(smc_hash_sk);
84 
85 void smc_unhash_sk(struct sock *sk)
86 {
87 	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
88 
89 	write_lock_bh(&h->lock);
90 	if (sk_del_node_init(sk))
91 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
92 	write_unlock_bh(&h->lock);
93 }
94 EXPORT_SYMBOL_GPL(smc_unhash_sk);
95 
/* protocol operations for AF_SMC sockets; hashing and keepalive are
 * handled by the helpers above, the rest falls back to sock defaults
 */
struct proto smc_proto = {
	.name		= "SMC",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v4_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto);
107 
/* release an SMC socket: run the active close protocol (unless in
 * fallback mode), free the internal CLC socket, detach and drop the
 * final reference; returns the result of smc_close_active() or 0
 */
static int smc_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = 0;

	if (!sk)
		goto out;

	smc = smc_sk(sk);
	if (sk->sk_state == SMC_LISTEN)
		/* smc_close_non_accepted() is called and acquires
		 * sock lock for child sockets again
		 */
		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
	else
		lock_sock(sk);

	if (!smc->use_fallback) {
		rc = smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		sock_release(smc->clcsock);
		smc->clcsock = NULL;
	}
	if (smc->use_fallback) {
		sock_put(sk); /* passive closing */
		sk->sk_state = SMC_CLOSED;
		sk->sk_state_change(sk);
	}

	/* detach socket */
	sock_orphan(sk);
	sock->sk = NULL;
	if (!smc->use_fallback && sk->sk_state == SMC_CLOSED)
		smc_conn_free(&smc->conn);
	release_sock(sk);

	sk->sk_prot->unhash(sk);
	sock_put(sk); /* final sock_put */
out:
	return rc;
}
153 
154 static void smc_destruct(struct sock *sk)
155 {
156 	if (sk->sk_state != SMC_CLOSED)
157 		return;
158 	if (!sock_flag(sk, SOCK_DEAD))
159 		return;
160 
161 	sk_refcnt_debug_dec(sk);
162 }
163 
164 static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
165 {
166 	struct smc_sock *smc;
167 	struct sock *sk;
168 
169 	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0);
170 	if (!sk)
171 		return NULL;
172 
173 	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
174 	sk->sk_state = SMC_INIT;
175 	sk->sk_destruct = smc_destruct;
176 	sk->sk_protocol = SMCPROTO_SMC;
177 	smc = smc_sk(sk);
178 	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
179 	INIT_LIST_HEAD(&smc->accept_q);
180 	spin_lock_init(&smc->accept_q_lock);
181 	sk->sk_prot->hash(sk);
182 	sk_refcnt_debug_inc(sk);
183 
184 	return sk;
185 }
186 
187 static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
188 		    int addr_len)
189 {
190 	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
191 	struct sock *sk = sock->sk;
192 	struct smc_sock *smc;
193 	int rc;
194 
195 	smc = smc_sk(sk);
196 
197 	/* replicate tests from inet_bind(), to be safe wrt. future changes */
198 	rc = -EINVAL;
199 	if (addr_len < sizeof(struct sockaddr_in))
200 		goto out;
201 
202 	rc = -EAFNOSUPPORT;
203 	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
204 	if ((addr->sin_family != AF_INET) &&
205 	    ((addr->sin_family != AF_UNSPEC) ||
206 	     (addr->sin_addr.s_addr != htonl(INADDR_ANY))))
207 		goto out;
208 
209 	lock_sock(sk);
210 
211 	/* Check if socket is already active */
212 	rc = -EINVAL;
213 	if (sk->sk_state != SMC_INIT)
214 		goto out_rel;
215 
216 	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
217 	rc = kernel_bind(smc->clcsock, uaddr, addr_len);
218 
219 out_rel:
220 	release_sock(sk);
221 out:
222 	return rc;
223 }
224 
225 static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
226 				   unsigned long mask)
227 {
228 	/* options we don't get control via setsockopt for */
229 	nsk->sk_type = osk->sk_type;
230 	nsk->sk_sndbuf = osk->sk_sndbuf;
231 	nsk->sk_rcvbuf = osk->sk_rcvbuf;
232 	nsk->sk_sndtimeo = osk->sk_sndtimeo;
233 	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
234 	nsk->sk_mark = osk->sk_mark;
235 	nsk->sk_priority = osk->sk_priority;
236 	nsk->sk_rcvlowat = osk->sk_rcvlowat;
237 	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
238 	nsk->sk_err = osk->sk_err;
239 
240 	nsk->sk_flags &= ~mask;
241 	nsk->sk_flags |= osk->sk_flags & mask;
242 }
243 
/* sock flags that are propagated from the smc socket to the clc socket */
#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_BROADCAST) | \
			     (1UL << SOCK_TIMESTAMP) | \
			     (1UL << SOCK_DBG) | \
			     (1UL << SOCK_RCVTSTAMP) | \
			     (1UL << SOCK_RCVTSTAMPNS) | \
			     (1UL << SOCK_LOCALROUTE) | \
			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
			     (1UL << SOCK_RXQ_OVFL) | \
			     (1UL << SOCK_WIFI_STATUS) | \
			     (1UL << SOCK_NOFCS) | \
			     (1UL << SOCK_FILTER_LOCKED))
/* copy only relevant settings and flags of SOL_SOCKET level from smc to
 * clc socket (since smc is not called for these options from net/core)
 */
static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
{
	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
}
265 
/* sock flags that are propagated back from the clc to the smc socket */
#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_DBG))
/* copy only settings and flags relevant for smc from clc to smc socket */
static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
{
	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
}
275 
276 /* determine subnet and mask of internal TCP socket */
277 int smc_netinfo_by_tcpsk(struct socket *clcsock,
278 			 __be32 *subnet, u8 *prefix_len)
279 {
280 	struct dst_entry *dst = sk_dst_get(clcsock->sk);
281 	struct in_device *in_dev;
282 	struct sockaddr_in addr;
283 	int rc = -ENOENT;
284 	int len;
285 
286 	if (!dst) {
287 		rc = -ENOTCONN;
288 		goto out;
289 	}
290 	if (!dst->dev) {
291 		rc = -ENODEV;
292 		goto out_rel;
293 	}
294 
295 	/* get address to which the internal TCP socket is bound */
296 	kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len);
297 	/* analyze IPv4 specific data of net_device belonging to TCP socket */
298 	rcu_read_lock();
299 	in_dev = __in_dev_get_rcu(dst->dev);
300 	for_ifa(in_dev) {
301 		if (!inet_ifa_match(addr.sin_addr.s_addr, ifa))
302 			continue;
303 		*prefix_len = inet_mask_len(ifa->ifa_mask);
304 		*subnet = ifa->ifa_address & ifa->ifa_mask;
305 		rc = 0;
306 		break;
307 	} endfor_ifa(in_dev);
308 	rcu_read_unlock();
309 
310 out_rel:
311 	dst_release(dst);
312 out:
313 	return rc;
314 }
315 
/* client side of the CONFIRM LINK handshake for the first contact of a
 * link group: wait for the server's request, bring the QP to RTS,
 * register the rmb, and send the response; returns 0 or an SMC_CLC_DECL_*
 * reason code
 */
static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid)
{
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];
	/* receive CONFIRM LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		/* timeout or interrupt: expect a CLC DECLINE from the peer */
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	rc = smc_ib_modify_qp_rts(link);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	smc_wr_remember_qp_attr(link);

	rc = smc_wr_reg_send(link,
			     smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK response over RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       gid, SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	return rc;
}
356 
357 static void smc_conn_save_peer_info(struct smc_sock *smc,
358 				    struct smc_clc_msg_accept_confirm *clc)
359 {
360 	smc->conn.peer_conn_idx = clc->conn_idx;
361 	smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
362 	smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size);
363 	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
364 }
365 
366 static void smc_link_save_peer_info(struct smc_link *link,
367 				    struct smc_clc_msg_accept_confirm *clc)
368 {
369 	link->peer_qpn = ntoh24(clc->qpn);
370 	memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
371 	memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
372 	link->peer_psn = ntoh24(clc->psn);
373 	link->peer_mtu = clc->qp_mtu;
374 }
375 
/* remove a link group from the global list so that no new connections
 * are assigned to it; existing connections are not affected
 */
static void smc_lgr_forget(struct smc_link_group *lgr)
{
	spin_lock_bh(&smc_lgr_list.lock);
	/* do not use this link group for new connections */
	if (!list_empty(&lgr->list))
		list_del_init(&lgr->list);
	spin_unlock_bh(&smc_lgr_list.lock);
}
384 
/* setup for RDMA connection of client: perform the CLC handshake
 * (proposal/accept/confirm), create or reuse a link group, and set up
 * the RDMA resources; on any recoverable failure the connection falls
 * back to plain TCP.  Returns a negative error, or the local_contact
 * value (>= 0) on success.
 */
static int smc_connect_rdma(struct smc_sock *smc)
{
	struct sockaddr_in *inaddr = (struct sockaddr_in *)smc->addr;
	struct smc_clc_msg_accept_confirm aclc;
	int local_contact = SMC_FIRST_CONTACT;
	struct smc_ib_device *smcibdev;
	struct smc_link *link;
	u8 srv_first_contact;
	int reason_code = 0;
	int rc = 0;
	u8 ibport;

	sock_hold(&smc->sk); /* sock put in passive closing */

	if (!tcp_sk(smc->clcsock->sk)->syn_smc) {
		/* peer has not signalled SMC-capability */
		smc->use_fallback = true;
		goto out_connected;
	}

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(smc)) {
		reason_code = SMC_CLC_DECL_IPSEC;
		goto decline_rdma;
	}

	/* PNET table look up: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
	if (!smcibdev) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* do inband token exchange */
	reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
	if (reason_code < 0) {
		rc = reason_code;
		goto out_err;
	}
	if (reason_code > 0) /* configuration error */
		goto decline_rdma;
	/* receive SMC Accept CLC message */
	reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
				       SMC_CLC_ACCEPT);
	if (reason_code < 0) {
		rc = reason_code;
		goto out_err;
	}
	if (reason_code > 0)
		goto decline_rdma;

	/* the server tells us whether it treats this as a first contact */
	srv_first_contact = aclc.hdr.flag;
	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(smc, inaddr->sin_addr.s_addr, smcibdev,
					ibport, &aclc.lcl, srv_first_contact);
	if (local_contact < 0) {
		rc = local_contact;
		if (rc == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
		else if (rc == -ENOLINK)
			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
		goto decline_rdma_unlock;
	}
	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	smc_conn_save_peer_info(smc, &aclc);

	/* create send buffer and rmb */
	rc = smc_buf_create(smc);
	if (rc) {
		reason_code = SMC_CLC_DECL_MEM;
		goto decline_rdma_unlock;
	}

	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, &aclc);

	rc = smc_rmb_rtoken_handling(&smc->conn, &aclc);
	if (rc) {
		reason_code = SMC_CLC_DECL_INTERR;
		goto decline_rdma_unlock;
	}

	smc_close_init(smc);
	smc_rx_init(smc);

	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_ib_ready_link(link);
		if (rc) {
			reason_code = SMC_CLC_DECL_INTERR;
			goto decline_rdma_unlock;
		}
	} else {
		struct smc_buf_desc *buf_desc = smc->conn.rmb_desc;

		if (!buf_desc->reused) {
			/* register memory region for new rmb */
			rc = smc_wr_reg_send(link,
					     buf_desc->mr_rx[SMC_SINGLE_LINK]);
			if (rc) {
				reason_code = SMC_CLC_DECL_INTERR;
				goto decline_rdma_unlock;
			}
		}
	}
	smc_rmb_sync_sg_for_device(&smc->conn);

	rc = smc_clc_send_confirm(smc);
	if (rc)
		goto out_err_unlock;

	if (local_contact == SMC_FIRST_CONTACT) {
		/* QP confirmation over RoCE fabric */
		reason_code = smc_clnt_conf_first_link(
			smc, &smcibdev->gid[ibport - 1]);
		if (reason_code < 0) {
			rc = reason_code;
			goto out_err_unlock;
		}
		if (reason_code > 0)
			goto decline_rdma_unlock;
	}

	mutex_unlock(&smc_create_lgr_pending);
	smc_tx_init(smc);

out_connected:
	/* reached on success AND on fallback to TCP */
	smc_copy_sock_settings_to_clc(smc);
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;

	return rc ? rc : local_contact;

decline_rdma_unlock:
	/* failure while holding smc_create_lgr_pending */
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
decline_rdma:
	/* RDMA setup failed, switch back to TCP */
	smc->use_fallback = true;
	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
		rc = smc_clc_send_decline(smc, reason_code);
		if (rc < 0)
			goto out_err;
	}
	goto out_connected;

out_err_unlock:
	/* fatal error while holding smc_create_lgr_pending */
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
out_err:
	if (smc->sk.sk_state == SMC_INIT)
		sock_put(&smc->sk); /* passive closing */
	return rc;
}
547 
548 static int smc_connect(struct socket *sock, struct sockaddr *addr,
549 		       int alen, int flags)
550 {
551 	struct sock *sk = sock->sk;
552 	struct smc_sock *smc;
553 	int rc = -EINVAL;
554 
555 	smc = smc_sk(sk);
556 
557 	/* separate smc parameter checking to be safe */
558 	if (alen < sizeof(addr->sa_family))
559 		goto out_err;
560 	if (addr->sa_family != AF_INET)
561 		goto out_err;
562 	smc->addr = addr;	/* needed for nonblocking connect */
563 
564 	lock_sock(sk);
565 	switch (sk->sk_state) {
566 	default:
567 		goto out;
568 	case SMC_ACTIVE:
569 		rc = -EISCONN;
570 		goto out;
571 	case SMC_INIT:
572 		rc = 0;
573 		break;
574 	}
575 
576 	smc_copy_sock_settings_to_clc(smc);
577 	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
578 	rc = kernel_connect(smc->clcsock, addr, alen, flags);
579 	if (rc)
580 		goto out;
581 
582 	/* setup RDMA connection */
583 	rc = smc_connect_rdma(smc);
584 	if (rc < 0)
585 		goto out;
586 	else
587 		rc = 0; /* success cases including fallback */
588 
589 out:
590 	release_sock(sk);
591 out_err:
592 	return rc;
593 }
594 
/* accept one connection on the listen socket's internal CLC socket and
 * wrap it in a freshly allocated SMC sock; the listen sock lock is
 * dropped around the (possibly blocking) kernel_accept() call.
 * *new_smc is set to the new sock or NULL; returns 0 or a negative error
 */
static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
{
	struct socket *new_clcsock = NULL;
	struct sock *lsk = &lsmc->sk;
	struct sock *new_sk;
	int rc;

	release_sock(lsk);
	new_sk = smc_sock_alloc(sock_net(lsk), NULL);
	if (!new_sk) {
		rc = -ENOMEM;
		lsk->sk_err = ENOMEM;
		*new_smc = NULL;
		lock_sock(lsk);
		goto out;
	}
	*new_smc = smc_sk(new_sk);

	rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
	lock_sock(lsk);
	if  (rc < 0)
		lsk->sk_err = -rc;
	if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
		/* accept failed or listen sock closed meanwhile:
		 * tear the new sock down again
		 */
		if (new_clcsock)
			sock_release(new_clcsock);
		new_sk->sk_state = SMC_CLOSED;
		sock_set_flag(new_sk, SOCK_DEAD);
		new_sk->sk_prot->unhash(new_sk);
		sock_put(new_sk); /* final */
		*new_smc = NULL;
		goto out;
	}

	(*new_smc)->clcsock = new_clcsock;
out:
	return rc;
}
632 
633 /* add a just created sock to the accept queue of the listen sock as
634  * candidate for a following socket accept call from user space
635  */
636 static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
637 {
638 	struct smc_sock *par = smc_sk(parent);
639 
640 	sock_hold(sk); /* sock_put in smc_accept_unlink () */
641 	spin_lock(&par->accept_q_lock);
642 	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
643 	spin_unlock(&par->accept_q_lock);
644 	sk_acceptq_added(parent);
645 }
646 
647 /* remove a socket from the accept queue of its parental listening socket */
648 static void smc_accept_unlink(struct sock *sk)
649 {
650 	struct smc_sock *par = smc_sk(sk)->listen_smc;
651 
652 	spin_lock(&par->accept_q_lock);
653 	list_del_init(&smc_sk(sk)->accept_q);
654 	spin_unlock(&par->accept_q_lock);
655 	sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
656 	sock_put(sk); /* sock_hold in smc_accept_enqueue */
657 }
658 
/* remove a sock from the accept queue to bind it to a new socket created
 * for a socket accept call from user space; closed socks found on the
 * queue are torn down on the fly.  Returns the dequeued sock or NULL.
 */
struct sock *smc_accept_dequeue(struct sock *parent,
				struct socket *new_sock)
{
	struct smc_sock *isk, *n;
	struct sock *new_sk;

	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
		new_sk = (struct sock *)isk;

		smc_accept_unlink(new_sk);
		if (new_sk->sk_state == SMC_CLOSED) {
			/* already closed while waiting: free and skip it */
			if (isk->clcsock) {
				sock_release(isk->clcsock);
				isk->clcsock = NULL;
			}
			new_sk->sk_prot->unhash(new_sk);
			sock_put(new_sk); /* final */
			continue;
		}
		if (new_sock)
			sock_graft(new_sk, new_sock);
		return new_sk;
	}
	return NULL;
}
687 
/* clean up for a created but never accepted sock: run the close
 * protocol, release the internal CLC socket and drop the final ref
 */
void smc_close_non_accepted(struct sock *sk)
{
	struct smc_sock *smc = smc_sk(sk);

	lock_sock(sk);
	if (!sk->sk_lingertime)
		/* wait for peer closing */
		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
	if (!smc->use_fallback) {
		smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		struct socket *tcp;

		/* clear the pointer before releasing, so that no other
		 * path sees a stale clcsock
		 */
		tcp = smc->clcsock;
		smc->clcsock = NULL;
		sock_release(tcp);
	}
	if (smc->use_fallback) {
		sock_put(sk); /* passive closing */
		sk->sk_state = SMC_CLOSED;
	} else {
		if (sk->sk_state == SMC_CLOSED)
			smc_conn_free(&smc->conn);
	}
	release_sock(sk);
	sk->sk_prot->unhash(sk);
	sock_put(sk); /* final sock_put */
}
720 
/* server side of the CONFIRM LINK handshake for the first contact of a
 * link group: register the rmb, send the request, and wait for the
 * client's response; returns an SMC_CLC_DECL_* reason code on failure,
 * otherwise the (non-negative) result of the last operation
 */
static int smc_serv_conf_first_link(struct smc_sock *smc)
{
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];

	rc = smc_wr_reg_send(link,
			     smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK request to client over the RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       &link->smcibdev->gid[link->ibport - 1],
				       SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive CONFIRM LINK response from client over the RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm_resp,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		/* timeout or interrupt: expect a CLC DECLINE from the peer */
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
	}

	return rc;
}
756 
/* setup for RDMA connection of server: worker that runs the server side
 * of the CLC handshake for one accepted connection; on any recoverable
 * failure the connection falls back to plain TCP.  In all cases the new
 * sock ends up on the listen sock's accept queue (possibly with sk_err
 * set) and the listen sock is woken up.
 */
static void smc_listen_work(struct work_struct *work)
{
	struct smc_sock *new_smc = container_of(work, struct smc_sock,
						smc_listen_work);
	struct smc_clc_msg_proposal_prefix *pclc_prfx;
	struct socket *newclcsock = new_smc->clcsock;
	struct smc_sock *lsmc = new_smc->listen_smc;
	struct smc_clc_msg_accept_confirm cclc;
	int local_contact = SMC_REUSE_CONTACT;
	struct sock *newsmcsk = &new_smc->sk;
	struct smc_clc_msg_proposal *pclc;
	struct smc_ib_device *smcibdev;
	struct sockaddr_in peeraddr;
	u8 buf[SMC_CLC_MAX_LEN];
	struct smc_link *link;
	int reason_code = 0;
	int rc = 0, len;
	__be32 subnet;
	u8 prefix_len;
	u8 ibport;

	/* check if peer is smc capable */
	if (!tcp_sk(newclcsock->sk)->syn_smc) {
		new_smc->use_fallback = true;
		goto out_connected;
	}

	/* do inband token exchange -
	 * wait for and receive SMC Proposal CLC message
	 */
	reason_code = smc_clc_wait_msg(new_smc, &buf, sizeof(buf),
				       SMC_CLC_PROPOSAL);
	if (reason_code < 0)
		goto out_err;
	if (reason_code > 0)
		goto decline_rdma;

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(new_smc)) {
		reason_code = SMC_CLC_DECL_IPSEC;
		goto decline_rdma;
	}

	/* PNET table look up: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
	if (!smcibdev) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* determine subnet and mask from internal TCP socket */
	rc = smc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len);
	if (rc) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* the proposed prefix must match our own subnet */
	pclc = (struct smc_clc_msg_proposal *)&buf;
	pclc_prfx = smc_clc_proposal_get_prefix(pclc);
	if (pclc_prfx->outgoing_subnet != subnet ||
	    pclc_prfx->prefix_len != prefix_len) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* get address of the peer connected to the internal TCP socket */
	kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr, &len);

	/* allocate connection / link group */
	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr,
					smcibdev, ibport, &pclc->lcl, 0);
	if (local_contact < 0) {
		rc = local_contact;
		if (rc == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
		goto decline_rdma_unlock;
	}
	link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	/* create send buffer and rmb */
	rc = smc_buf_create(new_smc);
	if (rc) {
		reason_code = SMC_CLC_DECL_MEM;
		goto decline_rdma_unlock;
	}

	smc_close_init(new_smc);
	smc_rx_init(new_smc);

	if (local_contact != SMC_FIRST_CONTACT) {
		struct smc_buf_desc *buf_desc = new_smc->conn.rmb_desc;

		if (!buf_desc->reused) {
			/* register memory region for new rmb */
			rc = smc_wr_reg_send(link,
					     buf_desc->mr_rx[SMC_SINGLE_LINK]);
			if (rc) {
				reason_code = SMC_CLC_DECL_INTERR;
				goto decline_rdma_unlock;
			}
		}
	}
	smc_rmb_sync_sg_for_device(&new_smc->conn);

	rc = smc_clc_send_accept(new_smc, local_contact);
	if (rc)
		goto out_err_unlock;

	/* receive SMC Confirm CLC message */
	reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
				       SMC_CLC_CONFIRM);
	if (reason_code < 0)
		goto out_err_unlock;
	if (reason_code > 0)
		goto decline_rdma_unlock;
	smc_conn_save_peer_info(new_smc, &cclc);
	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, &cclc);

	rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc);
	if (rc) {
		reason_code = SMC_CLC_DECL_INTERR;
		goto decline_rdma_unlock;
	}

	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_ib_ready_link(link);
		if (rc) {
			reason_code = SMC_CLC_DECL_INTERR;
			goto decline_rdma_unlock;
		}
		/* QP confirmation over RoCE fabric */
		reason_code = smc_serv_conf_first_link(new_smc);
		if (reason_code < 0)
			/* peer is not aware of a problem */
			goto out_err_unlock;
		if (reason_code > 0)
			goto decline_rdma_unlock;
	}

	smc_tx_init(new_smc);
	mutex_unlock(&smc_create_lgr_pending);

out_connected:
	/* reached on success AND on fallback to TCP */
	sk_refcnt_debug_inc(newsmcsk);
	if (newsmcsk->sk_state == SMC_INIT)
		newsmcsk->sk_state = SMC_ACTIVE;
enqueue:
	lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
	if (lsmc->sk.sk_state == SMC_LISTEN) {
		smc_accept_enqueue(&lsmc->sk, newsmcsk);
	} else { /* no longer listening */
		smc_close_non_accepted(newsmcsk);
	}
	release_sock(&lsmc->sk);

	/* Wake up accept */
	lsmc->sk.sk_data_ready(&lsmc->sk);
	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
	return;

decline_rdma_unlock:
	/* failure while holding smc_create_lgr_pending */
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(new_smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
decline_rdma:
	/* RDMA setup failed, switch back to TCP */
	smc_conn_free(&new_smc->conn);
	new_smc->use_fallback = true;
	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
		if (smc_clc_send_decline(new_smc, reason_code) < 0)
			goto out_err;
	}
	goto out_connected;

out_err_unlock:
	/* fatal error while holding smc_create_lgr_pending */
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(new_smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
out_err:
	if (newsmcsk->sk_state == SMC_INIT)
		sock_put(&new_smc->sk); /* passive closing */
	newsmcsk->sk_state = SMC_CLOSED;
	smc_conn_free(&new_smc->conn);
	goto enqueue; /* queue new sock with sk_err set */
}
948 
/* worker accepting connections on the internal CLC (TCP) listen socket;
 * for each new connection a smc_listen_work is scheduled to run the
 * server-side handshake.  Runs until the SMC sock leaves SMC_LISTEN.
 */
static void smc_tcp_listen_work(struct work_struct *work)
{
	struct smc_sock *lsmc = container_of(work, struct smc_sock,
					     tcp_listen_work);
	struct sock *lsk = &lsmc->sk;
	struct smc_sock *new_smc;
	int rc = 0;

	lock_sock(lsk);
	while (lsk->sk_state == SMC_LISTEN) {
		rc = smc_clcsock_accept(lsmc, &new_smc);
		if (rc)
			goto out;
		if (!new_smc)
			continue;

		new_smc->listen_smc = lsmc;
		new_smc->use_fallback = false; /* assume rdma capability first*/
		sock_hold(lsk); /* sock_put in smc_listen_work */
		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
		smc_copy_sock_settings_to_smc(new_smc);
		sock_hold(&new_smc->sk); /* sock_put in passive closing */
		if (!schedule_work(&new_smc->smc_listen_work))
			sock_put(&new_smc->sk);
	}

out:
	/* listening ended: release the internal CLC socket */
	if (lsmc->clcsock) {
		sock_release(lsmc->clcsock);
		lsmc->clcsock = NULL;
	}
	release_sock(lsk);
	sock_put(&lsmc->sk); /* sock_hold in smc_listen */
}
983 
/* put the SMC socket into listen state: start listening on the internal
 * CLC (TCP) socket and schedule the tcp_listen worker that accepts
 * incoming connections
 */
static int smc_listen(struct socket *sock, int backlog)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);
	lock_sock(sk);

	rc = -EINVAL;
	if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
		goto out;

	rc = 0;
	if (sk->sk_state == SMC_LISTEN) {
		/* already listening: only adjust the backlog */
		sk->sk_max_ack_backlog = backlog;
		goto out;
	}
	/* some socket options are handled in core, so we could not apply
	 * them to the clc socket -- copy smc socket options to clc socket
	 */
	smc_copy_sock_settings_to_clc(smc);
	tcp_sk(smc->clcsock->sk)->syn_smc = 1;

	rc = kernel_listen(smc->clcsock, backlog);
	if (rc)
		goto out;
	sk->sk_max_ack_backlog = backlog;
	sk->sk_ack_backlog = 0;
	sk->sk_state = SMC_LISTEN;
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	sock_hold(sk); /* sock_hold in tcp_listen_worker */
	if (!schedule_work(&smc->tcp_listen_work))
		sock_put(sk);

out:
	release_sock(sk);
	return rc;
}
1023 
/* accept a connection on an SMC listen socket: wait (subject to the
 * receive timeout and O_NONBLOCK) until the tcp_listen worker has
 * queued a new sock, then graft it onto new_sock
 */
static int smc_accept(struct socket *sock, struct socket *new_sock,
		      int flags, bool kern)
{
	struct sock *sk = sock->sk, *nsk;
	DECLARE_WAITQUEUE(wait, current);
	struct smc_sock *lsmc;
	long timeo;
	int rc = 0;

	lsmc = smc_sk(sk);
	sock_hold(sk); /* sock_put below */
	lock_sock(sk);

	if (lsmc->sk.sk_state != SMC_LISTEN) {
		rc = -EINVAL;
		goto out;
	}

	/* Wait for an incoming connection */
	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
	add_wait_queue_exclusive(sk_sleep(sk), &wait);
	while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (!timeo) {
			/* non-blocking or timeout elapsed */
			rc = -EAGAIN;
			break;
		}
		release_sock(sk);
		timeo = schedule_timeout(timeo);
		/* wakeup by sk_data_ready in smc_listen_work() */
		sched_annotate_sleep();
		lock_sock(sk);
		if (signal_pending(current)) {
			rc = sock_intr_errno(timeo);
			break;
		}
	}
	set_current_state(TASK_RUNNING);
	remove_wait_queue(sk_sleep(sk), &wait);

	if (!rc)
		rc = sock_error(nsk);

out:
	release_sock(sk);
	sock_put(sk); /* sock_hold above */
	return rc;
}
1072 
1073 static int smc_getname(struct socket *sock, struct sockaddr *addr,
1074 		       int *len, int peer)
1075 {
1076 	struct smc_sock *smc;
1077 
1078 	if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
1079 	    (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
1080 		return -ENOTCONN;
1081 
1082 	smc = smc_sk(sock->sk);
1083 
1084 	return smc->clcsock->ops->getname(smc->clcsock, addr, len, peer);
1085 }
1086 
1087 static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
1088 {
1089 	struct sock *sk = sock->sk;
1090 	struct smc_sock *smc;
1091 	int rc = -EPIPE;
1092 
1093 	smc = smc_sk(sk);
1094 	lock_sock(sk);
1095 	if ((sk->sk_state != SMC_ACTIVE) &&
1096 	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1097 	    (sk->sk_state != SMC_INIT))
1098 		goto out;
1099 	if (smc->use_fallback)
1100 		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
1101 	else
1102 		rc = smc_tx_sendmsg(smc, msg, len);
1103 out:
1104 	release_sock(sk);
1105 	return rc;
1106 }
1107 
1108 static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
1109 		       int flags)
1110 {
1111 	struct sock *sk = sock->sk;
1112 	struct smc_sock *smc;
1113 	int rc = -ENOTCONN;
1114 
1115 	smc = smc_sk(sk);
1116 	lock_sock(sk);
1117 	if ((sk->sk_state == SMC_INIT) ||
1118 	    (sk->sk_state == SMC_LISTEN) ||
1119 	    (sk->sk_state == SMC_CLOSED))
1120 		goto out;
1121 
1122 	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1123 		rc = 0;
1124 		goto out;
1125 	}
1126 
1127 	if (smc->use_fallback)
1128 		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
1129 	else
1130 		rc = smc_rx_recvmsg(smc, msg, len, flags);
1131 
1132 out:
1133 	release_sock(sk);
1134 	return rc;
1135 }
1136 
1137 static __poll_t smc_accept_poll(struct sock *parent)
1138 {
1139 	struct smc_sock *isk = smc_sk(parent);
1140 	__poll_t mask = 0;
1141 
1142 	spin_lock(&isk->accept_q_lock);
1143 	if (!list_empty(&isk->accept_q))
1144 		mask = EPOLLIN | EPOLLRDNORM;
1145 	spin_unlock(&isk->accept_q_lock);
1146 
1147 	return mask;
1148 }
1149 
/* smc_poll() - poll/select/epoll support
 *
 * While the socket is still in SMC_INIT or has fallen back to plain TCP,
 * polling is delegated to the internal CLC socket; this path also picks
 * up completion of a non-blocking connect (EPOLLOUT on the CLC socket)
 * and runs the SMC handshake via smc_connect_rdma() at that point.
 * Otherwise the mask is computed from the SMC connection state.
 */
static __poll_t smc_poll(struct file *file, struct socket *sock,
			     poll_table *wait)
{
	struct sock *sk = sock->sk;
	__poll_t mask = 0;
	struct smc_sock *smc;
	int rc;

	if (!sk)
		return EPOLLNVAL;

	smc = smc_sk(sock->sk);
	sock_hold(sk);
	lock_sock(sk);
	if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
		/* delegate to CLC child sock */
		release_sock(sk);
		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
		/* if non-blocking connect finished ... */
		lock_sock(sk);
		if ((sk->sk_state == SMC_INIT) && (mask & EPOLLOUT)) {
			sk->sk_err = smc->clcsock->sk->sk_err;
			if (sk->sk_err) {
				mask |= EPOLLERR;
			} else {
				/* TCP connect done: perform the SMC
				 * handshake now
				 */
				rc = smc_connect_rdma(smc);
				if (rc < 0)
					mask |= EPOLLERR;
				/* success cases including fallback */
				mask |= EPOLLOUT | EPOLLWRNORM;
			}
		}
	} else {
		if (sk->sk_state != SMC_CLOSED) {
			/* lock must not be held while registering the
			 * waitqueue
			 */
			release_sock(sk);
			sock_poll_wait(file, sk_sleep(sk), wait);
			lock_sock(sk);
		}
		if (sk->sk_err)
			mask |= EPOLLERR;
		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
		    (sk->sk_state == SMC_CLOSED))
			mask |= EPOLLHUP;
		if (sk->sk_state == SMC_LISTEN) {
			/* woken up by sk_data_ready in smc_listen_work() */
			mask = smc_accept_poll(sk);
		} else {
			if (atomic_read(&smc->conn.sndbuf_space) ||
			    sk->sk_shutdown & SEND_SHUTDOWN) {
				mask |= EPOLLOUT | EPOLLWRNORM;
			} else {
				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
			}
			if (atomic_read(&smc->conn.bytes_to_rcv))
				mask |= EPOLLIN | EPOLLRDNORM;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
			if (sk->sk_state == SMC_APPCLOSEWAIT1)
				mask |= EPOLLIN;
		}

	}
	release_sock(sk);
	sock_put(sk);

	return mask;
}
1218 
/* smc_shutdown() - shut down part or all of a connection
 * @how: SHUT_RD, SHUT_WR or SHUT_RDWR
 *
 * Fallback sockets forward the shutdown to the CLC (TCP) socket and
 * mirror its sk_shutdown state.  Native SMC sockets run the SMC close
 * protocol first and then shut down the CLC socket as well; a failure
 * of the close protocol (rc) takes precedence over a failure of the
 * CLC shutdown (rc1) in the return value.
 */
static int smc_shutdown(struct socket *sock, int how)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;
	int rc1 = 0;

	smc = smc_sk(sk);

	if ((how < SHUT_RD) || (how > SHUT_RDWR))
		return rc;

	lock_sock(sk);

	/* shutdown only makes sense in states where a connection (or a
	 * listen) exists
	 */
	rc = -ENOTCONN;
	if ((sk->sk_state != SMC_LISTEN) &&
	    (sk->sk_state != SMC_ACTIVE) &&
	    (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
	    (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT2) &&
	    (sk->sk_state != SMC_APPFINCLOSEWAIT))
		goto out;
	if (smc->use_fallback) {
		rc = kernel_sock_shutdown(smc->clcsock, how);
		sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
		if (sk->sk_shutdown == SHUTDOWN_MASK)
			sk->sk_state = SMC_CLOSED;
		goto out;
	}
	switch (how) {
	case SHUT_RDWR:		/* shutdown in both directions */
		rc = smc_close_active(smc);
		break;
	case SHUT_WR:
		rc = smc_close_shutdown_write(smc);
		break;
	case SHUT_RD:
		if (sk->sk_state == SMC_LISTEN)
			rc = smc_close_active(smc);
		else
			rc = 0;
			/* nothing more to do because peer is not involved */
		break;
	}
	rc1 = kernel_sock_shutdown(smc->clcsock, how);
	/* map sock_shutdown_cmd constants to sk_shutdown value range */
	sk->sk_shutdown |= how + 1;

out:
	release_sock(sk);
	return rc ? rc : rc1;
}
1272 
1273 static int smc_setsockopt(struct socket *sock, int level, int optname,
1274 			  char __user *optval, unsigned int optlen)
1275 {
1276 	struct sock *sk = sock->sk;
1277 	struct smc_sock *smc;
1278 
1279 	smc = smc_sk(sk);
1280 
1281 	/* generic setsockopts reaching us here always apply to the
1282 	 * CLC socket
1283 	 */
1284 	return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
1285 					     optval, optlen);
1286 }
1287 
1288 static int smc_getsockopt(struct socket *sock, int level, int optname,
1289 			  char __user *optval, int __user *optlen)
1290 {
1291 	struct smc_sock *smc;
1292 
1293 	smc = smc_sk(sock->sk);
1294 	/* socket options apply to the CLC socket */
1295 	return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
1296 					     optval, optlen);
1297 }
1298 
1299 static int smc_ioctl(struct socket *sock, unsigned int cmd,
1300 		     unsigned long arg)
1301 {
1302 	struct smc_sock *smc;
1303 
1304 	smc = smc_sk(sock->sk);
1305 	if (smc->use_fallback)
1306 		return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
1307 	else
1308 		return sock_no_ioctl(sock, cmd, arg);
1309 }
1310 
1311 static ssize_t smc_sendpage(struct socket *sock, struct page *page,
1312 			    int offset, size_t size, int flags)
1313 {
1314 	struct sock *sk = sock->sk;
1315 	struct smc_sock *smc;
1316 	int rc = -EPIPE;
1317 
1318 	smc = smc_sk(sk);
1319 	lock_sock(sk);
1320 	if (sk->sk_state != SMC_ACTIVE)
1321 		goto out;
1322 	if (smc->use_fallback)
1323 		rc = kernel_sendpage(smc->clcsock, page, offset,
1324 				     size, flags);
1325 	else
1326 		rc = sock_no_sendpage(sock, page, offset, size, flags);
1327 
1328 out:
1329 	release_sock(sk);
1330 	return rc;
1331 }
1332 
1333 static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
1334 			       struct pipe_inode_info *pipe, size_t len,
1335 				    unsigned int flags)
1336 {
1337 	struct sock *sk = sock->sk;
1338 	struct smc_sock *smc;
1339 	int rc = -ENOTCONN;
1340 
1341 	smc = smc_sk(sk);
1342 	lock_sock(sk);
1343 	if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
1344 		goto out;
1345 	if (smc->use_fallback) {
1346 		rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
1347 						    pipe, len, flags);
1348 	} else {
1349 		rc = -EOPNOTSUPP;
1350 	}
1351 out:
1352 	release_sock(sk);
1353 	return rc;
1354 }
1355 
/* proto_ops for AF_SMC sockets; the set of operations must mirror TCP's
 * so applications can use AF_SMC as a drop-in for AF_INET SOCK_STREAM
 */
static const struct proto_ops smc_sock_ops = {
	.family		= PF_SMC,
	.owner		= THIS_MODULE,
	.release	= smc_release,
	.bind		= smc_bind,
	.connect	= smc_connect,
	.socketpair	= sock_no_socketpair,
	.accept		= smc_accept,
	.getname	= smc_getname,
	.poll		= smc_poll,
	.ioctl		= smc_ioctl,
	.listen		= smc_listen,
	.shutdown	= smc_shutdown,
	.setsockopt	= smc_setsockopt,
	.getsockopt	= smc_getsockopt,
	.sendmsg	= smc_sendmsg,
	.recvmsg	= smc_recvmsg,
	.mmap		= sock_no_mmap,
	.sendpage	= smc_sendpage,
	.splice_read	= smc_splice_read,
};
1378 
1379 static int smc_create(struct net *net, struct socket *sock, int protocol,
1380 		      int kern)
1381 {
1382 	struct smc_sock *smc;
1383 	struct sock *sk;
1384 	int rc;
1385 
1386 	rc = -ESOCKTNOSUPPORT;
1387 	if (sock->type != SOCK_STREAM)
1388 		goto out;
1389 
1390 	rc = -EPROTONOSUPPORT;
1391 	if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP))
1392 		goto out;
1393 
1394 	rc = -ENOBUFS;
1395 	sock->ops = &smc_sock_ops;
1396 	sk = smc_sock_alloc(net, sock);
1397 	if (!sk)
1398 		goto out;
1399 
1400 	/* create internal TCP socket for CLC handshake and fallback */
1401 	smc = smc_sk(sk);
1402 	smc->use_fallback = false; /* assume rdma capability first */
1403 	rc = sock_create_kern(net, PF_INET, SOCK_STREAM,
1404 			      IPPROTO_TCP, &smc->clcsock);
1405 	if (rc) {
1406 		sk_common_release(sk);
1407 		goto out;
1408 	}
1409 	smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
1410 	smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
1411 
1412 out:
1413 	return rc;
1414 }
1415 
/* protocol family handler: entry point for socket(AF_SMC, ...) */
static const struct net_proto_family smc_sock_family_ops = {
	.family	= PF_SMC,
	.owner	= THIS_MODULE,
	.create	= smc_create,
};
1421 
1422 static int __init smc_init(void)
1423 {
1424 	int rc;
1425 
1426 	rc = smc_pnet_init();
1427 	if (rc)
1428 		return rc;
1429 
1430 	rc = smc_llc_init();
1431 	if (rc) {
1432 		pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
1433 		goto out_pnet;
1434 	}
1435 
1436 	rc = smc_cdc_init();
1437 	if (rc) {
1438 		pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
1439 		goto out_pnet;
1440 	}
1441 
1442 	rc = proto_register(&smc_proto, 1);
1443 	if (rc) {
1444 		pr_err("%s: proto_register fails with %d\n", __func__, rc);
1445 		goto out_pnet;
1446 	}
1447 
1448 	rc = sock_register(&smc_sock_family_ops);
1449 	if (rc) {
1450 		pr_err("%s: sock_register fails with %d\n", __func__, rc);
1451 		goto out_proto;
1452 	}
1453 	INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
1454 
1455 	rc = smc_ib_register_client();
1456 	if (rc) {
1457 		pr_err("%s: ib_register fails with %d\n", __func__, rc);
1458 		goto out_sock;
1459 	}
1460 
1461 	static_branch_enable(&tcp_have_smc);
1462 	return 0;
1463 
1464 out_sock:
1465 	sock_unregister(PF_SMC);
1466 out_proto:
1467 	proto_unregister(&smc_proto);
1468 out_pnet:
1469 	smc_pnet_exit();
1470 	return rc;
1471 }
1472 
/* smc_exit() - module cleanup
 *
 * Moves all remaining link groups off the global list under the lock,
 * then frees them without holding the lock, and finally unregisters
 * everything smc_init() registered.
 */
static void __exit smc_exit(void)
{
	struct smc_link_group *lgr, *lg;
	LIST_HEAD(lgr_freeing_list);

	spin_lock_bh(&smc_lgr_list.lock);
	if (!list_empty(&smc_lgr_list.list))
		list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
	spin_unlock_bh(&smc_lgr_list.lock);
	list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
		list_del_init(&lgr->list);
		smc_lgr_free(lgr); /* free link group */
	}
	static_branch_disable(&tcp_have_smc);
	smc_ib_unregister_client();
	sock_unregister(PF_SMC);
	proto_unregister(&smc_proto);
	smc_pnet_exit();
}
1492 
1493 module_init(smc_init);
1494 module_exit(smc_exit);
1495 
1496 MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
1497 MODULE_DESCRIPTION("smc socket address family");
1498 MODULE_LICENSE("GPL");
1499 MODULE_ALIAS_NETPROTO(PF_SMC);
1500