xref: /linux/net/smc/af_smc.c (revision e3b9f1e81de2083f359bacd2a94bf1c024f2ede0)
1 /*
2  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
3  *
4  *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
5  *  applies to SOCK_STREAM sockets only
6  *  offers an alternative communication option for TCP-protocol sockets
7  *  applicable with RoCE-cards only
8  *
9  *  Initial restrictions:
10  *    - non-blocking connect postponed
11  *    - IPv6 support postponed
12  *    - support for alternate links postponed
13  *    - partial support for non-blocking sockets only
14  *    - support for urgent data postponed
15  *
16  *  Copyright IBM Corp. 2016
17  *
18  *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
19  *              based on prototype from Frank Blaschka
20  */
21 
22 #define KMSG_COMPONENT "smc"
23 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
24 
25 #include <linux/module.h>
26 #include <linux/socket.h>
27 #include <linux/inetdevice.h>
28 #include <linux/workqueue.h>
29 #include <linux/in.h>
30 #include <linux/sched/signal.h>
31 
32 #include <net/sock.h>
33 #include <net/tcp.h>
34 #include <net/smc.h>
35 
36 #include "smc.h"
37 #include "smc_clc.h"
38 #include "smc_llc.h"
39 #include "smc_cdc.h"
40 #include "smc_core.h"
41 #include "smc_ib.h"
42 #include "smc_pnet.h"
43 #include "smc_tx.h"
44 #include "smc_rx.h"
45 #include "smc_close.h"
46 
47 static DEFINE_MUTEX(smc_create_lgr_pending);	/* serialize link group
48 						 * creation
49 						 */
50 
51 struct smc_lgr_list smc_lgr_list = {		/* established link groups */
52 	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
53 	.list = LIST_HEAD_INIT(smc_lgr_list.list),
54 };
55 
56 static void smc_tcp_listen_work(struct work_struct *);
57 
58 static void smc_set_keepalive(struct sock *sk, int val)
59 {
60 	struct smc_sock *smc = smc_sk(sk);
61 
62 	smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
63 }
64 
65 static struct smc_hashinfo smc_v4_hashinfo = {
66 	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
67 };
68 
69 int smc_hash_sk(struct sock *sk)
70 {
71 	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
72 	struct hlist_head *head;
73 
74 	head = &h->ht;
75 
76 	write_lock_bh(&h->lock);
77 	sk_add_node(sk, head);
78 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
79 	write_unlock_bh(&h->lock);
80 
81 	return 0;
82 }
83 EXPORT_SYMBOL_GPL(smc_hash_sk);
84 
85 void smc_unhash_sk(struct sock *sk)
86 {
87 	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
88 
89 	write_lock_bh(&h->lock);
90 	if (sk_del_node_init(sk))
91 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
92 	write_unlock_bh(&h->lock);
93 }
94 EXPORT_SYMBOL_GPL(smc_unhash_sk);
95 
96 struct proto smc_proto = {
97 	.name		= "SMC",
98 	.owner		= THIS_MODULE,
99 	.keepalive	= smc_set_keepalive,
100 	.hash		= smc_hash_sk,
101 	.unhash		= smc_unhash_sk,
102 	.obj_size	= sizeof(struct smc_sock),
103 	.h.smc_hash	= &smc_v4_hashinfo,
104 	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
105 };
106 EXPORT_SYMBOL_GPL(smc_proto);
107 
108 static int smc_release(struct socket *sock)
109 {
110 	struct sock *sk = sock->sk;
111 	struct smc_sock *smc;
112 	int rc = 0;
113 
114 	if (!sk)
115 		goto out;
116 
117 	smc = smc_sk(sk);
118 	if (sk->sk_state == SMC_LISTEN)
119 		/* smc_close_non_accepted() is called and acquires
120 		 * sock lock for child sockets again
121 		 */
122 		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
123 	else
124 		lock_sock(sk);
125 
126 	if (!smc->use_fallback) {
127 		rc = smc_close_active(smc);
128 		sock_set_flag(sk, SOCK_DEAD);
129 		sk->sk_shutdown |= SHUTDOWN_MASK;
130 	}
131 	if (smc->clcsock) {
132 		sock_release(smc->clcsock);
133 		smc->clcsock = NULL;
134 	}
135 	if (smc->use_fallback) {
136 		sock_put(sk); /* passive closing */
137 		sk->sk_state = SMC_CLOSED;
138 		sk->sk_state_change(sk);
139 	}
140 
141 	/* detach socket */
142 	sock_orphan(sk);
143 	sock->sk = NULL;
144 	if (!smc->use_fallback && sk->sk_state == SMC_CLOSED)
145 		smc_conn_free(&smc->conn);
146 	release_sock(sk);
147 
148 	sk->sk_prot->unhash(sk);
149 	sock_put(sk); /* final sock_put */
150 out:
151 	return rc;
152 }
153 
154 static void smc_destruct(struct sock *sk)
155 {
156 	if (sk->sk_state != SMC_CLOSED)
157 		return;
158 	if (!sock_flag(sk, SOCK_DEAD))
159 		return;
160 
161 	sk_refcnt_debug_dec(sk);
162 }
163 
164 static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
165 {
166 	struct smc_sock *smc;
167 	struct sock *sk;
168 
169 	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0);
170 	if (!sk)
171 		return NULL;
172 
173 	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
174 	sk->sk_state = SMC_INIT;
175 	sk->sk_destruct = smc_destruct;
176 	sk->sk_protocol = SMCPROTO_SMC;
177 	smc = smc_sk(sk);
178 	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
179 	INIT_LIST_HEAD(&smc->accept_q);
180 	spin_lock_init(&smc->accept_q_lock);
181 	sk->sk_prot->hash(sk);
182 	sk_refcnt_debug_inc(sk);
183 
184 	return sk;
185 }
186 
187 static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
188 		    int addr_len)
189 {
190 	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
191 	struct sock *sk = sock->sk;
192 	struct smc_sock *smc;
193 	int rc;
194 
195 	smc = smc_sk(sk);
196 
197 	/* replicate tests from inet_bind(), to be safe wrt. future changes */
198 	rc = -EINVAL;
199 	if (addr_len < sizeof(struct sockaddr_in))
200 		goto out;
201 
202 	rc = -EAFNOSUPPORT;
203 	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
204 	if ((addr->sin_family != AF_INET) &&
205 	    ((addr->sin_family != AF_UNSPEC) ||
206 	     (addr->sin_addr.s_addr != htonl(INADDR_ANY))))
207 		goto out;
208 
209 	lock_sock(sk);
210 
211 	/* Check if socket is already active */
212 	rc = -EINVAL;
213 	if (sk->sk_state != SMC_INIT)
214 		goto out_rel;
215 
216 	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
217 	rc = kernel_bind(smc->clcsock, uaddr, addr_len);
218 
219 out_rel:
220 	release_sock(sk);
221 out:
222 	return rc;
223 }
224 
225 static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
226 				   unsigned long mask)
227 {
228 	/* options we don't get control via setsockopt for */
229 	nsk->sk_type = osk->sk_type;
230 	nsk->sk_sndbuf = osk->sk_sndbuf;
231 	nsk->sk_rcvbuf = osk->sk_rcvbuf;
232 	nsk->sk_sndtimeo = osk->sk_sndtimeo;
233 	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
234 	nsk->sk_mark = osk->sk_mark;
235 	nsk->sk_priority = osk->sk_priority;
236 	nsk->sk_rcvlowat = osk->sk_rcvlowat;
237 	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
238 	nsk->sk_err = osk->sk_err;
239 
240 	nsk->sk_flags &= ~mask;
241 	nsk->sk_flags |= osk->sk_flags & mask;
242 }
243 
244 #define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
245 			     (1UL << SOCK_KEEPOPEN) | \
246 			     (1UL << SOCK_LINGER) | \
247 			     (1UL << SOCK_BROADCAST) | \
248 			     (1UL << SOCK_TIMESTAMP) | \
249 			     (1UL << SOCK_DBG) | \
250 			     (1UL << SOCK_RCVTSTAMP) | \
251 			     (1UL << SOCK_RCVTSTAMPNS) | \
252 			     (1UL << SOCK_LOCALROUTE) | \
253 			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
254 			     (1UL << SOCK_RXQ_OVFL) | \
255 			     (1UL << SOCK_WIFI_STATUS) | \
256 			     (1UL << SOCK_NOFCS) | \
257 			     (1UL << SOCK_FILTER_LOCKED))
258 /* copy only relevant settings and flags of SOL_SOCKET level from smc to
259  * clc socket (since smc is not called for these options from net/core)
260  */
261 static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
262 {
263 	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
264 }
265 
266 #define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
267 			     (1UL << SOCK_KEEPOPEN) | \
268 			     (1UL << SOCK_LINGER) | \
269 			     (1UL << SOCK_DBG))
270 /* copy only settings and flags relevant for smc from clc to smc socket */
271 static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
272 {
273 	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
274 }
275 
276 /* determine subnet and mask of internal TCP socket */
277 int smc_netinfo_by_tcpsk(struct socket *clcsock,
278 			 __be32 *subnet, u8 *prefix_len)
279 {
280 	struct dst_entry *dst = sk_dst_get(clcsock->sk);
281 	struct in_device *in_dev;
282 	struct sockaddr_in addr;
283 	int rc = -ENOENT;
284 	int len;
285 
286 	if (!dst) {
287 		rc = -ENOTCONN;
288 		goto out;
289 	}
290 	if (!dst->dev) {
291 		rc = -ENODEV;
292 		goto out_rel;
293 	}
294 
295 	/* get address to which the internal TCP socket is bound */
296 	kernel_getsockname(clcsock, (struct sockaddr *)&addr, &len);
297 	/* analyze IPv4 specific data of net_device belonging to TCP socket */
298 	rcu_read_lock();
299 	in_dev = __in_dev_get_rcu(dst->dev);
300 	for_ifa(in_dev) {
301 		if (!inet_ifa_match(addr.sin_addr.s_addr, ifa))
302 			continue;
303 		*prefix_len = inet_mask_len(ifa->ifa_mask);
304 		*subnet = ifa->ifa_address & ifa->ifa_mask;
305 		rc = 0;
306 		break;
307 	} endfor_ifa(in_dev);
308 	rcu_read_unlock();
309 
310 out_rel:
311 	dst_release(dst);
312 out:
313 	return rc;
314 }
315 
316 static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid)
317 {
318 	struct smc_link_group *lgr = smc->conn.lgr;
319 	struct smc_link *link;
320 	int rest;
321 	int rc;
322 
323 	link = &lgr->lnk[SMC_SINGLE_LINK];
324 	/* receive CONFIRM LINK request from server over RoCE fabric */
325 	rest = wait_for_completion_interruptible_timeout(
326 		&link->llc_confirm,
327 		SMC_LLC_WAIT_FIRST_TIME);
328 	if (rest <= 0) {
329 		struct smc_clc_msg_decline dclc;
330 
331 		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
332 				      SMC_CLC_DECLINE);
333 		return rc;
334 	}
335 
336 	rc = smc_ib_modify_qp_rts(link);
337 	if (rc)
338 		return SMC_CLC_DECL_INTERR;
339 
340 	smc_wr_remember_qp_attr(link);
341 
342 	rc = smc_wr_reg_send(link,
343 			     smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
344 	if (rc)
345 		return SMC_CLC_DECL_INTERR;
346 
347 	/* send CONFIRM LINK response over RoCE fabric */
348 	rc = smc_llc_send_confirm_link(link,
349 				       link->smcibdev->mac[link->ibport - 1],
350 				       gid, SMC_LLC_RESP);
351 	if (rc < 0)
352 		return SMC_CLC_DECL_TCL;
353 
354 	return rc;
355 }
356 
357 static void smc_conn_save_peer_info(struct smc_sock *smc,
358 				    struct smc_clc_msg_accept_confirm *clc)
359 {
360 	smc->conn.peer_conn_idx = clc->conn_idx;
361 	smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
362 	smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size);
363 	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
364 }
365 
366 static void smc_link_save_peer_info(struct smc_link *link,
367 				    struct smc_clc_msg_accept_confirm *clc)
368 {
369 	link->peer_qpn = ntoh24(clc->qpn);
370 	memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
371 	memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
372 	link->peer_psn = ntoh24(clc->psn);
373 	link->peer_mtu = clc->qp_mtu;
374 }
375 
376 static void smc_lgr_forget(struct smc_link_group *lgr)
377 {
378 	spin_lock_bh(&smc_lgr_list.lock);
379 	/* do not use this link group for new connections */
380 	if (!list_empty(&lgr->list))
381 		list_del_init(&lgr->list);
382 	spin_unlock_bh(&smc_lgr_list.lock);
383 }
384 
385 /* setup for RDMA connection of client */
386 static int smc_connect_rdma(struct smc_sock *smc)
387 {
388 	struct sockaddr_in *inaddr = (struct sockaddr_in *)smc->addr;
389 	struct smc_clc_msg_accept_confirm aclc;
390 	int local_contact = SMC_FIRST_CONTACT;
391 	struct smc_ib_device *smcibdev;
392 	struct smc_link *link;
393 	u8 srv_first_contact;
394 	int reason_code = 0;
395 	int rc = 0;
396 	u8 ibport;
397 
398 	sock_hold(&smc->sk); /* sock put in passive closing */
399 
400 	if (!tcp_sk(smc->clcsock->sk)->syn_smc) {
401 		/* peer has not signalled SMC-capability */
402 		smc->use_fallback = true;
403 		goto out_connected;
404 	}
405 
406 	/* IPSec connections opt out of SMC-R optimizations */
407 	if (using_ipsec(smc)) {
408 		reason_code = SMC_CLC_DECL_IPSEC;
409 		goto decline_rdma;
410 	}
411 
412 	/* PNET table look up: search active ib_device and port
413 	 * within same PNETID that also contains the ethernet device
414 	 * used for the internal TCP socket
415 	 */
416 	smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
417 	if (!smcibdev) {
418 		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
419 		goto decline_rdma;
420 	}
421 
422 	/* do inband token exchange */
423 	reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
424 	if (reason_code < 0) {
425 		rc = reason_code;
426 		goto out_err;
427 	}
428 	if (reason_code > 0) /* configuration error */
429 		goto decline_rdma;
430 	/* receive SMC Accept CLC message */
431 	reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
432 				       SMC_CLC_ACCEPT);
433 	if (reason_code < 0) {
434 		rc = reason_code;
435 		goto out_err;
436 	}
437 	if (reason_code > 0)
438 		goto decline_rdma;
439 
440 	srv_first_contact = aclc.hdr.flag;
441 	mutex_lock(&smc_create_lgr_pending);
442 	local_contact = smc_conn_create(smc, inaddr->sin_addr.s_addr, smcibdev,
443 					ibport, &aclc.lcl, srv_first_contact);
444 	if (local_contact < 0) {
445 		rc = local_contact;
446 		if (rc == -ENOMEM)
447 			reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
448 		else if (rc == -ENOLINK)
449 			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
450 		goto decline_rdma_unlock;
451 	}
452 	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
453 
454 	smc_conn_save_peer_info(smc, &aclc);
455 
456 	/* create send buffer and rmb */
457 	rc = smc_buf_create(smc);
458 	if (rc) {
459 		reason_code = SMC_CLC_DECL_MEM;
460 		goto decline_rdma_unlock;
461 	}
462 
463 	if (local_contact == SMC_FIRST_CONTACT)
464 		smc_link_save_peer_info(link, &aclc);
465 
466 	rc = smc_rmb_rtoken_handling(&smc->conn, &aclc);
467 	if (rc) {
468 		reason_code = SMC_CLC_DECL_INTERR;
469 		goto decline_rdma_unlock;
470 	}
471 
472 	smc_close_init(smc);
473 	smc_rx_init(smc);
474 
475 	if (local_contact == SMC_FIRST_CONTACT) {
476 		rc = smc_ib_ready_link(link);
477 		if (rc) {
478 			reason_code = SMC_CLC_DECL_INTERR;
479 			goto decline_rdma_unlock;
480 		}
481 	} else {
482 		struct smc_buf_desc *buf_desc = smc->conn.rmb_desc;
483 
484 		if (!buf_desc->reused) {
485 			/* register memory region for new rmb */
486 			rc = smc_wr_reg_send(link,
487 					     buf_desc->mr_rx[SMC_SINGLE_LINK]);
488 			if (rc) {
489 				reason_code = SMC_CLC_DECL_INTERR;
490 				goto decline_rdma_unlock;
491 			}
492 		}
493 	}
494 	smc_rmb_sync_sg_for_device(&smc->conn);
495 
496 	rc = smc_clc_send_confirm(smc);
497 	if (rc)
498 		goto out_err_unlock;
499 
500 	if (local_contact == SMC_FIRST_CONTACT) {
501 		/* QP confirmation over RoCE fabric */
502 		reason_code = smc_clnt_conf_first_link(
503 			smc, &smcibdev->gid[ibport - 1]);
504 		if (reason_code < 0) {
505 			rc = reason_code;
506 			goto out_err_unlock;
507 		}
508 		if (reason_code > 0)
509 			goto decline_rdma_unlock;
510 	}
511 
512 	mutex_unlock(&smc_create_lgr_pending);
513 	smc_tx_init(smc);
514 
515 out_connected:
516 	smc_copy_sock_settings_to_clc(smc);
517 	if (smc->sk.sk_state == SMC_INIT)
518 		smc->sk.sk_state = SMC_ACTIVE;
519 
520 	return rc ? rc : local_contact;
521 
522 decline_rdma_unlock:
523 	if (local_contact == SMC_FIRST_CONTACT)
524 		smc_lgr_forget(smc->conn.lgr);
525 	mutex_unlock(&smc_create_lgr_pending);
526 	smc_conn_free(&smc->conn);
527 decline_rdma:
528 	/* RDMA setup failed, switch back to TCP */
529 	smc->use_fallback = true;
530 	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
531 		rc = smc_clc_send_decline(smc, reason_code);
532 		if (rc < 0)
533 			goto out_err;
534 	}
535 	goto out_connected;
536 
537 out_err_unlock:
538 	if (local_contact == SMC_FIRST_CONTACT)
539 		smc_lgr_forget(smc->conn.lgr);
540 	mutex_unlock(&smc_create_lgr_pending);
541 	smc_conn_free(&smc->conn);
542 out_err:
543 	if (smc->sk.sk_state == SMC_INIT)
544 		sock_put(&smc->sk); /* passive closing */
545 	return rc;
546 }
547 
548 static int smc_connect(struct socket *sock, struct sockaddr *addr,
549 		       int alen, int flags)
550 {
551 	struct sock *sk = sock->sk;
552 	struct smc_sock *smc;
553 	int rc = -EINVAL;
554 
555 	smc = smc_sk(sk);
556 
557 	/* separate smc parameter checking to be safe */
558 	if (alen < sizeof(addr->sa_family))
559 		goto out_err;
560 	if (addr->sa_family != AF_INET)
561 		goto out_err;
562 	smc->addr = addr;	/* needed for nonblocking connect */
563 
564 	lock_sock(sk);
565 	switch (sk->sk_state) {
566 	default:
567 		goto out;
568 	case SMC_ACTIVE:
569 		rc = -EISCONN;
570 		goto out;
571 	case SMC_INIT:
572 		rc = 0;
573 		break;
574 	}
575 
576 	smc_copy_sock_settings_to_clc(smc);
577 	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
578 	rc = kernel_connect(smc->clcsock, addr, alen, flags);
579 	if (rc)
580 		goto out;
581 
582 	/* setup RDMA connection */
583 	rc = smc_connect_rdma(smc);
584 	if (rc < 0)
585 		goto out;
586 	else
587 		rc = 0; /* success cases including fallback */
588 
589 out:
590 	release_sock(sk);
591 out_err:
592 	return rc;
593 }
594 
595 static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
596 {
597 	struct socket *new_clcsock = NULL;
598 	struct sock *lsk = &lsmc->sk;
599 	struct sock *new_sk;
600 	int rc;
601 
602 	release_sock(lsk);
603 	new_sk = smc_sock_alloc(sock_net(lsk), NULL);
604 	if (!new_sk) {
605 		rc = -ENOMEM;
606 		lsk->sk_err = ENOMEM;
607 		*new_smc = NULL;
608 		lock_sock(lsk);
609 		goto out;
610 	}
611 	*new_smc = smc_sk(new_sk);
612 
613 	rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
614 	lock_sock(lsk);
615 	if  (rc < 0)
616 		lsk->sk_err = -rc;
617 	if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
618 		if (new_clcsock)
619 			sock_release(new_clcsock);
620 		new_sk->sk_state = SMC_CLOSED;
621 		sock_set_flag(new_sk, SOCK_DEAD);
622 		new_sk->sk_prot->unhash(new_sk);
623 		sock_put(new_sk); /* final */
624 		*new_smc = NULL;
625 		goto out;
626 	}
627 
628 	(*new_smc)->clcsock = new_clcsock;
629 out:
630 	return rc;
631 }
632 
633 /* add a just created sock to the accept queue of the listen sock as
634  * candidate for a following socket accept call from user space
635  */
636 static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
637 {
638 	struct smc_sock *par = smc_sk(parent);
639 
640 	sock_hold(sk); /* sock_put in smc_accept_unlink () */
641 	spin_lock(&par->accept_q_lock);
642 	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
643 	spin_unlock(&par->accept_q_lock);
644 	sk_acceptq_added(parent);
645 }
646 
647 /* remove a socket from the accept queue of its parental listening socket */
648 static void smc_accept_unlink(struct sock *sk)
649 {
650 	struct smc_sock *par = smc_sk(sk)->listen_smc;
651 
652 	spin_lock(&par->accept_q_lock);
653 	list_del_init(&smc_sk(sk)->accept_q);
654 	spin_unlock(&par->accept_q_lock);
655 	sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
656 	sock_put(sk); /* sock_hold in smc_accept_enqueue */
657 }
658 
659 /* remove a sock from the accept queue to bind it to a new socket created
660  * for a socket accept call from user space
661  */
662 struct sock *smc_accept_dequeue(struct sock *parent,
663 				struct socket *new_sock)
664 {
665 	struct smc_sock *isk, *n;
666 	struct sock *new_sk;
667 
668 	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
669 		new_sk = (struct sock *)isk;
670 
671 		smc_accept_unlink(new_sk);
672 		if (new_sk->sk_state == SMC_CLOSED) {
673 			if (isk->clcsock) {
674 				sock_release(isk->clcsock);
675 				isk->clcsock = NULL;
676 			}
677 			new_sk->sk_prot->unhash(new_sk);
678 			sock_put(new_sk); /* final */
679 			continue;
680 		}
681 		if (new_sock)
682 			sock_graft(new_sk, new_sock);
683 		return new_sk;
684 	}
685 	return NULL;
686 }
687 
688 /* clean up for a created but never accepted sock */
689 void smc_close_non_accepted(struct sock *sk)
690 {
691 	struct smc_sock *smc = smc_sk(sk);
692 
693 	lock_sock(sk);
694 	if (!sk->sk_lingertime)
695 		/* wait for peer closing */
696 		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
697 	if (!smc->use_fallback) {
698 		smc_close_active(smc);
699 		sock_set_flag(sk, SOCK_DEAD);
700 		sk->sk_shutdown |= SHUTDOWN_MASK;
701 	}
702 	if (smc->clcsock) {
703 		struct socket *tcp;
704 
705 		tcp = smc->clcsock;
706 		smc->clcsock = NULL;
707 		sock_release(tcp);
708 	}
709 	if (smc->use_fallback) {
710 		sock_put(sk); /* passive closing */
711 		sk->sk_state = SMC_CLOSED;
712 	} else {
713 		if (sk->sk_state == SMC_CLOSED)
714 			smc_conn_free(&smc->conn);
715 	}
716 	release_sock(sk);
717 	sk->sk_prot->unhash(sk);
718 	sock_put(sk); /* final sock_put */
719 }
720 
721 static int smc_serv_conf_first_link(struct smc_sock *smc)
722 {
723 	struct smc_link_group *lgr = smc->conn.lgr;
724 	struct smc_link *link;
725 	int rest;
726 	int rc;
727 
728 	link = &lgr->lnk[SMC_SINGLE_LINK];
729 
730 	rc = smc_wr_reg_send(link,
731 			     smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
732 	if (rc)
733 		return SMC_CLC_DECL_INTERR;
734 
735 	/* send CONFIRM LINK request to client over the RoCE fabric */
736 	rc = smc_llc_send_confirm_link(link,
737 				       link->smcibdev->mac[link->ibport - 1],
738 				       &link->smcibdev->gid[link->ibport - 1],
739 				       SMC_LLC_REQ);
740 	if (rc < 0)
741 		return SMC_CLC_DECL_TCL;
742 
743 	/* receive CONFIRM LINK response from client over the RoCE fabric */
744 	rest = wait_for_completion_interruptible_timeout(
745 		&link->llc_confirm_resp,
746 		SMC_LLC_WAIT_FIRST_TIME);
747 	if (rest <= 0) {
748 		struct smc_clc_msg_decline dclc;
749 
750 		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
751 				      SMC_CLC_DECLINE);
752 	}
753 
754 	return rc;
755 }
756 
757 /* setup for RDMA connection of server */
758 static void smc_listen_work(struct work_struct *work)
759 {
760 	struct smc_sock *new_smc = container_of(work, struct smc_sock,
761 						smc_listen_work);
762 	struct smc_clc_msg_proposal_prefix *pclc_prfx;
763 	struct socket *newclcsock = new_smc->clcsock;
764 	struct smc_sock *lsmc = new_smc->listen_smc;
765 	struct smc_clc_msg_accept_confirm cclc;
766 	int local_contact = SMC_REUSE_CONTACT;
767 	struct sock *newsmcsk = &new_smc->sk;
768 	struct smc_clc_msg_proposal *pclc;
769 	struct smc_ib_device *smcibdev;
770 	struct sockaddr_in peeraddr;
771 	u8 buf[SMC_CLC_MAX_LEN];
772 	struct smc_link *link;
773 	int reason_code = 0;
774 	int rc = 0, len;
775 	__be32 subnet;
776 	u8 prefix_len;
777 	u8 ibport;
778 
779 	/* check if peer is smc capable */
780 	if (!tcp_sk(newclcsock->sk)->syn_smc) {
781 		new_smc->use_fallback = true;
782 		goto out_connected;
783 	}
784 
785 	/* do inband token exchange -
786 	 *wait for and receive SMC Proposal CLC message
787 	 */
788 	reason_code = smc_clc_wait_msg(new_smc, &buf, sizeof(buf),
789 				       SMC_CLC_PROPOSAL);
790 	if (reason_code < 0)
791 		goto out_err;
792 	if (reason_code > 0)
793 		goto decline_rdma;
794 
795 	/* IPSec connections opt out of SMC-R optimizations */
796 	if (using_ipsec(new_smc)) {
797 		reason_code = SMC_CLC_DECL_IPSEC;
798 		goto decline_rdma;
799 	}
800 
801 	/* PNET table look up: search active ib_device and port
802 	 * within same PNETID that also contains the ethernet device
803 	 * used for the internal TCP socket
804 	 */
805 	smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
806 	if (!smcibdev) {
807 		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
808 		goto decline_rdma;
809 	}
810 
811 	/* determine subnet and mask from internal TCP socket */
812 	rc = smc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len);
813 	if (rc) {
814 		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
815 		goto decline_rdma;
816 	}
817 
818 	pclc = (struct smc_clc_msg_proposal *)&buf;
819 	pclc_prfx = smc_clc_proposal_get_prefix(pclc);
820 	if (pclc_prfx->outgoing_subnet != subnet ||
821 	    pclc_prfx->prefix_len != prefix_len) {
822 		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
823 		goto decline_rdma;
824 	}
825 
826 	/* get address of the peer connected to the internal TCP socket */
827 	kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr, &len);
828 
829 	/* allocate connection / link group */
830 	mutex_lock(&smc_create_lgr_pending);
831 	local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr,
832 					smcibdev, ibport, &pclc->lcl, 0);
833 	if (local_contact < 0) {
834 		rc = local_contact;
835 		if (rc == -ENOMEM)
836 			reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
837 		goto decline_rdma_unlock;
838 	}
839 	link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
840 
841 	/* create send buffer and rmb */
842 	rc = smc_buf_create(new_smc);
843 	if (rc) {
844 		reason_code = SMC_CLC_DECL_MEM;
845 		goto decline_rdma_unlock;
846 	}
847 
848 	smc_close_init(new_smc);
849 	smc_rx_init(new_smc);
850 
851 	if (local_contact != SMC_FIRST_CONTACT) {
852 		struct smc_buf_desc *buf_desc = new_smc->conn.rmb_desc;
853 
854 		if (!buf_desc->reused) {
855 			/* register memory region for new rmb */
856 			rc = smc_wr_reg_send(link,
857 					     buf_desc->mr_rx[SMC_SINGLE_LINK]);
858 			if (rc) {
859 				reason_code = SMC_CLC_DECL_INTERR;
860 				goto decline_rdma_unlock;
861 			}
862 		}
863 	}
864 	smc_rmb_sync_sg_for_device(&new_smc->conn);
865 
866 	rc = smc_clc_send_accept(new_smc, local_contact);
867 	if (rc)
868 		goto out_err_unlock;
869 
870 	/* receive SMC Confirm CLC message */
871 	reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
872 				       SMC_CLC_CONFIRM);
873 	if (reason_code < 0)
874 		goto out_err_unlock;
875 	if (reason_code > 0)
876 		goto decline_rdma_unlock;
877 	smc_conn_save_peer_info(new_smc, &cclc);
878 	if (local_contact == SMC_FIRST_CONTACT)
879 		smc_link_save_peer_info(link, &cclc);
880 
881 	rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc);
882 	if (rc) {
883 		reason_code = SMC_CLC_DECL_INTERR;
884 		goto decline_rdma_unlock;
885 	}
886 
887 	if (local_contact == SMC_FIRST_CONTACT) {
888 		rc = smc_ib_ready_link(link);
889 		if (rc) {
890 			reason_code = SMC_CLC_DECL_INTERR;
891 			goto decline_rdma_unlock;
892 		}
893 		/* QP confirmation over RoCE fabric */
894 		reason_code = smc_serv_conf_first_link(new_smc);
895 		if (reason_code < 0)
896 			/* peer is not aware of a problem */
897 			goto out_err_unlock;
898 		if (reason_code > 0)
899 			goto decline_rdma_unlock;
900 	}
901 
902 	smc_tx_init(new_smc);
903 	mutex_unlock(&smc_create_lgr_pending);
904 
905 out_connected:
906 	sk_refcnt_debug_inc(newsmcsk);
907 	if (newsmcsk->sk_state == SMC_INIT)
908 		newsmcsk->sk_state = SMC_ACTIVE;
909 enqueue:
910 	lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
911 	if (lsmc->sk.sk_state == SMC_LISTEN) {
912 		smc_accept_enqueue(&lsmc->sk, newsmcsk);
913 	} else { /* no longer listening */
914 		smc_close_non_accepted(newsmcsk);
915 	}
916 	release_sock(&lsmc->sk);
917 
918 	/* Wake up accept */
919 	lsmc->sk.sk_data_ready(&lsmc->sk);
920 	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
921 	return;
922 
923 decline_rdma_unlock:
924 	if (local_contact == SMC_FIRST_CONTACT)
925 		smc_lgr_forget(new_smc->conn.lgr);
926 	mutex_unlock(&smc_create_lgr_pending);
927 decline_rdma:
928 	/* RDMA setup failed, switch back to TCP */
929 	smc_conn_free(&new_smc->conn);
930 	new_smc->use_fallback = true;
931 	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
932 		if (smc_clc_send_decline(new_smc, reason_code) < 0)
933 			goto out_err;
934 	}
935 	goto out_connected;
936 
937 out_err_unlock:
938 	if (local_contact == SMC_FIRST_CONTACT)
939 		smc_lgr_forget(new_smc->conn.lgr);
940 	mutex_unlock(&smc_create_lgr_pending);
941 out_err:
942 	if (newsmcsk->sk_state == SMC_INIT)
943 		sock_put(&new_smc->sk); /* passive closing */
944 	newsmcsk->sk_state = SMC_CLOSED;
945 	smc_conn_free(&new_smc->conn);
946 	goto enqueue; /* queue new sock with sk_err set */
947 }
948 
949 static void smc_tcp_listen_work(struct work_struct *work)
950 {
951 	struct smc_sock *lsmc = container_of(work, struct smc_sock,
952 					     tcp_listen_work);
953 	struct sock *lsk = &lsmc->sk;
954 	struct smc_sock *new_smc;
955 	int rc = 0;
956 
957 	lock_sock(lsk);
958 	while (lsk->sk_state == SMC_LISTEN) {
959 		rc = smc_clcsock_accept(lsmc, &new_smc);
960 		if (rc)
961 			goto out;
962 		if (!new_smc)
963 			continue;
964 
965 		new_smc->listen_smc = lsmc;
966 		new_smc->use_fallback = false; /* assume rdma capability first*/
967 		sock_hold(lsk); /* sock_put in smc_listen_work */
968 		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
969 		smc_copy_sock_settings_to_smc(new_smc);
970 		sock_hold(&new_smc->sk); /* sock_put in passive closing */
971 		if (!schedule_work(&new_smc->smc_listen_work))
972 			sock_put(&new_smc->sk);
973 	}
974 
975 out:
976 	if (lsmc->clcsock) {
977 		sock_release(lsmc->clcsock);
978 		lsmc->clcsock = NULL;
979 	}
980 	release_sock(lsk);
981 	/* no more listening, wake up smc_close_wait_listen_clcsock and
982 	 * accept
983 	 */
984 	lsk->sk_state_change(lsk);
985 	sock_put(&lsmc->sk); /* sock_hold in smc_listen */
986 }
987 
988 static int smc_listen(struct socket *sock, int backlog)
989 {
990 	struct sock *sk = sock->sk;
991 	struct smc_sock *smc;
992 	int rc;
993 
994 	smc = smc_sk(sk);
995 	lock_sock(sk);
996 
997 	rc = -EINVAL;
998 	if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
999 		goto out;
1000 
1001 	rc = 0;
1002 	if (sk->sk_state == SMC_LISTEN) {
1003 		sk->sk_max_ack_backlog = backlog;
1004 		goto out;
1005 	}
1006 	/* some socket options are handled in core, so we could not apply
1007 	 * them to the clc socket -- copy smc socket options to clc socket
1008 	 */
1009 	smc_copy_sock_settings_to_clc(smc);
1010 	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
1011 
1012 	rc = kernel_listen(smc->clcsock, backlog);
1013 	if (rc)
1014 		goto out;
1015 	sk->sk_max_ack_backlog = backlog;
1016 	sk->sk_ack_backlog = 0;
1017 	sk->sk_state = SMC_LISTEN;
1018 	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
1019 	sock_hold(sk); /* sock_hold in tcp_listen_worker */
1020 	if (!schedule_work(&smc->tcp_listen_work))
1021 		sock_put(sk);
1022 
1023 out:
1024 	release_sock(sk);
1025 	return rc;
1026 }
1027 
1028 static int smc_accept(struct socket *sock, struct socket *new_sock,
1029 		      int flags, bool kern)
1030 {
1031 	struct sock *sk = sock->sk, *nsk;
1032 	DECLARE_WAITQUEUE(wait, current);
1033 	struct smc_sock *lsmc;
1034 	long timeo;
1035 	int rc = 0;
1036 
1037 	lsmc = smc_sk(sk);
1038 	sock_hold(sk); /* sock_put below */
1039 	lock_sock(sk);
1040 
1041 	if (lsmc->sk.sk_state != SMC_LISTEN) {
1042 		rc = -EINVAL;
1043 		goto out;
1044 	}
1045 
1046 	/* Wait for an incoming connection */
1047 	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1048 	add_wait_queue_exclusive(sk_sleep(sk), &wait);
1049 	while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
1050 		set_current_state(TASK_INTERRUPTIBLE);
1051 		if (!timeo) {
1052 			rc = -EAGAIN;
1053 			break;
1054 		}
1055 		release_sock(sk);
1056 		timeo = schedule_timeout(timeo);
1057 		/* wakeup by sk_data_ready in smc_listen_work() */
1058 		sched_annotate_sleep();
1059 		lock_sock(sk);
1060 		if (signal_pending(current)) {
1061 			rc = sock_intr_errno(timeo);
1062 			break;
1063 		}
1064 	}
1065 	set_current_state(TASK_RUNNING);
1066 	remove_wait_queue(sk_sleep(sk), &wait);
1067 
1068 	if (!rc)
1069 		rc = sock_error(nsk);
1070 
1071 out:
1072 	release_sock(sk);
1073 	sock_put(sk); /* sock_hold above */
1074 	return rc;
1075 }
1076 
1077 static int smc_getname(struct socket *sock, struct sockaddr *addr,
1078 		       int *len, int peer)
1079 {
1080 	struct smc_sock *smc;
1081 
1082 	if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
1083 	    (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
1084 		return -ENOTCONN;
1085 
1086 	smc = smc_sk(sock->sk);
1087 
1088 	return smc->clcsock->ops->getname(smc->clcsock, addr, len, peer);
1089 }
1090 
1091 static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
1092 {
1093 	struct sock *sk = sock->sk;
1094 	struct smc_sock *smc;
1095 	int rc = -EPIPE;
1096 
1097 	smc = smc_sk(sk);
1098 	lock_sock(sk);
1099 	if ((sk->sk_state != SMC_ACTIVE) &&
1100 	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1101 	    (sk->sk_state != SMC_INIT))
1102 		goto out;
1103 	if (smc->use_fallback)
1104 		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
1105 	else
1106 		rc = smc_tx_sendmsg(smc, msg, len);
1107 out:
1108 	release_sock(sk);
1109 	return rc;
1110 }
1111 
1112 static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
1113 		       int flags)
1114 {
1115 	struct sock *sk = sock->sk;
1116 	struct smc_sock *smc;
1117 	int rc = -ENOTCONN;
1118 
1119 	smc = smc_sk(sk);
1120 	lock_sock(sk);
1121 	if ((sk->sk_state == SMC_INIT) ||
1122 	    (sk->sk_state == SMC_LISTEN) ||
1123 	    (sk->sk_state == SMC_CLOSED))
1124 		goto out;
1125 
1126 	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1127 		rc = 0;
1128 		goto out;
1129 	}
1130 
1131 	if (smc->use_fallback)
1132 		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
1133 	else
1134 		rc = smc_rx_recvmsg(smc, msg, len, flags);
1135 
1136 out:
1137 	release_sock(sk);
1138 	return rc;
1139 }
1140 
1141 static __poll_t smc_accept_poll(struct sock *parent)
1142 {
1143 	struct smc_sock *isk = smc_sk(parent);
1144 	__poll_t mask = 0;
1145 
1146 	spin_lock(&isk->accept_q_lock);
1147 	if (!list_empty(&isk->accept_q))
1148 		mask = EPOLLIN | EPOLLRDNORM;
1149 	spin_unlock(&isk->accept_q_lock);
1150 
1151 	return mask;
1152 }
1153 
1154 static __poll_t smc_poll(struct file *file, struct socket *sock,
1155 			     poll_table *wait)
1156 {
1157 	struct sock *sk = sock->sk;
1158 	__poll_t mask = 0;
1159 	struct smc_sock *smc;
1160 	int rc;
1161 
1162 	if (!sk)
1163 		return EPOLLNVAL;
1164 
1165 	smc = smc_sk(sock->sk);
1166 	sock_hold(sk);
1167 	lock_sock(sk);
1168 	if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
1169 		/* delegate to CLC child sock */
1170 		release_sock(sk);
1171 		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
1172 		/* if non-blocking connect finished ... */
1173 		lock_sock(sk);
1174 		if ((sk->sk_state == SMC_INIT) && (mask & EPOLLOUT)) {
1175 			sk->sk_err = smc->clcsock->sk->sk_err;
1176 			if (sk->sk_err) {
1177 				mask |= EPOLLERR;
1178 			} else {
1179 				rc = smc_connect_rdma(smc);
1180 				if (rc < 0)
1181 					mask |= EPOLLERR;
1182 				/* success cases including fallback */
1183 				mask |= EPOLLOUT | EPOLLWRNORM;
1184 			}
1185 		}
1186 	} else {
1187 		if (sk->sk_state != SMC_CLOSED) {
1188 			release_sock(sk);
1189 			sock_poll_wait(file, sk_sleep(sk), wait);
1190 			lock_sock(sk);
1191 		}
1192 		if (sk->sk_err)
1193 			mask |= EPOLLERR;
1194 		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
1195 		    (sk->sk_state == SMC_CLOSED))
1196 			mask |= EPOLLHUP;
1197 		if (sk->sk_state == SMC_LISTEN) {
1198 			/* woken up by sk_data_ready in smc_listen_work() */
1199 			mask = smc_accept_poll(sk);
1200 		} else {
1201 			if (atomic_read(&smc->conn.sndbuf_space) ||
1202 			    sk->sk_shutdown & SEND_SHUTDOWN) {
1203 				mask |= EPOLLOUT | EPOLLWRNORM;
1204 			} else {
1205 				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1206 				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1207 			}
1208 			if (atomic_read(&smc->conn.bytes_to_rcv))
1209 				mask |= EPOLLIN | EPOLLRDNORM;
1210 			if (sk->sk_shutdown & RCV_SHUTDOWN)
1211 				mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
1212 			if (sk->sk_state == SMC_APPCLOSEWAIT1)
1213 				mask |= EPOLLIN;
1214 		}
1215 
1216 	}
1217 	release_sock(sk);
1218 	sock_put(sk);
1219 
1220 	return mask;
1221 }
1222 
1223 static int smc_shutdown(struct socket *sock, int how)
1224 {
1225 	struct sock *sk = sock->sk;
1226 	struct smc_sock *smc;
1227 	int rc = -EINVAL;
1228 	int rc1 = 0;
1229 
1230 	smc = smc_sk(sk);
1231 
1232 	if ((how < SHUT_RD) || (how > SHUT_RDWR))
1233 		return rc;
1234 
1235 	lock_sock(sk);
1236 
1237 	rc = -ENOTCONN;
1238 	if ((sk->sk_state != SMC_LISTEN) &&
1239 	    (sk->sk_state != SMC_ACTIVE) &&
1240 	    (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
1241 	    (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
1242 	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1243 	    (sk->sk_state != SMC_APPCLOSEWAIT2) &&
1244 	    (sk->sk_state != SMC_APPFINCLOSEWAIT))
1245 		goto out;
1246 	if (smc->use_fallback) {
1247 		rc = kernel_sock_shutdown(smc->clcsock, how);
1248 		sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
1249 		if (sk->sk_shutdown == SHUTDOWN_MASK)
1250 			sk->sk_state = SMC_CLOSED;
1251 		goto out;
1252 	}
1253 	switch (how) {
1254 	case SHUT_RDWR:		/* shutdown in both directions */
1255 		rc = smc_close_active(smc);
1256 		break;
1257 	case SHUT_WR:
1258 		rc = smc_close_shutdown_write(smc);
1259 		break;
1260 	case SHUT_RD:
1261 		if (sk->sk_state == SMC_LISTEN)
1262 			rc = smc_close_active(smc);
1263 		else
1264 			rc = 0;
1265 			/* nothing more to do because peer is not involved */
1266 		break;
1267 	}
1268 	rc1 = kernel_sock_shutdown(smc->clcsock, how);
1269 	/* map sock_shutdown_cmd constants to sk_shutdown value range */
1270 	sk->sk_shutdown |= how + 1;
1271 
1272 out:
1273 	release_sock(sk);
1274 	return rc ? rc : rc1;
1275 }
1276 
1277 static int smc_setsockopt(struct socket *sock, int level, int optname,
1278 			  char __user *optval, unsigned int optlen)
1279 {
1280 	struct sock *sk = sock->sk;
1281 	struct smc_sock *smc;
1282 
1283 	smc = smc_sk(sk);
1284 
1285 	/* generic setsockopts reaching us here always apply to the
1286 	 * CLC socket
1287 	 */
1288 	return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
1289 					     optval, optlen);
1290 }
1291 
1292 static int smc_getsockopt(struct socket *sock, int level, int optname,
1293 			  char __user *optval, int __user *optlen)
1294 {
1295 	struct smc_sock *smc;
1296 
1297 	smc = smc_sk(sock->sk);
1298 	/* socket options apply to the CLC socket */
1299 	return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
1300 					     optval, optlen);
1301 }
1302 
1303 static int smc_ioctl(struct socket *sock, unsigned int cmd,
1304 		     unsigned long arg)
1305 {
1306 	struct smc_sock *smc;
1307 
1308 	smc = smc_sk(sock->sk);
1309 	if (smc->use_fallback)
1310 		return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
1311 	else
1312 		return sock_no_ioctl(sock, cmd, arg);
1313 }
1314 
1315 static ssize_t smc_sendpage(struct socket *sock, struct page *page,
1316 			    int offset, size_t size, int flags)
1317 {
1318 	struct sock *sk = sock->sk;
1319 	struct smc_sock *smc;
1320 	int rc = -EPIPE;
1321 
1322 	smc = smc_sk(sk);
1323 	lock_sock(sk);
1324 	if (sk->sk_state != SMC_ACTIVE)
1325 		goto out;
1326 	if (smc->use_fallback)
1327 		rc = kernel_sendpage(smc->clcsock, page, offset,
1328 				     size, flags);
1329 	else
1330 		rc = sock_no_sendpage(sock, page, offset, size, flags);
1331 
1332 out:
1333 	release_sock(sk);
1334 	return rc;
1335 }
1336 
1337 static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
1338 			       struct pipe_inode_info *pipe, size_t len,
1339 				    unsigned int flags)
1340 {
1341 	struct sock *sk = sock->sk;
1342 	struct smc_sock *smc;
1343 	int rc = -ENOTCONN;
1344 
1345 	smc = smc_sk(sk);
1346 	lock_sock(sk);
1347 	if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
1348 		goto out;
1349 	if (smc->use_fallback) {
1350 		rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
1351 						    pipe, len, flags);
1352 	} else {
1353 		rc = -EOPNOTSUPP;
1354 	}
1355 out:
1356 	release_sock(sk);
1357 	return rc;
1358 }
1359 
1360 /* must look like tcp */
1361 static const struct proto_ops smc_sock_ops = {
1362 	.family		= PF_SMC,
1363 	.owner		= THIS_MODULE,
1364 	.release	= smc_release,
1365 	.bind		= smc_bind,
1366 	.connect	= smc_connect,
1367 	.socketpair	= sock_no_socketpair,
1368 	.accept		= smc_accept,
1369 	.getname	= smc_getname,
1370 	.poll		= smc_poll,
1371 	.ioctl		= smc_ioctl,
1372 	.listen		= smc_listen,
1373 	.shutdown	= smc_shutdown,
1374 	.setsockopt	= smc_setsockopt,
1375 	.getsockopt	= smc_getsockopt,
1376 	.sendmsg	= smc_sendmsg,
1377 	.recvmsg	= smc_recvmsg,
1378 	.mmap		= sock_no_mmap,
1379 	.sendpage	= smc_sendpage,
1380 	.splice_read	= smc_splice_read,
1381 };
1382 
1383 static int smc_create(struct net *net, struct socket *sock, int protocol,
1384 		      int kern)
1385 {
1386 	struct smc_sock *smc;
1387 	struct sock *sk;
1388 	int rc;
1389 
1390 	rc = -ESOCKTNOSUPPORT;
1391 	if (sock->type != SOCK_STREAM)
1392 		goto out;
1393 
1394 	rc = -EPROTONOSUPPORT;
1395 	if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP))
1396 		goto out;
1397 
1398 	rc = -ENOBUFS;
1399 	sock->ops = &smc_sock_ops;
1400 	sk = smc_sock_alloc(net, sock);
1401 	if (!sk)
1402 		goto out;
1403 
1404 	/* create internal TCP socket for CLC handshake and fallback */
1405 	smc = smc_sk(sk);
1406 	smc->use_fallback = false; /* assume rdma capability first */
1407 	rc = sock_create_kern(net, PF_INET, SOCK_STREAM,
1408 			      IPPROTO_TCP, &smc->clcsock);
1409 	if (rc)
1410 		sk_common_release(sk);
1411 	smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
1412 	smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
1413 
1414 out:
1415 	return rc;
1416 }
1417 
1418 static const struct net_proto_family smc_sock_family_ops = {
1419 	.family	= PF_SMC,
1420 	.owner	= THIS_MODULE,
1421 	.create	= smc_create,
1422 };
1423 
1424 static int __init smc_init(void)
1425 {
1426 	int rc;
1427 
1428 	rc = smc_pnet_init();
1429 	if (rc)
1430 		return rc;
1431 
1432 	rc = smc_llc_init();
1433 	if (rc) {
1434 		pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
1435 		goto out_pnet;
1436 	}
1437 
1438 	rc = smc_cdc_init();
1439 	if (rc) {
1440 		pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
1441 		goto out_pnet;
1442 	}
1443 
1444 	rc = proto_register(&smc_proto, 1);
1445 	if (rc) {
1446 		pr_err("%s: proto_register fails with %d\n", __func__, rc);
1447 		goto out_pnet;
1448 	}
1449 
1450 	rc = sock_register(&smc_sock_family_ops);
1451 	if (rc) {
1452 		pr_err("%s: sock_register fails with %d\n", __func__, rc);
1453 		goto out_proto;
1454 	}
1455 	INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
1456 
1457 	rc = smc_ib_register_client();
1458 	if (rc) {
1459 		pr_err("%s: ib_register fails with %d\n", __func__, rc);
1460 		goto out_sock;
1461 	}
1462 
1463 	static_branch_enable(&tcp_have_smc);
1464 	return 0;
1465 
1466 out_sock:
1467 	sock_unregister(PF_SMC);
1468 out_proto:
1469 	proto_unregister(&smc_proto);
1470 out_pnet:
1471 	smc_pnet_exit();
1472 	return rc;
1473 }
1474 
1475 static void __exit smc_exit(void)
1476 {
1477 	struct smc_link_group *lgr, *lg;
1478 	LIST_HEAD(lgr_freeing_list);
1479 
1480 	spin_lock_bh(&smc_lgr_list.lock);
1481 	if (!list_empty(&smc_lgr_list.list))
1482 		list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
1483 	spin_unlock_bh(&smc_lgr_list.lock);
1484 	list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
1485 		list_del_init(&lgr->list);
1486 		smc_lgr_free(lgr); /* free link group */
1487 	}
1488 	static_branch_disable(&tcp_have_smc);
1489 	smc_ib_unregister_client();
1490 	sock_unregister(PF_SMC);
1491 	proto_unregister(&smc_proto);
1492 	smc_pnet_exit();
1493 }
1494 
1495 module_init(smc_init);
1496 module_exit(smc_exit);
1497 
1498 MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
1499 MODULE_DESCRIPTION("smc socket address family");
1500 MODULE_LICENSE("GPL");
1501 MODULE_ALIAS_NETPROTO(PF_SMC);
1502