xref: /linux/net/smc/af_smc.c (revision d96fc832bcb6269d96e33d506f33033d7ed08598)
1 /*
2  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
3  *
4  *  AF_SMC protocol family socket handler keeping the AF_INET sock address type
5  *  applies to SOCK_STREAM sockets only
6  *  offers an alternative communication option for TCP-protocol sockets
7  *  applicable with RoCE-cards only
8  *
9  *  Initial restrictions:
10  *    - non-blocking connect postponed
11  *    - IPv6 support postponed
12  *    - support for alternate links postponed
13  *    - partial support for non-blocking sockets only
14  *    - support for urgent data postponed
15  *
16  *  Copyright IBM Corp. 2016
17  *
18  *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
19  *              based on prototype from Frank Blaschka
20  */
21 
22 #define KMSG_COMPONENT "smc"
23 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
24 
25 #include <linux/module.h>
26 #include <linux/socket.h>
27 #include <linux/inetdevice.h>
28 #include <linux/workqueue.h>
29 #include <linux/in.h>
30 #include <linux/sched/signal.h>
31 
32 #include <net/sock.h>
33 #include <net/tcp.h>
34 #include <net/smc.h>
35 
36 #include "smc.h"
37 #include "smc_clc.h"
38 #include "smc_llc.h"
39 #include "smc_cdc.h"
40 #include "smc_core.h"
41 #include "smc_ib.h"
42 #include "smc_pnet.h"
43 #include "smc_tx.h"
44 #include "smc_rx.h"
45 #include "smc_close.h"
46 
/* serializes creation of new link groups so that concurrent connects
 * do not race to create duplicate link groups for the same peer
 */
static DEFINE_MUTEX(smc_create_lgr_pending);	/* serialize link group
						 * creation
						 */

struct smc_lgr_list smc_lgr_list = {		/* established link groups */
	.lock = __SPIN_LOCK_UNLOCKED(smc_lgr_list.lock),
	.list = LIST_HEAD_INIT(smc_lgr_list.list),
};
55 
56 static void smc_tcp_listen_work(struct work_struct *);
57 
/* SO_KEEPALIVE handler: SMC keeps no TCP state of its own, so forward
 * the keepalive setting to the internal CLC/TCP socket
 */
static void smc_set_keepalive(struct sock *sk, int val)
{
	struct smc_sock *smc = smc_sk(sk);

	smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
}
64 
/* hash table holding all AF_SMC sockets; protected by its rwlock */
static struct smc_hashinfo smc_v4_hashinfo = {
	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
};
68 
69 int smc_hash_sk(struct sock *sk)
70 {
71 	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
72 	struct hlist_head *head;
73 
74 	head = &h->ht;
75 
76 	write_lock_bh(&h->lock);
77 	sk_add_node(sk, head);
78 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
79 	write_unlock_bh(&h->lock);
80 
81 	return 0;
82 }
83 EXPORT_SYMBOL_GPL(smc_hash_sk);
84 
/* remove an SMC socket from the protocol hash table; inverse of
 * smc_hash_sk(). The inuse counter is only decremented if the socket
 * was actually hashed.
 */
void smc_unhash_sk(struct sock *sk)
{
	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;

	write_lock_bh(&h->lock);
	if (sk_del_node_init(sk))
		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
	write_unlock_bh(&h->lock);
}
EXPORT_SYMBOL_GPL(smc_unhash_sk);
95 
/* protocol definition for AF_SMC sockets; hooks the hash table and
 * keepalive handlers above into the generic socket layer
 */
struct proto smc_proto = {
	.name		= "SMC",
	.owner		= THIS_MODULE,
	.keepalive	= smc_set_keepalive,
	.hash		= smc_hash_sk,
	.unhash		= smc_unhash_sk,
	.obj_size	= sizeof(struct smc_sock),
	.h.smc_hash	= &smc_v4_hashinfo,
	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
};
EXPORT_SYMBOL_GPL(smc_proto);
107 
/* close an SMC socket on behalf of a user space close(); initiates
 * active close handling, releases the internal CLC/TCP socket and drops
 * the references taken for the socket's lifetime
 */
static int smc_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = 0;

	if (!sk)
		goto out;

	smc = smc_sk(sk);
	if (sk->sk_state == SMC_LISTEN)
		/* smc_close_non_accepted() is called and acquires
		 * sock lock for child sockets again
		 */
		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
	else
		lock_sock(sk);

	if (!smc->use_fallback) {
		rc = smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		sock_release(smc->clcsock);
		smc->clcsock = NULL;
	}
	if (smc->use_fallback) {
		sock_put(sk); /* passive closing */
		sk->sk_state = SMC_CLOSED;
		sk->sk_state_change(sk);
	}

	/* detach socket */
	sock_orphan(sk);
	sock->sk = NULL;
	if (!smc->use_fallback && sk->sk_state == SMC_CLOSED)
		smc_conn_free(&smc->conn);
	release_sock(sk);

	sk->sk_prot->unhash(sk);
	sock_put(sk); /* final sock_put */
out:
	return rc;
}
153 
154 static void smc_destruct(struct sock *sk)
155 {
156 	if (sk->sk_state != SMC_CLOSED)
157 		return;
158 	if (!sock_flag(sk, SOCK_DEAD))
159 		return;
160 
161 	sk_refcnt_debug_dec(sk);
162 }
163 
/* allocate and initialize a new SMC socket in state SMC_INIT; hashes
 * the socket into the protocol table. Returns NULL on allocation
 * failure.
 */
static struct sock *smc_sock_alloc(struct net *net, struct socket *sock)
{
	struct smc_sock *smc;
	struct sock *sk;

	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, &smc_proto, 0);
	if (!sk)
		return NULL;

	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
	sk->sk_state = SMC_INIT;
	sk->sk_destruct = smc_destruct;
	sk->sk_protocol = SMCPROTO_SMC;
	smc = smc_sk(sk);
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	INIT_LIST_HEAD(&smc->accept_q);
	spin_lock_init(&smc->accept_q_lock);
	sk->sk_prot->hash(sk);
	sk_refcnt_debug_inc(sk);

	return sk;
}
186 
/* bind() handler: validates the IPv4 address and delegates the actual
 * bind to the internal CLC/TCP socket. Only allowed in state SMC_INIT.
 */
static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
		    int addr_len)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);

	/* replicate tests from inet_bind(), to be safe wrt. future changes */
	rc = -EINVAL;
	if (addr_len < sizeof(struct sockaddr_in))
		goto out;

	rc = -EAFNOSUPPORT;
	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
	if ((addr->sin_family != AF_INET) &&
	    ((addr->sin_family != AF_UNSPEC) ||
	     (addr->sin_addr.s_addr != htonl(INADDR_ANY))))
		goto out;

	lock_sock(sk);

	/* Check if socket is already active */
	rc = -EINVAL;
	if (sk->sk_state != SMC_INIT)
		goto out_rel;

	/* SO_REUSEADDR was set on the SMC socket; mirror it before bind */
	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
	rc = kernel_bind(smc->clcsock, uaddr, addr_len);

out_rel:
	release_sock(sk);
out:
	return rc;
}
224 
/* copy socket settings from osk to nsk; only the sk_flags bits selected
 * by @mask are transferred, all other flag bits of nsk are cleared from
 * the mask first and then taken from osk
 */
static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
				   unsigned long mask)
{
	/* options we don't get control via setsockopt for */
	nsk->sk_type = osk->sk_type;
	nsk->sk_sndbuf = osk->sk_sndbuf;
	nsk->sk_rcvbuf = osk->sk_rcvbuf;
	nsk->sk_sndtimeo = osk->sk_sndtimeo;
	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
	nsk->sk_mark = osk->sk_mark;
	nsk->sk_priority = osk->sk_priority;
	nsk->sk_rcvlowat = osk->sk_rcvlowat;
	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
	nsk->sk_err = osk->sk_err;

	nsk->sk_flags &= ~mask;
	nsk->sk_flags |= osk->sk_flags & mask;
}
243 
/* sk_flags bits that are propagated from the SMC socket down to the
 * internal CLC/TCP socket
 */
#define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_BROADCAST) | \
			     (1UL << SOCK_TIMESTAMP) | \
			     (1UL << SOCK_DBG) | \
			     (1UL << SOCK_RCVTSTAMP) | \
			     (1UL << SOCK_RCVTSTAMPNS) | \
			     (1UL << SOCK_LOCALROUTE) | \
			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
			     (1UL << SOCK_RXQ_OVFL) | \
			     (1UL << SOCK_WIFI_STATUS) | \
			     (1UL << SOCK_NOFCS) | \
			     (1UL << SOCK_FILTER_LOCKED))
/* copy only relevant settings and flags of SOL_SOCKET level from smc to
 * clc socket (since smc is not called for these options from net/core)
 */
static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
{
	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
}
265 
/* sk_flags bits that are propagated from the internal CLC/TCP socket
 * up to the SMC socket
 */
#define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
			     (1UL << SOCK_KEEPOPEN) | \
			     (1UL << SOCK_LINGER) | \
			     (1UL << SOCK_DBG))
/* copy only settings and flags relevant for smc from clc to smc socket */
static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
{
	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
}
275 
/* determine subnet and mask of internal TCP socket
 * Returns 0 on success with *subnet and *prefix_len filled in;
 * -ENOTCONN if no route is cached, -ENODEV if the route has no device,
 * -ENOENT if no matching IPv4 address is configured on the device.
 */
int smc_netinfo_by_tcpsk(struct socket *clcsock,
			 __be32 *subnet, u8 *prefix_len)
{
	struct dst_entry *dst = sk_dst_get(clcsock->sk);
	struct in_device *in_dev;
	struct sockaddr_in addr;
	int rc = -ENOENT;

	if (!dst) {
		rc = -ENOTCONN;
		goto out;
	}
	if (!dst->dev) {
		rc = -ENODEV;
		goto out_rel;
	}

	/* get address to which the internal TCP socket is bound */
	kernel_getsockname(clcsock, (struct sockaddr *)&addr);
	/* analyze IPv4 specific data of net_device belonging to TCP socket */
	rcu_read_lock();
	in_dev = __in_dev_get_rcu(dst->dev);
	for_ifa(in_dev) {
		if (!inet_ifa_match(addr.sin_addr.s_addr, ifa))
			continue;
		*prefix_len = inet_mask_len(ifa->ifa_mask);
		*subnet = ifa->ifa_address & ifa->ifa_mask;
		rc = 0;
		break;
	} endfor_ifa(in_dev);
	rcu_read_unlock();

out_rel:
	dst_release(dst);
out:
	return rc;
}
314 
/* client side of the CONFIRM LINK handshake for the first connection of
 * a new link group. Returns 0 on success, a positive SMC_CLC_DECL_*
 * reason code on protocol failure, or the result of waiting for a
 * peer DECLINE on timeout/interrupt.
 */
static int smc_clnt_conf_first_link(struct smc_sock *smc, union ib_gid *gid)
{
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];
	/* receive CONFIRM LINK request from server over RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		/* timed out or interrupted: expect the peer to decline */
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
		return rc;
	}

	/* bring the queue pair into ready-to-send state */
	rc = smc_ib_modify_qp_rts(link);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	smc_wr_remember_qp_attr(link);

	/* register the RMB memory region with the peer */
	rc = smc_wr_reg_send(link,
			     smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK response over RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       gid, SMC_LLC_RESP);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	return rc;
}
355 
/* store peer connection data from a received CLC accept/confirm message
 * in the local connection structure
 */
static void smc_conn_save_peer_info(struct smc_sock *smc,
				    struct smc_clc_msg_accept_confirm *clc)
{
	smc->conn.peer_conn_idx = clc->conn_idx;
	smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
	smc->conn.peer_rmbe_size = smc_uncompress_bufsize(clc->rmbe_size);
	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
}
364 
/* store peer link attributes (QP number, GID, MAC, PSN, MTU) from a
 * received CLC accept/confirm message in the local link structure
 */
static void smc_link_save_peer_info(struct smc_link *link,
				    struct smc_clc_msg_accept_confirm *clc)
{
	link->peer_qpn = ntoh24(clc->qpn);
	memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
	memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
	link->peer_psn = ntoh24(clc->psn);
	link->peer_mtu = clc->qp_mtu;
}
374 
/* take a link group off the global list so no new connections reuse it */
static void smc_lgr_forget(struct smc_link_group *lgr)
{
	spin_lock_bh(&smc_lgr_list.lock);
	/* do not use this link group for new connections */
	if (!list_empty(&lgr->list))
		list_del_init(&lgr->list);
	spin_unlock_bh(&smc_lgr_list.lock);
}
383 
/* setup for RDMA connection of client
 * Runs the client side of the CLC handshake after the TCP connect
 * succeeded; on any failure that allows it, declines SMC and falls
 * back to plain TCP (use_fallback). Returns a negative errno on hard
 * errors, otherwise 0 or the local_contact value.
 */
static int smc_connect_rdma(struct smc_sock *smc)
{
	struct sockaddr_in *inaddr = (struct sockaddr_in *)smc->addr;
	struct smc_clc_msg_accept_confirm aclc;
	int local_contact = SMC_FIRST_CONTACT;
	struct smc_ib_device *smcibdev;
	struct smc_link *link;
	u8 srv_first_contact;
	int reason_code = 0;
	int rc = 0;
	u8 ibport;

	sock_hold(&smc->sk); /* sock put in passive closing */

	if (!tcp_sk(smc->clcsock->sk)->syn_smc) {
		/* peer has not signalled SMC-capability */
		smc->use_fallback = true;
		goto out_connected;
	}

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(smc)) {
		reason_code = SMC_CLC_DECL_IPSEC;
		goto decline_rdma;
	}

	/* PNET table look up: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(smc->clcsock->sk, &smcibdev, &ibport);
	if (!smcibdev) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* do inband token exchange */
	reason_code = smc_clc_send_proposal(smc, smcibdev, ibport);
	if (reason_code < 0) {
		rc = reason_code;
		goto out_err;
	}
	if (reason_code > 0) /* configuration error */
		goto decline_rdma;
	/* receive SMC Accept CLC message */
	reason_code = smc_clc_wait_msg(smc, &aclc, sizeof(aclc),
				       SMC_CLC_ACCEPT);
	if (reason_code < 0) {
		rc = reason_code;
		goto out_err;
	}
	if (reason_code > 0)
		goto decline_rdma;

	srv_first_contact = aclc.hdr.flag;
	/* serialize against concurrent link group creation */
	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(smc, inaddr->sin_addr.s_addr, smcibdev,
					ibport, &aclc.lcl, srv_first_contact);
	if (local_contact < 0) {
		rc = local_contact;
		if (rc == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
		else if (rc == -ENOLINK)
			reason_code = SMC_CLC_DECL_SYNCERR; /* synchr. error */
		goto decline_rdma_unlock;
	}
	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	smc_conn_save_peer_info(smc, &aclc);

	/* create send buffer and rmb */
	rc = smc_buf_create(smc);
	if (rc) {
		reason_code = SMC_CLC_DECL_MEM;
		goto decline_rdma_unlock;
	}

	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, &aclc);

	rc = smc_rmb_rtoken_handling(&smc->conn, &aclc);
	if (rc) {
		reason_code = SMC_CLC_DECL_INTERR;
		goto decline_rdma_unlock;
	}

	smc_close_init(smc);
	smc_rx_init(smc);

	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_ib_ready_link(link);
		if (rc) {
			reason_code = SMC_CLC_DECL_INTERR;
			goto decline_rdma_unlock;
		}
	} else {
		struct smc_buf_desc *buf_desc = smc->conn.rmb_desc;

		if (!buf_desc->reused) {
			/* register memory region for new rmb */
			rc = smc_wr_reg_send(link,
					     buf_desc->mr_rx[SMC_SINGLE_LINK]);
			if (rc) {
				reason_code = SMC_CLC_DECL_INTERR;
				goto decline_rdma_unlock;
			}
		}
	}
	smc_rmb_sync_sg_for_device(&smc->conn);

	rc = smc_clc_send_confirm(smc);
	if (rc)
		goto out_err_unlock;

	if (local_contact == SMC_FIRST_CONTACT) {
		/* QP confirmation over RoCE fabric */
		reason_code = smc_clnt_conf_first_link(
			smc, &smcibdev->gid[ibport - 1]);
		if (reason_code < 0) {
			rc = reason_code;
			goto out_err_unlock;
		}
		if (reason_code > 0)
			goto decline_rdma_unlock;
	}

	mutex_unlock(&smc_create_lgr_pending);
	smc_tx_init(smc);

out_connected:
	smc_copy_sock_settings_to_clc(smc);
	if (smc->sk.sk_state == SMC_INIT)
		smc->sk.sk_state = SMC_ACTIVE;

	return rc ? rc : local_contact;

decline_rdma_unlock:
	/* a newly created link group must not be reused after failure */
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
decline_rdma:
	/* RDMA setup failed, switch back to TCP */
	smc->use_fallback = true;
	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
		rc = smc_clc_send_decline(smc, reason_code);
		if (rc < 0)
			goto out_err;
	}
	goto out_connected;

out_err_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
	smc_conn_free(&smc->conn);
out_err:
	/* drop the sock_hold taken above if no passive closing follows */
	if (smc->sk.sk_state == SMC_INIT)
		sock_put(&smc->sk); /* passive closing */
	return rc;
}
546 
/* connect() handler: validates the address, connects the internal
 * CLC/TCP socket and then performs the SMC/RDMA handshake via
 * smc_connect_rdma(). Only IPv4 is supported here.
 */
static int smc_connect(struct socket *sock, struct sockaddr *addr,
		       int alen, int flags)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;

	smc = smc_sk(sk);

	/* separate smc parameter checking to be safe */
	if (alen < sizeof(addr->sa_family))
		goto out_err;
	if (addr->sa_family != AF_INET)
		goto out_err;
	smc->addr = addr;	/* needed for nonblocking connect */

	lock_sock(sk);
	switch (sk->sk_state) {
	default:
		goto out;
	case SMC_ACTIVE:
		rc = -EISCONN;
		goto out;
	case SMC_INIT:
		rc = 0;
		break;
	}

	smc_copy_sock_settings_to_clc(smc);
	/* request SMC capability in the TCP SYN */
	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
	rc = kernel_connect(smc->clcsock, addr, alen, flags);
	if (rc)
		goto out;

	/* setup RDMA connection */
	rc = smc_connect_rdma(smc);
	if (rc < 0)
		goto out;
	else
		rc = 0; /* success cases including fallback */

out:
	release_sock(sk);
out_err:
	return rc;
}
593 
/* accept one connection on the internal CLC/TCP listen socket and wrap
 * it in a freshly allocated SMC socket. Called with the listen sock
 * locked; drops the lock around the blocking kernel_accept(). On error
 * *new_smc is set to NULL and lsk->sk_err is updated.
 */
static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
{
	struct socket *new_clcsock = NULL;
	struct sock *lsk = &lsmc->sk;
	struct sock *new_sk;
	int rc;

	release_sock(lsk);
	new_sk = smc_sock_alloc(sock_net(lsk), NULL);
	if (!new_sk) {
		rc = -ENOMEM;
		lsk->sk_err = ENOMEM;
		*new_smc = NULL;
		lock_sock(lsk);
		goto out;
	}
	*new_smc = smc_sk(new_sk);

	rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
	lock_sock(lsk);
	if  (rc < 0)
		lsk->sk_err = -rc;
	if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
		/* accept failed or listen socket closed meanwhile:
		 * dispose of the just-created child socket
		 */
		if (new_clcsock)
			sock_release(new_clcsock);
		new_sk->sk_state = SMC_CLOSED;
		sock_set_flag(new_sk, SOCK_DEAD);
		new_sk->sk_prot->unhash(new_sk);
		sock_put(new_sk); /* final */
		*new_smc = NULL;
		goto out;
	}

	(*new_smc)->clcsock = new_clcsock;
out:
	return rc;
}
631 
632 /* add a just created sock to the accept queue of the listen sock as
633  * candidate for a following socket accept call from user space
634  */
/* append @sk to @parent's accept queue; takes an extra reference on @sk
 * that is dropped again in smc_accept_unlink()
 */
static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
{
	struct smc_sock *par = smc_sk(parent);

	sock_hold(sk); /* sock_put in smc_accept_unlink () */
	spin_lock(&par->accept_q_lock);
	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
	spin_unlock(&par->accept_q_lock);
	sk_acceptq_added(parent);
}
645 
646 /* remove a socket from the accept queue of its parental listening socket */
647 static void smc_accept_unlink(struct sock *sk)
648 {
649 	struct smc_sock *par = smc_sk(sk)->listen_smc;
650 
651 	spin_lock(&par->accept_q_lock);
652 	list_del_init(&smc_sk(sk)->accept_q);
653 	spin_unlock(&par->accept_q_lock);
654 	sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
655 	sock_put(sk); /* sock_hold in smc_accept_enqueue */
656 }
657 
658 /* remove a sock from the accept queue to bind it to a new socket created
659  * for a socket accept call from user space
660  */
/* take the first usable child socket off @parent's accept queue and
 * graft it onto @new_sock (if given). Children that went to SMC_CLOSED
 * while queued are disposed of on the way. Returns NULL if the queue
 * holds no usable socket.
 */
struct sock *smc_accept_dequeue(struct sock *parent,
				struct socket *new_sock)
{
	struct smc_sock *isk, *n;
	struct sock *new_sk;

	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
		new_sk = (struct sock *)isk;

		smc_accept_unlink(new_sk);
		if (new_sk->sk_state == SMC_CLOSED) {
			/* child already closed: free its resources and
			 * keep scanning the queue
			 */
			if (isk->clcsock) {
				sock_release(isk->clcsock);
				isk->clcsock = NULL;
			}
			new_sk->sk_prot->unhash(new_sk);
			sock_put(new_sk); /* final */
			continue;
		}
		if (new_sock)
			sock_graft(new_sk, new_sock);
		return new_sk;
	}
	return NULL;
}
686 
687 /* clean up for a created but never accepted sock */
/* clean up for a created but never accepted sock */
void smc_close_non_accepted(struct sock *sk)
{
	struct smc_sock *smc = smc_sk(sk);

	lock_sock(sk);
	if (!sk->sk_lingertime)
		/* wait for peer closing */
		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
	if (!smc->use_fallback) {
		smc_close_active(smc);
		sock_set_flag(sk, SOCK_DEAD);
		sk->sk_shutdown |= SHUTDOWN_MASK;
	}
	if (smc->clcsock) {
		/* clear the pointer before releasing so concurrent users
		 * do not see a stale clcsock
		 */
		struct socket *tcp;

		tcp = smc->clcsock;
		smc->clcsock = NULL;
		sock_release(tcp);
	}
	if (smc->use_fallback) {
		sock_put(sk); /* passive closing */
		sk->sk_state = SMC_CLOSED;
	} else {
		if (sk->sk_state == SMC_CLOSED)
			smc_conn_free(&smc->conn);
	}
	release_sock(sk);
	sk->sk_prot->unhash(sk);
	sock_put(sk); /* final sock_put */
}
719 
/* server side of the CONFIRM LINK handshake for the first connection of
 * a new link group. Returns 0 on success, a positive SMC_CLC_DECL_*
 * reason code on protocol failure, or the result of waiting for a
 * peer DECLINE on timeout/interrupt.
 */
static int smc_serv_conf_first_link(struct smc_sock *smc)
{
	struct smc_link_group *lgr = smc->conn.lgr;
	struct smc_link *link;
	int rest;
	int rc;

	link = &lgr->lnk[SMC_SINGLE_LINK];

	/* register the RMB memory region with the peer */
	rc = smc_wr_reg_send(link,
			     smc->conn.rmb_desc->mr_rx[SMC_SINGLE_LINK]);
	if (rc)
		return SMC_CLC_DECL_INTERR;

	/* send CONFIRM LINK request to client over the RoCE fabric */
	rc = smc_llc_send_confirm_link(link,
				       link->smcibdev->mac[link->ibport - 1],
				       &link->smcibdev->gid[link->ibport - 1],
				       SMC_LLC_REQ);
	if (rc < 0)
		return SMC_CLC_DECL_TCL;

	/* receive CONFIRM LINK response from client over the RoCE fabric */
	rest = wait_for_completion_interruptible_timeout(
		&link->llc_confirm_resp,
		SMC_LLC_WAIT_FIRST_TIME);
	if (rest <= 0) {
		/* timed out or interrupted: expect the peer to decline */
		struct smc_clc_msg_decline dclc;

		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
				      SMC_CLC_DECLINE);
	}

	return rc;
}
755 
756 /* setup for RDMA connection of server */
/* setup for RDMA connection of server
 * Work item running the server side of the CLC handshake for one
 * accepted connection. On any failure that allows it, declines SMC and
 * falls back to plain TCP; the new socket is always enqueued on the
 * listen socket's accept queue at the end (possibly with sk_err set).
 */
static void smc_listen_work(struct work_struct *work)
{
	struct smc_sock *new_smc = container_of(work, struct smc_sock,
						smc_listen_work);
	struct smc_clc_msg_proposal_prefix *pclc_prfx;
	struct socket *newclcsock = new_smc->clcsock;
	struct smc_sock *lsmc = new_smc->listen_smc;
	struct smc_clc_msg_accept_confirm cclc;
	int local_contact = SMC_REUSE_CONTACT;
	struct sock *newsmcsk = &new_smc->sk;
	struct smc_clc_msg_proposal *pclc;
	struct smc_ib_device *smcibdev;
	struct sockaddr_in peeraddr;
	u8 buf[SMC_CLC_MAX_LEN];
	struct smc_link *link;
	int reason_code = 0;
	int rc = 0;
	__be32 subnet;
	u8 prefix_len;
	u8 ibport;

	/* check if peer is smc capable */
	if (!tcp_sk(newclcsock->sk)->syn_smc) {
		new_smc->use_fallback = true;
		goto out_connected;
	}

	/* do inband token exchange -
	 *wait for and receive SMC Proposal CLC message
	 */
	reason_code = smc_clc_wait_msg(new_smc, &buf, sizeof(buf),
				       SMC_CLC_PROPOSAL);
	if (reason_code < 0)
		goto out_err;
	if (reason_code > 0)
		goto decline_rdma;

	/* IPSec connections opt out of SMC-R optimizations */
	if (using_ipsec(new_smc)) {
		reason_code = SMC_CLC_DECL_IPSEC;
		goto decline_rdma;
	}

	/* PNET table look up: search active ib_device and port
	 * within same PNETID that also contains the ethernet device
	 * used for the internal TCP socket
	 */
	smc_pnet_find_roce_resource(newclcsock->sk, &smcibdev, &ibport);
	if (!smcibdev) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* determine subnet and mask from internal TCP socket */
	rc = smc_netinfo_by_tcpsk(newclcsock, &subnet, &prefix_len);
	if (rc) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* the proposed prefix must match our own subnet */
	pclc = (struct smc_clc_msg_proposal *)&buf;
	pclc_prfx = smc_clc_proposal_get_prefix(pclc);
	if (pclc_prfx->outgoing_subnet != subnet ||
	    pclc_prfx->prefix_len != prefix_len) {
		reason_code = SMC_CLC_DECL_CNFERR; /* configuration error */
		goto decline_rdma;
	}

	/* get address of the peer connected to the internal TCP socket */
	kernel_getpeername(newclcsock, (struct sockaddr *)&peeraddr);

	/* allocate connection / link group */
	mutex_lock(&smc_create_lgr_pending);
	local_contact = smc_conn_create(new_smc, peeraddr.sin_addr.s_addr,
					smcibdev, ibport, &pclc->lcl, 0);
	if (local_contact < 0) {
		rc = local_contact;
		if (rc == -ENOMEM)
			reason_code = SMC_CLC_DECL_MEM;/* insufficient memory*/
		goto decline_rdma_unlock;
	}
	link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];

	/* create send buffer and rmb */
	rc = smc_buf_create(new_smc);
	if (rc) {
		reason_code = SMC_CLC_DECL_MEM;
		goto decline_rdma_unlock;
	}

	smc_close_init(new_smc);
	smc_rx_init(new_smc);

	if (local_contact != SMC_FIRST_CONTACT) {
		struct smc_buf_desc *buf_desc = new_smc->conn.rmb_desc;

		if (!buf_desc->reused) {
			/* register memory region for new rmb */
			rc = smc_wr_reg_send(link,
					     buf_desc->mr_rx[SMC_SINGLE_LINK]);
			if (rc) {
				reason_code = SMC_CLC_DECL_INTERR;
				goto decline_rdma_unlock;
			}
		}
	}
	smc_rmb_sync_sg_for_device(&new_smc->conn);

	rc = smc_clc_send_accept(new_smc, local_contact);
	if (rc)
		goto out_err_unlock;

	/* receive SMC Confirm CLC message */
	reason_code = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
				       SMC_CLC_CONFIRM);
	if (reason_code < 0)
		goto out_err_unlock;
	if (reason_code > 0)
		goto decline_rdma_unlock;
	smc_conn_save_peer_info(new_smc, &cclc);
	if (local_contact == SMC_FIRST_CONTACT)
		smc_link_save_peer_info(link, &cclc);

	rc = smc_rmb_rtoken_handling(&new_smc->conn, &cclc);
	if (rc) {
		reason_code = SMC_CLC_DECL_INTERR;
		goto decline_rdma_unlock;
	}

	if (local_contact == SMC_FIRST_CONTACT) {
		rc = smc_ib_ready_link(link);
		if (rc) {
			reason_code = SMC_CLC_DECL_INTERR;
			goto decline_rdma_unlock;
		}
		/* QP confirmation over RoCE fabric */
		reason_code = smc_serv_conf_first_link(new_smc);
		if (reason_code < 0)
			/* peer is not aware of a problem */
			goto out_err_unlock;
		if (reason_code > 0)
			goto decline_rdma_unlock;
	}

	smc_tx_init(new_smc);
	mutex_unlock(&smc_create_lgr_pending);

out_connected:
	sk_refcnt_debug_inc(newsmcsk);
	if (newsmcsk->sk_state == SMC_INIT)
		newsmcsk->sk_state = SMC_ACTIVE;
enqueue:
	lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
	if (lsmc->sk.sk_state == SMC_LISTEN) {
		smc_accept_enqueue(&lsmc->sk, newsmcsk);
	} else { /* no longer listening */
		smc_close_non_accepted(newsmcsk);
	}
	release_sock(&lsmc->sk);

	/* Wake up accept */
	lsmc->sk.sk_data_ready(&lsmc->sk);
	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
	return;

decline_rdma_unlock:
	/* a newly created link group must not be reused after failure */
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(new_smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
decline_rdma:
	/* RDMA setup failed, switch back to TCP */
	smc_conn_free(&new_smc->conn);
	new_smc->use_fallback = true;
	if (reason_code && (reason_code != SMC_CLC_DECL_REPLY)) {
		if (smc_clc_send_decline(new_smc, reason_code) < 0)
			goto out_err;
	}
	goto out_connected;

out_err_unlock:
	if (local_contact == SMC_FIRST_CONTACT)
		smc_lgr_forget(new_smc->conn.lgr);
	mutex_unlock(&smc_create_lgr_pending);
out_err:
	if (newsmcsk->sk_state == SMC_INIT)
		sock_put(&new_smc->sk); /* passive closing */
	newsmcsk->sk_state = SMC_CLOSED;
	smc_conn_free(&new_smc->conn);
	goto enqueue; /* queue new sock with sk_err set */
}
947 
/* work item accepting connections on the internal CLC/TCP listen socket
 * for as long as the SMC socket stays in SMC_LISTEN; schedules one
 * smc_listen_work per accepted connection
 */
static void smc_tcp_listen_work(struct work_struct *work)
{
	struct smc_sock *lsmc = container_of(work, struct smc_sock,
					     tcp_listen_work);
	struct sock *lsk = &lsmc->sk;
	struct smc_sock *new_smc;
	int rc = 0;

	lock_sock(lsk);
	while (lsk->sk_state == SMC_LISTEN) {
		rc = smc_clcsock_accept(lsmc, &new_smc);
		if (rc)
			goto out;
		if (!new_smc)
			continue;

		new_smc->listen_smc = lsmc;
		new_smc->use_fallback = false; /* assume rdma capability first*/
		sock_hold(lsk); /* sock_put in smc_listen_work */
		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
		smc_copy_sock_settings_to_smc(new_smc);
		sock_hold(&new_smc->sk); /* sock_put in passive closing */
		if (!schedule_work(&new_smc->smc_listen_work))
			sock_put(&new_smc->sk);
	}

out:
	if (lsmc->clcsock) {
		sock_release(lsmc->clcsock);
		lsmc->clcsock = NULL;
	}
	release_sock(lsk);
	/* no more listening, wake up smc_close_wait_listen_clcsock and
	 * accept
	 */
	lsk->sk_state_change(lsk);
	sock_put(&lsmc->sk); /* sock_hold in smc_listen */
}
986 
/* listen() handler: puts the internal CLC/TCP socket into listen mode
 * and kicks off the tcp_listen_work that accepts connections
 */
static int smc_listen(struct socket *sock, int backlog)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc;

	smc = smc_sk(sk);
	lock_sock(sk);

	rc = -EINVAL;
	if ((sk->sk_state != SMC_INIT) && (sk->sk_state != SMC_LISTEN))
		goto out;

	rc = 0;
	if (sk->sk_state == SMC_LISTEN) {
		/* already listening: only the backlog can change */
		sk->sk_max_ack_backlog = backlog;
		goto out;
	}
	/* some socket options are handled in core, so we could not apply
	 * them to the clc socket -- copy smc socket options to clc socket
	 */
	smc_copy_sock_settings_to_clc(smc);
	tcp_sk(smc->clcsock->sk)->syn_smc = 1;

	rc = kernel_listen(smc->clcsock, backlog);
	if (rc)
		goto out;
	sk->sk_max_ack_backlog = backlog;
	sk->sk_ack_backlog = 0;
	sk->sk_state = SMC_LISTEN;
	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
	sock_hold(sk); /* sock_hold in tcp_listen_worker */
	if (!schedule_work(&smc->tcp_listen_work))
		sock_put(sk);

out:
	release_sock(sk);
	return rc;
}
1026 
/* accept() handler: waits (interruptibly, honoring SO_RCVTIMEO and
 * O_NONBLOCK) until smc_listen_work has queued a child socket, then
 * dequeues and grafts it onto @new_sock
 */
static int smc_accept(struct socket *sock, struct socket *new_sock,
		      int flags, bool kern)
{
	struct sock *sk = sock->sk, *nsk;
	DECLARE_WAITQUEUE(wait, current);
	struct smc_sock *lsmc;
	long timeo;
	int rc = 0;

	lsmc = smc_sk(sk);
	sock_hold(sk); /* sock_put below */
	lock_sock(sk);

	if (lsmc->sk.sk_state != SMC_LISTEN) {
		rc = -EINVAL;
		goto out;
	}

	/* Wait for an incoming connection */
	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
	add_wait_queue_exclusive(sk_sleep(sk), &wait);
	while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (!timeo) {
			rc = -EAGAIN;
			break;
		}
		release_sock(sk);
		timeo = schedule_timeout(timeo);
		/* wakeup by sk_data_ready in smc_listen_work() */
		sched_annotate_sleep();
		lock_sock(sk);
		if (signal_pending(current)) {
			rc = sock_intr_errno(timeo);
			break;
		}
	}
	set_current_state(TASK_RUNNING);
	remove_wait_queue(sk_sleep(sk), &wait);

	if (!rc)
		rc = sock_error(nsk);

out:
	release_sock(sk);
	sock_put(sk); /* sock_hold above */
	return rc;
}
1075 
/* getsockname()/getpeername() handler: addresses live on the internal
 * CLC/TCP socket, so delegate there; peer names are only available in
 * connected states
 */
static int smc_getname(struct socket *sock, struct sockaddr *addr,
		       int peer)
{
	struct smc_sock *smc;

	if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
	    (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
		return -ENOTCONN;

	smc = smc_sk(sock->sk);

	return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
}
1089 
/* sendmsg() handler: dispatches to the TCP fallback socket or to the
 * SMC transmit path depending on use_fallback
 */
static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EPIPE;

	smc = smc_sk(sk);
	lock_sock(sk);
	if ((sk->sk_state != SMC_ACTIVE) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
	    (sk->sk_state != SMC_INIT))
		goto out;
	if (smc->use_fallback)
		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
	else
		rc = smc_tx_sendmsg(smc, msg, len);
out:
	release_sock(sk);
	return rc;
}
1110 
1111 static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
1112 		       int flags)
1113 {
1114 	struct sock *sk = sock->sk;
1115 	struct smc_sock *smc;
1116 	int rc = -ENOTCONN;
1117 
1118 	smc = smc_sk(sk);
1119 	lock_sock(sk);
1120 	if ((sk->sk_state == SMC_INIT) ||
1121 	    (sk->sk_state == SMC_LISTEN) ||
1122 	    (sk->sk_state == SMC_CLOSED))
1123 		goto out;
1124 
1125 	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1126 		rc = 0;
1127 		goto out;
1128 	}
1129 
1130 	if (smc->use_fallback)
1131 		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
1132 	else
1133 		rc = smc_rx_recvmsg(smc, msg, len, flags);
1134 
1135 out:
1136 	release_sock(sk);
1137 	return rc;
1138 }
1139 
1140 static __poll_t smc_accept_poll(struct sock *parent)
1141 {
1142 	struct smc_sock *isk = smc_sk(parent);
1143 	__poll_t mask = 0;
1144 
1145 	spin_lock(&isk->accept_q_lock);
1146 	if (!list_empty(&isk->accept_q))
1147 		mask = EPOLLIN | EPOLLRDNORM;
1148 	spin_unlock(&isk->accept_q_lock);
1149 
1150 	return mask;
1151 }
1152 
1153 static __poll_t smc_poll(struct file *file, struct socket *sock,
1154 			     poll_table *wait)
1155 {
1156 	struct sock *sk = sock->sk;
1157 	__poll_t mask = 0;
1158 	struct smc_sock *smc;
1159 	int rc;
1160 
1161 	if (!sk)
1162 		return EPOLLNVAL;
1163 
1164 	smc = smc_sk(sock->sk);
1165 	sock_hold(sk);
1166 	lock_sock(sk);
1167 	if ((sk->sk_state == SMC_INIT) || smc->use_fallback) {
1168 		/* delegate to CLC child sock */
1169 		release_sock(sk);
1170 		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
1171 		/* if non-blocking connect finished ... */
1172 		lock_sock(sk);
1173 		if ((sk->sk_state == SMC_INIT) && (mask & EPOLLOUT)) {
1174 			sk->sk_err = smc->clcsock->sk->sk_err;
1175 			if (sk->sk_err) {
1176 				mask |= EPOLLERR;
1177 			} else {
1178 				rc = smc_connect_rdma(smc);
1179 				if (rc < 0)
1180 					mask |= EPOLLERR;
1181 				/* success cases including fallback */
1182 				mask |= EPOLLOUT | EPOLLWRNORM;
1183 			}
1184 		}
1185 	} else {
1186 		if (sk->sk_state != SMC_CLOSED) {
1187 			release_sock(sk);
1188 			sock_poll_wait(file, sk_sleep(sk), wait);
1189 			lock_sock(sk);
1190 		}
1191 		if (sk->sk_err)
1192 			mask |= EPOLLERR;
1193 		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
1194 		    (sk->sk_state == SMC_CLOSED))
1195 			mask |= EPOLLHUP;
1196 		if (sk->sk_state == SMC_LISTEN) {
1197 			/* woken up by sk_data_ready in smc_listen_work() */
1198 			mask = smc_accept_poll(sk);
1199 		} else {
1200 			if (atomic_read(&smc->conn.sndbuf_space) ||
1201 			    sk->sk_shutdown & SEND_SHUTDOWN) {
1202 				mask |= EPOLLOUT | EPOLLWRNORM;
1203 			} else {
1204 				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1205 				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1206 			}
1207 			if (atomic_read(&smc->conn.bytes_to_rcv))
1208 				mask |= EPOLLIN | EPOLLRDNORM;
1209 			if (sk->sk_shutdown & RCV_SHUTDOWN)
1210 				mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
1211 			if (sk->sk_state == SMC_APPCLOSEWAIT1)
1212 				mask |= EPOLLIN;
1213 		}
1214 
1215 	}
1216 	release_sock(sk);
1217 	sock_put(sk);
1218 
1219 	return mask;
1220 }
1221 
/* smc_shutdown() - shut down all or part of a full-duplex SMC connection
 * @sock: SMC socket
 * @how:  SHUT_RD, SHUT_WR or SHUT_RDWR
 *
 * Fallback (TCP) sockets delegate entirely to the internal CLC socket.
 * Native SMC sockets drive the SMC close protocol first, then shut down
 * the CLC socket as well.
 *
 * Returns 0 on success, -EINVAL for an invalid @how, -ENOTCONN when the
 * socket state does not allow a shutdown, or the first error from the
 * close protocol / CLC shutdown.
 */
static int smc_shutdown(struct socket *sock, int how)
{
	struct sock *sk = sock->sk;
	struct smc_sock *smc;
	int rc = -EINVAL;
	int rc1 = 0;

	smc = smc_sk(sk);

	if ((how < SHUT_RD) || (how > SHUT_RDWR))
		return rc;

	lock_sock(sk);

	rc = -ENOTCONN;
	/* shutdown only makes sense while a connection (or listen socket)
	 * still exists in some form
	 */
	if ((sk->sk_state != SMC_LISTEN) &&
	    (sk->sk_state != SMC_ACTIVE) &&
	    (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
	    (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
	    (sk->sk_state != SMC_APPCLOSEWAIT2) &&
	    (sk->sk_state != SMC_APPFINCLOSEWAIT))
		goto out;
	if (smc->use_fallback) {
		/* mirror the TCP socket's resulting shutdown state */
		rc = kernel_sock_shutdown(smc->clcsock, how);
		sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
		if (sk->sk_shutdown == SHUTDOWN_MASK)
			sk->sk_state = SMC_CLOSED;
		goto out;
	}
	switch (how) {
	case SHUT_RDWR:		/* shutdown in both directions */
		rc = smc_close_active(smc);
		break;
	case SHUT_WR:
		rc = smc_close_shutdown_write(smc);
		break;
	case SHUT_RD:
		if (sk->sk_state == SMC_LISTEN)
			rc = smc_close_active(smc);
		else
			rc = 0;
		/* for SHUT_RD nothing more to do: peer is not involved */
		break;
	}
	rc1 = kernel_sock_shutdown(smc->clcsock, how);
	/* map sock_shutdown_cmd constants to sk_shutdown value range */
	sk->sk_shutdown |= how + 1;

out:
	release_sock(sk);
	return rc ? rc : rc1;
}
1275 
1276 static int smc_setsockopt(struct socket *sock, int level, int optname,
1277 			  char __user *optval, unsigned int optlen)
1278 {
1279 	struct sock *sk = sock->sk;
1280 	struct smc_sock *smc;
1281 
1282 	smc = smc_sk(sk);
1283 
1284 	/* generic setsockopts reaching us here always apply to the
1285 	 * CLC socket
1286 	 */
1287 	return smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
1288 					     optval, optlen);
1289 }
1290 
1291 static int smc_getsockopt(struct socket *sock, int level, int optname,
1292 			  char __user *optval, int __user *optlen)
1293 {
1294 	struct smc_sock *smc;
1295 
1296 	smc = smc_sk(sock->sk);
1297 	/* socket options apply to the CLC socket */
1298 	return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
1299 					     optval, optlen);
1300 }
1301 
1302 static int smc_ioctl(struct socket *sock, unsigned int cmd,
1303 		     unsigned long arg)
1304 {
1305 	struct smc_sock *smc;
1306 
1307 	smc = smc_sk(sock->sk);
1308 	if (smc->use_fallback)
1309 		return smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
1310 	else
1311 		return sock_no_ioctl(sock, cmd, arg);
1312 }
1313 
1314 static ssize_t smc_sendpage(struct socket *sock, struct page *page,
1315 			    int offset, size_t size, int flags)
1316 {
1317 	struct sock *sk = sock->sk;
1318 	struct smc_sock *smc;
1319 	int rc = -EPIPE;
1320 
1321 	smc = smc_sk(sk);
1322 	lock_sock(sk);
1323 	if (sk->sk_state != SMC_ACTIVE)
1324 		goto out;
1325 	if (smc->use_fallback)
1326 		rc = kernel_sendpage(smc->clcsock, page, offset,
1327 				     size, flags);
1328 	else
1329 		rc = sock_no_sendpage(sock, page, offset, size, flags);
1330 
1331 out:
1332 	release_sock(sk);
1333 	return rc;
1334 }
1335 
1336 static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
1337 			       struct pipe_inode_info *pipe, size_t len,
1338 				    unsigned int flags)
1339 {
1340 	struct sock *sk = sock->sk;
1341 	struct smc_sock *smc;
1342 	int rc = -ENOTCONN;
1343 
1344 	smc = smc_sk(sk);
1345 	lock_sock(sk);
1346 	if ((sk->sk_state != SMC_ACTIVE) && (sk->sk_state != SMC_CLOSED))
1347 		goto out;
1348 	if (smc->use_fallback) {
1349 		rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
1350 						    pipe, len, flags);
1351 	} else {
1352 		rc = -EOPNOTSUPP;
1353 	}
1354 out:
1355 	release_sock(sk);
1356 	return rc;
1357 }
1358 
/* must look like tcp so existing TCP applications can be switched to
 * AF_SMC transparently; unsupported operations use sock_no_* stubs
 */
static const struct proto_ops smc_sock_ops = {
	.family		= PF_SMC,
	.owner		= THIS_MODULE,
	.release	= smc_release,
	.bind		= smc_bind,
	.connect	= smc_connect,
	.socketpair	= sock_no_socketpair,	/* not supported */
	.accept		= smc_accept,
	.getname	= smc_getname,
	.poll		= smc_poll,
	.ioctl		= smc_ioctl,
	.listen		= smc_listen,
	.shutdown	= smc_shutdown,
	.setsockopt	= smc_setsockopt,
	.getsockopt	= smc_getsockopt,
	.sendmsg	= smc_sendmsg,
	.recvmsg	= smc_recvmsg,
	.mmap		= sock_no_mmap,		/* not supported */
	.sendpage	= smc_sendpage,
	.splice_read	= smc_splice_read,
};
1381 
1382 static int smc_create(struct net *net, struct socket *sock, int protocol,
1383 		      int kern)
1384 {
1385 	struct smc_sock *smc;
1386 	struct sock *sk;
1387 	int rc;
1388 
1389 	rc = -ESOCKTNOSUPPORT;
1390 	if (sock->type != SOCK_STREAM)
1391 		goto out;
1392 
1393 	rc = -EPROTONOSUPPORT;
1394 	if ((protocol != IPPROTO_IP) && (protocol != IPPROTO_TCP))
1395 		goto out;
1396 
1397 	rc = -ENOBUFS;
1398 	sock->ops = &smc_sock_ops;
1399 	sk = smc_sock_alloc(net, sock);
1400 	if (!sk)
1401 		goto out;
1402 
1403 	/* create internal TCP socket for CLC handshake and fallback */
1404 	smc = smc_sk(sk);
1405 	smc->use_fallback = false; /* assume rdma capability first */
1406 	rc = sock_create_kern(net, PF_INET, SOCK_STREAM,
1407 			      IPPROTO_TCP, &smc->clcsock);
1408 	if (rc)
1409 		sk_common_release(sk);
1410 	smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
1411 	smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
1412 
1413 out:
1414 	return rc;
1415 }
1416 
/* PF_SMC family registration; smc_create() builds new SMC sockets */
static const struct net_proto_family smc_sock_family_ops = {
	.family	= PF_SMC,
	.owner	= THIS_MODULE,
	.create	= smc_create,
};
1422 
/* smc_init() - module initialization
 *
 * Registers, in order: pnet table netlink interface, LLC and CDC
 * handlers, the SMC proto, the PF_SMC socket family, and the IB client.
 * On failure the already-completed steps are unwound via the goto
 * chain.  Finally enables the tcp_have_smc static branch so TCP
 * advertises SMC capability.
 *
 * Returns 0 on success or the first registration error.
 */
static int __init smc_init(void)
{
	int rc;

	rc = smc_pnet_init();
	if (rc)
		return rc;

	rc = smc_llc_init();
	if (rc) {
		pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = smc_cdc_init();
	if (rc) {
		pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = proto_register(&smc_proto, 1);
	if (rc) {
		pr_err("%s: proto_register fails with %d\n", __func__, rc);
		goto out_pnet;
	}

	rc = sock_register(&smc_sock_family_ops);
	if (rc) {
		pr_err("%s: sock_register fails with %d\n", __func__, rc);
		goto out_proto;
	}
	INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);

	rc = smc_ib_register_client();
	if (rc) {
		pr_err("%s: ib_register fails with %d\n", __func__, rc);
		goto out_sock;
	}

	static_branch_enable(&tcp_have_smc);
	return 0;

out_sock:
	sock_unregister(PF_SMC);
out_proto:
	proto_unregister(&smc_proto);
out_pnet:
	smc_pnet_exit();
	return rc;
}
1473 
/* smc_exit() - module cleanup
 *
 * Detaches all remaining link groups from the global list under the
 * lock, frees them outside the lock, then unregisters everything in
 * reverse order of smc_init().
 *
 * NOTE(review): if link-group freeing can also be deferred to a work
 * item in smc_core, that work should be cancelled (e.g.
 * cancel_delayed_work_sync()) before smc_lgr_free() here to avoid a
 * use-after-free race on module unload — confirm against smc_core.c.
 */
static void __exit smc_exit(void)
{
	struct smc_link_group *lgr, *lg;
	LIST_HEAD(lgr_freeing_list);

	/* splice the global list to a private one under the lock ... */
	spin_lock_bh(&smc_lgr_list.lock);
	if (!list_empty(&smc_lgr_list.list))
		list_splice_init(&smc_lgr_list.list, &lgr_freeing_list);
	spin_unlock_bh(&smc_lgr_list.lock);
	/* ... and free the link groups without holding the lock */
	list_for_each_entry_safe(lgr, lg, &lgr_freeing_list, list) {
		list_del_init(&lgr->list);
		smc_lgr_free(lgr); /* free link group */
	}
	static_branch_disable(&tcp_have_smc);
	smc_ib_unregister_client();
	sock_unregister(PF_SMC);
	proto_unregister(&smc_proto);
	smc_pnet_exit();
}
1493 
/* module entry/exit points and module metadata */
module_init(smc_init);
module_exit(smc_exit);

MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
MODULE_DESCRIPTION("smc socket address family");
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_SMC);
1501