xref: /linux/net/smc/af_smc.c (revision 06b9cce42634a50f2840777a66553b02320db5ef)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
4  *
5  *  AF_SMC protocol family socket handler keeping the AF_INET sock address type;
6  *  applies to SOCK_STREAM sockets only;
7  *  offers an alternative communication option for TCP-protocol sockets,
8  *  applicable with RoCE cards only
9  *
10  *  Initial restrictions:
11  *    - support for alternate links postponed
12  *
13  *  Copyright IBM Corp. 2016, 2018
14  *
15  *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
16  *              based on prototype from Frank Blaschka
17  */
18 
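/*
 * Usage sketch (illustrative only, not part of this file): from user space
 * an SMC socket is created and used like a TCP socket; only the address
 * family and protocol identifiers differ:
 *
 *	int fd = socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC); // or SMCPROTO_SMC6
 *	struct sockaddr_in sin = { .sin_family = AF_INET }; // plus addr/port
 *	connect(fd, (struct sockaddr *)&sin, sizeof(sin));
 *
 * If the SMC handshake cannot be completed, the connection transparently
 * falls back to plain TCP (see smc_switch_to_fallback() below).
 */
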
19 #define KMSG_COMPONENT "smc"
20 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
21 
22 #include <linux/module.h>
23 #include <linux/socket.h>
24 #include <linux/workqueue.h>
25 #include <linux/in.h>
26 #include <linux/sched/signal.h>
27 #include <linux/if_vlan.h>
28 #include <linux/rcupdate_wait.h>
29 #include <linux/ctype.h>
30 
31 #include <net/sock.h>
32 #include <net/tcp.h>
33 #include <net/smc.h>
34 #include <asm/ioctls.h>
35 
36 #include <net/net_namespace.h>
37 #include <net/netns/generic.h>
38 #include "smc_netns.h"
39 
40 #include "smc.h"
41 #include "smc_clc.h"
42 #include "smc_llc.h"
43 #include "smc_cdc.h"
44 #include "smc_core.h"
45 #include "smc_ib.h"
46 #include "smc_ism.h"
47 #include "smc_pnet.h"
48 #include "smc_netlink.h"
49 #include "smc_tx.h"
50 #include "smc_rx.h"
51 #include "smc_close.h"
52 #include "smc_stats.h"
53 #include "smc_tracepoint.h"
54 
55 static DEFINE_MUTEX(smc_server_lgr_pending);	/* serialize link group
56 						 * creation on server
57 						 */
58 static DEFINE_MUTEX(smc_client_lgr_pending);	/* serialize link group
59 						 * creation on client
60 						 */
61 
62 static struct workqueue_struct	*smc_tcp_ls_wq;	/* wq for tcp listen work */
63 struct workqueue_struct	*smc_hs_wq;	/* wq for handshake work */
64 struct workqueue_struct	*smc_close_wq;	/* wq for close work */
65 
66 static void smc_tcp_listen_work(struct work_struct *);
67 static void smc_connect_work(struct work_struct *);
68 
69 int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb)
70 {
71 	struct smc_nl_dmp_ctx *cb_ctx = smc_nl_dmp_ctx(cb);
72 	void *hdr;
73 
74 	if (cb_ctx->pos[0])
75 		goto out;
76 
77 	hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
78 			  &smc_gen_nl_family, NLM_F_MULTI,
79 			  SMC_NETLINK_DUMP_HS_LIMITATION);
80 	if (!hdr)
81 		return -ENOMEM;
82 
83 	if (nla_put_u8(skb, SMC_NLA_HS_LIMITATION_ENABLED,
84 		       sock_net(skb->sk)->smc.limit_smc_hs))
85 		goto err;
86 
87 	genlmsg_end(skb, hdr);
88 	cb_ctx->pos[0] = 1;
89 out:
90 	return skb->len;
91 err:
92 	genlmsg_cancel(skb, hdr);
93 	return -EMSGSIZE;
94 }
95 
96 int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info)
97 {
98 	sock_net(skb->sk)->smc.limit_smc_hs = true;
99 	return 0;
100 }
101 
102 int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info)
103 {
104 	sock_net(skb->sk)->smc.limit_smc_hs = false;
105 	return 0;
106 }
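
/*
 * The three handlers above expose the per-netns switch
 * net->smc.limit_smc_hs over the smc generic netlink family
 * (smc_gen_nl_family): one dump op reports the current state, two ops set
 * it. When the limit is enabled, listen sockets (set up later in this
 * file) use smc_tcp_syn_recv_sock() and smc_hs_congested() below to
 * throttle the number of concurrent SMC handshakes.
 */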
107 
108 static void smc_set_keepalive(struct sock *sk, int val)
109 {
110 	struct smc_sock *smc = smc_sk(sk);
111 
112 	smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
113 }
114 
115 static struct sock *smc_tcp_syn_recv_sock(const struct sock *sk,
116 					  struct sk_buff *skb,
117 					  struct request_sock *req,
118 					  struct dst_entry *dst,
119 					  struct request_sock *req_unhash,
120 					  bool *own_req)
121 {
122 	struct smc_sock *smc;
123 
124 	smc = smc_clcsock_user_data(sk);
125 
126 	if (READ_ONCE(sk->sk_ack_backlog) + atomic_read(&smc->queued_smc_hs) >
127 				sk->sk_max_ack_backlog)
128 		goto drop;
129 
130 	if (sk_acceptq_is_full(&smc->sk)) {
131 		NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
132 		goto drop;
133 	}
134 
135 	/* pass through to the original syn_recv_sock function */
136 	return smc->ori_af_ops->syn_recv_sock(sk, skb, req, dst, req_unhash,
137 					      own_req);
138 
139 drop:
140 	dst_release(dst);
141 	tcp_listendrop(sk);
142 	return NULL;
143 }
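
/*
 * Worked example for the backlog check above: with sk_max_ack_backlog ==
 * 128, 100 sockets in the TCP accept backlog and 30 SMC handshakes still
 * queued (queued_smc_hs), 100 + 30 > 128 and the SYN is dropped. Pending
 * SMC handshakes thus count against the listen backlog exactly like
 * established-but-unaccepted connections.
 */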
144 
145 static bool smc_hs_congested(const struct sock *sk)
146 {
147 	const struct smc_sock *smc;
148 
149 	smc = smc_clcsock_user_data(sk);
150 
151 	if (!smc)
152 		return true;
153 
154 	if (workqueue_congested(WORK_CPU_UNBOUND, smc_hs_wq))
155 		return true;
156 
157 	return false;
158 }
159 
160 static struct smc_hashinfo smc_v4_hashinfo = {
161 	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
162 };
163 
164 static struct smc_hashinfo smc_v6_hashinfo = {
165 	.lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
166 };
167 
168 int smc_hash_sk(struct sock *sk)
169 {
170 	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
171 	struct hlist_head *head;
172 
173 	head = &h->ht;
174 
175 	write_lock_bh(&h->lock);
176 	sk_add_node(sk, head);
177 	write_unlock_bh(&h->lock);
178 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
179 
180 	return 0;
181 }
182 EXPORT_SYMBOL_GPL(smc_hash_sk);
183 
184 void smc_unhash_sk(struct sock *sk)
185 {
186 	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
187 
188 	write_lock_bh(&h->lock);
189 	if (sk_del_node_init(sk))
190 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
191 	write_unlock_bh(&h->lock);
192 }
193 EXPORT_SYMBOL_GPL(smc_unhash_sk);
194 
195 struct proto smc_proto = {
196 	.name		= "SMC",
197 	.owner		= THIS_MODULE,
198 	.keepalive	= smc_set_keepalive,
199 	.hash		= smc_hash_sk,
200 	.unhash		= smc_unhash_sk,
201 	.obj_size	= sizeof(struct smc_sock),
202 	.h.smc_hash	= &smc_v4_hashinfo,
203 	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
204 };
205 EXPORT_SYMBOL_GPL(smc_proto);
206 
207 struct proto smc_proto6 = {
208 	.name		= "SMC6",
209 	.owner		= THIS_MODULE,
210 	.keepalive	= smc_set_keepalive,
211 	.hash		= smc_hash_sk,
212 	.unhash		= smc_unhash_sk,
213 	.obj_size	= sizeof(struct smc_sock),
214 	.h.smc_hash	= &smc_v6_hashinfo,
215 	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
216 };
217 EXPORT_SYMBOL_GPL(smc_proto6);
218 
219 static void smc_restore_fallback_changes(struct smc_sock *smc)
220 {
221 	if (smc->clcsock->file) { /* non-accepted sockets have no file yet */
222 		smc->clcsock->file->private_data = smc->sk.sk_socket;
223 		smc->clcsock->file = NULL;
224 	}
225 }
226 
227 static int __smc_release(struct smc_sock *smc)
228 {
229 	struct sock *sk = &smc->sk;
230 	int rc = 0;
231 
232 	if (!smc->use_fallback) {
233 		rc = smc_close_active(smc);
234 		sock_set_flag(sk, SOCK_DEAD);
235 		sk->sk_shutdown |= SHUTDOWN_MASK;
236 	} else {
237 		if (sk->sk_state != SMC_CLOSED) {
238 			if (sk->sk_state != SMC_LISTEN &&
239 			    sk->sk_state != SMC_INIT)
240 				sock_put(sk); /* passive closing */
241 			if (sk->sk_state == SMC_LISTEN) {
242 				/* wake up clcsock accept */
243 				rc = kernel_sock_shutdown(smc->clcsock,
244 							  SHUT_RDWR);
245 			}
246 			sk->sk_state = SMC_CLOSED;
247 			sk->sk_state_change(sk);
248 		}
249 		smc_restore_fallback_changes(smc);
250 	}
251 
252 	sk->sk_prot->unhash(sk);
253 
254 	if (sk->sk_state == SMC_CLOSED) {
255 		if (smc->clcsock) {
256 			release_sock(sk);
257 			smc_clcsock_release(smc);
258 			lock_sock(sk);
259 		}
260 		if (!smc->use_fallback)
261 			smc_conn_free(&smc->conn);
262 	}
263 
264 	return rc;
265 }
266 
267 static int smc_release(struct socket *sock)
268 {
269 	struct sock *sk = sock->sk;
270 	struct smc_sock *smc;
271 	int rc = 0;
272 
273 	if (!sk)
274 		goto out;
275 
276 	sock_hold(sk); /* sock_put below */
277 	smc = smc_sk(sk);
278 
279 	/* cleanup for a dangling non-blocking connect */
280 	if (smc->connect_nonblock && sk->sk_state == SMC_INIT)
281 		tcp_abort(smc->clcsock->sk, ECONNABORTED);
282 
283 	if (cancel_work_sync(&smc->connect_work))
284 		sock_put(&smc->sk); /* sock_hold in smc_connect for passive closing */
285 
286 	if (sk->sk_state == SMC_LISTEN)
287 		/* smc_close_non_accepted() is called and acquires
288 		 * sock lock for child sockets again
289 		 */
290 		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
291 	else
292 		lock_sock(sk);
293 
294 	rc = __smc_release(smc);
295 
296 	/* detach socket */
297 	sock_orphan(sk);
298 	sock->sk = NULL;
299 	release_sock(sk);
300 
301 	sock_put(sk); /* sock_hold above */
302 	sock_put(sk); /* final sock_put */
303 out:
304 	return rc;
305 }
306 
307 static void smc_destruct(struct sock *sk)
308 {
309 	if (sk->sk_state != SMC_CLOSED)
310 		return;
311 	if (!sock_flag(sk, SOCK_DEAD))
312 		return;
313 
314 	sk_refcnt_debug_dec(sk);
315 }
316 
317 static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
318 				   int protocol)
319 {
320 	struct smc_sock *smc;
321 	struct proto *prot;
322 	struct sock *sk;
323 
324 	prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
325 	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
326 	if (!sk)
327 		return NULL;
328 
329 	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
330 	sk->sk_state = SMC_INIT;
331 	sk->sk_destruct = smc_destruct;
332 	sk->sk_protocol = protocol;
333 	smc = smc_sk(sk);
334 	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
335 	INIT_WORK(&smc->connect_work, smc_connect_work);
336 	INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
337 	INIT_LIST_HEAD(&smc->accept_q);
338 	spin_lock_init(&smc->accept_q_lock);
339 	spin_lock_init(&smc->conn.send_lock);
340 	sk->sk_prot->hash(sk);
341 	sk_refcnt_debug_inc(sk);
342 	mutex_init(&smc->clcsock_release_lock);
343 
344 	return sk;
345 }
346 
347 static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
348 		    int addr_len)
349 {
350 	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
351 	struct sock *sk = sock->sk;
352 	struct smc_sock *smc;
353 	int rc;
354 
355 	smc = smc_sk(sk);
356 
357 	/* replicate tests from inet_bind(), to be safe wrt. future changes */
358 	rc = -EINVAL;
359 	if (addr_len < sizeof(struct sockaddr_in))
360 		goto out;
361 
362 	rc = -EAFNOSUPPORT;
363 	if (addr->sin_family != AF_INET &&
364 	    addr->sin_family != AF_INET6 &&
365 	    addr->sin_family != AF_UNSPEC)
366 		goto out;
367 	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
368 	if (addr->sin_family == AF_UNSPEC &&
369 	    addr->sin_addr.s_addr != htonl(INADDR_ANY))
370 		goto out;
371 
372 	lock_sock(sk);
373 
374 	/* Check if socket is already active */
375 	rc = -EINVAL;
376 	if (sk->sk_state != SMC_INIT || smc->connect_nonblock)
377 		goto out_rel;
378 
379 	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
380 	rc = kernel_bind(smc->clcsock, uaddr, addr_len);
381 
382 out_rel:
383 	release_sock(sk);
384 out:
385 	return rc;
386 }
387 
388 static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
389 				   unsigned long mask)
390 {
391 	/* options for which we don't get control via setsockopt */
392 	nsk->sk_type = osk->sk_type;
393 	nsk->sk_sndbuf = osk->sk_sndbuf;
394 	nsk->sk_rcvbuf = osk->sk_rcvbuf;
395 	nsk->sk_sndtimeo = osk->sk_sndtimeo;
396 	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
397 	nsk->sk_mark = osk->sk_mark;
398 	nsk->sk_priority = osk->sk_priority;
399 	nsk->sk_rcvlowat = osk->sk_rcvlowat;
400 	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
401 	nsk->sk_err = osk->sk_err;
402 
403 	nsk->sk_flags &= ~mask;
404 	nsk->sk_flags |= osk->sk_flags & mask;
405 }
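
/*
 * Mask semantics of the helper above, spelled out: flag bits inside
 * @mask are taken from @osk, all other flag bits keep their value on
 * @nsk, i.e.
 *
 *	nsk->sk_flags = (nsk->sk_flags & ~mask) | (osk->sk_flags & mask);
 *
 * The two direction-specific subsets are defined right below as
 * SK_FLAGS_SMC_TO_CLC and SK_FLAGS_CLC_TO_SMC.
 */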
406 
407 #define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
408 			     (1UL << SOCK_KEEPOPEN) | \
409 			     (1UL << SOCK_LINGER) | \
410 			     (1UL << SOCK_BROADCAST) | \
411 			     (1UL << SOCK_TIMESTAMP) | \
412 			     (1UL << SOCK_DBG) | \
413 			     (1UL << SOCK_RCVTSTAMP) | \
414 			     (1UL << SOCK_RCVTSTAMPNS) | \
415 			     (1UL << SOCK_LOCALROUTE) | \
416 			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
417 			     (1UL << SOCK_RXQ_OVFL) | \
418 			     (1UL << SOCK_WIFI_STATUS) | \
419 			     (1UL << SOCK_NOFCS) | \
420 			     (1UL << SOCK_FILTER_LOCKED) | \
421 			     (1UL << SOCK_TSTAMP_NEW))
422 /* copy only relevant settings and flags of SOL_SOCKET level from smc to
423  * clc socket (since smc is not called for these options from net/core)
424  */
425 static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
426 {
427 	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
428 }
429 
430 #define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
431 			     (1UL << SOCK_KEEPOPEN) | \
432 			     (1UL << SOCK_LINGER) | \
433 			     (1UL << SOCK_DBG))
434 /* copy only settings and flags relevant for smc from clc to smc socket */
435 static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
436 {
437 	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
438 }
439 
440 /* register the new rmb on all links */
441 static int smcr_lgr_reg_rmbs(struct smc_link *link,
442 			     struct smc_buf_desc *rmb_desc)
443 {
444 	struct smc_link_group *lgr = link->lgr;
445 	int i, rc = 0;
446 
447 	rc = smc_llc_flow_initiate(lgr, SMC_LLC_FLOW_RKEY);
448 	if (rc)
449 		return rc;
450 	/* protect against parallel smc_llc_cli_rkey_exchange() and
451 	 * parallel smcr_link_reg_rmb()
452 	 */
453 	mutex_lock(&lgr->llc_conf_mutex);
454 	for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
455 		if (!smc_link_active(&lgr->lnk[i]))
456 			continue;
457 		rc = smcr_link_reg_rmb(&lgr->lnk[i], rmb_desc);
458 		if (rc)
459 			goto out;
460 	}
461 
462 	/* exchange confirm_rkey msg with peer */
463 	rc = smc_llc_do_confirm_rkey(link, rmb_desc);
464 	if (rc) {
465 		rc = -EFAULT;
466 		goto out;
467 	}
468 	rmb_desc->is_conf_rkey = true;
469 out:
470 	mutex_unlock(&lgr->llc_conf_mutex);
471 	smc_llc_flow_stop(lgr, &lgr->llc_flow_lcl);
472 	return rc;
473 }
474 
475 static int smcr_clnt_conf_first_link(struct smc_sock *smc)
476 {
477 	struct smc_link *link = smc->conn.lnk;
478 	struct smc_llc_qentry *qentry;
479 	int rc;
480 
481 	/* receive CONFIRM LINK request from server over RoCE fabric */
482 	qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME,
483 			      SMC_LLC_CONFIRM_LINK);
484 	if (!qentry) {
485 		struct smc_clc_msg_decline dclc;
486 
487 		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
488 				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
489 		return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
490 	}
491 	smc_llc_save_peer_uid(qentry);
492 	rc = smc_llc_eval_conf_link(qentry, SMC_LLC_REQ);
493 	smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl);
494 	if (rc)
495 		return SMC_CLC_DECL_RMBE_EC;
496 
497 	rc = smc_ib_modify_qp_rts(link);
498 	if (rc)
499 		return SMC_CLC_DECL_ERR_RDYLNK;
500 
501 	smc_wr_remember_qp_attr(link);
502 
503 	if (smcr_link_reg_rmb(link, smc->conn.rmb_desc))
504 		return SMC_CLC_DECL_ERR_REGRMB;
505 
506 	/* confirm_rkey is implicit on 1st contact */
507 	smc->conn.rmb_desc->is_conf_rkey = true;
508 
509 	/* send CONFIRM LINK response over RoCE fabric */
510 	rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP);
511 	if (rc < 0)
512 		return SMC_CLC_DECL_TIMEOUT_CL;
513 
514 	smc_llc_link_active(link);
515 	smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE);
516 
517 	/* optional 2nd link, receive ADD LINK request from server */
518 	qentry = smc_llc_wait(link->lgr, NULL, SMC_LLC_WAIT_TIME,
519 			      SMC_LLC_ADD_LINK);
520 	if (!qentry) {
521 		struct smc_clc_msg_decline dclc;
522 
523 		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
524 				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
525 		if (rc == -EAGAIN)
526 			rc = 0; /* no DECLINE received, go with one link */
527 		return rc;
528 	}
529 	smc_llc_flow_qentry_clr(&link->lgr->llc_flow_lcl);
530 	smc_llc_cli_add_link(link, qentry);
531 	return 0;
532 }
533 
534 static bool smc_isascii(char *hostname)
535 {
536 	int i;
537 
538 	for (i = 0; i < SMC_MAX_HOSTNAME_LEN; i++)
539 		if (!isascii(hostname[i]))
540 			return false;
541 	return true;
542 }
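
/*
 * Note: smc_isascii() intentionally scans all SMC_MAX_HOSTNAME_LEN bytes;
 * the peer hostname in the first contact extension is a fixed-size field,
 * not a NUL-terminated string.
 */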
543 
544 static void smc_conn_save_peer_info_fce(struct smc_sock *smc,
545 					struct smc_clc_msg_accept_confirm *clc)
546 {
547 	struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
548 		(struct smc_clc_msg_accept_confirm_v2 *)clc;
549 	struct smc_clc_first_contact_ext *fce;
550 	int clc_v2_len;
551 
552 	if (clc->hdr.version == SMC_V1 ||
553 	    !(clc->hdr.typev2 & SMC_FIRST_CONTACT_MASK))
554 		return;
555 
556 	if (smc->conn.lgr->is_smcd) {
557 		memcpy(smc->conn.lgr->negotiated_eid, clc_v2->d1.eid,
558 		       SMC_MAX_EID_LEN);
559 		clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm_v2,
560 					 d1);
561 	} else {
562 		memcpy(smc->conn.lgr->negotiated_eid, clc_v2->r1.eid,
563 		       SMC_MAX_EID_LEN);
564 		clc_v2_len = offsetofend(struct smc_clc_msg_accept_confirm_v2,
565 					 r1);
566 	}
567 	fce = (struct smc_clc_first_contact_ext *)(((u8 *)clc_v2) + clc_v2_len);
568 	smc->conn.lgr->peer_os = fce->os_type;
569 	smc->conn.lgr->peer_smc_release = fce->release;
570 	if (smc_isascii(fce->hostname))
571 		memcpy(smc->conn.lgr->peer_hostname, fce->hostname,
572 		       SMC_MAX_HOSTNAME_LEN);
573 }
574 
575 static void smcr_conn_save_peer_info(struct smc_sock *smc,
576 				     struct smc_clc_msg_accept_confirm *clc)
577 {
578 	int bufsize = smc_uncompress_bufsize(clc->r0.rmbe_size);
579 
580 	smc->conn.peer_rmbe_idx = clc->r0.rmbe_idx;
581 	smc->conn.local_tx_ctrl.token = ntohl(clc->r0.rmbe_alert_token);
582 	smc->conn.peer_rmbe_size = bufsize;
583 	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
584 	smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
585 }
586 
587 static void smcd_conn_save_peer_info(struct smc_sock *smc,
588 				     struct smc_clc_msg_accept_confirm *clc)
589 {
590 	int bufsize = smc_uncompress_bufsize(clc->d0.dmbe_size);
591 
592 	smc->conn.peer_rmbe_idx = clc->d0.dmbe_idx;
593 	smc->conn.peer_token = clc->d0.token;
594 	/* msg header takes up space in the buffer */
595 	smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
596 	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
597 	smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx;
598 }
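
/*
 * Offset arithmetic of the two variants above, side by side: SMC-R treats
 * the peer RMBE index as 1-based, hence
 * tx_off = bufsize * (peer_rmbe_idx - 1); SMC-D uses the index directly
 * and additionally reserves sizeof(struct smcd_cdc_msg) at the start of
 * the buffer for the CDC message header.
 */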
599 
600 static void smc_conn_save_peer_info(struct smc_sock *smc,
601 				    struct smc_clc_msg_accept_confirm *clc)
602 {
603 	if (smc->conn.lgr->is_smcd)
604 		smcd_conn_save_peer_info(smc, clc);
605 	else
606 		smcr_conn_save_peer_info(smc, clc);
607 	smc_conn_save_peer_info_fce(smc, clc);
608 }
609 
610 static void smc_link_save_peer_info(struct smc_link *link,
611 				    struct smc_clc_msg_accept_confirm *clc,
612 				    struct smc_init_info *ini)
613 {
614 	link->peer_qpn = ntoh24(clc->r0.qpn);
615 	memcpy(link->peer_gid, ini->peer_gid, SMC_GID_SIZE);
616 	memcpy(link->peer_mac, ini->peer_mac, sizeof(link->peer_mac));
617 	link->peer_psn = ntoh24(clc->r0.psn);
618 	link->peer_mtu = clc->r0.qp_mtu;
619 }
620 
621 static void smc_stat_inc_fback_rsn_cnt(struct smc_sock *smc,
622 				       struct smc_stats_fback *fback_arr)
623 {
624 	int cnt;
625 
626 	for (cnt = 0; cnt < SMC_MAX_FBACK_RSN_CNT; cnt++) {
627 		if (fback_arr[cnt].fback_code == smc->fallback_rsn) {
628 			fback_arr[cnt].count++;
629 			break;
630 		}
631 		if (!fback_arr[cnt].fback_code) {
632 			fback_arr[cnt].fback_code = smc->fallback_rsn;
633 			fback_arr[cnt].count++;
634 			break;
635 		}
636 	}
637 }
638 
639 static void smc_stat_fallback(struct smc_sock *smc)
640 {
641 	struct net *net = sock_net(&smc->sk);
642 
643 	mutex_lock(&net->smc.mutex_fback_rsn);
644 	if (smc->listen_smc) {
645 		smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->srv);
646 		net->smc.fback_rsn->srv_fback_cnt++;
647 	} else {
648 		smc_stat_inc_fback_rsn_cnt(smc, net->smc.fback_rsn->clnt);
649 		net->smc.fback_rsn->clnt_fback_cnt++;
650 	}
651 	mutex_unlock(&net->smc.mutex_fback_rsn);
652 }
653 
654 /* must be called under rcu read lock */
655 static void smc_fback_wakeup_waitqueue(struct smc_sock *smc, void *key)
656 {
657 	struct socket_wq *wq;
658 	__poll_t flags;
659 
660 	wq = rcu_dereference(smc->sk.sk_wq);
661 	if (!skwq_has_sleeper(wq))
662 		return;
663 
664 	/* wake up smc sk->sk_wq */
665 	if (!key) {
666 		/* sk_state_change */
667 		wake_up_interruptible_all(&wq->wait);
668 	} else {
669 		flags = key_to_poll(key);
670 		if (flags & (EPOLLIN | EPOLLOUT))
671 			/* sk_data_ready or sk_write_space */
672 			wake_up_interruptible_sync_poll(&wq->wait, flags);
673 		else if (flags & EPOLLERR)
674 			/* sk_error_report */
675 			wake_up_interruptible_poll(&wq->wait, flags);
676 	}
677 }
678 
679 static int smc_fback_mark_woken(wait_queue_entry_t *wait,
680 				unsigned int mode, int sync, void *key)
681 {
682 	struct smc_mark_woken *mark =
683 		container_of(wait, struct smc_mark_woken, wait_entry);
684 
685 	mark->woken = true;
686 	mark->key = key;
687 	return 0;
688 }
689 
690 static void smc_fback_forward_wakeup(struct smc_sock *smc, struct sock *clcsk,
691 				     void (*clcsock_callback)(struct sock *sk))
692 {
693 	struct smc_mark_woken mark = { .woken = false };
694 	struct socket_wq *wq;
695 
696 	init_waitqueue_func_entry(&mark.wait_entry,
697 				  smc_fback_mark_woken);
698 	rcu_read_lock();
699 	wq = rcu_dereference(clcsk->sk_wq);
700 	if (!wq)
701 		goto out;
702 	add_wait_queue(sk_sleep(clcsk), &mark.wait_entry);
703 	clcsock_callback(clcsk);
704 	remove_wait_queue(sk_sleep(clcsk), &mark.wait_entry);
705 
706 	if (mark.woken)
707 		smc_fback_wakeup_waitqueue(smc, mark.key);
708 out:
709 	rcu_read_unlock();
710 }
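
/*
 * Wakeup forwarding, step by step: a temporary wait entry whose callback
 * is smc_fback_mark_woken() is hooked onto the clcsock's wait queue, the
 * original clcsock callback is invoked, and if that woke the entry, the
 * recorded poll key is replayed on the smc socket's own wait queue via
 * smc_fback_wakeup_waitqueue(). poll/epoll waiters on the smc socket
 * therefore still see events raised on the underlying TCP socket after
 * fallback.
 */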
711 
712 static void smc_fback_state_change(struct sock *clcsk)
713 {
714 	struct smc_sock *smc =
715 		smc_clcsock_user_data(clcsk);
716 
717 	if (!smc)
718 		return;
719 	smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_state_change);
720 }
721 
722 static void smc_fback_data_ready(struct sock *clcsk)
723 {
724 	struct smc_sock *smc =
725 		smc_clcsock_user_data(clcsk);
726 
727 	if (!smc)
728 		return;
729 	smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_data_ready);
730 }
731 
732 static void smc_fback_write_space(struct sock *clcsk)
733 {
734 	struct smc_sock *smc =
735 		smc_clcsock_user_data(clcsk);
736 
737 	if (!smc)
738 		return;
739 	smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_write_space);
740 }
741 
742 static void smc_fback_error_report(struct sock *clcsk)
743 {
744 	struct smc_sock *smc =
745 		smc_clcsock_user_data(clcsk);
746 
747 	if (!smc)
748 		return;
749 	smc_fback_forward_wakeup(smc, clcsk, smc->clcsk_error_report);
750 }
751 
752 static int smc_switch_to_fallback(struct smc_sock *smc, int reason_code)
753 {
754 	struct sock *clcsk;
755 	int rc = 0;
756 
757 	mutex_lock(&smc->clcsock_release_lock);
758 	if (!smc->clcsock) {
759 		rc = -EBADF;
760 		goto out;
761 	}
762 	clcsk = smc->clcsock->sk;
763 
764 	if (smc->use_fallback)
765 		goto out;
766 	smc->use_fallback = true;
767 	smc->fallback_rsn = reason_code;
768 	smc_stat_fallback(smc);
769 	trace_smc_switch_to_fallback(smc, reason_code);
770 	if (smc->sk.sk_socket && smc->sk.sk_socket->file) {
771 		smc->clcsock->file = smc->sk.sk_socket->file;
772 		smc->clcsock->file->private_data = smc->clcsock;
773 		smc->clcsock->wq.fasync_list =
774 			smc->sk.sk_socket->wq.fasync_list;
775 
776 		/* There might be some wait entries remaining
777 		 * in smc sk->sk_wq and they should be woken up
778 		 * as clcsock's wait queue is woken up.
779 		 */
780 		smc->clcsk_state_change = clcsk->sk_state_change;
781 		smc->clcsk_data_ready = clcsk->sk_data_ready;
782 		smc->clcsk_write_space = clcsk->sk_write_space;
783 		smc->clcsk_error_report = clcsk->sk_error_report;
784 
785 		clcsk->sk_state_change = smc_fback_state_change;
786 		clcsk->sk_data_ready = smc_fback_data_ready;
787 		clcsk->sk_write_space = smc_fback_write_space;
788 		clcsk->sk_error_report = smc_fback_error_report;
789 
790 		smc->clcsock->sk->sk_user_data =
791 			(void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY);
792 	}
793 out:
794 	mutex_unlock(&smc->clcsock_release_lock);
795 	return rc;
796 }
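
/*
 * Summary of the fallback switch above: the struct file is re-pointed
 * from the smc socket to the clcsock, the fasync list moves along, the
 * four original clcsock callbacks are saved and replaced by the
 * smc_fback_* forwarders, and sk_user_data is tagged with
 * SK_USER_DATA_NOCOPY so the back pointer to the smc socket is not
 * propagated when the socket is cloned.
 */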
797 
798 /* fall back during connect */
799 static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
800 {
801 	struct net *net = sock_net(&smc->sk);
802 	int rc = 0;
803 
804 	rc = smc_switch_to_fallback(smc, reason_code);
805 	if (rc) { /* fallback fails */
806 		this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt);
807 		if (smc->sk.sk_state == SMC_INIT)
808 			sock_put(&smc->sk); /* passive closing */
809 		return rc;
810 	}
811 	smc_copy_sock_settings_to_clc(smc);
812 	smc->connect_nonblock = 0;
813 	if (smc->sk.sk_state == SMC_INIT)
814 		smc->sk.sk_state = SMC_ACTIVE;
815 	return 0;
816 }
817 
818 /* decline and fall back during connect */
819 static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code,
820 					u8 version)
821 {
822 	struct net *net = sock_net(&smc->sk);
823 	int rc;
824 
825 	if (reason_code < 0) { /* error, fallback is not possible */
826 		this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt);
827 		if (smc->sk.sk_state == SMC_INIT)
828 			sock_put(&smc->sk); /* passive closing */
829 		return reason_code;
830 	}
831 	if (reason_code != SMC_CLC_DECL_PEERDECL) {
832 		rc = smc_clc_send_decline(smc, reason_code, version);
833 		if (rc < 0) {
834 			this_cpu_inc(net->smc.smc_stats->clnt_hshake_err_cnt);
835 			if (smc->sk.sk_state == SMC_INIT)
836 				sock_put(&smc->sk); /* passive closing */
837 			return rc;
838 		}
839 	}
840 	return smc_connect_fallback(smc, reason_code);
841 }
842 
843 static void smc_conn_abort(struct smc_sock *smc, int local_first)
844 {
845 	struct smc_connection *conn = &smc->conn;
846 	struct smc_link_group *lgr = conn->lgr;
847 	bool lgr_valid = false;
848 
849 	if (smc_conn_lgr_valid(conn))
850 		lgr_valid = true;
851 
852 	smc_conn_free(conn);
853 	if (local_first && lgr_valid)
854 		smc_lgr_cleanup_early(lgr);
855 }
856 
857 /* check if there is an rdma device available for this connection. */
858 /* called for connect and listen */
859 static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini)
860 {
861 	/* PNET table look up: search active ib_device and port
862 	 * within same PNETID that also contains the ethernet device
863 	 * used for the internal TCP socket
864 	 */
865 	smc_pnet_find_roce_resource(smc->clcsock->sk, ini);
866 	if (!ini->check_smcrv2 && !ini->ib_dev)
867 		return SMC_CLC_DECL_NOSMCRDEV;
868 	if (ini->check_smcrv2 && !ini->smcrv2.ib_dev_v2)
869 		return SMC_CLC_DECL_NOSMCRDEV;
870 	return 0;
871 }
872 
873 /* check if there is an ISM device available for this connection. */
874 /* called for connect and listen */
875 static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini)
876 {
877 	/* Find ISM device with same PNETID as connecting interface */
878 	smc_pnet_find_ism_resource(smc->clcsock->sk, ini);
879 	if (!ini->ism_dev[0])
880 		return SMC_CLC_DECL_NOSMCDDEV;
881 	else
882 		ini->ism_chid[0] = smc_ism_get_chid(ini->ism_dev[0]);
883 	return 0;
884 }
885 
886 /* is the chid unique among the ism devices already determined? */
887 static bool smc_find_ism_v2_is_unique_chid(u16 chid, struct smc_init_info *ini,
888 					   int cnt)
889 {
890 	int i = (!ini->ism_dev[0]) ? 1 : 0;
891 
892 	for (; i < cnt; i++)
893 		if (ini->ism_chid[i] == chid)
894 			return false;
895 	return true;
896 }
897 
898 /* determine possible V2 ISM devices (either without a PNETID or with a
899  * PNETID matching the net_device)
900  */
901 static int smc_find_ism_v2_device_clnt(struct smc_sock *smc,
902 				       struct smc_init_info *ini)
903 {
904 	int rc = SMC_CLC_DECL_NOSMCDDEV;
905 	struct smcd_dev *smcd;
906 	int i = 1;
907 	u16 chid;
908 
909 	if (smcd_indicated(ini->smc_type_v1))
910 		rc = 0;		/* already initialized for V1 */
911 	mutex_lock(&smcd_dev_list.mutex);
912 	list_for_each_entry(smcd, &smcd_dev_list.list, list) {
913 		if (smcd->going_away || smcd == ini->ism_dev[0])
914 			continue;
915 		chid = smc_ism_get_chid(smcd);
916 		if (!smc_find_ism_v2_is_unique_chid(chid, ini, i))
917 			continue;
918 		if (!smc_pnet_is_pnetid_set(smcd->pnetid) ||
919 		    smc_pnet_is_ndev_pnetid(sock_net(&smc->sk), smcd->pnetid)) {
920 			ini->ism_dev[i] = smcd;
921 			ini->ism_chid[i] = chid;
922 			ini->is_smcd = true;
923 			rc = 0;
924 			i++;
925 			if (i > SMC_MAX_ISM_DEVS)
926 				break;
927 		}
928 	}
929 	mutex_unlock(&smcd_dev_list.mutex);
930 	ini->ism_offered_cnt = i - 1;
931 	if (!ini->ism_dev[0] && !ini->ism_dev[1])
932 		ini->smcd_version = 0;
933 
934 	return rc;
935 }
936 
937 /* Check for VLAN ID and register it on ISM device just for CLC handshake */
938 static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
939 				      struct smc_init_info *ini)
940 {
941 	if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev[0], ini->vlan_id))
942 		return SMC_CLC_DECL_ISMVLANERR;
943 	return 0;
944 }
945 
946 static int smc_find_proposal_devices(struct smc_sock *smc,
947 				     struct smc_init_info *ini)
948 {
949 	int rc = 0;
950 
951 	/* check if there is an ism device available */
952 	if (!(ini->smcd_version & SMC_V1) ||
953 	    smc_find_ism_device(smc, ini) ||
954 	    smc_connect_ism_vlan_setup(smc, ini))
955 		ini->smcd_version &= ~SMC_V1;
956 	/* else ISM V1 is supported for this connection */
957 
958 	/* check if there is an rdma device available */
959 	if (!(ini->smcr_version & SMC_V1) ||
960 	    smc_find_rdma_device(smc, ini))
961 		ini->smcr_version &= ~SMC_V1;
962 	/* else RDMA is supported for this connection */
963 
964 	ini->smc_type_v1 = smc_indicated_type(ini->smcd_version & SMC_V1,
965 					      ini->smcr_version & SMC_V1);
966 
967 	/* check if there is an ism v2 device available */
968 	if (!(ini->smcd_version & SMC_V2) ||
969 	    !smc_ism_is_v2_capable() ||
970 	    smc_find_ism_v2_device_clnt(smc, ini))
971 		ini->smcd_version &= ~SMC_V2;
972 
973 	/* check if there is an rdma v2 device available */
974 	ini->check_smcrv2 = true;
975 	ini->smcrv2.saddr = smc->clcsock->sk->sk_rcv_saddr;
976 	if (!(ini->smcr_version & SMC_V2) ||
977 	    smc->clcsock->sk->sk_family != AF_INET ||
978 	    !smc_clc_ueid_count() ||
979 	    smc_find_rdma_device(smc, ini))
980 		ini->smcr_version &= ~SMC_V2;
981 	ini->check_smcrv2 = false;
982 
983 	ini->smc_type_v2 = smc_indicated_type(ini->smcd_version & SMC_V2,
984 					      ini->smcr_version & SMC_V2);
985 
986 	/* if neither ISM nor RDMA are supported, fallback */
987 	if (ini->smc_type_v1 == SMC_TYPE_N && ini->smc_type_v2 == SMC_TYPE_N)
988 		rc = SMC_CLC_DECL_NOSMCDEV;
989 
990 	return rc;
991 }
992 
993 /* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is
994  * used, the VLAN ID will be registered again during the connection setup.
995  */
996 static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc,
997 					struct smc_init_info *ini)
998 {
999 	if (!smcd_indicated(ini->smc_type_v1))
1000 		return 0;
1001 	if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev[0], ini->vlan_id))
1002 		return SMC_CLC_DECL_CNFERR;
1003 	return 0;
1004 }
1005 
1006 #define SMC_CLC_MAX_ACCEPT_LEN \
1007 	(sizeof(struct smc_clc_msg_accept_confirm_v2) + \
1008 	 sizeof(struct smc_clc_first_contact_ext) + \
1009 	 sizeof(struct smc_clc_msg_trail))
1010 
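/*
 * SMC_CLC_MAX_ACCEPT_LEN describes the worst-case CLC accept message: a
 * v2 accept/confirm body, an optional first contact extension and the
 * trailer. smc_connect_clc() below sizes its receive buffer with it so
 * that any valid server reply fits.
 */
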
1011 /* CLC handshake during connect */
1012 static int smc_connect_clc(struct smc_sock *smc,
1013 			   struct smc_clc_msg_accept_confirm_v2 *aclc2,
1014 			   struct smc_init_info *ini)
1015 {
1016 	int rc = 0;
1017 
1018 	/* do inband token exchange */
1019 	rc = smc_clc_send_proposal(smc, ini);
1020 	if (rc)
1021 		return rc;
1022 	/* receive SMC Accept CLC message */
1023 	return smc_clc_wait_msg(smc, aclc2, SMC_CLC_MAX_ACCEPT_LEN,
1024 				SMC_CLC_ACCEPT, CLC_WAIT_TIME);
1025 }
1026 
1027 void smc_fill_gid_list(struct smc_link_group *lgr,
1028 		       struct smc_gidlist *gidlist,
1029 		       struct smc_ib_device *known_dev, u8 *known_gid)
1030 {
1031 	struct smc_init_info *alt_ini = NULL;
1032 
1033 	memset(gidlist, 0, sizeof(*gidlist));
1034 	memcpy(gidlist->list[gidlist->len++], known_gid, SMC_GID_SIZE);
1035 
1036 	alt_ini = kzalloc(sizeof(*alt_ini), GFP_KERNEL);
1037 	if (!alt_ini)
1038 		goto out;
1039 
1040 	alt_ini->vlan_id = lgr->vlan_id;
1041 	alt_ini->check_smcrv2 = true;
1042 	alt_ini->smcrv2.saddr = lgr->saddr;
1043 	smc_pnet_find_alt_roce(lgr, alt_ini, known_dev);
1044 
1045 	if (!alt_ini->smcrv2.ib_dev_v2)
1046 		goto out;
1047 
1048 	memcpy(gidlist->list[gidlist->len++], alt_ini->smcrv2.ib_gid_v2,
1049 	       SMC_GID_SIZE);
1050 
1051 out:
1052 	kfree(alt_ini);
1053 }
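
/*
 * smc_fill_gid_list() seeds the list with the already-known GID and, if
 * smc_pnet_find_alt_roce() can locate an alternate RoCE device, appends
 * that device's GID too; the resulting list is offered to the peer for
 * SMC-R v2 first contact (see smc_connect_rdma() below).
 */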
1054 
1055 static int smc_connect_rdma_v2_prepare(struct smc_sock *smc,
1056 				       struct smc_clc_msg_accept_confirm *aclc,
1057 				       struct smc_init_info *ini)
1058 {
1059 	struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
1060 		(struct smc_clc_msg_accept_confirm_v2 *)aclc;
1061 	struct smc_clc_first_contact_ext *fce =
1062 		(struct smc_clc_first_contact_ext *)
1063 			(((u8 *)clc_v2) + sizeof(*clc_v2));
1064 
1065 	if (!ini->first_contact_peer || aclc->hdr.version == SMC_V1)
1066 		return 0;
1067 
1068 	if (fce->v2_direct) {
1069 		memcpy(ini->smcrv2.nexthop_mac, &aclc->r0.lcl.mac, ETH_ALEN);
1070 		ini->smcrv2.uses_gateway = false;
1071 	} else {
1072 		if (smc_ib_find_route(smc->clcsock->sk->sk_rcv_saddr,
1073 				      smc_ib_gid_to_ipv4(aclc->r0.lcl.gid),
1074 				      ini->smcrv2.nexthop_mac,
1075 				      &ini->smcrv2.uses_gateway))
1076 			return SMC_CLC_DECL_NOROUTE;
1077 		if (!ini->smcrv2.uses_gateway) {
1078 			/* mismatch: peer claims indirect, but its direct */
1079 			return SMC_CLC_DECL_NOINDIRECT;
1080 		}
1081 	}
1082 	return 0;
1083 }
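
/*
 * Routing consistency check above, condensed: a peer indicating a direct
 * (v2_direct) path supplies the next-hop MAC in the CLC accept itself;
 * otherwise a local route lookup must succeed and yield a gateway. If the
 * peer claims an indirect path while the local lookup finds a direct one,
 * the connection is declined with SMC_CLC_DECL_NOINDIRECT.
 */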
1084 
1085 /* setup for RDMA connection of client */
1086 static int smc_connect_rdma(struct smc_sock *smc,
1087 			    struct smc_clc_msg_accept_confirm *aclc,
1088 			    struct smc_init_info *ini)
1089 {
1090 	int i, reason_code = 0;
1091 	struct smc_link *link;
1092 	u8 *eid = NULL;
1093 
1094 	ini->is_smcd = false;
1095 	ini->ib_clcqpn = ntoh24(aclc->r0.qpn);
1096 	ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK;
1097 	memcpy(ini->peer_systemid, aclc->r0.lcl.id_for_peer, SMC_SYSTEMID_LEN);
1098 	memcpy(ini->peer_gid, aclc->r0.lcl.gid, SMC_GID_SIZE);
1099 	memcpy(ini->peer_mac, aclc->r0.lcl.mac, ETH_ALEN);
1100 
1101 	reason_code = smc_connect_rdma_v2_prepare(smc, aclc, ini);
1102 	if (reason_code)
1103 		return reason_code;
1104 
1105 	mutex_lock(&smc_client_lgr_pending);
1106 	reason_code = smc_conn_create(smc, ini);
1107 	if (reason_code) {
1108 		mutex_unlock(&smc_client_lgr_pending);
1109 		return reason_code;
1110 	}
1111 
1112 	smc_conn_save_peer_info(smc, aclc);
1113 
1114 	if (ini->first_contact_local) {
1115 		link = smc->conn.lnk;
1116 	} else {
1117 		/* set link that was assigned by server */
1118 		link = NULL;
1119 		for (i = 0; i < SMC_LINKS_PER_LGR_MAX; i++) {
1120 			struct smc_link *l = &smc->conn.lgr->lnk[i];
1121 
1122 			if (l->peer_qpn == ntoh24(aclc->r0.qpn) &&
1123 			    !memcmp(l->peer_gid, &aclc->r0.lcl.gid,
1124 				    SMC_GID_SIZE) &&
1125 			    (aclc->hdr.version > SMC_V1 ||
1126 			     !memcmp(l->peer_mac, &aclc->r0.lcl.mac,
1127 				     sizeof(l->peer_mac)))) {
1128 				link = l;
1129 				break;
1130 			}
1131 		}
1132 		if (!link) {
1133 			reason_code = SMC_CLC_DECL_NOSRVLINK;
1134 			goto connect_abort;
1135 		}
1136 		smc_switch_link_and_count(&smc->conn, link);
1137 	}
1138 
1139 	/* create send buffer and rmb */
1140 	if (smc_buf_create(smc, false)) {
1141 		reason_code = SMC_CLC_DECL_MEM;
1142 		goto connect_abort;
1143 	}
1144 
1145 	if (ini->first_contact_local)
1146 		smc_link_save_peer_info(link, aclc, ini);
1147 
1148 	if (smc_rmb_rtoken_handling(&smc->conn, link, aclc)) {
1149 		reason_code = SMC_CLC_DECL_ERR_RTOK;
1150 		goto connect_abort;
1151 	}
1152 
1153 	smc_close_init(smc);
1154 	smc_rx_init(smc);
1155 
1156 	if (ini->first_contact_local) {
1157 		if (smc_ib_ready_link(link)) {
1158 			reason_code = SMC_CLC_DECL_ERR_RDYLNK;
1159 			goto connect_abort;
1160 		}
1161 	} else {
1162 		if (smcr_lgr_reg_rmbs(link, smc->conn.rmb_desc)) {
1163 			reason_code = SMC_CLC_DECL_ERR_REGRMB;
1164 			goto connect_abort;
1165 		}
1166 	}
1167 	smc_rmb_sync_sg_for_device(&smc->conn);
1168 
1169 	if (aclc->hdr.version > SMC_V1) {
1170 		struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
1171 			(struct smc_clc_msg_accept_confirm_v2 *)aclc;
1172 
1173 		eid = clc_v2->r1.eid;
1174 		if (ini->first_contact_local)
1175 			smc_fill_gid_list(link->lgr, &ini->smcrv2.gidlist,
1176 					  link->smcibdev, link->gid);
1177 	}
1178 
1179 	reason_code = smc_clc_send_confirm(smc, ini->first_contact_local,
1180 					   aclc->hdr.version, eid, ini);
1181 	if (reason_code)
1182 		goto connect_abort;
1183 
1184 	smc_tx_init(smc);
1185 
1186 	if (ini->first_contact_local) {
1187 		/* QP confirmation over RoCE fabric */
1188 		smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK);
1189 		reason_code = smcr_clnt_conf_first_link(smc);
1190 		smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl);
1191 		if (reason_code)
1192 			goto connect_abort;
1193 	}
1194 	mutex_unlock(&smc_client_lgr_pending);
1195 
1196 	smc_copy_sock_settings_to_clc(smc);
1197 	smc->connect_nonblock = 0;
1198 	if (smc->sk.sk_state == SMC_INIT)
1199 		smc->sk.sk_state = SMC_ACTIVE;
1200 
1201 	return 0;
1202 connect_abort:
1203 	smc_conn_abort(smc, ini->first_contact_local);
1204 	mutex_unlock(&smc_client_lgr_pending);
1205 	smc->connect_nonblock = 0;
1206 
1207 	return reason_code;
1208 }
1209 
1210 /* The server has chosen one of the proposed ISM devices for the communication.
1211  * Determine the chosen ISM device from the CHID of the received CLC ACCEPT.
1212  */
1213 static int
1214 smc_v2_determine_accepted_chid(struct smc_clc_msg_accept_confirm_v2 *aclc,
1215 			       struct smc_init_info *ini)
1216 {
1217 	int i;
1218 
1219 	for (i = 0; i < ini->ism_offered_cnt + 1; i++) {
1220 		if (ini->ism_chid[i] == ntohs(aclc->d1.chid)) {
1221 			ini->ism_selected = i;
1222 			return 0;
1223 		}
1224 	}
1225 
1226 	return -EPROTO;
1227 }
1228 
1229 /* setup for ISM connection of client */
1230 static int smc_connect_ism(struct smc_sock *smc,
1231 			   struct smc_clc_msg_accept_confirm *aclc,
1232 			   struct smc_init_info *ini)
1233 {
1234 	u8 *eid = NULL;
1235 	int rc = 0;
1236 
1237 	ini->is_smcd = true;
1238 	ini->first_contact_peer = aclc->hdr.typev2 & SMC_FIRST_CONTACT_MASK;
1239 
1240 	if (aclc->hdr.version == SMC_V2) {
1241 		struct smc_clc_msg_accept_confirm_v2 *aclc_v2 =
1242 			(struct smc_clc_msg_accept_confirm_v2 *)aclc;
1243 
1244 		rc = smc_v2_determine_accepted_chid(aclc_v2, ini);
1245 		if (rc)
1246 			return rc;
1247 	}
1248 	ini->ism_peer_gid[ini->ism_selected] = aclc->d0.gid;
1249 
1250 	/* there is only one lgr role for SMC-D; use server lock */
1251 	mutex_lock(&smc_server_lgr_pending);
1252 	rc = smc_conn_create(smc, ini);
1253 	if (rc) {
1254 		mutex_unlock(&smc_server_lgr_pending);
1255 		return rc;
1256 	}
1257 
1258 	/* Create send and receive buffers */
1259 	rc = smc_buf_create(smc, true);
1260 	if (rc) {
1261 		rc = (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB : SMC_CLC_DECL_MEM;
1262 		goto connect_abort;
1263 	}
1264 
1265 	smc_conn_save_peer_info(smc, aclc);
1266 	smc_close_init(smc);
1267 	smc_rx_init(smc);
1268 	smc_tx_init(smc);
1269 
1270 	if (aclc->hdr.version > SMC_V1) {
1271 		struct smc_clc_msg_accept_confirm_v2 *clc_v2 =
1272 			(struct smc_clc_msg_accept_confirm_v2 *)aclc;
1273 
1274 		eid = clc_v2->d1.eid;
1275 	}
1276 
1277 	rc = smc_clc_send_confirm(smc, ini->first_contact_local,
1278 				  aclc->hdr.version, eid, NULL);
1279 	if (rc)
1280 		goto connect_abort;
1281 	mutex_unlock(&smc_server_lgr_pending);
1282 
1283 	smc_copy_sock_settings_to_clc(smc);
1284 	smc->connect_nonblock = 0;
1285 	if (smc->sk.sk_state == SMC_INIT)
1286 		smc->sk.sk_state = SMC_ACTIVE;
1287 
1288 	return 0;
1289 connect_abort:
1290 	smc_conn_abort(smc, ini->first_contact_local);
1291 	mutex_unlock(&smc_server_lgr_pending);
1292 	smc->connect_nonblock = 0;
1293 
1294 	return rc;
1295 }
1296 
1297 /* check if received accept type and version match a proposed one */
1298 static int smc_connect_check_aclc(struct smc_init_info *ini,
1299 				  struct smc_clc_msg_accept_confirm *aclc)
1300 {
1301 	if (aclc->hdr.typev1 != SMC_TYPE_R &&
1302 	    aclc->hdr.typev1 != SMC_TYPE_D)
1303 		return SMC_CLC_DECL_MODEUNSUPP;
1304 
1305 	if (aclc->hdr.version >= SMC_V2) {
1306 		if ((aclc->hdr.typev1 == SMC_TYPE_R &&
1307 		     !smcr_indicated(ini->smc_type_v2)) ||
1308 		    (aclc->hdr.typev1 == SMC_TYPE_D &&
1309 		     !smcd_indicated(ini->smc_type_v2)))
1310 			return SMC_CLC_DECL_MODEUNSUPP;
1311 	} else {
1312 		if ((aclc->hdr.typev1 == SMC_TYPE_R &&
1313 		     !smcr_indicated(ini->smc_type_v1)) ||
1314 		    (aclc->hdr.typev1 == SMC_TYPE_D &&
1315 		     !smcd_indicated(ini->smc_type_v1)))
1316 			return SMC_CLC_DECL_MODEUNSUPP;
1317 	}
1318 
1319 	return 0;
1320 }
1321 
1322 /* perform steps before actually connecting */
1323 static int __smc_connect(struct smc_sock *smc)
1324 {
1325 	u8 version = smc_ism_is_v2_capable() ? SMC_V2 : SMC_V1;
1326 	struct smc_clc_msg_accept_confirm_v2 *aclc2;
1327 	struct smc_clc_msg_accept_confirm *aclc;
1328 	struct smc_init_info *ini = NULL;
1329 	u8 *buf = NULL;
1330 	int rc = 0;
1331 
1332 	if (smc->use_fallback)
1333 		return smc_connect_fallback(smc, smc->fallback_rsn);
1334 
1335 	/* if peer has not signalled SMC-capability, fall back */
1336 	if (!tcp_sk(smc->clcsock->sk)->syn_smc)
1337 		return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC);
1338 
1339 	/* IPSec connections opt out of SMC optimizations */
1340 	if (using_ipsec(smc))
1341 		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC,
1342 						    version);
1343 
1344 	ini = kzalloc(sizeof(*ini), GFP_KERNEL);
1345 	if (!ini)
1346 		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_MEM,
1347 						    version);
1348 
1349 	ini->smcd_version = SMC_V1 | SMC_V2;
1350 	ini->smcr_version = SMC_V1 | SMC_V2;
1351 	ini->smc_type_v1 = SMC_TYPE_B;
1352 	ini->smc_type_v2 = SMC_TYPE_B;
1353 
1354 	/* get vlan id from IP device */
1355 	if (smc_vlan_by_tcpsk(smc->clcsock, ini)) {
1356 		ini->smcd_version &= ~SMC_V1;
1357 		ini->smcr_version = 0;
1358 		ini->smc_type_v1 = SMC_TYPE_N;
1359 		if (!ini->smcd_version) {
1360 			rc = SMC_CLC_DECL_GETVLANERR;
1361 			goto fallback;
1362 		}
1363 	}
1364 
1365 	rc = smc_find_proposal_devices(smc, ini);
1366 	if (rc)
1367 		goto fallback;
1368 
1369 	buf = kzalloc(SMC_CLC_MAX_ACCEPT_LEN, GFP_KERNEL);
1370 	if (!buf) {
1371 		rc = SMC_CLC_DECL_MEM;
1372 		goto fallback;
1373 	}
1374 	aclc2 = (struct smc_clc_msg_accept_confirm_v2 *)buf;
1375 	aclc = (struct smc_clc_msg_accept_confirm *)aclc2;
1376 
1377 	/* perform CLC handshake */
1378 	rc = smc_connect_clc(smc, aclc2, ini);
1379 	if (rc) {
1380 		/* -EAGAIN on timeout, see tcp_recvmsg() */
1381 		if (rc == -EAGAIN) {
1382 			rc = -ETIMEDOUT;
1383 			smc->sk.sk_err = ETIMEDOUT;
1384 		}
1385 		goto vlan_cleanup;
1386 	}
1387 
1388 	/* check if smc modes and versions of CLC proposal and accept match */
1389 	rc = smc_connect_check_aclc(ini, aclc);
1390 	version = aclc->hdr.version == SMC_V1 ? SMC_V1 : SMC_V2;
1391 	if (rc)
1392 		goto vlan_cleanup;
1393 
1394 	/* depending on previous steps, connect using rdma or ism */
1395 	if (aclc->hdr.typev1 == SMC_TYPE_R) {
1396 		ini->smcr_version = version;
1397 		rc = smc_connect_rdma(smc, aclc, ini);
1398 	} else if (aclc->hdr.typev1 == SMC_TYPE_D) {
1399 		ini->smcd_version = version;
1400 		rc = smc_connect_ism(smc, aclc, ini);
1401 	}
1402 	if (rc)
1403 		goto vlan_cleanup;
1404 
1405 	SMC_STAT_CLNT_SUCC_INC(sock_net(smc->clcsock->sk), aclc);
1406 	smc_connect_ism_vlan_cleanup(smc, ini);
1407 	kfree(buf);
1408 	kfree(ini);
1409 	return 0;
1410 
1411 vlan_cleanup:
1412 	smc_connect_ism_vlan_cleanup(smc, ini);
1413 	kfree(buf);
1414 fallback:
1415 	kfree(ini);
1416 	return smc_connect_decline_fallback(smc, rc, version);
1417 }
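
/*
 * Condensed sketch of the client connect sequence implemented by
 * __smc_connect() above:
 *
 *	1. fall back right away if already in fallback mode, if the peer
 *	   did not signal SMC capability, or if IPsec is in use;
 *	2. collect candidate ISM/RDMA devices (smc_find_proposal_devices);
 *	3. send the CLC proposal and wait for the CLC accept
 *	   (smc_connect_clc);
 *	4. match the accept against the proposal (smc_connect_check_aclc);
 *	5. finish setup via smc_connect_rdma() or smc_connect_ism();
 *	6. on a decline reason, send a CLC decline and fall back to TCP.
 */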
1418 
1419 static void smc_connect_work(struct work_struct *work)
1420 {
1421 	struct smc_sock *smc = container_of(work, struct smc_sock,
1422 					    connect_work);
1423 	long timeo = smc->sk.sk_sndtimeo;
1424 	int rc = 0;
1425 
1426 	if (!timeo)
1427 		timeo = MAX_SCHEDULE_TIMEOUT;
1428 	lock_sock(smc->clcsock->sk);
1429 	if (smc->clcsock->sk->sk_err) {
1430 		smc->sk.sk_err = smc->clcsock->sk->sk_err;
1431 	} else if ((1 << smc->clcsock->sk->sk_state) &
1432 					(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
1433 		rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo);
1434 		if ((rc == -EPIPE) &&
1435 		    ((1 << smc->clcsock->sk->sk_state) &
1436 					(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)))
1437 			rc = 0;
1438 	}
1439 	release_sock(smc->clcsock->sk);
1440 	lock_sock(&smc->sk);
1441 	if (rc != 0 || smc->sk.sk_err) {
1442 		smc->sk.sk_state = SMC_CLOSED;
1443 		if (rc == -EPIPE || rc == -EAGAIN)
1444 			smc->sk.sk_err = EPIPE;
1445 		else if (signal_pending(current))
1446 			smc->sk.sk_err = -sock_intr_errno(timeo);
1447 		sock_put(&smc->sk); /* passive closing */
1448 		goto out;
1449 	}
1450 
1451 	rc = __smc_connect(smc);
1452 	if (rc < 0)
1453 		smc->sk.sk_err = -rc;
1454 
1455 out:
1456 	if (!sock_flag(&smc->sk, SOCK_DEAD)) {
1457 		if (smc->sk.sk_err) {
1458 			smc->sk.sk_state_change(&smc->sk);
1459 		} else { /* allow polling before and after fallback decision */
1460 			smc->clcsock->sk->sk_write_space(smc->clcsock->sk);
1461 			smc->sk.sk_write_space(&smc->sk);
1462 		}
1463 	}
1464 	release_sock(&smc->sk);
1465 }
1466 
1467 static int smc_connect(struct socket *sock, struct sockaddr *addr,
1468 		       int alen, int flags)
1469 {
1470 	struct sock *sk = sock->sk;
1471 	struct smc_sock *smc;
1472 	int rc = -EINVAL;
1473 
1474 	smc = smc_sk(sk);
1475 
1476 	/* separate smc parameter checking to be safe */
1477 	if (alen < sizeof(addr->sa_family))
1478 		goto out_err;
1479 	if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
1480 		goto out_err;
1481 
1482 	lock_sock(sk);
1483 	switch (sk->sk_state) {
1484 	default:
1485 		goto out;
1486 	case SMC_ACTIVE:
1487 		rc = -EISCONN;
1488 		goto out;
1489 	case SMC_INIT:
1490 		break;
1491 	}
1492 
1493 	smc_copy_sock_settings_to_clc(smc);
1494 	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
1495 	if (smc->connect_nonblock) {
1496 		rc = -EALREADY;
1497 		goto out;
1498 	}
1499 	rc = kernel_connect(smc->clcsock, addr, alen, flags);
1500 	if (rc && rc != -EINPROGRESS)
1501 		goto out;
1502 
1503 	sock_hold(&smc->sk); /* sock put in passive closing */
1504 	if (smc->use_fallback)
1505 		goto out;
1506 	if (flags & O_NONBLOCK) {
1507 		if (queue_work(smc_hs_wq, &smc->connect_work))
1508 			smc->connect_nonblock = 1;
1509 		rc = -EINPROGRESS;
1510 	} else {
1511 		rc = __smc_connect(smc);
1512 		if (rc < 0)
1513 			goto out;
1514 		else
1515 			rc = 0; /* success cases including fallback */
1516 	}
1517 
1518 out:
1519 	release_sock(sk);
1520 out_err:
1521 	return rc;
1522 }
1523 
1524 static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
1525 {
1526 	struct socket *new_clcsock = NULL;
1527 	struct sock *lsk = &lsmc->sk;
1528 	struct sock *new_sk;
1529 	int rc = -EINVAL;
1530 
1531 	release_sock(lsk);
1532 	new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
1533 	if (!new_sk) {
1534 		rc = -ENOMEM;
1535 		lsk->sk_err = ENOMEM;
1536 		*new_smc = NULL;
1537 		lock_sock(lsk);
1538 		goto out;
1539 	}
1540 	*new_smc = smc_sk(new_sk);
1541 
1542 	mutex_lock(&lsmc->clcsock_release_lock);
1543 	if (lsmc->clcsock)
1544 		rc = kernel_accept(lsmc->clcsock, &new_clcsock, SOCK_NONBLOCK);
1545 	mutex_unlock(&lsmc->clcsock_release_lock);
1546 	lock_sock(lsk);
1547 	if (rc < 0 && rc != -EAGAIN)
1548 		lsk->sk_err = -rc;
1549 	if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
1550 		new_sk->sk_prot->unhash(new_sk);
1551 		if (new_clcsock)
1552 			sock_release(new_clcsock);
1553 		new_sk->sk_state = SMC_CLOSED;
1554 		sock_set_flag(new_sk, SOCK_DEAD);
1555 		sock_put(new_sk); /* final */
1556 		*new_smc = NULL;
1557 		goto out;
1558 	}
1559 
1560 	/* new clcsock has inherited the smc listen-specific sk_data_ready
1561 	 * function; switch it back to the original sk_data_ready function
1562 	 */
1563 	new_clcsock->sk->sk_data_ready = lsmc->clcsk_data_ready;
1564 	(*new_smc)->clcsock = new_clcsock;
1565 out:
1566 	return rc;
1567 }
1568 
1569 /* add a just created sock to the accept queue of the listen sock as
1570  * candidate for a following socket accept call from user space
1571  */
1572 static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
1573 {
1574 	struct smc_sock *par = smc_sk(parent);
1575 
1576 	sock_hold(sk); /* sock_put in smc_accept_unlink() */
1577 	spin_lock(&par->accept_q_lock);
1578 	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
1579 	spin_unlock(&par->accept_q_lock);
1580 	sk_acceptq_added(parent);
1581 }
1582 
1583 /* remove a socket from the accept queue of its parental listening socket */
1584 static void smc_accept_unlink(struct sock *sk)
1585 {
1586 	struct smc_sock *par = smc_sk(sk)->listen_smc;
1587 
1588 	spin_lock(&par->accept_q_lock);
1589 	list_del_init(&smc_sk(sk)->accept_q);
1590 	spin_unlock(&par->accept_q_lock);
1591 	sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
1592 	sock_put(sk); /* sock_hold in smc_accept_enqueue */
1593 }
1594 
1595 /* remove a sock from the accept queue to bind it to a new socket created
1596  * for a socket accept call from user space
1597  */
1598 struct sock *smc_accept_dequeue(struct sock *parent,
1599 				struct socket *new_sock)
1600 {
1601 	struct smc_sock *isk, *n;
1602 	struct sock *new_sk;
1603 
1604 	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
1605 		new_sk = (struct sock *)isk;
1606 
1607 		smc_accept_unlink(new_sk);
1608 		if (new_sk->sk_state == SMC_CLOSED) {
1609 			new_sk->sk_prot->unhash(new_sk);
1610 			if (isk->clcsock) {
1611 				sock_release(isk->clcsock);
1612 				isk->clcsock = NULL;
1613 			}
1614 			sock_put(new_sk); /* final */
1615 			continue;
1616 		}
1617 		if (new_sock) {
1618 			sock_graft(new_sk, new_sock);
1619 			if (isk->use_fallback) {
1620 				smc_sk(new_sk)->clcsock->file = new_sock->file;
1621 				isk->clcsock->file->private_data = isk->clcsock;
1622 			}
1623 		}
1624 		return new_sk;
1625 	}
1626 	return NULL;
1627 }
1628 
1629 /* clean up for a created but never accepted sock */
1630 void smc_close_non_accepted(struct sock *sk)
1631 {
1632 	struct smc_sock *smc = smc_sk(sk);
1633 
1634 	sock_hold(sk); /* sock_put below */
1635 	lock_sock(sk);
1636 	if (!sk->sk_lingertime)
1637 		/* wait for peer closing */
1638 		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
1639 	__smc_release(smc);
1640 	release_sock(sk);
1641 	sock_put(sk); /* sock_hold above */
1642 	sock_put(sk); /* final sock_put */
1643 }
1644 
1645 static int smcr_serv_conf_first_link(struct smc_sock *smc)
1646 {
1647 	struct smc_link *link = smc->conn.lnk;
1648 	struct smc_llc_qentry *qentry;
1649 	int rc;
1650 
1651 	if (smcr_link_reg_rmb(link, smc->conn.rmb_desc))
1652 		return SMC_CLC_DECL_ERR_REGRMB;
1653 
1654 	/* send CONFIRM LINK request to client over the RoCE fabric */
1655 	rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ);
1656 	if (rc < 0)
1657 		return SMC_CLC_DECL_TIMEOUT_CL;
1658 
1659 	/* receive CONFIRM LINK response from client over the RoCE fabric */
1660 	qentry = smc_llc_wait(link->lgr, link, SMC_LLC_WAIT_TIME,
1661 			      SMC_LLC_CONFIRM_LINK);
1662 	if (!qentry) {
1663 		struct smc_clc_msg_decline dclc;
1664 
1665 		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
1666 				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
1667 		return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
1668 	}
1669 	smc_llc_save_peer_uid(qentry);
1670 	rc = smc_llc_eval_conf_link(qentry, SMC_LLC_RESP);
1671 	smc_llc_flow_qentry_del(&link->lgr->llc_flow_lcl);
1672 	if (rc)
1673 		return SMC_CLC_DECL_RMBE_EC;
1674 
1675 	/* confirm_rkey is implicit on 1st contact */
1676 	smc->conn.rmb_desc->is_conf_rkey = true;
1677 
1678 	smc_llc_link_active(link);
1679 	smcr_lgr_set_type(link->lgr, SMC_LGR_SINGLE);
1680 
1681 	/* initial contact - try to establish second link */
1682 	smc_llc_srv_add_link(link, NULL);
1683 	return 0;
1684 }
1685 
1686 /* listen worker: finish */
1687 static void smc_listen_out(struct smc_sock *new_smc)
1688 {
1689 	struct smc_sock *lsmc = new_smc->listen_smc;
1690 	struct sock *newsmcsk = &new_smc->sk;
1691 
1692 	if (tcp_sk(new_smc->clcsock->sk)->syn_smc)
1693 		atomic_dec(&lsmc->queued_smc_hs);
1694 
1695 	if (lsmc->sk.sk_state == SMC_LISTEN) {
1696 		lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
1697 		smc_accept_enqueue(&lsmc->sk, newsmcsk);
1698 		release_sock(&lsmc->sk);
1699 	} else { /* no longer listening */
1700 		smc_close_non_accepted(newsmcsk);
1701 	}
1702 
1703 	/* Wake up accept */
1704 	lsmc->sk.sk_data_ready(&lsmc->sk);
1705 	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
1706 }
1707 
1708 /* listen worker: finish in state connected */
1709 static void smc_listen_out_connected(struct smc_sock *new_smc)
1710 {
1711 	struct sock *newsmcsk = &new_smc->sk;
1712 
1713 	sk_refcnt_debug_inc(newsmcsk);
1714 	if (newsmcsk->sk_state == SMC_INIT)
1715 		newsmcsk->sk_state = SMC_ACTIVE;
1716 
1717 	smc_listen_out(new_smc);
1718 }
1719 
1720 /* listen worker: finish in error state */
1721 static void smc_listen_out_err(struct smc_sock *new_smc)
1722 {
1723 	struct sock *newsmcsk = &new_smc->sk;
1724 	struct net *net = sock_net(newsmcsk);
1725 
1726 	this_cpu_inc(net->smc.smc_stats->srv_hshake_err_cnt);
1727 	if (newsmcsk->sk_state == SMC_INIT)
1728 		sock_put(&new_smc->sk); /* passive closing */
1729 	newsmcsk->sk_state = SMC_CLOSED;
1730 
1731 	smc_listen_out(new_smc);
1732 }
1733 
1734 /* listen worker: decline and fall back if possible */
1735 static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
1736 			       int local_first, u8 version)
1737 {
1738 	/* RDMA setup failed, switch back to TCP */
1739 	smc_conn_abort(new_smc, local_first);
1740 	if (reason_code < 0 ||
1741 	    smc_switch_to_fallback(new_smc, reason_code)) {
1742 		/* error, no fallback possible */
1743 		smc_listen_out_err(new_smc);
1744 		return;
1745 	}
1746 	if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
1747 		if (smc_clc_send_decline(new_smc, reason_code, version) < 0) {
1748 			smc_listen_out_err(new_smc);
1749 			return;
1750 		}
1751 	}
1752 	smc_listen_out_connected(new_smc);
1753 }
1754 
1755 /* listen worker: version checking */
1756 static int smc_listen_v2_check(struct smc_sock *new_smc,
1757 			       struct smc_clc_msg_proposal *pclc,
1758 			       struct smc_init_info *ini)
1759 {
1760 	struct smc_clc_smcd_v2_extension *pclc_smcd_v2_ext;
1761 	struct smc_clc_v2_extension *pclc_v2_ext;
1762 	int rc = SMC_CLC_DECL_PEERNOSMC;
1763 
1764 	ini->smc_type_v1 = pclc->hdr.typev1;
1765 	ini->smc_type_v2 = pclc->hdr.typev2;
1766 	ini->smcd_version = smcd_indicated(ini->smc_type_v1) ? SMC_V1 : 0;
1767 	ini->smcr_version = smcr_indicated(ini->smc_type_v1) ? SMC_V1 : 0;
1768 	if (pclc->hdr.version > SMC_V1) {
1769 		if (smcd_indicated(ini->smc_type_v2))
1770 			ini->smcd_version |= SMC_V2;
1771 		if (smcr_indicated(ini->smc_type_v2))
1772 			ini->smcr_version |= SMC_V2;
1773 	}
1774 	if (!(ini->smcd_version & SMC_V2) && !(ini->smcr_version & SMC_V2)) {
1775 		rc = SMC_CLC_DECL_PEERNOSMC;
1776 		goto out;
1777 	}
1778 	pclc_v2_ext = smc_get_clc_v2_ext(pclc);
1779 	if (!pclc_v2_ext) {
1780 		ini->smcd_version &= ~SMC_V2;
1781 		ini->smcr_version &= ~SMC_V2;
1782 		rc = SMC_CLC_DECL_NOV2EXT;
1783 		goto out;
1784 	}
1785 	pclc_smcd_v2_ext = smc_get_clc_smcd_v2_ext(pclc_v2_ext);
1786 	if (ini->smcd_version & SMC_V2) {
1787 		if (!smc_ism_is_v2_capable()) {
1788 			ini->smcd_version &= ~SMC_V2;
1789 			rc = SMC_CLC_DECL_NOISM2SUPP;
1790 		} else if (!pclc_smcd_v2_ext) {
1791 			ini->smcd_version &= ~SMC_V2;
1792 			rc = SMC_CLC_DECL_NOV2DEXT;
1793 		} else if (!pclc_v2_ext->hdr.eid_cnt &&
1794 			   !pclc_v2_ext->hdr.flag.seid) {
1795 			ini->smcd_version &= ~SMC_V2;
1796 			rc = SMC_CLC_DECL_NOUEID;
1797 		}
1798 	}
1799 	if (ini->smcr_version & SMC_V2) {
1800 		if (!pclc_v2_ext->hdr.eid_cnt) {
1801 			ini->smcr_version &= ~SMC_V2;
1802 			rc = SMC_CLC_DECL_NOUEID;
1803 		}
1804 	}
1805 
1806 out:
1807 	if (!ini->smcd_version && !ini->smcr_version)
1808 		return rc;
1809 
1810 	return 0;
1811 }
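
/*
 * Return convention of smc_listen_v2_check() above: the individual checks
 * only clear the affected version bit and keep the most recent decline
 * reason in rc; the function fails with that reason only when neither an
 * SMC-D nor an SMC-R version remains.
 */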
1812 
1813 /* listen worker: check prefixes */
1814 static int smc_listen_prfx_check(struct smc_sock *new_smc,
1815 				 struct smc_clc_msg_proposal *pclc)
1816 {
1817 	struct smc_clc_msg_proposal_prefix *pclc_prfx;
1818 	struct socket *newclcsock = new_smc->clcsock;
1819 
1820 	if (pclc->hdr.typev1 == SMC_TYPE_N)
1821 		return 0;
1822 	pclc_prfx = smc_clc_proposal_get_prefix(pclc);
1823 	if (smc_clc_prfx_match(newclcsock, pclc_prfx))
1824 		return SMC_CLC_DECL_DIFFPREFIX;
1825 
1826 	return 0;
1827 }
1828 
1829 /* listen worker: initialize connection and buffers */
1830 static int smc_listen_rdma_init(struct smc_sock *new_smc,
1831 				struct smc_init_info *ini)
1832 {
1833 	int rc;
1834 
1835 	/* allocate connection / link group */
1836 	rc = smc_conn_create(new_smc, ini);
1837 	if (rc)
1838 		return rc;
1839 
1840 	/* create send buffer and RMB (remote memory buffer) */
1841 	if (smc_buf_create(new_smc, false))
1842 		return SMC_CLC_DECL_MEM;
1843 
1844 	return 0;
1845 }
1846 
1847 /* listen worker: initialize connection and buffers for SMC-D */
1848 static int smc_listen_ism_init(struct smc_sock *new_smc,
1849 			       struct smc_init_info *ini)
1850 {
1851 	int rc;
1852 
1853 	rc = smc_conn_create(new_smc, ini);
1854 	if (rc)
1855 		return rc;
1856 
1857 	/* Create send and receive buffers */
1858 	rc = smc_buf_create(new_smc, true);
1859 	if (rc) {
1860 		smc_conn_abort(new_smc, ini->first_contact_local);
1861 		return (rc == -ENOSPC) ? SMC_CLC_DECL_MAX_DMB :
1862 					 SMC_CLC_DECL_MEM;
1863 	}
1864 
1865 	return 0;
1866 }
1867 
1868 static bool smc_is_already_selected(struct smcd_dev *smcd,
1869 				    struct smc_init_info *ini,
1870 				    int matches)
1871 {
1872 	int i;
1873 
1874 	for (i = 0; i < matches; i++)
1875 		if (smcd == ini->ism_dev[i])
1876 			return true;
1877 
1878 	return false;
1879 }
1880 
1881 /* check local ISM devices for a match with a proposed ISM device */
1882 static void smc_check_ism_v2_match(struct smc_init_info *ini,
1883 				   u16 proposed_chid, u64 proposed_gid,
1884 				   unsigned int *matches)
1885 {
1886 	struct smcd_dev *smcd;
1887 
1888 	list_for_each_entry(smcd, &smcd_dev_list.list, list) {
1889 		if (smcd->going_away)
1890 			continue;
1891 		if (smc_is_already_selected(smcd, ini, *matches))
1892 			continue;
1893 		if (smc_ism_get_chid(smcd) == proposed_chid &&
1894 		    !smc_ism_cantalk(proposed_gid, ISM_RESERVED_VLANID, smcd)) {
1895 			ini->ism_peer_gid[*matches] = proposed_gid;
1896 			ini->ism_dev[*matches] = smcd;
1897 			(*matches)++;
1898 			break;
1899 		}
1900 	}
1901 }
1902 
1903 static void smc_find_ism_store_rc(u32 rc, struct smc_init_info *ini)
1904 {
1905 	if (!ini->rc)
1906 		ini->rc = rc;
1907 }
1908 
1909 static void smc_find_ism_v2_device_serv(struct smc_sock *new_smc,
1910 					struct smc_clc_msg_proposal *pclc,
1911 					struct smc_init_info *ini)
1912 {
1913 	struct smc_clc_smcd_v2_extension *smcd_v2_ext;
1914 	struct smc_clc_v2_extension *smc_v2_ext;
1915 	struct smc_clc_msg_smcd *pclc_smcd;
1916 	unsigned int matches = 0;
1917 	u8 smcd_version;
1918 	u8 *eid = NULL;
1919 	int i, rc;
1920 
1921 	if (!(ini->smcd_version & SMC_V2) || !smcd_indicated(ini->smc_type_v2))
1922 		goto not_found;
1923 
1924 	pclc_smcd = smc_get_clc_msg_smcd(pclc);
1925 	smc_v2_ext = smc_get_clc_v2_ext(pclc);
1926 	smcd_v2_ext = smc_get_clc_smcd_v2_ext(smc_v2_ext);
1927 
1928 	mutex_lock(&smcd_dev_list.mutex);
1929 	if (pclc_smcd->ism.chid)
1930 		/* check for ISM device matching proposed native ISM device */
1931 		smc_check_ism_v2_match(ini, ntohs(pclc_smcd->ism.chid),
1932 				       ntohll(pclc_smcd->ism.gid), &matches);
1933 	for (i = 1; i <= smc_v2_ext->hdr.ism_gid_cnt; i++) {
1934 		/* check for ISM devices matching proposed non-native ISM
1935 		 * devices
1936 		 */
1937 		smc_check_ism_v2_match(ini,
1938 				       ntohs(smcd_v2_ext->gidchid[i - 1].chid),
1939 				       ntohll(smcd_v2_ext->gidchid[i - 1].gid),
1940 				       &matches);
1941 	}
1942 	mutex_unlock(&smcd_dev_list.mutex);
1943 
1944 	if (!ini->ism_dev[0]) {
1945 		smc_find_ism_store_rc(SMC_CLC_DECL_NOSMCD2DEV, ini);
1946 		goto not_found;
1947 	}
1948 
1949 	smc_ism_get_system_eid(&eid);
1950 	if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext,
1951 			       smcd_v2_ext->system_eid, eid))
1952 		goto not_found;
1953 
1954 	/* separate loop - runs outside the smcd_dev_list.mutex */
1955 	smcd_version = ini->smcd_version;
1956 	for (i = 0; i < matches; i++) {
1957 		ini->smcd_version = SMC_V2;
1958 		ini->is_smcd = true;
1959 		ini->ism_selected = i;
1960 		rc = smc_listen_ism_init(new_smc, ini);
1961 		if (rc) {
1962 			smc_find_ism_store_rc(rc, ini);
1963 			/* try next active ISM device */
1964 			continue;
1965 		}
1966 		return; /* matching and usable V2 ISM device found */
1967 	}
1968 	/* no V2 ISM device could be initialized */
1969 	ini->smcd_version = smcd_version;	/* restore original value */
1970 	ini->negotiated_eid[0] = 0;
1971 
1972 not_found:
1973 	ini->smcd_version &= ~SMC_V2;
1974 	ini->ism_dev[0] = NULL;
1975 	ini->is_smcd = false;
1976 }
1977 
1978 static void smc_find_ism_v1_device_serv(struct smc_sock *new_smc,
1979 					struct smc_clc_msg_proposal *pclc,
1980 					struct smc_init_info *ini)
1981 {
1982 	struct smc_clc_msg_smcd *pclc_smcd = smc_get_clc_msg_smcd(pclc);
1983 	int rc = 0;
1984 
1985 	/* check if ISM V1 is available */
1986 	if (!(ini->smcd_version & SMC_V1) || !smcd_indicated(ini->smc_type_v1))
1987 		goto not_found;
1988 	ini->is_smcd = true; /* prepare ISM check */
1989 	ini->ism_peer_gid[0] = ntohll(pclc_smcd->ism.gid);
1990 	rc = smc_find_ism_device(new_smc, ini);
1991 	if (rc)
1992 		goto not_found;
1993 	ini->ism_selected = 0;
1994 	rc = smc_listen_ism_init(new_smc, ini);
1995 	if (!rc)
1996 		return;		/* V1 ISM device found */
1997 
1998 not_found:
1999 	smc_find_ism_store_rc(rc, ini);
2000 	ini->smcd_version &= ~SMC_V1;
2001 	ini->ism_dev[0] = NULL;
2002 	ini->is_smcd = false;
2003 }
2004 
2005 /* listen worker: register buffers */
2006 static int smc_listen_rdma_reg(struct smc_sock *new_smc, bool local_first)
2007 {
2008 	struct smc_connection *conn = &new_smc->conn;
2009 
2010 	if (!local_first) {
2011 		if (smcr_lgr_reg_rmbs(conn->lnk, conn->rmb_desc))
2012 			return SMC_CLC_DECL_ERR_REGRMB;
2013 	}
2014 	smc_rmb_sync_sg_for_device(&new_smc->conn);
2015 
2016 	return 0;
2017 }
2018 
2019 static void smc_find_rdma_v2_device_serv(struct smc_sock *new_smc,
2020 					 struct smc_clc_msg_proposal *pclc,
2021 					 struct smc_init_info *ini)
2022 {
2023 	struct smc_clc_v2_extension *smc_v2_ext;
2024 	u8 smcr_version;
2025 	int rc;
2026 
2027 	if (!(ini->smcr_version & SMC_V2) || !smcr_indicated(ini->smc_type_v2))
2028 		goto not_found;
2029 
2030 	smc_v2_ext = smc_get_clc_v2_ext(pclc);
2031 	if (!smc_clc_match_eid(ini->negotiated_eid, smc_v2_ext, NULL, NULL))
2032 		goto not_found;
2033 
2034 	/* prepare RDMA check */
2035 	memcpy(ini->peer_systemid, pclc->lcl.id_for_peer, SMC_SYSTEMID_LEN);
2036 	memcpy(ini->peer_gid, smc_v2_ext->roce, SMC_GID_SIZE);
2037 	memcpy(ini->peer_mac, pclc->lcl.mac, ETH_ALEN);
2038 	ini->check_smcrv2 = true;
2039 	ini->smcrv2.clc_sk = new_smc->clcsock->sk;
2040 	ini->smcrv2.saddr = new_smc->clcsock->sk->sk_rcv_saddr;
2041 	ini->smcrv2.daddr = smc_ib_gid_to_ipv4(smc_v2_ext->roce);
2042 	rc = smc_find_rdma_device(new_smc, ini);
2043 	if (rc) {
2044 		smc_find_ism_store_rc(rc, ini);
2045 		goto not_found;
2046 	}
2047 	if (!ini->smcrv2.uses_gateway)
2048 		memcpy(ini->smcrv2.nexthop_mac, pclc->lcl.mac, ETH_ALEN);
2049 
2050 	smcr_version = ini->smcr_version;
2051 	ini->smcr_version = SMC_V2;
2052 	rc = smc_listen_rdma_init(new_smc, ini);
2053 	if (!rc)
2054 		rc = smc_listen_rdma_reg(new_smc, ini->first_contact_local);
2055 	if (!rc)
2056 		return;
2057 	ini->smcr_version = smcr_version;
2058 	smc_find_ism_store_rc(rc, ini);
2059 
2060 not_found:
2061 	ini->smcr_version &= ~SMC_V2;
2062 	ini->check_smcrv2 = false;
2063 }
2064 
2065 static int smc_find_rdma_v1_device_serv(struct smc_sock *new_smc,
2066 					struct smc_clc_msg_proposal *pclc,
2067 					struct smc_init_info *ini)
2068 {
2069 	int rc;
2070 
2071 	if (!(ini->smcr_version & SMC_V1) || !smcr_indicated(ini->smc_type_v1))
2072 		return SMC_CLC_DECL_NOSMCDEV;
2073 
2074 	/* prepare RDMA check */
2075 	memcpy(ini->peer_systemid, pclc->lcl.id_for_peer, SMC_SYSTEMID_LEN);
2076 	memcpy(ini->peer_gid, pclc->lcl.gid, SMC_GID_SIZE);
2077 	memcpy(ini->peer_mac, pclc->lcl.mac, ETH_ALEN);
2078 	rc = smc_find_rdma_device(new_smc, ini);
2079 	if (rc) {
2080 		/* no RDMA device found */
2081 		return SMC_CLC_DECL_NOSMCDEV;
2082 	}
2083 	rc = smc_listen_rdma_init(new_smc, ini);
2084 	if (rc)
2085 		return rc;
2086 	return smc_listen_rdma_reg(new_smc, ini->first_contact_local);
2087 }
2088 
2089 /* determine the local device matching the proposal */
2090 static int smc_listen_find_device(struct smc_sock *new_smc,
2091 				  struct smc_clc_msg_proposal *pclc,
2092 				  struct smc_init_info *ini)
2093 {
2094 	int prfx_rc;
2095 
2096 	/* check for ISM device matching V2 proposed device */
2097 	smc_find_ism_v2_device_serv(new_smc, pclc, ini);
2098 	if (ini->ism_dev[0])
2099 		return 0;
2100 
2101 	/* check for matching IP prefix and subnet length (V1) */
2102 	prfx_rc = smc_listen_prfx_check(new_smc, pclc);
2103 	if (prfx_rc)
2104 		smc_find_ism_store_rc(prfx_rc, ini);
2105 
2106 	/* get vlan id from IP device */
2107 	if (smc_vlan_by_tcpsk(new_smc->clcsock, ini))
2108 		return ini->rc ?: SMC_CLC_DECL_GETVLANERR;
2109 
2110 	/* check for ISM device matching V1 proposed device */
2111 	if (!prfx_rc)
2112 		smc_find_ism_v1_device_serv(new_smc, pclc, ini);
2113 	if (ini->ism_dev[0])
2114 		return 0;
2115 
2116 	if (!smcr_indicated(pclc->hdr.typev1) &&
2117 	    !smcr_indicated(pclc->hdr.typev2))
2118 		/* skip RDMA and decline */
2119 		return ini->rc ?: SMC_CLC_DECL_NOSMCDDEV;
2120 
2121 	/* check if RDMA V2 is available */
2122 	smc_find_rdma_v2_device_serv(new_smc, pclc, ini);
2123 	if (ini->smcrv2.ib_dev_v2)
2124 		return 0;
2125 
2126 	/* check if RDMA V1 is available */
2127 	if (!prfx_rc) {
2128 		int rc;
2129 
2130 		rc = smc_find_rdma_v1_device_serv(new_smc, pclc, ini);
2131 		smc_find_ism_store_rc(rc, ini);
2132 		return (!rc) ? 0 : ini->rc;
2133 	}
2134 	return SMC_CLC_DECL_NOSMCDEV;
2135 }
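
/* Selection order implemented above, summarized: ISM v2 first, then ISM v1
 * (gated by the prefix and VLAN checks), then SMC-R v2, then SMC-R v1; the
 * first stored non-zero reason code is returned once all of them fail.
 */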
2136 
2137 /* listen worker: finish RDMA setup */
2138 static int smc_listen_rdma_finish(struct smc_sock *new_smc,
2139 				  struct smc_clc_msg_accept_confirm *cclc,
2140 				  bool local_first,
2141 				  struct smc_init_info *ini)
2142 {
2143 	struct smc_link *link = new_smc->conn.lnk;
2144 	int reason_code = 0;
2145 
2146 	if (local_first)
2147 		smc_link_save_peer_info(link, cclc, ini);
2148 
2149 	if (smc_rmb_rtoken_handling(&new_smc->conn, link, cclc))
2150 		return SMC_CLC_DECL_ERR_RTOK;
2151 
2152 	if (local_first) {
2153 		if (smc_ib_ready_link(link))
2154 			return SMC_CLC_DECL_ERR_RDYLNK;
2155 		/* QP confirmation over RoCE fabric */
2156 		smc_llc_flow_initiate(link->lgr, SMC_LLC_FLOW_ADD_LINK);
2157 		reason_code = smcr_serv_conf_first_link(new_smc);
2158 		smc_llc_flow_stop(link->lgr, &link->lgr->llc_flow_lcl);
2159 	}
2160 	return reason_code;
2161 }
2162 
2163 /* set up the connection on the server side */
2164 static void smc_listen_work(struct work_struct *work)
2165 {
2166 	struct smc_sock *new_smc = container_of(work, struct smc_sock,
2167 						smc_listen_work);
2168 	struct socket *newclcsock = new_smc->clcsock;
2169 	struct smc_clc_msg_accept_confirm *cclc;
2170 	struct smc_clc_msg_proposal_area *buf;
2171 	struct smc_clc_msg_proposal *pclc;
2172 	struct smc_init_info *ini = NULL;
2173 	u8 proposal_version = SMC_V1;
2174 	u8 accept_version;
2175 	int rc = 0;
2176 
2177 	if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN)
2178 		return smc_listen_out_err(new_smc);
2179 
2180 	if (new_smc->use_fallback) {
2181 		smc_listen_out_connected(new_smc);
2182 		return;
2183 	}
2184 
2185 	/* check if peer is smc capable */
2186 	if (!tcp_sk(newclcsock->sk)->syn_smc) {
2187 		rc = smc_switch_to_fallback(new_smc, SMC_CLC_DECL_PEERNOSMC);
2188 		if (rc)
2189 			smc_listen_out_err(new_smc);
2190 		else
2191 			smc_listen_out_connected(new_smc);
2192 		return;
2193 	}
2194 
2195 	/* do in-band token exchange -
2196 	 * wait for and receive the SMC Proposal CLC message
2197 	 */
2198 	buf = kzalloc(sizeof(*buf), GFP_KERNEL);
2199 	if (!buf) {
2200 		rc = SMC_CLC_DECL_MEM;
2201 		goto out_decl;
2202 	}
2203 	pclc = (struct smc_clc_msg_proposal *)buf;
2204 	rc = smc_clc_wait_msg(new_smc, pclc, sizeof(*buf),
2205 			      SMC_CLC_PROPOSAL, CLC_WAIT_TIME);
2206 	if (rc)
2207 		goto out_decl;
2208 
2209 	if (pclc->hdr.version > SMC_V1)
2210 		proposal_version = SMC_V2;
2211 
2212 	/* IPSec connections opt out of SMC optimizations */
2213 	if (using_ipsec(new_smc)) {
2214 		rc = SMC_CLC_DECL_IPSEC;
2215 		goto out_decl;
2216 	}
2217 
2218 	ini = kzalloc(sizeof(*ini), GFP_KERNEL);
2219 	if (!ini) {
2220 		rc = SMC_CLC_DECL_MEM;
2221 		goto out_decl;
2222 	}
2223 
2224 	/* initial version checking */
2225 	rc = smc_listen_v2_check(new_smc, pclc, ini);
2226 	if (rc)
2227 		goto out_decl;
2228 
2229 	mutex_lock(&smc_server_lgr_pending);
2230 	smc_close_init(new_smc);
2231 	smc_rx_init(new_smc);
2232 	smc_tx_init(new_smc);
2233 
2234 	/* determine ISM or RoCE device used for connection */
2235 	rc = smc_listen_find_device(new_smc, pclc, ini);
2236 	if (rc)
2237 		goto out_unlock;
2238 
2239 	/* send SMC Accept CLC message */
2240 	accept_version = ini->is_smcd ? ini->smcd_version : ini->smcr_version;
2241 	rc = smc_clc_send_accept(new_smc, ini->first_contact_local,
2242 				 accept_version, ini->negotiated_eid);
2243 	if (rc)
2244 		goto out_unlock;
2245 
2246 	/* SMC-D does not need this lock any more */
2247 	if (ini->is_smcd)
2248 		mutex_unlock(&smc_server_lgr_pending);
2249 
2250 	/* receive SMC Confirm CLC message */
2251 	memset(buf, 0, sizeof(*buf));
2252 	cclc = (struct smc_clc_msg_accept_confirm *)buf;
2253 	rc = smc_clc_wait_msg(new_smc, cclc, sizeof(*buf),
2254 			      SMC_CLC_CONFIRM, CLC_WAIT_TIME);
2255 	if (rc) {
2256 		if (!ini->is_smcd)
2257 			goto out_unlock;
2258 		goto out_decl;
2259 	}
2260 
2261 	/* finish worker */
2262 	if (!ini->is_smcd) {
2263 		rc = smc_listen_rdma_finish(new_smc, cclc,
2264 					    ini->first_contact_local, ini);
2265 		if (rc)
2266 			goto out_unlock;
2267 		mutex_unlock(&smc_server_lgr_pending);
2268 	}
2269 	smc_conn_save_peer_info(new_smc, cclc);
2270 	smc_listen_out_connected(new_smc);
2271 	SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini);
2272 	goto out_free;
2273 
2274 out_unlock:
2275 	mutex_unlock(&smc_server_lgr_pending);
2276 out_decl:
2277 	smc_listen_decline(new_smc, rc, ini ? ini->first_contact_local : 0,
2278 			   proposal_version);
2279 out_free:
2280 	kfree(ini);
2281 	kfree(buf);
2282 }
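
/* Server-side CLC handshake as driven by smc_listen_work(), summarized:
 *
 *	client                            server
 *	  | ---- SMC Proposal CLC ------> |  smc_clc_wait_msg(SMC_CLC_PROPOSAL)
 *	  |                               |  smc_listen_find_device()
 *	  | <----- SMC Accept CLC ------- |  smc_clc_send_accept()
 *	  | ---- SMC Confirm CLC -------> |  smc_clc_wait_msg(SMC_CLC_CONFIRM)
 *	  |                               |  smc_listen_rdma_finish() (SMC-R)
 *
 * Any failure funnels into smc_listen_decline(), which falls back to TCP
 * whenever the reason code permits it.
 */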
2283 
2284 static void smc_tcp_listen_work(struct work_struct *work)
2285 {
2286 	struct smc_sock *lsmc = container_of(work, struct smc_sock,
2287 					     tcp_listen_work);
2288 	struct sock *lsk = &lsmc->sk;
2289 	struct smc_sock *new_smc;
2290 	int rc = 0;
2291 
2292 	lock_sock(lsk);
2293 	while (lsk->sk_state == SMC_LISTEN) {
2294 		rc = smc_clcsock_accept(lsmc, &new_smc);
2295 		if (rc) /* clcsock accept queue empty or error */
2296 			goto out;
2297 		if (!new_smc)
2298 			continue;
2299 
2300 		if (tcp_sk(new_smc->clcsock->sk)->syn_smc)
2301 			atomic_inc(&lsmc->queued_smc_hs);
2302 
2303 		new_smc->listen_smc = lsmc;
2304 		new_smc->use_fallback = lsmc->use_fallback;
2305 		new_smc->fallback_rsn = lsmc->fallback_rsn;
2306 		sock_hold(lsk); /* sock_put in smc_listen_work */
2307 		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
2308 		smc_copy_sock_settings_to_smc(new_smc);
2309 		new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf;
2310 		new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf;
2311 		sock_hold(&new_smc->sk); /* sock_put in passive closing */
2312 		if (!queue_work(smc_hs_wq, &new_smc->smc_listen_work))
2313 			sock_put(&new_smc->sk);
2314 	}
2315 
2316 out:
2317 	release_sock(lsk);
2318 	sock_put(&lsmc->sk); /* sock_hold in smc_clcsock_data_ready() */
2319 }
2320 
2321 static void smc_clcsock_data_ready(struct sock *listen_clcsock)
2322 {
2323 	struct smc_sock *lsmc =
2324 		smc_clcsock_user_data(listen_clcsock);
2325 
2326 	if (!lsmc)
2327 		return;
2328 	lsmc->clcsk_data_ready(listen_clcsock);
2329 	if (lsmc->sk.sk_state == SMC_LISTEN) {
2330 		sock_hold(&lsmc->sk); /* sock_put in smc_tcp_listen_work() */
2331 		if (!queue_work(smc_tcp_ls_wq, &lsmc->tcp_listen_work))
2332 			sock_put(&lsmc->sk);
2333 	}
2334 }
2335 
2336 static int smc_listen(struct socket *sock, int backlog)
2337 {
2338 	struct sock *sk = sock->sk;
2339 	struct smc_sock *smc;
2340 	int rc;
2341 
2342 	smc = smc_sk(sk);
2343 	lock_sock(sk);
2344 
2345 	rc = -EINVAL;
2346 	if ((sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) ||
2347 	    smc->connect_nonblock)
2348 		goto out;
2349 
2350 	rc = 0;
2351 	if (sk->sk_state == SMC_LISTEN) {
2352 		sk->sk_max_ack_backlog = backlog;
2353 		goto out;
2354 	}
2355 	/* some socket options are handled in core, so we cannot apply
2356 	 * them to the clc socket -- copy smc socket options to clc socket
2357 	 */
2358 	smc_copy_sock_settings_to_clc(smc);
2359 	if (!smc->use_fallback)
2360 		tcp_sk(smc->clcsock->sk)->syn_smc = 1;
2361 
2362 	/* save original sk_data_ready function and establish
2363 	 * smc-specific sk_data_ready function
2364 	 */
2365 	smc->clcsk_data_ready = smc->clcsock->sk->sk_data_ready;
2366 	smc->clcsock->sk->sk_data_ready = smc_clcsock_data_ready;
2367 	smc->clcsock->sk->sk_user_data =
2368 		(void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY);
2369 
2370 	/* save original ops */
2371 	smc->ori_af_ops = inet_csk(smc->clcsock->sk)->icsk_af_ops;
2372 
2373 	smc->af_ops = *smc->ori_af_ops;
2374 	smc->af_ops.syn_recv_sock = smc_tcp_syn_recv_sock;
2375 
2376 	inet_csk(smc->clcsock->sk)->icsk_af_ops = &smc->af_ops;
2377 
2378 	if (smc->limit_smc_hs)
2379 		tcp_sk(smc->clcsock->sk)->smc_hs_congested = smc_hs_congested;
2380 
2381 	rc = kernel_listen(smc->clcsock, backlog);
2382 	if (rc) {
2383 		smc->clcsock->sk->sk_data_ready = smc->clcsk_data_ready;
2384 		goto out;
2385 	}
2386 	sk->sk_max_ack_backlog = backlog;
2387 	sk->sk_ack_backlog = 0;
2388 	sk->sk_state = SMC_LISTEN;
2389 
2390 out:
2391 	release_sock(sk);
2392 	return rc;
2393 }
2394 
2395 static int smc_accept(struct socket *sock, struct socket *new_sock,
2396 		      int flags, bool kern)
2397 {
2398 	struct sock *sk = sock->sk, *nsk;
2399 	DECLARE_WAITQUEUE(wait, current);
2400 	struct smc_sock *lsmc;
2401 	long timeo;
2402 	int rc = 0;
2403 
2404 	lsmc = smc_sk(sk);
2405 	sock_hold(sk); /* sock_put below */
2406 	lock_sock(sk);
2407 
2408 	if (lsmc->sk.sk_state != SMC_LISTEN) {
2409 		rc = -EINVAL;
2410 		release_sock(sk);
2411 		goto out;
2412 	}
2413 
2414 	/* Wait for an incoming connection */
2415 	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
2416 	add_wait_queue_exclusive(sk_sleep(sk), &wait);
2417 	while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
2418 		set_current_state(TASK_INTERRUPTIBLE);
2419 		if (!timeo) {
2420 			rc = -EAGAIN;
2421 			break;
2422 		}
2423 		release_sock(sk);
2424 		timeo = schedule_timeout(timeo);
2425 		/* woken up by sk_data_ready in smc_listen_work() */
2426 		sched_annotate_sleep();
2427 		lock_sock(sk);
2428 		if (signal_pending(current)) {
2429 			rc = sock_intr_errno(timeo);
2430 			break;
2431 		}
2432 	}
2433 	set_current_state(TASK_RUNNING);
2434 	remove_wait_queue(sk_sleep(sk), &wait);
2435 
2436 	if (!rc)
2437 		rc = sock_error(nsk);
2438 	release_sock(sk);
2439 	if (rc)
2440 		goto out;
2441 
2442 	if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
2443 		/* wait till data arrives on the socket */
2444 		timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
2445 								MSEC_PER_SEC);
2446 		if (smc_sk(nsk)->use_fallback) {
2447 			struct sock *clcsk = smc_sk(nsk)->clcsock->sk;
2448 
2449 			lock_sock(clcsk);
2450 			if (skb_queue_empty(&clcsk->sk_receive_queue))
2451 				sk_wait_data(clcsk, &timeo, NULL);
2452 			release_sock(clcsk);
2453 		} else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
2454 			lock_sock(nsk);
2455 			smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
2456 			release_sock(nsk);
2457 		}
2458 	}
2459 
2460 out:
2461 	sock_put(sk); /* sock_hold above */
2462 	return rc;
2463 }
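
/* Hedged userspace sketch (not part of this file): the deferred-accept wait
 * above is armed with plain TCP_DEFER_ACCEPT on the listening socket, the
 * value being taken as seconds (see smc_setsockopt() below):
 *
 *	int secs = 5;
 *	setsockopt(lfd, IPPROTO_TCP, TCP_DEFER_ACCEPT, &secs, sizeof(secs));
 */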
2464 
2465 static int smc_getname(struct socket *sock, struct sockaddr *addr,
2466 		       int peer)
2467 {
2468 	struct smc_sock *smc;
2469 
2470 	if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
2471 	    (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
2472 		return -ENOTCONN;
2473 
2474 	smc = smc_sk(sock->sk);
2475 
2476 	return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
2477 }
2478 
2479 static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
2480 {
2481 	struct sock *sk = sock->sk;
2482 	struct smc_sock *smc;
2483 	int rc = -EPIPE;
2484 
2485 	smc = smc_sk(sk);
2486 	lock_sock(sk);
2487 	if ((sk->sk_state != SMC_ACTIVE) &&
2488 	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
2489 	    (sk->sk_state != SMC_INIT))
2490 		goto out;
2491 
2492 	if (msg->msg_flags & MSG_FASTOPEN) {
2493 		if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) {
2494 			rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP);
2495 			if (rc)
2496 				goto out;
2497 		} else {
2498 			rc = -EINVAL;
2499 			goto out;
2500 		}
2501 	}
2502 
2503 	if (smc->use_fallback) {
2504 		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
2505 	} else {
2506 		rc = smc_tx_sendmsg(smc, msg, len);
2507 		SMC_STAT_TX_PAYLOAD(smc, len, rc);
2508 	}
2509 out:
2510 	release_sock(sk);
2511 	return rc;
2512 }
2513 
2514 static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
2515 		       int flags)
2516 {
2517 	struct sock *sk = sock->sk;
2518 	struct smc_sock *smc;
2519 	int rc = -ENOTCONN;
2520 
2521 	smc = smc_sk(sk);
2522 	lock_sock(sk);
2523 	if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
2524 		/* socket was connected before, no more data to read */
2525 		rc = 0;
2526 		goto out;
2527 	}
2528 	if ((sk->sk_state == SMC_INIT) ||
2529 	    (sk->sk_state == SMC_LISTEN) ||
2530 	    (sk->sk_state == SMC_CLOSED))
2531 		goto out;
2532 
2533 	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
2534 		rc = 0;
2535 		goto out;
2536 	}
2537 
2538 	if (smc->use_fallback) {
2539 		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
2540 	} else {
2541 		msg->msg_namelen = 0;
2542 		rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
2543 		SMC_STAT_RX_PAYLOAD(smc, rc, rc);
2544 	}
2545 
2546 out:
2547 	release_sock(sk);
2548 	return rc;
2549 }
2550 
2551 static __poll_t smc_accept_poll(struct sock *parent)
2552 {
2553 	struct smc_sock *isk = smc_sk(parent);
2554 	__poll_t mask = 0;
2555 
2556 	spin_lock(&isk->accept_q_lock);
2557 	if (!list_empty(&isk->accept_q))
2558 		mask = EPOLLIN | EPOLLRDNORM;
2559 	spin_unlock(&isk->accept_q_lock);
2560 
2561 	return mask;
2562 }
2563 
2564 static __poll_t smc_poll(struct file *file, struct socket *sock,
2565 			     poll_table *wait)
2566 {
2567 	struct sock *sk = sock->sk;
2568 	struct smc_sock *smc;
2569 	__poll_t mask = 0;
2570 
2571 	if (!sk)
2572 		return EPOLLNVAL;
2573 
2574 	smc = smc_sk(sock->sk);
2575 	if (smc->use_fallback) {
2576 		/* delegate to CLC child sock */
2577 		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
2578 		sk->sk_err = smc->clcsock->sk->sk_err;
2579 	} else {
2580 		if (sk->sk_state != SMC_CLOSED)
2581 			sock_poll_wait(file, sock, wait);
2582 		if (sk->sk_err)
2583 			mask |= EPOLLERR;
2584 		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
2585 		    (sk->sk_state == SMC_CLOSED))
2586 			mask |= EPOLLHUP;
2587 		if (sk->sk_state == SMC_LISTEN) {
2588 			/* woken up by sk_data_ready in smc_listen_work() */
2589 			mask |= smc_accept_poll(sk);
2590 		} else if (smc->use_fallback) { /* as a result of connect_work() */
2591 			mask |= smc->clcsock->ops->poll(file, smc->clcsock,
2592 							   wait);
2593 			sk->sk_err = smc->clcsock->sk->sk_err;
2594 		} else {
2595 			if ((sk->sk_state != SMC_INIT &&
2596 			     atomic_read(&smc->conn.sndbuf_space)) ||
2597 			    sk->sk_shutdown & SEND_SHUTDOWN) {
2598 				mask |= EPOLLOUT | EPOLLWRNORM;
2599 			} else {
2600 				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
2601 				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
2602 			}
2603 			if (atomic_read(&smc->conn.bytes_to_rcv))
2604 				mask |= EPOLLIN | EPOLLRDNORM;
2605 			if (sk->sk_shutdown & RCV_SHUTDOWN)
2606 				mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
2607 			if (sk->sk_state == SMC_APPCLOSEWAIT1)
2608 				mask |= EPOLLIN;
2609 			if (smc->conn.urg_state == SMC_URG_VALID)
2610 				mask |= EPOLLPRI;
2611 		}
2612 	}
2613 
2614 	return mask;
2615 }
2616 
2617 static int smc_shutdown(struct socket *sock, int how)
2618 {
2619 	struct sock *sk = sock->sk;
2620 	bool do_shutdown = true;
2621 	struct smc_sock *smc;
2622 	int rc = -EINVAL;
2623 	int old_state;
2624 	int rc1 = 0;
2625 
2626 	smc = smc_sk(sk);
2627 
2628 	if ((how < SHUT_RD) || (how > SHUT_RDWR))
2629 		return rc;
2630 
2631 	lock_sock(sk);
2632 
2633 	rc = -ENOTCONN;
2634 	if ((sk->sk_state != SMC_ACTIVE) &&
2635 	    (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
2636 	    (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
2637 	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
2638 	    (sk->sk_state != SMC_APPCLOSEWAIT2) &&
2639 	    (sk->sk_state != SMC_APPFINCLOSEWAIT))
2640 		goto out;
2641 	if (smc->use_fallback) {
2642 		rc = kernel_sock_shutdown(smc->clcsock, how);
2643 		sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
2644 		if (sk->sk_shutdown == SHUTDOWN_MASK)
2645 			sk->sk_state = SMC_CLOSED;
2646 		goto out;
2647 	}
2648 	switch (how) {
2649 	case SHUT_RDWR:		/* shutdown in both directions */
2650 		old_state = sk->sk_state;
2651 		rc = smc_close_active(smc);
2652 		if (old_state == SMC_ACTIVE &&
2653 		    sk->sk_state == SMC_PEERCLOSEWAIT1)
2654 			do_shutdown = false;
2655 		break;
2656 	case SHUT_WR:
2657 		rc = smc_close_shutdown_write(smc);
2658 		break;
2659 	case SHUT_RD:
2660 		rc = 0;
2661 		/* nothing more to do because peer is not involved */
2662 		break;
2663 	}
2664 	if (do_shutdown && smc->clcsock)
2665 		rc1 = kernel_sock_shutdown(smc->clcsock, how);
2666 	/* map sock_shutdown_cmd constants to sk_shutdown value range */
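	/* e.g. SHUT_RD (0) -> RCV_SHUTDOWN (1), SHUT_WR (1) -> SEND_SHUTDOWN (2),
	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
	 */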
2667 	sk->sk_shutdown |= how + 1;
2668 
2669 out:
2670 	release_sock(sk);
2671 	return rc ? rc : rc1;
2672 }
2673 
2674 static int __smc_getsockopt(struct socket *sock, int level, int optname,
2675 			    char __user *optval, int __user *optlen)
2676 {
2677 	struct smc_sock *smc;
2678 	int val, len;
2679 
2680 	smc = smc_sk(sock->sk);
2681 
2682 	if (get_user(len, optlen))
2683 		return -EFAULT;
2684 
2685 	len = min_t(int, len, sizeof(int));
2686 
2687 	if (len < 0)
2688 		return -EINVAL;
2689 
2690 	switch (optname) {
2691 	case SMC_LIMIT_HS:
2692 		val = smc->limit_smc_hs;
2693 		break;
2694 	default:
2695 		return -EOPNOTSUPP;
2696 	}
2697 
2698 	if (put_user(len, optlen))
2699 		return -EFAULT;
2700 	if (copy_to_user(optval, &val, len))
2701 		return -EFAULT;
2702 
2703 	return 0;
2704 }
2705 
2706 static int __smc_setsockopt(struct socket *sock, int level, int optname,
2707 			    sockptr_t optval, unsigned int optlen)
2708 {
2709 	struct sock *sk = sock->sk;
2710 	struct smc_sock *smc;
2711 	int val, rc;
2712 
2713 	smc = smc_sk(sk);
2714 
2715 	lock_sock(sk);
2716 	switch (optname) {
2717 	case SMC_LIMIT_HS:
2718 		if (optlen < sizeof(int)) {
2719 			rc = -EINVAL;
2720 			break;
2721 		}
2722 		if (copy_from_sockptr(&val, optval, sizeof(int))) {
2723 			rc = -EFAULT;
2724 			break;
2725 		}
2726 
2727 		smc->limit_smc_hs = !!val;
2728 		rc = 0;
2729 		break;
2730 	default:
2731 		rc = -EOPNOTSUPP;
2732 		break;
2733 	}
2734 	release_sock(sk);
2735 
2736 	return rc;
2737 }
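
/* Hedged userspace sketch (not part of this file): exercising the SOL_SMC
 * handlers above, assuming the uapi headers expose SOL_SMC and SMC_LIMIT_HS:
 *
 *	int one = 1;
 *	if (setsockopt(fd, SOL_SMC, SMC_LIMIT_HS, &one, sizeof(one)) < 0)
 *		perror("setsockopt(SMC_LIMIT_HS)");
 */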
2738 
2739 static int smc_setsockopt(struct socket *sock, int level, int optname,
2740 			  sockptr_t optval, unsigned int optlen)
2741 {
2742 	struct sock *sk = sock->sk;
2743 	struct smc_sock *smc;
2744 	int val, rc;
2745 
2746 	if (level == SOL_TCP && optname == TCP_ULP)
2747 		return -EOPNOTSUPP;
2748 	else if (level == SOL_SMC)
2749 		return __smc_setsockopt(sock, level, optname, optval, optlen);
2750 
2751 	smc = smc_sk(sk);
2752 
2753 	/* generic setsockopts reaching us here always apply to the
2754 	 * CLC socket
2755 	 */
2756 	mutex_lock(&smc->clcsock_release_lock);
2757 	if (!smc->clcsock) {
2758 		mutex_unlock(&smc->clcsock_release_lock);
2759 		return -EBADF;
2760 	}
2761 	if (unlikely(!smc->clcsock->ops->setsockopt))
2762 		rc = -EOPNOTSUPP;
2763 	else
2764 		rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
2765 						   optval, optlen);
2766 	if (smc->clcsock->sk->sk_err) {
2767 		sk->sk_err = smc->clcsock->sk->sk_err;
2768 		sk_error_report(sk);
2769 	}
2770 	mutex_unlock(&smc->clcsock_release_lock);
2771 
2772 	if (optlen < sizeof(int))
2773 		return -EINVAL;
2774 	if (copy_from_sockptr(&val, optval, sizeof(int)))
2775 		return -EFAULT;
2776 
2777 	lock_sock(sk);
2778 	if (rc || smc->use_fallback)
2779 		goto out;
2780 	switch (optname) {
2781 	case TCP_FASTOPEN:
2782 	case TCP_FASTOPEN_CONNECT:
2783 	case TCP_FASTOPEN_KEY:
2784 	case TCP_FASTOPEN_NO_COOKIE:
2785 		/* option not supported by SMC */
2786 		if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) {
2787 			rc = smc_switch_to_fallback(smc, SMC_CLC_DECL_OPTUNSUPP);
2788 		} else {
2789 			rc = -EINVAL;
2790 		}
2791 		break;
2792 	case TCP_NODELAY:
2793 		if (sk->sk_state != SMC_INIT &&
2794 		    sk->sk_state != SMC_LISTEN &&
2795 		    sk->sk_state != SMC_CLOSED) {
2796 			if (val) {
2797 				SMC_STAT_INC(smc, ndly_cnt);
2798 				mod_delayed_work(smc->conn.lgr->tx_wq,
2799 						 &smc->conn.tx_work, 0);
2800 			}
2801 		}
2802 		break;
2803 	case TCP_CORK:
2804 		if (sk->sk_state != SMC_INIT &&
2805 		    sk->sk_state != SMC_LISTEN &&
2806 		    sk->sk_state != SMC_CLOSED) {
2807 			if (!val) {
2808 				SMC_STAT_INC(smc, cork_cnt);
2809 				smc_tx_pending(&smc->conn);
2810 				cancel_delayed_work(&smc->conn.tx_work);
2811 			}
2812 		}
2813 		break;
2814 	case TCP_DEFER_ACCEPT:
2815 		smc->sockopt_defer_accept = val;
2816 		break;
2817 	default:
2818 		break;
2819 	}
2820 out:
2821 	release_sock(sk);
2822 
2823 	return rc;
2824 }
2825 
2826 static int smc_getsockopt(struct socket *sock, int level, int optname,
2827 			  char __user *optval, int __user *optlen)
2828 {
2829 	struct smc_sock *smc;
2830 	int rc;
2831 
2832 	if (level == SOL_SMC)
2833 		return __smc_getsockopt(sock, level, optname, optval, optlen);
2834 
2835 	smc = smc_sk(sock->sk);
2836 	mutex_lock(&smc->clcsock_release_lock);
2837 	if (!smc->clcsock) {
2838 		mutex_unlock(&smc->clcsock_release_lock);
2839 		return -EBADF;
2840 	}
2841 	/* socket options apply to the CLC socket */
2842 	if (unlikely(!smc->clcsock->ops->getsockopt)) {
2843 		mutex_unlock(&smc->clcsock_release_lock);
2844 		return -EOPNOTSUPP;
2845 	}
2846 	rc = smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
2847 					   optval, optlen);
2848 	mutex_unlock(&smc->clcsock_release_lock);
2849 	return rc;
2850 }
2851 
2852 static int smc_ioctl(struct socket *sock, unsigned int cmd,
2853 		     unsigned long arg)
2854 {
2855 	union smc_host_cursor cons, urg;
2856 	struct smc_connection *conn;
2857 	struct smc_sock *smc;
2858 	int answ;
2859 
2860 	smc = smc_sk(sock->sk);
2861 	conn = &smc->conn;
2862 	lock_sock(&smc->sk);
2863 	if (smc->use_fallback) {
2864 		if (!smc->clcsock) {
2865 			release_sock(&smc->sk);
2866 			return -EBADF;
2867 		}
2868 		answ = smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
2869 		release_sock(&smc->sk);
2870 		return answ;
2871 	}
2872 	switch (cmd) {
2873 	case SIOCINQ: /* same as FIONREAD */
2874 		if (smc->sk.sk_state == SMC_LISTEN) {
2875 			release_sock(&smc->sk);
2876 			return -EINVAL;
2877 		}
2878 		if (smc->sk.sk_state == SMC_INIT ||
2879 		    smc->sk.sk_state == SMC_CLOSED)
2880 			answ = 0;
2881 		else
2882 			answ = atomic_read(&smc->conn.bytes_to_rcv);
2883 		break;
2884 	case SIOCOUTQ:
2885 		/* output queue size (not sent + not acked) */
2886 		if (smc->sk.sk_state == SMC_LISTEN) {
2887 			release_sock(&smc->sk);
2888 			return -EINVAL;
2889 		}
2890 		if (smc->sk.sk_state == SMC_INIT ||
2891 		    smc->sk.sk_state == SMC_CLOSED)
2892 			answ = 0;
2893 		else
2894 			answ = smc->conn.sndbuf_desc->len -
2895 					atomic_read(&smc->conn.sndbuf_space);
2896 		break;
2897 	case SIOCOUTQNSD:
2898 		/* output queue size (not yet sent only) */
2899 		if (smc->sk.sk_state == SMC_LISTEN) {
2900 			release_sock(&smc->sk);
2901 			return -EINVAL;
2902 		}
2903 		if (smc->sk.sk_state == SMC_INIT ||
2904 		    smc->sk.sk_state == SMC_CLOSED)
2905 			answ = 0;
2906 		else
2907 			answ = smc_tx_prepared_sends(&smc->conn);
2908 		break;
2909 	case SIOCATMARK:
2910 		if (smc->sk.sk_state == SMC_LISTEN) {
2911 			release_sock(&smc->sk);
2912 			return -EINVAL;
2913 		}
2914 		if (smc->sk.sk_state == SMC_INIT ||
2915 		    smc->sk.sk_state == SMC_CLOSED) {
2916 			answ = 0;
2917 		} else {
2918 			smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
2919 			smc_curs_copy(&urg, &conn->urg_curs, conn);
2920 			answ = smc_curs_diff(conn->rmb_desc->len,
2921 					     &cons, &urg) == 1;
2922 		}
2923 		break;
2924 	default:
2925 		release_sock(&smc->sk);
2926 		return -ENOIOCTLCMD;
2927 	}
2928 	release_sock(&smc->sk);
2929 
2930 	return put_user(answ, (int __user *)arg);
2931 }
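
/* Hedged userspace sketch (not part of this file): the SIOCINQ/SIOCOUTQ
 * handlers above behave like their TCP counterparts:
 *
 *	int pending = 0;
 *	if (ioctl(fd, SIOCINQ, &pending) == 0)
 *		printf("%d bytes readable\n", pending);
 */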
2932 
2933 static ssize_t smc_sendpage(struct socket *sock, struct page *page,
2934 			    int offset, size_t size, int flags)
2935 {
2936 	struct sock *sk = sock->sk;
2937 	struct smc_sock *smc;
2938 	int rc = -EPIPE;
2939 
2940 	smc = smc_sk(sk);
2941 	lock_sock(sk);
2942 	if (sk->sk_state != SMC_ACTIVE) {
2943 		release_sock(sk);
2944 		goto out;
2945 	}
2946 	release_sock(sk);
2947 	if (smc->use_fallback) {
2948 		rc = kernel_sendpage(smc->clcsock, page, offset,
2949 				     size, flags);
2950 	} else {
2951 		lock_sock(sk);
2952 		rc = smc_tx_sendpage(smc, page, offset, size, flags);
2953 		release_sock(sk);
2954 		SMC_STAT_INC(smc, sendpage_cnt);
2955 	}
2956 
2957 out:
2958 	return rc;
2959 }
2960 
2961 /* Map the affected portions of the RMBE into an spd, note the number of
2962  * bytes to splice in conn->splice_pending, and press 'go'. Consumer cursor
2963  * updates are delayed until the respective page has been fully processed.
2964  * Note that subsequent recv() calls have to wait until all splice()
2965  * processing has completed.
2966  */
2967 static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
2968 			       struct pipe_inode_info *pipe, size_t len,
2969 			       unsigned int flags)
2970 {
2971 	struct sock *sk = sock->sk;
2972 	struct smc_sock *smc;
2973 	int rc = -ENOTCONN;
2974 
2975 	smc = smc_sk(sk);
2976 	lock_sock(sk);
2977 	if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
2978 		/* socket was connected before, no more data to read */
2979 		rc = 0;
2980 		goto out;
2981 	}
2982 	if (sk->sk_state == SMC_INIT ||
2983 	    sk->sk_state == SMC_LISTEN ||
2984 	    sk->sk_state == SMC_CLOSED)
2985 		goto out;
2986 
2987 	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
2988 		rc = 0;
2989 		goto out;
2990 	}
2991 
2992 	if (smc->use_fallback) {
2993 		rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
2994 						    pipe, len, flags);
2995 	} else {
2996 		if (*ppos) {
2997 			rc = -ESPIPE;
2998 			goto out;
2999 		}
3000 		if (flags & SPLICE_F_NONBLOCK)
3001 			flags = MSG_DONTWAIT;
3002 		else
3003 			flags = 0;
3004 		SMC_STAT_INC(smc, splice_cnt);
3005 		rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
3006 	}
3007 out:
3008 	release_sock(sk);
3009 
3010 	return rc;
3011 }
3012 
3013 /* must look like tcp */
3014 static const struct proto_ops smc_sock_ops = {
3015 	.family		= PF_SMC,
3016 	.owner		= THIS_MODULE,
3017 	.release	= smc_release,
3018 	.bind		= smc_bind,
3019 	.connect	= smc_connect,
3020 	.socketpair	= sock_no_socketpair,
3021 	.accept		= smc_accept,
3022 	.getname	= smc_getname,
3023 	.poll		= smc_poll,
3024 	.ioctl		= smc_ioctl,
3025 	.listen		= smc_listen,
3026 	.shutdown	= smc_shutdown,
3027 	.setsockopt	= smc_setsockopt,
3028 	.getsockopt	= smc_getsockopt,
3029 	.sendmsg	= smc_sendmsg,
3030 	.recvmsg	= smc_recvmsg,
3031 	.mmap		= sock_no_mmap,
3032 	.sendpage	= smc_sendpage,
3033 	.splice_read	= smc_splice_read,
3034 };
3035 
3036 static int __smc_create(struct net *net, struct socket *sock, int protocol,
3037 			int kern, struct socket *clcsock)
3038 {
3039 	int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
3040 	struct smc_sock *smc;
3041 	struct sock *sk;
3042 	int rc;
3043 
3044 	rc = -ESOCKTNOSUPPORT;
3045 	if (sock->type != SOCK_STREAM)
3046 		goto out;
3047 
3048 	rc = -EPROTONOSUPPORT;
3049 	if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
3050 		goto out;
3051 
3052 	rc = -ENOBUFS;
3053 	sock->ops = &smc_sock_ops;
3054 	sk = smc_sock_alloc(net, sock, protocol);
3055 	if (!sk)
3056 		goto out;
3057 
3058 	/* create internal TCP socket for CLC handshake and fallback */
3059 	smc = smc_sk(sk);
3060 	smc->use_fallback = false; /* assume RDMA capability first */
3061 	smc->fallback_rsn = 0;
3062 
3063 	/* take the default for limit_smc_hs from the net namespace */
3064 	smc->limit_smc_hs = net->smc.limit_smc_hs;
3065 
3066 	rc = 0;
3067 	if (!clcsock) {
3068 		rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
3069 				      &smc->clcsock);
3070 		if (rc) {
3071 			sk_common_release(sk);
3072 			goto out;
3073 		}
3074 	} else {
3075 		smc->clcsock = clcsock;
3076 	}
3077 
3078 	smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
3079 	smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
3080 
3081 out:
3082 	return rc;
3083 }
3084 
3085 static int smc_create(struct net *net, struct socket *sock, int protocol,
3086 		      int kern)
3087 {
3088 	return __smc_create(net, sock, protocol, kern, NULL);
3089 }
3090 
3091 static const struct net_proto_family smc_sock_family_ops = {
3092 	.family	= PF_SMC,
3093 	.owner	= THIS_MODULE,
3094 	.create	= smc_create,
3095 };
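
/* Hedged userspace sketch (not part of this file): creating an SMC socket
 * directly through the family registered above; SMCPROTO_SMC (IPv4) and
 * SMCPROTO_SMC6 (IPv6) mirror the kernel constants and are assumed to be
 * available to, or redefined by, the application:
 *
 *	int fd = socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC);
 *	bind(), connect() and listen() then behave as for AF_INET SOCK_STREAM
 */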
3096 
3097 static int smc_ulp_init(struct sock *sk)
3098 {
3099 	struct socket *tcp = sk->sk_socket;
3100 	struct net *net = sock_net(sk);
3101 	struct socket *smcsock;
3102 	int protocol, ret;
3103 
3104 	/* only TCP can be replaced */
3105 	if (tcp->type != SOCK_STREAM || sk->sk_protocol != IPPROTO_TCP ||
3106 	    (sk->sk_family != AF_INET && sk->sk_family != AF_INET6))
3107 		return -ESOCKTNOSUPPORT;
3108 	/* for now: only fresh, file-backed sockets without fasync entries */
3109 	if (tcp->state != SS_UNCONNECTED || !tcp->file || tcp->wq.fasync_list)
3110 		return -ENOTCONN;
3111 
3112 	if (sk->sk_family == AF_INET)
3113 		protocol = SMCPROTO_SMC;
3114 	else
3115 		protocol = SMCPROTO_SMC6;
3116 
3117 	smcsock = sock_alloc();
3118 	if (!smcsock)
3119 		return -ENFILE;
3120 
3121 	smcsock->type = SOCK_STREAM;
3122 	__module_get(THIS_MODULE); /* tried in __tcp_ulp_find_autoload */
3123 	ret = __smc_create(net, smcsock, protocol, 1, tcp);
3124 	if (ret) {
3125 		sock_release(smcsock); /* does the module_put(); ops won't be NULL */
3126 		return ret;
3127 	}
3128 
3129 	/* replace the tcp socket with the smc one */
3130 	smcsock->file = tcp->file;
3131 	smcsock->file->private_data = smcsock;
3132 	smcsock->file->f_inode = SOCK_INODE(smcsock); /* replace inode for sock_close */
3133 	smcsock->file->f_path.dentry->d_inode = SOCK_INODE(smcsock); /* dput() in __fput */
3134 	tcp->file = NULL;
3135 
3136 	return ret;
3137 }
3138 
3139 static void smc_ulp_clone(const struct request_sock *req, struct sock *newsk,
3140 			  const gfp_t priority)
3141 {
3142 	struct inet_connection_sock *icsk = inet_csk(newsk);
3143 
3144 	/* don't pass the ulp ops on to children of a listen socket */
3145 	icsk->icsk_ulp_ops = NULL;
3146 }
3147 
3148 static struct tcp_ulp_ops smc_ulp_ops __read_mostly = {
3149 	.name		= "smc",
3150 	.owner		= THIS_MODULE,
3151 	.init		= smc_ulp_init,
3152 	.clone		= smc_ulp_clone,
3153 };
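
/* Hedged userspace sketch (not part of this file): turning an unconnected
 * TCP socket into an SMC socket via the ULP registered above:
 *
 *	int fd = socket(AF_INET, SOCK_STREAM, 0);
 *	if (setsockopt(fd, IPPROTO_TCP, TCP_ULP, "smc", sizeof("smc")) < 0)
 *		perror("setsockopt(TCP_ULP)");
 */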
3154 
3155 unsigned int smc_net_id;
3156 
3157 static __net_init int smc_net_init(struct net *net)
3158 {
3159 	return smc_pnet_net_init(net);
3160 }
3161 
3162 static void __net_exit smc_net_exit(struct net *net)
3163 {
3164 	smc_pnet_net_exit(net);
3165 }
3166 
3167 static __net_init int smc_net_stat_init(struct net *net)
3168 {
3169 	return smc_stats_init(net);
3170 }
3171 
3172 static void __net_exit smc_net_stat_exit(struct net *net)
3173 {
3174 	smc_stats_exit(net);
3175 }
3176 
3177 static struct pernet_operations smc_net_ops = {
3178 	.init = smc_net_init,
3179 	.exit = smc_net_exit,
3180 	.id   = &smc_net_id,
3181 	.size = sizeof(struct smc_net),
3182 };
3183 
3184 static struct pernet_operations smc_net_stat_ops = {
3185 	.init = smc_net_stat_init,
3186 	.exit = smc_net_stat_exit,
3187 };
3188 
3189 static int __init smc_init(void)
3190 {
3191 	int rc;
3192 
3193 	rc = register_pernet_subsys(&smc_net_ops);
3194 	if (rc)
3195 		return rc;
3196 
3197 	rc = register_pernet_subsys(&smc_net_stat_ops);
3198 	if (rc)
3199 		return rc;
3200 
3201 	smc_ism_init();
3202 	smc_clc_init();
3203 
3204 	rc = smc_nl_init();
3205 	if (rc)
3206 		goto out_pernet_subsys;
3207 
3208 	rc = smc_pnet_init();
3209 	if (rc)
3210 		goto out_nl;
3211 
3212 	rc = -ENOMEM;
3213 
3214 	smc_tcp_ls_wq = alloc_workqueue("smc_tcp_ls_wq", 0, 0);
3215 	if (!smc_tcp_ls_wq)
3216 		goto out_pnet;
3217 
3218 	smc_hs_wq = alloc_workqueue("smc_hs_wq", 0, 0);
3219 	if (!smc_hs_wq)
3220 		goto out_alloc_tcp_ls_wq;
3221 
3222 	smc_close_wq = alloc_workqueue("smc_close_wq", 0, 0);
3223 	if (!smc_close_wq)
3224 		goto out_alloc_hs_wq;
3225 
3226 	rc = smc_core_init();
3227 	if (rc) {
3228 		pr_err("%s: smc_core_init fails with %d\n", __func__, rc);
3229 		goto out_alloc_wqs;
3230 	}
3231 
3232 	rc = smc_llc_init();
3233 	if (rc) {
3234 		pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
3235 		goto out_core;
3236 	}
3237 
3238 	rc = smc_cdc_init();
3239 	if (rc) {
3240 		pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
3241 		goto out_core;
3242 	}
3243 
3244 	rc = proto_register(&smc_proto, 1);
3245 	if (rc) {
3246 		pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
3247 		goto out_core;
3248 	}
3249 
3250 	rc = proto_register(&smc_proto6, 1);
3251 	if (rc) {
3252 		pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
3253 		goto out_proto;
3254 	}
3255 
3256 	rc = sock_register(&smc_sock_family_ops);
3257 	if (rc) {
3258 		pr_err("%s: sock_register fails with %d\n", __func__, rc);
3259 		goto out_proto6;
3260 	}
3261 	INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
3262 	INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);
3263 
3264 	rc = smc_ib_register_client();
3265 	if (rc) {
3266 		pr_err("%s: ib_register fails with %d\n", __func__, rc);
3267 		goto out_sock;
3268 	}
3269 
3270 	rc = tcp_register_ulp(&smc_ulp_ops);
3271 	if (rc) {
3272 		pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc);
3273 		goto out_sock;
3274 	}
3275 
3276 	static_branch_enable(&tcp_have_smc);
3277 	return 0;
3278 
3279 out_sock:
3280 	sock_unregister(PF_SMC);
3281 out_proto6:
3282 	proto_unregister(&smc_proto6);
3283 out_proto:
3284 	proto_unregister(&smc_proto);
3285 out_core:
3286 	smc_core_exit();
3287 out_alloc_wqs:
3288 	destroy_workqueue(smc_close_wq);
3289 out_alloc_hs_wq:
3290 	destroy_workqueue(smc_hs_wq);
3291 out_alloc_tcp_ls_wq:
3292 	destroy_workqueue(smc_tcp_ls_wq);
3293 out_pnet:
3294 	smc_pnet_exit();
3295 out_nl:
3296 	smc_nl_exit();
3297 out_pernet_subsys:
3298 	unregister_pernet_subsys(&smc_net_ops);
3299 
3300 	return rc;
3301 }
3302 
3303 static void __exit smc_exit(void)
3304 {
3305 	static_branch_disable(&tcp_have_smc);
3306 	tcp_unregister_ulp(&smc_ulp_ops);
3307 	sock_unregister(PF_SMC);
3308 	smc_core_exit();
3309 	smc_ib_unregister_client();
3310 	destroy_workqueue(smc_close_wq);
3311 	destroy_workqueue(smc_tcp_ls_wq);
3312 	destroy_workqueue(smc_hs_wq);
3313 	proto_unregister(&smc_proto6);
3314 	proto_unregister(&smc_proto);
3315 	smc_pnet_exit();
3316 	smc_nl_exit();
3317 	smc_clc_exit();
3318 	unregister_pernet_subsys(&smc_net_stat_ops);
3319 	unregister_pernet_subsys(&smc_net_ops);
3320 	rcu_barrier();
3321 }
3322 
3323 module_init(smc_init);
3324 module_exit(smc_exit);
3325 
3326 MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
3327 MODULE_DESCRIPTION("smc socket address family");
3328 MODULE_LICENSE("GPL");
3329 MODULE_ALIAS_NETPROTO(PF_SMC);
3330 MODULE_ALIAS_TCP_ULP("smc");
3331