xref: /linux/net/smc/af_smc.c (revision 2c63221cd9e5c0dad0424029aeb1c40faada8330)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  *  Shared Memory Communications over RDMA (SMC-R) and RoCE
4  *
5  *  AF_SMC protocol family socket handler keeping the AF_INET sock address type;
6  *  applies to SOCK_STREAM sockets only;
7  *  offers an alternative communication option for TCP-protocol sockets,
8  *  applicable with RoCE cards only
9  *
10  *  Initial restrictions:
11  *    - support for alternate links postponed
12  *
13  *  Copyright IBM Corp. 2016, 2018
14  *
15  *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
16  *              based on prototype from Frank Blaschka
17  */
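
/*
 * Illustrative user-space usage (informational sketch, not part of the
 * kernel build): an SMC socket is created like a TCP socket with the
 * address family swapped. The protocol constants correspond to the
 * SMCPROTO_SMC/SMCPROTO_SMC6 values used below (0 for IPv4, 1 for IPv6),
 * and AF_SMC is 43 in the uapi headers.
 *
 *	#include <sys/socket.h>
 *	#ifndef AF_SMC
 *	#define AF_SMC 43
 *	#endif
 *	#define SMCPROTO_SMC 0
 *
 *	int fd = socket(AF_SMC, SOCK_STREAM, SMCPROTO_SMC);
 *
 * bind()/listen()/connect()/accept() then behave as with TCP; when the
 * peer turns out not to be SMC-capable, the connection transparently
 * falls back to plain TCP via the internal clcsock.
 */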
18 
19 #define KMSG_COMPONENT "smc"
20 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
21 
22 #include <linux/module.h>
23 #include <linux/socket.h>
24 #include <linux/workqueue.h>
25 #include <linux/in.h>
26 #include <linux/sched/signal.h>
27 #include <linux/if_vlan.h>
28 
29 #include <net/sock.h>
30 #include <net/tcp.h>
31 #include <net/smc.h>
32 #include <asm/ioctls.h>
33 
34 #include <net/net_namespace.h>
35 #include <net/netns/generic.h>
36 #include "smc_netns.h"
37 
38 #include "smc.h"
39 #include "smc_clc.h"
40 #include "smc_llc.h"
41 #include "smc_cdc.h"
42 #include "smc_core.h"
43 #include "smc_ib.h"
44 #include "smc_ism.h"
45 #include "smc_pnet.h"
46 #include "smc_tx.h"
47 #include "smc_rx.h"
48 #include "smc_close.h"
49 
50 static DEFINE_MUTEX(smc_server_lgr_pending);	/* serialize link group
51 						 * creation on server
52 						 */
53 static DEFINE_MUTEX(smc_client_lgr_pending);	/* serialize link group
54 						 * creation on client
55 						 */
56 
57 static void smc_tcp_listen_work(struct work_struct *);
58 static void smc_connect_work(struct work_struct *);
59 
60 static void smc_set_keepalive(struct sock *sk, int val)
61 {
62 	struct smc_sock *smc = smc_sk(sk);
63 
64 	smc->clcsock->sk->sk_prot->keepalive(smc->clcsock->sk, val);
65 }
66 
67 static struct smc_hashinfo smc_v4_hashinfo = {
68 	.lock = __RW_LOCK_UNLOCKED(smc_v4_hashinfo.lock),
69 };
70 
71 static struct smc_hashinfo smc_v6_hashinfo = {
72 	.lock = __RW_LOCK_UNLOCKED(smc_v6_hashinfo.lock),
73 };
74 
75 int smc_hash_sk(struct sock *sk)
76 {
77 	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
78 	struct hlist_head *head;
79 
80 	head = &h->ht;
81 
82 	write_lock_bh(&h->lock);
83 	sk_add_node(sk, head);
84 	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
85 	write_unlock_bh(&h->lock);
86 
87 	return 0;
88 }
89 EXPORT_SYMBOL_GPL(smc_hash_sk);
90 
91 void smc_unhash_sk(struct sock *sk)
92 {
93 	struct smc_hashinfo *h = sk->sk_prot->h.smc_hash;
94 
95 	write_lock_bh(&h->lock);
96 	if (sk_del_node_init(sk))
97 		sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
98 	write_unlock_bh(&h->lock);
99 }
100 EXPORT_SYMBOL_GPL(smc_unhash_sk);
101 
102 struct proto smc_proto = {
103 	.name		= "SMC",
104 	.owner		= THIS_MODULE,
105 	.keepalive	= smc_set_keepalive,
106 	.hash		= smc_hash_sk,
107 	.unhash		= smc_unhash_sk,
108 	.obj_size	= sizeof(struct smc_sock),
109 	.h.smc_hash	= &smc_v4_hashinfo,
110 	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
111 };
112 EXPORT_SYMBOL_GPL(smc_proto);
113 
114 struct proto smc_proto6 = {
115 	.name		= "SMC6",
116 	.owner		= THIS_MODULE,
117 	.keepalive	= smc_set_keepalive,
118 	.hash		= smc_hash_sk,
119 	.unhash		= smc_unhash_sk,
120 	.obj_size	= sizeof(struct smc_sock),
121 	.h.smc_hash	= &smc_v6_hashinfo,
122 	.slab_flags	= SLAB_TYPESAFE_BY_RCU,
123 };
124 EXPORT_SYMBOL_GPL(smc_proto6);
125 
126 static void smc_restore_fallback_changes(struct smc_sock *smc)
127 {
128 	smc->clcsock->file->private_data = smc->sk.sk_socket;
129 	smc->clcsock->file = NULL;
130 }
131 
132 static int __smc_release(struct smc_sock *smc)
133 {
134 	struct sock *sk = &smc->sk;
135 	int rc = 0;
136 
137 	if (!smc->use_fallback) {
138 		rc = smc_close_active(smc);
139 		sock_set_flag(sk, SOCK_DEAD);
140 		sk->sk_shutdown |= SHUTDOWN_MASK;
141 	} else {
142 		if (sk->sk_state != SMC_LISTEN && sk->sk_state != SMC_INIT)
143 			sock_put(sk); /* passive closing */
144 		if (sk->sk_state == SMC_LISTEN) {
145 			/* wake up clcsock accept */
146 			rc = kernel_sock_shutdown(smc->clcsock, SHUT_RDWR);
147 		}
148 		sk->sk_state = SMC_CLOSED;
149 		sk->sk_state_change(sk);
150 		smc_restore_fallback_changes(smc);
151 	}
152 
153 	sk->sk_prot->unhash(sk);
154 
155 	if (sk->sk_state == SMC_CLOSED) {
156 		if (smc->clcsock) {
157 			release_sock(sk);
158 			smc_clcsock_release(smc);
159 			lock_sock(sk);
160 		}
161 		if (!smc->use_fallback)
162 			smc_conn_free(&smc->conn);
163 	}
164 
165 	return rc;
166 }
167 
168 static int smc_release(struct socket *sock)
169 {
170 	struct sock *sk = sock->sk;
171 	struct smc_sock *smc;
172 	int rc = 0;
173 
174 	if (!sk)
175 		goto out;
176 
177 	sock_hold(sk); /* sock_put below */
178 	smc = smc_sk(sk);
179 
180 	/* cleanup for a dangling non-blocking connect */
181 	if (smc->connect_nonblock && sk->sk_state == SMC_INIT)
182 		tcp_abort(smc->clcsock->sk, ECONNABORTED);
183 	flush_work(&smc->connect_work);
184 
185 	if (sk->sk_state == SMC_LISTEN)
186 		/* smc_close_non_accepted() is called and acquires
187 		 * sock lock for child sockets again
188 		 */
189 		lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
190 	else
191 		lock_sock(sk);
192 
193 	rc = __smc_release(smc);
194 
195 	/* detach socket */
196 	sock_orphan(sk);
197 	sock->sk = NULL;
198 	release_sock(sk);
199 
200 	sock_put(sk); /* sock_hold above */
201 	sock_put(sk); /* final sock_put */
202 out:
203 	return rc;
204 }
205 
206 static void smc_destruct(struct sock *sk)
207 {
208 	if (sk->sk_state != SMC_CLOSED)
209 		return;
210 	if (!sock_flag(sk, SOCK_DEAD))
211 		return;
212 
213 	sk_refcnt_debug_dec(sk);
214 }
215 
216 static struct sock *smc_sock_alloc(struct net *net, struct socket *sock,
217 				   int protocol)
218 {
219 	struct smc_sock *smc;
220 	struct proto *prot;
221 	struct sock *sk;
222 
223 	prot = (protocol == SMCPROTO_SMC6) ? &smc_proto6 : &smc_proto;
224 	sk = sk_alloc(net, PF_SMC, GFP_KERNEL, prot, 0);
225 	if (!sk)
226 		return NULL;
227 
228 	sock_init_data(sock, sk); /* sets sk_refcnt to 1 */
229 	sk->sk_state = SMC_INIT;
230 	sk->sk_destruct = smc_destruct;
231 	sk->sk_protocol = protocol;
232 	smc = smc_sk(sk);
233 	INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work);
234 	INIT_WORK(&smc->connect_work, smc_connect_work);
235 	INIT_DELAYED_WORK(&smc->conn.tx_work, smc_tx_work);
236 	INIT_LIST_HEAD(&smc->accept_q);
237 	spin_lock_init(&smc->accept_q_lock);
238 	spin_lock_init(&smc->conn.send_lock);
239 	sk->sk_prot->hash(sk);
240 	sk_refcnt_debug_inc(sk);
241 	mutex_init(&smc->clcsock_release_lock);
242 
243 	return sk;
244 }
245 
246 static int smc_bind(struct socket *sock, struct sockaddr *uaddr,
247 		    int addr_len)
248 {
249 	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
250 	struct sock *sk = sock->sk;
251 	struct smc_sock *smc;
252 	int rc;
253 
254 	smc = smc_sk(sk);
255 
256 	/* replicate tests from inet_bind(), to be safe wrt. future changes */
257 	rc = -EINVAL;
258 	if (addr_len < sizeof(struct sockaddr_in))
259 		goto out;
260 
261 	rc = -EAFNOSUPPORT;
262 	if (addr->sin_family != AF_INET &&
263 	    addr->sin_family != AF_INET6 &&
264 	    addr->sin_family != AF_UNSPEC)
265 		goto out;
266 	/* accept AF_UNSPEC (mapped to AF_INET) only if s_addr is INADDR_ANY */
267 	if (addr->sin_family == AF_UNSPEC &&
268 	    addr->sin_addr.s_addr != htonl(INADDR_ANY))
269 		goto out;
270 
271 	lock_sock(sk);
272 
273 	/* Check if socket is already active */
274 	rc = -EINVAL;
275 	if (sk->sk_state != SMC_INIT || smc->connect_nonblock)
276 		goto out_rel;
277 
278 	smc->clcsock->sk->sk_reuse = sk->sk_reuse;
279 	rc = kernel_bind(smc->clcsock, uaddr, addr_len);
280 
281 out_rel:
282 	release_sock(sk);
283 out:
284 	return rc;
285 }
286 
287 static void smc_copy_sock_settings(struct sock *nsk, struct sock *osk,
288 				   unsigned long mask)
289 {
290 	/* options we don't get control of via setsockopt */
291 	nsk->sk_type = osk->sk_type;
292 	nsk->sk_sndbuf = osk->sk_sndbuf;
293 	nsk->sk_rcvbuf = osk->sk_rcvbuf;
294 	nsk->sk_sndtimeo = osk->sk_sndtimeo;
295 	nsk->sk_rcvtimeo = osk->sk_rcvtimeo;
296 	nsk->sk_mark = osk->sk_mark;
297 	nsk->sk_priority = osk->sk_priority;
298 	nsk->sk_rcvlowat = osk->sk_rcvlowat;
299 	nsk->sk_bound_dev_if = osk->sk_bound_dev_if;
300 	nsk->sk_err = osk->sk_err;
301 
302 	nsk->sk_flags &= ~mask;
303 	nsk->sk_flags |= osk->sk_flags & mask;
304 }
305 
306 #define SK_FLAGS_SMC_TO_CLC ((1UL << SOCK_URGINLINE) | \
307 			     (1UL << SOCK_KEEPOPEN) | \
308 			     (1UL << SOCK_LINGER) | \
309 			     (1UL << SOCK_BROADCAST) | \
310 			     (1UL << SOCK_TIMESTAMP) | \
311 			     (1UL << SOCK_DBG) | \
312 			     (1UL << SOCK_RCVTSTAMP) | \
313 			     (1UL << SOCK_RCVTSTAMPNS) | \
314 			     (1UL << SOCK_LOCALROUTE) | \
315 			     (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE) | \
316 			     (1UL << SOCK_RXQ_OVFL) | \
317 			     (1UL << SOCK_WIFI_STATUS) | \
318 			     (1UL << SOCK_NOFCS) | \
319 			     (1UL << SOCK_FILTER_LOCKED) | \
320 			     (1UL << SOCK_TSTAMP_NEW))
321 /* copy only relevant settings and flags of SOL_SOCKET level from smc to
322  * clc socket (since smc is not called for these options from net/core)
323  */
324 static void smc_copy_sock_settings_to_clc(struct smc_sock *smc)
325 {
326 	smc_copy_sock_settings(smc->clcsock->sk, &smc->sk, SK_FLAGS_SMC_TO_CLC);
327 }
328 
329 #define SK_FLAGS_CLC_TO_SMC ((1UL << SOCK_URGINLINE) | \
330 			     (1UL << SOCK_KEEPOPEN) | \
331 			     (1UL << SOCK_LINGER) | \
332 			     (1UL << SOCK_DBG))
333 /* copy only settings and flags relevant for smc from clc to smc socket */
334 static void smc_copy_sock_settings_to_smc(struct smc_sock *smc)
335 {
336 	smc_copy_sock_settings(&smc->sk, smc->clcsock->sk, SK_FLAGS_CLC_TO_SMC);
337 }
338 
339 /* register a new rmb, send confirm_rkey msg to register with peer */
340 static int smc_reg_rmb(struct smc_link *link, struct smc_buf_desc *rmb_desc,
341 		       bool conf_rkey)
342 {
343 	if (!rmb_desc->wr_reg) {
344 		/* register memory region for new rmb */
345 		if (smc_wr_reg_send(link, rmb_desc->mr_rx[SMC_SINGLE_LINK])) {
346 			rmb_desc->regerr = 1;
347 			return -EFAULT;
348 		}
349 		rmb_desc->wr_reg = 1;
350 	}
351 	if (!conf_rkey)
352 		return 0;
353 	/* exchange confirm_rkey msg with peer */
354 	if (smc_llc_do_confirm_rkey(link, rmb_desc)) {
355 		rmb_desc->regerr = 1;
356 		return -EFAULT;
357 	}
358 	return 0;
359 }
360 
361 static int smc_clnt_conf_first_link(struct smc_sock *smc)
362 {
363 	struct net *net = sock_net(smc->clcsock->sk);
364 	struct smc_link_group *lgr = smc->conn.lgr;
365 	struct smc_link *link;
366 	int rest;
367 	int rc;
368 
369 	link = &lgr->lnk[SMC_SINGLE_LINK];
370 	/* receive CONFIRM LINK request from server over RoCE fabric */
371 	rest = wait_for_completion_interruptible_timeout(
372 		&link->llc_confirm,
373 		SMC_LLC_WAIT_FIRST_TIME);
374 	if (rest <= 0) {
375 		struct smc_clc_msg_decline dclc;
376 
377 		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
378 				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
379 		return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
380 	}
381 
382 	if (link->llc_confirm_rc)
383 		return SMC_CLC_DECL_RMBE_EC;
384 
385 	rc = smc_ib_modify_qp_rts(link);
386 	if (rc)
387 		return SMC_CLC_DECL_ERR_RDYLNK;
388 
389 	smc_wr_remember_qp_attr(link);
390 
391 	if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
392 		return SMC_CLC_DECL_ERR_REGRMB;
393 
394 	/* send CONFIRM LINK response over RoCE fabric */
395 	rc = smc_llc_send_confirm_link(link, SMC_LLC_RESP);
396 	if (rc < 0)
397 		return SMC_CLC_DECL_TIMEOUT_CL;
398 
399 	/* receive ADD LINK request from server over RoCE fabric */
400 	rest = wait_for_completion_interruptible_timeout(&link->llc_add,
401 							 SMC_LLC_WAIT_TIME);
402 	if (rest <= 0) {
403 		struct smc_clc_msg_decline dclc;
404 
405 		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
406 				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
407 		return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc;
408 	}
409 
410 	/* send add link reject message, only one link supported for now */
411 	rc = smc_llc_send_add_link(link,
412 				   link->smcibdev->mac[link->ibport - 1],
413 				   link->gid, SMC_LLC_RESP);
414 	if (rc < 0)
415 		return SMC_CLC_DECL_TIMEOUT_AL;
416 
417 	smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
418 
419 	return 0;
420 }
421 
422 static void smcr_conn_save_peer_info(struct smc_sock *smc,
423 				     struct smc_clc_msg_accept_confirm *clc)
424 {
425 	int bufsize = smc_uncompress_bufsize(clc->rmbe_size);
426 
427 	smc->conn.peer_rmbe_idx = clc->rmbe_idx;
428 	smc->conn.local_tx_ctrl.token = ntohl(clc->rmbe_alert_token);
429 	smc->conn.peer_rmbe_size = bufsize;
430 	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
431 	smc->conn.tx_off = bufsize * (smc->conn.peer_rmbe_idx - 1);
432 }
433 
434 static void smcd_conn_save_peer_info(struct smc_sock *smc,
435 				     struct smc_clc_msg_accept_confirm *clc)
436 {
437 	int bufsize = smc_uncompress_bufsize(clc->dmbe_size);
438 
439 	smc->conn.peer_rmbe_idx = clc->dmbe_idx;
440 	smc->conn.peer_token = clc->token;
441 	/* msg header takes up space in the buffer */
442 	smc->conn.peer_rmbe_size = bufsize - sizeof(struct smcd_cdc_msg);
443 	atomic_set(&smc->conn.peer_rmbe_space, smc->conn.peer_rmbe_size);
444 	smc->conn.tx_off = bufsize * smc->conn.peer_rmbe_idx;
445 }
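
/* Note the asymmetry between the two variants above: SMC-R addresses the
 * peer's RMB element with what appears as a 1-based index, hence the
 * (peer_rmbe_idx - 1) scaling of tx_off, while SMC-D uses the 0-based DMB
 * element index directly and additionally reserves room for the CDC
 * message header within the buffer.
 */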
446 
447 static void smc_conn_save_peer_info(struct smc_sock *smc,
448 				    struct smc_clc_msg_accept_confirm *clc)
449 {
450 	if (smc->conn.lgr->is_smcd)
451 		smcd_conn_save_peer_info(smc, clc);
452 	else
453 		smcr_conn_save_peer_info(smc, clc);
454 }
455 
456 static void smc_link_save_peer_info(struct smc_link *link,
457 				    struct smc_clc_msg_accept_confirm *clc)
458 {
459 	link->peer_qpn = ntoh24(clc->qpn);
460 	memcpy(link->peer_gid, clc->lcl.gid, SMC_GID_SIZE);
461 	memcpy(link->peer_mac, clc->lcl.mac, sizeof(link->peer_mac));
462 	link->peer_psn = ntoh24(clc->psn);
463 	link->peer_mtu = clc->qp_mtu;
464 }
465 
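/* Redirect the socket file from the SMC socket to the internal TCP (clc)
 * socket so that subsequent syscalls bypass SMC entirely;
 * smc_restore_fallback_changes() reverts this when the socket is released.
 */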
466 static void smc_switch_to_fallback(struct smc_sock *smc)
467 {
468 	smc->use_fallback = true;
469 	if (smc->sk.sk_socket && smc->sk.sk_socket->file) {
470 		smc->clcsock->file = smc->sk.sk_socket->file;
471 		smc->clcsock->file->private_data = smc->clcsock;
472 	}
473 }
474 
475 /* fall back during connect */
476 static int smc_connect_fallback(struct smc_sock *smc, int reason_code)
477 {
478 	smc_switch_to_fallback(smc);
479 	smc->fallback_rsn = reason_code;
480 	smc_copy_sock_settings_to_clc(smc);
481 	smc->connect_nonblock = 0;
482 	if (smc->sk.sk_state == SMC_INIT)
483 		smc->sk.sk_state = SMC_ACTIVE;
484 	return 0;
485 }
486 
487 /* decline and fall back during connect */
488 static int smc_connect_decline_fallback(struct smc_sock *smc, int reason_code)
489 {
490 	int rc;
491 
492 	if (reason_code < 0) { /* error, fallback is not possible */
493 		if (smc->sk.sk_state == SMC_INIT)
494 			sock_put(&smc->sk); /* passive closing */
495 		return reason_code;
496 	}
497 	if (reason_code != SMC_CLC_DECL_PEERDECL) {
498 		rc = smc_clc_send_decline(smc, reason_code);
499 		if (rc < 0) {
500 			if (smc->sk.sk_state == SMC_INIT)
501 				sock_put(&smc->sk); /* passive closing */
502 			return rc;
503 		}
504 	}
505 	return smc_connect_fallback(smc, reason_code);
506 }
507 
508 /* abort connecting */
509 static int smc_connect_abort(struct smc_sock *smc, int reason_code,
510 			     int local_contact)
511 {
512 	if (local_contact == SMC_FIRST_CONTACT)
513 		smc_lgr_forget(smc->conn.lgr);
514 	if (smc->conn.lgr->is_smcd)
515 		/* there is only one lgr role for SMC-D; use server lock */
516 		mutex_unlock(&smc_server_lgr_pending);
517 	else
518 		mutex_unlock(&smc_client_lgr_pending);
519 
520 	smc_conn_free(&smc->conn);
521 	smc->connect_nonblock = 0;
522 	return reason_code;
523 }
524 
525 /* check if there is a rdma device available for this connection. */
526 /* called for connect and listen */
527 static int smc_find_rdma_device(struct smc_sock *smc, struct smc_init_info *ini)
528 {
529 	/* PNET table look up: search active ib_device and port
530 	 * within same PNETID that also contains the ethernet device
531 	 * used for the internal TCP socket
532 	 */
533 	smc_pnet_find_roce_resource(smc->clcsock->sk, ini);
534 	if (!ini->ib_dev)
535 		return SMC_CLC_DECL_NOSMCRDEV;
536 	return 0;
537 }
538 
539 /* check if there is an ISM device available for this connection. */
540 /* called for connect and listen */
541 static int smc_find_ism_device(struct smc_sock *smc, struct smc_init_info *ini)
542 {
543 	/* Find ISM device with same PNETID as connecting interface */
544 	smc_pnet_find_ism_resource(smc->clcsock->sk, ini);
545 	if (!ini->ism_dev)
546 		return SMC_CLC_DECL_NOSMCDDEV;
547 	return 0;
548 }
549 
550 /* Check for VLAN ID and register it on ISM device just for CLC handshake */
551 static int smc_connect_ism_vlan_setup(struct smc_sock *smc,
552 				      struct smc_init_info *ini)
553 {
554 	if (ini->vlan_id && smc_ism_get_vlan(ini->ism_dev, ini->vlan_id))
555 		return SMC_CLC_DECL_ISMVLANERR;
556 	return 0;
557 }
558 
559 /* cleanup temporary VLAN ID registration used for CLC handshake. If ISM is
560  * used, the VLAN ID will be registered again during the connection setup.
561  */
562 static int smc_connect_ism_vlan_cleanup(struct smc_sock *smc, bool is_smcd,
563 					struct smc_init_info *ini)
564 {
565 	if (!is_smcd)
566 		return 0;
567 	if (ini->vlan_id && smc_ism_put_vlan(ini->ism_dev, ini->vlan_id))
568 		return SMC_CLC_DECL_CNFERR;
569 	return 0;
570 }
571 
572 /* CLC handshake during connect */
573 static int smc_connect_clc(struct smc_sock *smc, int smc_type,
574 			   struct smc_clc_msg_accept_confirm *aclc,
575 			   struct smc_init_info *ini)
576 {
577 	int rc = 0;
578 
579 	/* do inband token exchange */
580 	rc = smc_clc_send_proposal(smc, smc_type, ini);
581 	if (rc)
582 		return rc;
583 	/* receive SMC Accept CLC message */
584 	return smc_clc_wait_msg(smc, aclc, sizeof(*aclc), SMC_CLC_ACCEPT,
585 				CLC_WAIT_TIME);
586 }
587 
588 /* setup for RDMA connection of client */
589 static int smc_connect_rdma(struct smc_sock *smc,
590 			    struct smc_clc_msg_accept_confirm *aclc,
591 			    struct smc_init_info *ini)
592 {
593 	struct smc_link *link;
594 	int reason_code = 0;
595 
596 	ini->is_smcd = false;
597 	ini->ib_lcl = &aclc->lcl;
598 	ini->ib_clcqpn = ntoh24(aclc->qpn);
599 	ini->srv_first_contact = aclc->hdr.flag;
600 
601 	mutex_lock(&smc_client_lgr_pending);
602 	reason_code = smc_conn_create(smc, ini);
603 	if (reason_code) {
604 		mutex_unlock(&smc_client_lgr_pending);
605 		return reason_code;
606 	}
607 	link = &smc->conn.lgr->lnk[SMC_SINGLE_LINK];
608 
609 	smc_conn_save_peer_info(smc, aclc);
610 
611 	/* create send buffer and rmb */
612 	if (smc_buf_create(smc, false))
613 		return smc_connect_abort(smc, SMC_CLC_DECL_MEM,
614 					 ini->cln_first_contact);
615 
616 	if (ini->cln_first_contact == SMC_FIRST_CONTACT)
617 		smc_link_save_peer_info(link, aclc);
618 
619 	if (smc_rmb_rtoken_handling(&smc->conn, aclc))
620 		return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RTOK,
621 					 ini->cln_first_contact);
622 
623 	smc_close_init(smc);
624 	smc_rx_init(smc);
625 
626 	if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
627 		if (smc_ib_ready_link(link))
628 			return smc_connect_abort(smc, SMC_CLC_DECL_ERR_RDYLNK,
629 						 ini->cln_first_contact);
630 	} else {
631 		if (smc_reg_rmb(link, smc->conn.rmb_desc, true))
632 			return smc_connect_abort(smc, SMC_CLC_DECL_ERR_REGRMB,
633 						 ini->cln_first_contact);
634 	}
635 	smc_rmb_sync_sg_for_device(&smc->conn);
636 
637 	reason_code = smc_clc_send_confirm(smc);
638 	if (reason_code)
639 		return smc_connect_abort(smc, reason_code,
640 					 ini->cln_first_contact);
641 
642 	smc_tx_init(smc);
643 
644 	if (ini->cln_first_contact == SMC_FIRST_CONTACT) {
645 		/* QP confirmation over RoCE fabric */
646 		reason_code = smc_clnt_conf_first_link(smc);
647 		if (reason_code)
648 			return smc_connect_abort(smc, reason_code,
649 						 ini->cln_first_contact);
650 	}
651 	mutex_unlock(&smc_client_lgr_pending);
652 
653 	smc_copy_sock_settings_to_clc(smc);
654 	smc->connect_nonblock = 0;
655 	if (smc->sk.sk_state == SMC_INIT)
656 		smc->sk.sk_state = SMC_ACTIVE;
657 
658 	return 0;
659 }
660 
661 /* setup for ISM connection of client */
662 static int smc_connect_ism(struct smc_sock *smc,
663 			   struct smc_clc_msg_accept_confirm *aclc,
664 			   struct smc_init_info *ini)
665 {
666 	int rc = 0;
667 
668 	ini->is_smcd = true;
669 	ini->ism_gid = aclc->gid;
670 	ini->srv_first_contact = aclc->hdr.flag;
671 
672 	/* there is only one lgr role for SMC-D; use server lock */
673 	mutex_lock(&smc_server_lgr_pending);
674 	rc = smc_conn_create(smc, ini);
675 	if (rc) {
676 		mutex_unlock(&smc_server_lgr_pending);
677 		return rc;
678 	}
679 
680 	/* Create send and receive buffers */
681 	if (smc_buf_create(smc, true))
682 		return smc_connect_abort(smc, SMC_CLC_DECL_MEM,
683 					 ini->cln_first_contact);
684 
685 	smc_conn_save_peer_info(smc, aclc);
686 	smc_close_init(smc);
687 	smc_rx_init(smc);
688 	smc_tx_init(smc);
689 
690 	rc = smc_clc_send_confirm(smc);
691 	if (rc)
692 		return smc_connect_abort(smc, rc, ini->cln_first_contact);
693 	mutex_unlock(&smc_server_lgr_pending);
694 
695 	smc_copy_sock_settings_to_clc(smc);
696 	smc->connect_nonblock = 0;
697 	if (smc->sk.sk_state == SMC_INIT)
698 		smc->sk.sk_state = SMC_ACTIVE;
699 
700 	return 0;
701 }
702 
703 /* perform steps before actually connecting */
704 static int __smc_connect(struct smc_sock *smc)
705 {
706 	bool ism_supported = false, rdma_supported = false;
707 	struct smc_clc_msg_accept_confirm aclc;
708 	struct smc_init_info ini = {0};
709 	int smc_type;
710 	int rc = 0;
711 
712 	if (smc->use_fallback)
713 		return smc_connect_fallback(smc, smc->fallback_rsn);
714 
715 	/* if peer has not signalled SMC-capability, fall back */
716 	if (!tcp_sk(smc->clcsock->sk)->syn_smc)
717 		return smc_connect_fallback(smc, SMC_CLC_DECL_PEERNOSMC);
718 
719 	/* IPSec connections opt out of SMC-R optimizations */
720 	if (using_ipsec(smc))
721 		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_IPSEC);
722 
723 	/* get vlan id from IP device */
724 	if (smc_vlan_by_tcpsk(smc->clcsock, &ini))
725 		return smc_connect_decline_fallback(smc,
726 						    SMC_CLC_DECL_GETVLANERR);
727 
728 	/* check if there is an ism device available */
729 	if (!smc_find_ism_device(smc, &ini) &&
730 	    !smc_connect_ism_vlan_setup(smc, &ini)) {
731 		/* ISM is supported for this connection */
732 		ism_supported = true;
733 		smc_type = SMC_TYPE_D;
734 	}
735 
736 	/* check if there is a rdma device available */
737 	if (!smc_find_rdma_device(smc, &ini)) {
738 		/* RDMA is supported for this connection */
739 		rdma_supported = true;
740 		if (ism_supported)
741 			smc_type = SMC_TYPE_B; /* both */
742 		else
743 			smc_type = SMC_TYPE_R; /* only RDMA */
744 	}
745 
746 	/* if neither ISM nor RDMA are supported, fallback */
747 	if (!rdma_supported && !ism_supported)
748 		return smc_connect_decline_fallback(smc, SMC_CLC_DECL_NOSMCDEV);
749 
750 	/* perform CLC handshake */
751 	rc = smc_connect_clc(smc, smc_type, &aclc, &ini);
752 	if (rc) {
753 		smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
754 		return smc_connect_decline_fallback(smc, rc);
755 	}
756 
757 	/* depending on previous steps, connect using rdma or ism */
758 	if (rdma_supported && aclc.hdr.path == SMC_TYPE_R)
759 		rc = smc_connect_rdma(smc, &aclc, &ini);
760 	else if (ism_supported && aclc.hdr.path == SMC_TYPE_D)
761 		rc = smc_connect_ism(smc, &aclc, &ini);
762 	else
763 		rc = SMC_CLC_DECL_MODEUNSUPP;
764 	if (rc) {
765 		smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
766 		return smc_connect_decline_fallback(smc, rc);
767 	}
768 
769 	smc_connect_ism_vlan_cleanup(smc, ism_supported, &ini);
770 	return 0;
771 }
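
/* Summary of the mode decision in __smc_connect() (informational):
 *
 *	ISM device	RDMA device	proposed type
 *	    yes		    no		SMC_TYPE_D
 *	    no		    yes		SMC_TYPE_R
 *	    yes		    yes		SMC_TYPE_B (both)
 *	    no		    no		TCP fallback (SMC_CLC_DECL_NOSMCDEV)
 *
 * The server's accept message (aclc.hdr.path) then selects the variant
 * actually used for the connection.
 */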
772 
773 static void smc_connect_work(struct work_struct *work)
774 {
775 	struct smc_sock *smc = container_of(work, struct smc_sock,
776 					    connect_work);
777 	long timeo = smc->sk.sk_sndtimeo;
778 	int rc = 0;
779 
780 	if (!timeo)
781 		timeo = MAX_SCHEDULE_TIMEOUT;
782 	lock_sock(smc->clcsock->sk);
783 	if (smc->clcsock->sk->sk_err) {
784 		smc->sk.sk_err = smc->clcsock->sk->sk_err;
785 	} else if ((1 << smc->clcsock->sk->sk_state) &
786 					(TCPF_SYN_SENT | TCPF_SYN_RECV)) {
787 		rc = sk_stream_wait_connect(smc->clcsock->sk, &timeo);
788 		if ((rc == -EPIPE) &&
789 		    ((1 << smc->clcsock->sk->sk_state) &
790 					(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)))
791 			rc = 0;
792 	}
793 	release_sock(smc->clcsock->sk);
794 	lock_sock(&smc->sk);
795 	if (rc != 0 || smc->sk.sk_err) {
796 		smc->sk.sk_state = SMC_CLOSED;
797 		if (rc == -EPIPE || rc == -EAGAIN)
798 			smc->sk.sk_err = EPIPE;
799 		else if (signal_pending(current))
800 			smc->sk.sk_err = -sock_intr_errno(timeo);
801 		goto out;
802 	}
803 
804 	rc = __smc_connect(smc);
805 	if (rc < 0)
806 		smc->sk.sk_err = -rc;
807 
808 out:
809 	if (!sock_flag(&smc->sk, SOCK_DEAD)) {
810 		if (smc->sk.sk_err) {
811 			smc->sk.sk_state_change(&smc->sk);
812 		} else { /* allow polling before and after fallback decision */
813 			smc->clcsock->sk->sk_write_space(smc->clcsock->sk);
814 			smc->sk.sk_write_space(&smc->sk);
815 		}
816 	}
817 	release_sock(&smc->sk);
818 }
819 
820 static int smc_connect(struct socket *sock, struct sockaddr *addr,
821 		       int alen, int flags)
822 {
823 	struct sock *sk = sock->sk;
824 	struct smc_sock *smc;
825 	int rc = -EINVAL;
826 
827 	smc = smc_sk(sk);
828 
829 	/* separate smc parameter checking to be safe */
830 	if (alen < sizeof(addr->sa_family))
831 		goto out_err;
832 	if (addr->sa_family != AF_INET && addr->sa_family != AF_INET6)
833 		goto out_err;
834 
835 	lock_sock(sk);
836 	switch (sk->sk_state) {
837 	default:
838 		goto out;
839 	case SMC_ACTIVE:
840 		rc = -EISCONN;
841 		goto out;
842 	case SMC_INIT:
843 		rc = 0;
844 		break;
845 	}
846 
847 	smc_copy_sock_settings_to_clc(smc);
848 	tcp_sk(smc->clcsock->sk)->syn_smc = 1;
849 	if (smc->connect_nonblock) {
850 		rc = -EALREADY;
851 		goto out;
852 	}
853 	rc = kernel_connect(smc->clcsock, addr, alen, flags);
854 	if (rc && rc != -EINPROGRESS)
855 		goto out;
856 
857 	sock_hold(&smc->sk); /* sock put in passive closing */
858 	if (flags & O_NONBLOCK) {
859 		if (schedule_work(&smc->connect_work))
860 			smc->connect_nonblock = 1;
861 		rc = -EINPROGRESS;
862 	} else {
863 		rc = __smc_connect(smc);
864 		if (rc < 0)
865 			goto out;
866 		else
867 			rc = 0; /* success cases including fallback */
868 	}
869 
870 out:
871 	release_sock(sk);
872 out_err:
873 	return rc;
874 }
875 
876 static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
877 {
878 	struct socket *new_clcsock = NULL;
879 	struct sock *lsk = &lsmc->sk;
880 	struct sock *new_sk;
881 	int rc = -EINVAL;
882 
883 	release_sock(lsk);
884 	new_sk = smc_sock_alloc(sock_net(lsk), NULL, lsk->sk_protocol);
885 	if (!new_sk) {
886 		rc = -ENOMEM;
887 		lsk->sk_err = ENOMEM;
888 		*new_smc = NULL;
889 		lock_sock(lsk);
890 		goto out;
891 	}
892 	*new_smc = smc_sk(new_sk);
893 
894 	mutex_lock(&lsmc->clcsock_release_lock);
895 	if (lsmc->clcsock)
896 		rc = kernel_accept(lsmc->clcsock, &new_clcsock, 0);
897 	mutex_unlock(&lsmc->clcsock_release_lock);
898 	lock_sock(lsk);
899 	if (rc < 0)
900 		lsk->sk_err = -rc;
901 	if (rc < 0 || lsk->sk_state == SMC_CLOSED) {
902 		new_sk->sk_prot->unhash(new_sk);
903 		if (new_clcsock)
904 			sock_release(new_clcsock);
905 		new_sk->sk_state = SMC_CLOSED;
906 		sock_set_flag(new_sk, SOCK_DEAD);
907 		sock_put(new_sk); /* final */
908 		*new_smc = NULL;
909 		goto out;
910 	}
911 
912 	(*new_smc)->clcsock = new_clcsock;
913 out:
914 	return rc;
915 }
916 
917 /* add a just created sock to the accept queue of the listen sock as
918  * candidate for a following socket accept call from user space
919  */
920 static void smc_accept_enqueue(struct sock *parent, struct sock *sk)
921 {
922 	struct smc_sock *par = smc_sk(parent);
923 
924 	sock_hold(sk); /* sock_put in smc_accept_unlink() */
925 	spin_lock(&par->accept_q_lock);
926 	list_add_tail(&smc_sk(sk)->accept_q, &par->accept_q);
927 	spin_unlock(&par->accept_q_lock);
928 	sk_acceptq_added(parent);
929 }
930 
931 /* remove a socket from the accept queue of its parental listening socket */
932 static void smc_accept_unlink(struct sock *sk)
933 {
934 	struct smc_sock *par = smc_sk(sk)->listen_smc;
935 
936 	spin_lock(&par->accept_q_lock);
937 	list_del_init(&smc_sk(sk)->accept_q);
938 	spin_unlock(&par->accept_q_lock);
939 	sk_acceptq_removed(&smc_sk(sk)->listen_smc->sk);
940 	sock_put(sk); /* sock_hold in smc_accept_enqueue */
941 }
942 
943 /* remove a sock from the accept queue to bind it to a new socket created
944  * for a socket accept call from user space
945  */
946 struct sock *smc_accept_dequeue(struct sock *parent,
947 				struct socket *new_sock)
948 {
949 	struct smc_sock *isk, *n;
950 	struct sock *new_sk;
951 
952 	list_for_each_entry_safe(isk, n, &smc_sk(parent)->accept_q, accept_q) {
953 		new_sk = (struct sock *)isk;
954 
955 		smc_accept_unlink(new_sk);
956 		if (new_sk->sk_state == SMC_CLOSED) {
957 			new_sk->sk_prot->unhash(new_sk);
958 			if (isk->clcsock) {
959 				sock_release(isk->clcsock);
960 				isk->clcsock = NULL;
961 			}
962 			sock_put(new_sk); /* final */
963 			continue;
964 		}
965 		if (new_sock) {
966 			sock_graft(new_sk, new_sock);
967 			if (isk->use_fallback) {
968 				smc_sk(new_sk)->clcsock->file = new_sock->file;
969 				isk->clcsock->file->private_data = isk->clcsock;
970 			}
971 		}
972 		return new_sk;
973 	}
974 	return NULL;
975 }
976 
977 /* clean up for a created but never accepted sock */
978 void smc_close_non_accepted(struct sock *sk)
979 {
980 	struct smc_sock *smc = smc_sk(sk);
981 
982 	sock_hold(sk); /* sock_put below */
983 	lock_sock(sk);
984 	if (!sk->sk_lingertime)
985 		/* wait for peer closing */
986 		sk->sk_lingertime = SMC_MAX_STREAM_WAIT_TIMEOUT;
987 	__smc_release(smc);
988 	release_sock(sk);
989 	sock_put(sk); /* sock_hold above */
990 	sock_put(sk); /* final sock_put */
991 }
992 
993 static int smc_serv_conf_first_link(struct smc_sock *smc)
994 {
995 	struct net *net = sock_net(smc->clcsock->sk);
996 	struct smc_link_group *lgr = smc->conn.lgr;
997 	struct smc_link *link;
998 	int rest;
999 	int rc;
1000 
1001 	link = &lgr->lnk[SMC_SINGLE_LINK];
1002 
1003 	if (smc_reg_rmb(link, smc->conn.rmb_desc, false))
1004 		return SMC_CLC_DECL_ERR_REGRMB;
1005 
1006 	/* send CONFIRM LINK request to client over the RoCE fabric */
1007 	rc = smc_llc_send_confirm_link(link, SMC_LLC_REQ);
1008 	if (rc < 0)
1009 		return SMC_CLC_DECL_TIMEOUT_CL;
1010 
1011 	/* receive CONFIRM LINK response from client over the RoCE fabric */
1012 	rest = wait_for_completion_interruptible_timeout(
1013 		&link->llc_confirm_resp,
1014 		SMC_LLC_WAIT_FIRST_TIME);
1015 	if (rest <= 0) {
1016 		struct smc_clc_msg_decline dclc;
1017 
1018 		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
1019 				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
1020 		return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_CL : rc;
1021 	}
1022 
1023 	if (link->llc_confirm_resp_rc)
1024 		return SMC_CLC_DECL_RMBE_EC;
1025 
1026 	/* send ADD LINK request to client over the RoCE fabric */
1027 	rc = smc_llc_send_add_link(link,
1028 				   link->smcibdev->mac[link->ibport - 1],
1029 				   link->gid, SMC_LLC_REQ);
1030 	if (rc < 0)
1031 		return SMC_CLC_DECL_TIMEOUT_AL;
1032 
1033 	/* receive ADD LINK response from client over the RoCE fabric */
1034 	rest = wait_for_completion_interruptible_timeout(&link->llc_add_resp,
1035 							 SMC_LLC_WAIT_TIME);
1036 	if (rest <= 0) {
1037 		struct smc_clc_msg_decline dclc;
1038 
1039 		rc = smc_clc_wait_msg(smc, &dclc, sizeof(dclc),
1040 				      SMC_CLC_DECLINE, CLC_WAIT_TIME_SHORT);
1041 		return rc == -EAGAIN ? SMC_CLC_DECL_TIMEOUT_AL : rc;
1042 	}
1043 
1044 	smc_llc_link_active(link, net->ipv4.sysctl_tcp_keepalive_time);
1045 
1046 	return 0;
1047 }
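
/* LLC link confirmation as driven by smc_serv_conf_first_link() above
 * (informational sketch):
 *
 *	server				client
 *	   --- CONFIRM LINK req --->
 *	   <-- CONFIRM LINK resp ---
 *	   ----- ADD LINK req ----->
 *	   <---- ADD LINK resp -----
 *
 * smc_clnt_conf_first_link() implements the client-side counterpart;
 * since only one link is supported so far, the ADD LINK exchange is
 * completed but no additional link is established.
 */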
1048 
1049 /* listen worker: finish */
1050 static void smc_listen_out(struct smc_sock *new_smc)
1051 {
1052 	struct smc_sock *lsmc = new_smc->listen_smc;
1053 	struct sock *newsmcsk = &new_smc->sk;
1054 
1055 	if (lsmc->sk.sk_state == SMC_LISTEN) {
1056 		lock_sock_nested(&lsmc->sk, SINGLE_DEPTH_NESTING);
1057 		smc_accept_enqueue(&lsmc->sk, newsmcsk);
1058 		release_sock(&lsmc->sk);
1059 	} else { /* no longer listening */
1060 		smc_close_non_accepted(newsmcsk);
1061 	}
1062 
1063 	/* Wake up accept */
1064 	lsmc->sk.sk_data_ready(&lsmc->sk);
1065 	sock_put(&lsmc->sk); /* sock_hold in smc_tcp_listen_work */
1066 }
1067 
1068 /* listen worker: finish in state connected */
1069 static void smc_listen_out_connected(struct smc_sock *new_smc)
1070 {
1071 	struct sock *newsmcsk = &new_smc->sk;
1072 
1073 	sk_refcnt_debug_inc(newsmcsk);
1074 	if (newsmcsk->sk_state == SMC_INIT)
1075 		newsmcsk->sk_state = SMC_ACTIVE;
1076 
1077 	smc_listen_out(new_smc);
1078 }
1079 
1080 /* listen worker: finish in error state */
1081 static void smc_listen_out_err(struct smc_sock *new_smc)
1082 {
1083 	struct sock *newsmcsk = &new_smc->sk;
1084 
1085 	if (newsmcsk->sk_state == SMC_INIT)
1086 		sock_put(&new_smc->sk); /* passive closing */
1087 	newsmcsk->sk_state = SMC_CLOSED;
1088 	smc_conn_free(&new_smc->conn);
1089 
1090 	smc_listen_out(new_smc);
1091 }
1092 
1093 /* listen worker: decline and fall back if possible */
1094 static void smc_listen_decline(struct smc_sock *new_smc, int reason_code,
1095 			       int local_contact)
1096 {
1097 	/* RDMA setup failed, switch back to TCP */
1098 	if (local_contact == SMC_FIRST_CONTACT)
1099 		smc_lgr_forget(new_smc->conn.lgr);
1100 	if (reason_code < 0) { /* error, no fallback possible */
1101 		smc_listen_out_err(new_smc);
1102 		return;
1103 	}
1104 	smc_conn_free(&new_smc->conn);
1105 	smc_switch_to_fallback(new_smc);
1106 	new_smc->fallback_rsn = reason_code;
1107 	if (reason_code && reason_code != SMC_CLC_DECL_PEERDECL) {
1108 		if (smc_clc_send_decline(new_smc, reason_code) < 0) {
1109 			smc_listen_out_err(new_smc);
1110 			return;
1111 		}
1112 	}
1113 	smc_listen_out_connected(new_smc);
1114 }
1115 
1116 /* listen worker: check prefixes */
1117 static int smc_listen_prfx_check(struct smc_sock *new_smc,
1118 				 struct smc_clc_msg_proposal *pclc)
1119 {
1120 	struct smc_clc_msg_proposal_prefix *pclc_prfx;
1121 	struct socket *newclcsock = new_smc->clcsock;
1122 
1123 	pclc_prfx = smc_clc_proposal_get_prefix(pclc);
1124 	if (smc_clc_prfx_match(newclcsock, pclc_prfx))
1125 		return SMC_CLC_DECL_DIFFPREFIX;
1126 
1127 	return 0;
1128 }
1129 
1130 /* listen worker: initialize connection and buffers */
1131 static int smc_listen_rdma_init(struct smc_sock *new_smc,
1132 				struct smc_init_info *ini)
1133 {
1134 	int rc;
1135 
1136 	/* allocate connection / link group */
1137 	rc = smc_conn_create(new_smc, ini);
1138 	if (rc)
1139 		return rc;
1140 
1141 	/* create send buffer and rmb */
1142 	if (smc_buf_create(new_smc, false))
1143 		return SMC_CLC_DECL_MEM;
1144 
1145 	return 0;
1146 }
1147 
1148 /* listen worker: initialize connection and buffers for SMC-D */
1149 static int smc_listen_ism_init(struct smc_sock *new_smc,
1150 			       struct smc_clc_msg_proposal *pclc,
1151 			       struct smc_init_info *ini)
1152 {
1153 	struct smc_clc_msg_smcd *pclc_smcd;
1154 	int rc;
1155 
1156 	pclc_smcd = smc_get_clc_msg_smcd(pclc);
1157 	ini->ism_gid = pclc_smcd->gid;
1158 	rc = smc_conn_create(new_smc, ini);
1159 	if (rc)
1160 		return rc;
1161 
1162 	/* Check if peer can be reached via ISM device */
1163 	if (smc_ism_cantalk(new_smc->conn.lgr->peer_gid,
1164 			    new_smc->conn.lgr->vlan_id,
1165 			    new_smc->conn.lgr->smcd)) {
1166 		if (ini->cln_first_contact == SMC_FIRST_CONTACT)
1167 			smc_lgr_forget(new_smc->conn.lgr);
1168 		smc_conn_free(&new_smc->conn);
1169 		return SMC_CLC_DECL_SMCDNOTALK;
1170 	}
1171 
1172 	/* Create send and receive buffers */
1173 	if (smc_buf_create(new_smc, true)) {
1174 		if (ini->cln_first_contact == SMC_FIRST_CONTACT)
1175 			smc_lgr_forget(new_smc->conn.lgr);
1176 		smc_conn_free(&new_smc->conn);
1177 		return SMC_CLC_DECL_MEM;
1178 	}
1179 
1180 	return 0;
1181 }
1182 
1183 /* listen worker: register buffers */
1184 static int smc_listen_rdma_reg(struct smc_sock *new_smc, int local_contact)
1185 {
1186 	struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
1187 
1188 	if (local_contact != SMC_FIRST_CONTACT) {
1189 		if (smc_reg_rmb(link, new_smc->conn.rmb_desc, true))
1190 			return SMC_CLC_DECL_ERR_REGRMB;
1191 	}
1192 	smc_rmb_sync_sg_for_device(&new_smc->conn);
1193 
1194 	return 0;
1195 }
1196 
1197 /* listen worker: finish RDMA setup */
1198 static int smc_listen_rdma_finish(struct smc_sock *new_smc,
1199 				  struct smc_clc_msg_accept_confirm *cclc,
1200 				  int local_contact)
1201 {
1202 	struct smc_link *link = &new_smc->conn.lgr->lnk[SMC_SINGLE_LINK];
1203 	int reason_code = 0;
1204 
1205 	if (local_contact == SMC_FIRST_CONTACT)
1206 		smc_link_save_peer_info(link, cclc);
1207 
1208 	if (smc_rmb_rtoken_handling(&new_smc->conn, cclc)) {
1209 		reason_code = SMC_CLC_DECL_ERR_RTOK;
1210 		goto decline;
1211 	}
1212 
1213 	if (local_contact == SMC_FIRST_CONTACT) {
1214 		if (smc_ib_ready_link(link)) {
1215 			reason_code = SMC_CLC_DECL_ERR_RDYLNK;
1216 			goto decline;
1217 		}
1218 		/* QP confirmation over RoCE fabric */
1219 		reason_code = smc_serv_conf_first_link(new_smc);
1220 		if (reason_code)
1221 			goto decline;
1222 	}
1223 	return 0;
1224 
1225 decline:
1226 	smc_listen_decline(new_smc, reason_code, local_contact);
1227 	return reason_code;
1228 }
1229 
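/* Server-side flow implemented by smc_listen_work() below (informational):
 * wait for the CLC Proposal, check IP prefix and VLAN, probe for an ISM
 * and/or RDMA device matching the proposal, create the connection and its
 * buffers, send the CLC Accept, wait for the CLC Confirm, and finish the
 * link setup (SMC-R only). Failures along the way decline to the peer and
 * fall back to TCP where possible.
 */
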
1230 /* setup for RDMA connection of server */
1231 static void smc_listen_work(struct work_struct *work)
1232 {
1233 	struct smc_sock *new_smc = container_of(work, struct smc_sock,
1234 						smc_listen_work);
1235 	struct socket *newclcsock = new_smc->clcsock;
1236 	struct smc_clc_msg_accept_confirm cclc;
1237 	struct smc_clc_msg_proposal *pclc;
1238 	struct smc_init_info ini = {0};
1239 	bool ism_supported = false;
1240 	u8 buf[SMC_CLC_MAX_LEN];
1241 	int rc = 0;
1242 
1243 	if (new_smc->listen_smc->sk.sk_state != SMC_LISTEN)
1244 		return smc_listen_out_err(new_smc);
1245 
1246 	if (new_smc->use_fallback) {
1247 		smc_listen_out_connected(new_smc);
1248 		return;
1249 	}
1250 
1251 	/* check if peer is smc capable */
1252 	if (!tcp_sk(newclcsock->sk)->syn_smc) {
1253 		smc_switch_to_fallback(new_smc);
1254 		new_smc->fallback_rsn = SMC_CLC_DECL_PEERNOSMC;
1255 		smc_listen_out_connected(new_smc);
1256 		return;
1257 	}
1258 
1259 	/* do inband token exchange -
1260 	 * wait for and receive SMC Proposal CLC message
1261 	 */
1262 	pclc = (struct smc_clc_msg_proposal *)&buf;
1263 	rc = smc_clc_wait_msg(new_smc, pclc, SMC_CLC_MAX_LEN,
1264 			      SMC_CLC_PROPOSAL, CLC_WAIT_TIME);
1265 	if (rc)
1266 		goto out_decl;
1267 
1268 	/* IPSec connections opt out of SMC-R optimizations */
1269 	if (using_ipsec(new_smc)) {
1270 		rc = SMC_CLC_DECL_IPSEC;
1271 		goto out_decl;
1272 	}
1273 
1274 	/* check for matching IP prefix and subnet length */
1275 	rc = smc_listen_prfx_check(new_smc, pclc);
1276 	if (rc)
1277 		goto out_decl;
1278 
1279 	/* get vlan id from IP device */
1280 	if (smc_vlan_by_tcpsk(new_smc->clcsock, &ini)) {
1281 		rc = SMC_CLC_DECL_GETVLANERR;
1282 		goto out_decl;
1283 	}
1284 
1285 	mutex_lock(&smc_server_lgr_pending);
1286 	smc_close_init(new_smc);
1287 	smc_rx_init(new_smc);
1288 	smc_tx_init(new_smc);
1289 
1290 	/* check if ISM is available */
1291 	if (pclc->hdr.path == SMC_TYPE_D || pclc->hdr.path == SMC_TYPE_B) {
1292 		ini.is_smcd = true; /* prepare ISM check */
1293 		rc = smc_find_ism_device(new_smc, &ini);
1294 		if (!rc)
1295 			rc = smc_listen_ism_init(new_smc, pclc, &ini);
1296 		if (!rc)
1297 			ism_supported = true;
1298 		else if (pclc->hdr.path == SMC_TYPE_D)
1299 			goto out_unlock; /* skip RDMA and decline */
1300 	}
1301 
1302 	/* check if RDMA is available */
1303 	if (!ism_supported) { /* SMC_TYPE_R or SMC_TYPE_B */
1304 		/* prepare RDMA check */
1305 		ini.is_smcd = false;
1306 		ini.ism_dev = NULL;
1307 		ini.ib_lcl = &pclc->lcl;
1308 		rc = smc_find_rdma_device(new_smc, &ini);
1309 		if (rc) {
1310 			/* no RDMA device found */
1311 			if (pclc->hdr.path == SMC_TYPE_B)
1312 				/* neither ISM nor RDMA device found */
1313 				rc = SMC_CLC_DECL_NOSMCDEV;
1314 			goto out_unlock;
1315 		}
1316 		rc = smc_listen_rdma_init(new_smc, &ini);
1317 		if (rc)
1318 			goto out_unlock;
1319 		rc = smc_listen_rdma_reg(new_smc, ini.cln_first_contact);
1320 		if (rc)
1321 			goto out_unlock;
1322 	}
1323 
1324 	/* send SMC Accept CLC message */
1325 	rc = smc_clc_send_accept(new_smc, ini.cln_first_contact);
1326 	if (rc)
1327 		goto out_unlock;
1328 
1329 	/* SMC-D does not need this lock any more */
1330 	if (ism_supported)
1331 		mutex_unlock(&smc_server_lgr_pending);
1332 
1333 	/* receive SMC Confirm CLC message */
1334 	rc = smc_clc_wait_msg(new_smc, &cclc, sizeof(cclc),
1335 			      SMC_CLC_CONFIRM, CLC_WAIT_TIME);
1336 	if (rc) {
1337 		if (!ism_supported)
1338 			goto out_unlock;
1339 		goto out_decl;
1340 	}
1341 
1342 	/* finish worker */
1343 	if (!ism_supported) {
1344 		rc = smc_listen_rdma_finish(new_smc, &cclc,
1345 					    ini.cln_first_contact);
1346 		mutex_unlock(&smc_server_lgr_pending);
1347 		if (rc)
1348 			return;
1349 	}
1350 	smc_conn_save_peer_info(new_smc, &cclc);
1351 	smc_listen_out_connected(new_smc);
1352 	return;
1353 
1354 out_unlock:
1355 	mutex_unlock(&smc_server_lgr_pending);
1356 out_decl:
1357 	smc_listen_decline(new_smc, rc, ini.cln_first_contact);
1358 }
1359 
1360 static void smc_tcp_listen_work(struct work_struct *work)
1361 {
1362 	struct smc_sock *lsmc = container_of(work, struct smc_sock,
1363 					     tcp_listen_work);
1364 	struct sock *lsk = &lsmc->sk;
1365 	struct smc_sock *new_smc;
1366 	int rc = 0;
1367 
1368 	lock_sock(lsk);
1369 	while (lsk->sk_state == SMC_LISTEN) {
1370 		rc = smc_clcsock_accept(lsmc, &new_smc);
1371 		if (rc)
1372 			goto out;
1373 		if (!new_smc)
1374 			continue;
1375 
1376 		new_smc->listen_smc = lsmc;
1377 		new_smc->use_fallback = lsmc->use_fallback;
1378 		new_smc->fallback_rsn = lsmc->fallback_rsn;
1379 		sock_hold(lsk); /* sock_put in smc_listen_work */
1380 		INIT_WORK(&new_smc->smc_listen_work, smc_listen_work);
1381 		smc_copy_sock_settings_to_smc(new_smc);
1382 		new_smc->sk.sk_sndbuf = lsmc->sk.sk_sndbuf;
1383 		new_smc->sk.sk_rcvbuf = lsmc->sk.sk_rcvbuf;
1384 		sock_hold(&new_smc->sk); /* sock_put in passive closing */
1385 		if (!schedule_work(&new_smc->smc_listen_work))
1386 			sock_put(&new_smc->sk);
1387 	}
1388 
1389 out:
1390 	release_sock(lsk);
1391 	sock_put(&lsmc->sk); /* sock_hold in smc_listen */
1392 }
1393 
1394 static int smc_listen(struct socket *sock, int backlog)
1395 {
1396 	struct sock *sk = sock->sk;
1397 	struct smc_sock *smc;
1398 	int rc;
1399 
1400 	smc = smc_sk(sk);
1401 	lock_sock(sk);
1402 
1403 	rc = -EINVAL;
1404 	if ((sk->sk_state != SMC_INIT && sk->sk_state != SMC_LISTEN) ||
1405 	    smc->connect_nonblock)
1406 		goto out;
1407 
1408 	rc = 0;
1409 	if (sk->sk_state == SMC_LISTEN) {
1410 		sk->sk_max_ack_backlog = backlog;
1411 		goto out;
1412 	}
1413 	/* some socket options are handled in core, so we cannot apply
1414 	 * them to the clc socket -- copy smc socket options to clc socket
1415 	 */
1416 	smc_copy_sock_settings_to_clc(smc);
1417 	if (!smc->use_fallback)
1418 		tcp_sk(smc->clcsock->sk)->syn_smc = 1;
1419 
1420 	rc = kernel_listen(smc->clcsock, backlog);
1421 	if (rc)
1422 		goto out;
1423 	sk->sk_max_ack_backlog = backlog;
1424 	sk->sk_ack_backlog = 0;
1425 	sk->sk_state = SMC_LISTEN;
1426 	sock_hold(sk); /* sock_hold in tcp_listen_worker */
1427 	if (!schedule_work(&smc->tcp_listen_work))
1428 		sock_put(sk);
1429 
1430 out:
1431 	release_sock(sk);
1432 	return rc;
1433 }
1434 
1435 static int smc_accept(struct socket *sock, struct socket *new_sock,
1436 		      int flags, bool kern)
1437 {
1438 	struct sock *sk = sock->sk, *nsk;
1439 	DECLARE_WAITQUEUE(wait, current);
1440 	struct smc_sock *lsmc;
1441 	long timeo;
1442 	int rc = 0;
1443 
1444 	lsmc = smc_sk(sk);
1445 	sock_hold(sk); /* sock_put below */
1446 	lock_sock(sk);
1447 
1448 	if (lsmc->sk.sk_state != SMC_LISTEN) {
1449 		rc = -EINVAL;
1450 		release_sock(sk);
1451 		goto out;
1452 	}
1453 
1454 	/* Wait for an incoming connection */
1455 	timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
1456 	add_wait_queue_exclusive(sk_sleep(sk), &wait);
1457 	while (!(nsk = smc_accept_dequeue(sk, new_sock))) {
1458 		set_current_state(TASK_INTERRUPTIBLE);
1459 		if (!timeo) {
1460 			rc = -EAGAIN;
1461 			break;
1462 		}
1463 		release_sock(sk);
1464 		timeo = schedule_timeout(timeo);
1465 		/* wakeup by sk_data_ready in smc_listen_work() */
1466 		sched_annotate_sleep();
1467 		lock_sock(sk);
1468 		if (signal_pending(current)) {
1469 			rc = sock_intr_errno(timeo);
1470 			break;
1471 		}
1472 	}
1473 	set_current_state(TASK_RUNNING);
1474 	remove_wait_queue(sk_sleep(sk), &wait);
1475 
1476 	if (!rc)
1477 		rc = sock_error(nsk);
1478 	release_sock(sk);
1479 	if (rc)
1480 		goto out;
1481 
1482 	if (lsmc->sockopt_defer_accept && !(flags & O_NONBLOCK)) {
1483 		/* wait till data arrives on the socket */
1484 		timeo = msecs_to_jiffies(lsmc->sockopt_defer_accept *
1485 								MSEC_PER_SEC);
1486 		if (smc_sk(nsk)->use_fallback) {
1487 			struct sock *clcsk = smc_sk(nsk)->clcsock->sk;
1488 
1489 			lock_sock(clcsk);
1490 			if (skb_queue_empty(&clcsk->sk_receive_queue))
1491 				sk_wait_data(clcsk, &timeo, NULL);
1492 			release_sock(clcsk);
1493 		} else if (!atomic_read(&smc_sk(nsk)->conn.bytes_to_rcv)) {
1494 			lock_sock(nsk);
1495 			smc_rx_wait(smc_sk(nsk), &timeo, smc_rx_data_available);
1496 			release_sock(nsk);
1497 		}
1498 	}
1499 
1500 out:
1501 	sock_put(sk); /* sock_hold above */
1502 	return rc;
1503 }
1504 
1505 static int smc_getname(struct socket *sock, struct sockaddr *addr,
1506 		       int peer)
1507 {
1508 	struct smc_sock *smc;
1509 
1510 	if (peer && (sock->sk->sk_state != SMC_ACTIVE) &&
1511 	    (sock->sk->sk_state != SMC_APPCLOSEWAIT1))
1512 		return -ENOTCONN;
1513 
1514 	smc = smc_sk(sock->sk);
1515 
1516 	return smc->clcsock->ops->getname(smc->clcsock, addr, peer);
1517 }
1518 
1519 static int smc_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
1520 {
1521 	struct sock *sk = sock->sk;
1522 	struct smc_sock *smc;
1523 	int rc = -EPIPE;
1524 
1525 	smc = smc_sk(sk);
1526 	lock_sock(sk);
1527 	if ((sk->sk_state != SMC_ACTIVE) &&
1528 	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1529 	    (sk->sk_state != SMC_INIT))
1530 		goto out;
1531 
1532 	if (msg->msg_flags & MSG_FASTOPEN) {
1533 		if (sk->sk_state == SMC_INIT && !smc->connect_nonblock) {
1534 			smc_switch_to_fallback(smc);
1535 			smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
1536 		} else {
1537 			rc = -EINVAL;
1538 			goto out;
1539 		}
1540 	}
1541 
1542 	if (smc->use_fallback)
1543 		rc = smc->clcsock->ops->sendmsg(smc->clcsock, msg, len);
1544 	else
1545 		rc = smc_tx_sendmsg(smc, msg, len);
1546 out:
1547 	release_sock(sk);
1548 	return rc;
1549 }
1550 
1551 static int smc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
1552 		       int flags)
1553 {
1554 	struct sock *sk = sock->sk;
1555 	struct smc_sock *smc;
1556 	int rc = -ENOTCONN;
1557 
1558 	smc = smc_sk(sk);
1559 	lock_sock(sk);
1560 	if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
1561 		/* socket was connected before, no more data to read */
1562 		rc = 0;
1563 		goto out;
1564 	}
1565 	if ((sk->sk_state == SMC_INIT) ||
1566 	    (sk->sk_state == SMC_LISTEN) ||
1567 	    (sk->sk_state == SMC_CLOSED))
1568 		goto out;
1569 
1570 	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1571 		rc = 0;
1572 		goto out;
1573 	}
1574 
1575 	if (smc->use_fallback) {
1576 		rc = smc->clcsock->ops->recvmsg(smc->clcsock, msg, len, flags);
1577 	} else {
1578 		msg->msg_namelen = 0;
1579 		rc = smc_rx_recvmsg(smc, msg, NULL, len, flags);
1580 	}
1581 
1582 out:
1583 	release_sock(sk);
1584 	return rc;
1585 }
1586 
1587 static __poll_t smc_accept_poll(struct sock *parent)
1588 {
1589 	struct smc_sock *isk = smc_sk(parent);
1590 	__poll_t mask = 0;
1591 
1592 	spin_lock(&isk->accept_q_lock);
1593 	if (!list_empty(&isk->accept_q))
1594 		mask = EPOLLIN | EPOLLRDNORM;
1595 	spin_unlock(&isk->accept_q_lock);
1596 
1597 	return mask;
1598 }
1599 
1600 static __poll_t smc_poll(struct file *file, struct socket *sock,
1601 			     poll_table *wait)
1602 {
1603 	struct sock *sk = sock->sk;
1604 	struct smc_sock *smc;
1605 	__poll_t mask = 0;
1606 
1607 	if (!sk)
1608 		return EPOLLNVAL;
1609 
1610 	smc = smc_sk(sock->sk);
1611 	if (smc->use_fallback) {
1612 		/* delegate to CLC child sock */
1613 		mask = smc->clcsock->ops->poll(file, smc->clcsock, wait);
1614 		sk->sk_err = smc->clcsock->sk->sk_err;
1615 	} else {
1616 		if (sk->sk_state != SMC_CLOSED)
1617 			sock_poll_wait(file, sock, wait);
1618 		if (sk->sk_err)
1619 			mask |= EPOLLERR;
1620 		if ((sk->sk_shutdown == SHUTDOWN_MASK) ||
1621 		    (sk->sk_state == SMC_CLOSED))
1622 			mask |= EPOLLHUP;
1623 		if (sk->sk_state == SMC_LISTEN) {
1624 			/* woken up by sk_data_ready in smc_listen_work() */
1625 			mask |= smc_accept_poll(sk);
1626 		} else if (smc->use_fallback) { /* as a result of connect_work() */
1627 			mask |= smc->clcsock->ops->poll(file, smc->clcsock,
1628 							   wait);
1629 			sk->sk_err = smc->clcsock->sk->sk_err;
1630 		} else {
1631 			if ((sk->sk_state != SMC_INIT &&
1632 			     atomic_read(&smc->conn.sndbuf_space)) ||
1633 			    sk->sk_shutdown & SEND_SHUTDOWN) {
1634 				mask |= EPOLLOUT | EPOLLWRNORM;
1635 			} else {
1636 				sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
1637 				set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1638 			}
1639 			if (atomic_read(&smc->conn.bytes_to_rcv))
1640 				mask |= EPOLLIN | EPOLLRDNORM;
1641 			if (sk->sk_shutdown & RCV_SHUTDOWN)
1642 				mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
1643 			if (sk->sk_state == SMC_APPCLOSEWAIT1)
1644 				mask |= EPOLLIN;
1645 			if (smc->conn.urg_state == SMC_URG_VALID)
1646 				mask |= EPOLLPRI;
1647 		}
1648 	}
1649 
1650 	return mask;
1651 }
1652 
1653 static int smc_shutdown(struct socket *sock, int how)
1654 {
1655 	struct sock *sk = sock->sk;
1656 	struct smc_sock *smc;
1657 	int rc = -EINVAL;
1658 	int rc1 = 0;
1659 
1660 	smc = smc_sk(sk);
1661 
1662 	if ((how < SHUT_RD) || (how > SHUT_RDWR))
1663 		return rc;
1664 
1665 	lock_sock(sk);
1666 
1667 	rc = -ENOTCONN;
1668 	if ((sk->sk_state != SMC_ACTIVE) &&
1669 	    (sk->sk_state != SMC_PEERCLOSEWAIT1) &&
1670 	    (sk->sk_state != SMC_PEERCLOSEWAIT2) &&
1671 	    (sk->sk_state != SMC_APPCLOSEWAIT1) &&
1672 	    (sk->sk_state != SMC_APPCLOSEWAIT2) &&
1673 	    (sk->sk_state != SMC_APPFINCLOSEWAIT))
1674 		goto out;
1675 	if (smc->use_fallback) {
1676 		rc = kernel_sock_shutdown(smc->clcsock, how);
1677 		sk->sk_shutdown = smc->clcsock->sk->sk_shutdown;
1678 		if (sk->sk_shutdown == SHUTDOWN_MASK)
1679 			sk->sk_state = SMC_CLOSED;
1680 		goto out;
1681 	}
1682 	switch (how) {
1683 	case SHUT_RDWR:		/* shutdown in both directions */
1684 		rc = smc_close_active(smc);
1685 		break;
1686 	case SHUT_WR:
1687 		rc = smc_close_shutdown_write(smc);
1688 		break;
1689 	case SHUT_RD:
1690 		rc = 0;
1691 		/* nothing more to do because peer is not involved */
1692 		break;
1693 	}
1694 	if (smc->clcsock)
1695 		rc1 = kernel_sock_shutdown(smc->clcsock, how);
1696 	/* map sock_shutdown_cmd constants to sk_shutdown value range */
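	/* (SHUT_RD 0 -> RCV_SHUTDOWN 1, SHUT_WR 1 -> SEND_SHUTDOWN 2,
	 *  SHUT_RDWR 2 -> SHUTDOWN_MASK 3)
	 */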
1697 	sk->sk_shutdown |= how + 1;
1698 
1699 out:
1700 	release_sock(sk);
1701 	return rc ? rc : rc1;
1702 }
1703 
1704 static int smc_setsockopt(struct socket *sock, int level, int optname,
1705 			  char __user *optval, unsigned int optlen)
1706 {
1707 	struct sock *sk = sock->sk;
1708 	struct smc_sock *smc;
1709 	int val, rc;
1710 
1711 	smc = smc_sk(sk);
1712 
1713 	/* generic setsockopts reaching us here always apply to the
1714 	 * CLC socket
1715 	 */
1716 	rc = smc->clcsock->ops->setsockopt(smc->clcsock, level, optname,
1717 					   optval, optlen);
1718 	if (smc->clcsock->sk->sk_err) {
1719 		sk->sk_err = smc->clcsock->sk->sk_err;
1720 		sk->sk_error_report(sk);
1721 	}
1722 	if (rc)
1723 		return rc;
1724 
1725 	if (optlen < sizeof(int))
1726 		return -EINVAL;
1727 	if (get_user(val, (int __user *)optval))
1728 		return -EFAULT;
1729 
1730 	lock_sock(sk);
1731 	switch (optname) {
1732 	case TCP_ULP:
1733 	case TCP_FASTOPEN:
1734 	case TCP_FASTOPEN_CONNECT:
1735 	case TCP_FASTOPEN_KEY:
1736 	case TCP_FASTOPEN_NO_COOKIE:
1737 		/* option not supported by SMC */
1738 		if (sk->sk_state == SMC_INIT) {
1739 			smc_switch_to_fallback(smc);
1740 			smc->fallback_rsn = SMC_CLC_DECL_OPTUNSUPP;
1741 		} else {
1742 			if (!smc->use_fallback)
1743 				rc = -EINVAL;
1744 		}
1745 		break;
1746 	case TCP_NODELAY:
1747 		if (sk->sk_state != SMC_INIT &&
1748 		    sk->sk_state != SMC_LISTEN &&
1749 		    sk->sk_state != SMC_CLOSED) {
1750 			if (val && !smc->use_fallback)
1751 				mod_delayed_work(system_wq, &smc->conn.tx_work,
1752 						 0);
1753 		}
1754 		break;
1755 	case TCP_CORK:
1756 		if (sk->sk_state != SMC_INIT &&
1757 		    sk->sk_state != SMC_LISTEN &&
1758 		    sk->sk_state != SMC_CLOSED) {
1759 			if (!val && !smc->use_fallback)
1760 				mod_delayed_work(system_wq, &smc->conn.tx_work,
1761 						 0);
1762 		}
1763 		break;
1764 	case TCP_DEFER_ACCEPT:
1765 		smc->sockopt_defer_accept = val;
1766 		break;
1767 	default:
1768 		break;
1769 	}
1770 	release_sock(sk);
1771 
1772 	return rc;
1773 }
1774 
1775 static int smc_getsockopt(struct socket *sock, int level, int optname,
1776 			  char __user *optval, int __user *optlen)
1777 {
1778 	struct smc_sock *smc;
1779 
1780 	smc = smc_sk(sock->sk);
1781 	/* socket options apply to the CLC socket */
1782 	return smc->clcsock->ops->getsockopt(smc->clcsock, level, optname,
1783 					     optval, optlen);
1784 }
1785 
1786 static int smc_ioctl(struct socket *sock, unsigned int cmd,
1787 		     unsigned long arg)
1788 {
1789 	union smc_host_cursor cons, urg;
1790 	struct smc_connection *conn;
1791 	struct smc_sock *smc;
1792 	int answ;
1793 
1794 	smc = smc_sk(sock->sk);
1795 	conn = &smc->conn;
1796 	lock_sock(&smc->sk);
1797 	if (smc->use_fallback) {
1798 		if (!smc->clcsock) {
1799 			release_sock(&smc->sk);
1800 			return -EBADF;
1801 		}
1802 		answ = smc->clcsock->ops->ioctl(smc->clcsock, cmd, arg);
1803 		release_sock(&smc->sk);
1804 		return answ;
1805 	}
1806 	switch (cmd) {
1807 	case SIOCINQ: /* same as FIONREAD */
1808 		if (smc->sk.sk_state == SMC_LISTEN) {
1809 			release_sock(&smc->sk);
1810 			return -EINVAL;
1811 		}
1812 		if (smc->sk.sk_state == SMC_INIT ||
1813 		    smc->sk.sk_state == SMC_CLOSED)
1814 			answ = 0;
1815 		else
1816 			answ = atomic_read(&smc->conn.bytes_to_rcv);
1817 		break;
1818 	case SIOCOUTQ:
1819 		/* output queue size (not sent + not acked) */
1820 		if (smc->sk.sk_state == SMC_LISTEN) {
1821 			release_sock(&smc->sk);
1822 			return -EINVAL;
1823 		}
1824 		if (smc->sk.sk_state == SMC_INIT ||
1825 		    smc->sk.sk_state == SMC_CLOSED)
1826 			answ = 0;
1827 		else
1828 			answ = smc->conn.sndbuf_desc->len -
1829 					atomic_read(&smc->conn.sndbuf_space);
1830 		break;
1831 	case SIOCOUTQNSD:
1832 		/* output queue size (not sent only) */
1833 		if (smc->sk.sk_state == SMC_LISTEN) {
1834 			release_sock(&smc->sk);
1835 			return -EINVAL;
1836 		}
1837 		if (smc->sk.sk_state == SMC_INIT ||
1838 		    smc->sk.sk_state == SMC_CLOSED)
1839 			answ = 0;
1840 		else
1841 			answ = smc_tx_prepared_sends(&smc->conn);
1842 		break;
1843 	case SIOCATMARK:
1844 		if (smc->sk.sk_state == SMC_LISTEN) {
1845 			release_sock(&smc->sk);
1846 			return -EINVAL;
1847 		}
1848 		if (smc->sk.sk_state == SMC_INIT ||
1849 		    smc->sk.sk_state == SMC_CLOSED) {
1850 			answ = 0;
1851 		} else {
1852 			smc_curs_copy(&cons, &conn->local_tx_ctrl.cons, conn);
1853 			smc_curs_copy(&urg, &conn->urg_curs, conn);
1854 			answ = smc_curs_diff(conn->rmb_desc->len,
1855 					     &cons, &urg) == 1;
1856 		}
1857 		break;
1858 	default:
1859 		release_sock(&smc->sk);
1860 		return -ENOIOCTLCMD;
1861 	}
1862 	release_sock(&smc->sk);
1863 
1864 	return put_user(answ, (int __user *)arg);
1865 }
1866 
1867 static ssize_t smc_sendpage(struct socket *sock, struct page *page,
1868 			    int offset, size_t size, int flags)
1869 {
1870 	struct sock *sk = sock->sk;
1871 	struct smc_sock *smc;
1872 	int rc = -EPIPE;
1873 
1874 	smc = smc_sk(sk);
1875 	lock_sock(sk);
1876 	if (sk->sk_state != SMC_ACTIVE) {
1877 		release_sock(sk);
1878 		goto out;
1879 	}
1880 	release_sock(sk);
1881 	if (smc->use_fallback)
1882 		rc = kernel_sendpage(smc->clcsock, page, offset,
1883 				     size, flags);
1884 	else
1885 		rc = sock_no_sendpage(sock, page, offset, size, flags);
1886 
1887 out:
1888 	return rc;
1889 }
1890 
1891 /* Map the affected portions of the rmbe into an spd, note the number of bytes
1892  * to splice in conn->splice_pending, and press 'go'. Consumer cursor
1893  * updates are delayed until the respective page has been fully processed.
1894  * Note that subsequent recv() calls have to wait until all splice()
1895  * processing has completed.
1896  */
1897 static ssize_t smc_splice_read(struct socket *sock, loff_t *ppos,
1898 			       struct pipe_inode_info *pipe, size_t len,
1899 			       unsigned int flags)
1900 {
1901 	struct sock *sk = sock->sk;
1902 	struct smc_sock *smc;
1903 	int rc = -ENOTCONN;
1904 
1905 	smc = smc_sk(sk);
1906 	lock_sock(sk);
1907 	if (sk->sk_state == SMC_CLOSED && (sk->sk_shutdown & RCV_SHUTDOWN)) {
1908 		/* socket was connected before, no more data to read */
1909 		rc = 0;
1910 		goto out;
1911 	}
1912 	if (sk->sk_state == SMC_INIT ||
1913 	    sk->sk_state == SMC_LISTEN ||
1914 	    sk->sk_state == SMC_CLOSED)
1915 		goto out;
1916 
1917 	if (sk->sk_state == SMC_PEERFINCLOSEWAIT) {
1918 		rc = 0;
1919 		goto out;
1920 	}
1921 
1922 	if (smc->use_fallback) {
1923 		rc = smc->clcsock->ops->splice_read(smc->clcsock, ppos,
1924 						    pipe, len, flags);
1925 	} else {
1926 		if (*ppos) {
1927 			rc = -ESPIPE;
1928 			goto out;
1929 		}
1930 		if (flags & SPLICE_F_NONBLOCK)
1931 			flags = MSG_DONTWAIT;
1932 		else
1933 			flags = 0;
1934 		rc = smc_rx_recvmsg(smc, NULL, pipe, len, flags);
1935 	}
1936 out:
1937 	release_sock(sk);
1938 
1939 	return rc;
1940 }
1941 
1942 /* must look like tcp */
1943 static const struct proto_ops smc_sock_ops = {
1944 	.family		= PF_SMC,
1945 	.owner		= THIS_MODULE,
1946 	.release	= smc_release,
1947 	.bind		= smc_bind,
1948 	.connect	= smc_connect,
1949 	.socketpair	= sock_no_socketpair,
1950 	.accept		= smc_accept,
1951 	.getname	= smc_getname,
1952 	.poll		= smc_poll,
1953 	.ioctl		= smc_ioctl,
1954 	.listen		= smc_listen,
1955 	.shutdown	= smc_shutdown,
1956 	.setsockopt	= smc_setsockopt,
1957 	.getsockopt	= smc_getsockopt,
1958 	.sendmsg	= smc_sendmsg,
1959 	.recvmsg	= smc_recvmsg,
1960 	.mmap		= sock_no_mmap,
1961 	.sendpage	= smc_sendpage,
1962 	.splice_read	= smc_splice_read,
1963 };
1964 
1965 static int smc_create(struct net *net, struct socket *sock, int protocol,
1966 		      int kern)
1967 {
1968 	int family = (protocol == SMCPROTO_SMC6) ? PF_INET6 : PF_INET;
1969 	struct smc_sock *smc;
1970 	struct sock *sk;
1971 	int rc;
1972 
1973 	rc = -ESOCKTNOSUPPORT;
1974 	if (sock->type != SOCK_STREAM)
1975 		goto out;
1976 
1977 	rc = -EPROTONOSUPPORT;
1978 	if (protocol != SMCPROTO_SMC && protocol != SMCPROTO_SMC6)
1979 		goto out;
1980 
1981 	rc = -ENOBUFS;
1982 	sock->ops = &smc_sock_ops;
1983 	sk = smc_sock_alloc(net, sock, protocol);
1984 	if (!sk)
1985 		goto out;
1986 
1987 	/* create internal TCP socket for CLC handshake and fallback */
1988 	smc = smc_sk(sk);
1989 	smc->use_fallback = false; /* assume rdma capability first */
1990 	smc->fallback_rsn = 0;
1991 	rc = sock_create_kern(net, family, SOCK_STREAM, IPPROTO_TCP,
1992 			      &smc->clcsock);
1993 	if (rc) {
1994 		sk_common_release(sk);
1995 		goto out;
1996 	}
1997 	smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE);
1998 	smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE);
1999 
2000 out:
2001 	return rc;
2002 }
2003 
2004 static const struct net_proto_family smc_sock_family_ops = {
2005 	.family	= PF_SMC,
2006 	.owner	= THIS_MODULE,
2007 	.create	= smc_create,
2008 };
2009 
2010 unsigned int smc_net_id;
2011 
2012 static __net_init int smc_net_init(struct net *net)
2013 {
2014 	return smc_pnet_net_init(net);
2015 }
2016 
2017 static void __net_exit smc_net_exit(struct net *net)
2018 {
2019 	smc_pnet_net_exit(net);
2020 }
2021 
2022 static struct pernet_operations smc_net_ops = {
2023 	.init = smc_net_init,
2024 	.exit = smc_net_exit,
2025 	.id   = &smc_net_id,
2026 	.size = sizeof(struct smc_net),
2027 };
2028 
2029 static int __init smc_init(void)
2030 {
2031 	int rc;
2032 
2033 	rc = register_pernet_subsys(&smc_net_ops);
2034 	if (rc)
2035 		return rc;
2036 
2037 	rc = smc_pnet_init();
2038 	if (rc)
2039 		goto out_pernet_subsys;
2040 
2041 	rc = smc_llc_init();
2042 	if (rc) {
2043 		pr_err("%s: smc_llc_init fails with %d\n", __func__, rc);
2044 		goto out_pnet;
2045 	}
2046 
2047 	rc = smc_cdc_init();
2048 	if (rc) {
2049 		pr_err("%s: smc_cdc_init fails with %d\n", __func__, rc);
2050 		goto out_pnet;
2051 	}
2052 
2053 	rc = proto_register(&smc_proto, 1);
2054 	if (rc) {
2055 		pr_err("%s: proto_register(v4) fails with %d\n", __func__, rc);
2056 		goto out_pnet;
2057 	}
2058 
2059 	rc = proto_register(&smc_proto6, 1);
2060 	if (rc) {
2061 		pr_err("%s: proto_register(v6) fails with %d\n", __func__, rc);
2062 		goto out_proto;
2063 	}
2064 
2065 	rc = sock_register(&smc_sock_family_ops);
2066 	if (rc) {
2067 		pr_err("%s: sock_register fails with %d\n", __func__, rc);
2068 		goto out_proto6;
2069 	}
2070 	INIT_HLIST_HEAD(&smc_v4_hashinfo.ht);
2071 	INIT_HLIST_HEAD(&smc_v6_hashinfo.ht);
2072 
2073 	rc = smc_ib_register_client();
2074 	if (rc) {
2075 		pr_err("%s: ib_register fails with %d\n", __func__, rc);
2076 		goto out_sock;
2077 	}
2078 
2079 	static_branch_enable(&tcp_have_smc);
2080 	return 0;
2081 
2082 out_sock:
2083 	sock_unregister(PF_SMC);
2084 out_proto6:
2085 	proto_unregister(&smc_proto6);
2086 out_proto:
2087 	proto_unregister(&smc_proto);
2088 out_pnet:
2089 	smc_pnet_exit();
2090 out_pernet_subsys:
2091 	unregister_pernet_subsys(&smc_net_ops);
2092 
2093 	return rc;
2094 }
2095 
2096 static void __exit smc_exit(void)
2097 {
2098 	smc_core_exit();
2099 	static_branch_disable(&tcp_have_smc);
2100 	smc_ib_unregister_client();
2101 	sock_unregister(PF_SMC);
2102 	proto_unregister(&smc_proto6);
2103 	proto_unregister(&smc_proto);
2104 	smc_pnet_exit();
2105 	unregister_pernet_subsys(&smc_net_ops);
2106 }
2107 
2108 module_init(smc_init);
2109 module_exit(smc_exit);
2110 
2111 MODULE_AUTHOR("Ursula Braun <ubraun@linux.vnet.ibm.com>");
2112 MODULE_DESCRIPTION("smc socket address family");
2113 MODULE_LICENSE("GPL");
2114 MODULE_ALIAS_NETPROTO(PF_SMC);
2115