xref: /linux/net/smc/smc_clc.c (revision e21f9e2e862e9eb3dd64eaddb6256b3e5098660f)
// SPDX-License-Identifier: GPL-2.0
/*
 *  Shared Memory Communications over RDMA (SMC-R) and RoCE
 *
 *  CLC (connection layer control) handshake over initial TCP socket to
 *  prepare for RDMA traffic
 *
 *  Copyright IBM Corp. 2016, 2018
 *
 *  Author(s):  Ursula Braun <ubraun@linux.vnet.ibm.com>
 */

#include <linux/in.h>
#include <linux/inetdevice.h>
#include <linux/if_ether.h>
#include <linux/sched/signal.h>

#include <net/addrconf.h>
#include <net/sock.h>
#include <net/tcp.h>

#include "smc.h"
#include "smc_core.h"
#include "smc_clc.h"
#include "smc_ib.h"

/* eye catcher "SMCR" EBCDIC for CLC messages */
static const char SMC_EYECATCHER[4] = {'\xe2', '\xd4', '\xc3', '\xd9'};

/* check if received message has a correct header length and contains valid
 * heading and trailing eyecatchers
 */
static bool smc_clc_msg_hdr_valid(struct smc_clc_msg_hdr *clcm)
{
	struct smc_clc_msg_proposal_prefix *pclc_prfx;
	struct smc_clc_msg_accept_confirm *clc;
	struct smc_clc_msg_proposal *pclc;
	struct smc_clc_msg_decline *dclc;
	struct smc_clc_msg_trail *trl;

	if (memcmp(clcm->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)))
		return false;
	switch (clcm->type) {
	case SMC_CLC_PROPOSAL:
		pclc = (struct smc_clc_msg_proposal *)clcm;
		pclc_prfx = smc_clc_proposal_get_prefix(pclc);
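		/* a proposal is variable length: the fixed part is followed
		 * (at iparea_offset) by the prefix descriptor, the announced
		 * IPv6 prefixes and the trailer; the length advertised in
		 * the header must match this layout exactly
		 */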
		if (ntohs(pclc->hdr.length) !=
			sizeof(*pclc) + ntohs(pclc->iparea_offset) +
			sizeof(*pclc_prfx) +
			pclc_prfx->ipv6_prefixes_cnt *
				sizeof(struct smc_clc_ipv6_prefix) +
			sizeof(*trl))
			return false;
		trl = (struct smc_clc_msg_trail *)
			((u8 *)pclc + ntohs(pclc->hdr.length) - sizeof(*trl));
		break;
	case SMC_CLC_ACCEPT:
	case SMC_CLC_CONFIRM:
		clc = (struct smc_clc_msg_accept_confirm *)clcm;
		if (ntohs(clc->hdr.length) != sizeof(*clc))
			return false;
		trl = &clc->trl;
		break;
	case SMC_CLC_DECLINE:
		dclc = (struct smc_clc_msg_decline *)clcm;
		if (ntohs(dclc->hdr.length) != sizeof(*dclc))
			return false;
		trl = &dclc->trl;
		break;
	default:
		return false;
	}
	if (memcmp(trl->eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER)))
		return false;
	return true;
}

/* find ipv4 addr on device and get the prefix len, fill CLC proposal msg */
static int smc_clc_prfx_set4_rcu(struct dst_entry *dst, __be32 ipv4,
				 struct smc_clc_msg_proposal_prefix *prop)
{
	struct in_device *in_dev = __in_dev_get_rcu(dst->dev);

	if (!in_dev)
		return -ENODEV;
	for_ifa(in_dev) {
		if (!inet_ifa_match(ipv4, ifa))
			continue;
		prop->prefix_len = inet_mask_len(ifa->ifa_mask);
		prop->outgoing_subnet = ifa->ifa_address & ifa->ifa_mask;
		/* prop->ipv6_prefixes_cnt = 0; already done by memset before */
		return 0;
	} endfor_ifa(in_dev);
	return -ENOENT;
}

/* fill CLC proposal msg with ipv6 prefixes from device */
static int smc_clc_prfx_set6_rcu(struct dst_entry *dst,
				 struct smc_clc_msg_proposal_prefix *prop,
				 struct smc_clc_ipv6_prefix *ipv6_prfx)
{
#if IS_ENABLED(CONFIG_IPV6)
	struct inet6_dev *in6_dev = __in6_dev_get(dst->dev);
	struct inet6_ifaddr *ifa;
	int cnt = 0;

	if (!in6_dev)
		return -ENODEV;
	/* use a maximum of 8 IPv6 prefixes from device */
	list_for_each_entry(ifa, &in6_dev->addr_list, if_list) {
		if (ipv6_addr_type(&ifa->addr) & IPV6_ADDR_LINKLOCAL)
			continue;
		ipv6_addr_prefix(&ipv6_prfx[cnt].prefix,
				 &ifa->addr, ifa->prefix_len);
		ipv6_prfx[cnt].prefix_len = ifa->prefix_len;
		cnt++;
		if (cnt == SMC_CLC_MAX_V6_PREFIX)
			break;
	}
	prop->ipv6_prefixes_cnt = cnt;
	if (cnt)
		return 0;
#endif
	return -ENOENT;
}

/* retrieve and set prefixes in CLC proposal msg */
static int smc_clc_prfx_set(struct socket *clcsock,
			    struct smc_clc_msg_proposal_prefix *prop,
			    struct smc_clc_ipv6_prefix *ipv6_prfx)
{
	struct dst_entry *dst = sk_dst_get(clcsock->sk);
	struct sockaddr_storage addrs;
	struct sockaddr_in6 *addr6;
	struct sockaddr_in *addr;
	int rc = -ENOENT;

	memset(prop, 0, sizeof(*prop));
	if (!dst) {
		rc = -ENOTCONN;
		goto out;
	}
	if (!dst->dev) {
		rc = -ENODEV;
		goto out_rel;
	}
	/* get address to which the internal TCP socket is bound */
	kernel_getsockname(clcsock, (struct sockaddr *)&addrs);
	/* analyze IP specific data of net_device belonging to TCP socket */
	addr6 = (struct sockaddr_in6 *)&addrs;
	rcu_read_lock();
	if (addrs.ss_family == PF_INET) {
		/* IPv4 */
		addr = (struct sockaddr_in *)&addrs;
		rc = smc_clc_prfx_set4_rcu(dst, addr->sin_addr.s_addr, prop);
	} else if (ipv6_addr_v4mapped(&addr6->sin6_addr)) {
		/* mapped IPv4 address - peer is IPv4 only */
		rc = smc_clc_prfx_set4_rcu(dst, addr6->sin6_addr.s6_addr32[3],
					   prop);
	} else {
		/* IPv6 */
		rc = smc_clc_prfx_set6_rcu(dst, prop, ipv6_prfx);
	}
	rcu_read_unlock();
out_rel:
	dst_release(dst);
out:
	return rc;
}

/* match ipv4 addrs of dev against addr in CLC proposal */
static int smc_clc_prfx_match4_rcu(struct net_device *dev,
				   struct smc_clc_msg_proposal_prefix *prop)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);

	if (!in_dev)
		return -ENODEV;
	for_ifa(in_dev) {
		if (prop->prefix_len == inet_mask_len(ifa->ifa_mask) &&
		    inet_ifa_match(prop->outgoing_subnet, ifa))
			return 0;
	} endfor_ifa(in_dev);

	return -ENOENT;
}

/* match ipv6 addrs of dev against addrs in CLC proposal */
static int smc_clc_prfx_match6_rcu(struct net_device *dev,
				   struct smc_clc_msg_proposal_prefix *prop)
{
#if IS_ENABLED(CONFIG_IPV6)
	struct inet6_dev *in6_dev = __in6_dev_get(dev);
	struct smc_clc_ipv6_prefix *ipv6_prfx;
	struct inet6_ifaddr *ifa;
	int i, max;

	if (!in6_dev)
		return -ENODEV;
	/* ipv6 prefix list starts behind smc_clc_msg_proposal_prefix */
	ipv6_prfx = (struct smc_clc_ipv6_prefix *)((u8 *)prop + sizeof(*prop));
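	/* clamp the peer-supplied count so the loop never walks past
	 * SMC_CLC_MAX_V6_PREFIX entries of the received proposal
	 */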
	max = min_t(u8, prop->ipv6_prefixes_cnt, SMC_CLC_MAX_V6_PREFIX);
	list_for_each_entry(ifa, &in6_dev->addr_list, if_list) {
		if (ipv6_addr_type(&ifa->addr) & IPV6_ADDR_LINKLOCAL)
			continue;
		for (i = 0; i < max; i++) {
			if (ifa->prefix_len == ipv6_prfx[i].prefix_len &&
			    ipv6_prefix_equal(&ifa->addr, &ipv6_prfx[i].prefix,
					      ifa->prefix_len))
				return 0;
		}
	}
#endif
	return -ENOENT;
}

/* check if proposed prefixes match one of our device prefixes */
int smc_clc_prfx_match(struct socket *clcsock,
		       struct smc_clc_msg_proposal_prefix *prop)
{
	struct dst_entry *dst = sk_dst_get(clcsock->sk);
	int rc;

	if (!dst) {
		rc = -ENOTCONN;
		goto out;
	}
	if (!dst->dev) {
		rc = -ENODEV;
		goto out_rel;
	}
	rcu_read_lock();
	if (!prop->ipv6_prefixes_cnt)
		rc = smc_clc_prfx_match4_rcu(dst->dev, prop);
	else
		rc = smc_clc_prfx_match6_rcu(dst->dev, prop);
	rcu_read_unlock();
out_rel:
	dst_release(dst);
out:
	return rc;
}

/* Wait for data on the tcp-socket, analyze received data
 * Returns:
 * 0 if success and it was not a decline that we received.
 * SMC_CLC_DECL_REPLY if a decline was received (for fallback, without
 * sending another decline).
 * clcsock error, -EINTR, -ECONNRESET, -EPROTO otherwise.
 */
int smc_clc_wait_msg(struct smc_sock *smc, void *buf, int buflen,
		     u8 expected_type)
{
	struct sock *clc_sk = smc->clcsock->sk;
	struct smc_clc_msg_hdr *clcm = buf;
	struct msghdr msg = {NULL, 0};
	int reason_code = 0;
	struct kvec vec = {buf, buflen};
	int len, datlen;
	int krflags;

	/* peek the first few bytes to determine length of data to receive
	 * so we don't consume any subsequent CLC message or payload data
	 * in the TCP byte stream;
	 * caller must make sure that buflen is no less than
	 * sizeof(struct smc_clc_msg_hdr)
	 */
	krflags = MSG_PEEK | MSG_WAITALL;
	smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME;
	iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &vec, 1,
			sizeof(struct smc_clc_msg_hdr));
	len = sock_recvmsg(smc->clcsock, &msg, krflags);
	if (signal_pending(current)) {
		reason_code = -EINTR;
		clc_sk->sk_err = EINTR;
		smc->sk.sk_err = EINTR;
		goto out;
	}
	if (clc_sk->sk_err) {
		reason_code = -clc_sk->sk_err;
		smc->sk.sk_err = clc_sk->sk_err;
		goto out;
	}
	if (!len) { /* peer has performed orderly shutdown */
		smc->sk.sk_err = ECONNRESET;
		reason_code = -ECONNRESET;
		goto out;
	}
	if (len < 0) {
		smc->sk.sk_err = -len;
		reason_code = len;
		goto out;
	}
	datlen = ntohs(clcm->length);
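	/* reject short reads, lengths exceeding the caller's buffer and any
	 * message type other than the expected one or a decline
	 */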
	if ((len < sizeof(struct smc_clc_msg_hdr)) ||
	    (datlen > buflen) ||
	    ((clcm->type != SMC_CLC_DECLINE) &&
	     (clcm->type != expected_type))) {
		smc->sk.sk_err = EPROTO;
		reason_code = -EPROTO;
		goto out;
	}

	/* receive the complete CLC message */
	memset(&msg, 0, sizeof(struct msghdr));
	iov_iter_kvec(&msg.msg_iter, READ | ITER_KVEC, &vec, 1, datlen);
	krflags = MSG_WAITALL;
	smc->clcsock->sk->sk_rcvtimeo = CLC_WAIT_TIME;
	len = sock_recvmsg(smc->clcsock, &msg, krflags);
	if (len < datlen || !smc_clc_msg_hdr_valid(clcm)) {
		smc->sk.sk_err = EPROTO;
		reason_code = -EPROTO;
		goto out;
	}
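	/* a decline carrying the sync error flag means the peer detected an
	 * out-of-sync condition; the whole link group must be terminated
	 */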
	if (clcm->type == SMC_CLC_DECLINE) {
		reason_code = SMC_CLC_DECL_REPLY;
		if (((struct smc_clc_msg_decline *)buf)->hdr.flag) {
			smc->conn.lgr->sync_err = true;
			smc_lgr_terminate(smc->conn.lgr);
		}
	}

out:
	return reason_code;
}

/* send CLC DECLINE message across internal TCP socket */
int smc_clc_send_decline(struct smc_sock *smc, u32 peer_diag_info)
{
	struct smc_clc_msg_decline dclc;
	struct msghdr msg;
	struct kvec vec;
	int len;

	memset(&dclc, 0, sizeof(dclc));
	memcpy(dclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
	dclc.hdr.type = SMC_CLC_DECLINE;
	dclc.hdr.length = htons(sizeof(struct smc_clc_msg_decline));
	dclc.hdr.version = SMC_CLC_V1;
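	/* the header flag signals a synchronization error to the peer */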
	dclc.hdr.flag = (peer_diag_info == SMC_CLC_DECL_SYNCERR) ? 1 : 0;
	memcpy(dclc.id_for_peer, local_systemid, sizeof(local_systemid));
	dclc.peer_diagnosis = htonl(peer_diag_info);
	memcpy(dclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));

	memset(&msg, 0, sizeof(msg));
	vec.iov_base = &dclc;
	vec.iov_len = sizeof(struct smc_clc_msg_decline);
	len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1,
			     sizeof(struct smc_clc_msg_decline));
	if (len < sizeof(struct smc_clc_msg_decline))
		smc->sk.sk_err = EPROTO;
	if (len < 0)
		smc->sk.sk_err = -len;
	return sock_error(&smc->sk);
}

/* send CLC PROPOSAL message across internal TCP socket */
int smc_clc_send_proposal(struct smc_sock *smc,
			  struct smc_ib_device *smcibdev,
			  u8 ibport)
{
	struct smc_clc_ipv6_prefix ipv6_prfx[SMC_CLC_MAX_V6_PREFIX];
	struct smc_clc_msg_proposal_prefix pclc_prfx;
	struct smc_clc_msg_proposal pclc;
	struct smc_clc_msg_trail trl;
	int len, i, plen, rc;
	int reason_code = 0;
	struct kvec vec[4];
	struct msghdr msg;

	/* retrieve ip prefixes for CLC proposal msg */
	rc = smc_clc_prfx_set(smc->clcsock, &pclc_prfx, ipv6_prfx);
	if (rc)
		return SMC_CLC_DECL_CNFERR; /* configuration error */

	/* send SMC Proposal CLC message */
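	/* total wire length: fixed proposal header, prefix descriptor,
	 * the collected IPv6 prefixes (if any) and the trailing eyecatcher
	 */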
	plen = sizeof(pclc) + sizeof(pclc_prfx) +
	       (pclc_prfx.ipv6_prefixes_cnt * sizeof(ipv6_prfx[0])) +
	       sizeof(trl);
	memset(&pclc, 0, sizeof(pclc));
	memcpy(pclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
	pclc.hdr.type = SMC_CLC_PROPOSAL;
	pclc.hdr.length = htons(plen);
	pclc.hdr.version = SMC_CLC_V1;		/* SMC version */
	memcpy(pclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
	memcpy(&pclc.lcl.gid, &smcibdev->gid[ibport - 1], SMC_GID_SIZE);
	memcpy(&pclc.lcl.mac, &smcibdev->mac[ibport - 1], ETH_ALEN);
	pclc.iparea_offset = htons(0);

	memcpy(trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
	memset(&msg, 0, sizeof(msg));
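	/* gather the message from its pieces: header, prefix descriptor,
	 * optional IPv6 prefix array and trailer (at most 4 kvec entries)
	 */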
	i = 0;
	vec[i].iov_base = &pclc;
	vec[i++].iov_len = sizeof(pclc);
	vec[i].iov_base = &pclc_prfx;
	vec[i++].iov_len = sizeof(pclc_prfx);
	if (pclc_prfx.ipv6_prefixes_cnt > 0) {
		vec[i].iov_base = &ipv6_prfx[0];
		vec[i++].iov_len = pclc_prfx.ipv6_prefixes_cnt *
				   sizeof(ipv6_prfx[0]);
	}
	vec[i].iov_base = &trl;
	vec[i++].iov_len = sizeof(trl);
	/* due to the few bytes needed for clc-handshake this cannot block */
	len = kernel_sendmsg(smc->clcsock, &msg, vec, i, plen);
	if (len < sizeof(pclc)) {
		if (len >= 0) {
			reason_code = -ENETUNREACH;
			smc->sk.sk_err = -reason_code;
		} else {
			smc->sk.sk_err = smc->clcsock->sk->sk_err;
			reason_code = -smc->sk.sk_err;
		}
	}

	return reason_code;
}

/* send CLC CONFIRM message across internal TCP socket */
int smc_clc_send_confirm(struct smc_sock *smc)
{
	struct smc_connection *conn = &smc->conn;
	struct smc_clc_msg_accept_confirm cclc;
	struct smc_link *link;
	int reason_code = 0;
	struct msghdr msg;
	struct kvec vec;
	int len;

	link = &conn->lgr->lnk[SMC_SINGLE_LINK];
	/* send SMC Confirm CLC msg */
	memset(&cclc, 0, sizeof(cclc));
	memcpy(cclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
	cclc.hdr.type = SMC_CLC_CONFIRM;
	cclc.hdr.length = htons(sizeof(cclc));
	cclc.hdr.version = SMC_CLC_V1;		/* SMC version */
	memcpy(cclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
	memcpy(&cclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1],
	       SMC_GID_SIZE);
	memcpy(&cclc.lcl.mac, &link->smcibdev->mac[link->ibport - 1], ETH_ALEN);
	hton24(cclc.qpn, link->roce_qp->qp_num);
	cclc.rmb_rkey =
		htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
	cclc.conn_idx = 1; /* for now: 1 RMB = 1 RMBE */
	cclc.rmbe_alert_token = htonl(conn->alert_token_local);
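	/* use the smaller of the local path MTU and the MTU announced by
	 * the peer
	 */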
	cclc.qp_mtu = min(link->path_mtu, link->peer_mtu);
	cclc.rmbe_size = conn->rmbe_size_short;
	cclc.rmb_dma_addr = cpu_to_be64(
		(u64)sg_dma_address(conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl));
	hton24(cclc.psn, link->psn_initial);

	memcpy(cclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));

	memset(&msg, 0, sizeof(msg));
	vec.iov_base = &cclc;
	vec.iov_len = sizeof(cclc);
	len = kernel_sendmsg(smc->clcsock, &msg, &vec, 1, sizeof(cclc));
	if (len < sizeof(cclc)) {
		if (len >= 0) {
			reason_code = -ENETUNREACH;
			smc->sk.sk_err = -reason_code;
		} else {
			smc->sk.sk_err = smc->clcsock->sk->sk_err;
			reason_code = -smc->sk.sk_err;
		}
	}
	return reason_code;
}

/* send CLC ACCEPT message across internal TCP socket */
int smc_clc_send_accept(struct smc_sock *new_smc, int srv_first_contact)
{
	struct smc_connection *conn = &new_smc->conn;
	struct smc_clc_msg_accept_confirm aclc;
	struct smc_link *link;
	struct msghdr msg;
	struct kvec vec;
	int rc = 0;
	int len;

	link = &conn->lgr->lnk[SMC_SINGLE_LINK];
	memset(&aclc, 0, sizeof(aclc));
	memcpy(aclc.hdr.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));
	aclc.hdr.type = SMC_CLC_ACCEPT;
	aclc.hdr.length = htons(sizeof(aclc));
	aclc.hdr.version = SMC_CLC_V1;		/* SMC version */
	if (srv_first_contact)
		aclc.hdr.flag = 1;
	memcpy(aclc.lcl.id_for_peer, local_systemid, sizeof(local_systemid));
	memcpy(&aclc.lcl.gid, &link->smcibdev->gid[link->ibport - 1],
	       SMC_GID_SIZE);
	memcpy(&aclc.lcl.mac, link->smcibdev->mac[link->ibport - 1], ETH_ALEN);
	hton24(aclc.qpn, link->roce_qp->qp_num);
	aclc.rmb_rkey =
		htonl(conn->rmb_desc->mr_rx[SMC_SINGLE_LINK]->rkey);
	aclc.conn_idx = 1;			/* as long as 1 RMB = 1 RMBE */
	aclc.rmbe_alert_token = htonl(conn->alert_token_local);
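	/* unlike the confirm, the accept announces only the local path MTU;
	 * the peer's MTU is not factored in here
	 */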
	aclc.qp_mtu = link->path_mtu;
	aclc.rmbe_size = conn->rmbe_size_short;
	aclc.rmb_dma_addr = cpu_to_be64(
		(u64)sg_dma_address(conn->rmb_desc->sgt[SMC_SINGLE_LINK].sgl));
	hton24(aclc.psn, link->psn_initial);
	memcpy(aclc.trl.eyecatcher, SMC_EYECATCHER, sizeof(SMC_EYECATCHER));

	memset(&msg, 0, sizeof(msg));
	vec.iov_base = &aclc;
	vec.iov_len = sizeof(aclc);
	len = kernel_sendmsg(new_smc->clcsock, &msg, &vec, 1, sizeof(aclc));
	if (len < sizeof(aclc)) {
		if (len >= 0)
			new_smc->sk.sk_err = EPROTO;
		else
			new_smc->sk.sk_err = new_smc->clcsock->sk->sk_err;
		rc = sock_error(&new_smc->sk);
	}

	return rc;
}