xref: /linux/net/mptcp/options.c (revision 8b0adbe3e38dbe5aae9edf6f5159ffdca7cfbdf1)
1 // SPDX-License-Identifier: GPL-2.0
2 /* Multipath TCP
3  *
4  * Copyright (c) 2017 - 2019, Intel Corporation.
5  */
6 
7 #define pr_fmt(fmt) "MPTCP: " fmt
8 
9 #include <linux/kernel.h>
10 #include <crypto/sha2.h>
11 #include <net/tcp.h>
12 #include <net/mptcp.h>
13 #include "protocol.h"
14 #include "mib.h"
15 
16 static bool mptcp_cap_flag_sha256(u8 flags)
17 {
18 	return (flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA256;
19 }
20 
21 static void mptcp_parse_option(const struct sk_buff *skb,
22 			       const unsigned char *ptr, int opsize,
23 			       struct mptcp_options_received *mp_opt)
24 {
25 	u8 subtype = *ptr >> 4;
26 	int expected_opsize;
27 	u8 version;
28 	u8 flags;
29 	u8 i;
30 
31 	switch (subtype) {
32 	case MPTCPOPT_MP_CAPABLE:
33 		/* strict size checking */
34 		if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
35 			if (skb->len > tcp_hdr(skb)->doff << 2)
36 				expected_opsize = TCPOLEN_MPTCP_MPC_ACK_DATA;
37 			else
38 				expected_opsize = TCPOLEN_MPTCP_MPC_ACK;
39 		} else {
40 			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK)
41 				expected_opsize = TCPOLEN_MPTCP_MPC_SYNACK;
42 			else
43 				expected_opsize = TCPOLEN_MPTCP_MPC_SYN;
44 		}
45 		if (opsize != expected_opsize)
46 			break;
47 
48 		/* try to be gentle vs future versions on the initial syn */
49 		version = *ptr++ & MPTCP_VERSION_MASK;
50 		if (opsize != TCPOLEN_MPTCP_MPC_SYN) {
51 			if (version != MPTCP_SUPPORTED_VERSION)
52 				break;
53 		} else if (version < MPTCP_SUPPORTED_VERSION) {
54 			break;
55 		}
56 
57 		flags = *ptr++;
58 		if (!mptcp_cap_flag_sha256(flags) ||
59 		    (flags & MPTCP_CAP_EXTENSIBILITY))
60 			break;
61 
62 		/* RFC 6824, Section 3.1:
63 		 * "For the Checksum Required bit (labeled "A"), if either
64 		 * host requires the use of checksums, checksums MUST be used.
65 		 * In other words, the only way for checksums not to be used
66 		 * is if both hosts in their SYNs set A=0."
67 		 *
68 		 * Section 3.3.0:
69 		 * "If a checksum is not present when its use has been
70 		 * negotiated, the receiver MUST close the subflow with a RST as
71 		 * it is considered broken."
72 		 *
73 		 * We don't implement DSS checksum - fall back to TCP.
74 		 */
75 		if (flags & MPTCP_CAP_CHECKSUM_REQD)
76 			break;
77 
78 		mp_opt->mp_capable = 1;
79 		if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) {
80 			mp_opt->sndr_key = get_unaligned_be64(ptr);
81 			ptr += 8;
82 		}
83 		if (opsize >= TCPOLEN_MPTCP_MPC_ACK) {
84 			mp_opt->rcvr_key = get_unaligned_be64(ptr);
85 			ptr += 8;
86 		}
87 		if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA) {
88 			/* Section 3.1.:
89 			 * "the data parameters in a MP_CAPABLE are semantically
90 			 * equivalent to those in a DSS option and can be used
91 			 * interchangeably."
92 			 */
93 			mp_opt->dss = 1;
94 			mp_opt->use_map = 1;
95 			mp_opt->mpc_map = 1;
96 			mp_opt->data_len = get_unaligned_be16(ptr);
97 			ptr += 2;
98 		}
99 		pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d",
100 			 version, flags, opsize, mp_opt->sndr_key,
101 			 mp_opt->rcvr_key, mp_opt->data_len);
102 		break;
103 
104 	case MPTCPOPT_MP_JOIN:
105 		mp_opt->mp_join = 1;
106 		if (opsize == TCPOLEN_MPTCP_MPJ_SYN) {
107 			mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP;
108 			mp_opt->join_id = *ptr++;
109 			mp_opt->token = get_unaligned_be32(ptr);
110 			ptr += 4;
111 			mp_opt->nonce = get_unaligned_be32(ptr);
112 			ptr += 4;
113 			pr_debug("MP_JOIN bkup=%u, id=%u, token=%u, nonce=%u",
114 				 mp_opt->backup, mp_opt->join_id,
115 				 mp_opt->token, mp_opt->nonce);
116 		} else if (opsize == TCPOLEN_MPTCP_MPJ_SYNACK) {
117 			mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP;
118 			mp_opt->join_id = *ptr++;
119 			mp_opt->thmac = get_unaligned_be64(ptr);
120 			ptr += 8;
121 			mp_opt->nonce = get_unaligned_be32(ptr);
122 			ptr += 4;
123 			pr_debug("MP_JOIN bkup=%u, id=%u, thmac=%llu, nonce=%u",
124 				 mp_opt->backup, mp_opt->join_id,
125 				 mp_opt->thmac, mp_opt->nonce);
126 		} else if (opsize == TCPOLEN_MPTCP_MPJ_ACK) {
127 			ptr += 2;
128 			memcpy(mp_opt->hmac, ptr, MPTCPOPT_HMAC_LEN);
129 			pr_debug("MP_JOIN hmac");
130 		} else {
131 			pr_warn("MP_JOIN bad option size");
132 			mp_opt->mp_join = 0;
133 		}
134 		break;
135 
136 	case MPTCPOPT_DSS:
137 		pr_debug("DSS");
138 		ptr++;
139 
140 		/* we must clear 'mpc_map' be able to detect MP_CAPABLE
141 		 * map vs DSS map in mptcp_incoming_options(), and reconstruct
142 		 * map info accordingly
143 		 */
144 		mp_opt->mpc_map = 0;
145 		flags = (*ptr++) & MPTCP_DSS_FLAG_MASK;
146 		mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0;
147 		mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0;
148 		mp_opt->use_map = (flags & MPTCP_DSS_HAS_MAP) != 0;
149 		mp_opt->ack64 = (flags & MPTCP_DSS_ACK64) != 0;
150 		mp_opt->use_ack = (flags & MPTCP_DSS_HAS_ACK);
151 
152 		pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d",
153 			 mp_opt->data_fin, mp_opt->dsn64,
154 			 mp_opt->use_map, mp_opt->ack64,
155 			 mp_opt->use_ack);
156 
157 		expected_opsize = TCPOLEN_MPTCP_DSS_BASE;
158 
159 		if (mp_opt->use_ack) {
160 			if (mp_opt->ack64)
161 				expected_opsize += TCPOLEN_MPTCP_DSS_ACK64;
162 			else
163 				expected_opsize += TCPOLEN_MPTCP_DSS_ACK32;
164 		}
165 
166 		if (mp_opt->use_map) {
167 			if (mp_opt->dsn64)
168 				expected_opsize += TCPOLEN_MPTCP_DSS_MAP64;
169 			else
170 				expected_opsize += TCPOLEN_MPTCP_DSS_MAP32;
171 		}
172 
173 		/* RFC 6824, Section 3.3:
174 		 * If a checksum is present, but its use had
175 		 * not been negotiated in the MP_CAPABLE handshake,
176 		 * the checksum field MUST be ignored.
177 		 */
178 		if (opsize != expected_opsize &&
179 		    opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM)
180 			break;
181 
182 		mp_opt->dss = 1;
183 
184 		if (mp_opt->use_ack) {
185 			if (mp_opt->ack64) {
186 				mp_opt->data_ack = get_unaligned_be64(ptr);
187 				ptr += 8;
188 			} else {
189 				mp_opt->data_ack = get_unaligned_be32(ptr);
190 				ptr += 4;
191 			}
192 
193 			pr_debug("data_ack=%llu", mp_opt->data_ack);
194 		}
195 
196 		if (mp_opt->use_map) {
197 			if (mp_opt->dsn64) {
198 				mp_opt->data_seq = get_unaligned_be64(ptr);
199 				ptr += 8;
200 			} else {
201 				mp_opt->data_seq = get_unaligned_be32(ptr);
202 				ptr += 4;
203 			}
204 
205 			mp_opt->subflow_seq = get_unaligned_be32(ptr);
206 			ptr += 4;
207 
208 			mp_opt->data_len = get_unaligned_be16(ptr);
209 			ptr += 2;
210 
211 			pr_debug("data_seq=%llu subflow_seq=%u data_len=%u",
212 				 mp_opt->data_seq, mp_opt->subflow_seq,
213 				 mp_opt->data_len);
214 		}
215 
216 		break;
217 
218 	case MPTCPOPT_ADD_ADDR:
219 		mp_opt->echo = (*ptr++) & MPTCP_ADDR_ECHO;
220 		if (!mp_opt->echo) {
221 			if (opsize == TCPOLEN_MPTCP_ADD_ADDR ||
222 			    opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT)
223 				mp_opt->family = MPTCP_ADDR_IPVERSION_4;
224 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
225 			else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6 ||
226 				 opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT)
227 				mp_opt->family = MPTCP_ADDR_IPVERSION_6;
228 #endif
229 			else
230 				break;
231 		} else {
232 			if (opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE ||
233 			    opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT)
234 				mp_opt->family = MPTCP_ADDR_IPVERSION_4;
235 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
236 			else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE ||
237 				 opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT)
238 				mp_opt->family = MPTCP_ADDR_IPVERSION_6;
239 #endif
240 			else
241 				break;
242 		}
243 
244 		mp_opt->add_addr = 1;
245 		mp_opt->addr_id = *ptr++;
246 		if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) {
247 			memcpy((u8 *)&mp_opt->addr.s_addr, (u8 *)ptr, 4);
248 			ptr += 4;
249 			if (opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT ||
250 			    opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) {
251 				mp_opt->port = get_unaligned_be16(ptr);
252 				ptr += 2;
253 			}
254 		}
255 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
256 		else {
257 			memcpy(mp_opt->addr6.s6_addr, (u8 *)ptr, 16);
258 			ptr += 16;
259 			if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT ||
260 			    opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) {
261 				mp_opt->port = get_unaligned_be16(ptr);
262 				ptr += 2;
263 			}
264 		}
265 #endif
266 		if (!mp_opt->echo) {
267 			mp_opt->ahmac = get_unaligned_be64(ptr);
268 			ptr += 8;
269 		}
270 		pr_debug("ADD_ADDR%s: id=%d, ahmac=%llu, echo=%d, port=%d",
271 			 (mp_opt->family == MPTCP_ADDR_IPVERSION_6) ? "6" : "",
272 			 mp_opt->addr_id, mp_opt->ahmac, mp_opt->echo, mp_opt->port);
273 		break;
274 
275 	case MPTCPOPT_RM_ADDR:
276 		if (opsize < TCPOLEN_MPTCP_RM_ADDR_BASE + 1 ||
277 		    opsize > TCPOLEN_MPTCP_RM_ADDR_BASE + MPTCP_RM_IDS_MAX)
278 			break;
279 
280 		ptr++;
281 
282 		mp_opt->rm_addr = 1;
283 		mp_opt->rm_list.nr = opsize - TCPOLEN_MPTCP_RM_ADDR_BASE;
284 		for (i = 0; i < mp_opt->rm_list.nr; i++)
285 			mp_opt->rm_list.ids[i] = *ptr++;
286 		pr_debug("RM_ADDR: rm_list_nr=%d", mp_opt->rm_list.nr);
287 		break;
288 
289 	case MPTCPOPT_MP_PRIO:
290 		if (opsize != TCPOLEN_MPTCP_PRIO)
291 			break;
292 
293 		mp_opt->mp_prio = 1;
294 		mp_opt->backup = *ptr++ & MPTCP_PRIO_BKUP;
295 		pr_debug("MP_PRIO: prio=%d", mp_opt->backup);
296 		break;
297 
298 	case MPTCPOPT_MP_FASTCLOSE:
299 		if (opsize != TCPOLEN_MPTCP_FASTCLOSE)
300 			break;
301 
302 		ptr += 2;
303 		mp_opt->rcvr_key = get_unaligned_be64(ptr);
304 		ptr += 8;
305 		mp_opt->fastclose = 1;
306 		break;
307 
308 	default:
309 		break;
310 	}
311 }
312 
313 void mptcp_get_options(const struct sk_buff *skb,
314 		       struct mptcp_options_received *mp_opt)
315 {
316 	const struct tcphdr *th = tcp_hdr(skb);
317 	const unsigned char *ptr;
318 	int length;
319 
320 	/* initialize option status */
321 	mp_opt->mp_capable = 0;
322 	mp_opt->mp_join = 0;
323 	mp_opt->add_addr = 0;
324 	mp_opt->ahmac = 0;
325 	mp_opt->fastclose = 0;
326 	mp_opt->port = 0;
327 	mp_opt->rm_addr = 0;
328 	mp_opt->dss = 0;
329 	mp_opt->mp_prio = 0;
330 
331 	length = (th->doff * 4) - sizeof(struct tcphdr);
332 	ptr = (const unsigned char *)(th + 1);
333 
334 	while (length > 0) {
335 		int opcode = *ptr++;
336 		int opsize;
337 
338 		switch (opcode) {
339 		case TCPOPT_EOL:
340 			return;
341 		case TCPOPT_NOP:	/* Ref: RFC 793 section 3.1 */
342 			length--;
343 			continue;
344 		default:
345 			opsize = *ptr++;
346 			if (opsize < 2) /* "silly options" */
347 				return;
348 			if (opsize > length)
349 				return;	/* don't parse partial options */
350 			if (opcode == TCPOPT_MPTCP)
351 				mptcp_parse_option(skb, ptr, opsize, mp_opt);
352 			ptr += opsize - 2;
353 			length -= opsize;
354 		}
355 	}
356 }
357 
358 bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb,
359 		       unsigned int *size, struct mptcp_out_options *opts)
360 {
361 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
362 
363 	/* we will use snd_isn to detect first pkt [re]transmission
364 	 * in mptcp_established_options_mp()
365 	 */
366 	subflow->snd_isn = TCP_SKB_CB(skb)->end_seq;
367 	if (subflow->request_mptcp) {
368 		opts->suboptions = OPTION_MPTCP_MPC_SYN;
369 		*size = TCPOLEN_MPTCP_MPC_SYN;
370 		return true;
371 	} else if (subflow->request_join) {
372 		pr_debug("remote_token=%u, nonce=%u", subflow->remote_token,
373 			 subflow->local_nonce);
374 		opts->suboptions = OPTION_MPTCP_MPJ_SYN;
375 		opts->join_id = subflow->local_id;
376 		opts->token = subflow->remote_token;
377 		opts->nonce = subflow->local_nonce;
378 		opts->backup = subflow->request_bkup;
379 		*size = TCPOLEN_MPTCP_MPJ_SYN;
380 		return true;
381 	}
382 	return false;
383 }
384 
385 /* MP_JOIN client subflow must wait for 4th ack before sending any data:
386  * TCP can't schedule delack timer before the subflow is fully established.
387  * MPTCP uses the delack timer to do 3rd ack retransmissions
388  */
389 static void schedule_3rdack_retransmission(struct sock *sk)
390 {
391 	struct inet_connection_sock *icsk = inet_csk(sk);
392 	struct tcp_sock *tp = tcp_sk(sk);
393 	unsigned long timeout;
394 
395 	/* reschedule with a timeout above RTT, as we must look only for drop */
396 	if (tp->srtt_us)
397 		timeout = tp->srtt_us << 1;
398 	else
399 		timeout = TCP_TIMEOUT_INIT;
400 
401 	WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER);
402 	icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
403 	icsk->icsk_ack.timeout = timeout;
404 	sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
405 }
406 
407 static void clear_3rdack_retransmission(struct sock *sk)
408 {
409 	struct inet_connection_sock *icsk = inet_csk(sk);
410 
411 	sk_stop_timer(sk, &icsk->icsk_delack_timer);
412 	icsk->icsk_ack.timeout = 0;
413 	icsk->icsk_ack.ato = 0;
414 	icsk->icsk_ack.pending &= ~(ICSK_ACK_SCHED | ICSK_ACK_TIMER);
415 }
416 
417 static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb,
418 					 bool snd_data_fin_enable,
419 					 unsigned int *size,
420 					 unsigned int remaining,
421 					 struct mptcp_out_options *opts)
422 {
423 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
424 	struct mptcp_ext *mpext;
425 	unsigned int data_len;
426 
427 	/* When skb is not available, we better over-estimate the emitted
428 	 * options len. A full DSS option (28 bytes) is longer than
429 	 * TCPOLEN_MPTCP_MPC_ACK_DATA(22) or TCPOLEN_MPTCP_MPJ_ACK(24), so
430 	 * tell the caller to defer the estimate to
431 	 * mptcp_established_options_dss(), which will reserve enough space.
432 	 */
433 	if (!skb)
434 		return false;
435 
436 	/* MPC/MPJ needed only on 3rd ack packet, DATA_FIN and TCP shutdown take precedence */
437 	if (subflow->fully_established || snd_data_fin_enable ||
438 	    subflow->snd_isn != TCP_SKB_CB(skb)->seq ||
439 	    sk->sk_state != TCP_ESTABLISHED)
440 		return false;
441 
442 	if (subflow->mp_capable) {
443 		mpext = mptcp_get_ext(skb);
444 		data_len = mpext ? mpext->data_len : 0;
445 
446 		/* we will check ext_copy.data_len in mptcp_write_options() to
447 		 * discriminate between TCPOLEN_MPTCP_MPC_ACK_DATA and
448 		 * TCPOLEN_MPTCP_MPC_ACK
449 		 */
450 		opts->ext_copy.data_len = data_len;
451 		opts->suboptions = OPTION_MPTCP_MPC_ACK;
452 		opts->sndr_key = subflow->local_key;
453 		opts->rcvr_key = subflow->remote_key;
454 
455 		/* Section 3.1.
456 		 * The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK
457 		 * packets that start the first subflow of an MPTCP connection,
458 		 * as well as the first packet that carries data
459 		 */
460 		if (data_len > 0)
461 			*size = ALIGN(TCPOLEN_MPTCP_MPC_ACK_DATA, 4);
462 		else
463 			*size = TCPOLEN_MPTCP_MPC_ACK;
464 
465 		pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d",
466 			 subflow, subflow->local_key, subflow->remote_key,
467 			 data_len);
468 
469 		return true;
470 	} else if (subflow->mp_join) {
471 		opts->suboptions = OPTION_MPTCP_MPJ_ACK;
472 		memcpy(opts->hmac, subflow->hmac, MPTCPOPT_HMAC_LEN);
473 		*size = TCPOLEN_MPTCP_MPJ_ACK;
474 		pr_debug("subflow=%p", subflow);
475 
476 		schedule_3rdack_retransmission(sk);
477 		return true;
478 	}
479 	return false;
480 }
481 
482 static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow,
483 				 struct sk_buff *skb, struct mptcp_ext *ext)
484 {
485 	/* The write_seq value has already been incremented, so the actual
486 	 * sequence number for the DATA_FIN is one less.
487 	 */
488 	u64 data_fin_tx_seq = READ_ONCE(mptcp_sk(subflow->conn)->write_seq) - 1;
489 
490 	if (!ext->use_map || !skb->len) {
491 		/* RFC6824 requires a DSS mapping with specific values
492 		 * if DATA_FIN is set but no data payload is mapped
493 		 */
494 		ext->data_fin = 1;
495 		ext->use_map = 1;
496 		ext->dsn64 = 1;
497 		ext->data_seq = data_fin_tx_seq;
498 		ext->subflow_seq = 0;
499 		ext->data_len = 1;
500 	} else if (ext->data_seq + ext->data_len == data_fin_tx_seq) {
501 		/* If there's an existing DSS mapping and it is the
502 		 * final mapping, DATA_FIN consumes 1 additional byte of
503 		 * mapping space.
504 		 */
505 		ext->data_fin = 1;
506 		ext->data_len++;
507 	}
508 }
509 
510 static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb,
511 					  bool snd_data_fin_enable,
512 					  unsigned int *size,
513 					  unsigned int remaining,
514 					  struct mptcp_out_options *opts)
515 {
516 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
517 	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
518 	unsigned int dss_size = 0;
519 	struct mptcp_ext *mpext;
520 	unsigned int ack_size;
521 	bool ret = false;
522 	u64 ack_seq;
523 
524 	mpext = skb ? mptcp_get_ext(skb) : NULL;
525 
526 	if (!skb || (mpext && mpext->use_map) || snd_data_fin_enable) {
527 		unsigned int map_size;
528 
529 		map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64;
530 
531 		remaining -= map_size;
532 		dss_size = map_size;
533 		if (mpext)
534 			opts->ext_copy = *mpext;
535 
536 		if (skb && snd_data_fin_enable)
537 			mptcp_write_data_fin(subflow, skb, &opts->ext_copy);
538 		ret = true;
539 	}
540 
541 	/* passive sockets msk will set the 'can_ack' after accept(), even
542 	 * if the first subflow may have the already the remote key handy
543 	 */
544 	opts->ext_copy.use_ack = 0;
545 	if (!READ_ONCE(msk->can_ack)) {
546 		*size = ALIGN(dss_size, 4);
547 		return ret;
548 	}
549 
550 	ack_seq = READ_ONCE(msk->ack_seq);
551 	if (READ_ONCE(msk->use_64bit_ack)) {
552 		ack_size = TCPOLEN_MPTCP_DSS_ACK64;
553 		opts->ext_copy.data_ack = ack_seq;
554 		opts->ext_copy.ack64 = 1;
555 	} else {
556 		ack_size = TCPOLEN_MPTCP_DSS_ACK32;
557 		opts->ext_copy.data_ack32 = (uint32_t)ack_seq;
558 		opts->ext_copy.ack64 = 0;
559 	}
560 	opts->ext_copy.use_ack = 1;
561 	WRITE_ONCE(msk->old_wspace, __mptcp_space((struct sock *)msk));
562 
563 	/* Add kind/length/subtype/flag overhead if mapping is not populated */
564 	if (dss_size == 0)
565 		ack_size += TCPOLEN_MPTCP_DSS_BASE;
566 
567 	dss_size += ack_size;
568 
569 	*size = ALIGN(dss_size, 4);
570 	return true;
571 }
572 
573 static u64 add_addr_generate_hmac(u64 key1, u64 key2, u8 addr_id,
574 				  struct in_addr *addr, u16 port)
575 {
576 	u8 hmac[SHA256_DIGEST_SIZE];
577 	u8 msg[7];
578 
579 	msg[0] = addr_id;
580 	memcpy(&msg[1], &addr->s_addr, 4);
581 	msg[5] = port >> 8;
582 	msg[6] = port & 0xFF;
583 
584 	mptcp_crypto_hmac_sha(key1, key2, msg, 7, hmac);
585 
586 	return get_unaligned_be64(&hmac[SHA256_DIGEST_SIZE - sizeof(u64)]);
587 }
588 
589 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
590 static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id,
591 				   struct in6_addr *addr, u16 port)
592 {
593 	u8 hmac[SHA256_DIGEST_SIZE];
594 	u8 msg[19];
595 
596 	msg[0] = addr_id;
597 	memcpy(&msg[1], &addr->s6_addr, 16);
598 	msg[17] = port >> 8;
599 	msg[18] = port & 0xFF;
600 
601 	mptcp_crypto_hmac_sha(key1, key2, msg, 19, hmac);
602 
603 	return get_unaligned_be64(&hmac[SHA256_DIGEST_SIZE - sizeof(u64)]);
604 }
605 #endif
606 
607 static bool mptcp_established_options_add_addr(struct sock *sk, struct sk_buff *skb,
608 					       unsigned int *size,
609 					       unsigned int remaining,
610 					       struct mptcp_out_options *opts)
611 {
612 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
613 	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
614 	bool drop_other_suboptions = false;
615 	unsigned int opt_size = *size;
616 	struct mptcp_addr_info saddr;
617 	bool echo;
618 	bool port;
619 	int len;
620 
621 	if ((mptcp_pm_should_add_signal_ipv6(msk) ||
622 	     mptcp_pm_should_add_signal_port(msk)) &&
623 	    skb && skb_is_tcp_pure_ack(skb)) {
624 		pr_debug("drop other suboptions");
625 		opts->suboptions = 0;
626 		opts->ext_copy.use_ack = 0;
627 		opts->ext_copy.use_map = 0;
628 		remaining += opt_size;
629 		drop_other_suboptions = true;
630 	}
631 
632 	if (!mptcp_pm_should_add_signal(msk) ||
633 	    !(mptcp_pm_add_addr_signal(msk, remaining, &saddr, &echo, &port)))
634 		return false;
635 
636 	len = mptcp_add_addr_len(saddr.family, echo, port);
637 	if (remaining < len)
638 		return false;
639 
640 	*size = len;
641 	if (drop_other_suboptions)
642 		*size -= opt_size;
643 	opts->addr_id = saddr.id;
644 	if (port)
645 		opts->port = ntohs(saddr.port);
646 	if (saddr.family == AF_INET) {
647 		opts->suboptions |= OPTION_MPTCP_ADD_ADDR;
648 		opts->addr = saddr.addr;
649 		if (!echo) {
650 			opts->ahmac = add_addr_generate_hmac(msk->local_key,
651 							     msk->remote_key,
652 							     opts->addr_id,
653 							     &opts->addr,
654 							     opts->port);
655 		}
656 	}
657 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
658 	else if (saddr.family == AF_INET6) {
659 		opts->suboptions |= OPTION_MPTCP_ADD_ADDR6;
660 		opts->addr6 = saddr.addr6;
661 		if (!echo) {
662 			opts->ahmac = add_addr6_generate_hmac(msk->local_key,
663 							      msk->remote_key,
664 							      opts->addr_id,
665 							      &opts->addr6,
666 							      opts->port);
667 		}
668 	}
669 #endif
670 	pr_debug("addr_id=%d, ahmac=%llu, echo=%d, port=%d",
671 		 opts->addr_id, opts->ahmac, echo, opts->port);
672 
673 	return true;
674 }
675 
676 static bool mptcp_established_options_rm_addr(struct sock *sk,
677 					      unsigned int *size,
678 					      unsigned int remaining,
679 					      struct mptcp_out_options *opts)
680 {
681 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
682 	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
683 	struct mptcp_rm_list rm_list;
684 	int i, len;
685 
686 	if (!mptcp_pm_should_rm_signal(msk) ||
687 	    !(mptcp_pm_rm_addr_signal(msk, remaining, &rm_list)))
688 		return false;
689 
690 	len = mptcp_rm_addr_len(&rm_list);
691 	if (len < 0)
692 		return false;
693 	if (remaining < len)
694 		return false;
695 
696 	*size = len;
697 	opts->suboptions |= OPTION_MPTCP_RM_ADDR;
698 	opts->rm_list = rm_list;
699 
700 	for (i = 0; i < opts->rm_list.nr; i++)
701 		pr_debug("rm_list_ids[%d]=%d", i, opts->rm_list.ids[i]);
702 
703 	return true;
704 }
705 
706 static bool mptcp_established_options_mp_prio(struct sock *sk,
707 					      unsigned int *size,
708 					      unsigned int remaining,
709 					      struct mptcp_out_options *opts)
710 {
711 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
712 
713 	if (!subflow->send_mp_prio)
714 		return false;
715 
716 	/* account for the trailing 'nop' option */
717 	if (remaining < TCPOLEN_MPTCP_PRIO_ALIGN)
718 		return false;
719 
720 	*size = TCPOLEN_MPTCP_PRIO_ALIGN;
721 	opts->suboptions |= OPTION_MPTCP_PRIO;
722 	opts->backup = subflow->request_bkup;
723 
724 	pr_debug("prio=%d", opts->backup);
725 
726 	return true;
727 }
728 
729 bool mptcp_established_options(struct sock *sk, struct sk_buff *skb,
730 			       unsigned int *size, unsigned int remaining,
731 			       struct mptcp_out_options *opts)
732 {
733 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
734 	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
735 	unsigned int opt_size = 0;
736 	bool snd_data_fin;
737 	bool ret = false;
738 
739 	opts->suboptions = 0;
740 
741 	if (unlikely(__mptcp_check_fallback(msk)))
742 		return false;
743 
744 	/* prevent adding of any MPTCP related options on reset packet
745 	 * until we support MP_TCPRST/MP_FASTCLOSE
746 	 */
747 	if (unlikely(skb && TCP_SKB_CB(skb)->tcp_flags & TCPHDR_RST))
748 		return false;
749 
750 	snd_data_fin = mptcp_data_fin_enabled(msk);
751 	if (mptcp_established_options_mp(sk, skb, snd_data_fin, &opt_size, remaining, opts))
752 		ret = true;
753 	else if (mptcp_established_options_dss(sk, skb, snd_data_fin, &opt_size, remaining, opts))
754 		ret = true;
755 
756 	/* we reserved enough space for the above options, and exceeding the
757 	 * TCP option space would be fatal
758 	 */
759 	if (WARN_ON_ONCE(opt_size > remaining))
760 		return false;
761 
762 	*size += opt_size;
763 	remaining -= opt_size;
764 	if (mptcp_established_options_add_addr(sk, skb, &opt_size, remaining, opts)) {
765 		*size += opt_size;
766 		remaining -= opt_size;
767 		ret = true;
768 	} else if (mptcp_established_options_rm_addr(sk, &opt_size, remaining, opts)) {
769 		*size += opt_size;
770 		remaining -= opt_size;
771 		ret = true;
772 	}
773 
774 	if (mptcp_established_options_mp_prio(sk, &opt_size, remaining, opts)) {
775 		*size += opt_size;
776 		remaining -= opt_size;
777 		ret = true;
778 	}
779 
780 	return ret;
781 }
782 
783 bool mptcp_synack_options(const struct request_sock *req, unsigned int *size,
784 			  struct mptcp_out_options *opts)
785 {
786 	struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req);
787 
788 	if (subflow_req->mp_capable) {
789 		opts->suboptions = OPTION_MPTCP_MPC_SYNACK;
790 		opts->sndr_key = subflow_req->local_key;
791 		*size = TCPOLEN_MPTCP_MPC_SYNACK;
792 		pr_debug("subflow_req=%p, local_key=%llu",
793 			 subflow_req, subflow_req->local_key);
794 		return true;
795 	} else if (subflow_req->mp_join) {
796 		opts->suboptions = OPTION_MPTCP_MPJ_SYNACK;
797 		opts->backup = subflow_req->backup;
798 		opts->join_id = subflow_req->local_id;
799 		opts->thmac = subflow_req->thmac;
800 		opts->nonce = subflow_req->local_nonce;
801 		pr_debug("req=%p, bkup=%u, id=%u, thmac=%llu, nonce=%u",
802 			 subflow_req, opts->backup, opts->join_id,
803 			 opts->thmac, opts->nonce);
804 		*size = TCPOLEN_MPTCP_MPJ_SYNACK;
805 		return true;
806 	}
807 	return false;
808 }
809 
810 static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk,
811 				    struct mptcp_subflow_context *subflow,
812 				    struct sk_buff *skb,
813 				    struct mptcp_options_received *mp_opt)
814 {
815 	/* here we can process OoO, in-window pkts, only in-sequence 4th ack
816 	 * will make the subflow fully established
817 	 */
818 	if (likely(subflow->fully_established)) {
819 		/* on passive sockets, check for 3rd ack retransmission
820 		 * note that msk is always set by subflow_syn_recv_sock()
821 		 * for mp_join subflows
822 		 */
823 		if (TCP_SKB_CB(skb)->seq == subflow->ssn_offset + 1 &&
824 		    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq &&
825 		    subflow->mp_join && mp_opt->mp_join &&
826 		    READ_ONCE(msk->pm.server_side))
827 			tcp_send_ack(ssk);
828 		goto fully_established;
829 	}
830 
831 	/* we must process OoO packets before the first subflow is fully
832 	 * established. OoO packets are instead a protocol violation
833 	 * for MP_JOIN subflows as the peer must not send any data
834 	 * before receiving the forth ack - cfr. RFC 8684 section 3.2.
835 	 */
836 	if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1) {
837 		if (subflow->mp_join)
838 			goto reset;
839 		return subflow->mp_capable;
840 	}
841 
842 	if (mp_opt->dss && mp_opt->use_ack) {
843 		/* subflows are fully established as soon as we get any
844 		 * additional ack.
845 		 */
846 		subflow->fully_established = 1;
847 		WRITE_ONCE(msk->fully_established, true);
848 		goto fully_established;
849 	}
850 
851 	if (mp_opt->add_addr) {
852 		WRITE_ONCE(msk->fully_established, true);
853 		return true;
854 	}
855 
856 	/* If the first established packet does not contain MP_CAPABLE + data
857 	 * then fallback to TCP. Fallback scenarios requires a reset for
858 	 * MP_JOIN subflows.
859 	 */
860 	if (!mp_opt->mp_capable) {
861 		if (subflow->mp_join)
862 			goto reset;
863 		subflow->mp_capable = 0;
864 		pr_fallback(msk);
865 		__mptcp_do_fallback(msk);
866 		return false;
867 	}
868 
869 	if (unlikely(!READ_ONCE(msk->pm.server_side)))
870 		pr_warn_once("bogus mpc option on established client sk");
871 	mptcp_subflow_fully_established(subflow, mp_opt);
872 
873 fully_established:
874 	/* if the subflow is not already linked into the conn_list, we can't
875 	 * notify the PM: this subflow is still on the listener queue
876 	 * and the PM possibly acquiring the subflow lock could race with
877 	 * the listener close
878 	 */
879 	if (likely(subflow->pm_notified) || list_empty(&subflow->node))
880 		return true;
881 
882 	subflow->pm_notified = 1;
883 	if (subflow->mp_join) {
884 		clear_3rdack_retransmission(ssk);
885 		mptcp_pm_subflow_established(msk);
886 	} else {
887 		mptcp_pm_fully_established(msk, ssk, GFP_ATOMIC);
888 	}
889 	return true;
890 
891 reset:
892 	mptcp_subflow_reset(ssk);
893 	return false;
894 }
895 
896 static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit)
897 {
898 	u32 old_ack32, cur_ack32;
899 
900 	if (use_64bit)
901 		return cur_ack;
902 
903 	old_ack32 = (u32)old_ack;
904 	cur_ack32 = (u32)cur_ack;
905 	cur_ack = (old_ack & GENMASK_ULL(63, 32)) + cur_ack32;
906 	if (unlikely(before(cur_ack32, old_ack32)))
907 		return cur_ack + (1LL << 32);
908 	return cur_ack;
909 }
910 
911 static void ack_update_msk(struct mptcp_sock *msk,
912 			   struct sock *ssk,
913 			   struct mptcp_options_received *mp_opt)
914 {
915 	u64 new_wnd_end, new_snd_una, snd_nxt = READ_ONCE(msk->snd_nxt);
916 	struct sock *sk = (struct sock *)msk;
917 	u64 old_snd_una;
918 
919 	mptcp_data_lock(sk);
920 
921 	/* avoid ack expansion on update conflict, to reduce the risk of
922 	 * wrongly expanding to a future ack sequence number, which is way
923 	 * more dangerous than missing an ack
924 	 */
925 	old_snd_una = msk->snd_una;
926 	new_snd_una = expand_ack(old_snd_una, mp_opt->data_ack, mp_opt->ack64);
927 
928 	/* ACK for data not even sent yet? Ignore. */
929 	if (after64(new_snd_una, snd_nxt))
930 		new_snd_una = old_snd_una;
931 
932 	new_wnd_end = new_snd_una + tcp_sk(ssk)->snd_wnd;
933 
934 	if (after64(new_wnd_end, msk->wnd_end))
935 		msk->wnd_end = new_wnd_end;
936 
937 	/* this assumes mptcp_incoming_options() is invoked after tcp_ack() */
938 	if (after64(msk->wnd_end, READ_ONCE(msk->snd_nxt)))
939 		__mptcp_check_push(sk, ssk);
940 
941 	if (after64(new_snd_una, old_snd_una)) {
942 		msk->snd_una = new_snd_una;
943 		__mptcp_data_acked(sk);
944 	}
945 	mptcp_data_unlock(sk);
946 }
947 
948 bool mptcp_update_rcv_data_fin(struct mptcp_sock *msk, u64 data_fin_seq, bool use_64bit)
949 {
950 	/* Skip if DATA_FIN was already received.
951 	 * If updating simultaneously with the recvmsg loop, values
952 	 * should match. If they mismatch, the peer is misbehaving and
953 	 * we will prefer the most recent information.
954 	 */
955 	if (READ_ONCE(msk->rcv_data_fin) || !READ_ONCE(msk->first))
956 		return false;
957 
958 	WRITE_ONCE(msk->rcv_data_fin_seq,
959 		   expand_ack(READ_ONCE(msk->ack_seq), data_fin_seq, use_64bit));
960 	WRITE_ONCE(msk->rcv_data_fin, 1);
961 
962 	return true;
963 }
964 
965 static bool add_addr_hmac_valid(struct mptcp_sock *msk,
966 				struct mptcp_options_received *mp_opt)
967 {
968 	u64 hmac = 0;
969 
970 	if (mp_opt->echo)
971 		return true;
972 
973 	if (mp_opt->family == MPTCP_ADDR_IPVERSION_4)
974 		hmac = add_addr_generate_hmac(msk->remote_key,
975 					      msk->local_key,
976 					      mp_opt->addr_id, &mp_opt->addr,
977 					      mp_opt->port);
978 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
979 	else
980 		hmac = add_addr6_generate_hmac(msk->remote_key,
981 					       msk->local_key,
982 					       mp_opt->addr_id, &mp_opt->addr6,
983 					       mp_opt->port);
984 #endif
985 
986 	pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n",
987 		 msk, (unsigned long long)hmac,
988 		 (unsigned long long)mp_opt->ahmac);
989 
990 	return hmac == mp_opt->ahmac;
991 }
992 
993 void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb)
994 {
995 	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
996 	struct mptcp_sock *msk = mptcp_sk(subflow->conn);
997 	struct mptcp_options_received mp_opt;
998 	struct mptcp_ext *mpext;
999 
1000 	if (__mptcp_check_fallback(msk)) {
1001 		/* Keep it simple and unconditionally trigger send data cleanup and
1002 		 * pending queue spooling. We will need to acquire the data lock
1003 		 * for more accurate checks, and once the lock is acquired, such
1004 		 * helpers are cheap.
1005 		 */
1006 		mptcp_data_lock(subflow->conn);
1007 		if (sk_stream_memory_free(sk))
1008 			__mptcp_check_push(subflow->conn, sk);
1009 		__mptcp_data_acked(subflow->conn);
1010 		mptcp_data_unlock(subflow->conn);
1011 		return;
1012 	}
1013 
1014 	mptcp_get_options(skb, &mp_opt);
1015 	if (!check_fully_established(msk, sk, subflow, skb, &mp_opt))
1016 		return;
1017 
1018 	if (mp_opt.fastclose &&
1019 	    msk->local_key == mp_opt.rcvr_key) {
1020 		WRITE_ONCE(msk->rcv_fastclose, true);
1021 		mptcp_schedule_work((struct sock *)msk);
1022 	}
1023 
1024 	if (mp_opt.add_addr && add_addr_hmac_valid(msk, &mp_opt)) {
1025 		struct mptcp_addr_info addr;
1026 
1027 		addr.port = htons(mp_opt.port);
1028 		addr.id = mp_opt.addr_id;
1029 		if (mp_opt.family == MPTCP_ADDR_IPVERSION_4) {
1030 			addr.family = AF_INET;
1031 			addr.addr = mp_opt.addr;
1032 		}
1033 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1034 		else if (mp_opt.family == MPTCP_ADDR_IPVERSION_6) {
1035 			addr.family = AF_INET6;
1036 			addr.addr6 = mp_opt.addr6;
1037 		}
1038 #endif
1039 		if (!mp_opt.echo) {
1040 			mptcp_pm_add_addr_received(msk, &addr);
1041 			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ADDADDR);
1042 		} else {
1043 			mptcp_pm_add_addr_echoed(msk, &addr);
1044 			mptcp_pm_del_add_timer(msk, &addr);
1045 			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_ECHOADD);
1046 		}
1047 
1048 		if (mp_opt.port)
1049 			MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_PORTADD);
1050 
1051 		mp_opt.add_addr = 0;
1052 	}
1053 
1054 	if (mp_opt.rm_addr) {
1055 		mptcp_pm_rm_addr_received(msk, &mp_opt.rm_list);
1056 		mp_opt.rm_addr = 0;
1057 	}
1058 
1059 	if (mp_opt.mp_prio) {
1060 		mptcp_pm_mp_prio_received(sk, mp_opt.backup);
1061 		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPPRIORX);
1062 		mp_opt.mp_prio = 0;
1063 	}
1064 
1065 	if (!mp_opt.dss)
1066 		return;
1067 
1068 	/* we can't wait for recvmsg() to update the ack_seq, otherwise
1069 	 * monodirectional flows will stuck
1070 	 */
1071 	if (mp_opt.use_ack)
1072 		ack_update_msk(msk, sk, &mp_opt);
1073 
1074 	/* Zero-data-length packets are dropped by the caller and not
1075 	 * propagated to the MPTCP layer, so the skb extension does not
1076 	 * need to be allocated or populated. DATA_FIN information, if
1077 	 * present, needs to be updated here before the skb is freed.
1078 	 */
1079 	if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) {
1080 		if (mp_opt.data_fin && mp_opt.data_len == 1 &&
1081 		    mptcp_update_rcv_data_fin(msk, mp_opt.data_seq, mp_opt.dsn64) &&
1082 		    schedule_work(&msk->work))
1083 			sock_hold(subflow->conn);
1084 
1085 		return;
1086 	}
1087 
1088 	mpext = skb_ext_add(skb, SKB_EXT_MPTCP);
1089 	if (!mpext)
1090 		return;
1091 
1092 	memset(mpext, 0, sizeof(*mpext));
1093 
1094 	if (mp_opt.use_map) {
1095 		if (mp_opt.mpc_map) {
1096 			/* this is an MP_CAPABLE carrying MPTCP data
1097 			 * we know this map the first chunk of data
1098 			 */
1099 			mptcp_crypto_key_sha(subflow->remote_key, NULL,
1100 					     &mpext->data_seq);
1101 			mpext->data_seq++;
1102 			mpext->subflow_seq = 1;
1103 			mpext->dsn64 = 1;
1104 			mpext->mpc_map = 1;
1105 			mpext->data_fin = 0;
1106 		} else {
1107 			mpext->data_seq = mp_opt.data_seq;
1108 			mpext->subflow_seq = mp_opt.subflow_seq;
1109 			mpext->dsn64 = mp_opt.dsn64;
1110 			mpext->data_fin = mp_opt.data_fin;
1111 		}
1112 		mpext->data_len = mp_opt.data_len;
1113 		mpext->use_map = 1;
1114 	}
1115 }
1116 
1117 static void mptcp_set_rwin(const struct tcp_sock *tp)
1118 {
1119 	const struct sock *ssk = (const struct sock *)tp;
1120 	const struct mptcp_subflow_context *subflow;
1121 	struct mptcp_sock *msk;
1122 	u64 ack_seq;
1123 
1124 	subflow = mptcp_subflow_ctx(ssk);
1125 	msk = mptcp_sk(subflow->conn);
1126 
1127 	ack_seq = READ_ONCE(msk->ack_seq) + tp->rcv_wnd;
1128 
1129 	if (after64(ack_seq, READ_ONCE(msk->rcv_wnd_sent)))
1130 		WRITE_ONCE(msk->rcv_wnd_sent, ack_seq);
1131 }
1132 
1133 void mptcp_write_options(__be32 *ptr, const struct tcp_sock *tp,
1134 			 struct mptcp_out_options *opts)
1135 {
1136 	if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK |
1137 	     OPTION_MPTCP_MPC_ACK) & opts->suboptions) {
1138 		u8 len;
1139 
1140 		if (OPTION_MPTCP_MPC_SYN & opts->suboptions)
1141 			len = TCPOLEN_MPTCP_MPC_SYN;
1142 		else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions)
1143 			len = TCPOLEN_MPTCP_MPC_SYNACK;
1144 		else if (opts->ext_copy.data_len)
1145 			len = TCPOLEN_MPTCP_MPC_ACK_DATA;
1146 		else
1147 			len = TCPOLEN_MPTCP_MPC_ACK;
1148 
1149 		*ptr++ = mptcp_option(MPTCPOPT_MP_CAPABLE, len,
1150 				      MPTCP_SUPPORTED_VERSION,
1151 				      MPTCP_CAP_HMAC_SHA256);
1152 
1153 		if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) &
1154 		    opts->suboptions))
1155 			goto mp_capable_done;
1156 
1157 		put_unaligned_be64(opts->sndr_key, ptr);
1158 		ptr += 2;
1159 		if (!((OPTION_MPTCP_MPC_ACK) & opts->suboptions))
1160 			goto mp_capable_done;
1161 
1162 		put_unaligned_be64(opts->rcvr_key, ptr);
1163 		ptr += 2;
1164 		if (!opts->ext_copy.data_len)
1165 			goto mp_capable_done;
1166 
1167 		put_unaligned_be32(opts->ext_copy.data_len << 16 |
1168 				   TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
1169 		ptr += 1;
1170 	}
1171 
1172 mp_capable_done:
1173 	if ((OPTION_MPTCP_ADD_ADDR
1174 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1175 	     | OPTION_MPTCP_ADD_ADDR6
1176 #endif
1177 	    ) & opts->suboptions) {
1178 		u8 len = TCPOLEN_MPTCP_ADD_ADDR_BASE;
1179 		u8 echo = MPTCP_ADDR_ECHO;
1180 
1181 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1182 		if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions)
1183 			len = TCPOLEN_MPTCP_ADD_ADDR6_BASE;
1184 #endif
1185 
1186 		if (opts->port)
1187 			len += TCPOLEN_MPTCP_PORT_LEN;
1188 
1189 		if (opts->ahmac) {
1190 			len += sizeof(opts->ahmac);
1191 			echo = 0;
1192 		}
1193 
1194 		*ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR,
1195 				      len, echo, opts->addr_id);
1196 		if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) {
1197 			memcpy((u8 *)ptr, (u8 *)&opts->addr.s_addr, 4);
1198 			ptr += 1;
1199 		}
1200 #if IS_ENABLED(CONFIG_MPTCP_IPV6)
1201 		else if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) {
1202 			memcpy((u8 *)ptr, opts->addr6.s6_addr, 16);
1203 			ptr += 4;
1204 		}
1205 #endif
1206 
1207 		if (!opts->port) {
1208 			if (opts->ahmac) {
1209 				put_unaligned_be64(opts->ahmac, ptr);
1210 				ptr += 2;
1211 			}
1212 		} else {
1213 			if (opts->ahmac) {
1214 				u8 *bptr = (u8 *)ptr;
1215 
1216 				put_unaligned_be16(opts->port, bptr);
1217 				bptr += 2;
1218 				put_unaligned_be64(opts->ahmac, bptr);
1219 				bptr += 8;
1220 				put_unaligned_be16(TCPOPT_NOP << 8 |
1221 						   TCPOPT_NOP, bptr);
1222 
1223 				ptr += 3;
1224 			} else {
1225 				put_unaligned_be32(opts->port << 16 |
1226 						   TCPOPT_NOP << 8 |
1227 						   TCPOPT_NOP, ptr);
1228 				ptr += 1;
1229 			}
1230 		}
1231 	}
1232 
1233 	if (OPTION_MPTCP_RM_ADDR & opts->suboptions) {
1234 		u8 i = 1;
1235 
1236 		*ptr++ = mptcp_option(MPTCPOPT_RM_ADDR,
1237 				      TCPOLEN_MPTCP_RM_ADDR_BASE + opts->rm_list.nr,
1238 				      0, opts->rm_list.ids[0]);
1239 
1240 		while (i < opts->rm_list.nr) {
1241 			u8 id1, id2, id3, id4;
1242 
1243 			id1 = opts->rm_list.ids[i];
1244 			id2 = i + 1 < opts->rm_list.nr ? opts->rm_list.ids[i + 1] : TCPOPT_NOP;
1245 			id3 = i + 2 < opts->rm_list.nr ? opts->rm_list.ids[i + 2] : TCPOPT_NOP;
1246 			id4 = i + 3 < opts->rm_list.nr ? opts->rm_list.ids[i + 3] : TCPOPT_NOP;
1247 			put_unaligned_be32(id1 << 24 | id2 << 16 | id3 << 8 | id4, ptr);
1248 			ptr += 1;
1249 			i += 4;
1250 		}
1251 	}
1252 
1253 	if (OPTION_MPTCP_PRIO & opts->suboptions) {
1254 		const struct sock *ssk = (const struct sock *)tp;
1255 		struct mptcp_subflow_context *subflow;
1256 
1257 		subflow = mptcp_subflow_ctx(ssk);
1258 		subflow->send_mp_prio = 0;
1259 
1260 		*ptr++ = mptcp_option(MPTCPOPT_MP_PRIO,
1261 				      TCPOLEN_MPTCP_PRIO,
1262 				      opts->backup, TCPOPT_NOP);
1263 	}
1264 
1265 	if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) {
1266 		*ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
1267 				      TCPOLEN_MPTCP_MPJ_SYN,
1268 				      opts->backup, opts->join_id);
1269 		put_unaligned_be32(opts->token, ptr);
1270 		ptr += 1;
1271 		put_unaligned_be32(opts->nonce, ptr);
1272 		ptr += 1;
1273 	}
1274 
1275 	if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) {
1276 		*ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
1277 				      TCPOLEN_MPTCP_MPJ_SYNACK,
1278 				      opts->backup, opts->join_id);
1279 		put_unaligned_be64(opts->thmac, ptr);
1280 		ptr += 2;
1281 		put_unaligned_be32(opts->nonce, ptr);
1282 		ptr += 1;
1283 	}
1284 
1285 	if (OPTION_MPTCP_MPJ_ACK & opts->suboptions) {
1286 		*ptr++ = mptcp_option(MPTCPOPT_MP_JOIN,
1287 				      TCPOLEN_MPTCP_MPJ_ACK, 0, 0);
1288 		memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN);
1289 		ptr += 5;
1290 	}
1291 
1292 	if (opts->ext_copy.use_ack || opts->ext_copy.use_map) {
1293 		struct mptcp_ext *mpext = &opts->ext_copy;
1294 		u8 len = TCPOLEN_MPTCP_DSS_BASE;
1295 		u8 flags = 0;
1296 
1297 		if (mpext->use_ack) {
1298 			flags = MPTCP_DSS_HAS_ACK;
1299 			if (mpext->ack64) {
1300 				len += TCPOLEN_MPTCP_DSS_ACK64;
1301 				flags |= MPTCP_DSS_ACK64;
1302 			} else {
1303 				len += TCPOLEN_MPTCP_DSS_ACK32;
1304 			}
1305 		}
1306 
1307 		if (mpext->use_map) {
1308 			len += TCPOLEN_MPTCP_DSS_MAP64;
1309 
1310 			/* Use only 64-bit mapping flags for now, add
1311 			 * support for optional 32-bit mappings later.
1312 			 */
1313 			flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64;
1314 			if (mpext->data_fin)
1315 				flags |= MPTCP_DSS_DATA_FIN;
1316 		}
1317 
1318 		*ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags);
1319 
1320 		if (mpext->use_ack) {
1321 			if (mpext->ack64) {
1322 				put_unaligned_be64(mpext->data_ack, ptr);
1323 				ptr += 2;
1324 			} else {
1325 				put_unaligned_be32(mpext->data_ack32, ptr);
1326 				ptr += 1;
1327 			}
1328 		}
1329 
1330 		if (mpext->use_map) {
1331 			put_unaligned_be64(mpext->data_seq, ptr);
1332 			ptr += 2;
1333 			put_unaligned_be32(mpext->subflow_seq, ptr);
1334 			ptr += 1;
1335 			put_unaligned_be32(mpext->data_len << 16 |
1336 					   TCPOPT_NOP << 8 | TCPOPT_NOP, ptr);
1337 		}
1338 	}
1339 
1340 	if (tp)
1341 		mptcp_set_rwin(tp);
1342 }
1343