1 // SPDX-License-Identifier: GPL-2.0 2 /* Multipath TCP 3 * 4 * Copyright (c) 2017 - 2019, Intel Corporation. 5 */ 6 7 #include <linux/kernel.h> 8 #include <net/tcp.h> 9 #include <net/mptcp.h> 10 #include "protocol.h" 11 12 static bool mptcp_cap_flag_sha256(u8 flags) 13 { 14 return (flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA256; 15 } 16 17 void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, 18 int opsize, struct tcp_options_received *opt_rx) 19 { 20 struct mptcp_options_received *mp_opt = &opt_rx->mptcp; 21 u8 subtype = *ptr >> 4; 22 int expected_opsize; 23 u8 version; 24 u8 flags; 25 26 switch (subtype) { 27 case MPTCPOPT_MP_CAPABLE: 28 /* strict size checking */ 29 if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { 30 if (skb->len > tcp_hdr(skb)->doff << 2) 31 expected_opsize = TCPOLEN_MPTCP_MPC_ACK_DATA; 32 else 33 expected_opsize = TCPOLEN_MPTCP_MPC_ACK; 34 } else { 35 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) 36 expected_opsize = TCPOLEN_MPTCP_MPC_SYNACK; 37 else 38 expected_opsize = TCPOLEN_MPTCP_MPC_SYN; 39 } 40 if (opsize != expected_opsize) 41 break; 42 43 /* try to be gentle vs future versions on the initial syn */ 44 version = *ptr++ & MPTCP_VERSION_MASK; 45 if (opsize != TCPOLEN_MPTCP_MPC_SYN) { 46 if (version != MPTCP_SUPPORTED_VERSION) 47 break; 48 } else if (version < MPTCP_SUPPORTED_VERSION) { 49 break; 50 } 51 52 flags = *ptr++; 53 if (!mptcp_cap_flag_sha256(flags) || 54 (flags & MPTCP_CAP_EXTENSIBILITY)) 55 break; 56 57 /* RFC 6824, Section 3.1: 58 * "For the Checksum Required bit (labeled "A"), if either 59 * host requires the use of checksums, checksums MUST be used. 60 * In other words, the only way for checksums not to be used 61 * is if both hosts in their SYNs set A=0." 62 * 63 * Section 3.3.0: 64 * "If a checksum is not present when its use has been 65 * negotiated, the receiver MUST close the subflow with a RST as 66 * it is considered broken." 67 * 68 * We don't implement DSS checksum - fall back to TCP. 69 */ 70 if (flags & MPTCP_CAP_CHECKSUM_REQD) 71 break; 72 73 mp_opt->mp_capable = 1; 74 if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) { 75 mp_opt->sndr_key = get_unaligned_be64(ptr); 76 ptr += 8; 77 } 78 if (opsize >= TCPOLEN_MPTCP_MPC_ACK) { 79 mp_opt->rcvr_key = get_unaligned_be64(ptr); 80 ptr += 8; 81 } 82 if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA) { 83 /* Section 3.1.: 84 * "the data parameters in a MP_CAPABLE are semantically 85 * equivalent to those in a DSS option and can be used 86 * interchangeably." 87 */ 88 mp_opt->dss = 1; 89 mp_opt->use_map = 1; 90 mp_opt->mpc_map = 1; 91 mp_opt->data_len = get_unaligned_be16(ptr); 92 ptr += 2; 93 } 94 pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d", 95 version, flags, opsize, mp_opt->sndr_key, 96 mp_opt->rcvr_key, mp_opt->data_len); 97 break; 98 99 case MPTCPOPT_MP_JOIN: 100 mp_opt->mp_join = 1; 101 if (opsize == TCPOLEN_MPTCP_MPJ_SYN) { 102 mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP; 103 mp_opt->join_id = *ptr++; 104 mp_opt->token = get_unaligned_be32(ptr); 105 ptr += 4; 106 mp_opt->nonce = get_unaligned_be32(ptr); 107 ptr += 4; 108 pr_debug("MP_JOIN bkup=%u, id=%u, token=%u, nonce=%u", 109 mp_opt->backup, mp_opt->join_id, 110 mp_opt->token, mp_opt->nonce); 111 } else if (opsize == TCPOLEN_MPTCP_MPJ_SYNACK) { 112 mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP; 113 mp_opt->join_id = *ptr++; 114 mp_opt->thmac = get_unaligned_be64(ptr); 115 ptr += 8; 116 mp_opt->nonce = get_unaligned_be32(ptr); 117 ptr += 4; 118 pr_debug("MP_JOIN bkup=%u, id=%u, thmac=%llu, nonce=%u", 119 mp_opt->backup, mp_opt->join_id, 120 mp_opt->thmac, mp_opt->nonce); 121 } else if (opsize == TCPOLEN_MPTCP_MPJ_ACK) { 122 ptr += 2; 123 memcpy(mp_opt->hmac, ptr, MPTCPOPT_HMAC_LEN); 124 pr_debug("MP_JOIN hmac"); 125 } else { 126 pr_warn("MP_JOIN bad option size"); 127 mp_opt->mp_join = 0; 128 } 129 break; 130 131 case MPTCPOPT_DSS: 132 pr_debug("DSS"); 133 ptr++; 134 135 /* we must clear 'mpc_map' be able to detect MP_CAPABLE 136 * map vs DSS map in mptcp_incoming_options(), and reconstruct 137 * map info accordingly 138 */ 139 mp_opt->mpc_map = 0; 140 flags = (*ptr++) & MPTCP_DSS_FLAG_MASK; 141 mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0; 142 mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0; 143 mp_opt->use_map = (flags & MPTCP_DSS_HAS_MAP) != 0; 144 mp_opt->ack64 = (flags & MPTCP_DSS_ACK64) != 0; 145 mp_opt->use_ack = (flags & MPTCP_DSS_HAS_ACK); 146 147 pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d", 148 mp_opt->data_fin, mp_opt->dsn64, 149 mp_opt->use_map, mp_opt->ack64, 150 mp_opt->use_ack); 151 152 expected_opsize = TCPOLEN_MPTCP_DSS_BASE; 153 154 if (mp_opt->use_ack) { 155 if (mp_opt->ack64) 156 expected_opsize += TCPOLEN_MPTCP_DSS_ACK64; 157 else 158 expected_opsize += TCPOLEN_MPTCP_DSS_ACK32; 159 } 160 161 if (mp_opt->use_map) { 162 if (mp_opt->dsn64) 163 expected_opsize += TCPOLEN_MPTCP_DSS_MAP64; 164 else 165 expected_opsize += TCPOLEN_MPTCP_DSS_MAP32; 166 } 167 168 /* RFC 6824, Section 3.3: 169 * If a checksum is present, but its use had 170 * not been negotiated in the MP_CAPABLE handshake, 171 * the checksum field MUST be ignored. 172 */ 173 if (opsize != expected_opsize && 174 opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) 175 break; 176 177 mp_opt->dss = 1; 178 179 if (mp_opt->use_ack) { 180 if (mp_opt->ack64) { 181 mp_opt->data_ack = get_unaligned_be64(ptr); 182 ptr += 8; 183 } else { 184 mp_opt->data_ack = get_unaligned_be32(ptr); 185 ptr += 4; 186 } 187 188 pr_debug("data_ack=%llu", mp_opt->data_ack); 189 } 190 191 if (mp_opt->use_map) { 192 if (mp_opt->dsn64) { 193 mp_opt->data_seq = get_unaligned_be64(ptr); 194 ptr += 8; 195 } else { 196 mp_opt->data_seq = get_unaligned_be32(ptr); 197 ptr += 4; 198 } 199 200 mp_opt->subflow_seq = get_unaligned_be32(ptr); 201 ptr += 4; 202 203 mp_opt->data_len = get_unaligned_be16(ptr); 204 ptr += 2; 205 206 pr_debug("data_seq=%llu subflow_seq=%u data_len=%u", 207 mp_opt->data_seq, mp_opt->subflow_seq, 208 mp_opt->data_len); 209 } 210 211 break; 212 213 case MPTCPOPT_ADD_ADDR: 214 mp_opt->echo = (*ptr++) & MPTCP_ADDR_ECHO; 215 if (!mp_opt->echo) { 216 if (opsize == TCPOLEN_MPTCP_ADD_ADDR || 217 opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT) 218 mp_opt->family = MPTCP_ADDR_IPVERSION_4; 219 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 220 else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6 || 221 opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT) 222 mp_opt->family = MPTCP_ADDR_IPVERSION_6; 223 #endif 224 else 225 break; 226 } else { 227 if (opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE || 228 opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) 229 mp_opt->family = MPTCP_ADDR_IPVERSION_4; 230 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 231 else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE || 232 opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) 233 mp_opt->family = MPTCP_ADDR_IPVERSION_6; 234 #endif 235 else 236 break; 237 } 238 239 mp_opt->add_addr = 1; 240 mp_opt->port = 0; 241 mp_opt->addr_id = *ptr++; 242 pr_debug("ADD_ADDR: id=%d", mp_opt->addr_id); 243 if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) { 244 memcpy((u8 *)&mp_opt->addr.s_addr, (u8 *)ptr, 4); 245 ptr += 4; 246 if (opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT || 247 opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) { 248 mp_opt->port = get_unaligned_be16(ptr); 249 ptr += 2; 250 } 251 } 252 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 253 else { 254 memcpy(mp_opt->addr6.s6_addr, (u8 *)ptr, 16); 255 ptr += 16; 256 if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT || 257 opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) { 258 mp_opt->port = get_unaligned_be16(ptr); 259 ptr += 2; 260 } 261 } 262 #endif 263 if (!mp_opt->echo) { 264 mp_opt->ahmac = get_unaligned_be64(ptr); 265 ptr += 8; 266 } 267 break; 268 269 case MPTCPOPT_RM_ADDR: 270 if (opsize != TCPOLEN_MPTCP_RM_ADDR_BASE) 271 break; 272 273 mp_opt->rm_addr = 1; 274 mp_opt->rm_id = *ptr++; 275 pr_debug("RM_ADDR: id=%d", mp_opt->rm_id); 276 break; 277 278 default: 279 break; 280 } 281 } 282 283 void mptcp_get_options(const struct sk_buff *skb, 284 struct tcp_options_received *opt_rx) 285 { 286 const unsigned char *ptr; 287 const struct tcphdr *th = tcp_hdr(skb); 288 int length = (th->doff * 4) - sizeof(struct tcphdr); 289 290 ptr = (const unsigned char *)(th + 1); 291 292 while (length > 0) { 293 int opcode = *ptr++; 294 int opsize; 295 296 switch (opcode) { 297 case TCPOPT_EOL: 298 return; 299 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ 300 length--; 301 continue; 302 default: 303 opsize = *ptr++; 304 if (opsize < 2) /* "silly options" */ 305 return; 306 if (opsize > length) 307 return; /* don't parse partial options */ 308 if (opcode == TCPOPT_MPTCP) 309 mptcp_parse_option(skb, ptr, opsize, opt_rx); 310 ptr += opsize - 2; 311 length -= opsize; 312 } 313 } 314 } 315 316 bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, 317 unsigned int *size, struct mptcp_out_options *opts) 318 { 319 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 320 321 /* we will use snd_isn to detect first pkt [re]transmission 322 * in mptcp_established_options_mp() 323 */ 324 subflow->snd_isn = TCP_SKB_CB(skb)->end_seq; 325 if (subflow->request_mptcp) { 326 pr_debug("local_key=%llu", subflow->local_key); 327 opts->suboptions = OPTION_MPTCP_MPC_SYN; 328 opts->sndr_key = subflow->local_key; 329 *size = TCPOLEN_MPTCP_MPC_SYN; 330 return true; 331 } else if (subflow->request_join) { 332 pr_debug("remote_token=%u, nonce=%u", subflow->remote_token, 333 subflow->local_nonce); 334 opts->suboptions = OPTION_MPTCP_MPJ_SYN; 335 opts->join_id = subflow->local_id; 336 opts->token = subflow->remote_token; 337 opts->nonce = subflow->local_nonce; 338 opts->backup = subflow->request_bkup; 339 *size = TCPOLEN_MPTCP_MPJ_SYN; 340 return true; 341 } 342 return false; 343 } 344 345 void mptcp_rcv_synsent(struct sock *sk) 346 { 347 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 348 struct tcp_sock *tp = tcp_sk(sk); 349 350 if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) { 351 subflow->mp_capable = 1; 352 subflow->can_ack = 1; 353 subflow->remote_key = tp->rx_opt.mptcp.sndr_key; 354 pr_debug("subflow=%p, remote_key=%llu", subflow, 355 subflow->remote_key); 356 } else if (subflow->request_join && tp->rx_opt.mptcp.mp_join) { 357 subflow->mp_join = 1; 358 subflow->thmac = tp->rx_opt.mptcp.thmac; 359 subflow->remote_nonce = tp->rx_opt.mptcp.nonce; 360 pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow, 361 subflow->thmac, subflow->remote_nonce); 362 } else if (subflow->request_mptcp) { 363 tcp_sk(sk)->is_mptcp = 0; 364 } 365 } 366 367 /* MP_JOIN client subflow must wait for 4th ack before sending any data: 368 * TCP can't schedule delack timer before the subflow is fully established. 369 * MPTCP uses the delack timer to do 3rd ack retransmissions 370 */ 371 static void schedule_3rdack_retransmission(struct sock *sk) 372 { 373 struct inet_connection_sock *icsk = inet_csk(sk); 374 struct tcp_sock *tp = tcp_sk(sk); 375 unsigned long timeout; 376 377 /* reschedule with a timeout above RTT, as we must look only for drop */ 378 if (tp->srtt_us) 379 timeout = tp->srtt_us << 1; 380 else 381 timeout = TCP_TIMEOUT_INIT; 382 383 WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER); 384 icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; 385 icsk->icsk_ack.timeout = timeout; 386 sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); 387 } 388 389 static void clear_3rdack_retransmission(struct sock *sk) 390 { 391 struct inet_connection_sock *icsk = inet_csk(sk); 392 393 sk_stop_timer(sk, &icsk->icsk_delack_timer); 394 icsk->icsk_ack.timeout = 0; 395 icsk->icsk_ack.ato = 0; 396 icsk->icsk_ack.pending &= ~(ICSK_ACK_SCHED | ICSK_ACK_TIMER); 397 } 398 399 static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, 400 unsigned int *size, 401 unsigned int remaining, 402 struct mptcp_out_options *opts) 403 { 404 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 405 struct mptcp_ext *mpext; 406 unsigned int data_len; 407 408 /* When skb is not available, we better over-estimate the emitted 409 * options len. A full DSS option (28 bytes) is longer than 410 * TCPOLEN_MPTCP_MPC_ACK_DATA(22) or TCPOLEN_MPTCP_MPJ_ACK(24), so 411 * tell the caller to defer the estimate to 412 * mptcp_established_options_dss(), which will reserve enough space. 413 */ 414 if (!skb) 415 return false; 416 417 /* MPC/MPJ needed only on 3rd ack packet */ 418 if (subflow->fully_established || 419 subflow->snd_isn != TCP_SKB_CB(skb)->seq) 420 return false; 421 422 if (subflow->mp_capable) { 423 mpext = mptcp_get_ext(skb); 424 data_len = mpext ? mpext->data_len : 0; 425 426 /* we will check ext_copy.data_len in mptcp_write_options() to 427 * discriminate between TCPOLEN_MPTCP_MPC_ACK_DATA and 428 * TCPOLEN_MPTCP_MPC_ACK 429 */ 430 opts->ext_copy.data_len = data_len; 431 opts->suboptions = OPTION_MPTCP_MPC_ACK; 432 opts->sndr_key = subflow->local_key; 433 opts->rcvr_key = subflow->remote_key; 434 435 /* Section 3.1. 436 * The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK 437 * packets that start the first subflow of an MPTCP connection, 438 * as well as the first packet that carries data 439 */ 440 if (data_len > 0) 441 *size = ALIGN(TCPOLEN_MPTCP_MPC_ACK_DATA, 4); 442 else 443 *size = TCPOLEN_MPTCP_MPC_ACK; 444 445 pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d", 446 subflow, subflow->local_key, subflow->remote_key, 447 data_len); 448 449 return true; 450 } else if (subflow->mp_join) { 451 opts->suboptions = OPTION_MPTCP_MPJ_ACK; 452 memcpy(opts->hmac, subflow->hmac, MPTCPOPT_HMAC_LEN); 453 *size = TCPOLEN_MPTCP_MPJ_ACK; 454 pr_debug("subflow=%p", subflow); 455 456 schedule_3rdack_retransmission(sk); 457 return true; 458 } 459 return false; 460 } 461 462 static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow, 463 struct mptcp_ext *ext) 464 { 465 if (!ext->use_map) { 466 /* RFC6824 requires a DSS mapping with specific values 467 * if DATA_FIN is set but no data payload is mapped 468 */ 469 ext->data_fin = 1; 470 ext->use_map = 1; 471 ext->dsn64 = 1; 472 ext->data_seq = subflow->data_fin_tx_seq; 473 ext->subflow_seq = 0; 474 ext->data_len = 1; 475 } else if (ext->data_seq + ext->data_len == subflow->data_fin_tx_seq) { 476 /* If there's an existing DSS mapping and it is the 477 * final mapping, DATA_FIN consumes 1 additional byte of 478 * mapping space. 479 */ 480 ext->data_fin = 1; 481 ext->data_len++; 482 } 483 } 484 485 static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, 486 unsigned int *size, 487 unsigned int remaining, 488 struct mptcp_out_options *opts) 489 { 490 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 491 unsigned int dss_size = 0; 492 struct mptcp_ext *mpext; 493 struct mptcp_sock *msk; 494 unsigned int ack_size; 495 bool ret = false; 496 u8 tcp_fin; 497 498 if (skb) { 499 mpext = mptcp_get_ext(skb); 500 tcp_fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; 501 } else { 502 mpext = NULL; 503 tcp_fin = 0; 504 } 505 506 if (!skb || (mpext && mpext->use_map) || tcp_fin) { 507 unsigned int map_size; 508 509 map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64; 510 511 remaining -= map_size; 512 dss_size = map_size; 513 if (mpext) 514 opts->ext_copy = *mpext; 515 516 if (skb && tcp_fin && subflow->data_fin_tx_enable) 517 mptcp_write_data_fin(subflow, &opts->ext_copy); 518 ret = true; 519 } 520 521 /* passive sockets msk will set the 'can_ack' after accept(), even 522 * if the first subflow may have the already the remote key handy 523 */ 524 opts->ext_copy.use_ack = 0; 525 msk = mptcp_sk(subflow->conn); 526 if (!READ_ONCE(msk->can_ack)) { 527 *size = ALIGN(dss_size, 4); 528 return ret; 529 } 530 531 ack_size = TCPOLEN_MPTCP_DSS_ACK64; 532 533 /* Add kind/length/subtype/flag overhead if mapping is not populated */ 534 if (dss_size == 0) 535 ack_size += TCPOLEN_MPTCP_DSS_BASE; 536 537 dss_size += ack_size; 538 539 opts->ext_copy.data_ack = msk->ack_seq; 540 opts->ext_copy.ack64 = 1; 541 opts->ext_copy.use_ack = 1; 542 543 *size = ALIGN(dss_size, 4); 544 return true; 545 } 546 547 static u64 add_addr_generate_hmac(u64 key1, u64 key2, u8 addr_id, 548 struct in_addr *addr) 549 { 550 u8 hmac[MPTCP_ADDR_HMAC_LEN]; 551 u8 msg[7]; 552 553 msg[0] = addr_id; 554 memcpy(&msg[1], &addr->s_addr, 4); 555 msg[5] = 0; 556 msg[6] = 0; 557 558 mptcp_crypto_hmac_sha(key1, key2, msg, 7, hmac); 559 560 return get_unaligned_be64(hmac); 561 } 562 563 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 564 static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id, 565 struct in6_addr *addr) 566 { 567 u8 hmac[MPTCP_ADDR_HMAC_LEN]; 568 u8 msg[19]; 569 570 msg[0] = addr_id; 571 memcpy(&msg[1], &addr->s6_addr, 16); 572 msg[17] = 0; 573 msg[18] = 0; 574 575 mptcp_crypto_hmac_sha(key1, key2, msg, 19, hmac); 576 577 return get_unaligned_be64(hmac); 578 } 579 #endif 580 581 static bool mptcp_established_options_addr(struct sock *sk, 582 unsigned int *size, 583 unsigned int remaining, 584 struct mptcp_out_options *opts) 585 { 586 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 587 struct mptcp_sock *msk = mptcp_sk(subflow->conn); 588 struct mptcp_addr_info saddr; 589 int len; 590 591 if (!mptcp_pm_should_signal(msk) || 592 !(mptcp_pm_addr_signal(msk, remaining, &saddr))) 593 return false; 594 595 len = mptcp_add_addr_len(saddr.family); 596 if (remaining < len) 597 return false; 598 599 *size = len; 600 opts->addr_id = saddr.id; 601 if (saddr.family == AF_INET) { 602 opts->suboptions |= OPTION_MPTCP_ADD_ADDR; 603 opts->addr = saddr.addr; 604 opts->ahmac = add_addr_generate_hmac(msk->local_key, 605 msk->remote_key, 606 opts->addr_id, 607 &opts->addr); 608 } 609 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 610 else if (saddr.family == AF_INET6) { 611 opts->suboptions |= OPTION_MPTCP_ADD_ADDR6; 612 opts->addr6 = saddr.addr6; 613 opts->ahmac = add_addr6_generate_hmac(msk->local_key, 614 msk->remote_key, 615 opts->addr_id, 616 &opts->addr6); 617 } 618 #endif 619 pr_debug("addr_id=%d, ahmac=%llu", opts->addr_id, opts->ahmac); 620 621 return true; 622 } 623 624 bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, 625 unsigned int *size, unsigned int remaining, 626 struct mptcp_out_options *opts) 627 { 628 unsigned int opt_size = 0; 629 bool ret = false; 630 631 opts->suboptions = 0; 632 633 if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts)) 634 ret = true; 635 else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining, 636 opts)) 637 ret = true; 638 639 /* we reserved enough space for the above options, and exceeding the 640 * TCP option space would be fatal 641 */ 642 if (WARN_ON_ONCE(opt_size > remaining)) 643 return false; 644 645 *size += opt_size; 646 remaining -= opt_size; 647 if (mptcp_established_options_addr(sk, &opt_size, remaining, opts)) { 648 *size += opt_size; 649 remaining -= opt_size; 650 ret = true; 651 } 652 653 return ret; 654 } 655 656 bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, 657 struct mptcp_out_options *opts) 658 { 659 struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); 660 661 if (subflow_req->mp_capable) { 662 opts->suboptions = OPTION_MPTCP_MPC_SYNACK; 663 opts->sndr_key = subflow_req->local_key; 664 *size = TCPOLEN_MPTCP_MPC_SYNACK; 665 pr_debug("subflow_req=%p, local_key=%llu", 666 subflow_req, subflow_req->local_key); 667 return true; 668 } else if (subflow_req->mp_join) { 669 opts->suboptions = OPTION_MPTCP_MPJ_SYNACK; 670 opts->backup = subflow_req->backup; 671 opts->join_id = subflow_req->local_id; 672 opts->thmac = subflow_req->thmac; 673 opts->nonce = subflow_req->local_nonce; 674 pr_debug("req=%p, bkup=%u, id=%u, thmac=%llu, nonce=%u", 675 subflow_req, opts->backup, opts->join_id, 676 opts->thmac, opts->nonce); 677 *size = TCPOLEN_MPTCP_MPJ_SYNACK; 678 return true; 679 } 680 return false; 681 } 682 683 static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk, 684 struct mptcp_subflow_context *subflow, 685 struct sk_buff *skb, 686 struct mptcp_options_received *mp_opt) 687 { 688 /* here we can process OoO, in-window pkts, only in-sequence 4th ack 689 * will make the subflow fully established 690 */ 691 if (likely(subflow->fully_established)) { 692 /* on passive sockets, check for 3rd ack retransmission 693 * note that msk is always set by subflow_syn_recv_sock() 694 * for mp_join subflows 695 */ 696 if (TCP_SKB_CB(skb)->seq == subflow->ssn_offset + 1 && 697 TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq && 698 subflow->mp_join && mp_opt->mp_join && 699 READ_ONCE(msk->pm.server_side)) 700 tcp_send_ack(sk); 701 goto fully_established; 702 } 703 704 /* we should process OoO packets before the first subflow is fully 705 * established, but not expected for MP_JOIN subflows 706 */ 707 if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1) 708 return subflow->mp_capable; 709 710 if (mp_opt->use_ack) { 711 /* subflows are fully established as soon as we get any 712 * additional ack. 713 */ 714 subflow->fully_established = 1; 715 goto fully_established; 716 } 717 718 WARN_ON_ONCE(subflow->can_ack); 719 720 /* If the first established packet does not contain MP_CAPABLE + data 721 * then fallback to TCP 722 */ 723 if (!mp_opt->mp_capable) { 724 subflow->mp_capable = 0; 725 tcp_sk(sk)->is_mptcp = 0; 726 return false; 727 } 728 729 subflow->fully_established = 1; 730 subflow->remote_key = mp_opt->sndr_key; 731 subflow->can_ack = 1; 732 733 fully_established: 734 if (likely(subflow->pm_notified)) 735 return true; 736 737 subflow->pm_notified = 1; 738 if (subflow->mp_join) { 739 clear_3rdack_retransmission(sk); 740 mptcp_pm_subflow_established(msk, subflow); 741 } else { 742 mptcp_pm_fully_established(msk); 743 } 744 return true; 745 } 746 747 static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit) 748 { 749 u32 old_ack32, cur_ack32; 750 751 if (use_64bit) 752 return cur_ack; 753 754 old_ack32 = (u32)old_ack; 755 cur_ack32 = (u32)cur_ack; 756 cur_ack = (old_ack & GENMASK_ULL(63, 32)) + cur_ack32; 757 if (unlikely(before(cur_ack32, old_ack32))) 758 return cur_ack + (1LL << 32); 759 return cur_ack; 760 } 761 762 static void update_una(struct mptcp_sock *msk, 763 struct mptcp_options_received *mp_opt) 764 { 765 u64 new_snd_una, snd_una, old_snd_una = atomic64_read(&msk->snd_una); 766 u64 write_seq = READ_ONCE(msk->write_seq); 767 768 /* avoid ack expansion on update conflict, to reduce the risk of 769 * wrongly expanding to a future ack sequence number, which is way 770 * more dangerous than missing an ack 771 */ 772 new_snd_una = expand_ack(old_snd_una, mp_opt->data_ack, mp_opt->ack64); 773 774 /* ACK for data not even sent yet? Ignore. */ 775 if (after64(new_snd_una, write_seq)) 776 new_snd_una = old_snd_una; 777 778 while (after64(new_snd_una, old_snd_una)) { 779 snd_una = old_snd_una; 780 old_snd_una = atomic64_cmpxchg(&msk->snd_una, snd_una, 781 new_snd_una); 782 if (old_snd_una == snd_una) { 783 mptcp_data_acked((struct sock *)msk); 784 break; 785 } 786 } 787 } 788 789 static bool add_addr_hmac_valid(struct mptcp_sock *msk, 790 struct mptcp_options_received *mp_opt) 791 { 792 u64 hmac = 0; 793 794 if (mp_opt->echo) 795 return true; 796 797 if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) 798 hmac = add_addr_generate_hmac(msk->remote_key, 799 msk->local_key, 800 mp_opt->addr_id, &mp_opt->addr); 801 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 802 else 803 hmac = add_addr6_generate_hmac(msk->remote_key, 804 msk->local_key, 805 mp_opt->addr_id, &mp_opt->addr6); 806 #endif 807 808 pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n", 809 msk, (unsigned long long)hmac, 810 (unsigned long long)mp_opt->ahmac); 811 812 return hmac == mp_opt->ahmac; 813 } 814 815 void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, 816 struct tcp_options_received *opt_rx) 817 { 818 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 819 struct mptcp_sock *msk = mptcp_sk(subflow->conn); 820 struct mptcp_options_received *mp_opt; 821 struct mptcp_ext *mpext; 822 823 mp_opt = &opt_rx->mptcp; 824 if (!check_fully_established(msk, sk, subflow, skb, mp_opt)) 825 return; 826 827 if (mp_opt->add_addr && add_addr_hmac_valid(msk, mp_opt)) { 828 struct mptcp_addr_info addr; 829 830 addr.port = htons(mp_opt->port); 831 addr.id = mp_opt->addr_id; 832 if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) { 833 addr.family = AF_INET; 834 addr.addr = mp_opt->addr; 835 } 836 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 837 else if (mp_opt->family == MPTCP_ADDR_IPVERSION_6) { 838 addr.family = AF_INET6; 839 addr.addr6 = mp_opt->addr6; 840 } 841 #endif 842 if (!mp_opt->echo) 843 mptcp_pm_add_addr_received(msk, &addr); 844 mp_opt->add_addr = 0; 845 } 846 847 if (!mp_opt->dss) 848 return; 849 850 /* we can't wait for recvmsg() to update the ack_seq, otherwise 851 * monodirectional flows will stuck 852 */ 853 if (mp_opt->use_ack) 854 update_una(msk, mp_opt); 855 856 mpext = skb_ext_add(skb, SKB_EXT_MPTCP); 857 if (!mpext) 858 return; 859 860 memset(mpext, 0, sizeof(*mpext)); 861 862 if (mp_opt->use_map) { 863 if (mp_opt->mpc_map) { 864 /* this is an MP_CAPABLE carrying MPTCP data 865 * we know this map the first chunk of data 866 */ 867 mptcp_crypto_key_sha(subflow->remote_key, NULL, 868 &mpext->data_seq); 869 mpext->data_seq++; 870 mpext->subflow_seq = 1; 871 mpext->dsn64 = 1; 872 mpext->mpc_map = 1; 873 } else { 874 mpext->data_seq = mp_opt->data_seq; 875 mpext->subflow_seq = mp_opt->subflow_seq; 876 mpext->dsn64 = mp_opt->dsn64; 877 } 878 mpext->data_len = mp_opt->data_len; 879 mpext->use_map = 1; 880 } 881 882 mpext->data_fin = mp_opt->data_fin; 883 } 884 885 void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) 886 { 887 if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | 888 OPTION_MPTCP_MPC_ACK) & opts->suboptions) { 889 u8 len; 890 891 if (OPTION_MPTCP_MPC_SYN & opts->suboptions) 892 len = TCPOLEN_MPTCP_MPC_SYN; 893 else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) 894 len = TCPOLEN_MPTCP_MPC_SYNACK; 895 else if (opts->ext_copy.data_len) 896 len = TCPOLEN_MPTCP_MPC_ACK_DATA; 897 else 898 len = TCPOLEN_MPTCP_MPC_ACK; 899 900 *ptr++ = mptcp_option(MPTCPOPT_MP_CAPABLE, len, 901 MPTCP_SUPPORTED_VERSION, 902 MPTCP_CAP_HMAC_SHA256); 903 904 if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & 905 opts->suboptions)) 906 goto mp_capable_done; 907 908 put_unaligned_be64(opts->sndr_key, ptr); 909 ptr += 2; 910 if (!((OPTION_MPTCP_MPC_ACK) & opts->suboptions)) 911 goto mp_capable_done; 912 913 put_unaligned_be64(opts->rcvr_key, ptr); 914 ptr += 2; 915 if (!opts->ext_copy.data_len) 916 goto mp_capable_done; 917 918 put_unaligned_be32(opts->ext_copy.data_len << 16 | 919 TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); 920 ptr += 1; 921 } 922 923 mp_capable_done: 924 if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) { 925 if (opts->ahmac) 926 *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, 927 TCPOLEN_MPTCP_ADD_ADDR, 0, 928 opts->addr_id); 929 else 930 *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, 931 TCPOLEN_MPTCP_ADD_ADDR_BASE, 932 MPTCP_ADDR_ECHO, 933 opts->addr_id); 934 memcpy((u8 *)ptr, (u8 *)&opts->addr.s_addr, 4); 935 ptr += 1; 936 if (opts->ahmac) { 937 put_unaligned_be64(opts->ahmac, ptr); 938 ptr += 2; 939 } 940 } 941 942 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 943 if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) { 944 if (opts->ahmac) 945 *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, 946 TCPOLEN_MPTCP_ADD_ADDR6, 0, 947 opts->addr_id); 948 else 949 *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, 950 TCPOLEN_MPTCP_ADD_ADDR6_BASE, 951 MPTCP_ADDR_ECHO, 952 opts->addr_id); 953 memcpy((u8 *)ptr, opts->addr6.s6_addr, 16); 954 ptr += 4; 955 if (opts->ahmac) { 956 put_unaligned_be64(opts->ahmac, ptr); 957 ptr += 2; 958 } 959 } 960 #endif 961 962 if (OPTION_MPTCP_RM_ADDR & opts->suboptions) { 963 *ptr++ = mptcp_option(MPTCPOPT_RM_ADDR, 964 TCPOLEN_MPTCP_RM_ADDR_BASE, 965 0, opts->rm_id); 966 } 967 968 if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) { 969 *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, 970 TCPOLEN_MPTCP_MPJ_SYN, 971 opts->backup, opts->join_id); 972 put_unaligned_be32(opts->token, ptr); 973 ptr += 1; 974 put_unaligned_be32(opts->nonce, ptr); 975 ptr += 1; 976 } 977 978 if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) { 979 *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, 980 TCPOLEN_MPTCP_MPJ_SYNACK, 981 opts->backup, opts->join_id); 982 put_unaligned_be64(opts->thmac, ptr); 983 ptr += 2; 984 put_unaligned_be32(opts->nonce, ptr); 985 ptr += 1; 986 } 987 988 if (OPTION_MPTCP_MPJ_ACK & opts->suboptions) { 989 *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, 990 TCPOLEN_MPTCP_MPJ_ACK, 0, 0); 991 memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN); 992 ptr += 5; 993 } 994 995 if (opts->ext_copy.use_ack || opts->ext_copy.use_map) { 996 struct mptcp_ext *mpext = &opts->ext_copy; 997 u8 len = TCPOLEN_MPTCP_DSS_BASE; 998 u8 flags = 0; 999 1000 if (mpext->use_ack) { 1001 len += TCPOLEN_MPTCP_DSS_ACK64; 1002 flags = MPTCP_DSS_HAS_ACK | MPTCP_DSS_ACK64; 1003 } 1004 1005 if (mpext->use_map) { 1006 len += TCPOLEN_MPTCP_DSS_MAP64; 1007 1008 /* Use only 64-bit mapping flags for now, add 1009 * support for optional 32-bit mappings later. 1010 */ 1011 flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64; 1012 if (mpext->data_fin) 1013 flags |= MPTCP_DSS_DATA_FIN; 1014 } 1015 1016 *ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags); 1017 1018 if (mpext->use_ack) { 1019 put_unaligned_be64(mpext->data_ack, ptr); 1020 ptr += 2; 1021 } 1022 1023 if (mpext->use_map) { 1024 put_unaligned_be64(mpext->data_seq, ptr); 1025 ptr += 2; 1026 put_unaligned_be32(mpext->subflow_seq, ptr); 1027 ptr += 1; 1028 put_unaligned_be32(mpext->data_len << 16 | 1029 TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); 1030 } 1031 } 1032 } 1033