1 // SPDX-License-Identifier: GPL-2.0 2 /* Multipath TCP 3 * 4 * Copyright (c) 2017 - 2019, Intel Corporation. 5 */ 6 7 #define pr_fmt(fmt) "MPTCP: " fmt 8 9 #include <linux/kernel.h> 10 #include <net/tcp.h> 11 #include <net/mptcp.h> 12 #include "protocol.h" 13 14 static bool mptcp_cap_flag_sha256(u8 flags) 15 { 16 return (flags & MPTCP_CAP_FLAG_MASK) == MPTCP_CAP_HMAC_SHA256; 17 } 18 19 void mptcp_parse_option(const struct sk_buff *skb, const unsigned char *ptr, 20 int opsize, struct tcp_options_received *opt_rx) 21 { 22 struct mptcp_options_received *mp_opt = &opt_rx->mptcp; 23 u8 subtype = *ptr >> 4; 24 int expected_opsize; 25 u8 version; 26 u8 flags; 27 28 switch (subtype) { 29 case MPTCPOPT_MP_CAPABLE: 30 /* strict size checking */ 31 if (!(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) { 32 if (skb->len > tcp_hdr(skb)->doff << 2) 33 expected_opsize = TCPOLEN_MPTCP_MPC_ACK_DATA; 34 else 35 expected_opsize = TCPOLEN_MPTCP_MPC_ACK; 36 } else { 37 if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_ACK) 38 expected_opsize = TCPOLEN_MPTCP_MPC_SYNACK; 39 else 40 expected_opsize = TCPOLEN_MPTCP_MPC_SYN; 41 } 42 if (opsize != expected_opsize) 43 break; 44 45 /* try to be gentle vs future versions on the initial syn */ 46 version = *ptr++ & MPTCP_VERSION_MASK; 47 if (opsize != TCPOLEN_MPTCP_MPC_SYN) { 48 if (version != MPTCP_SUPPORTED_VERSION) 49 break; 50 } else if (version < MPTCP_SUPPORTED_VERSION) { 51 break; 52 } 53 54 flags = *ptr++; 55 if (!mptcp_cap_flag_sha256(flags) || 56 (flags & MPTCP_CAP_EXTENSIBILITY)) 57 break; 58 59 /* RFC 6824, Section 3.1: 60 * "For the Checksum Required bit (labeled "A"), if either 61 * host requires the use of checksums, checksums MUST be used. 62 * In other words, the only way for checksums not to be used 63 * is if both hosts in their SYNs set A=0." 64 * 65 * Section 3.3.0: 66 * "If a checksum is not present when its use has been 67 * negotiated, the receiver MUST close the subflow with a RST as 68 * it is considered broken." 69 * 70 * We don't implement DSS checksum - fall back to TCP. 71 */ 72 if (flags & MPTCP_CAP_CHECKSUM_REQD) 73 break; 74 75 mp_opt->mp_capable = 1; 76 if (opsize >= TCPOLEN_MPTCP_MPC_SYNACK) { 77 mp_opt->sndr_key = get_unaligned_be64(ptr); 78 ptr += 8; 79 } 80 if (opsize >= TCPOLEN_MPTCP_MPC_ACK) { 81 mp_opt->rcvr_key = get_unaligned_be64(ptr); 82 ptr += 8; 83 } 84 if (opsize == TCPOLEN_MPTCP_MPC_ACK_DATA) { 85 /* Section 3.1.: 86 * "the data parameters in a MP_CAPABLE are semantically 87 * equivalent to those in a DSS option and can be used 88 * interchangeably." 89 */ 90 mp_opt->dss = 1; 91 mp_opt->use_map = 1; 92 mp_opt->mpc_map = 1; 93 mp_opt->data_len = get_unaligned_be16(ptr); 94 ptr += 2; 95 } 96 pr_debug("MP_CAPABLE version=%x, flags=%x, optlen=%d sndr=%llu, rcvr=%llu len=%d", 97 version, flags, opsize, mp_opt->sndr_key, 98 mp_opt->rcvr_key, mp_opt->data_len); 99 break; 100 101 case MPTCPOPT_MP_JOIN: 102 mp_opt->mp_join = 1; 103 if (opsize == TCPOLEN_MPTCP_MPJ_SYN) { 104 mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP; 105 mp_opt->join_id = *ptr++; 106 mp_opt->token = get_unaligned_be32(ptr); 107 ptr += 4; 108 mp_opt->nonce = get_unaligned_be32(ptr); 109 ptr += 4; 110 pr_debug("MP_JOIN bkup=%u, id=%u, token=%u, nonce=%u", 111 mp_opt->backup, mp_opt->join_id, 112 mp_opt->token, mp_opt->nonce); 113 } else if (opsize == TCPOLEN_MPTCP_MPJ_SYNACK) { 114 mp_opt->backup = *ptr++ & MPTCPOPT_BACKUP; 115 mp_opt->join_id = *ptr++; 116 mp_opt->thmac = get_unaligned_be64(ptr); 117 ptr += 8; 118 mp_opt->nonce = get_unaligned_be32(ptr); 119 ptr += 4; 120 pr_debug("MP_JOIN bkup=%u, id=%u, thmac=%llu, nonce=%u", 121 mp_opt->backup, mp_opt->join_id, 122 mp_opt->thmac, mp_opt->nonce); 123 } else if (opsize == TCPOLEN_MPTCP_MPJ_ACK) { 124 ptr += 2; 125 memcpy(mp_opt->hmac, ptr, MPTCPOPT_HMAC_LEN); 126 pr_debug("MP_JOIN hmac"); 127 } else { 128 pr_warn("MP_JOIN bad option size"); 129 mp_opt->mp_join = 0; 130 } 131 break; 132 133 case MPTCPOPT_DSS: 134 pr_debug("DSS"); 135 ptr++; 136 137 /* we must clear 'mpc_map' be able to detect MP_CAPABLE 138 * map vs DSS map in mptcp_incoming_options(), and reconstruct 139 * map info accordingly 140 */ 141 mp_opt->mpc_map = 0; 142 flags = (*ptr++) & MPTCP_DSS_FLAG_MASK; 143 mp_opt->data_fin = (flags & MPTCP_DSS_DATA_FIN) != 0; 144 mp_opt->dsn64 = (flags & MPTCP_DSS_DSN64) != 0; 145 mp_opt->use_map = (flags & MPTCP_DSS_HAS_MAP) != 0; 146 mp_opt->ack64 = (flags & MPTCP_DSS_ACK64) != 0; 147 mp_opt->use_ack = (flags & MPTCP_DSS_HAS_ACK); 148 149 pr_debug("data_fin=%d dsn64=%d use_map=%d ack64=%d use_ack=%d", 150 mp_opt->data_fin, mp_opt->dsn64, 151 mp_opt->use_map, mp_opt->ack64, 152 mp_opt->use_ack); 153 154 expected_opsize = TCPOLEN_MPTCP_DSS_BASE; 155 156 if (mp_opt->use_ack) { 157 if (mp_opt->ack64) 158 expected_opsize += TCPOLEN_MPTCP_DSS_ACK64; 159 else 160 expected_opsize += TCPOLEN_MPTCP_DSS_ACK32; 161 } 162 163 if (mp_opt->use_map) { 164 if (mp_opt->dsn64) 165 expected_opsize += TCPOLEN_MPTCP_DSS_MAP64; 166 else 167 expected_opsize += TCPOLEN_MPTCP_DSS_MAP32; 168 } 169 170 /* RFC 6824, Section 3.3: 171 * If a checksum is present, but its use had 172 * not been negotiated in the MP_CAPABLE handshake, 173 * the checksum field MUST be ignored. 174 */ 175 if (opsize != expected_opsize && 176 opsize != expected_opsize + TCPOLEN_MPTCP_DSS_CHECKSUM) 177 break; 178 179 mp_opt->dss = 1; 180 181 if (mp_opt->use_ack) { 182 if (mp_opt->ack64) { 183 mp_opt->data_ack = get_unaligned_be64(ptr); 184 ptr += 8; 185 } else { 186 mp_opt->data_ack = get_unaligned_be32(ptr); 187 ptr += 4; 188 } 189 190 pr_debug("data_ack=%llu", mp_opt->data_ack); 191 } 192 193 if (mp_opt->use_map) { 194 if (mp_opt->dsn64) { 195 mp_opt->data_seq = get_unaligned_be64(ptr); 196 ptr += 8; 197 } else { 198 mp_opt->data_seq = get_unaligned_be32(ptr); 199 ptr += 4; 200 } 201 202 mp_opt->subflow_seq = get_unaligned_be32(ptr); 203 ptr += 4; 204 205 mp_opt->data_len = get_unaligned_be16(ptr); 206 ptr += 2; 207 208 pr_debug("data_seq=%llu subflow_seq=%u data_len=%u", 209 mp_opt->data_seq, mp_opt->subflow_seq, 210 mp_opt->data_len); 211 } 212 213 break; 214 215 case MPTCPOPT_ADD_ADDR: 216 mp_opt->echo = (*ptr++) & MPTCP_ADDR_ECHO; 217 if (!mp_opt->echo) { 218 if (opsize == TCPOLEN_MPTCP_ADD_ADDR || 219 opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT) 220 mp_opt->family = MPTCP_ADDR_IPVERSION_4; 221 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 222 else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6 || 223 opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT) 224 mp_opt->family = MPTCP_ADDR_IPVERSION_6; 225 #endif 226 else 227 break; 228 } else { 229 if (opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE || 230 opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) 231 mp_opt->family = MPTCP_ADDR_IPVERSION_4; 232 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 233 else if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE || 234 opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) 235 mp_opt->family = MPTCP_ADDR_IPVERSION_6; 236 #endif 237 else 238 break; 239 } 240 241 mp_opt->add_addr = 1; 242 mp_opt->port = 0; 243 mp_opt->addr_id = *ptr++; 244 pr_debug("ADD_ADDR: id=%d", mp_opt->addr_id); 245 if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) { 246 memcpy((u8 *)&mp_opt->addr.s_addr, (u8 *)ptr, 4); 247 ptr += 4; 248 if (opsize == TCPOLEN_MPTCP_ADD_ADDR_PORT || 249 opsize == TCPOLEN_MPTCP_ADD_ADDR_BASE_PORT) { 250 mp_opt->port = get_unaligned_be16(ptr); 251 ptr += 2; 252 } 253 } 254 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 255 else { 256 memcpy(mp_opt->addr6.s6_addr, (u8 *)ptr, 16); 257 ptr += 16; 258 if (opsize == TCPOLEN_MPTCP_ADD_ADDR6_PORT || 259 opsize == TCPOLEN_MPTCP_ADD_ADDR6_BASE_PORT) { 260 mp_opt->port = get_unaligned_be16(ptr); 261 ptr += 2; 262 } 263 } 264 #endif 265 if (!mp_opt->echo) { 266 mp_opt->ahmac = get_unaligned_be64(ptr); 267 ptr += 8; 268 } 269 break; 270 271 case MPTCPOPT_RM_ADDR: 272 if (opsize != TCPOLEN_MPTCP_RM_ADDR_BASE) 273 break; 274 275 mp_opt->rm_addr = 1; 276 mp_opt->rm_id = *ptr++; 277 pr_debug("RM_ADDR: id=%d", mp_opt->rm_id); 278 break; 279 280 default: 281 break; 282 } 283 } 284 285 void mptcp_get_options(const struct sk_buff *skb, 286 struct tcp_options_received *opt_rx) 287 { 288 const unsigned char *ptr; 289 const struct tcphdr *th = tcp_hdr(skb); 290 int length = (th->doff * 4) - sizeof(struct tcphdr); 291 292 ptr = (const unsigned char *)(th + 1); 293 294 while (length > 0) { 295 int opcode = *ptr++; 296 int opsize; 297 298 switch (opcode) { 299 case TCPOPT_EOL: 300 return; 301 case TCPOPT_NOP: /* Ref: RFC 793 section 3.1 */ 302 length--; 303 continue; 304 default: 305 opsize = *ptr++; 306 if (opsize < 2) /* "silly options" */ 307 return; 308 if (opsize > length) 309 return; /* don't parse partial options */ 310 if (opcode == TCPOPT_MPTCP) 311 mptcp_parse_option(skb, ptr, opsize, opt_rx); 312 ptr += opsize - 2; 313 length -= opsize; 314 } 315 } 316 } 317 318 bool mptcp_syn_options(struct sock *sk, const struct sk_buff *skb, 319 unsigned int *size, struct mptcp_out_options *opts) 320 { 321 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 322 323 /* we will use snd_isn to detect first pkt [re]transmission 324 * in mptcp_established_options_mp() 325 */ 326 subflow->snd_isn = TCP_SKB_CB(skb)->end_seq; 327 if (subflow->request_mptcp) { 328 pr_debug("local_key=%llu", subflow->local_key); 329 opts->suboptions = OPTION_MPTCP_MPC_SYN; 330 opts->sndr_key = subflow->local_key; 331 *size = TCPOLEN_MPTCP_MPC_SYN; 332 return true; 333 } else if (subflow->request_join) { 334 pr_debug("remote_token=%u, nonce=%u", subflow->remote_token, 335 subflow->local_nonce); 336 opts->suboptions = OPTION_MPTCP_MPJ_SYN; 337 opts->join_id = subflow->local_id; 338 opts->token = subflow->remote_token; 339 opts->nonce = subflow->local_nonce; 340 opts->backup = subflow->request_bkup; 341 *size = TCPOLEN_MPTCP_MPJ_SYN; 342 return true; 343 } 344 return false; 345 } 346 347 void mptcp_rcv_synsent(struct sock *sk) 348 { 349 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 350 struct tcp_sock *tp = tcp_sk(sk); 351 352 if (subflow->request_mptcp && tp->rx_opt.mptcp.mp_capable) { 353 subflow->mp_capable = 1; 354 subflow->can_ack = 1; 355 subflow->remote_key = tp->rx_opt.mptcp.sndr_key; 356 pr_debug("subflow=%p, remote_key=%llu", subflow, 357 subflow->remote_key); 358 } else if (subflow->request_join && tp->rx_opt.mptcp.mp_join) { 359 subflow->mp_join = 1; 360 subflow->thmac = tp->rx_opt.mptcp.thmac; 361 subflow->remote_nonce = tp->rx_opt.mptcp.nonce; 362 pr_debug("subflow=%p, thmac=%llu, remote_nonce=%u", subflow, 363 subflow->thmac, subflow->remote_nonce); 364 } else if (subflow->request_mptcp) { 365 tcp_sk(sk)->is_mptcp = 0; 366 } 367 } 368 369 /* MP_JOIN client subflow must wait for 4th ack before sending any data: 370 * TCP can't schedule delack timer before the subflow is fully established. 371 * MPTCP uses the delack timer to do 3rd ack retransmissions 372 */ 373 static void schedule_3rdack_retransmission(struct sock *sk) 374 { 375 struct inet_connection_sock *icsk = inet_csk(sk); 376 struct tcp_sock *tp = tcp_sk(sk); 377 unsigned long timeout; 378 379 /* reschedule with a timeout above RTT, as we must look only for drop */ 380 if (tp->srtt_us) 381 timeout = tp->srtt_us << 1; 382 else 383 timeout = TCP_TIMEOUT_INIT; 384 385 WARN_ON_ONCE(icsk->icsk_ack.pending & ICSK_ACK_TIMER); 386 icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER; 387 icsk->icsk_ack.timeout = timeout; 388 sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout); 389 } 390 391 static void clear_3rdack_retransmission(struct sock *sk) 392 { 393 struct inet_connection_sock *icsk = inet_csk(sk); 394 395 sk_stop_timer(sk, &icsk->icsk_delack_timer); 396 icsk->icsk_ack.timeout = 0; 397 icsk->icsk_ack.ato = 0; 398 icsk->icsk_ack.pending &= ~(ICSK_ACK_SCHED | ICSK_ACK_TIMER); 399 } 400 401 static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, 402 unsigned int *size, 403 unsigned int remaining, 404 struct mptcp_out_options *opts) 405 { 406 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 407 struct mptcp_ext *mpext; 408 unsigned int data_len; 409 410 /* When skb is not available, we better over-estimate the emitted 411 * options len. A full DSS option (28 bytes) is longer than 412 * TCPOLEN_MPTCP_MPC_ACK_DATA(22) or TCPOLEN_MPTCP_MPJ_ACK(24), so 413 * tell the caller to defer the estimate to 414 * mptcp_established_options_dss(), which will reserve enough space. 415 */ 416 if (!skb) 417 return false; 418 419 /* MPC/MPJ needed only on 3rd ack packet */ 420 if (subflow->fully_established || 421 subflow->snd_isn != TCP_SKB_CB(skb)->seq) 422 return false; 423 424 if (subflow->mp_capable) { 425 mpext = mptcp_get_ext(skb); 426 data_len = mpext ? mpext->data_len : 0; 427 428 /* we will check ext_copy.data_len in mptcp_write_options() to 429 * discriminate between TCPOLEN_MPTCP_MPC_ACK_DATA and 430 * TCPOLEN_MPTCP_MPC_ACK 431 */ 432 opts->ext_copy.data_len = data_len; 433 opts->suboptions = OPTION_MPTCP_MPC_ACK; 434 opts->sndr_key = subflow->local_key; 435 opts->rcvr_key = subflow->remote_key; 436 437 /* Section 3.1. 438 * The MP_CAPABLE option is carried on the SYN, SYN/ACK, and ACK 439 * packets that start the first subflow of an MPTCP connection, 440 * as well as the first packet that carries data 441 */ 442 if (data_len > 0) 443 *size = ALIGN(TCPOLEN_MPTCP_MPC_ACK_DATA, 4); 444 else 445 *size = TCPOLEN_MPTCP_MPC_ACK; 446 447 pr_debug("subflow=%p, local_key=%llu, remote_key=%llu map_len=%d", 448 subflow, subflow->local_key, subflow->remote_key, 449 data_len); 450 451 return true; 452 } else if (subflow->mp_join) { 453 opts->suboptions = OPTION_MPTCP_MPJ_ACK; 454 memcpy(opts->hmac, subflow->hmac, MPTCPOPT_HMAC_LEN); 455 *size = TCPOLEN_MPTCP_MPJ_ACK; 456 pr_debug("subflow=%p", subflow); 457 458 schedule_3rdack_retransmission(sk); 459 return true; 460 } 461 return false; 462 } 463 464 static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow, 465 struct mptcp_ext *ext) 466 { 467 if (!ext->use_map) { 468 /* RFC6824 requires a DSS mapping with specific values 469 * if DATA_FIN is set but no data payload is mapped 470 */ 471 ext->data_fin = 1; 472 ext->use_map = 1; 473 ext->dsn64 = 1; 474 ext->data_seq = subflow->data_fin_tx_seq; 475 ext->subflow_seq = 0; 476 ext->data_len = 1; 477 } else if (ext->data_seq + ext->data_len == subflow->data_fin_tx_seq) { 478 /* If there's an existing DSS mapping and it is the 479 * final mapping, DATA_FIN consumes 1 additional byte of 480 * mapping space. 481 */ 482 ext->data_fin = 1; 483 ext->data_len++; 484 } 485 } 486 487 static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, 488 unsigned int *size, 489 unsigned int remaining, 490 struct mptcp_out_options *opts) 491 { 492 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 493 unsigned int dss_size = 0; 494 struct mptcp_ext *mpext; 495 struct mptcp_sock *msk; 496 unsigned int ack_size; 497 bool ret = false; 498 u8 tcp_fin; 499 500 if (skb) { 501 mpext = mptcp_get_ext(skb); 502 tcp_fin = TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN; 503 } else { 504 mpext = NULL; 505 tcp_fin = 0; 506 } 507 508 if (!skb || (mpext && mpext->use_map) || tcp_fin) { 509 unsigned int map_size; 510 511 map_size = TCPOLEN_MPTCP_DSS_BASE + TCPOLEN_MPTCP_DSS_MAP64; 512 513 remaining -= map_size; 514 dss_size = map_size; 515 if (mpext) 516 opts->ext_copy = *mpext; 517 518 if (skb && tcp_fin && subflow->data_fin_tx_enable) 519 mptcp_write_data_fin(subflow, &opts->ext_copy); 520 ret = true; 521 } 522 523 /* passive sockets msk will set the 'can_ack' after accept(), even 524 * if the first subflow may have the already the remote key handy 525 */ 526 opts->ext_copy.use_ack = 0; 527 msk = mptcp_sk(subflow->conn); 528 if (!READ_ONCE(msk->can_ack)) { 529 *size = ALIGN(dss_size, 4); 530 return ret; 531 } 532 533 ack_size = TCPOLEN_MPTCP_DSS_ACK64; 534 535 /* Add kind/length/subtype/flag overhead if mapping is not populated */ 536 if (dss_size == 0) 537 ack_size += TCPOLEN_MPTCP_DSS_BASE; 538 539 dss_size += ack_size; 540 541 opts->ext_copy.data_ack = msk->ack_seq; 542 opts->ext_copy.ack64 = 1; 543 opts->ext_copy.use_ack = 1; 544 545 *size = ALIGN(dss_size, 4); 546 return true; 547 } 548 549 static u64 add_addr_generate_hmac(u64 key1, u64 key2, u8 addr_id, 550 struct in_addr *addr) 551 { 552 u8 hmac[MPTCP_ADDR_HMAC_LEN]; 553 u8 msg[7]; 554 555 msg[0] = addr_id; 556 memcpy(&msg[1], &addr->s_addr, 4); 557 msg[5] = 0; 558 msg[6] = 0; 559 560 mptcp_crypto_hmac_sha(key1, key2, msg, 7, hmac); 561 562 return get_unaligned_be64(hmac); 563 } 564 565 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 566 static u64 add_addr6_generate_hmac(u64 key1, u64 key2, u8 addr_id, 567 struct in6_addr *addr) 568 { 569 u8 hmac[MPTCP_ADDR_HMAC_LEN]; 570 u8 msg[19]; 571 572 msg[0] = addr_id; 573 memcpy(&msg[1], &addr->s6_addr, 16); 574 msg[17] = 0; 575 msg[18] = 0; 576 577 mptcp_crypto_hmac_sha(key1, key2, msg, 19, hmac); 578 579 return get_unaligned_be64(hmac); 580 } 581 #endif 582 583 static bool mptcp_established_options_addr(struct sock *sk, 584 unsigned int *size, 585 unsigned int remaining, 586 struct mptcp_out_options *opts) 587 { 588 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 589 struct mptcp_sock *msk = mptcp_sk(subflow->conn); 590 struct mptcp_addr_info saddr; 591 int len; 592 593 if (!mptcp_pm_should_signal(msk) || 594 !(mptcp_pm_addr_signal(msk, remaining, &saddr))) 595 return false; 596 597 len = mptcp_add_addr_len(saddr.family); 598 if (remaining < len) 599 return false; 600 601 *size = len; 602 opts->addr_id = saddr.id; 603 if (saddr.family == AF_INET) { 604 opts->suboptions |= OPTION_MPTCP_ADD_ADDR; 605 opts->addr = saddr.addr; 606 opts->ahmac = add_addr_generate_hmac(msk->local_key, 607 msk->remote_key, 608 opts->addr_id, 609 &opts->addr); 610 } 611 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 612 else if (saddr.family == AF_INET6) { 613 opts->suboptions |= OPTION_MPTCP_ADD_ADDR6; 614 opts->addr6 = saddr.addr6; 615 opts->ahmac = add_addr6_generate_hmac(msk->local_key, 616 msk->remote_key, 617 opts->addr_id, 618 &opts->addr6); 619 } 620 #endif 621 pr_debug("addr_id=%d, ahmac=%llu", opts->addr_id, opts->ahmac); 622 623 return true; 624 } 625 626 bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, 627 unsigned int *size, unsigned int remaining, 628 struct mptcp_out_options *opts) 629 { 630 unsigned int opt_size = 0; 631 bool ret = false; 632 633 opts->suboptions = 0; 634 635 if (mptcp_established_options_mp(sk, skb, &opt_size, remaining, opts)) 636 ret = true; 637 else if (mptcp_established_options_dss(sk, skb, &opt_size, remaining, 638 opts)) 639 ret = true; 640 641 /* we reserved enough space for the above options, and exceeding the 642 * TCP option space would be fatal 643 */ 644 if (WARN_ON_ONCE(opt_size > remaining)) 645 return false; 646 647 *size += opt_size; 648 remaining -= opt_size; 649 if (mptcp_established_options_addr(sk, &opt_size, remaining, opts)) { 650 *size += opt_size; 651 remaining -= opt_size; 652 ret = true; 653 } 654 655 return ret; 656 } 657 658 bool mptcp_synack_options(const struct request_sock *req, unsigned int *size, 659 struct mptcp_out_options *opts) 660 { 661 struct mptcp_subflow_request_sock *subflow_req = mptcp_subflow_rsk(req); 662 663 if (subflow_req->mp_capable) { 664 opts->suboptions = OPTION_MPTCP_MPC_SYNACK; 665 opts->sndr_key = subflow_req->local_key; 666 *size = TCPOLEN_MPTCP_MPC_SYNACK; 667 pr_debug("subflow_req=%p, local_key=%llu", 668 subflow_req, subflow_req->local_key); 669 return true; 670 } else if (subflow_req->mp_join) { 671 opts->suboptions = OPTION_MPTCP_MPJ_SYNACK; 672 opts->backup = subflow_req->backup; 673 opts->join_id = subflow_req->local_id; 674 opts->thmac = subflow_req->thmac; 675 opts->nonce = subflow_req->local_nonce; 676 pr_debug("req=%p, bkup=%u, id=%u, thmac=%llu, nonce=%u", 677 subflow_req, opts->backup, opts->join_id, 678 opts->thmac, opts->nonce); 679 *size = TCPOLEN_MPTCP_MPJ_SYNACK; 680 return true; 681 } 682 return false; 683 } 684 685 static bool check_fully_established(struct mptcp_sock *msk, struct sock *sk, 686 struct mptcp_subflow_context *subflow, 687 struct sk_buff *skb, 688 struct mptcp_options_received *mp_opt) 689 { 690 /* here we can process OoO, in-window pkts, only in-sequence 4th ack 691 * will make the subflow fully established 692 */ 693 if (likely(subflow->fully_established)) { 694 /* on passive sockets, check for 3rd ack retransmission 695 * note that msk is always set by subflow_syn_recv_sock() 696 * for mp_join subflows 697 */ 698 if (TCP_SKB_CB(skb)->seq == subflow->ssn_offset + 1 && 699 TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq && 700 subflow->mp_join && mp_opt->mp_join && 701 READ_ONCE(msk->pm.server_side)) 702 tcp_send_ack(sk); 703 goto fully_established; 704 } 705 706 /* we should process OoO packets before the first subflow is fully 707 * established, but not expected for MP_JOIN subflows 708 */ 709 if (TCP_SKB_CB(skb)->seq != subflow->ssn_offset + 1) 710 return subflow->mp_capable; 711 712 if (mp_opt->use_ack) { 713 /* subflows are fully established as soon as we get any 714 * additional ack. 715 */ 716 subflow->fully_established = 1; 717 goto fully_established; 718 } 719 720 WARN_ON_ONCE(subflow->can_ack); 721 722 /* If the first established packet does not contain MP_CAPABLE + data 723 * then fallback to TCP 724 */ 725 if (!mp_opt->mp_capable) { 726 subflow->mp_capable = 0; 727 tcp_sk(sk)->is_mptcp = 0; 728 return false; 729 } 730 731 subflow->fully_established = 1; 732 subflow->remote_key = mp_opt->sndr_key; 733 subflow->can_ack = 1; 734 735 fully_established: 736 if (likely(subflow->pm_notified)) 737 return true; 738 739 subflow->pm_notified = 1; 740 if (subflow->mp_join) { 741 clear_3rdack_retransmission(sk); 742 mptcp_pm_subflow_established(msk, subflow); 743 } else { 744 mptcp_pm_fully_established(msk); 745 } 746 return true; 747 } 748 749 static u64 expand_ack(u64 old_ack, u64 cur_ack, bool use_64bit) 750 { 751 u32 old_ack32, cur_ack32; 752 753 if (use_64bit) 754 return cur_ack; 755 756 old_ack32 = (u32)old_ack; 757 cur_ack32 = (u32)cur_ack; 758 cur_ack = (old_ack & GENMASK_ULL(63, 32)) + cur_ack32; 759 if (unlikely(before(cur_ack32, old_ack32))) 760 return cur_ack + (1LL << 32); 761 return cur_ack; 762 } 763 764 static void update_una(struct mptcp_sock *msk, 765 struct mptcp_options_received *mp_opt) 766 { 767 u64 new_snd_una, snd_una, old_snd_una = atomic64_read(&msk->snd_una); 768 u64 write_seq = READ_ONCE(msk->write_seq); 769 770 /* avoid ack expansion on update conflict, to reduce the risk of 771 * wrongly expanding to a future ack sequence number, which is way 772 * more dangerous than missing an ack 773 */ 774 new_snd_una = expand_ack(old_snd_una, mp_opt->data_ack, mp_opt->ack64); 775 776 /* ACK for data not even sent yet? Ignore. */ 777 if (after64(new_snd_una, write_seq)) 778 new_snd_una = old_snd_una; 779 780 while (after64(new_snd_una, old_snd_una)) { 781 snd_una = old_snd_una; 782 old_snd_una = atomic64_cmpxchg(&msk->snd_una, snd_una, 783 new_snd_una); 784 if (old_snd_una == snd_una) { 785 mptcp_data_acked((struct sock *)msk); 786 break; 787 } 788 } 789 } 790 791 static bool add_addr_hmac_valid(struct mptcp_sock *msk, 792 struct mptcp_options_received *mp_opt) 793 { 794 u64 hmac = 0; 795 796 if (mp_opt->echo) 797 return true; 798 799 if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) 800 hmac = add_addr_generate_hmac(msk->remote_key, 801 msk->local_key, 802 mp_opt->addr_id, &mp_opt->addr); 803 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 804 else 805 hmac = add_addr6_generate_hmac(msk->remote_key, 806 msk->local_key, 807 mp_opt->addr_id, &mp_opt->addr6); 808 #endif 809 810 pr_debug("msk=%p, ahmac=%llu, mp_opt->ahmac=%llu\n", 811 msk, (unsigned long long)hmac, 812 (unsigned long long)mp_opt->ahmac); 813 814 return hmac == mp_opt->ahmac; 815 } 816 817 void mptcp_incoming_options(struct sock *sk, struct sk_buff *skb, 818 struct tcp_options_received *opt_rx) 819 { 820 struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); 821 struct mptcp_sock *msk = mptcp_sk(subflow->conn); 822 struct mptcp_options_received *mp_opt; 823 struct mptcp_ext *mpext; 824 825 mp_opt = &opt_rx->mptcp; 826 if (!check_fully_established(msk, sk, subflow, skb, mp_opt)) 827 return; 828 829 if (mp_opt->add_addr && add_addr_hmac_valid(msk, mp_opt)) { 830 struct mptcp_addr_info addr; 831 832 addr.port = htons(mp_opt->port); 833 addr.id = mp_opt->addr_id; 834 if (mp_opt->family == MPTCP_ADDR_IPVERSION_4) { 835 addr.family = AF_INET; 836 addr.addr = mp_opt->addr; 837 } 838 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 839 else if (mp_opt->family == MPTCP_ADDR_IPVERSION_6) { 840 addr.family = AF_INET6; 841 addr.addr6 = mp_opt->addr6; 842 } 843 #endif 844 if (!mp_opt->echo) 845 mptcp_pm_add_addr_received(msk, &addr); 846 mp_opt->add_addr = 0; 847 } 848 849 if (!mp_opt->dss) 850 return; 851 852 /* we can't wait for recvmsg() to update the ack_seq, otherwise 853 * monodirectional flows will stuck 854 */ 855 if (mp_opt->use_ack) 856 update_una(msk, mp_opt); 857 858 mpext = skb_ext_add(skb, SKB_EXT_MPTCP); 859 if (!mpext) 860 return; 861 862 memset(mpext, 0, sizeof(*mpext)); 863 864 if (mp_opt->use_map) { 865 if (mp_opt->mpc_map) { 866 /* this is an MP_CAPABLE carrying MPTCP data 867 * we know this map the first chunk of data 868 */ 869 mptcp_crypto_key_sha(subflow->remote_key, NULL, 870 &mpext->data_seq); 871 mpext->data_seq++; 872 mpext->subflow_seq = 1; 873 mpext->dsn64 = 1; 874 mpext->mpc_map = 1; 875 } else { 876 mpext->data_seq = mp_opt->data_seq; 877 mpext->subflow_seq = mp_opt->subflow_seq; 878 mpext->dsn64 = mp_opt->dsn64; 879 } 880 mpext->data_len = mp_opt->data_len; 881 mpext->use_map = 1; 882 } 883 884 mpext->data_fin = mp_opt->data_fin; 885 } 886 887 void mptcp_write_options(__be32 *ptr, struct mptcp_out_options *opts) 888 { 889 if ((OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | 890 OPTION_MPTCP_MPC_ACK) & opts->suboptions) { 891 u8 len; 892 893 if (OPTION_MPTCP_MPC_SYN & opts->suboptions) 894 len = TCPOLEN_MPTCP_MPC_SYN; 895 else if (OPTION_MPTCP_MPC_SYNACK & opts->suboptions) 896 len = TCPOLEN_MPTCP_MPC_SYNACK; 897 else if (opts->ext_copy.data_len) 898 len = TCPOLEN_MPTCP_MPC_ACK_DATA; 899 else 900 len = TCPOLEN_MPTCP_MPC_ACK; 901 902 *ptr++ = mptcp_option(MPTCPOPT_MP_CAPABLE, len, 903 MPTCP_SUPPORTED_VERSION, 904 MPTCP_CAP_HMAC_SHA256); 905 906 if (!((OPTION_MPTCP_MPC_SYNACK | OPTION_MPTCP_MPC_ACK) & 907 opts->suboptions)) 908 goto mp_capable_done; 909 910 put_unaligned_be64(opts->sndr_key, ptr); 911 ptr += 2; 912 if (!((OPTION_MPTCP_MPC_ACK) & opts->suboptions)) 913 goto mp_capable_done; 914 915 put_unaligned_be64(opts->rcvr_key, ptr); 916 ptr += 2; 917 if (!opts->ext_copy.data_len) 918 goto mp_capable_done; 919 920 put_unaligned_be32(opts->ext_copy.data_len << 16 | 921 TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); 922 ptr += 1; 923 } 924 925 mp_capable_done: 926 if (OPTION_MPTCP_ADD_ADDR & opts->suboptions) { 927 if (opts->ahmac) 928 *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, 929 TCPOLEN_MPTCP_ADD_ADDR, 0, 930 opts->addr_id); 931 else 932 *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, 933 TCPOLEN_MPTCP_ADD_ADDR_BASE, 934 MPTCP_ADDR_ECHO, 935 opts->addr_id); 936 memcpy((u8 *)ptr, (u8 *)&opts->addr.s_addr, 4); 937 ptr += 1; 938 if (opts->ahmac) { 939 put_unaligned_be64(opts->ahmac, ptr); 940 ptr += 2; 941 } 942 } 943 944 #if IS_ENABLED(CONFIG_MPTCP_IPV6) 945 if (OPTION_MPTCP_ADD_ADDR6 & opts->suboptions) { 946 if (opts->ahmac) 947 *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, 948 TCPOLEN_MPTCP_ADD_ADDR6, 0, 949 opts->addr_id); 950 else 951 *ptr++ = mptcp_option(MPTCPOPT_ADD_ADDR, 952 TCPOLEN_MPTCP_ADD_ADDR6_BASE, 953 MPTCP_ADDR_ECHO, 954 opts->addr_id); 955 memcpy((u8 *)ptr, opts->addr6.s6_addr, 16); 956 ptr += 4; 957 if (opts->ahmac) { 958 put_unaligned_be64(opts->ahmac, ptr); 959 ptr += 2; 960 } 961 } 962 #endif 963 964 if (OPTION_MPTCP_RM_ADDR & opts->suboptions) { 965 *ptr++ = mptcp_option(MPTCPOPT_RM_ADDR, 966 TCPOLEN_MPTCP_RM_ADDR_BASE, 967 0, opts->rm_id); 968 } 969 970 if (OPTION_MPTCP_MPJ_SYN & opts->suboptions) { 971 *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, 972 TCPOLEN_MPTCP_MPJ_SYN, 973 opts->backup, opts->join_id); 974 put_unaligned_be32(opts->token, ptr); 975 ptr += 1; 976 put_unaligned_be32(opts->nonce, ptr); 977 ptr += 1; 978 } 979 980 if (OPTION_MPTCP_MPJ_SYNACK & opts->suboptions) { 981 *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, 982 TCPOLEN_MPTCP_MPJ_SYNACK, 983 opts->backup, opts->join_id); 984 put_unaligned_be64(opts->thmac, ptr); 985 ptr += 2; 986 put_unaligned_be32(opts->nonce, ptr); 987 ptr += 1; 988 } 989 990 if (OPTION_MPTCP_MPJ_ACK & opts->suboptions) { 991 *ptr++ = mptcp_option(MPTCPOPT_MP_JOIN, 992 TCPOLEN_MPTCP_MPJ_ACK, 0, 0); 993 memcpy(ptr, opts->hmac, MPTCPOPT_HMAC_LEN); 994 ptr += 5; 995 } 996 997 if (opts->ext_copy.use_ack || opts->ext_copy.use_map) { 998 struct mptcp_ext *mpext = &opts->ext_copy; 999 u8 len = TCPOLEN_MPTCP_DSS_BASE; 1000 u8 flags = 0; 1001 1002 if (mpext->use_ack) { 1003 len += TCPOLEN_MPTCP_DSS_ACK64; 1004 flags = MPTCP_DSS_HAS_ACK | MPTCP_DSS_ACK64; 1005 } 1006 1007 if (mpext->use_map) { 1008 len += TCPOLEN_MPTCP_DSS_MAP64; 1009 1010 /* Use only 64-bit mapping flags for now, add 1011 * support for optional 32-bit mappings later. 1012 */ 1013 flags |= MPTCP_DSS_HAS_MAP | MPTCP_DSS_DSN64; 1014 if (mpext->data_fin) 1015 flags |= MPTCP_DSS_DATA_FIN; 1016 } 1017 1018 *ptr++ = mptcp_option(MPTCPOPT_DSS, len, 0, flags); 1019 1020 if (mpext->use_ack) { 1021 put_unaligned_be64(mpext->data_ack, ptr); 1022 ptr += 2; 1023 } 1024 1025 if (mpext->use_map) { 1026 put_unaligned_be64(mpext->data_seq, ptr); 1027 ptr += 2; 1028 put_unaligned_be32(mpext->subflow_seq, ptr); 1029 ptr += 1; 1030 put_unaligned_be32(mpext->data_len << 16 | 1031 TCPOPT_NOP << 8 | TCPOPT_NOP, ptr); 1032 } 1033 } 1034 } 1035