// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * ip_vs_proto_tcp.c:	TCP load balancing support for IPVS
 *
 * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
 *              Julian Anastasov <ja@ssi.bg>
 *
 * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com>
 *
 *              Network name space (netns) aware.
 *              Global data moved to netns i.e struct netns_ipvs
 *              tcp_timeouts table has copy per netns in a hash table per
 *              protocol ip_vs_proto_data and is handled by netns
 */

#define pr_fmt(fmt) "IPVS: " fmt

#include <linux/kernel.h>
#include <linux/ip.h>
#include <linux/tcp.h>                  /* for tcphdr */
#include <net/ip.h>
#include <net/tcp.h>                    /* for csum_tcpudp_magic */
#include <net/ip6_checksum.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <linux/indirect_call_wrapper.h>

#include <net/ip_vs.h>

/* Forward declaration: used by the SNAT/DNAT handlers before mangling. */
static int
tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp);

/*
 * tcp_conn_schedule - schedule a new TCP connection to a virtual service.
 *
 * Looks up the virtual service for the packet and, if found, asks it to
 * pick a real server and create a connection entry in *cpp.
 *
 * Returns 1 to let packet processing continue (NF_ACCEPT), or 0 with
 * *verdict set (NF_DROP, or whatever ip_vs_leave() decided).
 */
static int
tcp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
		  struct ip_vs_proto_data *pd,
		  int *verdict, struct ip_vs_conn **cpp,
		  struct ip_vs_iphdr *iph)
{
	struct ip_vs_service *svc;
	struct tcphdr _tcph, *th;
	__be16 _ports[2], *ports = NULL;

	/* In the event of icmp, we're only guaranteed to have the first 8
	 * bytes of the transport header, so we only check the rest of the
	 * TCP packet for non-ICMP packets
	 */
	if (likely(!ip_vs_iph_icmp(iph))) {
		th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
		if (th) {
			/* Only schedule on SYN unless sloppy_tcp allows
			 * picking up established connections; never on RST.
			 */
			if (th->rst || !(sysctl_sloppy_tcp(ipvs) || th->syn))
				return 1;
			ports = &th->source;
		}
	} else {
		/* ICMP-embedded header: only source/dest ports available */
		ports = skb_header_pointer(
			skb, iph->len, sizeof(_ports), &_ports);
	}

	if (!ports) {
		*verdict = NF_DROP;
		return 0;
	}

	/* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */

	if (likely(!ip_vs_iph_inverse(iph)))
		svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
					 &iph->daddr, ports[1]);
	else
		svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
					 &iph->saddr, ports[0]);

	if (svc) {
		int ignored;

		if (ip_vs_todrop(ipvs)) {
			/*
			 * It seems that we are very loaded.
			 * We have to drop this packet :(
			 */
			*verdict = NF_DROP;
			return 0;
		}

		/*
		 * Let the virtual server select a real server for the
		 * incoming connection, and create a connection entry.
		 */
		*cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
		if (!*cpp && ignored <= 0) {
			if (!ignored)
				*verdict = ip_vs_leave(svc, skb, pd, iph);
			else
				*verdict = NF_DROP;
			return 0;
		}
	}
	/* NF_ACCEPT */
	return 1;
}


/*
 * Incrementally update the TCP checksum after only an address and a port
 * changed (RFC 1624 style difference folding).  Used on the fast path
 * where the payload was not touched.
 */
static inline void
tcp_fast_csum_update(int af, struct tcphdr *tcph,
		     const union nf_inet_addr *oldip,
		     const union nf_inet_addr *newip,
		     __be16 oldport, __be16 newport)
{
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		tcph->check =
			csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
					 ip_vs_check_diff2(oldport, newport,
						~csum_unfold(tcph->check))));
	else
#endif
		tcph->check =
			csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
					 ip_vs_check_diff2(oldport, newport,
						~csum_unfold(tcph->check))));
}


/*
 * Update the pseudo-header part of the checksum for CHECKSUM_PARTIAL
 * skbs (hardware will finish the transport checksum): fold in the
 * address change and the change of the TCP length.
 */
static inline void
tcp_partial_csum_update(int af, struct tcphdr *tcph,
			const union nf_inet_addr *oldip,
			const union nf_inet_addr *newip,
			__be16 oldlen, __be16 newlen)
{
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		tcph->check =
			~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
					 ip_vs_check_diff2(oldlen, newlen,
						csum_unfold(tcph->check))));
	else
#endif
		tcph->check =
			~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
					ip_vs_check_diff2(oldlen, newlen,
						csum_unfold(tcph->check))));
}


/*
 * tcp_snat_handler - mangle an outgoing (real server -> client) packet.
 *
 * Rewrites the TCP source port to the virtual service port (cp->vport),
 * optionally lets the bound application helper mangle the payload, and
 * then fixes the TCP checksum by the cheapest applicable method
 * (partial/pseudo-header, incremental, or full recalculation).
 *
 * Returns 1 on success, 0 if the packet must be dropped.
 */
INDIRECT_CALLABLE_SCOPE int
tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
		 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
{
	struct tcphdr *tcph;
	unsigned int tcphoff = iph->len;
	bool payload_csum = false;
	int oldlen;

#ifdef CONFIG_IP_VS_IPV6
	/* IPv6 non-first fragments carry no TCP header to mangle */
	if (cp->af == AF_INET6 && iph->fragoffs)
		return 1;
#endif
	oldlen = skb->len - tcphoff;

	/* csum_check requires unshared skb */
	if (skb_ensure_writable(skb, tcphoff + sizeof(*tcph)))
		return 0;

	if (unlikely(cp->app != NULL)) {
		int ret;

		/* Some checks before mangling */
		if (!tcp_csum_check(cp->af, skb, pp))
			return 0;

		/* Call application helper if needed */
		if (!(ret = ip_vs_app_pkt_out(cp, skb, iph)))
			return 0;
		/* ret=2: csum update is needed after payload mangling */
		if (ret == 1)
			oldlen = skb->len - tcphoff;
		else
			payload_csum = true;
	}

	tcph = (void *)skb_network_header(skb) + tcphoff;
	tcph->source = cp->vport;

	/* Adjust TCP checksums */
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
					htons(oldlen),
					htons(skb->len - tcphoff));
	} else if (!payload_csum) {
		/* Only port and addr are changed, do fast csum update */
		tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
				     cp->dport, cp->vport);
		if (skb->ip_summed == CHECKSUM_COMPLETE)
			skb->ip_summed = cp->app ?
					 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
	} else {
		/* full checksum calculation */
		tcph->check = 0;
		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
#ifdef CONFIG_IP_VS_IPV6
		if (cp->af == AF_INET6)
			tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
						      &cp->caddr.in6,
						      skb->len - tcphoff,
						      cp->protocol, skb->csum);
		else
#endif
			tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
							cp->caddr.ip,
							skb->len - tcphoff,
							cp->protocol,
							skb->csum);
		skb->ip_summed = CHECKSUM_UNNECESSARY;

		IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
			  pp->name, tcph->check,
			  (char*)&(tcph->check) - (char*)tcph);
	}
	return 1;
}


/*
 * tcp_dnat_handler - mangle an incoming (client -> real server) packet.
 *
 * Mirror of tcp_snat_handler: rewrites the TCP destination port to the
 * real server port (cp->dport), runs the application helper's inbound
 * hook, and fixes the checksum the same three ways.
 *
 * Returns 1 on success, 0 if the packet must be dropped.
 */
static int
tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
		 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
{
	struct tcphdr *tcph;
	unsigned int tcphoff = iph->len;
	bool payload_csum = false;
	int oldlen;

#ifdef CONFIG_IP_VS_IPV6
	/* IPv6 non-first fragments carry no TCP header to mangle */
	if (cp->af == AF_INET6 && iph->fragoffs)
		return 1;
#endif
	oldlen = skb->len - tcphoff;

	/* csum_check requires unshared skb */
	if (skb_ensure_writable(skb, tcphoff + sizeof(*tcph)))
		return 0;

	if (unlikely(cp->app != NULL)) {
		int ret;

		/* Some checks before mangling */
		if (!tcp_csum_check(cp->af, skb, pp))
			return 0;

		/*
		 *	Attempt ip_vs_app call.
		 *	It will fix ip_vs_conn and iph ack_seq stuff
		 */
		if (!(ret = ip_vs_app_pkt_in(cp, skb, iph)))
			return 0;
		/* ret=2: csum update is needed after payload mangling */
		if (ret == 1)
			oldlen = skb->len - tcphoff;
		else
			payload_csum = true;
	}

	tcph = (void *)skb_network_header(skb) + tcphoff;
	tcph->dest = cp->dport;

	/*
	 *	Adjust TCP checksums
	 */
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
					htons(oldlen),
					htons(skb->len - tcphoff));
	} else if (!payload_csum) {
		/* Only port and addr are changed, do fast csum update */
		tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
				     cp->vport, cp->dport);
		if (skb->ip_summed == CHECKSUM_COMPLETE)
			skb->ip_summed = cp->app ?
					 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
	} else {
		/* full checksum calculation */
		tcph->check = 0;
		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
#ifdef CONFIG_IP_VS_IPV6
		if (cp->af == AF_INET6)
			tcph->check = csum_ipv6_magic(&cp->caddr.in6,
						      &cp->daddr.in6,
						      skb->len - tcphoff,
						      cp->protocol, skb->csum);
		else
#endif
			tcph->check = csum_tcpudp_magic(cp->caddr.ip,
							cp->daddr.ip,
							skb->len - tcphoff,
							cp->protocol,
							skb->csum);
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	}
	return 1;
}


/*
 * tcp_csum_check - verify the TCP checksum of a packet before mangling.
 *
 * Only checks CHECKSUM_NONE / CHECKSUM_COMPLETE skbs; anything already
 * verified (or to be computed by hardware) passes through.
 *
 * Returns 1 if the checksum is acceptable, 0 if it is bad.
 */
static int
tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp)
{
	unsigned int tcphoff;

#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		tcphoff = sizeof(struct ipv6hdr);
	else
#endif
		tcphoff = ip_hdrlen(skb);

	switch (skb->ip_summed) {
	case CHECKSUM_NONE:
		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
		fallthrough;
	case CHECKSUM_COMPLETE:
#ifdef CONFIG_IP_VS_IPV6
		if (af == AF_INET6) {
			if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
					    &ipv6_hdr(skb)->daddr,
					    skb->len - tcphoff,
					    ipv6_hdr(skb)->nexthdr,
					    skb->csum)) {
				IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
						 "Failed checksum for");
				return 0;
			}
		} else
#endif
			if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
					      ip_hdr(skb)->daddr,
					      skb->len - tcphoff,
					      ip_hdr(skb)->protocol,
					      skb->csum)) {
				IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
						 "Failed checksum for");
				return 0;
			}
		break;
	default:
		/* No need to checksum. */
		break;
	}

	return 1;
}


/*
 * Row-group offsets into the state transition tables below: each
 * direction selects a group of 4 rows (syn/fin/ack/rst).
 */
#define TCP_DIR_INPUT		0
#define TCP_DIR_OUTPUT		4
#define TCP_DIR_INPUT_ONLY	8

static const int tcp_state_off[IP_VS_DIR_LAST] = {
	[IP_VS_DIR_INPUT]		=	TCP_DIR_INPUT,
	[IP_VS_DIR_OUTPUT]		=	TCP_DIR_OUTPUT,
	[IP_VS_DIR_INPUT_ONLY]		=	TCP_DIR_INPUT_ONLY,
};

/*
 *	Timeout table[state]
 */
static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
	[IP_VS_TCP_S_NONE]		=	2*HZ,
	[IP_VS_TCP_S_ESTABLISHED]	=	15*60*HZ,
	[IP_VS_TCP_S_SYN_SENT]		=	2*60*HZ,
	[IP_VS_TCP_S_SYN_RECV]		=	1*60*HZ,
	[IP_VS_TCP_S_FIN_WAIT]		=	2*60*HZ,
	[IP_VS_TCP_S_TIME_WAIT]		=	2*60*HZ,
	[IP_VS_TCP_S_CLOSE]		=	10*HZ,
	[IP_VS_TCP_S_CLOSE_WAIT]	=	60*HZ,
	[IP_VS_TCP_S_LAST_ACK]		=	30*HZ,
	[IP_VS_TCP_S_LISTEN]		=	2*60*HZ,
	[IP_VS_TCP_S_SYNACK]		=	120*HZ,
	[IP_VS_TCP_S_LAST]		=	2*HZ,
};

static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
	[IP_VS_TCP_S_NONE]		=	"NONE",
	[IP_VS_TCP_S_ESTABLISHED]	=	"ESTABLISHED",
	[IP_VS_TCP_S_SYN_SENT]		=	"SYN_SENT",
	[IP_VS_TCP_S_SYN_RECV]		=	"SYN_RECV",
	[IP_VS_TCP_S_FIN_WAIT]		=	"FIN_WAIT",
	[IP_VS_TCP_S_TIME_WAIT]		=	"TIME_WAIT",
	[IP_VS_TCP_S_CLOSE]		=	"CLOSE",
	[IP_VS_TCP_S_CLOSE_WAIT]	=	"CLOSE_WAIT",
	[IP_VS_TCP_S_LAST_ACK]		=	"LAST_ACK",
	[IP_VS_TCP_S_LISTEN]		=	"LISTEN",
	[IP_VS_TCP_S_SYNACK]		=	"SYNACK",
	[IP_VS_TCP_S_LAST]		=	"BUG!",
};

/* Whether a state counts against the destination's activeconns counter */
static const bool tcp_state_active_table[IP_VS_TCP_S_LAST] = {
	[IP_VS_TCP_S_NONE]		=	false,
	[IP_VS_TCP_S_ESTABLISHED]	=	true,
	[IP_VS_TCP_S_SYN_SENT]		=	true,
	[IP_VS_TCP_S_SYN_RECV]		=	true,
	[IP_VS_TCP_S_FIN_WAIT]		=	false,
	[IP_VS_TCP_S_TIME_WAIT]		=	false,
	[IP_VS_TCP_S_CLOSE]		=	false,
	[IP_VS_TCP_S_CLOSE_WAIT]	=	false,
	[IP_VS_TCP_S_LAST_ACK]		=	false,
	[IP_VS_TCP_S_LISTEN]		=	false,
	[IP_VS_TCP_S_SYNACK]		=	true,
};

/* Short state aliases to keep the transition tables readable */
#define sNO IP_VS_TCP_S_NONE
#define sES IP_VS_TCP_S_ESTABLISHED
#define sSS IP_VS_TCP_S_SYN_SENT
#define sSR IP_VS_TCP_S_SYN_RECV
#define sFW IP_VS_TCP_S_FIN_WAIT
#define sTW IP_VS_TCP_S_TIME_WAIT
#define sCL IP_VS_TCP_S_CLOSE
#define sCW IP_VS_TCP_S_CLOSE_WAIT
#define sLA IP_VS_TCP_S_LAST_ACK
#define sLI IP_VS_TCP_S_LISTEN
#define sSA IP_VS_TCP_S_SYNACK

/* One row of a transition table: next state indexed by current state */
struct tcp_states_t {
	int next_state[IP_VS_TCP_S_LAST];
};

static const char * tcp_state_name(int state)
{
	if (state >= IP_VS_TCP_S_LAST)
		return "ERR!";
	return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
}

static bool tcp_state_active(int state)
{
	if (state >= IP_VS_TCP_S_LAST)
		return false;
	return tcp_state_active_table[state];
}

/*
 * Normal-mode transition table: rows are (direction group, flag), where
 * the flag row index comes from tcp_state_idx() (syn/fin/ack/rst), and
 * columns are the current connection state.
 */
static struct tcp_states_t tcp_states[] = {
/*	INPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},

/*	OUTPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},

/*	INPUT-ONLY */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};

/*
 * Stricter transition table used when secure_tcp is on (see
 * tcp_timeout_change); keeps half-open connections in SYNACK state.
 */
static struct tcp_states_t tcp_states_dos[] = {
/*	INPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
/*ack*/ {{sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},

/*	OUTPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},

/*	INPUT-ONLY */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};

/*
 * Switch between the normal and the DoS-hardened state table when the
 * secure_tcp flag changes.
 */
static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags)
{
	int on = (flags & 1);		/* secure_tcp */

	/*
	** FIXME: change secure_tcp to independent sysctl var
	** or make it per-service or per-app because it is valid
	** for most if not for all of the applications. Something
	** like "capabilities" (flags) for each object.
	*/
	pd->tcp_state_table = (on ? tcp_states_dos : tcp_states);
}

/*
 * Map TCP flags to a transition-table row index: rst=3, syn=0, fin=1,
 * ack=2 (checked in that priority order); -1 if none of them is set.
 */
static inline int tcp_state_idx(struct tcphdr *th)
{
	if (th->rst)
		return 3;
	if (th->syn)
		return 0;
	if (th->fin)
		return 1;
	if (th->ack)
		return 2;
	return -1;
}

/*
 * Advance the connection state machine for one packet and update the
 * connection timeout and the destination's active/inactive counters.
 * Called with cp->lock held (see tcp_state_transition).
 */
static inline void
set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
	      int direction, struct tcphdr *th)
{
	int state_idx;
	int new_state = IP_VS_TCP_S_CLOSE;
	int state_off = tcp_state_off[direction];

	/*
	 *    Update state offset to INPUT_ONLY if necessary
	 *    or delete NO_OUTPUT flag if output packet detected
	 */
	if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
		if (state_off == TCP_DIR_OUTPUT)
			cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
		else
			state_off = TCP_DIR_INPUT_ONLY;
	}

	if ((state_idx = tcp_state_idx(th)) < 0) {
		IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
		goto tcp_state_out;
	}

	new_state =
		pd->tcp_state_table[state_off+state_idx].next_state[cp->state];

  tcp_state_out:
	if (new_state != cp->state) {
		struct ip_vs_dest *dest = cp->dest;

		IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] c:%s:%d v:%s:%d "
			      "d:%s:%d state: %s->%s conn->refcnt:%d\n",
			      pd->pp->name,
			      ((state_off == TCP_DIR_OUTPUT) ?
			       "output " : "input "),
			      th->syn ? 'S' : '.',
			      th->fin ? 'F' : '.',
			      th->ack ? 'A' : '.',
			      th->rst ? 'R' : '.',
			      IP_VS_DBG_ADDR(cp->af, &cp->caddr),
			      ntohs(cp->cport),
			      IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
			      ntohs(cp->vport),
			      IP_VS_DBG_ADDR(cp->daf, &cp->daddr),
			      ntohs(cp->dport),
			      tcp_state_name(cp->state),
			      tcp_state_name(new_state),
			      refcount_read(&cp->refcnt));

		if (dest) {
			/* Migrate between active/inactive conn counters as
			 * the state crosses the "active" boundary.
			 */
			if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
			    !tcp_state_active(new_state)) {
				atomic_dec(&dest->activeconns);
				atomic_inc(&dest->inactconns);
				cp->flags |= IP_VS_CONN_F_INACTIVE;
			} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
				   tcp_state_active(new_state)) {
				atomic_inc(&dest->activeconns);
				atomic_dec(&dest->inactconns);
				cp->flags &= ~IP_VS_CONN_F_INACTIVE;
			}
		}
		if (new_state == IP_VS_TCP_S_ESTABLISHED)
			ip_vs_control_assure_ct(cp);
	}

	if (likely(pd))
		cp->timeout = pd->timeout_table[cp->state = new_state];
	else	/* What to do ? */
		cp->timeout = tcp_timeouts[cp->state = new_state];
}

/*
 *	Handle state transitions
 */
static void
tcp_state_transition(struct ip_vs_conn *cp, int direction,
		     const struct sk_buff *skb,
		     struct ip_vs_proto_data *pd)
{
	struct tcphdr _tcph, *th;

#ifdef CONFIG_IP_VS_IPV6
	int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
#else
	int ihl = ip_hdrlen(skb);
#endif

	th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
	if (th == NULL)
		return;

	spin_lock_bh(&cp->lock);
	set_tcp_state(pd, cp, direction, th);
	spin_unlock_bh(&cp->lock);
}

/* Hash a port into the tcp_apps table */
static inline __u16 tcp_app_hashkey(__be16 port)
{
	return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
		& TCP_APP_TAB_MASK;
}


/*
 * Register an application helper incarnation for its port in this netns.
 * Returns 0 on success, -EEXIST if the port is already taken.
 */
static int tcp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc)
{
	struct ip_vs_app *i;
	__u16 hash;
	__be16 port = inc->port;
	int ret = 0;
	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);

	hash = tcp_app_hashkey(port);

	list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) {
		if (i->port == port) {
			ret = -EEXIST;
			goto out;
		}
	}
	list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]);
	atomic_inc(&pd->appcnt);

  out:
	return ret;
}


static void
tcp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc)
{
	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);

	atomic_dec(&pd->appcnt);
	list_del_rcu(&inc->p_list);
}


/*
 * Bind a connection to the application helper registered on its virtual
 * port, if any.  Only done for NAT forwarding.
 */
static int
tcp_app_conn_bind(struct ip_vs_conn *cp)
{
	struct netns_ipvs *ipvs = cp->ipvs;
	int hash;
	struct ip_vs_app *inc;
	int result = 0;

	/* Default binding: bind app only for NAT */
	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
		return 0;

	/* Lookup application incarnations and bind the right one */
	hash = tcp_app_hashkey(cp->vport);

	list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) {
		if (inc->port == cp->vport) {
			if (unlikely(!ip_vs_app_inc_get(inc)))
				break;

			IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
				      "%s:%u to app %s on port %u\n",
				      __func__,
				      IP_VS_DBG_ADDR(cp->af, &cp->caddr),
				      ntohs(cp->cport),
				      IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
				      ntohs(cp->vport),
				      inc->name, ntohs(inc->port));

			cp->app = inc;
			if (inc->init_conn)
				result = inc->init_conn(inc, cp);
			break;
		}
	}

	return result;
}


/*
 *	Set LISTEN timeout. (ip_vs_conn_put will setup timer)
 */
void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
{
	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(cp->ipvs, IPPROTO_TCP);

	spin_lock_bh(&cp->lock);
	cp->state = IP_VS_TCP_S_LISTEN;
	cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN]
			   : tcp_timeouts[IP_VS_TCP_S_LISTEN]);
	spin_unlock_bh(&cp->lock);
}

/* ---------------------------------------------
 *   timeouts is netns related now.
 * ---------------------------------------------
 */
static int __ip_vs_tcp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
{
	ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
	pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts,
							sizeof(tcp_timeouts));
	if (!pd->timeout_table)
		return -ENOMEM;
	pd->tcp_state_table = tcp_states;
	return 0;
}

static void __ip_vs_tcp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
{
	kfree(pd->timeout_table);
}


struct ip_vs_protocol ip_vs_protocol_tcp = {
	.name =			"TCP",
	.protocol =		IPPROTO_TCP,
	.num_states =		IP_VS_TCP_S_LAST,
	.dont_defrag =		0,
	.init =			NULL,
	.exit =			NULL,
	.init_netns =		__ip_vs_tcp_init,
	.exit_netns =		__ip_vs_tcp_exit,
	.register_app =		tcp_register_app,
	.unregister_app =	tcp_unregister_app,
	.conn_schedule =	tcp_conn_schedule,
	.conn_in_get =		ip_vs_conn_in_get_proto,
	.conn_out_get =		ip_vs_conn_out_get_proto,
	.snat_handler =		tcp_snat_handler,
	.dnat_handler =		tcp_dnat_handler,
	.state_name =		tcp_state_name,
	.state_transition =	tcp_state_transition,
	.app_conn_bind =	tcp_app_conn_bind,
	.debug_packet =		ip_vs_tcpudp_debug_packet,
	.timeout_change =	tcp_timeout_change,
};