xref: /linux/net/netfilter/ipvs/ip_vs_proto_tcp.c (revision 8bf22c33e7a172fbc72464f4cc484d23a6b412ba)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * ip_vs_proto_tcp.c:	TCP load balancing support for IPVS
4  *
5  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
6  *              Julian Anastasov <ja@ssi.bg>
7  *
8  * Changes:     Hans Schillstrom <hans.schillstrom@ericsson.com>
9  *
10  *              Network name space (netns) aware.
11  *              Global data moved to netns i.e struct netns_ipvs
12  *              tcp_timeouts table has copy per netns in a hash table per
13  *              protocol ip_vs_proto_data and is handled by netns
14  */
15 
16 #define pr_fmt(fmt) "IPVS: " fmt
17 
18 #include <linux/kernel.h>
19 #include <linux/ip.h>
20 #include <linux/tcp.h>                  /* for tcphdr */
21 #include <net/ip.h>
22 #include <net/tcp.h>                    /* for csum_tcpudp_magic */
23 #include <net/ip6_checksum.h>
24 #include <linux/netfilter.h>
25 #include <linux/netfilter_ipv4.h>
26 #include <linux/indirect_call_wrapper.h>
27 
28 #include <net/ip_vs.h>
29 
30 static int
31 tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
32 	       unsigned int tcphoff);
33 
/*
 * Decide whether this TCP packet may create a new IPVS connection:
 * find a matching virtual service and, if one exists, let it choose a
 * real server and create the connection entry in *cpp.
 *
 * Returns 1 to continue packet traversal, or 0 with *verdict set
 * (NF_DROP, or whatever ip_vs_leave() decided).
 */
static int
tcp_conn_schedule(struct netns_ipvs *ipvs, int af, struct sk_buff *skb,
		  struct ip_vs_proto_data *pd,
		  int *verdict, struct ip_vs_conn **cpp,
		  struct ip_vs_iphdr *iph)
{
	struct ip_vs_service *svc;
	struct tcphdr _tcph, *th;
	__be16 _ports[2], *ports = NULL;

	/* In the event of icmp, we're only guaranteed to have the first 8
	 * bytes of the transport header, so we only check the rest of the
	 * TCP packet for non-ICMP packets
	 */
	if (likely(!ip_vs_iph_icmp(iph))) {
		th = skb_header_pointer(skb, iph->len, sizeof(_tcph), &_tcph);
		if (th) {
			/* Only schedule on SYN, unless sloppy_tcp allows
			 * picking up established flows; never on RST.
			 */
			if (th->rst || !(sysctl_sloppy_tcp(ipvs) || th->syn))
				return 1;
			ports = &th->source;
		}
	} else {
		/* ICMP payload: only the port pair is guaranteed present */
		ports = skb_header_pointer(
			skb, iph->len, sizeof(_ports), &_ports);
	}

	if (!ports) {
		*verdict = NF_DROP;
		return 0;
	}

	/* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */

	if (likely(!ip_vs_iph_inverse(iph)))
		svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
					 &iph->daddr, ports[1]);
	else
		svc = ip_vs_service_find(ipvs, af, skb->mark, iph->protocol,
					 &iph->saddr, ports[0]);

	if (svc) {
		int ignored;

		if (ip_vs_todrop(ipvs)) {
			/*
			 * It seems that we are very loaded.
			 * We have to drop this packet :(
			 */
			*verdict = NF_DROP;
			return 0;
		}

		/*
		 * Let the virtual server select a real server for the
		 * incoming connection, and create a connection entry.
		 */
		*cpp = ip_vs_schedule(svc, skb, pd, &ignored, iph);
		if (!*cpp && ignored <= 0) {
			if (!ignored)
				*verdict = ip_vs_leave(svc, skb, pd, iph);
			else
				*verdict = NF_DROP;
			return 0;
		}
	}
	/* NF_ACCEPT */
	return 1;
}
102 
103 
/*
 * Incrementally fix the TCP checksum when only one address and one port
 * were rewritten (payload untouched), by folding the old/new differences
 * into the existing checksum instead of recomputing it.
 */
static inline void
tcp_fast_csum_update(int af, struct tcphdr *tcph,
		     const union nf_inet_addr *oldip,
		     const union nf_inet_addr *newip,
		     __be16 oldport, __be16 newport)
{
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		tcph->check =
			csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
					 ip_vs_check_diff2(oldport, newport,
						~csum_unfold(tcph->check))));
	else
#endif
	tcph->check =
		csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
				 ip_vs_check_diff2(oldport, newport,
						~csum_unfold(tcph->check))));
}
123 
124 
/*
 * For CHECKSUM_PARTIAL skbs the device finishes the checksum, so only
 * the pseudo-header part must be adjusted here: fold in the address and
 * segment-length differences caused by NAT.
 */
static inline void
tcp_partial_csum_update(int af, struct tcphdr *tcph,
		     const union nf_inet_addr *oldip,
		     const union nf_inet_addr *newip,
		     __be16 oldlen, __be16 newlen)
{
#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6)
		tcph->check =
			~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6,
					 ip_vs_check_diff2(oldlen, newlen,
						csum_unfold(tcph->check))));
	else
#endif
	tcph->check =
		~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip,
				ip_vs_check_diff2(oldlen, newlen,
						csum_unfold(tcph->check))));
}
144 
145 
/*
 * Source NAT for packets leaving toward the client: rewrite the source
 * port to the virtual port, optionally run the application helper, and
 * repair the TCP checksum by whichever strategy the skb requires.
 *
 * Returns 1 on success, 0 if the packet must be dropped.
 */
INDIRECT_CALLABLE_SCOPE int
tcp_snat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
		 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
{
	struct tcphdr *tcph;
	unsigned int tcphoff = iph->len;
	bool payload_csum = false;
	int oldlen;

#ifdef CONFIG_IP_VS_IPV6
	/* Non-first IPv6 fragments carry no TCP header to mangle */
	if (cp->af == AF_INET6 && iph->fragoffs)
		return 1;
#endif
	oldlen = skb->len - tcphoff;

	/* csum_check requires unshared skb */
	if (skb_ensure_writable(skb, tcphoff + sizeof(*tcph)))
		return 0;

	if (unlikely(cp->app != NULL)) {
		int ret;

		/* Some checks before mangling */
		if (!tcp_csum_check(cp->af, skb, pp, tcphoff))
			return 0;

		/* Call application helper if needed */
		if (!(ret = ip_vs_app_pkt_out(cp, skb, iph)))
			return 0;
		/* ret=2: csum update is needed after payload mangling */
		if (ret == 1)
			oldlen = skb->len - tcphoff;
		else
			payload_csum = true;
	}

	tcph = (void *)skb_network_header(skb) + tcphoff;
	tcph->source = cp->vport;

	/* Adjust TCP checksums */
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		/* Device completes the checksum; fix pseudo-header only */
		tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
					htons(oldlen),
					htons(skb->len - tcphoff));
	} else if (!payload_csum) {
		/* Only port and addr are changed, do fast csum update */
		tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr,
				     cp->dport, cp->vport);
		if (skb->ip_summed == CHECKSUM_COMPLETE)
			skb->ip_summed = cp->app ?
					 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
	} else {
		/* full checksum calculation */
		tcph->check = 0;
		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
#ifdef CONFIG_IP_VS_IPV6
		if (cp->af == AF_INET6)
			tcph->check = csum_ipv6_magic(&cp->vaddr.in6,
						      &cp->caddr.in6,
						      skb->len - tcphoff,
						      cp->protocol, skb->csum);
		else
#endif
			tcph->check = csum_tcpudp_magic(cp->vaddr.ip,
							cp->caddr.ip,
							skb->len - tcphoff,
							cp->protocol,
							skb->csum);
		skb->ip_summed = CHECKSUM_UNNECESSARY;

		IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n",
			  pp->name, tcph->check,
			  (char*)&(tcph->check) - (char*)tcph);
	}
	return 1;
}
222 
223 
/*
 * Destination NAT for packets going toward the real server: rewrite the
 * destination port to the real server port, optionally run the
 * application helper, and repair the TCP checksum.  Mirror image of
 * tcp_snat_handler().
 *
 * Returns 1 on success, 0 if the packet must be dropped.
 */
static int
tcp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
		 struct ip_vs_conn *cp, struct ip_vs_iphdr *iph)
{
	struct tcphdr *tcph;
	unsigned int tcphoff = iph->len;
	bool payload_csum = false;
	int oldlen;

#ifdef CONFIG_IP_VS_IPV6
	/* Non-first IPv6 fragments carry no TCP header to mangle */
	if (cp->af == AF_INET6 && iph->fragoffs)
		return 1;
#endif
	oldlen = skb->len - tcphoff;

	/* csum_check requires unshared skb */
	if (skb_ensure_writable(skb, tcphoff + sizeof(*tcph)))
		return 0;

	if (unlikely(cp->app != NULL)) {
		int ret;

		/* Some checks before mangling */
		if (!tcp_csum_check(cp->af, skb, pp, tcphoff))
			return 0;

		/*
		 *	Attempt ip_vs_app call.
		 *	It will fix ip_vs_conn and iph ack_seq stuff
		 */
		if (!(ret = ip_vs_app_pkt_in(cp, skb, iph)))
			return 0;
		/* ret=2: csum update is needed after payload mangling */
		if (ret == 1)
			oldlen = skb->len - tcphoff;
		else
			payload_csum = true;
	}

	tcph = (void *)skb_network_header(skb) + tcphoff;
	tcph->dest = cp->dport;

	/*
	 *	Adjust TCP checksums
	 */
	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		/* Device completes the checksum; fix pseudo-header only */
		tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
					htons(oldlen),
					htons(skb->len - tcphoff));
	} else if (!payload_csum) {
		/* Only port and addr are changed, do fast csum update */
		tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr,
				     cp->vport, cp->dport);
		if (skb->ip_summed == CHECKSUM_COMPLETE)
			skb->ip_summed = cp->app ?
					 CHECKSUM_UNNECESSARY : CHECKSUM_NONE;
	} else {
		/* full checksum calculation */
		tcph->check = 0;
		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
#ifdef CONFIG_IP_VS_IPV6
		if (cp->af == AF_INET6)
			tcph->check = csum_ipv6_magic(&cp->caddr.in6,
						      &cp->daddr.in6,
						      skb->len - tcphoff,
						      cp->protocol, skb->csum);
		else
#endif
			tcph->check = csum_tcpudp_magic(cp->caddr.ip,
							cp->daddr.ip,
							skb->len - tcphoff,
							cp->protocol,
							skb->csum);
		skb->ip_summed = CHECKSUM_UNNECESSARY;
	}
	return 1;
}
301 
302 
/*
 * Verify the TCP checksum in software when the skb does not already
 * carry a trusted one.  Returns 1 when the checksum is valid (or needs
 * no software verification), 0 on failure.
 */
static int
tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp,
	       unsigned int tcphoff)
{
	switch (skb->ip_summed) {
	case CHECKSUM_NONE:
		/* Compute the checksum over the segment, then verify it
		 * below exactly as for CHECKSUM_COMPLETE.
		 */
		skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
		fallthrough;
	case CHECKSUM_COMPLETE:
#ifdef CONFIG_IP_VS_IPV6
		if (af == AF_INET6) {
			if (csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
					    &ipv6_hdr(skb)->daddr,
					    skb->len - tcphoff,
					    IPPROTO_TCP,
					    skb->csum)) {
				IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
						 "Failed checksum for");
				return 0;
			}
		} else
#endif
			if (csum_tcpudp_magic(ip_hdr(skb)->saddr,
					      ip_hdr(skb)->daddr,
					      skb->len - tcphoff,
					      ip_hdr(skb)->protocol,
					      skb->csum)) {
				IP_VS_DBG_RL_PKT(0, af, pp, skb, 0,
						 "Failed checksum for");
				return 0;
			}
		break;
	default:
		/* No need to checksum. */
		break;
	}

	return 1;
}
342 
343 
/*
 * Row offsets into the state transition tables (tcp_states and
 * tcp_states_dos): each direction owns four consecutive rows, one per
 * packet class (syn/fin/ack/rst, see tcp_state_idx()).
 */
#define TCP_DIR_INPUT		0
#define TCP_DIR_OUTPUT		4
#define TCP_DIR_INPUT_ONLY	8

/* Map IP_VS_DIR_* packet direction to its table row offset */
static const int tcp_state_off[IP_VS_DIR_LAST] = {
	[IP_VS_DIR_INPUT]		=	TCP_DIR_INPUT,
	[IP_VS_DIR_OUTPUT]		=	TCP_DIR_OUTPUT,
	[IP_VS_DIR_INPUT_ONLY]		=	TCP_DIR_INPUT_ONLY,
};
353 
/*
 *	Timeout table[state] — default per-state timeouts in jiffies.
 *	Each netns gets a private, writable copy of this table
 *	(see __ip_vs_tcp_init); this one is the read-only template.
 */
static const int tcp_timeouts[IP_VS_TCP_S_LAST+1] = {
	[IP_VS_TCP_S_NONE]		=	2*HZ,
	[IP_VS_TCP_S_ESTABLISHED]	=	15*60*HZ,
	[IP_VS_TCP_S_SYN_SENT]		=	2*60*HZ,
	[IP_VS_TCP_S_SYN_RECV]		=	1*60*HZ,
	[IP_VS_TCP_S_FIN_WAIT]		=	2*60*HZ,
	[IP_VS_TCP_S_TIME_WAIT]		=	2*60*HZ,
	[IP_VS_TCP_S_CLOSE]		=	10*HZ,
	[IP_VS_TCP_S_CLOSE_WAIT]	=	60*HZ,
	[IP_VS_TCP_S_LAST_ACK]		=	30*HZ,
	[IP_VS_TCP_S_LISTEN]		=	2*60*HZ,
	[IP_VS_TCP_S_SYNACK]		=	120*HZ,
	[IP_VS_TCP_S_LAST]		=	2*HZ,
};
371 
/* Printable name for each IP_VS_TCP_S_* state (see tcp_state_name()) */
static const char *const tcp_state_name_table[IP_VS_TCP_S_LAST+1] = {
	[IP_VS_TCP_S_NONE]		=	"NONE",
	[IP_VS_TCP_S_ESTABLISHED]	=	"ESTABLISHED",
	[IP_VS_TCP_S_SYN_SENT]		=	"SYN_SENT",
	[IP_VS_TCP_S_SYN_RECV]		=	"SYN_RECV",
	[IP_VS_TCP_S_FIN_WAIT]		=	"FIN_WAIT",
	[IP_VS_TCP_S_TIME_WAIT]		=	"TIME_WAIT",
	[IP_VS_TCP_S_CLOSE]		=	"CLOSE",
	[IP_VS_TCP_S_CLOSE_WAIT]	=	"CLOSE_WAIT",
	[IP_VS_TCP_S_LAST_ACK]		=	"LAST_ACK",
	[IP_VS_TCP_S_LISTEN]		=	"LISTEN",
	[IP_VS_TCP_S_SYNACK]		=	"SYNACK",
	[IP_VS_TCP_S_LAST]		=	"BUG!",
};
386 
/*
 * Whether a state counts as "active" for the per-destination
 * activeconns/inactconns accounting in set_tcp_state().
 */
static const bool tcp_state_active_table[IP_VS_TCP_S_LAST] = {
	[IP_VS_TCP_S_NONE]		=	false,
	[IP_VS_TCP_S_ESTABLISHED]	=	true,
	[IP_VS_TCP_S_SYN_SENT]		=	true,
	[IP_VS_TCP_S_SYN_RECV]		=	true,
	[IP_VS_TCP_S_FIN_WAIT]		=	false,
	[IP_VS_TCP_S_TIME_WAIT]		=	false,
	[IP_VS_TCP_S_CLOSE]		=	false,
	[IP_VS_TCP_S_CLOSE_WAIT]	=	false,
	[IP_VS_TCP_S_LAST_ACK]		=	false,
	[IP_VS_TCP_S_LISTEN]		=	false,
	[IP_VS_TCP_S_SYNACK]		=	true,
};
400 
/* Short aliases for the states, used to keep the tables below readable */
#define sNO IP_VS_TCP_S_NONE
#define sES IP_VS_TCP_S_ESTABLISHED
#define sSS IP_VS_TCP_S_SYN_SENT
#define sSR IP_VS_TCP_S_SYN_RECV
#define sFW IP_VS_TCP_S_FIN_WAIT
#define sTW IP_VS_TCP_S_TIME_WAIT
#define sCL IP_VS_TCP_S_CLOSE
#define sCW IP_VS_TCP_S_CLOSE_WAIT
#define sLA IP_VS_TCP_S_LAST_ACK
#define sLI IP_VS_TCP_S_LISTEN
#define sSA IP_VS_TCP_S_SYNACK

/* One transition-table row: next state indexed by the current state */
struct tcp_states_t {
	int next_state[IP_VS_TCP_S_LAST];
};
416 
tcp_state_name(int state)417 static const char * tcp_state_name(int state)
418 {
419 	if (state >= IP_VS_TCP_S_LAST)
420 		return "ERR!";
421 	return tcp_state_name_table[state] ? tcp_state_name_table[state] : "?";
422 }
423 
tcp_state_active(int state)424 static bool tcp_state_active(int state)
425 {
426 	if (state >= IP_VS_TCP_S_LAST)
427 		return false;
428 	return tcp_state_active_table[state];
429 }
430 
/*
 * Normal state transition table: tcp_states[direction_offset + packet
 * class].next_state[current state] yields the next connection state.
 */
static struct tcp_states_t tcp_states[] = {
/*	INPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sSR }},

/*	OUTPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSS, sES, sSS, sSR, sSS, sSS, sSS, sSS, sSS, sLI, sSR }},
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},

/*	INPUT-ONLY */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSR }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};
453 
/*
 * Alternative transition table installed when secure_tcp is on (see
 * tcp_timeout_change()); same layout as tcp_states[].
 */
static struct tcp_states_t tcp_states_dos[] = {
/*	INPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSR, sES, sES, sSR, sSR, sSR, sSR, sSR, sSR, sSR, sSA }},
/*fin*/ {{sCL, sCW, sSS, sTW, sTW, sTW, sCL, sCW, sLA, sLI, sSA }},
/*ack*/ {{sES, sES, sSS, sSR, sFW, sTW, sCL, sCW, sCL, sLI, sSA }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},

/*	OUTPUT */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSS, sES, sSS, sSA, sSS, sSS, sSS, sSS, sSS, sLI, sSA }},
/*fin*/ {{sTW, sFW, sSS, sTW, sFW, sTW, sCL, sTW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sLA, sES, sES }},
/*rst*/ {{sCL, sCL, sSS, sCL, sCL, sTW, sCL, sCL, sCL, sCL, sCL }},

/*	INPUT-ONLY */
/*        sNO, sES, sSS, sSR, sFW, sTW, sCL, sCW, sLA, sLI, sSA	*/
/*syn*/ {{sSA, sES, sES, sSR, sSA, sSA, sSA, sSA, sSA, sSA, sSA }},
/*fin*/ {{sCL, sFW, sSS, sTW, sFW, sTW, sCL, sCW, sLA, sLI, sTW }},
/*ack*/ {{sES, sES, sSS, sES, sFW, sTW, sCL, sCW, sCL, sLI, sES }},
/*rst*/ {{sCL, sCL, sCL, sSR, sCL, sCL, sCL, sCL, sLA, sLI, sCL }},
};
476 
tcp_timeout_change(struct ip_vs_proto_data * pd,int flags)477 static void tcp_timeout_change(struct ip_vs_proto_data *pd, int flags)
478 {
479 	int on = (flags & 1);		/* secure_tcp */
480 
481 	/*
482 	** FIXME: change secure_tcp to independent sysctl var
483 	** or make it per-service or per-app because it is valid
484 	** for most if not for all of the applications. Something
485 	** like "capabilities" (flags) for each object.
486 	*/
487 	pd->tcp_state_table = (on ? tcp_states_dos : tcp_states);
488 }
489 
/*
 * Classify a TCP header into a transition-table row index, checking the
 * flags in priority order: rst (3), syn (0), fin (1), ack (2).
 * Returns -1 when none of these flags is set.
 */
static inline int tcp_state_idx(struct tcphdr *th)
{
	if (th->rst)
		return 3;
	if (th->syn)
		return 0;
	if (th->fin)
		return 1;
	return th->ack ? 2 : -1;
}
502 
/*
 * Advance the connection's TCP state machine for one packet: pick the
 * next state from the active transition table, keep the destination's
 * active/inactive connection counters consistent, and refresh the
 * connection timeout for the new state.  Called under cp->lock (see
 * tcp_state_transition()).
 */
static inline void
set_tcp_state(struct ip_vs_proto_data *pd, struct ip_vs_conn *cp,
	      int direction, struct tcphdr *th)
{
	int state_idx;
	int new_state = IP_VS_TCP_S_CLOSE;
	int state_off = tcp_state_off[direction];

	/*
	 *    Update state offset to INPUT_ONLY if necessary
	 *    or delete NO_OUTPUT flag if output packet detected
	 */
	if (cp->flags & IP_VS_CONN_F_NOOUTPUT) {
		if (state_off == TCP_DIR_OUTPUT)
			cp->flags &= ~IP_VS_CONN_F_NOOUTPUT;
		else
			state_off = TCP_DIR_INPUT_ONLY;
	}

	/* Packets with none of syn/fin/ack/rst set fall through with
	 * new_state == IP_VS_TCP_S_CLOSE.
	 */
	if ((state_idx = tcp_state_idx(th)) < 0) {
		IP_VS_DBG(8, "tcp_state_idx=%d!!!\n", state_idx);
		goto tcp_state_out;
	}

	new_state =
		pd->tcp_state_table[state_off+state_idx].next_state[cp->state];

  tcp_state_out:
	if (new_state != cp->state) {
		struct ip_vs_dest *dest = cp->dest;

		IP_VS_DBG_BUF(8, "%s %s [%c%c%c%c] c:%s:%d v:%s:%d "
			      "d:%s:%d state: %s->%s conn->refcnt:%d\n",
			      pd->pp->name,
			      ((state_off == TCP_DIR_OUTPUT) ?
			       "output " : "input "),
			      th->syn ? 'S' : '.',
			      th->fin ? 'F' : '.',
			      th->ack ? 'A' : '.',
			      th->rst ? 'R' : '.',
			      IP_VS_DBG_ADDR(cp->af, &cp->caddr),
			      ntohs(cp->cport),
			      IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
			      ntohs(cp->vport),
			      IP_VS_DBG_ADDR(cp->daf, &cp->daddr),
			      ntohs(cp->dport),
			      tcp_state_name(cp->state),
			      tcp_state_name(new_state),
			      refcount_read(&cp->refcnt));

		if (dest) {
			/* Move the connection between the active and
			 * inactive counters when it crosses the
			 * active-state boundary.
			 */
			if (!(cp->flags & IP_VS_CONN_F_INACTIVE) &&
			    !tcp_state_active(new_state)) {
				atomic_dec(&dest->activeconns);
				atomic_inc(&dest->inactconns);
				cp->flags |= IP_VS_CONN_F_INACTIVE;
			} else if ((cp->flags & IP_VS_CONN_F_INACTIVE) &&
				   tcp_state_active(new_state)) {
				atomic_inc(&dest->activeconns);
				atomic_dec(&dest->inactconns);
				cp->flags &= ~IP_VS_CONN_F_INACTIVE;
			}
		}
		if (new_state == IP_VS_TCP_S_ESTABLISHED)
			ip_vs_control_assure_ct(cp);
	}

	if (likely(pd))
		cp->timeout = pd->timeout_table[cp->state = new_state];
	else	/* What to do ? */
		cp->timeout = tcp_timeouts[cp->state = new_state];
}
575 
/*
 *	Handle state transitions: locate the TCP header (variable-length
 *	IPv4 header vs fixed-size IPv6 base header) and update the
 *	connection state under cp->lock.
 */
static void
tcp_state_transition(struct ip_vs_conn *cp, int direction,
		     const struct sk_buff *skb,
		     struct ip_vs_proto_data *pd)
{
	struct tcphdr _tcph, *th;

#ifdef CONFIG_IP_VS_IPV6
	int ihl = cp->af == AF_INET ? ip_hdrlen(skb) : sizeof(struct ipv6hdr);
#else
	int ihl = ip_hdrlen(skb);
#endif

	th = skb_header_pointer(skb, ihl, sizeof(_tcph), &_tcph);
	if (th == NULL)
		return;

	spin_lock_bh(&cp->lock);
	set_tcp_state(pd, cp, direction, th);
	spin_unlock_bh(&cp->lock);
}
600 
tcp_app_hashkey(__be16 port)601 static inline __u16 tcp_app_hashkey(__be16 port)
602 {
603 	return (((__force u16)port >> TCP_APP_TAB_BITS) ^ (__force u16)port)
604 		& TCP_APP_TAB_MASK;
605 }
606 
607 
/*
 * Register an application helper incarnation on its port in the
 * per-netns TCP application hash table and bump the per-protocol
 * application count.  Returns 0, or -EEXIST if the port is taken.
 */
static int tcp_register_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc)
{
	struct ip_vs_app *i;
	__u16 hash;
	__be16 port = inc->port;
	int ret = 0;
	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);

	hash = tcp_app_hashkey(port);

	/* Reject duplicate registrations for the same port */
	list_for_each_entry(i, &ipvs->tcp_apps[hash], p_list) {
		if (i->port == port) {
			ret = -EEXIST;
			goto out;
		}
	}
	list_add_rcu(&inc->p_list, &ipvs->tcp_apps[hash]);
	atomic_inc(&pd->appcnt);

  out:
	return ret;
}
630 
631 
632 static void
tcp_unregister_app(struct netns_ipvs * ipvs,struct ip_vs_app * inc)633 tcp_unregister_app(struct netns_ipvs *ipvs, struct ip_vs_app *inc)
634 {
635 	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
636 
637 	atomic_dec(&pd->appcnt);
638 	list_del_rcu(&inc->p_list);
639 }
640 
641 
/*
 * Bind an application helper to a new connection, if one is registered
 * for the connection's virtual port.  Only done for NAT forwarding.
 * Returns 0, or the helper's init_conn() result.
 */
static int
tcp_app_conn_bind(struct ip_vs_conn *cp)
{
	struct netns_ipvs *ipvs = cp->ipvs;
	int hash;
	struct ip_vs_app *inc;
	int result = 0;

	/* Default binding: bind app only for NAT */
	if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
		return 0;

	/* Lookup application incarnations and bind the right one */
	hash = tcp_app_hashkey(cp->vport);

	list_for_each_entry_rcu(inc, &ipvs->tcp_apps[hash], p_list) {
		if (inc->port == cp->vport) {
			/* Skip the helper if we cannot take a reference */
			if (unlikely(!ip_vs_app_inc_get(inc)))
				break;

			IP_VS_DBG_BUF(9, "%s(): Binding conn %s:%u->"
				      "%s:%u to app %s on port %u\n",
				      __func__,
				      IP_VS_DBG_ADDR(cp->af, &cp->caddr),
				      ntohs(cp->cport),
				      IP_VS_DBG_ADDR(cp->af, &cp->vaddr),
				      ntohs(cp->vport),
				      inc->name, ntohs(inc->port));

			cp->app = inc;
			if (inc->init_conn)
				result = inc->init_conn(inc, cp);
			break;
		}
	}

	return result;
}
680 
681 
682 /*
683  *	Set LISTEN timeout. (ip_vs_conn_put will setup timer)
684  */
ip_vs_tcp_conn_listen(struct ip_vs_conn * cp)685 void ip_vs_tcp_conn_listen(struct ip_vs_conn *cp)
686 {
687 	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(cp->ipvs, IPPROTO_TCP);
688 
689 	spin_lock_bh(&cp->lock);
690 	cp->state = IP_VS_TCP_S_LISTEN;
691 	cp->timeout = (pd ? pd->timeout_table[IP_VS_TCP_S_LISTEN]
692 			   : tcp_timeouts[IP_VS_TCP_S_LISTEN]);
693 	spin_unlock_bh(&cp->lock);
694 }
695 
/* ---------------------------------------------
 *   timeouts is netns related now.
 * ---------------------------------------------
 */
/*
 * Per-netns init: set up the application hash table and give this
 * namespace its own writable copy of the default timeout table.
 * Returns 0 on success or -ENOMEM.
 */
static int __ip_vs_tcp_init(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
{
	ip_vs_init_hash_table(ipvs->tcp_apps, TCP_APP_TAB_SIZE);
	pd->timeout_table = ip_vs_create_timeout_table((int *)tcp_timeouts,
							sizeof(tcp_timeouts));
	if (!pd->timeout_table)
		return -ENOMEM;
	/* Start with the normal table; tcp_timeout_change() may swap it */
	pd->tcp_state_table = tcp_states;
	return 0;
}
710 
/* Per-netns cleanup: free the namespace-private timeout table */
static void __ip_vs_tcp_exit(struct netns_ipvs *ipvs, struct ip_vs_proto_data *pd)
{
	kfree(pd->timeout_table);
}
715 
716 
/*
 * TCP protocol descriptor for IPVS: per-netns init/exit, application
 * helper registration, connection scheduling, NAT handlers and the TCP
 * state machine hooks.
 */
struct ip_vs_protocol ip_vs_protocol_tcp = {
	.name =			"TCP",
	.protocol =		IPPROTO_TCP,
	.num_states =		IP_VS_TCP_S_LAST,
	.dont_defrag =		0,
	.init =			NULL,		/* no global init/exit; all state is per-netns */
	.exit =			NULL,
	.init_netns =		__ip_vs_tcp_init,
	.exit_netns =		__ip_vs_tcp_exit,
	.register_app =		tcp_register_app,
	.unregister_app =	tcp_unregister_app,
	.conn_schedule =	tcp_conn_schedule,
	.conn_in_get =		ip_vs_conn_in_get_proto,
	.conn_out_get =		ip_vs_conn_out_get_proto,
	.snat_handler =		tcp_snat_handler,
	.dnat_handler =		tcp_dnat_handler,
	.state_name =		tcp_state_name,
	.state_transition =	tcp_state_transition,
	.app_conn_bind =	tcp_app_conn_bind,
	.debug_packet =		ip_vs_tcpudp_debug_packet,
	.timeout_change =	tcp_timeout_change,
};
739