xref: /linux/net/netfilter/ipvs/ip_vs_conn.c (revision fcee7d82f27d6a8b1ddc5bbefda59b4e441e9bc0)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * IPVS         An implementation of the IP virtual server support for the
4  *              LINUX operating system.  IPVS is now implemented as a module
5  *              over the Netfilter framework. IPVS can be used to build a
6  *              high-performance and highly available server based on a
7  *              cluster of servers.
8  *
9  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
10  *              Peter Kese <peter.kese@ijs.si>
11  *              Julian Anastasov <ja@ssi.bg>
12  *
13  * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
14  * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
 * and others. Much of the code here is taken from the IP MASQ code of kernel 2.2.
16  *
17  * Changes:
18  */
19 
20 #define pr_fmt(fmt) "IPVS: " fmt
21 
22 #include <linux/interrupt.h>
23 #include <linux/in.h>
24 #include <linux/inet.h>
25 #include <linux/net.h>
26 #include <linux/kernel.h>
27 #include <linux/module.h>
28 #include <linux/proc_fs.h>		/* for proc_net_* */
29 #include <linux/slab.h>
30 #include <linux/seq_file.h>
31 #include <linux/jhash.h>
32 #include <linux/random.h>
33 #include <linux/rcupdate_wait.h>
34 
35 #include <net/net_namespace.h>
36 #include <net/ip_vs.h>
37 
38 
39 #ifndef CONFIG_IP_VS_TAB_BITS
40 #define CONFIG_IP_VS_TAB_BITS	12
41 #endif
42 
43 /*
44  * Connection hash size. Default is what was selected at compile time.
45 */
46 static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
47 module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444);
48 MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size");
49 
50 /* Max table size */
51 int ip_vs_conn_tab_size __read_mostly;
52 
53 /*  SLAB cache for IPVS connections */
54 static struct kmem_cache *ip_vs_conn_cachep __read_mostly;
55 
56 /* We need an addrstrlen that works with or without v6 */
57 #ifdef CONFIG_IP_VS_IPV6
58 #define IP_VS_ADDRSTRLEN INET6_ADDRSTRLEN
59 #else
60 #define IP_VS_ADDRSTRLEN (8+1)
61 #endif
62 
63 /* Connection hashing:
 * - hash (add conn) and unhash (del conn) are safe for RCU readers walking
 * the bucket: readers will not jump to another bucket or hash table, and
 * so will not miss conns
67  * - rehash (fill cport) hashes the conn to new bucket or even new table,
68  * so we use seqcount to retry lookups on buckets where we delete
69  * conns (unhash) because after hashing their next ptr can point to another
70  * bucket or hash table
71  * - hash table resize works like rehash but always rehashes into new table
72  * - bit lock on bucket serializes all operations that modify the chain
73  * - cp->lock protects conn fields like cp->flags, cp->dest
74  */
75 
/* Lock conn_tab bucket for conn hash/unhash, not for rehash.
 *
 * @t:		current conn table (RCU-protected)
 * @cp:		conn whose bucket(s) must be locked
 * @hash_key:	hash key of cp->hn0
 * @hash_key2:	hash key of cp->hn1, meaningful only when @use2
 * @use2:	double hashing in use, two nodes/buckets are involved
 * @new_hash:	true when hashing a new conn (keys cannot change under us);
 *		false when unhashing, where a concurrent rehash/resize can
 *		move the conn, so re-check the keys under lock and retry
 * @head_ret:	returns the locked bucket for hn0
 * @head2_ret:	returns the locked bucket for hn1 (== *head_ret if single)
 *
 * On return BHs are disabled and the bucket bit lock(s) are held;
 * release with conn_tab_unlock().
 */
static __always_inline void
conn_tab_lock(struct ip_vs_rht *t, struct ip_vs_conn *cp, u32 hash_key,
	      u32 hash_key2, bool use2, bool new_hash,
	      struct hlist_bl_head **head_ret, struct hlist_bl_head **head2_ret)
{
	struct hlist_bl_head *head, *head2;
	u32 hash_key_new, hash_key_new2;
	struct ip_vs_rht *t2 = t;
	u32 idx, idx2;

	/* idx/idx2 exist only to define a stable lock order below;
	 * the table-id bit is OR'ed in so buckets of the newer table
	 * always order after buckets of the older one.
	 */
	idx = hash_key & t->mask;
	if (use2)
		idx2 = hash_key2 & t->mask;
	else
		idx2 = idx;
	if (!new_hash) {
		/* We need to lock the bucket in the right table */

retry:
		if (!ip_vs_rht_same_table(t, hash_key)) {
			/* It is already moved to new table */
			t = rcu_dereference(t->new_tbl);
			/* Rehashing works in two steps and we may detect
			 * both nodes in different tables, use idx/idx2
			 * for proper lock ordering for heads.
			 */
			idx = hash_key & t->mask;
			idx |= IP_VS_RHT_TABLE_ID_MASK;
		}
		if (use2) {
			if (!ip_vs_rht_same_table(t2, hash_key2)) {
				/* It is already moved to new table */
				t2 = rcu_dereference(t2->new_tbl);
				idx2 = hash_key2 & t2->mask;
				idx2 |= IP_VS_RHT_TABLE_ID_MASK;
			}
		} else {
			idx2 = idx;
		}
	}

	head = t->buckets + (hash_key & t->mask);
	head2 = use2 ? t2->buckets + (hash_key2 & t2->mask) : head;

	local_bh_disable();
	/* Do not touch seqcount, this is a safe operation */

	/* Lock both buckets in idx order to avoid ABBA deadlock */
	if (idx <= idx2) {
		hlist_bl_lock(head);
		if (head != head2)
			hlist_bl_lock(head2);
	} else {
		hlist_bl_lock(head2);
		hlist_bl_lock(head);
	}
	if (!new_hash) {
		/* Ensure hash_key is read under lock */
		hash_key_new = READ_ONCE(cp->hn0.hash_key);
		hash_key_new2 = READ_ONCE(cp->hn1.hash_key);
		/* Hash changed ? Drop locks and redo table/bucket lookup */
		if (hash_key != hash_key_new ||
		    (hash_key2 != hash_key_new2 && use2)) {
			if (head != head2)
				hlist_bl_unlock(head2);
			hlist_bl_unlock(head);
			local_bh_enable();
			hash_key = hash_key_new;
			hash_key2 = hash_key_new2;
			goto retry;
		}
	}
	*head_ret = head;
	*head2_ret = head2;
}
151 
/* Drop the bucket bit lock(s) taken by conn_tab_lock() and re-enable BHs */
static inline void conn_tab_unlock(struct hlist_bl_head *head,
				   struct hlist_bl_head *head2)
{
	/* Second bucket is locked only when it differs from the first */
	if (head2 != head)
		hlist_bl_unlock(head2);
	hlist_bl_unlock(head);
	local_bh_enable();
}
160 
161 static void ip_vs_conn_expire(struct timer_list *t);
162 
/*
 *	Returns hash value for IPVS connection entry
 *
 *	Mixes protocol, remote addr/port and local addr/port through siphash
 *	keyed with the per-table random key (t->hash_key), so every table
 *	instance hashes the same conn differently.
 */
static u32 ip_vs_conn_hashkey(struct ip_vs_rht *t, int af, unsigned int proto,
			      const union nf_inet_addr *addr, __be16 port,
			      const union nf_inet_addr *laddr, __be16 lport)
{
	/* Pack proto and remote port into the first 64-bit input word */
	u64 a = (u32)proto << 16 | (__force u32)port;
	u64 d;

#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6) {
		/* Fold all eight 32-bit address words into four u64 inputs */
		u64 b = (u64)addr->all[0] << 32 | addr->all[1];
		u64 c = (u64)addr->all[2] << 32 | addr->all[3];

		a |= (u64)laddr->all[2] << 32 ^ (__force u32)lport;
		c ^= laddr->all[1];
		d = (u64)laddr->all[0] << 32 | laddr->all[3];
		return (u32)siphash_4u64(a, b, c, d, &t->hash_key);
	}
#endif
	/* IPv4: only all[0] of each address is meaningful */
	a |= (u64)addr->all[0] << 32;
	d = (u64)laddr->all[0] << 32 | (__force u32)lport;
	return (u32)siphash_2u64(a, d, &t->hash_key);
}
188 
/* Compute the conn hash for a lookup parameter block.
 * With @inverse the client and virtual endpoints swap roles.
 * Persistence engines may override the hashing entirely via hashkey_raw.
 */
static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
					     struct ip_vs_rht *t, bool inverse)
{
	const union nf_inet_addr *addr, *laddr;
	__be16 port, lport;

	/* PE-provided raw hashing takes precedence */
	if (p->pe_data && p->pe->hashkey_raw)
		return p->pe->hashkey_raw(p, t, inverse);

	if (unlikely(inverse)) {
		addr = p->vaddr;
		port = p->vport;
		laddr = p->caddr;
		lport = p->cport;
	} else {
		addr = p->caddr;
		port = p->cport;
		laddr = p->vaddr;
		lport = p->vport;
	}

	return ip_vs_conn_hashkey(t, p->af, p->protocol, addr, port, laddr,
				  lport);
}
215 
/* Compute the hash for an existing conn.
 * @out selects the direction: false hashes the in-direction tuple
 * (caddr:cport -> vaddr:vport), true the out-direction tuple
 * (daddr:dport -> caddr:cport).
 */
static unsigned int ip_vs_conn_hashkey_conn(struct ip_vs_rht *t,
					    const struct ip_vs_conn *cp,
					    bool out)
{
	struct ip_vs_conn_param p;

	if (out)
		ip_vs_conn_fill_param(cp->ipvs, cp->af, cp->protocol,
				      &cp->daddr, cp->dport, &cp->caddr,
				      cp->cport, &p);
	else
		ip_vs_conn_fill_param(cp->ipvs, cp->af, cp->protocol,
				      &cp->caddr, cp->cport, &cp->vaddr,
				      cp->vport, &p);

	/* Propagate persistence-engine data so PE hashing can apply */
	if (cp->pe) {
		p.pe = cp->pe;
		p.pe_data = cp->pe_data;
		p.pe_data_len = cp->pe_data_len;
	}

	return ip_vs_conn_hashkey_param(&p, t, out);
}
239 
/*	Hashes ip_vs_conn in conn_tab
 *	returns bool success.
 *
 *	Links cp->hn0 (and cp->hn1 when double hashing) into the most recent
 *	table, takes a reference owned by the hash and may kick the resize
 *	worker when the table load exceeds u_thresh.
 *	Caller is expected to hold the RCU read lock.
 */
static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
{
	struct netns_ipvs *ipvs = cp->ipvs;
	struct hlist_bl_head *head, *head2;
	u32 hash_key, hash_key2;
	struct ip_vs_rht *t;
	u32 hash, hash2;
	bool use2;
	int ret;

	/* One-packet conns are never hashed */
	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
		return 0;

	/* New entries go into recent table */
	t = rcu_dereference(ipvs->conn_tab);
	t = rcu_dereference(t->new_tbl);

	hash = ip_vs_conn_hashkey_conn(t, cp, false);
	hash_key = ip_vs_rht_build_hash_key(t, hash);
	if (ip_vs_conn_use_hash2(cp)) {
		/* Second node hashes the out direction */
		hash2 = ip_vs_conn_hashkey_conn(t, cp, true);
		hash_key2 = ip_vs_rht_build_hash_key(t, hash2);
		use2 = true;
	} else {
		hash_key2 = hash_key;
		use2 = false;
	}

	conn_tab_lock(t, cp, hash_key, hash_key2, use2, true /* new_hash */,
		      &head, &head2);

	cp->flags |= IP_VS_CONN_F_HASHED;
	WRITE_ONCE(cp->hn0.hash_key, hash_key);
	WRITE_ONCE(cp->hn1.hash_key, hash_key2);
	/* The hash table owns one reference */
	refcount_inc(&cp->refcnt);
	hlist_bl_add_head_rcu(&cp->hn0.node, head);
	if (use2)
		hlist_bl_add_head_rcu(&cp->hn1.node, head2);

	conn_tab_unlock(head, head2);
	ret = 1;

	/* Schedule resizing if load increases */
	if (atomic_read(&ipvs->conn_count) > t->u_thresh &&
	    !test_and_set_bit(IP_VS_WORK_CONN_RESIZE, &ipvs->work_flags))
		mod_delayed_work(system_unbound_wq, &ipvs->conn_resize_work, 0);

	return ret;
}
292 
/* Try to unlink ip_vs_conn from conn_tab.
 * returns bool success.
 *
 * Succeeds only when we are the last reference holder (refcnt drops
 * from 1 to 0 atomically), so a conn in use stays hashed.
 */
static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
{
	struct netns_ipvs *ipvs = cp->ipvs;
	struct hlist_bl_head *head, *head2;
	u32 hash_key, hash_key2;
	struct ip_vs_rht *t;
	bool ret = false;
	bool use2;

	/* One-packet conns were never hashed, just drop the last ref */
	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
		return refcount_dec_if_one(&cp->refcnt);

	rcu_read_lock();

	t = rcu_dereference(ipvs->conn_tab);
	hash_key = READ_ONCE(cp->hn0.hash_key);
	hash_key2 = READ_ONCE(cp->hn1.hash_key);
	use2 = ip_vs_conn_use_hash2(cp);

	/* new_hash=false: keys are re-validated under the bucket lock */
	conn_tab_lock(t, cp, hash_key, hash_key2, use2, false /* new_hash */,
		      &head, &head2);

	if (cp->flags & IP_VS_CONN_F_HASHED) {
		/* Decrease refcnt and unlink conn only if we are last user */
		if (refcount_dec_if_one(&cp->refcnt)) {
			hlist_bl_del_rcu(&cp->hn0.node);
			if (use2)
				hlist_bl_del_rcu(&cp->hn1.node);
			cp->flags &= ~IP_VS_CONN_F_HASHED;
			ret = true;
		}
	}

	conn_tab_unlock(head, head2);

	rcu_read_unlock();

	return ret;
}
335 
336 
/*
 *  Gets ip_vs_conn associated with supplied parameters in the conn_tab.
 *  Called for pkts coming from OUTside-to-INside.
 *	p->caddr, p->cport: pkt source address (foreign host)
 *	p->vaddr, p->vport: pkt dest address (load balancer)
 *
 *  Walks up to two tables (old and new during resize); each bucket walk
 *  is retried via seqcount when a concurrent rehash may have moved conns.
 *  On hit, returns the conn with a reference taken; NULL otherwise.
 */
static inline struct ip_vs_conn *
__ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
{
	DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();
	struct netns_ipvs *ipvs = p->ipvs;
	struct ip_vs_conn_hnode *hn;
	struct hlist_bl_head *head;
	struct ip_vs_rht *t, *pt;
	struct hlist_bl_node *e;
	struct ip_vs_conn *cp;
	u32 hash, hash_key;

	rcu_read_lock();

	ip_vs_rht_for_each_table_rcu(ipvs->conn_tab, t, pt) {
		hash = ip_vs_conn_hashkey_param(p, t, false);
		hash_key = ip_vs_rht_build_hash_key(t, hash);
		ip_vs_rht_walk_bucket_rcu(t, hash_key, head) {
			hlist_bl_for_each_entry_rcu(hn, e, head, node) {
				/* dir 0 nodes only: in-direction tuple */
				if (READ_ONCE(hn->hash_key) != hash_key ||
				    hn->dir != 0)
					continue;
				cp = ip_vs_hn0_to_conn(hn);
				/* The !cport XOR term matches a zero p->cport
				 * only against NO_CPORT conns and vice versa.
				 */
				if (p->cport == cp->cport &&
				    p->vport == cp->vport && cp->af == p->af &&
				    ip_vs_addr_equal(p->af, p->caddr,
						     &cp->caddr) &&
				    ip_vs_addr_equal(p->af, p->vaddr,
						     &cp->vaddr) &&
				    (!p->cport ^
				     (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
				    p->protocol == cp->protocol) {
					if (__ip_vs_conn_get(cp)) {
						/* HIT */
						rcu_read_unlock();
						return cp;
					}
				}
			}
		}
	}

	rcu_read_unlock();

	return NULL;
}
389 
ip_vs_conn_in_get(const struct ip_vs_conn_param * p)390 struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
391 {
392 	struct ip_vs_conn *cp;
393 
394 	cp = __ip_vs_conn_in_get(p);
395 	if (!cp) {
396 		struct netns_ipvs *ipvs = p->ipvs;
397 		int af_id = ip_vs_af_index(p->af);
398 
399 		if (atomic_read(&ipvs->no_cport_conns[af_id])) {
400 			struct ip_vs_conn_param cport_zero_p = *p;
401 
402 			cport_zero_p.cport = 0;
403 			cp = __ip_vs_conn_in_get(&cport_zero_p);
404 		}
405 	}
406 
407 	IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n",
408 		      ip_vs_proto_name(p->protocol),
409 		      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
410 		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
411 		      cp ? "hit" : "not hit");
412 
413 	return cp;
414 }
415 
416 static int
ip_vs_conn_fill_param_proto(struct netns_ipvs * ipvs,int af,const struct sk_buff * skb,const struct ip_vs_iphdr * iph,struct ip_vs_conn_param * p)417 ip_vs_conn_fill_param_proto(struct netns_ipvs *ipvs,
418 			    int af, const struct sk_buff *skb,
419 			    const struct ip_vs_iphdr *iph,
420 			    struct ip_vs_conn_param *p)
421 {
422 	__be16 _ports[2], *pptr;
423 
424 	pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports);
425 	if (pptr == NULL)
426 		return 1;
427 
428 	if (likely(!ip_vs_iph_inverse(iph)))
429 		ip_vs_conn_fill_param(ipvs, af, iph->protocol, &iph->saddr,
430 				      pptr[0], &iph->daddr, pptr[1], p);
431 	else
432 		ip_vs_conn_fill_param(ipvs, af, iph->protocol, &iph->daddr,
433 				      pptr[1], &iph->saddr, pptr[0], p);
434 	return 0;
435 }
436 
437 struct ip_vs_conn *
ip_vs_conn_in_get_proto(struct netns_ipvs * ipvs,int af,const struct sk_buff * skb,const struct ip_vs_iphdr * iph)438 ip_vs_conn_in_get_proto(struct netns_ipvs *ipvs, int af,
439 			const struct sk_buff *skb,
440 			const struct ip_vs_iphdr *iph)
441 {
442 	struct ip_vs_conn_param p;
443 
444 	if (ip_vs_conn_fill_param_proto(ipvs, af, skb, iph, &p))
445 		return NULL;
446 
447 	return ip_vs_conn_in_get(&p);
448 }
449 EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto);
450 
/* Get reference to connection template
 *
 * Matches either via the persistence engine's ct_match() (when PE data is
 * present) or by comparing the full tuple.  Templates with dport 0xffff
 * are being dropped and are skipped.  Returns the template with a
 * reference taken, or NULL.
 */
struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
{
	DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();
	struct netns_ipvs *ipvs = p->ipvs;
	struct ip_vs_conn_hnode *hn;
	struct hlist_bl_head *head;
	struct ip_vs_rht *t, *pt;
	struct hlist_bl_node *e;
	struct ip_vs_conn *cp;
	u32 hash, hash_key;

	rcu_read_lock();

	ip_vs_rht_for_each_table_rcu(ipvs->conn_tab, t, pt) {
		hash = ip_vs_conn_hashkey_param(p, t, false);
		hash_key = ip_vs_rht_build_hash_key(t, hash);
		ip_vs_rht_walk_bucket_rcu(t, hash_key, head) {
			hlist_bl_for_each_entry_rcu(hn, e, head, node) {
				if (READ_ONCE(hn->hash_key) != hash_key ||
				    hn->dir != 0)
					continue;
				cp = ip_vs_hn0_to_conn(hn);
				/* PE-based matching bypasses tuple compare */
				if (unlikely(p->pe_data && p->pe->ct_match)) {
					if (p->pe == cp->pe &&
					    p->pe->ct_match(p, cp) &&
					    __ip_vs_conn_get(cp))
						goto out;
					continue;
				}
				if (cp->af == p->af &&
				    ip_vs_addr_equal(p->af, p->caddr,
						     &cp->caddr) &&
				    /* protocol should only be IPPROTO_IP if
				     * p->vaddr is a fwmark
				     */
				    ip_vs_addr_equal(p->protocol == IPPROTO_IP ?
						     AF_UNSPEC : p->af,
						     p->vaddr, &cp->vaddr) &&
				    p->vport == cp->vport &&
				    p->cport == cp->cport &&
				    cp->flags & IP_VS_CONN_F_TEMPLATE &&
				    p->protocol == cp->protocol &&
				    cp->dport != htons(0xffff)) {
					if (__ip_vs_conn_get(cp))
						goto out;
				}
			}
		}

	}
	cp = NULL;

  out:
	rcu_read_unlock();

	IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
		      ip_vs_proto_name(p->protocol),
		      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
		      cp ? "hit" : "not hit");

	return cp;
}
515 
/* Gets ip_vs_conn associated with supplied parameters in the conn_tab.
 * Called for pkts coming from inside-to-OUTside.
 *	p->caddr, p->cport: pkt source address (inside host)
 *	p->vaddr, p->vport: pkt dest address (foreign host)
 *
 * The expected reply source depends on the forwarding method: for MASQ
 * replies come from the real server (daddr:dport), otherwise from the
 * virtual address (vaddr:vport).  Returns the conn with a reference
 * taken, or NULL.
 */
struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
{
	DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();
	struct netns_ipvs *ipvs = p->ipvs;
	const union nf_inet_addr *saddr;
	struct ip_vs_conn_hnode *hn;
	struct hlist_bl_head *head;
	struct ip_vs_rht *t, *pt;
	struct hlist_bl_node *e;
	struct ip_vs_conn *cp;
	u32 hash, hash_key;
	__be16 sport;

	rcu_read_lock();

	ip_vs_rht_for_each_table_rcu(ipvs->conn_tab, t, pt) {
		/* inverse=true: hash by the out-direction tuple */
		hash = ip_vs_conn_hashkey_param(p, t, true);
		hash_key = ip_vs_rht_build_hash_key(t, hash);
		ip_vs_rht_walk_bucket_rcu(t, hash_key, head) {
			hlist_bl_for_each_entry_rcu(hn, e, head, node) {
				/* dir can be 0 for DR/TUN */
				if (READ_ONCE(hn->hash_key) != hash_key)
					continue;
				cp = ip_vs_hn_to_conn(hn);
				if (p->vport != cp->cport)
					continue;

				/* Pick the expected reply source endpoint */
				if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
					sport = cp->vport;
					saddr = &cp->vaddr;
				} else {
					sport = cp->dport;
					saddr = &cp->daddr;
				}

				if (p->cport == sport && cp->af == p->af &&
				    ip_vs_addr_equal(p->af, p->vaddr,
						     &cp->caddr) &&
				    ip_vs_addr_equal(p->af, p->caddr, saddr) &&
				    p->protocol == cp->protocol) {
					if (__ip_vs_conn_get(cp))
						goto out;
				}
			}
		}
	}
	cp = NULL;

out:
	rcu_read_unlock();

	IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
		      ip_vs_proto_name(p->protocol),
		      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
		      cp ? "hit" : "not hit");

	return cp;
}
579 
580 struct ip_vs_conn *
ip_vs_conn_out_get_proto(struct netns_ipvs * ipvs,int af,const struct sk_buff * skb,const struct ip_vs_iphdr * iph)581 ip_vs_conn_out_get_proto(struct netns_ipvs *ipvs, int af,
582 			 const struct sk_buff *skb,
583 			 const struct ip_vs_iphdr *iph)
584 {
585 	struct ip_vs_conn_param p;
586 
587 	if (ip_vs_conn_fill_param_proto(ipvs, af, skb, iph, &p))
588 		return NULL;
589 
590 	return ip_vs_conn_out_get(&p);
591 }
592 EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto);
593 
594 /*
595  *      Put back the conn and restart its timer with its timeout
596  */
__ip_vs_conn_put_timer(struct ip_vs_conn * cp)597 static void __ip_vs_conn_put_timer(struct ip_vs_conn *cp)
598 {
599 	unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ?
600 		0 : cp->timeout;
601 	mod_timer(&cp->timer, jiffies+t);
602 
603 	__ip_vs_conn_put(cp);
604 }
605 
ip_vs_conn_put(struct ip_vs_conn * cp)606 void ip_vs_conn_put(struct ip_vs_conn *cp)
607 {
608 	if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) &&
609 	    (refcount_read(&cp->refcnt) == 1) &&
610 	    !timer_pending(&cp->timer))
611 		/* expire connection immediately */
612 		ip_vs_conn_expire(&cp->timer);
613 	else
614 		__ip_vs_conn_put_timer(cp);
615 }
616 
/*
 *	Fill a no_client_port connection with a client port number
 *
 *	Rehashes the conn into the bucket for the now-complete tuple.
 *	With double hashing the dir 1 (out) node is moved first, then the
 *	dir 0 (in) node, so in-direction lookups never race with a
 *	half-updated pair ("No packets from inside" below).  Readers are
 *	informed via the per-bucket seqcount so they retry lookups that
 *	race with the move.
 */
void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
{
	struct hlist_bl_head *head, *head2, *head_new;
	bool use2 = ip_vs_conn_use_hash2(cp);
	struct netns_ipvs *ipvs = cp->ipvs;
	int af_id = ip_vs_af_index(cp->af);
	u32 hash_r = 0, hash_key_r = 0;
	struct ip_vs_rht *t, *tp, *t2;
	struct ip_vs_conn_hnode *hn;
	u32 hash_key, hash_key_new;
	struct ip_vs_conn_param p;
	bool by_me = false;
	int ntbl;
	int dir;

	/* No packets from inside, so we can do it in 2 steps. */
	dir = use2 ? 1 : 0;

next_dir:
	/* Build the tuple for this direction with the new cport */
	if (dir)
		ip_vs_conn_fill_param(ipvs, cp->af, cp->protocol, &cp->daddr,
				      cp->dport, &cp->caddr, cport, &p);
	else
		ip_vs_conn_fill_param(ipvs, cp->af, cp->protocol, &cp->caddr,
				      cport, &cp->vaddr, cp->vport, &p);
	hn = dir ? &cp->hn1 : &cp->hn0;
	ntbl = 0;

	/* Attempt to rehash cp safely, by informing seqcount readers */
	t = rcu_dereference(ipvs->conn_tab);
	hash_key = READ_ONCE(hn->hash_key);
	tp = NULL;

retry:
	/* Moved to new table ? */
	if (!ip_vs_rht_same_table(t, hash_key)) {
		t = rcu_dereference(t->new_tbl);
		ntbl++;
		/* We are lost? Give up; undo our cport fill so the
		 * conn stays consistent.
		 */
		if (ntbl >= 2) {
			spin_lock_bh(&cp->lock);
			if (cp->flags & IP_VS_CONN_F_NO_CPORT && by_me)
				cp->cport = 0;
			/* hn1 will be rehashed on next packet */
			spin_unlock_bh(&cp->lock);
			IP_VS_ERR_RL("%s(): Too many ht changes for dir %d\n",
				     __func__, dir);
			return;
		}
	}

	/* Rehashing during resize? Use the recent table for adds */
	t2 = rcu_dereference(t->new_tbl);
	/* Calc new hash once per table */
	if (tp != t2) {
		hash_r = ip_vs_conn_hashkey_param(&p, t2, dir);
		hash_key_r = ip_vs_rht_build_hash_key(t2, hash_r);
		tp = t2;
	}
	head = t->buckets + (hash_key & t->mask);
	head2 = t2->buckets + (hash_key_r & t2->mask);
	head_new = head2;

	/* Address order gives a safe lock order within the same table */
	if (head > head2 && t == t2)
		swap(head, head2);

	/* Protect the cp->flags modification */
	spin_lock_bh(&cp->lock);

	/* Lock seqcount only for the old bucket, even if we are on new table
	 * because it affects the del operation, not the adding.
	 */
	spin_lock(&t->lock[hash_key & t->lock_mask].l);
	preempt_disable_nested();
	write_seqcount_begin(&t->seqc[hash_key & t->seqc_mask]);

	/* Lock buckets in same (increasing) order */
	hlist_bl_lock(head);
	if (head != head2)
		hlist_bl_lock(head2);

	/* Ensure hash_key is read under lock */
	hash_key_new = READ_ONCE(hn->hash_key);
	/* Racing with another rehashing ? Unwind locks and retry */
	if (unlikely(hash_key != hash_key_new)) {
		if (head != head2)
			hlist_bl_unlock(head2);
		hlist_bl_unlock(head);
		write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]);
		preempt_enable_nested();
		spin_unlock(&t->lock[hash_key & t->lock_mask].l);
		spin_unlock_bh(&cp->lock);
		hash_key = hash_key_new;
		goto retry;
	}

	/* Fill cport once, even if multiple packets try to do it */
	if (cp->flags & IP_VS_CONN_F_NO_CPORT && (!cp->cport || by_me)) {
		/* If we race with resizing make sure cport is set for dir 1 */
		if (!cp->cport) {
			cp->cport = cport;
			by_me = true;
		}
		/* NO_CPORT is cleared only in the final (dir 0) step */
		if (!dir) {
			atomic_dec(&ipvs->no_cport_conns[af_id]);
			cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
		}
		/* We do not recalc hash_key_r under lock, we assume the
		 * parameters in cp do not change, i.e. cport is
		 * the only possible change.
		 */
		WRITE_ONCE(hn->hash_key, hash_key_r);
		if (!use2)
			WRITE_ONCE(cp->hn1.hash_key, hash_key_r);
		/* For dir=1 we do not check in flags if hn is already
		 * rehashed but this check will do it.
		 */
		if (head != head2) {
			hlist_bl_del_rcu(&hn->node);
			hlist_bl_add_head_rcu(&hn->node, head_new);
		}
	}

	if (head != head2)
		hlist_bl_unlock(head2);
	hlist_bl_unlock(head);
	write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]);
	preempt_enable_nested();
	spin_unlock(&t->lock[hash_key & t->lock_mask].l);

	spin_unlock_bh(&cp->lock);
	/* Second pass handles dir 0, but only if we won the fill race */
	if (dir-- && by_me)
		goto next_dir;
}
754 
755 /* Get default load factor to map conn_count/u_thresh to t->size */
ip_vs_conn_default_load_factor(struct netns_ipvs * ipvs)756 static int ip_vs_conn_default_load_factor(struct netns_ipvs *ipvs)
757 {
758 	int factor;
759 
760 	if (net_eq(ipvs->net, &init_net))
761 		factor = -3;
762 	else
763 		factor = -1;
764 	/* Double hashing adds twice more nodes for NAT */
765 	factor--;
766 	return factor;
767 }
768 
769 /* Get the desired conn_tab size */
ip_vs_conn_desired_size(struct netns_ipvs * ipvs,struct ip_vs_rht * t,int lfactor)770 int ip_vs_conn_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t,
771 			    int lfactor)
772 {
773 	return ip_vs_rht_desired_size(ipvs, t, atomic_read(&ipvs->conn_count),
774 				      lfactor, IP_VS_CONN_TAB_MIN_BITS,
775 				      ip_vs_conn_tab_bits);
776 }
777 
778 /* Allocate conn_tab */
ip_vs_conn_tab_alloc(struct netns_ipvs * ipvs,int buckets,int lfactor)779 struct ip_vs_rht *ip_vs_conn_tab_alloc(struct netns_ipvs *ipvs, int buckets,
780 				       int lfactor)
781 {
782 	struct ip_vs_rht *t;
783 	int scounts, locks;
784 
785 	/* scounts: affects readers during resize */
786 	scounts = clamp(buckets >> 6, 1, 256);
787 	/* locks: based on parallel IP_VS_CONN_F_NO_CPORT operations + resize */
788 	locks = clamp(8, 1, scounts);
789 
790 	t = ip_vs_rht_alloc(buckets, scounts, locks);
791 	if (!t)
792 		return NULL;
793 	t->lfactor = lfactor;
794 	ip_vs_rht_set_thresholds(t, t->size, lfactor, IP_VS_CONN_TAB_MIN_BITS,
795 				 ip_vs_conn_tab_bits);
796 	return t;
797 }
798 
/* conn_tab resizer work
 *
 * Grows or shrinks conn_tab to the desired size: allocates the new table,
 * publishes it as t->new_tbl (so new conns go there), then migrates each
 * old bucket under its seqcount/bit lock, and finally installs the new
 * table as ipvs->conn_tab and frees the old one.  Long chains are moved
 * in bounded steps to keep lock hold times short.  Re-queues itself to
 * keep monitoring for shrink opportunities.
 */
static void conn_resize_work_handler(struct work_struct *work)
{
	struct hlist_bl_head *head, *head2;
	unsigned int resched_score = 0;
	struct hlist_bl_node *cn, *nn;
	struct ip_vs_rht *t, *t_new;
	struct ip_vs_conn_hnode *hn;
	struct netns_ipvs *ipvs;
	struct ip_vs_conn *cp;
	bool more_work = false;
	u32 hash, hash_key;
	int limit = 0;
	int new_size;
	int lfactor;
	u32 bucket;

	ipvs = container_of(work, struct netns_ipvs, conn_resize_work.work);

	/* Allow work to be queued again */
	clear_bit(IP_VS_WORK_CONN_RESIZE, &ipvs->work_flags);
	t = rcu_dereference_protected(ipvs->conn_tab, 1);
	/* Do nothing if table is removed */
	if (!t)
		goto out;
	/* New table needs to be registered? BUG! */
	if (t != rcu_dereference_protected(t->new_tbl, 1))
		goto out;

	lfactor = sysctl_conn_lfactor(ipvs);
	/* Should we resize ? */
	new_size = ip_vs_conn_desired_size(ipvs, t, lfactor);
	if (new_size == t->size && lfactor == t->lfactor)
		goto out;

	t_new = ip_vs_conn_tab_alloc(ipvs, new_size, lfactor);
	if (!t_new) {
		/* Retry soon via the requeue below */
		more_work = true;
		goto out;
	}
	/* Flip the table_id */
	t_new->table_id = t->table_id ^ IP_VS_RHT_TABLE_ID_MASK;

	rcu_assign_pointer(t->new_tbl, t_new);

	/* Wait RCU readers to see the new table, we do not want new
	 * conns to go into old table and to be left there.
	 */
	synchronize_rcu();

	ip_vs_rht_for_each_bucket(t, bucket, head) {
same_bucket:
		if (++limit >= 16) {
			if (resched_score >= 100) {
				resched_score = 0;
				cond_resched();
			}
			limit = 0;
		}
		if (hlist_bl_empty(head)) {
			resched_score++;
			continue;
		}
		/* Preemption calls ahead... */
		resched_score = 0;

		/* seqcount_t usage considering PREEMPT_RT rules:
		 * - other writers (SoftIRQ) => serialize with spin_lock_bh
		 * - readers (SoftIRQ) => disable BHs
		 * - readers (processes) => preemption should be disabled
		 */
		spin_lock_bh(&t->lock[bucket & t->lock_mask].l);
		preempt_disable_nested();
		write_seqcount_begin(&t->seqc[bucket & t->seqc_mask]);
		hlist_bl_lock(head);

		hlist_bl_for_each_entry_safe(hn, cn, nn, head, node) {
			cp = ip_vs_hn_to_conn(hn);
			hash = ip_vs_conn_hashkey_conn(t_new, cp, hn->dir);
			hash_key = ip_vs_rht_build_hash_key(t_new, hash);

			/* NOTE(review): indexes with hash, not hash_key —
			 * presumably build_hash_key only sets bits above
			 * t_new->mask, making the two equivalent; confirm.
			 */
			head2 = t_new->buckets + (hash & t_new->mask);
			hlist_bl_lock(head2);
			/* t_new->seqc are not used at this stage, we race
			 * only with add/del, so only lock the bucket.
			 */
			hlist_bl_del_rcu(&hn->node);
			WRITE_ONCE(hn->hash_key, hash_key);
			/* Keep both hash keys in sync if no double hashing */
			if (!ip_vs_conn_use_hash2(cp))
				WRITE_ONCE(cp->hn1.hash_key, hash_key);
			hlist_bl_add_head_rcu(&hn->node, head2);
			hlist_bl_unlock(head2);
			/* Too long chain? Do it in steps */
			if (++limit >= 64)
				break;
		}

		hlist_bl_unlock(head);
		write_seqcount_end(&t->seqc[bucket & t->seqc_mask]);
		preempt_enable_nested();
		spin_unlock_bh(&t->lock[bucket & t->lock_mask].l);
		if (limit >= 64)
			goto same_bucket;
	}

	rcu_assign_pointer(ipvs->conn_tab, t_new);
	/* Inform readers that new table is installed */
	smp_mb__before_atomic();
	atomic_inc(&ipvs->conn_tab_changes);

	/* RCU readers should not see more than two tables in chain.
	 * To prevent new table to be attached wait here instead of
	 * freeing the old table in RCU callback.
	 */
	synchronize_rcu();
	ip_vs_rht_free(t);

out:
	/* Monitor if we need to shrink table */
	queue_delayed_work(system_unbound_wq, &ipvs->conn_resize_work,
			   more_work ? 1 : 2 * HZ);
}
922 
923 /*
924  *	Bind a connection entry with the corresponding packet_xmit.
925  *	Called by ip_vs_conn_new.
926  */
ip_vs_bind_xmit(struct ip_vs_conn * cp)927 static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
928 {
929 	switch (IP_VS_FWD_METHOD(cp)) {
930 	case IP_VS_CONN_F_MASQ:
931 		cp->packet_xmit = ip_vs_nat_xmit;
932 		break;
933 
934 	case IP_VS_CONN_F_TUNNEL:
935 #ifdef CONFIG_IP_VS_IPV6
936 		if (cp->daf == AF_INET6)
937 			cp->packet_xmit = ip_vs_tunnel_xmit_v6;
938 		else
939 #endif
940 			cp->packet_xmit = ip_vs_tunnel_xmit;
941 		break;
942 
943 	case IP_VS_CONN_F_DROUTE:
944 		cp->packet_xmit = ip_vs_dr_xmit;
945 		break;
946 
947 	case IP_VS_CONN_F_LOCALNODE:
948 		cp->packet_xmit = ip_vs_null_xmit;
949 		break;
950 
951 	case IP_VS_CONN_F_BYPASS:
952 		cp->packet_xmit = ip_vs_bypass_xmit;
953 		break;
954 	}
955 }
956 
#ifdef CONFIG_IP_VS_IPV6
/* IPv6 counterpart of ip_vs_bind_xmit() */
static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp)
{
	/* Select the transmitter matching the forwarding method */
	switch (IP_VS_FWD_METHOD(cp)) {
	case IP_VS_CONN_F_LOCALNODE:
		cp->packet_xmit = ip_vs_null_xmit;
		break;

	case IP_VS_CONN_F_MASQ:
		cp->packet_xmit = ip_vs_nat_xmit_v6;
		break;

	case IP_VS_CONN_F_DROUTE:
		cp->packet_xmit = ip_vs_dr_xmit_v6;
		break;

	case IP_VS_CONN_F_BYPASS:
		cp->packet_xmit = ip_vs_bypass_xmit_v6;
		break;

	case IP_VS_CONN_F_TUNNEL:
		/* Tunnel family may differ from the conn family */
		if (cp->daf == AF_INET6)
			cp->packet_xmit = ip_vs_tunnel_xmit_v6;
		else
			cp->packet_xmit = ip_vs_tunnel_xmit;
		break;
	}
}
#endif
986 
987 
ip_vs_dest_totalconns(struct ip_vs_dest * dest)988 static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
989 {
990 	return atomic_read(&dest->activeconns)
991 		+ atomic_read(&dest->inactconns);
992 }
993 
/*
 *	Bind a connection entry with a virtual service destination
 *	Called just after a new connection entry is created.
 *	Takes a reference on @dest, merges dest->conn_flags into cp->flags,
 *	updates the dest connection counters and may mark it overloaded.
 */
static inline void
ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
{
	unsigned int conn_flags;
	__u32 flags;

	/* if dest is NULL, then return directly */
	if (!dest)
		return;

	/* Increase the refcnt counter of the dest */
	ip_vs_dest_hold(dest);

	conn_flags = atomic_read(&dest->conn_flags);
	/* one-packet scheduling is only honoured for UDP */
	if (cp->protocol != IPPROTO_UDP)
		conn_flags &= ~IP_VS_CONN_F_ONE_PACKET;
	flags = cp->flags;
	/* Bind with the destination and its corresponding transmitter */
	if (flags & IP_VS_CONN_F_SYNC) {
		/* if the connection is not template and is created
		 * by sync, preserve the activity flag.
		 */
		if (!(flags & IP_VS_CONN_F_TEMPLATE))
			conn_flags &= ~IP_VS_CONN_F_INACTIVE;
		/* connections inherit forwarding method from dest */
		flags &= ~(IP_VS_CONN_F_FWD_MASK | IP_VS_CONN_F_NOOUTPUT);
	}
	flags |= conn_flags;
	cp->flags = flags;
	cp->dest = dest;

	IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d "
		      "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
		      "dest->refcnt:%d\n",
		      ip_vs_proto_name(cp->protocol),
		      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
		      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
		      IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport),
		      ip_vs_fwd_tag(cp), cp->state,
		      cp->flags, refcount_read(&cp->refcnt),
		      refcount_read(&dest->refcnt));

	/* Update the connection counters */
	if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
		/* It is a normal connection, so modify the counters
		 * according to the flags, later the protocol can
		 * update them on state change
		 */
		if (!(flags & IP_VS_CONN_F_INACTIVE))
			atomic_inc(&dest->activeconns);
		else
			atomic_inc(&dest->inactconns);
	} else {
		/* It is a persistent connection/template, so increase
		   the persistent connection counter */
		atomic_inc(&dest->persistconns);
	}

	/* Mark the dest overloaded once the upper threshold is reached */
	if (dest->u_threshold != 0 &&
	    ip_vs_dest_totalconns(dest) >= dest->u_threshold)
		dest->flags |= IP_VS_DEST_F_OVERLOAD;
}
1060 
1061 
/*
 * Check if there is a destination for the connection, if so
 * bind the connection to the destination.
 * Called for conns created by sync that arrived without a known dest;
 * binding is done under cp->lock so it races safely with other binders.
 */
void ip_vs_try_bind_dest(struct ip_vs_conn *cp)
{
	struct ip_vs_dest *dest;

	rcu_read_lock();

	/* This function is only invoked by the synchronization code. We do
	 * not currently support heterogeneous pools with synchronization,
	 * so we can make the assumption that the svc_af is the same as the
	 * dest_af
	 */
	dest = ip_vs_find_dest(cp->ipvs, cp->af, cp->af, &cp->daddr,
			       cp->dport, &cp->vaddr, cp->vport,
			       cp->protocol, cp->fwmark, cp->flags);
	if (dest) {
		struct ip_vs_proto_data *pd;

		spin_lock_bh(&cp->lock);
		/* someone else bound a dest first - nothing to do */
		if (cp->dest) {
			spin_unlock_bh(&cp->lock);
			rcu_read_unlock();
			return;
		}

		/* Applications work depending on the forwarding method
		 * but better to reassign them always when binding dest */
		if (cp->app)
			ip_vs_unbind_app(cp);

		ip_vs_bind_dest(cp, dest);
		spin_unlock_bh(&cp->lock);

		/* Update its packet transmitter */
		cp->packet_xmit = NULL;
#ifdef CONFIG_IP_VS_IPV6
		if (cp->af == AF_INET6)
			ip_vs_bind_xmit_v6(cp);
		else
#endif
			ip_vs_bind_xmit(cp);

		/* re-attach the protocol application helper, if any */
		pd = ip_vs_proto_data_get(cp->ipvs, cp->protocol);
		if (pd && atomic_read(&pd->appcnt))
			ip_vs_bind_app(cp, pd->pp);
	}
	rcu_read_unlock();
}
1113 
1114 
/*
 *	Unbind a connection entry with its VS destination
 *	Called by the ip_vs_conn_expire function.
 *	Drops the connection's counters from the dest, possibly clears the
 *	overload flag, and releases the dest reference taken at bind time.
 */
static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
{
	struct ip_vs_dest *dest = cp->dest;

	if (!dest)
		return;

	IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d "
		      "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
		      "dest->refcnt:%d\n",
		      ip_vs_proto_name(cp->protocol),
		      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
		      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
		      IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport),
		      ip_vs_fwd_tag(cp), cp->state,
		      cp->flags, refcount_read(&cp->refcnt),
		      refcount_read(&dest->refcnt));

	/* Update the connection counters */
	if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
		/* It is a normal connection, so decrease the inactconns
		   or activeconns counter */
		if (cp->flags & IP_VS_CONN_F_INACTIVE) {
			atomic_dec(&dest->inactconns);
		} else {
			atomic_dec(&dest->activeconns);
		}
	} else {
		/* It is a persistent connection/template, so decrease
		   the persistent connection counter */
		atomic_dec(&dest->persistconns);
	}

	/* Clear the overload flag with hysteresis: below the lower
	 * threshold if set, else below 3/4 of the upper threshold,
	 * else unconditionally when no thresholds are configured.
	 */
	if (dest->l_threshold != 0) {
		if (ip_vs_dest_totalconns(dest) < dest->l_threshold)
			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
	} else if (dest->u_threshold != 0) {
		if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3)
			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
	} else {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
	}

	ip_vs_dest_put(dest);
}
1165 
/* Should templates bound to a quiescent (weight 0) dest be expired?
 * Controlled by the expire_quiescent_template sysctl; without
 * CONFIG_SYSCTL the knob does not exist, so the answer is always no.
 */
static int expire_quiescent_template(struct netns_ipvs *ipvs,
				     struct ip_vs_dest *dest)
{
#ifdef CONFIG_SYSCTL
	if (!ipvs->sysctl_expire_quiescent_template)
		return 0;
	return atomic_read(&dest->weight) == 0;
#else
	return 0;
#endif
}
1176 
/*
 *	Checking if the destination of a connection template is available.
 *	If available, return 1, otherwise invalidate this connection
 *	template and return 0.
 *	@cdest, when non-NULL, additionally requires the template to point
 *	at that specific dest.
 */
int ip_vs_check_template(struct ip_vs_conn *ct, struct ip_vs_dest *cdest)
{
	struct ip_vs_dest *dest = ct->dest;
	struct netns_ipvs *ipvs = ct->ipvs;

	/*
	 * Checking the dest server status.
	 */
	if ((dest == NULL) ||
	    !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
	    expire_quiescent_template(ipvs, dest) ||
	    (cdest && (dest != cdest))) {
		IP_VS_DBG_BUF(9, "check_template: dest not available for "
			      "protocol %s s:%s:%d v:%s:%d "
			      "-> d:%s:%d\n",
			      ip_vs_proto_name(ct->protocol),
			      IP_VS_DBG_ADDR(ct->af, &ct->caddr),
			      ntohs(ct->cport),
			      IP_VS_DBG_ADDR(ct->af, &ct->vaddr),
			      ntohs(ct->vport),
			      IP_VS_DBG_ADDR(ct->daf, &ct->daddr),
			      ntohs(ct->dport));

		/* Invalidate the connection template. Prefer to avoid
		 * rehashing, it will move it as first in chain, so use
		 * only dport as indication, it is not a hash key.
		 */
		ct->dport = htons(0xffff);

		/*
		 * Simply decrease the refcnt of the template,
		 * don't restart its timer.
		 */
		__ip_vs_conn_put(ct);
		return 0;
	}
	return 1;
}
1220 
/* RCU callback: final release of a connection after the grace period.
 * Drops the persistence-engine reference/data and frees the slab object.
 */
static void ip_vs_conn_rcu_free(struct rcu_head *head)
{
	struct ip_vs_conn *cp = container_of(head, struct ip_vs_conn,
					     rcu_head);

	ip_vs_pe_put(cp->pe);
	kfree(cp->pe_data);
	kmem_cache_free(ip_vs_conn_cachep, cp);
}
1230 
/* Try to delete connection while not holding reference.
 * Only the side that wins timer_delete() may expire the conn, so
 * concurrent deleters cannot run ip_vs_conn_expire() twice.
 */
static void ip_vs_conn_del(struct ip_vs_conn *cp)
{
	if (timer_delete(&cp->timer)) {
		/* Drop cp->control chain too */
		if (cp->control)
			cp->timeout = 0;
		ip_vs_conn_expire(&cp->timer);
	}
}
1241 
/* Try to delete connection while holding reference.
 * Like ip_vs_conn_del() but the caller's reference is consumed:
 * it is dropped before the explicit expire (timer won) or on its own
 * (timer was already running/stopped elsewhere).
 */
static void ip_vs_conn_del_put(struct ip_vs_conn *cp)
{
	if (timer_delete(&cp->timer)) {
		/* Drop cp->control chain too */
		if (cp->control)
			cp->timeout = 0;
		__ip_vs_conn_put(cp);
		ip_vs_conn_expire(&cp->timer);
	} else {
		__ip_vs_conn_put(cp);
	}
}
1255 
/* Timer handler: expire a connection.
 * Unlinks and frees the conn when it is no longer referenced and
 * controls nobody; otherwise re-arms itself for another 60s round.
 * cp->timeout == 0 signals that the whole control chain is being
 * dropped (see ip_vs_conn_del/ip_vs_conn_del_put).
 */
static void ip_vs_conn_expire(struct timer_list *t)
{
	struct ip_vs_conn *cp = timer_container_of(cp, t, timer);
	struct netns_ipvs *ipvs = cp->ipvs;

	/*
	 *	do I control anybody?
	 */
	if (atomic_read(&cp->n_control))
		goto expire_later;

	/* Unlink conn if not referenced anymore */
	if (likely(ip_vs_conn_unlink(cp))) {
		struct ip_vs_conn *ct = cp->control;

		/* delete the timer if it is activated by other users */
		timer_delete(&cp->timer);

		/* does anybody control me? */
		if (ct) {
			/* take a ref on the controller only when we are
			 * dropping the chain (timeout 0)
			 */
			bool has_ref = !cp->timeout && __ip_vs_conn_get(ct);

			ip_vs_control_del(cp);
			/* Drop CTL or non-assured TPL if not used anymore */
			if (has_ref && !atomic_read(&ct->n_control) &&
			    (!(ct->flags & IP_VS_CONN_F_TEMPLATE) ||
			     !(ct->state & IP_VS_CTPL_S_ASSURED))) {
				IP_VS_DBG(4, "drop controlling connection\n");
				ip_vs_conn_del_put(ct);
			} else if (has_ref) {
				__ip_vs_conn_put(ct);
			}
		}

		if ((cp->flags & IP_VS_CONN_F_NFCT) &&
		    !(cp->flags & IP_VS_CONN_F_ONE_PACKET)) {
			/* Do not access conntracks during subsys cleanup
			 * because nf_conntrack_find_get can not be used after
			 * conntrack cleanup for the net.
			 */
			smp_rmb();
			if (READ_ONCE(ipvs->enable))
				ip_vs_conn_drop_conntrack(cp);
		}

		if (unlikely(cp->app != NULL))
			ip_vs_unbind_app(cp);
		ip_vs_unbind_dest(cp);
		if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
			int af_id = ip_vs_af_index(cp->af);

			atomic_dec(&ipvs->no_cport_conns[af_id]);
		}
		/* ONE_PACKET conns are freed directly, others only after
		 * an RCU grace period so lockless readers stay safe
		 */
		if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
			ip_vs_conn_rcu_free(&cp->rcu_head);
		else
			call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free);
		atomic_dec(&ipvs->conn_count);
		return;
	}

  expire_later:
	IP_VS_DBG(7, "delayed: conn->refcnt=%d conn->n_control=%d\n",
		  refcount_read(&cp->refcnt),
		  atomic_read(&cp->n_control));

	/* still busy: keep the conn and retry in one minute */
	refcount_inc(&cp->refcnt);
	cp->timeout = 60*HZ;

	if (ipvs->sync_state & IP_VS_STATE_MASTER)
		ip_vs_sync_conn(ipvs, cp, sysctl_sync_threshold(ipvs));

	__ip_vs_conn_put_timer(cp);
}
1330 
1331 /* Modify timer, so that it expires as soon as possible.
1332  * Can be called without reference only if under RCU lock.
1333  * We can have such chain of conns linked with ->control: DATA->CTL->TPL
1334  * - DATA (eg. FTP) and TPL (persistence) can be present depending on setup
1335  * - cp->timeout=0 indicates all conns from chain should be dropped but
1336  * TPL is not dropped if in assured state
1337  */
ip_vs_conn_expire_now(struct ip_vs_conn * cp)1338 void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
1339 {
1340 	/* Using mod_timer_pending will ensure the timer is not
1341 	 * modified after the final timer_delete in ip_vs_conn_expire.
1342 	 */
1343 	if (timer_pending(&cp->timer) &&
1344 	    time_after(cp->timer.expires, jiffies))
1345 		mod_timer_pending(&cp->timer, jiffies);
1346 }
1347 
1348 
/*
 *	Create a new connection entry and hash it into the conn_tab.
 *	@p:       client-side parameters (af, protocol, caddr/cport,
 *	          vaddr/vport, optional persistence engine + data)
 *	@dest_af: address family of @daddr (may differ from p->af)
 *	@daddr:   real server address
 *	@dport:   real server port
 *	@flags:   IP_VS_CONN_F_* flags for the new conn
 *	@dest:    destination to bind to, may be NULL
 *	@fwmark:  firewall mark of the service (0 if none)
 *	Returns the new conn with refcnt 1 held by the caller, or NULL
 *	if allocation fails.
 */
struct ip_vs_conn *
ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af,
	       const union nf_inet_addr *daddr, __be16 dport, unsigned int flags,
	       struct ip_vs_dest *dest, __u32 fwmark)
{
	struct ip_vs_conn *cp;
	struct netns_ipvs *ipvs = p->ipvs;
	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->ipvs,
							   p->protocol);

	cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
	if (cp == NULL) {
		IP_VS_ERR_RL("%s(): no memory\n", __func__);
		return NULL;
	}

	/* two hash nodes: dir 0 (original) and dir 1 (reply) */
	INIT_HLIST_BL_NODE(&cp->hn0.node);
	INIT_HLIST_BL_NODE(&cp->hn1.node);
	timer_setup(&cp->timer, ip_vs_conn_expire, 0);
	cp->ipvs	   = ipvs;
	cp->hn0.dir	   = 0;
	cp->af		   = p->af;
	cp->hn1.dir	   = 1;
	cp->daf		   = dest_af;
	cp->protocol	   = p->protocol;
	ip_vs_addr_set(p->af, &cp->caddr, p->caddr);
	cp->cport	   = p->cport;
	/* proto should only be IPPROTO_IP if p->vaddr is a fwmark */
	ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
		       &cp->vaddr, p->vaddr);
	cp->vport	   = p->vport;
	ip_vs_addr_set(cp->daf, &cp->daddr, daddr);
	cp->dport          = dport;
	cp->flags	   = flags;
	cp->fwmark         = fwmark;
	/* only templates carry persistence-engine state */
	if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) {
		ip_vs_pe_get(p->pe);
		cp->pe = p->pe;
		cp->pe_data = p->pe_data;
		cp->pe_data_len = p->pe_data_len;
	} else {
		cp->pe = NULL;
		cp->pe_data = NULL;
		cp->pe_data_len = 0;
	}
	spin_lock_init(&cp->lock);

	/*
	 * Set the entry is referenced by the current thread before hashing
	 * it in the table, so that other thread run ip_vs_random_dropentry
	 * but cannot drop this entry.
	 */
	refcount_set(&cp->refcnt, 1);

	cp->control = NULL;
	atomic_set(&cp->n_control, 0);
	atomic_set(&cp->in_pkts, 0);

	cp->packet_xmit = NULL;
	cp->app = NULL;
	cp->app_data = NULL;
	/* reset struct ip_vs_seq */
	cp->in_seq.delta = 0;
	cp->out_seq.delta = 0;

	atomic_inc(&ipvs->conn_count);
	/* track per-family count of conns created with unknown cport */
	if (unlikely(flags & IP_VS_CONN_F_NO_CPORT)) {
		int af_id = ip_vs_af_index(cp->af);

		atomic_inc(&ipvs->no_cport_conns[af_id]);
	}

	/* Bind the connection with a destination server */
	cp->dest = NULL;
	ip_vs_bind_dest(cp, dest);

	/* Set its state and timeout */
	cp->state = 0;
	cp->old_state = 0;
	cp->timeout = 3*HZ;
	cp->sync_endtime = jiffies & ~3UL;

	/* Bind its packet transmitter */
#ifdef CONFIG_IP_VS_IPV6
	if (p->af == AF_INET6)
		ip_vs_bind_xmit_v6(cp);
	else
#endif
		ip_vs_bind_xmit(cp);

	if (unlikely(pd && atomic_read(&pd->appcnt)))
		ip_vs_bind_app(cp, pd->pp);

	/*
	 * Allow conntrack to be preserved. By default, conntrack
	 * is created and destroyed for every packet.
	 * Sometimes keeping conntrack can be useful for
	 * IP_VS_CONN_F_ONE_PACKET too.
	 */

	if (ip_vs_conntrack_enabled(ipvs))
		cp->flags |= IP_VS_CONN_F_NFCT;

	/* Hash it in the conn_tab finally */
	ip_vs_conn_hash(cp);

	return cp;
}
1460 
1461 /*
1462  *	/proc/net/ip_vs_conn entries
1463  */
1464 #ifdef CONFIG_PROC_FS
/* Cursor state for the /proc/net/ip_vs_conn* seq_file iterators */
struct ip_vs_iter_state {
	struct seq_net_private	p;		/* must be first for seq_file_net() */
	struct ip_vs_rht	*t;		/* hash table snapshot being walked */
	int			gen;		/* conn_tab_changes at walk start */
	u32			bucket;		/* current bucket index */
	unsigned int		skip_elems;	/* entries already emitted in bucket */
};
1472 
/* Position the iterator at the next dir-0 hash node, starting at
 * iter->bucket and skipping iter->skip_elems entries in that bucket.
 * Returns the node or NULL when the table end (or a table change)
 * is reached.  Caller holds the RCU read lock.
 */
static void *ip_vs_conn_array(struct seq_file *seq)
{
	struct ip_vs_iter_state *iter = seq->private;
	struct net *net = seq_file_net(seq);
	struct netns_ipvs *ipvs = net_ipvs(net);
	struct ip_vs_rht *t = iter->t;
	struct ip_vs_conn_hnode *hn;
	struct hlist_bl_node *e;
	int idx;

	if (!t)
		return NULL;
	for (idx = iter->bucket; idx < t->size; idx++) {
		unsigned int skip = 0;

		hlist_bl_for_each_entry_rcu(hn, e, &t->buckets[idx], node) {
			/* __ip_vs_conn_get() is not needed by
			 * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show
			 */
			/* stop if a moved entry led us into another table */
			if (!ip_vs_rht_same_table(t, READ_ONCE(hn->hash_key)))
				break;
			if (hn->dir != 0)
				continue;
			if (skip >= iter->skip_elems) {
				iter->bucket = idx;
				return hn;
			}

			++skip;
		}

		if (!(idx & 31)) {
			cond_resched_rcu();
			/* New table installed ? */
			if (iter->gen != atomic_read(&ipvs->conn_tab_changes))
				break;
		}
		iter->skip_elems = 0;
	}

	iter->bucket = idx;
	return NULL;
}
1516 
/* seq_file .start: take the RCU lock, snapshot the current table and
 * its generation counter, and resume the walk at *pos.
 */
static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct ip_vs_iter_state *iter = seq->private;
	struct net *net = seq_file_net(seq);
	struct netns_ipvs *ipvs = net_ipvs(net);

	rcu_read_lock();
	iter->gen = atomic_read(&ipvs->conn_tab_changes);
	smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */
	iter->t = rcu_dereference(ipvs->conn_tab);
	if (*pos == 0) {
		iter->skip_elems = 0;
		iter->bucket = 0;
		return SEQ_START_TOKEN;
	}

	return ip_vs_conn_array(seq);
}
1536 
/* seq_file .next: advance to the following dir-0 node, continuing in
 * the current chain if possible, otherwise moving to the next bucket.
 */
static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct ip_vs_iter_state *iter = seq->private;
	struct ip_vs_conn_hnode *hn = v;
	struct hlist_bl_node *e;
	struct ip_vs_rht *t;

	++*pos;
	if (v == SEQ_START_TOKEN)
		return ip_vs_conn_array(seq);

	t = iter->t;
	if (!t)
		return NULL;

	/* more on same hash chain? */
	hlist_bl_for_each_entry_continue_rcu(hn, e, node) {
		/* Our cursor was moved to new table ? */
		if (!ip_vs_rht_same_table(t, READ_ONCE(hn->hash_key)))
			break;
		if (hn->dir != 0)
			continue;
		iter->skip_elems++;
		return hn;
	}

	iter->skip_elems = 0;
	iter->bucket++;

	return ip_vs_conn_array(seq);
}
1568 
/* seq_file .stop: release the RCU lock taken in ip_vs_conn_seq_start() */
static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}
1574 
/* seq_file .show for /proc/net/ip_vs_conn: print one connection line
 * (or the header for SEQ_START_TOKEN), including optional PE name/data.
 */
static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
{

	if (v == SEQ_START_TOKEN)
		seq_puts(seq,
   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires PEName PEData\n");
	else {
		struct ip_vs_conn_hnode *hn = v;
		const struct ip_vs_conn *cp = ip_vs_hn0_to_conn(hn);
		/* " <pe name> <pe data>" or empty when no PE is attached */
		char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3];
		size_t len = 0;
		char dbuf[IP_VS_ADDRSTRLEN];

		if (cp->pe_data) {
			pe_data[0] = ' ';
			len = strlen(cp->pe->name);
			memcpy(pe_data + 1, cp->pe->name, len);
			pe_data[len + 1] = ' ';
			len += 2;
			len += cp->pe->show_pe_data(cp, pe_data + len);
		}
		pe_data[len] = '\0';

		/* dest address formatted separately: its family (daf) may
		 * differ from the client side family (af)
		 */
#ifdef CONFIG_IP_VS_IPV6
		if (cp->daf == AF_INET6)
			snprintf(dbuf, sizeof(dbuf), "%pI6", &cp->daddr.in6);
		else
#endif
			snprintf(dbuf, sizeof(dbuf), "%08X",
				 ntohl(cp->daddr.ip));

#ifdef CONFIG_IP_VS_IPV6
		if (cp->af == AF_INET6)
			seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X "
				"%s %04X %-11s %7u%s\n",
				ip_vs_proto_name(cp->protocol),
				&cp->caddr.in6, ntohs(cp->cport),
				&cp->vaddr.in6, ntohs(cp->vport),
				dbuf, ntohs(cp->dport),
				ip_vs_state_name(cp),
				jiffies_delta_to_msecs(cp->timer.expires -
						       jiffies) / 1000,
				pe_data);
		else
#endif
			seq_printf(seq,
				"%-3s %08X %04X %08X %04X"
				" %s %04X %-11s %7u%s\n",
				ip_vs_proto_name(cp->protocol),
				ntohl(cp->caddr.ip), ntohs(cp->cport),
				ntohl(cp->vaddr.ip), ntohs(cp->vport),
				dbuf, ntohs(cp->dport),
				ip_vs_state_name(cp),
				jiffies_delta_to_msecs(cp->timer.expires -
						       jiffies) / 1000,
				pe_data);
	}
	return 0;
}
1634 
/* seq_file operations for /proc/net/ip_vs_conn */
static const struct seq_operations ip_vs_conn_seq_ops = {
	.start = ip_vs_conn_seq_start,
	.next  = ip_vs_conn_seq_next,
	.stop  = ip_vs_conn_seq_stop,
	.show  = ip_vs_conn_seq_show,
};
1641 
ip_vs_origin_name(unsigned int flags)1642 static const char *ip_vs_origin_name(unsigned int flags)
1643 {
1644 	if (flags & IP_VS_CONN_F_SYNC)
1645 		return "SYNC";
1646 	else
1647 		return "LOCAL";
1648 }
1649 
/* seq_file .show for /proc/net/ip_vs_conn_sync: like ip_vs_conn_seq_show
 * but prints the conn origin (SYNC/LOCAL) instead of the PE data.
 */
static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
{
	char dbuf[IP_VS_ADDRSTRLEN];

	if (v == SEQ_START_TOKEN)
		seq_puts(seq,
   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Origin Expires\n");
	else {
		const struct ip_vs_conn *cp = v;

		/* dest family (daf) may differ from client side family (af) */
#ifdef CONFIG_IP_VS_IPV6
		if (cp->daf == AF_INET6)
			snprintf(dbuf, sizeof(dbuf), "%pI6", &cp->daddr.in6);
		else
#endif
			snprintf(dbuf, sizeof(dbuf), "%08X",
				 ntohl(cp->daddr.ip));

#ifdef CONFIG_IP_VS_IPV6
		if (cp->af == AF_INET6)
			seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X "
				"%s %04X %-11s %-6s %7u\n",
				ip_vs_proto_name(cp->protocol),
				&cp->caddr.in6, ntohs(cp->cport),
				&cp->vaddr.in6, ntohs(cp->vport),
				dbuf, ntohs(cp->dport),
				ip_vs_state_name(cp),
				ip_vs_origin_name(cp->flags),
				jiffies_delta_to_msecs(cp->timer.expires -
						       jiffies) / 1000);
		else
#endif
			seq_printf(seq,
				"%-3s %08X %04X %08X %04X "
				"%s %04X %-11s %-6s %7u\n",
				ip_vs_proto_name(cp->protocol),
				ntohl(cp->caddr.ip), ntohs(cp->cport),
				ntohl(cp->vaddr.ip), ntohs(cp->vport),
				dbuf, ntohs(cp->dport),
				ip_vs_state_name(cp),
				ip_vs_origin_name(cp->flags),
				jiffies_delta_to_msecs(cp->timer.expires -
						       jiffies) / 1000);
	}
	return 0;
}
1696 
/* seq_file operations for /proc/net/ip_vs_conn_sync */
static const struct seq_operations ip_vs_conn_sync_seq_ops = {
	.start = ip_vs_conn_seq_start,
	.next  = ip_vs_conn_seq_next,
	.stop  = ip_vs_conn_seq_stop,
	.show  = ip_vs_conn_sync_seq_show,
};
1703 #endif
1704 
1705 #ifdef CONFIG_SYSCTL
1706 
/* Randomly drop connection entries before running out of memory
 * Can be used for DATA and CTL conns. For TPL conns there are exceptions:
 * - traffic for services in OPS mode increases ct->in_pkts, so it is supported
 * - traffic for services not in OPS mode does not increase ct->in_pkts in
 * all cases, so it is not supported
 */
static inline int todrop_entry(struct ip_vs_conn *cp)
{
	struct netns_ipvs *ipvs = cp->ipvs;
	int i;

	/* if the conn entry hasn't lasted for 60 seconds, don't drop it.
	   This will leave enough time for normal connection to get
	   through. */
	if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ))
		return 0;

	/* Drop only conns with number of incoming packets in [1..8] range */
	i = atomic_read(&cp->in_pkts);
	if (i > 8 || i < 1)
		return 0;

	i--;
	/* per-packet-count countdown: a conn with i+1 incoming packets is
	 * dropped once out of every i+1 candidates, so conns that have
	 * seen fewer packets are dropped more aggressively
	 */
	if (--ipvs->dropentry_counters[i] > 0)
		return 0;

	/* Prefer to drop conns with less number of incoming packets */
	ipvs->dropentry_counters[i] = i + 1;
	return 1;
}
1737 
ip_vs_conn_ops_mode(struct ip_vs_conn * cp)1738 static inline bool ip_vs_conn_ops_mode(struct ip_vs_conn *cp)
1739 {
1740 	struct ip_vs_service *svc;
1741 
1742 	if (!cp->dest)
1743 		return false;
1744 	svc = rcu_dereference(cp->dest->svc);
1745 	return svc && (svc->flags & IP_VS_SVC_F_ONEPACKET);
1746 }
1747 
/* Drop a random sample of droppable connections to relieve memory
 * pressure.  Scans 1/32 of the table per invocation starting at a
 * random bucket; young conns, established conns with many packets,
 * controlled conns and assured templates are preserved.
 */
void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
{
	struct ip_vs_conn_hnode *hn;
	struct hlist_bl_node *e;
	struct ip_vs_conn *cp;
	struct ip_vs_rht *t;
	unsigned int r;
	int idx;

	r = get_random_u32();
	rcu_read_lock();
	t = rcu_dereference(ipvs->conn_tab);
	if (!t)
		goto out;
	/*
	 * Randomly scan 1/32 of the whole table every second
	 */
	for (idx = 0; idx < (t->size >> 5); idx++) {
		unsigned int hash = (r + idx) & t->mask;

		/* Don't care if due to moved entry we jump to another bucket
		 * and even to new table
		 */
		hlist_bl_for_each_entry_rcu(hn, e, &t->buckets[hash], node) {
			if (hn->dir != 0)
				continue;
			cp = ip_vs_hn0_to_conn(hn);
			/* never drop a conn that controls others */
			if (atomic_read(&cp->n_control))
				continue;
			if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
				/* connection template of OPS */
				if (ip_vs_conn_ops_mode(cp))
					goto try_drop;
				if (!(cp->state & IP_VS_CTPL_S_ASSURED))
					goto drop;
				continue;
			}
			if (cp->protocol == IPPROTO_TCP) {
				switch(cp->state) {
				case IP_VS_TCP_S_SYN_RECV:
				case IP_VS_TCP_S_SYNACK:
					break;

				case IP_VS_TCP_S_ESTABLISHED:
					if (todrop_entry(cp))
						break;
					continue;

				default:
					continue;
				}
			} else if (cp->protocol == IPPROTO_SCTP) {
				switch (cp->state) {
				case IP_VS_SCTP_S_INIT1:
				case IP_VS_SCTP_S_INIT:
					break;
				case IP_VS_SCTP_S_ESTABLISHED:
					if (todrop_entry(cp))
						break;
					continue;
				default:
					continue;
				}
			} else {
try_drop:
				if (!todrop_entry(cp))
					continue;
			}

drop:
			IP_VS_DBG(4, "drop connection\n");
			ip_vs_conn_del(cp);
		}
		if (!(idx & 31)) {
			cond_resched_rcu();
			/* re-load the table: it may have been replaced */
			t = rcu_dereference(ipvs->conn_tab);
			if (!t)
				goto out;
		}
	}

out:
	rcu_read_unlock();
}
1832 #endif
1833 
/* Flush all the connection entries in the conn_tab.
 * Called on netns cleanup: stops the resize work, repeatedly expires
 * every conn until the count drops to zero, then unpublishes the hash
 * table(s) and frees them after an RCU grace period.
 */
static void ip_vs_conn_flush(struct netns_ipvs *ipvs)
{
	DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU();
	struct ip_vs_conn *cp, *cp_c;
	struct ip_vs_conn_hnode *hn;
	struct hlist_bl_head *head;
	struct ip_vs_rht *t, *p;
	struct hlist_bl_node *e;

	if (!rcu_dereference_protected(ipvs->conn_tab, 1))
		return;
	/* no resizes may run concurrently with the flush */
	disable_delayed_work_sync(&ipvs->conn_resize_work);
	if (!atomic_read(&ipvs->conn_count))
		goto unreg;

flush_again:
	/* Rely on RCU grace period while accessing cp after ip_vs_conn_del */
	rcu_read_lock();
	ip_vs_rht_walk_buckets_safe_rcu(ipvs->conn_tab, head) {
		hlist_bl_for_each_entry_rcu(hn, e, head, node) {
			if (hn->dir != 0)
				continue;
			cp = ip_vs_hn0_to_conn(hn);
			/* controlling conns go when their last controlled
			 * conn is deleted below
			 */
			if (atomic_read(&cp->n_control))
				continue;
			cp_c = cp->control;
			IP_VS_DBG(4, "del connection\n");
			ip_vs_conn_del(cp);
			if (cp_c && !atomic_read(&cp_c->n_control)) {
				IP_VS_DBG(4, "del controlling connection\n");
				ip_vs_conn_del(cp_c);
			}
		}
		cond_resched_rcu();
	}
	rcu_read_unlock();

	/* the counter may be not NULL, because maybe some conn entries
	   are run by slow timer handler or unhashed but still referred */
	if (atomic_read(&ipvs->conn_count) != 0) {
		schedule();
		goto flush_again;
	}

unreg:
	/* Unregister the hash table and release it after RCU grace period.
	 * This is needed because other works may not be stopped yet and
	 * they may walk the tables.
	 */
	t = rcu_dereference_protected(ipvs->conn_tab, 1);
	rcu_assign_pointer(ipvs->conn_tab, NULL);
	/* Inform readers that conn_tab is changed */
	smp_mb__before_atomic();
	atomic_inc(&ipvs->conn_tab_changes);
	/* free the whole chain of tables linked via new_tbl */
	while (1) {
		p = rcu_dereference_protected(t->new_tbl, 1);
		call_rcu(&t->rcu_head, ip_vs_rht_rcu_free);
		if (p == t)
			break;
		t = p;
	}
}
1897 
1898 #ifdef CONFIG_SYSCTL
/* Expire all connections whose dest is no longer available.
 * Used by the expire_nodest_conn sysctl machinery.  Restarts the
 * table walk if the hash table is replaced mid-scan and aborts when
 * the netns is being cleaned up.
 */
void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs)
{
	DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU();
	unsigned int resched_score = 0;
	struct ip_vs_conn *cp, *cp_c;
	struct ip_vs_conn_hnode *hn;
	struct hlist_bl_head *head;
	struct ip_vs_dest *dest;
	struct hlist_bl_node *e;
	int old_gen, new_gen;

	if (!atomic_read(&ipvs->conn_count))
		return;
	old_gen = atomic_read(&ipvs->conn_tab_changes);
	rcu_read_lock();

repeat:
	smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */
	ip_vs_rht_walk_buckets_rcu(ipvs->conn_tab, head) {
		hlist_bl_for_each_entry_rcu(hn, e, head, node) {
			if (hn->dir != 0)
				continue;
			cp = ip_vs_hn0_to_conn(hn);
			resched_score++;
			/* keep conns without dest or with an available dest */
			dest = cp->dest;
			if (!dest || (dest->flags & IP_VS_DEST_F_AVAILABLE))
				continue;

			if (atomic_read(&cp->n_control))
				continue;

			cp_c = cp->control;
			IP_VS_DBG(4, "del connection\n");
			ip_vs_conn_del(cp);
			if (cp_c && !atomic_read(&cp_c->n_control)) {
				IP_VS_DBG(4, "del controlling connection\n");
				ip_vs_conn_del(cp_c);
			}
			/* deletions are costlier, reschedule sooner */
			resched_score += 10;
		}
		resched_score++;
		if (resched_score >= 100) {
			resched_score = 0;
			cond_resched_rcu();
			/* netns clean up started, abort delayed work */
			if (!READ_ONCE(ipvs->enable))
				goto out;
			new_gen = atomic_read(&ipvs->conn_tab_changes);
			/* New table installed ? */
			if (old_gen != new_gen) {
				old_gen = new_gen;
				goto repeat;
			}
		}
	}

out:
	rcu_read_unlock();
}
1958 #endif
1959 
/*
 * per netns init and exit
 */
/* Initialize per-netns connection state and the /proc entries.
 * The hash table itself is created lazily (RCU pointer starts NULL).
 * Returns 0 or -ENOMEM if a proc entry cannot be created.
 */
int __net_init ip_vs_conn_net_init(struct netns_ipvs *ipvs)
{
	int idx;

	atomic_set(&ipvs->conn_count, 0);
	for (idx = 0; idx < IP_VS_AF_MAX; idx++)
		atomic_set(&ipvs->no_cport_conns[idx], 0);
	INIT_DELAYED_WORK(&ipvs->conn_resize_work, conn_resize_work_handler);
	RCU_INIT_POINTER(ipvs->conn_tab, NULL);
	atomic_set(&ipvs->conn_tab_changes, 0);
	ipvs->sysctl_conn_lfactor = ip_vs_conn_default_load_factor(ipvs);

#ifdef CONFIG_PROC_FS
	if (!proc_create_net("ip_vs_conn", 0, ipvs->net->proc_net,
			     &ip_vs_conn_seq_ops,
			     sizeof(struct ip_vs_iter_state)))
		goto err_conn;

	if (!proc_create_net("ip_vs_conn_sync", 0, ipvs->net->proc_net,
			     &ip_vs_conn_sync_seq_ops,
			     sizeof(struct ip_vs_iter_state)))
		goto err_conn_sync;
#endif

	return 0;

#ifdef CONFIG_PROC_FS
err_conn_sync:
	remove_proc_entry("ip_vs_conn", ipvs->net->proc_net);
err_conn:
	return -ENOMEM;
#endif
}
1996 
/*
 * Per-netns teardown: release every connection entry, then remove the
 * /proc interface registered by ip_vs_conn_net_init().
 */
void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs)
{
	/* flush all the connection entries first */
	ip_vs_conn_flush(ipvs);
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ip_vs_conn", ipvs->net->proc_net);
	remove_proc_entry("ip_vs_conn_sync", ipvs->net->proc_net);
#endif
}
2006 
ip_vs_conn_init(void)2007 int __init ip_vs_conn_init(void)
2008 {
2009 	int min = IP_VS_CONN_TAB_MIN_BITS;
2010 	int max = IP_VS_CONN_TAB_MAX_BITS;
2011 	size_t tab_array_size;
2012 	int max_avail;
2013 
2014 	max_avail = order_base_2(totalram_pages()) + PAGE_SHIFT;
2015 	/* 64-bit: 27 bits at 64GB, 32-bit: 20 bits at 512MB */
2016 	max_avail += 1;		/* hash table loaded at 50% */
2017 	max_avail -= 1;		/* IPVS up to 1/2 of mem */
2018 	max_avail -= order_base_2(sizeof(struct ip_vs_conn));
2019 	max = clamp(max_avail, min, max);
2020 	ip_vs_conn_tab_bits = clamp(ip_vs_conn_tab_bits, min, max);
2021 	ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;
2022 
2023 	/*
2024 	 * Allocate the connection hash table and initialize its list heads
2025 	 */
2026 	tab_array_size = array_size(ip_vs_conn_tab_size,
2027 				    sizeof(struct hlist_bl_head));
2028 
2029 	/* Allocate ip_vs_conn slab cache */
2030 	ip_vs_conn_cachep = KMEM_CACHE(ip_vs_conn, SLAB_HWCACHE_ALIGN);
2031 	if (!ip_vs_conn_cachep)
2032 		return -ENOMEM;
2033 
2034 	pr_info("Connection hash table configured (size=%d, memory=%zdKbytes)\n",
2035 		ip_vs_conn_tab_size, tab_array_size / 1024);
2036 	IP_VS_DBG(0, "Each connection entry needs %zd bytes at least\n",
2037 		  sizeof(struct ip_vs_conn));
2038 
2039 	return 0;
2040 }
2041 
/*
 * Module exit: destroy the connection slab cache.  The RCU barrier
 * must come first so no ip_vs_conn_rcu_free() callback can touch the
 * cache after it is gone.
 */
void ip_vs_conn_cleanup(void)
{
	/* Wait all ip_vs_conn_rcu_free() callbacks to complete */
	rcu_barrier();
	/* Release the empty cache */
	kmem_cache_destroy(ip_vs_conn_cachep);
}
2049