1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * IPVS An implementation of the IP virtual server support for the
4 * LINUX operating system. IPVS is now implemented as a module
5 * over the Netfilter framework. IPVS can be used to build a
6 * high-performance and highly available server based on a
7 * cluster of servers.
8 *
9 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
10 * Peter Kese <peter.kese@ijs.si>
11 * Julian Anastasov <ja@ssi.bg>
12 *
13 * The IPVS code for kernel 2.2 was done by Wensong Zhang and Peter Kese,
14 * with changes/fixes from Julian Anastasov, Lars Marowsky-Bree, Horms
15 * and others. Many code here is taken from IP MASQ code of kernel 2.2.
16 *
17 * Changes:
18 */
19
20 #define pr_fmt(fmt) "IPVS: " fmt
21
22 #include <linux/interrupt.h>
23 #include <linux/in.h>
24 #include <linux/inet.h>
25 #include <linux/net.h>
26 #include <linux/kernel.h>
27 #include <linux/module.h>
28 #include <linux/proc_fs.h> /* for proc_net_* */
29 #include <linux/slab.h>
30 #include <linux/seq_file.h>
31 #include <linux/jhash.h>
32 #include <linux/random.h>
33 #include <linux/rcupdate_wait.h>
34
35 #include <net/net_namespace.h>
36 #include <net/ip_vs.h>
37
38
39 #ifndef CONFIG_IP_VS_TAB_BITS
40 #define CONFIG_IP_VS_TAB_BITS 12
41 #endif
42
43 /*
44 * Connection hash size. Default is what was selected at compile time.
45 */
46 static int ip_vs_conn_tab_bits = CONFIG_IP_VS_TAB_BITS;
47 module_param_named(conn_tab_bits, ip_vs_conn_tab_bits, int, 0444);
48 MODULE_PARM_DESC(conn_tab_bits, "Set connections' hash size");
49
50 /* Max table size */
51 int ip_vs_conn_tab_size __read_mostly;
52
53 /* SLAB cache for IPVS connections */
54 static struct kmem_cache *ip_vs_conn_cachep __read_mostly;
55
56 /* We need an addrstrlen that works with or without v6 */
57 #ifdef CONFIG_IP_VS_IPV6
58 #define IP_VS_ADDRSTRLEN INET6_ADDRSTRLEN
59 #else
60 #define IP_VS_ADDRSTRLEN (8+1)
61 #endif
62
63 /* Connection hashing:
64 * - hash (add conn) and unhash (del conn) are safe for RCU readers walking
65 * the bucket, they will not jump to another bucket or hash table and to miss
66 * conns
67 * - rehash (fill cport) hashes the conn to new bucket or even new table,
68 * so we use seqcount to retry lookups on buckets where we delete
69 * conns (unhash) because after hashing their next ptr can point to another
70 * bucket or hash table
71 * - hash table resize works like rehash but always rehashes into new table
72 * - bit lock on bucket serializes all operations that modify the chain
73 * - cp->lock protects conn fields like cp->flags, cp->dest
74 */
75
/* Lock conn_tab bucket(s) for conn hash (add) and unhash (del), not for
 * rehash. On return BHs are disabled and the bucket bit lock for hn0
 * (*head_ret) and, with double hashing, for hn1 (*head2_ret) are held;
 * pair with conn_tab_unlock(). For existing conns (!new_hash) the hash
 * keys are revalidated under the locks and locking is retried if a
 * parallel rehash/resize moved the conn meanwhile.
 */
static __always_inline void
conn_tab_lock(struct ip_vs_rht *t, struct ip_vs_conn *cp, u32 hash_key,
	      u32 hash_key2, bool use2, bool new_hash,
	      struct hlist_bl_head **head_ret, struct hlist_bl_head **head2_ret)
{
	struct hlist_bl_head *head, *head2;
	u32 hash_key_new, hash_key_new2;
	struct ip_vs_rht *t2 = t;	/* table holding hn1 */
	u32 idx, idx2;

	/* idx/idx2 define the lock order; buckets moved to the new table
	 * get IP_VS_RHT_TABLE_ID_MASK added so they sort after old ones.
	 */
	idx = hash_key & t->mask;
	if (use2)
		idx2 = hash_key2 & t->mask;
	else
		idx2 = idx;
	if (!new_hash) {
		/* We need to lock the bucket in the right table */

retry:
		if (!ip_vs_rht_same_table(t, hash_key)) {
			/* It is already moved to new table */
			t = rcu_dereference(t->new_tbl);
			/* Rehashing works in two steps and we may detect
			 * both nodes in different tables, use idx/idx2
			 * for proper lock ordering for heads.
			 */
			idx = hash_key & t->mask;
			idx |= IP_VS_RHT_TABLE_ID_MASK;
		}
		if (use2) {
			if (!ip_vs_rht_same_table(t2, hash_key2)) {
				/* It is already moved to new table */
				t2 = rcu_dereference(t2->new_tbl);
				idx2 = hash_key2 & t2->mask;
				idx2 |= IP_VS_RHT_TABLE_ID_MASK;
			}
		} else {
			idx2 = idx;
		}
	}

	head = t->buckets + (hash_key & t->mask);
	head2 = use2 ? t2->buckets + (hash_key2 & t2->mask) : head;

	local_bh_disable();
	/* Do not touch seqcount, this is a safe operation */

	/* Lock both buckets in increasing idx order to avoid ABBA deadlock */
	if (idx <= idx2) {
		hlist_bl_lock(head);
		if (head != head2)
			hlist_bl_lock(head2);
	} else {
		hlist_bl_lock(head2);
		hlist_bl_lock(head);
	}
	if (!new_hash) {
		/* Ensure hash_key is read under lock */
		hash_key_new = READ_ONCE(cp->hn0.hash_key);
		hash_key_new2 = READ_ONCE(cp->hn1.hash_key);
		/* Hash changed? The conn was rehashed while we were locking:
		 * drop the locks and retry with the fresh keys.
		 */
		if (hash_key != hash_key_new ||
		    (hash_key2 != hash_key_new2 && use2)) {
			if (head != head2)
				hlist_bl_unlock(head2);
			hlist_bl_unlock(head);
			local_bh_enable();
			hash_key = hash_key_new;
			hash_key2 = hash_key_new2;
			goto retry;
		}
	}
	*head_ret = head;
	*head2_ret = head2;
}
151
/* Unlock bucket(s) locked by conn_tab_lock(): second bucket first (when
 * distinct), then the first, then re-enable BHs.
 */
static inline void conn_tab_unlock(struct hlist_bl_head *head,
				   struct hlist_bl_head *head2)
{
	if (head != head2)
		hlist_bl_unlock(head2);
	hlist_bl_unlock(head);
	local_bh_enable();
}
160
161 static void ip_vs_conn_expire(struct timer_list *t);
162
/*
 * Returns hash value for IPVS connection entry: siphash over
 * proto/addresses/ports keyed with the per-table t->hash_key, so each
 * table instance distributes entries differently.
 */
static u32 ip_vs_conn_hashkey(struct ip_vs_rht *t, int af, unsigned int proto,
			      const union nf_inet_addr *addr, __be16 port,
			      const union nf_inet_addr *laddr, __be16 lport)
{
	u64 a = (u32)proto << 16 | (__force u32)port;
	u64 d;

#ifdef CONFIG_IP_VS_IPV6
	if (af == AF_INET6) {
		/* Mix both 128-bit addresses into four 64-bit siphash words */
		u64 b = (u64)addr->all[0] << 32 | addr->all[1];
		u64 c = (u64)addr->all[2] << 32 | addr->all[3];

		a |= (u64)laddr->all[2] << 32 ^ (__force u32)lport;
		c ^= laddr->all[1];
		d = (u64)laddr->all[0] << 32 | laddr->all[3];
		return (u32)siphash_4u64(a, b, c, d, &t->hash_key);
	}
#endif
	/* IPv4: everything fits into two 64-bit words */
	a |= (u64)addr->all[0] << 32;
	d = (u64)laddr->all[0] << 32 | (__force u32)lport;
	return (u32)siphash_2u64(a, d, &t->hash_key);
}
188
/* Compute table hash for conn params @p.
 * @inverse: hash the reply direction (vaddr/vport as source side).
 * Persistence engines providing hashkey_raw use their own keying.
 */
static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p,
					     struct ip_vs_rht *t, bool inverse)
{
	const union nf_inet_addr *addr, *laddr;
	__be16 port, lport;

	if (p->pe_data && p->pe->hashkey_raw)
		return p->pe->hashkey_raw(p, t, inverse);

	if (inverse) {
		addr = p->vaddr;
		port = p->vport;
		laddr = p->caddr;
		lport = p->cport;
	} else {
		addr = p->caddr;
		port = p->cport;
		laddr = p->vaddr;
		lport = p->vport;
	}

	return ip_vs_conn_hashkey(t, p->af, p->protocol, addr, port, laddr,
				  lport);
}
215
/* Compute table hash for conn @cp.
 * @out: use the out (reply) direction, daddr/dport -> caddr/cport,
 * instead of the original caddr/cport -> vaddr/vport direction.
 */
static unsigned int ip_vs_conn_hashkey_conn(struct ip_vs_rht *t,
					    const struct ip_vs_conn *cp,
					    bool out)
{
	struct ip_vs_conn_param p;

	if (out)
		ip_vs_conn_fill_param(cp->ipvs, cp->af, cp->protocol,
				      &cp->daddr, cp->dport, &cp->caddr,
				      cp->cport, &p);
	else
		ip_vs_conn_fill_param(cp->ipvs, cp->af, cp->protocol,
				      &cp->caddr, cp->cport, &cp->vaddr,
				      cp->vport, &p);

	/* Propagate persistence engine data so pe->hashkey_raw can apply */
	if (cp->pe) {
		p.pe = cp->pe;
		p.pe_data = cp->pe_data;
		p.pe_data_len = cp->pe_data_len;
	}

	return ip_vs_conn_hashkey_param(&p, t, out);
}
239
/* Hashes ip_vs_conn in conn_tab: adds hn0 (and hn1 when double hashing
 * is used) to the most recent table and takes a reference for the table.
 * returns bool success (0 for one-packet conns, which are never hashed).
 */
static inline int ip_vs_conn_hash(struct ip_vs_conn *cp)
{
	struct netns_ipvs *ipvs = cp->ipvs;
	struct hlist_bl_head *head, *head2;
	u32 hash_key, hash_key2;
	struct ip_vs_rht *t;
	u32 hash, hash2;
	bool use2;
	int ret;

	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
		return 0;

	/* New entries go into recent table */
	t = rcu_dereference(ipvs->conn_tab);
	t = rcu_dereference(t->new_tbl);

	hash = ip_vs_conn_hashkey_conn(t, cp, false);
	hash_key = ip_vs_rht_build_hash_key(t, hash);
	if (ip_vs_conn_use_hash2(cp)) {
		/* hn1 is keyed on the out (reply) direction */
		hash2 = ip_vs_conn_hashkey_conn(t, cp, true);
		hash_key2 = ip_vs_rht_build_hash_key(t, hash2);
		use2 = true;
	} else {
		hash_key2 = hash_key;
		use2 = false;
	}

	conn_tab_lock(t, cp, hash_key, hash_key2, use2, true /* new_hash */,
		      &head, &head2);

	cp->flags |= IP_VS_CONN_F_HASHED;
	WRITE_ONCE(cp->hn0.hash_key, hash_key);
	WRITE_ONCE(cp->hn1.hash_key, hash_key2);
	refcount_inc(&cp->refcnt);	/* reference held by the table */
	hlist_bl_add_head_rcu(&cp->hn0.node, head);
	if (use2)
		hlist_bl_add_head_rcu(&cp->hn1.node, head2);

	conn_tab_unlock(head, head2);
	ret = 1;

	/* Schedule resizing if load increases */
	if (atomic_read(&ipvs->conn_count) > t->u_thresh &&
	    !test_and_set_bit(IP_VS_WORK_CONN_RESIZE, &ipvs->work_flags))
		mod_delayed_work(system_unbound_wq, &ipvs->conn_resize_work, 0);

	return ret;
}
292
/* Try to unlink ip_vs_conn from conn_tab.
 * returns bool success. The conn is unlinked only when its refcnt can
 * drop to one (we are the last user besides the table's reference).
 */
static inline bool ip_vs_conn_unlink(struct ip_vs_conn *cp)
{
	struct netns_ipvs *ipvs = cp->ipvs;
	struct hlist_bl_head *head, *head2;
	u32 hash_key, hash_key2;
	struct ip_vs_rht *t;
	bool ret = false;
	bool use2;

	/* One-packet conns are never hashed; just drop the last reference */
	if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
		return refcount_dec_if_one(&cp->refcnt);

	rcu_read_lock();

	t = rcu_dereference(ipvs->conn_tab);
	hash_key = READ_ONCE(cp->hn0.hash_key);
	hash_key2 = READ_ONCE(cp->hn1.hash_key);
	use2 = ip_vs_conn_use_hash2(cp);

	/* conn_tab_lock() revalidates the keys and follows table moves */
	conn_tab_lock(t, cp, hash_key, hash_key2, use2, false /* new_hash */,
		      &head, &head2);

	if (cp->flags & IP_VS_CONN_F_HASHED) {
		/* Decrease refcnt and unlink conn only if we are last user */
		if (refcount_dec_if_one(&cp->refcnt)) {
			hlist_bl_del_rcu(&cp->hn0.node);
			if (use2)
				hlist_bl_del_rcu(&cp->hn1.node);
			cp->flags &= ~IP_VS_CONN_F_HASHED;
			ret = true;
		}
	}

	conn_tab_unlock(head, head2);

	rcu_read_unlock();

	return ret;
}
335
336
/*
 * Gets ip_vs_conn associated with supplied parameters in the conn_tab.
 * Called for pkts coming from OUTside-to-INside.
 * p->caddr, p->cport: pkt source address (foreign host)
 * p->vaddr, p->vport: pkt dest address (load balancer)
 * Walks all tables in the RCU chain (old and new during resize); the
 * bucket walk macro retries via seqcount when racing with rehashing.
 */
static inline struct ip_vs_conn *
__ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
{
	DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();
	struct netns_ipvs *ipvs = p->ipvs;
	struct ip_vs_conn_hnode *hn;
	struct hlist_bl_head *head;
	struct ip_vs_rht *t, *pt;
	struct hlist_bl_node *e;
	struct ip_vs_conn *cp;
	u32 hash, hash_key;

	rcu_read_lock();

	ip_vs_rht_for_each_table_rcu(ipvs->conn_tab, t, pt) {
		hash = ip_vs_conn_hashkey_param(p, t, false);
		hash_key = ip_vs_rht_build_hash_key(t, hash);
		ip_vs_rht_walk_bucket_rcu(t, hash_key, head) {
			hlist_bl_for_each_entry_rcu(hn, e, head, node) {
				/* Only dir 0 (original direction) nodes */
				if (READ_ONCE(hn->hash_key) != hash_key ||
				    hn->dir != 0)
					continue;
				cp = ip_vs_hn0_to_conn(hn);
				/* The XOR term pairs a zero-cport lookup
				 * with NO_CPORT conns and a non-zero cport
				 * with normal conns, never crosswise.
				 */
				if (p->cport == cp->cport &&
				    p->vport == cp->vport && cp->af == p->af &&
				    ip_vs_addr_equal(p->af, p->caddr,
						     &cp->caddr) &&
				    ip_vs_addr_equal(p->af, p->vaddr,
						     &cp->vaddr) &&
				    (!p->cport ^
				     (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) &&
				    p->protocol == cp->protocol) {
					if (__ip_vs_conn_get(cp)) {
						/* HIT */
						rcu_read_unlock();
						return cp;
					}
				}
			}
		}
	}

	rcu_read_unlock();

	return NULL;
}
389
ip_vs_conn_in_get(const struct ip_vs_conn_param * p)390 struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p)
391 {
392 struct ip_vs_conn *cp;
393
394 cp = __ip_vs_conn_in_get(p);
395 if (!cp) {
396 struct netns_ipvs *ipvs = p->ipvs;
397 int af_id = ip_vs_af_index(p->af);
398
399 if (atomic_read(&ipvs->no_cport_conns[af_id])) {
400 struct ip_vs_conn_param cport_zero_p = *p;
401
402 cport_zero_p.cport = 0;
403 cp = __ip_vs_conn_in_get(&cport_zero_p);
404 }
405 }
406
407 IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n",
408 ip_vs_proto_name(p->protocol),
409 IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
410 IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
411 cp ? "hit" : "not hit");
412
413 return cp;
414 }
415
416 static int
ip_vs_conn_fill_param_proto(struct netns_ipvs * ipvs,int af,const struct sk_buff * skb,const struct ip_vs_iphdr * iph,struct ip_vs_conn_param * p)417 ip_vs_conn_fill_param_proto(struct netns_ipvs *ipvs,
418 int af, const struct sk_buff *skb,
419 const struct ip_vs_iphdr *iph,
420 struct ip_vs_conn_param *p)
421 {
422 __be16 _ports[2], *pptr;
423
424 pptr = frag_safe_skb_hp(skb, iph->len, sizeof(_ports), _ports);
425 if (pptr == NULL)
426 return 1;
427
428 if (likely(!ip_vs_iph_inverse(iph)))
429 ip_vs_conn_fill_param(ipvs, af, iph->protocol, &iph->saddr,
430 pptr[0], &iph->daddr, pptr[1], p);
431 else
432 ip_vs_conn_fill_param(ipvs, af, iph->protocol, &iph->daddr,
433 pptr[1], &iph->saddr, pptr[0], p);
434 return 0;
435 }
436
/* Lookup conn for an out->in packet identified by its IP header */
struct ip_vs_conn *
ip_vs_conn_in_get_proto(struct netns_ipvs *ipvs, int af,
			const struct sk_buff *skb,
			const struct ip_vs_iphdr *iph)
{
	struct ip_vs_conn_param p;

	/* Fails (returns NULL) when the transport header is unreadable */
	if (ip_vs_conn_fill_param_proto(ipvs, af, skb, iph, &p))
		return NULL;

	return ip_vs_conn_in_get(&p);
}
EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto);
450
/* Get reference to connection template matching param @p.
 * Persistence engines with ct_match use their own matching; otherwise
 * match on addresses/ports/protocol of conns with F_TEMPLATE set.
 */
struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p)
{
	DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();
	struct netns_ipvs *ipvs = p->ipvs;
	struct ip_vs_conn_hnode *hn;
	struct hlist_bl_head *head;
	struct ip_vs_rht *t, *pt;
	struct hlist_bl_node *e;
	struct ip_vs_conn *cp;
	u32 hash, hash_key;

	rcu_read_lock();

	/* Walk old and new tables during resize */
	ip_vs_rht_for_each_table_rcu(ipvs->conn_tab, t, pt) {
		hash = ip_vs_conn_hashkey_param(p, t, false);
		hash_key = ip_vs_rht_build_hash_key(t, hash);
		ip_vs_rht_walk_bucket_rcu(t, hash_key, head) {
			hlist_bl_for_each_entry_rcu(hn, e, head, node) {
				if (READ_ONCE(hn->hash_key) != hash_key ||
				    hn->dir != 0)
					continue;
				cp = ip_vs_hn0_to_conn(hn);
				if (unlikely(p->pe_data && p->pe->ct_match)) {
					if (p->pe == cp->pe &&
					    p->pe->ct_match(p, cp) &&
					    __ip_vs_conn_get(cp))
						goto out;
					continue;
				}
				if (cp->af == p->af &&
				    ip_vs_addr_equal(p->af, p->caddr,
						     &cp->caddr) &&
				    /* protocol should only be IPPROTO_IP if
				     * p->vaddr is a fwmark
				     */
				    ip_vs_addr_equal(p->protocol == IPPROTO_IP ?
						     AF_UNSPEC : p->af,
						     p->vaddr, &cp->vaddr) &&
				    p->vport == cp->vport &&
				    p->cport == cp->cport &&
				    cp->flags & IP_VS_CONN_F_TEMPLATE &&
				    p->protocol == cp->protocol &&
				    /* NOTE(review): dport 0xffff appears to
				     * mark templates to skip — confirm with
				     * the code that sets it
				     */
				    cp->dport != htons(0xffff)) {
					if (__ip_vs_conn_get(cp))
						goto out;
				}
			}
		}

	}
	cp = NULL;

out:
	rcu_read_unlock();

	IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n",
		      ip_vs_proto_name(p->protocol),
		      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
		      cp ? "hit" : "not hit");

	return cp;
}
515
/* Gets ip_vs_conn associated with supplied parameters in the conn_tab.
 * Called for pkts coming from inside-to-OUTside.
 * p->caddr, p->cport: pkt source address (inside host)
 * p->vaddr, p->vport: pkt dest address (foreign host)
 * Lookup uses the inverse hash (hn1 direction); the source the real
 * server replies from depends on the forwarding method.
 */
struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p)
{
	DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();
	struct netns_ipvs *ipvs = p->ipvs;
	const union nf_inet_addr *saddr;
	struct ip_vs_conn_hnode *hn;
	struct hlist_bl_head *head;
	struct ip_vs_rht *t, *pt;
	struct hlist_bl_node *e;
	struct ip_vs_conn *cp;
	u32 hash, hash_key;
	__be16 sport;

	rcu_read_lock();

	ip_vs_rht_for_each_table_rcu(ipvs->conn_tab, t, pt) {
		/* inverse == true: hash on the reply direction */
		hash = ip_vs_conn_hashkey_param(p, t, true);
		hash_key = ip_vs_rht_build_hash_key(t, hash);
		ip_vs_rht_walk_bucket_rcu(t, hash_key, head) {
			hlist_bl_for_each_entry_rcu(hn, e, head, node) {
				/* dir can be 0 for DR/TUN */
				if (READ_ONCE(hn->hash_key) != hash_key)
					continue;
				cp = ip_vs_hn_to_conn(hn);
				if (p->vport != cp->cport)
					continue;

				/* For non-NAT the reply keeps the virtual
				 * address; for NAT it comes from the real
				 * server address.
				 */
				if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
					sport = cp->vport;
					saddr = &cp->vaddr;
				} else {
					sport = cp->dport;
					saddr = &cp->daddr;
				}

				if (p->cport == sport && cp->af == p->af &&
				    ip_vs_addr_equal(p->af, p->vaddr,
						     &cp->caddr) &&
				    ip_vs_addr_equal(p->af, p->caddr, saddr) &&
				    p->protocol == cp->protocol) {
					if (__ip_vs_conn_get(cp))
						goto out;
				}
			}
		}
	}
	cp = NULL;

out:
	rcu_read_unlock();

	IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n",
		      ip_vs_proto_name(p->protocol),
		      IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport),
		      IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport),
		      cp ? "hit" : "not hit");

	return cp;
}
579
/* Lookup conn for an in->out packet identified by its IP header */
struct ip_vs_conn *
ip_vs_conn_out_get_proto(struct netns_ipvs *ipvs, int af,
			 const struct sk_buff *skb,
			 const struct ip_vs_iphdr *iph)
{
	struct ip_vs_conn_param p;

	/* Fails (returns NULL) when the transport header is unreadable */
	if (ip_vs_conn_fill_param_proto(ipvs, af, skb, iph, &p))
		return NULL;

	return ip_vs_conn_out_get(&p);
}
EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto);
593
/*
 * Put back the conn and restart its timer with its timeout.
 * One-packet conns get a zero timeout so they expire as soon as
 * possible.
 */
static void __ip_vs_conn_put_timer(struct ip_vs_conn *cp)
{
	unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ?
		0 : cp->timeout;
	mod_timer(&cp->timer, jiffies+t);

	__ip_vs_conn_put(cp);
}
605
/* Put back the conn. One-packet conns with the last reference and no
 * pending timer are expired synchronously instead of via the timer.
 */
void ip_vs_conn_put(struct ip_vs_conn *cp)
{
	if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) &&
	    (refcount_read(&cp->refcnt) == 1) &&
	    !timer_pending(&cp->timer))
		/* expire connection immediately */
		ip_vs_conn_expire(&cp->timer);
	else
		__ip_vs_conn_put_timer(cp);
}
616
/*
 * Fill a no_client_port connection with a client port number.
 * The conn node(s) are rehashed to the bucket matching the new cport
 * while seqcount readers are informed, so lookups retry instead of
 * missing moved conns. With double hashing, dir 1 (hn1) is rehashed
 * first, then dir 0 (hn0) — see the dir-- at the end.
 */
void ip_vs_conn_fill_cport(struct ip_vs_conn *cp, __be16 cport)
{
	struct hlist_bl_head *head, *head2, *head_new;
	bool use2 = ip_vs_conn_use_hash2(cp);
	struct netns_ipvs *ipvs = cp->ipvs;
	int af_id = ip_vs_af_index(cp->af);
	u32 hash_r = 0, hash_key_r = 0;
	struct ip_vs_rht *t, *tp, *t2;
	struct ip_vs_conn_hnode *hn;
	u32 hash_key, hash_key_new;
	struct ip_vs_conn_param p;
	bool by_me = false;	/* this caller set cp->cport */
	int ntbl;
	int dir;

	/* No packets from inside, so we can do it in 2 steps. */
	dir = use2 ? 1 : 0;

next_dir:
	if (dir)
		ip_vs_conn_fill_param(ipvs, cp->af, cp->protocol, &cp->daddr,
				      cp->dport, &cp->caddr, cport, &p);
	else
		ip_vs_conn_fill_param(ipvs, cp->af, cp->protocol, &cp->caddr,
				      cport, &cp->vaddr, cp->vport, &p);
	hn = dir ? &cp->hn1 : &cp->hn0;
	ntbl = 0;

	/* Attempt to rehash cp safely, by informing seqcount readers */
	t = rcu_dereference(ipvs->conn_tab);
	hash_key = READ_ONCE(hn->hash_key);
	tp = NULL;

retry:
	/* Moved to new table ? */
	if (!ip_vs_rht_same_table(t, hash_key)) {
		t = rcu_dereference(t->new_tbl);
		ntbl++;
		/* We are lost? */
		if (ntbl >= 2) {
			/* Give up; revert cport if we were the one to set it */
			spin_lock_bh(&cp->lock);
			if (cp->flags & IP_VS_CONN_F_NO_CPORT && by_me)
				cp->cport = 0;
			/* hn1 will be rehashed on next packet */
			spin_unlock_bh(&cp->lock);
			IP_VS_ERR_RL("%s(): Too many ht changes for dir %d\n",
				     __func__, dir);
			return;
		}
	}

	/* Rehashing during resize? Use the recent table for adds */
	t2 = rcu_dereference(t->new_tbl);
	/* Calc new hash once per table */
	if (tp != t2) {
		hash_r = ip_vs_conn_hashkey_param(&p, t2, dir);
		hash_key_r = ip_vs_rht_build_hash_key(t2, hash_r);
		tp = t2;
	}
	head = t->buckets + (hash_key & t->mask);
	head2 = t2->buckets + (hash_key_r & t2->mask);
	head_new = head2;

	/* Keep address order for the bit locks within the same table */
	if (head > head2 && t == t2)
		swap(head, head2);

	/* Protect the cp->flags modification */
	spin_lock_bh(&cp->lock);

	/* Lock seqcount only for the old bucket, even if we are on new table
	 * because it affects the del operation, not the adding.
	 */
	spin_lock(&t->lock[hash_key & t->lock_mask].l);
	preempt_disable_nested();
	write_seqcount_begin(&t->seqc[hash_key & t->seqc_mask]);

	/* Lock buckets in same (increasing) order */
	hlist_bl_lock(head);
	if (head != head2)
		hlist_bl_lock(head2);

	/* Ensure hash_key is read under lock */
	hash_key_new = READ_ONCE(hn->hash_key);
	/* Racing with another rehashing ? */
	if (unlikely(hash_key != hash_key_new)) {
		/* Drop everything in reverse order and restart */
		if (head != head2)
			hlist_bl_unlock(head2);
		hlist_bl_unlock(head);
		write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]);
		preempt_enable_nested();
		spin_unlock(&t->lock[hash_key & t->lock_mask].l);
		spin_unlock_bh(&cp->lock);
		hash_key = hash_key_new;
		goto retry;
	}

	/* Fill cport once, even if multiple packets try to do it */
	if (cp->flags & IP_VS_CONN_F_NO_CPORT && (!cp->cport || by_me)) {
		/* If we race with resizing make sure cport is set for dir 1 */
		if (!cp->cport) {
			cp->cport = cport;
			by_me = true;
		}
		if (!dir) {
			atomic_dec(&ipvs->no_cport_conns[af_id]);
			cp->flags &= ~IP_VS_CONN_F_NO_CPORT;
		}
		/* We do not recalc hash_key_r under lock, we assume the
		 * parameters in cp do not change, i.e. cport is
		 * the only possible change.
		 */
		WRITE_ONCE(hn->hash_key, hash_key_r);
		if (!use2)
			WRITE_ONCE(cp->hn1.hash_key, hash_key_r);
		/* For dir=1 we do not check in flags if hn is already
		 * rehashed but this check will do it.
		 */
		if (head != head2) {
			hlist_bl_del_rcu(&hn->node);
			hlist_bl_add_head_rcu(&hn->node, head_new);
		}
	}

	if (head != head2)
		hlist_bl_unlock(head2);
	hlist_bl_unlock(head);
	write_seqcount_end(&t->seqc[hash_key & t->seqc_mask]);
	preempt_enable_nested();
	spin_unlock(&t->lock[hash_key & t->lock_mask].l);

	spin_unlock_bh(&cp->lock);
	/* After hn1 (dir 1) continue with hn0 (dir 0), if we set the cport */
	if (dir-- && by_me)
		goto next_dir;
}
754
755 /* Get default load factor to map conn_count/u_thresh to t->size */
ip_vs_conn_default_load_factor(struct netns_ipvs * ipvs)756 static int ip_vs_conn_default_load_factor(struct netns_ipvs *ipvs)
757 {
758 int factor;
759
760 if (net_eq(ipvs->net, &init_net))
761 factor = -3;
762 else
763 factor = -1;
764 /* Double hashing adds twice more nodes for NAT */
765 factor--;
766 return factor;
767 }
768
/* Get the desired conn_tab size for the current conn_count and load
 * factor @lfactor, bounded by IP_VS_CONN_TAB_MIN_BITS and the
 * conn_tab_bits module parameter.
 */
int ip_vs_conn_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t,
			    int lfactor)
{
	return ip_vs_rht_desired_size(ipvs, t, atomic_read(&ipvs->conn_count),
				      lfactor, IP_VS_CONN_TAB_MIN_BITS,
				      ip_vs_conn_tab_bits);
}
777
778 /* Allocate conn_tab */
ip_vs_conn_tab_alloc(struct netns_ipvs * ipvs,int buckets,int lfactor)779 struct ip_vs_rht *ip_vs_conn_tab_alloc(struct netns_ipvs *ipvs, int buckets,
780 int lfactor)
781 {
782 struct ip_vs_rht *t;
783 int scounts, locks;
784
785 /* scounts: affects readers during resize */
786 scounts = clamp(buckets >> 6, 1, 256);
787 /* locks: based on parallel IP_VS_CONN_F_NO_CPORT operations + resize */
788 locks = clamp(8, 1, scounts);
789
790 t = ip_vs_rht_alloc(buckets, scounts, locks);
791 if (!t)
792 return NULL;
793 t->lfactor = lfactor;
794 ip_vs_rht_set_thresholds(t, t->size, lfactor, IP_VS_CONN_TAB_MIN_BITS,
795 ip_vs_conn_tab_bits);
796 return t;
797 }
798
/* conn_tab resizer work: grows/shrinks the table to the desired size by
 * allocating a new table, chaining it via t->new_tbl and moving entries
 * bucket by bucket under seqcount write sections, then installing the
 * new table and freeing the old after a grace period.
 */
static void conn_resize_work_handler(struct work_struct *work)
{
	struct hlist_bl_head *head, *head2;
	unsigned int resched_score = 0;
	struct hlist_bl_node *cn, *nn;
	struct ip_vs_rht *t, *t_new;
	struct ip_vs_conn_hnode *hn;
	struct netns_ipvs *ipvs;
	struct ip_vs_conn *cp;
	bool more_work = false;
	u32 hash, hash_key;
	int limit = 0;
	int new_size;
	int lfactor;
	u32 bucket;

	ipvs = container_of(work, struct netns_ipvs, conn_resize_work.work);

	/* Allow work to be queued again */
	clear_bit(IP_VS_WORK_CONN_RESIZE, &ipvs->work_flags);
	t = rcu_dereference_protected(ipvs->conn_tab, 1);
	/* Do nothing if table is removed */
	if (!t)
		goto out;
	/* New table needs to be registered? BUG! */
	if (t != rcu_dereference_protected(t->new_tbl, 1))
		goto out;

	lfactor = sysctl_conn_lfactor(ipvs);
	/* Should we resize ? */
	new_size = ip_vs_conn_desired_size(ipvs, t, lfactor);
	if (new_size == t->size && lfactor == t->lfactor)
		goto out;

	t_new = ip_vs_conn_tab_alloc(ipvs, new_size, lfactor);
	if (!t_new) {
		/* Retry shortly (see the queue_delayed_work below) */
		more_work = true;
		goto out;
	}
	/* Flip the table_id */
	t_new->table_id = t->table_id ^ IP_VS_RHT_TABLE_ID_MASK;

	rcu_assign_pointer(t->new_tbl, t_new);

	/* Wait RCU readers to see the new table, we do not want new
	 * conns to go into old table and to be left there.
	 */
	synchronize_rcu();

	ip_vs_rht_for_each_bucket(t, bucket, head) {
same_bucket:
		if (++limit >= 16) {
			if (resched_score >= 100) {
				resched_score = 0;
				cond_resched();
			}
			limit = 0;
		}
		if (hlist_bl_empty(head)) {
			resched_score++;
			continue;
		}
		/* Preemption calls ahead... */
		resched_score = 0;

		/* seqcount_t usage considering PREEMPT_RT rules:
		 * - other writers (SoftIRQ) => serialize with spin_lock_bh
		 * - readers (SoftIRQ) => disable BHs
		 * - readers (processes) => preemption should be disabled
		 */
		spin_lock_bh(&t->lock[bucket & t->lock_mask].l);
		preempt_disable_nested();
		write_seqcount_begin(&t->seqc[bucket & t->seqc_mask]);
		hlist_bl_lock(head);

		hlist_bl_for_each_entry_safe(hn, cn, nn, head, node) {
			cp = ip_vs_hn_to_conn(hn);
			hash = ip_vs_conn_hashkey_conn(t_new, cp, hn->dir);
			hash_key = ip_vs_rht_build_hash_key(t_new, hash);

			/* NOTE(review): other sites index buckets with
			 * hash_key & mask; this assumes the low bits of
			 * hash_key equal hash — confirm against
			 * ip_vs_rht_build_hash_key()
			 */
			head2 = t_new->buckets + (hash & t_new->mask);
			hlist_bl_lock(head2);
			/* t_new->seqc are not used at this stage, we race
			 * only with add/del, so only lock the bucket.
			 */
			hlist_bl_del_rcu(&hn->node);
			WRITE_ONCE(hn->hash_key, hash_key);
			/* Keep both hash keys in sync if no double hashing */
			if (!ip_vs_conn_use_hash2(cp))
				WRITE_ONCE(cp->hn1.hash_key, hash_key);
			hlist_bl_add_head_rcu(&hn->node, head2);
			hlist_bl_unlock(head2);
			/* Too long chain? Do it in steps */
			if (++limit >= 64)
				break;
		}

		hlist_bl_unlock(head);
		write_seqcount_end(&t->seqc[bucket & t->seqc_mask]);
		preempt_enable_nested();
		spin_unlock_bh(&t->lock[bucket & t->lock_mask].l);
		if (limit >= 64)
			goto same_bucket;
	}

	rcu_assign_pointer(ipvs->conn_tab, t_new);
	/* Inform readers that new table is installed */
	smp_mb__before_atomic();
	atomic_inc(&ipvs->conn_tab_changes);

	/* RCU readers should not see more than two tables in chain.
	 * To prevent new table to be attached wait here instead of
	 * freeing the old table in RCU callback.
	 */
	synchronize_rcu();
	ip_vs_rht_free(t);

out:
	/* Monitor if we need to shrink table */
	queue_delayed_work(system_unbound_wq, &ipvs->conn_resize_work,
			   more_work ? 1 : 2 * HZ);
}
922
923 /*
924 * Bind a connection entry with the corresponding packet_xmit.
925 * Called by ip_vs_conn_new.
926 */
ip_vs_bind_xmit(struct ip_vs_conn * cp)927 static inline void ip_vs_bind_xmit(struct ip_vs_conn *cp)
928 {
929 switch (IP_VS_FWD_METHOD(cp)) {
930 case IP_VS_CONN_F_MASQ:
931 cp->packet_xmit = ip_vs_nat_xmit;
932 break;
933
934 case IP_VS_CONN_F_TUNNEL:
935 #ifdef CONFIG_IP_VS_IPV6
936 if (cp->daf == AF_INET6)
937 cp->packet_xmit = ip_vs_tunnel_xmit_v6;
938 else
939 #endif
940 cp->packet_xmit = ip_vs_tunnel_xmit;
941 break;
942
943 case IP_VS_CONN_F_DROUTE:
944 cp->packet_xmit = ip_vs_dr_xmit;
945 break;
946
947 case IP_VS_CONN_F_LOCALNODE:
948 cp->packet_xmit = ip_vs_null_xmit;
949 break;
950
951 case IP_VS_CONN_F_BYPASS:
952 cp->packet_xmit = ip_vs_bypass_xmit;
953 break;
954 }
955 }
956
#ifdef CONFIG_IP_VS_IPV6
/* IPv6 flavor of ip_vs_bind_xmit(): bind packet_xmit per fwd method */
static inline void ip_vs_bind_xmit_v6(struct ip_vs_conn *cp)
{
	unsigned int fwd = IP_VS_FWD_METHOD(cp);

	if (fwd == IP_VS_CONN_F_MASQ)
		cp->packet_xmit = ip_vs_nat_xmit_v6;
	else if (fwd == IP_VS_CONN_F_TUNNEL)
		/* Tunnel destination may have a different address family */
		cp->packet_xmit = cp->daf == AF_INET6 ?
				  ip_vs_tunnel_xmit_v6 : ip_vs_tunnel_xmit;
	else if (fwd == IP_VS_CONN_F_DROUTE)
		cp->packet_xmit = ip_vs_dr_xmit_v6;
	else if (fwd == IP_VS_CONN_F_LOCALNODE)
		cp->packet_xmit = ip_vs_null_xmit;
	else if (fwd == IP_VS_CONN_F_BYPASS)
		cp->packet_xmit = ip_vs_bypass_xmit_v6;
}
#endif
986
987
ip_vs_dest_totalconns(struct ip_vs_dest * dest)988 static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest)
989 {
990 return atomic_read(&dest->activeconns)
991 + atomic_read(&dest->inactconns);
992 }
993
/*
 * Bind a connection entry with a virtual service destination.
 * Called just after a new connection entry is created: takes a dest
 * reference, merges the dest conn_flags into cp->flags and updates the
 * dest connection counters and overload state.
 */
static inline void
ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest)
{
	unsigned int conn_flags;
	__u32 flags;

	/* if dest is NULL, then return directly */
	if (!dest)
		return;

	/* Increase the refcnt counter of the dest */
	ip_vs_dest_hold(dest);

	conn_flags = atomic_read(&dest->conn_flags);
	/* ONE_PACKET scheduling is only kept for UDP */
	if (cp->protocol != IPPROTO_UDP)
		conn_flags &= ~IP_VS_CONN_F_ONE_PACKET;
	flags = cp->flags;
	/* Bind with the destination and its corresponding transmitter */
	if (flags & IP_VS_CONN_F_SYNC) {
		/* if the connection is not template and is created
		 * by sync, preserve the activity flag.
		 */
		if (!(flags & IP_VS_CONN_F_TEMPLATE))
			conn_flags &= ~IP_VS_CONN_F_INACTIVE;
		/* connections inherit forwarding method from dest */
		flags &= ~(IP_VS_CONN_F_FWD_MASK | IP_VS_CONN_F_NOOUTPUT);
	}
	flags |= conn_flags;
	cp->flags = flags;
	cp->dest = dest;

	IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d "
		      "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
		      "dest->refcnt:%d\n",
		      ip_vs_proto_name(cp->protocol),
		      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
		      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
		      IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport),
		      ip_vs_fwd_tag(cp), cp->state,
		      cp->flags, refcount_read(&cp->refcnt),
		      refcount_read(&dest->refcnt));

	/* Update the connection counters */
	if (!(flags & IP_VS_CONN_F_TEMPLATE)) {
		/* It is a normal connection, so modify the counters
		 * according to the flags, later the protocol can
		 * update them on state change
		 */
		if (!(flags & IP_VS_CONN_F_INACTIVE))
			atomic_inc(&dest->activeconns);
		else
			atomic_inc(&dest->inactconns);
	} else {
		/* It is a persistent connection/template, so increase
		   the persistent connection counter */
		atomic_inc(&dest->persistconns);
	}

	/* Mark dest overloaded when its upper conn threshold is reached */
	if (dest->u_threshold != 0 &&
	    ip_vs_dest_totalconns(dest) >= dest->u_threshold)
		dest->flags |= IP_VS_DEST_F_OVERLOAD;
}
1060
1061
/*
 *	Check if there is a destination for the connection, if so
 *	bind the connection to the destination.
 *	Only called by the sync code for conns that arrived unbound.
 */
void ip_vs_try_bind_dest(struct ip_vs_conn *cp)
{
	struct ip_vs_dest *dest;

	rcu_read_lock();

	/* This function is only invoked by the synchronization code. We do
	 * not currently support heterogeneous pools with synchronization,
	 * so we can make the assumption that the svc_af is the same as the
	 * dest_af
	 */
	dest = ip_vs_find_dest(cp->ipvs, cp->af, cp->af, &cp->daddr,
			       cp->dport, &cp->vaddr, cp->vport,
			       cp->protocol, cp->fwmark, cp->flags);
	if (dest) {
		struct ip_vs_proto_data *pd;

		spin_lock_bh(&cp->lock);
		/* Lost the race: someone else already bound a dest */
		if (cp->dest) {
			spin_unlock_bh(&cp->lock);
			rcu_read_unlock();
			return;
		}

		/* Applications work depending on the forwarding method
		 * but better to reassign them always when binding dest */
		if (cp->app)
			ip_vs_unbind_app(cp);

		ip_vs_bind_dest(cp, dest);
		spin_unlock_bh(&cp->lock);

		/* Update its packet transmitter */
		cp->packet_xmit = NULL;
#ifdef CONFIG_IP_VS_IPV6
		if (cp->af == AF_INET6)
			ip_vs_bind_xmit_v6(cp);
		else
#endif
			ip_vs_bind_xmit(cp);

		/* Re-attach the protocol application helper, if any */
		pd = ip_vs_proto_data_get(cp->ipvs, cp->protocol);
		if (pd && atomic_read(&pd->appcnt))
			ip_vs_bind_app(cp, pd->pp);
	}
	rcu_read_unlock();
}
1113
1114
/*
 *	Unbind a connection entry with its VS destination
 *	Called by the ip_vs_conn_expire function.
 *	Reverses ip_vs_bind_dest: drops the counters, possibly clears the
 *	overload flag and releases the dest reference.
 */
static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp)
{
	struct ip_vs_dest *dest = cp->dest;

	if (!dest)
		return;

	IP_VS_DBG_BUF(7, "Unbind-dest %s c:%s:%d v:%s:%d "
		      "d:%s:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d "
		      "dest->refcnt:%d\n",
		      ip_vs_proto_name(cp->protocol),
		      IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
		      IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
		      IP_VS_DBG_ADDR(cp->daf, &cp->daddr), ntohs(cp->dport),
		      ip_vs_fwd_tag(cp), cp->state,
		      cp->flags, refcount_read(&cp->refcnt),
		      refcount_read(&dest->refcnt));

	/* Update the connection counters */
	if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) {
		/* It is a normal connection, so decrease the inactconns
		   or activeconns counter */
		if (cp->flags & IP_VS_CONN_F_INACTIVE) {
			atomic_dec(&dest->inactconns);
		} else {
			atomic_dec(&dest->activeconns);
		}
	} else {
		/* It is a persistent connection/template, so decrease
		   the persistent connection counter */
		atomic_dec(&dest->persistconns);
	}

	/* Leave overload state once connections drop below the lower
	 * threshold, or below 3/4 of the upper threshold when no lower
	 * threshold is configured.
	 */
	if (dest->l_threshold != 0) {
		if (ip_vs_dest_totalconns(dest) < dest->l_threshold)
			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
	} else if (dest->u_threshold != 0) {
		if (ip_vs_dest_totalconns(dest) * 4 < dest->u_threshold * 3)
			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
	} else {
		if (dest->flags & IP_VS_DEST_F_OVERLOAD)
			dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
	}

	ip_vs_dest_put(dest);
}
1165
/* Return non-zero when a template bound to a quiescent (weight 0) dest
 * should be expired, as controlled by the expire_quiescent_template
 * sysctl.  Always 0 without CONFIG_SYSCTL.
 */
static int expire_quiescent_template(struct netns_ipvs *ipvs,
				     struct ip_vs_dest *dest)
{
#ifdef CONFIG_SYSCTL
	if (!ipvs->sysctl_expire_quiescent_template)
		return 0;
	return atomic_read(&dest->weight) == 0;
#else
	return 0;
#endif
}
1176
/*
 *	Checking if the destination of a connection template is available.
 *	If available, return 1, otherwise invalidate this connection
 *	template and return 0.
 *	@cdest: when non-NULL, the template is also invalid if bound to a
 *	different dest than @cdest.
 */
int ip_vs_check_template(struct ip_vs_conn *ct, struct ip_vs_dest *cdest)
{
	struct ip_vs_dest *dest = ct->dest;
	struct netns_ipvs *ipvs = ct->ipvs;

	/*
	 * Checking the dest server status.
	 */
	if ((dest == NULL) ||
	    !(dest->flags & IP_VS_DEST_F_AVAILABLE) ||
	    expire_quiescent_template(ipvs, dest) ||
	    (cdest && (dest != cdest))) {
		IP_VS_DBG_BUF(9, "check_template: dest not available for "
			      "protocol %s s:%s:%d v:%s:%d "
			      "-> d:%s:%d\n",
			      ip_vs_proto_name(ct->protocol),
			      IP_VS_DBG_ADDR(ct->af, &ct->caddr),
			      ntohs(ct->cport),
			      IP_VS_DBG_ADDR(ct->af, &ct->vaddr),
			      ntohs(ct->vport),
			      IP_VS_DBG_ADDR(ct->daf, &ct->daddr),
			      ntohs(ct->dport));

		/* Invalidate the connection template. Prefer to avoid
		 * rehashing, it will move it as first in chain, so use
		 * only dport as indication, it is not a hash key.
		 */
		ct->dport = htons(0xffff);

		/*
		 * Simply decrease the refcnt of the template,
		 * don't restart its timer.
		 */
		__ip_vs_conn_put(ct);
		return 0;
	}
	return 1;
}
1220
ip_vs_conn_rcu_free(struct rcu_head * head)1221 static void ip_vs_conn_rcu_free(struct rcu_head *head)
1222 {
1223 struct ip_vs_conn *cp = container_of(head, struct ip_vs_conn,
1224 rcu_head);
1225
1226 ip_vs_pe_put(cp->pe);
1227 kfree(cp->pe_data);
1228 kmem_cache_free(ip_vs_conn_cachep, cp);
1229 }
1230
1231 /* Try to delete connection while not holding reference */
ip_vs_conn_del(struct ip_vs_conn * cp)1232 static void ip_vs_conn_del(struct ip_vs_conn *cp)
1233 {
1234 if (timer_delete(&cp->timer)) {
1235 /* Drop cp->control chain too */
1236 if (cp->control)
1237 cp->timeout = 0;
1238 ip_vs_conn_expire(&cp->timer);
1239 }
1240 }
1241
1242 /* Try to delete connection while holding reference */
ip_vs_conn_del_put(struct ip_vs_conn * cp)1243 static void ip_vs_conn_del_put(struct ip_vs_conn *cp)
1244 {
1245 if (timer_delete(&cp->timer)) {
1246 /* Drop cp->control chain too */
1247 if (cp->control)
1248 cp->timeout = 0;
1249 __ip_vs_conn_put(cp);
1250 ip_vs_conn_expire(&cp->timer);
1251 } else {
1252 __ip_vs_conn_put(cp);
1253 }
1254 }
1255
/* Timer handler: expire a connection entry.
 * Unlinks and frees the conn when unreferenced; otherwise re-arms the
 * timer for another 60 seconds.
 */
static void ip_vs_conn_expire(struct timer_list *t)
{
	struct ip_vs_conn *cp = timer_container_of(cp, t, timer);
	struct netns_ipvs *ipvs = cp->ipvs;

	/*
	 *	do I control anybody?
	 */
	if (atomic_read(&cp->n_control))
		goto expire_later;

	/* Unlink conn if not referenced anymore */
	if (likely(ip_vs_conn_unlink(cp))) {
		struct ip_vs_conn *ct = cp->control;

		/* delete the timer if it is activated by other users */
		timer_delete(&cp->timer);

		/* does anybody control me? */
		if (ct) {
			/* cp->timeout == 0 requests dropping the whole
			 * control chain, so grab a ref on ct to try that
			 */
			bool has_ref = !cp->timeout && __ip_vs_conn_get(ct);

			ip_vs_control_del(cp);
			/* Drop CTL or non-assured TPL if not used anymore */
			if (has_ref && !atomic_read(&ct->n_control) &&
			    (!(ct->flags & IP_VS_CONN_F_TEMPLATE) ||
			     !(ct->state & IP_VS_CTPL_S_ASSURED))) {
				IP_VS_DBG(4, "drop controlling connection\n");
				ip_vs_conn_del_put(ct);
			} else if (has_ref) {
				__ip_vs_conn_put(ct);
			}
		}

		if ((cp->flags & IP_VS_CONN_F_NFCT) &&
		    !(cp->flags & IP_VS_CONN_F_ONE_PACKET)) {
			/* Do not access conntracks during subsys cleanup
			 * because nf_conntrack_find_get can not be used after
			 * conntrack cleanup for the net.
			 */
			smp_rmb();
			if (READ_ONCE(ipvs->enable))
				ip_vs_conn_drop_conntrack(cp);
		}

		if (unlikely(cp->app != NULL))
			ip_vs_unbind_app(cp);
		ip_vs_unbind_dest(cp);
		if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
			int af_id = ip_vs_af_index(cp->af);

			atomic_dec(&ipvs->no_cport_conns[af_id]);
		}
		/* ONE_PACKET conns are freed immediately, the others only
		 * after an RCU grace period (readers may still walk them)
		 */
		if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
			ip_vs_conn_rcu_free(&cp->rcu_head);
		else
			call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free);
		atomic_dec(&ipvs->conn_count);
		return;
	}

  expire_later:
	IP_VS_DBG(7, "delayed: conn->refcnt=%d conn->n_control=%d\n",
		  refcount_read(&cp->refcnt),
		  atomic_read(&cp->n_control));

	/* Still referenced or controlling other conns: retry later */
	refcount_inc(&cp->refcnt);
	cp->timeout = 60*HZ;

	if (ipvs->sync_state & IP_VS_STATE_MASTER)
		ip_vs_sync_conn(ipvs, cp, sysctl_sync_threshold(ipvs));

	__ip_vs_conn_put_timer(cp);
}
1330
1331 /* Modify timer, so that it expires as soon as possible.
1332 * Can be called without reference only if under RCU lock.
1333 * We can have such chain of conns linked with ->control: DATA->CTL->TPL
1334 * - DATA (eg. FTP) and TPL (persistence) can be present depending on setup
1335 * - cp->timeout=0 indicates all conns from chain should be dropped but
1336 * TPL is not dropped if in assured state
1337 */
ip_vs_conn_expire_now(struct ip_vs_conn * cp)1338 void ip_vs_conn_expire_now(struct ip_vs_conn *cp)
1339 {
1340 /* Using mod_timer_pending will ensure the timer is not
1341 * modified after the final timer_delete in ip_vs_conn_expire.
1342 */
1343 if (timer_pending(&cp->timer) &&
1344 time_after(cp->timer.expires, jiffies))
1345 mod_timer_pending(&cp->timer, jiffies);
1346 }
1347
1348
/*
 *	Create a new connection entry and hash it into the conn_tab.
 *	@p:	connection parameters (af, protocol, caddr/cport, vaddr/vport)
 *	@dest_af: address family of @daddr
 *	@daddr/@dport: real server address/port
 *	@flags:	IP_VS_CONN_F_* flags for the new conn
 *	@dest:	destination to bind, may be NULL
 *	@fwmark: firewall mark of the service, 0 if none
 *	Returns the new conn with one reference held, or NULL on OOM.
 */
struct ip_vs_conn *
ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af,
	       const union nf_inet_addr *daddr, __be16 dport, unsigned int flags,
	       struct ip_vs_dest *dest, __u32 fwmark)
{
	struct ip_vs_conn *cp;
	struct netns_ipvs *ipvs = p->ipvs;
	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->ipvs,
							   p->protocol);

	cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
	if (cp == NULL) {
		IP_VS_ERR_RL("%s(): no memory\n", __func__);
		return NULL;
	}

	/* Two hash nodes: dir 0 (original) and dir 1 (reply direction) */
	INIT_HLIST_BL_NODE(&cp->hn0.node);
	INIT_HLIST_BL_NODE(&cp->hn1.node);
	timer_setup(&cp->timer, ip_vs_conn_expire, 0);
	cp->ipvs	   = ipvs;
	cp->hn0.dir	   = 0;
	cp->af		   = p->af;
	cp->hn1.dir	   = 1;
	cp->daf		   = dest_af;
	cp->protocol	   = p->protocol;
	ip_vs_addr_set(p->af, &cp->caddr, p->caddr);
	cp->cport	   = p->cport;
	/* proto should only be IPPROTO_IP if p->vaddr is a fwmark */
	ip_vs_addr_set(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af,
		       &cp->vaddr, p->vaddr);
	cp->vport	   = p->vport;
	ip_vs_addr_set(cp->daf, &cp->daddr, daddr);
	cp->dport          = dport;
	cp->flags	   = flags;
	cp->fwmark         = fwmark;
	/* Persistence engine data is only attached to templates */
	if (flags & IP_VS_CONN_F_TEMPLATE && p->pe) {
		ip_vs_pe_get(p->pe);
		cp->pe = p->pe;
		cp->pe_data = p->pe_data;
		cp->pe_data_len = p->pe_data_len;
	} else {
		cp->pe = NULL;
		cp->pe_data = NULL;
		cp->pe_data_len = 0;
	}
	spin_lock_init(&cp->lock);

	/*
	 * Set the entry is referenced by the current thread before hashing
	 * it in the table, so that other thread run ip_vs_random_dropentry
	 * but cannot drop this entry.
	 */
	refcount_set(&cp->refcnt, 1);

	cp->control = NULL;
	atomic_set(&cp->n_control, 0);
	atomic_set(&cp->in_pkts, 0);

	cp->packet_xmit = NULL;
	cp->app = NULL;
	cp->app_data = NULL;
	/* reset struct ip_vs_seq */
	cp->in_seq.delta = 0;
	cp->out_seq.delta = 0;

	atomic_inc(&ipvs->conn_count);
	/* Track conns created with unknown client port per family */
	if (unlikely(flags & IP_VS_CONN_F_NO_CPORT)) {
		int af_id = ip_vs_af_index(cp->af);

		atomic_inc(&ipvs->no_cport_conns[af_id]);
	}

	/* Bind the connection with a destination server */
	cp->dest = NULL;
	ip_vs_bind_dest(cp, dest);

	/* Set its state and timeout */
	cp->state = 0;
	cp->old_state = 0;
	cp->timeout = 3*HZ;
	cp->sync_endtime = jiffies & ~3UL;

	/* Bind its packet transmitter */
#ifdef CONFIG_IP_VS_IPV6
	if (p->af == AF_INET6)
		ip_vs_bind_xmit_v6(cp);
	else
#endif
		ip_vs_bind_xmit(cp);

	if (unlikely(pd && atomic_read(&pd->appcnt)))
		ip_vs_bind_app(cp, pd->pp);

	/*
	 * Allow conntrack to be preserved. By default, conntrack
	 * is created and destroyed for every packet.
	 * Sometimes keeping conntrack can be useful for
	 * IP_VS_CONN_F_ONE_PACKET too.
	 */

	if (ip_vs_conntrack_enabled(ipvs))
		cp->flags |= IP_VS_CONN_F_NFCT;

	/* Hash it in the conn_tab finally */
	ip_vs_conn_hash(cp);

	return cp;
}
1460
1461 /*
1462 * /proc/net/ip_vs_conn entries
1463 */
1464 #ifdef CONFIG_PROC_FS
/* Cursor state for the /proc/net/ip_vs_conn[_sync] seq_file walks */
struct ip_vs_iter_state {
	struct seq_net_private	p;
	struct ip_vs_rht	*t;		/* conn_tab snapshot being walked */
	int			gen;		/* conn_tab_changes generation at start */
	u32			bucket;		/* current hash bucket index */
	unsigned int		skip_elems;	/* entries already emitted in bucket */
};
1472
/* Find the next dir-0 hash node at or after the iterator's cursor
 * (bucket + skip_elems).  Returns NULL at end of table or when a table
 * resize invalidated the walk.  Must be called under RCU read lock.
 */
static void *ip_vs_conn_array(struct seq_file *seq)
{
	struct ip_vs_iter_state *iter = seq->private;
	struct net *net = seq_file_net(seq);
	struct netns_ipvs *ipvs = net_ipvs(net);
	struct ip_vs_rht *t = iter->t;
	struct ip_vs_conn_hnode *hn;
	struct hlist_bl_node *e;
	int idx;

	if (!t)
		return NULL;
	for (idx = iter->bucket; idx < t->size; idx++) {
		unsigned int skip = 0;

		hlist_bl_for_each_entry_rcu(hn, e, &t->buckets[idx], node) {
			/* __ip_vs_conn_get() is not needed by
			 * ip_vs_conn_seq_show and ip_vs_conn_sync_seq_show
			 */
			if (!ip_vs_rht_same_table(t, READ_ONCE(hn->hash_key)))
				break;
			/* walk only the original-direction nodes */
			if (hn->dir != 0)
				continue;
			if (skip >= iter->skip_elems) {
				iter->bucket = idx;
				return hn;
			}

			++skip;
		}

		if (!(idx & 31)) {
			cond_resched_rcu();
			/* New table installed ? */
			if (iter->gen != atomic_read(&ipvs->conn_tab_changes))
				break;
		}
		iter->skip_elems = 0;
	}

	iter->bucket = idx;
	return NULL;
}
1516
/* seq_file start: enter RCU, snapshot the current conn_tab and its
 * generation, then position the cursor at *pos.
 */
static void *ip_vs_conn_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct ip_vs_iter_state *iter = seq->private;
	struct net *net = seq_file_net(seq);
	struct netns_ipvs *ipvs = net_ipvs(net);

	rcu_read_lock();
	iter->gen = atomic_read(&ipvs->conn_tab_changes);
	smp_rmb();	/* ipvs->conn_tab and conn_tab_changes */
	iter->t = rcu_dereference(ipvs->conn_tab);
	if (*pos == 0) {
		/* first call: emit the header line */
		iter->skip_elems = 0;
		iter->bucket = 0;
		return SEQ_START_TOKEN;
	}

	return ip_vs_conn_array(seq);
}
1536
/* seq_file next: continue on the current hash chain, falling back to
 * ip_vs_conn_array() to advance to the following bucket.
 */
static void *ip_vs_conn_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct ip_vs_iter_state *iter = seq->private;
	struct ip_vs_conn_hnode *hn = v;
	struct hlist_bl_node *e;
	struct ip_vs_rht *t;

	++*pos;
	if (v == SEQ_START_TOKEN)
		return ip_vs_conn_array(seq);

	t = iter->t;
	if (!t)
		return NULL;

	/* more on same hash chain? */
	hlist_bl_for_each_entry_continue_rcu(hn, e, node) {
		/* Our cursor was moved to new table ? */
		if (!ip_vs_rht_same_table(t, READ_ONCE(hn->hash_key)))
			break;
		if (hn->dir != 0)
			continue;
		iter->skip_elems++;
		return hn;
	}

	/* chain exhausted (or moved): advance to next bucket */
	iter->skip_elems = 0;
	iter->bucket++;

	return ip_vs_conn_array(seq);
}
1568
/* seq_file stop: leave the RCU read-side section entered in start */
static void ip_vs_conn_seq_stop(struct seq_file *seq, void *v)
	__releases(RCU)
{
	rcu_read_unlock();
}
1574
/* Format one /proc/net/ip_vs_conn line (or the header for the
 * SEQ_START_TOKEN).  Optionally appends persistence-engine name/data.
 */
static int ip_vs_conn_seq_show(struct seq_file *seq, void *v)
{

	if (v == SEQ_START_TOKEN)
		seq_puts(seq,
   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Expires PEName PEData\n");
	else {
		struct ip_vs_conn_hnode *hn = v;
		const struct ip_vs_conn *cp = ip_vs_hn0_to_conn(hn);
		/* " " + pe->name + " " + pe_data + '\0' */
		char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3];
		size_t len = 0;
		char dbuf[IP_VS_ADDRSTRLEN];

		if (cp->pe_data) {
			pe_data[0] = ' ';
			len = strlen(cp->pe->name);
			memcpy(pe_data + 1, cp->pe->name, len);
			pe_data[len + 1] = ' ';
			len += 2;
			len += cp->pe->show_pe_data(cp, pe_data + len);
		}
		pe_data[len] = '\0';

		/* dest address uses its own family (daf) */
#ifdef CONFIG_IP_VS_IPV6
		if (cp->daf == AF_INET6)
			snprintf(dbuf, sizeof(dbuf), "%pI6", &cp->daddr.in6);
		else
#endif
			snprintf(dbuf, sizeof(dbuf), "%08X",
				 ntohl(cp->daddr.ip));

#ifdef CONFIG_IP_VS_IPV6
		if (cp->af == AF_INET6)
			seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X "
				"%s %04X %-11s %7u%s\n",
				ip_vs_proto_name(cp->protocol),
				&cp->caddr.in6, ntohs(cp->cport),
				&cp->vaddr.in6, ntohs(cp->vport),
				dbuf, ntohs(cp->dport),
				ip_vs_state_name(cp),
				jiffies_delta_to_msecs(cp->timer.expires -
						       jiffies) / 1000,
				pe_data);
		else
#endif
			seq_printf(seq,
				"%-3s %08X %04X %08X %04X"
				" %s %04X %-11s %7u%s\n",
				ip_vs_proto_name(cp->protocol),
				ntohl(cp->caddr.ip), ntohs(cp->cport),
				ntohl(cp->vaddr.ip), ntohs(cp->vport),
				dbuf, ntohs(cp->dport),
				ip_vs_state_name(cp),
				jiffies_delta_to_msecs(cp->timer.expires -
						       jiffies) / 1000,
				pe_data);
	}
	return 0;
}
1634
/* seq_file operations for /proc/net/ip_vs_conn */
static const struct seq_operations ip_vs_conn_seq_ops = {
	.start = ip_vs_conn_seq_start,
	.next  = ip_vs_conn_seq_next,
	.stop  = ip_vs_conn_seq_stop,
	.show  = ip_vs_conn_seq_show,
};
1641
ip_vs_origin_name(unsigned int flags)1642 static const char *ip_vs_origin_name(unsigned int flags)
1643 {
1644 if (flags & IP_VS_CONN_F_SYNC)
1645 return "SYNC";
1646 else
1647 return "LOCAL";
1648 }
1649
/* Format one /proc/net/ip_vs_conn_sync line (or the header for the
 * SEQ_START_TOKEN) including the connection origin (SYNC/LOCAL).
 */
static int ip_vs_conn_sync_seq_show(struct seq_file *seq, void *v)
{
	char dbuf[IP_VS_ADDRSTRLEN];

	if (v == SEQ_START_TOKEN)
		seq_puts(seq,
   "Pro FromIP   FPrt ToIP     TPrt DestIP   DPrt State       Origin Expires\n");
	else {
		const struct ip_vs_conn *cp = v;

		/* dest address uses its own family (daf) */
#ifdef CONFIG_IP_VS_IPV6
		if (cp->daf == AF_INET6)
			snprintf(dbuf, sizeof(dbuf), "%pI6", &cp->daddr.in6);
		else
#endif
			snprintf(dbuf, sizeof(dbuf), "%08X",
				 ntohl(cp->daddr.ip));

#ifdef CONFIG_IP_VS_IPV6
		if (cp->af == AF_INET6)
			seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X "
				"%s %04X %-11s %-6s %7u\n",
				ip_vs_proto_name(cp->protocol),
				&cp->caddr.in6, ntohs(cp->cport),
				&cp->vaddr.in6, ntohs(cp->vport),
				dbuf, ntohs(cp->dport),
				ip_vs_state_name(cp),
				ip_vs_origin_name(cp->flags),
				jiffies_delta_to_msecs(cp->timer.expires -
						       jiffies) / 1000);
		else
#endif
			seq_printf(seq,
				"%-3s %08X %04X %08X %04X "
				"%s %04X %-11s %-6s %7u\n",
				ip_vs_proto_name(cp->protocol),
				ntohl(cp->caddr.ip), ntohs(cp->cport),
				ntohl(cp->vaddr.ip), ntohs(cp->vport),
				dbuf, ntohs(cp->dport),
				ip_vs_state_name(cp),
				ip_vs_origin_name(cp->flags),
				jiffies_delta_to_msecs(cp->timer.expires -
						       jiffies) / 1000);
	}
	return 0;
}
1696
/* seq_file operations for /proc/net/ip_vs_conn_sync */
static const struct seq_operations ip_vs_conn_sync_seq_ops = {
	.start = ip_vs_conn_seq_start,
	.next  = ip_vs_conn_seq_next,
	.stop  = ip_vs_conn_seq_stop,
	.show  = ip_vs_conn_sync_seq_show,
};
1703 #endif
1704
1705 #ifdef CONFIG_SYSCTL
1706
/* Randomly drop connection entries before running out of memory
 * Can be used for DATA and CTL conns. For TPL conns there are exceptions:
 * - traffic for services in OPS mode increases ct->in_pkts, so it is supported
 * - traffic for services not in OPS mode does not increase ct->in_pkts in
 *   all cases, so it is not supported
 * Returns 1 when the entry should be dropped, 0 otherwise.
 */
static inline int todrop_entry(struct ip_vs_conn *cp)
{
	struct netns_ipvs *ipvs = cp->ipvs;
	int i;

	/* if the conn entry hasn't lasted for 60 seconds, don't drop it.
	   This will leave enough time for normal connection to get
	   through. */
	if (time_before(cp->timeout + jiffies, cp->timer.expires + 60*HZ))
		return 0;

	/* Drop only conns with number of incoming packets in [1..8] range */
	i = atomic_read(&cp->in_pkts);
	if (i > 8 || i < 1)
		return 0;

	/* Per-packet-count countdown: dropping every (i+1)-th candidate,
	 * so conns with fewer incoming packets are dropped more often
	 */
	i--;
	if (--ipvs->dropentry_counters[i] > 0)
		return 0;

	/* Prefer to drop conns with less number of incoming packets */
	ipvs->dropentry_counters[i] = i + 1;
	return 1;
}
1737
ip_vs_conn_ops_mode(struct ip_vs_conn * cp)1738 static inline bool ip_vs_conn_ops_mode(struct ip_vs_conn *cp)
1739 {
1740 struct ip_vs_service *svc;
1741
1742 if (!cp->dest)
1743 return false;
1744 svc = rcu_dereference(cp->dest->svc);
1745 return svc && (svc->flags & IP_VS_SVC_F_ONEPACKET);
1746 }
1747
/* Scan a random 1/32 slice of the conn_tab and drop droppable entries
 * (see todrop_entry); called periodically under memory pressure.
 */
void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
{
	struct ip_vs_conn_hnode *hn;
	struct hlist_bl_node *e;
	struct ip_vs_conn *cp;
	struct ip_vs_rht *t;
	unsigned int r;
	int idx;

	r = get_random_u32();
	rcu_read_lock();
	t = rcu_dereference(ipvs->conn_tab);
	if (!t)
		goto out;
	/*
	 * Randomly scan 1/32 of the whole table every second
	 */
	for (idx = 0; idx < (t->size >> 5); idx++) {
		unsigned int hash = (r + idx) & t->mask;

		/* Don't care if due to moved entry we jump to another bucket
		 * and even to new table
		 */
		hlist_bl_for_each_entry_rcu(hn, e, &t->buckets[hash], node) {
			if (hn->dir != 0)
				continue;
			cp = ip_vs_hn0_to_conn(hn);
			/* never drop conns that control others */
			if (atomic_read(&cp->n_control))
				continue;
			if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
				/* connection template of OPS */
				if (ip_vs_conn_ops_mode(cp))
					goto try_drop;
				/* non-assured templates are cheap to drop */
				if (!(cp->state & IP_VS_CTPL_S_ASSURED))
					goto drop;
				continue;
			}
			if (cp->protocol == IPPROTO_TCP) {
				switch(cp->state) {
				case IP_VS_TCP_S_SYN_RECV:
				case IP_VS_TCP_S_SYNACK:
					/* half-open: always droppable */
					break;

				case IP_VS_TCP_S_ESTABLISHED:
					if (todrop_entry(cp))
						break;
					continue;

				default:
					continue;
				}
			} else if (cp->protocol == IPPROTO_SCTP) {
				switch (cp->state) {
				case IP_VS_SCTP_S_INIT1:
				case IP_VS_SCTP_S_INIT:
					break;
				case IP_VS_SCTP_S_ESTABLISHED:
					if (todrop_entry(cp))
						break;
					continue;
				default:
					continue;
				}
			} else {
try_drop:
				if (!todrop_entry(cp))
					continue;
			}

drop:
			IP_VS_DBG(4, "drop connection\n");
			ip_vs_conn_del(cp);
		}
		if (!(idx & 31)) {
			cond_resched_rcu();
			/* re-read: the table may have been replaced */
			t = rcu_dereference(ipvs->conn_tab);
			if (!t)
				goto out;
		}
	}

out:
	rcu_read_unlock();
}
1832 #endif
1833
/* Flush all the connection entries in the conn_tab and release the
 * hash table(s).  Called on netns cleanup; also stops the resize work.
 */
static void ip_vs_conn_flush(struct netns_ipvs *ipvs)
{
	DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU();
	struct ip_vs_conn *cp, *cp_c;
	struct ip_vs_conn_hnode *hn;
	struct hlist_bl_head *head;
	struct ip_vs_rht *t, *p;
	struct hlist_bl_node *e;

	if (!rcu_dereference_protected(ipvs->conn_tab, 1))
		return;
	/* no resizes may run while we drop and free the tables */
	disable_delayed_work_sync(&ipvs->conn_resize_work);
	if (!atomic_read(&ipvs->conn_count))
		goto unreg;

flush_again:
	/* Rely on RCU grace period while accessing cp after ip_vs_conn_del */
	rcu_read_lock();
	ip_vs_rht_walk_buckets_safe_rcu(ipvs->conn_tab, head) {
		hlist_bl_for_each_entry_rcu(hn, e, head, node) {
			if (hn->dir != 0)
				continue;
			cp = ip_vs_hn0_to_conn(hn);
			/* controlling conns are dropped via their controlled
			 * conns below
			 */
			if (atomic_read(&cp->n_control))
				continue;
			cp_c = cp->control;
			IP_VS_DBG(4, "del connection\n");
			ip_vs_conn_del(cp);
			if (cp_c && !atomic_read(&cp_c->n_control)) {
				IP_VS_DBG(4, "del controlling connection\n");
				ip_vs_conn_del(cp_c);
			}
		}
		cond_resched_rcu();
	}
	rcu_read_unlock();

	/* the counter may be not NULL, because maybe some conn entries
	   are run by slow timer handler or unhashed but still referred */
	if (atomic_read(&ipvs->conn_count) != 0) {
		schedule();
		goto flush_again;
	}

unreg:
	/* Unregister the hash table and release it after RCU grace period.
	 * This is needed because other works may not be stopped yet and
	 * they may walk the tables.
	 */
	t = rcu_dereference_protected(ipvs->conn_tab, 1);
	rcu_assign_pointer(ipvs->conn_tab, NULL);
	/* Inform readers that conn_tab is changed */
	smp_mb__before_atomic();
	atomic_inc(&ipvs->conn_tab_changes);
	/* free the whole new_tbl chain left by resizes */
	while (1) {
		p = rcu_dereference_protected(t->new_tbl, 1);
		call_rcu(&t->rcu_head, ip_vs_rht_rcu_free);
		if (p == t)
			break;
		t = p;
	}
}
1897
1898 #ifdef CONFIG_SYSCTL
/* Drop all connections bound to unavailable destinations; used by the
 * expire_nodest_conn sysctl machinery.  Restarts the walk if the hash
 * table is replaced while scanning.
 */
void ip_vs_expire_nodest_conn_flush(struct netns_ipvs *ipvs)
{
	DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU();
	unsigned int resched_score = 0;
	struct ip_vs_conn *cp, *cp_c;
	struct ip_vs_conn_hnode *hn;
	struct hlist_bl_head *head;
	struct ip_vs_dest *dest;
	struct hlist_bl_node *e;
	int old_gen, new_gen;

	if (!atomic_read(&ipvs->conn_count))
		return;
	old_gen = atomic_read(&ipvs->conn_tab_changes);
	rcu_read_lock();

repeat:
	smp_rmb();	/* ipvs->conn_tab and conn_tab_changes */
	ip_vs_rht_walk_buckets_rcu(ipvs->conn_tab, head) {
		hlist_bl_for_each_entry_rcu(hn, e, head, node) {
			if (hn->dir != 0)
				continue;
			cp = ip_vs_hn0_to_conn(hn);
			resched_score++;
			/* keep conns with no dest or an available dest */
			dest = cp->dest;
			if (!dest || (dest->flags & IP_VS_DEST_F_AVAILABLE))
				continue;

			if (atomic_read(&cp->n_control))
				continue;

			cp_c = cp->control;
			IP_VS_DBG(4, "del connection\n");
			ip_vs_conn_del(cp);
			if (cp_c && !atomic_read(&cp_c->n_control)) {
				IP_VS_DBG(4, "del controlling connection\n");
				ip_vs_conn_del(cp_c);
			}
			/* deletions are costlier, resched sooner */
			resched_score += 10;
		}
		resched_score++;
		if (resched_score >= 100) {
			resched_score = 0;
			cond_resched_rcu();
			/* netns clean up started, abort delayed work */
			if (!READ_ONCE(ipvs->enable))
				goto out;
			new_gen = atomic_read(&ipvs->conn_tab_changes);
			/* New table installed ? */
			if (old_gen != new_gen) {
				old_gen = new_gen;
				goto repeat;
			}
		}
	}

out:
	rcu_read_unlock();
}
1958 #endif
1959
/*
 *	per netns init and exit
 */
/* Initialize per-netns connection state and the /proc entries.
 * Returns 0 on success or -ENOMEM when a proc entry cannot be created.
 */
int __net_init ip_vs_conn_net_init(struct netns_ipvs *ipvs)
{
	int idx;

	atomic_set(&ipvs->conn_count, 0);
	for (idx = 0; idx < IP_VS_AF_MAX; idx++)
		atomic_set(&ipvs->no_cport_conns[idx], 0);
	INIT_DELAYED_WORK(&ipvs->conn_resize_work, conn_resize_work_handler);
	/* the hash table is allocated lazily (by the resize work),
	 * presumably on first use — TODO confirm against callers
	 */
	RCU_INIT_POINTER(ipvs->conn_tab, NULL);
	atomic_set(&ipvs->conn_tab_changes, 0);
	ipvs->sysctl_conn_lfactor = ip_vs_conn_default_load_factor(ipvs);

#ifdef CONFIG_PROC_FS
	if (!proc_create_net("ip_vs_conn", 0, ipvs->net->proc_net,
			     &ip_vs_conn_seq_ops,
			     sizeof(struct ip_vs_iter_state)))
		goto err_conn;

	if (!proc_create_net("ip_vs_conn_sync", 0, ipvs->net->proc_net,
			     &ip_vs_conn_sync_seq_ops,
			     sizeof(struct ip_vs_iter_state)))
		goto err_conn_sync;
#endif

	return 0;

#ifdef CONFIG_PROC_FS
err_conn_sync:
	remove_proc_entry("ip_vs_conn", ipvs->net->proc_net);
err_conn:
	return -ENOMEM;
#endif
}
1996
/* Tear down per-netns connection state: flush all conns (and free the
 * hash tables) before removing the /proc entries that walk them.
 */
void __net_exit ip_vs_conn_net_cleanup(struct netns_ipvs *ipvs)
{
	/* flush all the connection entries first */
	ip_vs_conn_flush(ipvs);
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ip_vs_conn", ipvs->net->proc_net);
	remove_proc_entry("ip_vs_conn_sync", ipvs->net->proc_net);
#endif
}
2006
/* Module init: clamp the configured table size to available memory and
 * create the connection slab cache.  Returns 0 or -ENOMEM.
 */
int __init ip_vs_conn_init(void)
{
	int min = IP_VS_CONN_TAB_MIN_BITS;
	int max = IP_VS_CONN_TAB_MAX_BITS;
	size_t tab_array_size;
	int max_avail;

	/* cap table bits by a fraction of total RAM */
	max_avail = order_base_2(totalram_pages()) + PAGE_SHIFT;
	/* 64-bit: 27 bits at 64GB, 32-bit: 20 bits at 512MB */
	max_avail += 1;		/* hash table loaded at 50% */
	max_avail -= 1;		/* IPVS up to 1/2 of mem */
	max_avail -= order_base_2(sizeof(struct ip_vs_conn));
	max = clamp(max_avail, min, max);
	ip_vs_conn_tab_bits = clamp(ip_vs_conn_tab_bits, min, max);
	ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits;

	/*
	 * Allocate the connection hash table and initialize its list heads
	 */
	/* NOTE(review): tab_array_size is only used for the info message
	 * here; the table itself appears to be allocated per-netns — confirm
	 */
	tab_array_size = array_size(ip_vs_conn_tab_size,
				    sizeof(struct hlist_bl_head));

	/* Allocate ip_vs_conn slab cache */
	ip_vs_conn_cachep = KMEM_CACHE(ip_vs_conn, SLAB_HWCACHE_ALIGN);
	if (!ip_vs_conn_cachep)
		return -ENOMEM;

	pr_info("Connection hash table configured (size=%d, memory=%zdKbytes)\n",
		ip_vs_conn_tab_size, tab_array_size / 1024);
	IP_VS_DBG(0, "Each connection entry needs %zd bytes at least\n",
		  sizeof(struct ip_vs_conn));

	return 0;
}
2041
/* Module exit: wait for pending RCU frees, then destroy the slab cache. */
void ip_vs_conn_cleanup(void)
{
	/* Wait all ip_vs_conn_rcu_free() callbacks to complete */
	rcu_barrier();
	/* Release the empty cache */
	kmem_cache_destroy(ip_vs_conn_cachep);
}
2049