1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * IPVS An implementation of the IP virtual server support for the
4 * LINUX operating system. IPVS is now implemented as a module
5 * over the NetFilter framework. IPVS can be used to build a
6 * high-performance and highly available server based on a
7 * cluster of servers.
8 *
9 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
10 * Peter Kese <peter.kese@ijs.si>
11 * Julian Anastasov <ja@ssi.bg>
12 *
13 * Changes:
14 */
15
16 #define pr_fmt(fmt) "IPVS: " fmt
17
18 #include <linux/module.h>
19 #include <linux/init.h>
20 #include <linux/types.h>
21 #include <linux/capability.h>
22 #include <linux/fs.h>
23 #include <linux/sysctl.h>
24 #include <linux/proc_fs.h>
25 #include <linux/workqueue.h>
26 #include <linux/seq_file.h>
27 #include <linux/slab.h>
28
29 #include <linux/netfilter.h>
30 #include <linux/netfilter_ipv4.h>
31 #include <linux/mutex.h>
32 #include <linux/rcupdate_wait.h>
33
34 #include <net/net_namespace.h>
35 #include <linux/nsproxy.h>
36 #include <net/ip.h>
37 #ifdef CONFIG_IP_VS_IPV6
38 #include <net/ipv6.h>
39 #include <net/ip6_route.h>
40 #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
41 #endif
42 #include <net/route.h>
43 #include <net/sock.h>
44 #include <net/genetlink.h>
45
46 #include <linux/uaccess.h>
47
48 #include <net/ip_vs.h>
49
50 MODULE_ALIAS_GENL_FAMILY(IPVS_GENL_NAME);
51
52 static struct lock_class_key __ipvs_service_key;
53
54 /* sysctl variables */
55
56 #ifdef CONFIG_IP_VS_DEBUG
57 static int sysctl_ip_vs_debug_level = 0;
58
59 int ip_vs_get_debug_level(void)
60 {
61 return sysctl_ip_vs_debug_level;
62 }
63 #endif
64
65
66 /* Protos */
67 static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup);
68
69
70 #ifdef CONFIG_IP_VS_IPV6
71 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
72 static bool __ip_vs_addr_is_local_v6(struct net *net,
73 const struct in6_addr *addr)
74 {
75 struct flowi6 fl6 = {
76 .daddr = *addr,
77 };
78 struct dst_entry *dst = ip6_route_output(net, NULL, &fl6);
79 bool is_local;
80
81 is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK);
82
83 dst_release(dst);
84 return is_local;
85 }
86 #endif
87
88 #ifdef CONFIG_SYSCTL
89 /*
90 * update_defense_level is called from keventd and from sysctl,
91 * so it needs to protect itself from softirqs
92 */
93 static void update_defense_level(struct netns_ipvs *ipvs)
94 {
95 struct sysinfo i;
96 int availmem;
97 int amemthresh;
98 int nomem;
99 int to_change = -1;
100
101 /* we only count free and buffered memory (in pages) */
102 si_meminfo(&i);
103 availmem = i.freeram + i.bufferram;
104 /* however, in Linux 2.5 i.bufferram is the total page cache size,
105    so we need to adjust it */
106 /* si_swapinfo(&i); */
107 /* availmem = availmem - (i.totalswap - i.freeswap); */
108
109 amemthresh = max(READ_ONCE(ipvs->sysctl_amemthresh), 0);
110 nomem = (availmem < amemthresh);
111
112 local_bh_disable();
113
114 /* drop_entry */
115 spin_lock(&ipvs->dropentry_lock);
116 switch (ipvs->sysctl_drop_entry) {
117 case 0:
118 atomic_set(&ipvs->dropentry, 0);
119 break;
120 case 1:
121 if (nomem) {
122 atomic_set(&ipvs->dropentry, 1);
123 ipvs->sysctl_drop_entry = 2;
124 } else {
125 atomic_set(&ipvs->dropentry, 0);
126 }
127 break;
128 case 2:
129 if (nomem) {
130 atomic_set(&ipvs->dropentry, 1);
131 } else {
132 atomic_set(&ipvs->dropentry, 0);
133 ipvs->sysctl_drop_entry = 1;
134 }
135 break;
136 case 3:
137 atomic_set(&ipvs->dropentry, 1);
138 break;
139 }
140 spin_unlock(&ipvs->dropentry_lock);
141
142 /* drop_packet */
143 spin_lock(&ipvs->droppacket_lock);
144 switch (ipvs->sysctl_drop_packet) {
145 case 0:
146 ipvs->drop_rate = 0;
147 break;
148 case 1:
149 if (nomem) {
150 ipvs->drop_counter = amemthresh / (amemthresh - availmem);
151 ipvs->drop_rate = ipvs->drop_counter;
152 ipvs->sysctl_drop_packet = 2;
153 } else {
154 ipvs->drop_rate = 0;
155 }
156 break;
157 case 2:
158 if (nomem) {
159 ipvs->drop_counter = amemthresh / (amemthresh - availmem);
160 ipvs->drop_rate = ipvs->drop_counter;
161 } else {
162 ipvs->drop_rate = 0;
163 ipvs->sysctl_drop_packet = 1;
164 }
165 break;
166 case 3:
167 ipvs->drop_rate = ipvs->sysctl_am_droprate;
168 break;
169 }
170 spin_unlock(&ipvs->droppacket_lock);
171
172 /* secure_tcp */
173 spin_lock(&ipvs->securetcp_lock);
174 switch (ipvs->sysctl_secure_tcp) {
175 case 0:
176 if (ipvs->old_secure_tcp >= 2)
177 to_change = 0;
178 break;
179 case 1:
180 if (nomem) {
181 if (ipvs->old_secure_tcp < 2)
182 to_change = 1;
183 ipvs->sysctl_secure_tcp = 2;
184 } else {
185 if (ipvs->old_secure_tcp >= 2)
186 to_change = 0;
187 }
188 break;
189 case 2:
190 if (nomem) {
191 if (ipvs->old_secure_tcp < 2)
192 to_change = 1;
193 } else {
194 if (ipvs->old_secure_tcp >= 2)
195 to_change = 0;
196 ipvs->sysctl_secure_tcp = 1;
197 }
198 break;
199 case 3:
200 if (ipvs->old_secure_tcp < 2)
201 to_change = 1;
202 break;
203 }
204 ipvs->old_secure_tcp = ipvs->sysctl_secure_tcp;
205 if (to_change >= 0)
206 ip_vs_protocol_timeout_change(ipvs,
207 ipvs->sysctl_secure_tcp > 1);
208 spin_unlock(&ipvs->securetcp_lock);
209
210 local_bh_enable();
211 }
212
213 /* Handler for delayed work that expires connections with no
214 * destination
215 */
216 static void expire_nodest_conn_handler(struct work_struct *work)
217 {
218 struct netns_ipvs *ipvs;
219
220 ipvs = container_of(work, struct netns_ipvs,
221 expire_nodest_conn_work.work);
222 ip_vs_expire_nodest_conn_flush(ipvs);
223 }
224
225 /*
226 * Timer for checking the defense
227 */
228 #define DEFENSE_TIMER_PERIOD 1*HZ
229
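/* Periodic work: re-evaluate the defense level, randomly drop connection
 * entries while the drop_entry strategy is active, then re-arm itself.
 */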
230 static void defense_work_handler(struct work_struct *work)
231 {
232 struct netns_ipvs *ipvs =
233 container_of(work, struct netns_ipvs, defense_work.work);
234
235 update_defense_level(ipvs);
236 if (atomic_read(&ipvs->dropentry))
237 ip_vs_random_dropentry(ipvs);
238 queue_delayed_work(system_long_wq, &ipvs->defense_work,
239 DEFENSE_TIMER_PERIOD);
240 }
241 #endif
242
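/* Estimator reload work: stop kthreads when a new config generation is
 * detected and (re)start missing kthreads, rescheduling itself on failure.
 */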
243 static void est_reload_work_handler(struct work_struct *work)
244 {
245 struct netns_ipvs *ipvs =
246 container_of(work, struct netns_ipvs, est_reload_work.work);
247 int genid_done = atomic_read(&ipvs->est_genid_done);
248 unsigned long delay = HZ / 10; /* repeat startups after failure */
249 bool repeat = false;
250 int genid;
251 int id;
252
253 mutex_lock(&ipvs->est_mutex);
254 genid = atomic_read(&ipvs->est_genid);
255 for (id = 0; id < ipvs->est_kt_count; id++) {
256 struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id];
257
258 /* netns clean up started, abort delayed work */
259 if (!READ_ONCE(ipvs->enable))
260 goto unlock;
261 if (!kd)
262 continue;
263 /* New config ? Stop kthread tasks */
264 if (genid != genid_done)
265 ip_vs_est_kthread_stop(kd);
266 if (!kd->task && !ip_vs_est_stopped(ipvs)) {
267 /* Do not start kthreads above 0 in calc phase */
268 if ((!id || !ipvs->est_calc_phase) &&
269 ip_vs_est_kthread_start(ipvs, kd) < 0)
270 repeat = true;
271 }
272 }
273
274 atomic_set(&ipvs->est_genid_done, genid);
275
276 if (repeat)
277 queue_delayed_work(system_long_wq, &ipvs->est_reload_work,
278 delay);
279
280 unlock:
281 mutex_unlock(&ipvs->est_mutex);
282 }
283
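/* Return the current size of the connection table, or 0 if not allocated */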
284 static int get_conn_tab_size(struct netns_ipvs *ipvs)
285 {
286 const struct ip_vs_rht *t;
287 int size = 0;
288
289 rcu_read_lock();
290 t = rcu_dereference(ipvs->conn_tab);
291 if (t)
292 size = t->size;
293 rcu_read_unlock();
294
295 return size;
296 }
297
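/* Module usage: take/release a reference to the IPVS module so that it
 * cannot be unloaded while it is still in use.
 */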
298 int
299 ip_vs_use_count_inc(void)
300 {
301 return try_module_get(THIS_MODULE);
302 }
303
304 void
305 ip_vs_use_count_dec(void)
306 {
307 module_put(THIS_MODULE);
308 }
309
310
311 /* Service hashing:
312 * Operation Locking order
313 * ---------------------------------------------------------------------------
314 * add table service_mutex, svc_resize_sem(W)
315 * del table service_mutex
316 * move between tables svc_resize_sem(W), seqcount_t(W), bit lock
317 * add/del service service_mutex, bit lock
318 * find service RCU, seqcount_t(R)
319 * walk services(blocking) service_mutex, svc_resize_sem(R)
320 * walk services(non-blocking) RCU, seqcount_t(R)
321 *
322 * - new tables are linked/unlinked under service_mutex and svc_resize_sem
323 * - new table is linked on resizing and all operations can run in parallel
324 * in 2 tables until the new table is registered as current one
325 * - two contexts can modify buckets: config and table resize, both in
326 * process context
327 * - only table resizer can move entries, so we do not protect t->seqc[]
328 * items with t->lock[]
329 * - lookups occur under RCU lock and seqcount reader lock to detect if
330 * services are moved to new table
331 * - move operations may disturb readers: find operation will not miss entries
332 * but walkers may see same entry twice if they are forced to retry chains
333 * - walkers using cond_resched_rcu() on !PREEMPT_RCU may need to hold
334 * service_mutex to disallow new tables to be installed or to check
335 * svc_table_changes and repeat the RCU read section if new table is installed
336 */
337
338 /*
339 * Returns hash value for virtual service
340 */
341 static inline u32
342 ip_vs_svc_hashval(struct ip_vs_rht *t, int af, unsigned int proto,
343 const union nf_inet_addr *addr, __be16 port)
344 {
345 return ip_vs_rht_hash_linfo(t, af, addr, ntohs(port), proto);
346 }
347
348 /*
349 * Returns hash value of fwmark for virtual service lookup
350 */
351 static inline u32 ip_vs_svc_fwm_hashval(struct ip_vs_rht *t, int af,
352 __u32 fwmark)
353 {
354 return jhash_2words(fwmark, af, (u32)t->hash_key.key[0]);
355 }
356
357 /* Hashes a service in the svc_table by <proto,addr,port> or by fwmark */
358 static int ip_vs_svc_hash(struct ip_vs_service *svc)
359 {
360 struct netns_ipvs *ipvs = svc->ipvs;
361 struct hlist_bl_head *head;
362 struct ip_vs_rht *t;
363 u32 hash;
364
365 if (svc->flags & IP_VS_SVC_F_HASHED) {
366 pr_err("%s(): request for already hashed, called from %pS\n",
367 __func__, __builtin_return_address(0));
368 return 0;
369 }
370
371 /* increase its refcnt because it is referenced by the svc table */
372 atomic_inc(&svc->refcnt);
373
374 /* New entries go into recent table */
375 t = rcu_dereference_protected(ipvs->svc_table, 1);
376 t = rcu_dereference_protected(t->new_tbl, 1);
377
378 if (svc->fwmark == 0) {
379 /*
380 * Hash it by <protocol,addr,port>
381 */
382 hash = ip_vs_svc_hashval(t, svc->af, svc->protocol,
383 &svc->addr, svc->port);
384 } else {
385 /*
386 * Hash it by fwmark
387 */
388 hash = ip_vs_svc_fwm_hashval(t, svc->af, svc->fwmark);
389 }
390 head = t->buckets + (hash & t->mask);
391 hlist_bl_lock(head);
392 WRITE_ONCE(svc->hash_key, ip_vs_rht_build_hash_key(t, hash));
393 svc->flags |= IP_VS_SVC_F_HASHED;
394 hlist_bl_add_head_rcu(&svc->s_list, head);
395 hlist_bl_unlock(head);
396
397 return 1;
398 }
399
400
401 /*
402 * Unhashes a service from svc_table.
403 * Should be called with locked tables.
404 */
405 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
406 {
407 struct netns_ipvs *ipvs = svc->ipvs;
408 struct hlist_bl_head *head;
409 struct ip_vs_rht *t;
410 u32 hash_key2;
411 u32 hash_key;
412
413 if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
414 pr_err("%s(): request for unhash flagged, called from %pS\n",
415 __func__, __builtin_return_address(0));
416 return 0;
417 }
418
419 t = rcu_dereference_protected(ipvs->svc_table, 1);
420 hash_key = READ_ONCE(svc->hash_key);
421 /* We need to lock the bucket in the right table */
422 if (ip_vs_rht_same_table(t, hash_key)) {
423 head = t->buckets + (hash_key & t->mask);
424 hlist_bl_lock(head);
425 /* Ensure hash_key is read under lock */
426 hash_key2 = READ_ONCE(svc->hash_key);
427 /* Moved to new table ? */
428 if (hash_key != hash_key2) {
429 hlist_bl_unlock(head);
430 t = rcu_dereference_protected(t->new_tbl, 1);
431 head = t->buckets + (hash_key2 & t->mask);
432 hlist_bl_lock(head);
433 }
434 } else {
435 /* It is already moved to new table */
436 t = rcu_dereference_protected(t->new_tbl, 1);
437 head = t->buckets + (hash_key & t->mask);
438 hlist_bl_lock(head);
439 }
440 /* Remove it from svc_table */
441 hlist_bl_del_rcu(&svc->s_list);
442
443 svc->flags &= ~IP_VS_SVC_F_HASHED;
444 atomic_dec(&svc->refcnt);
445 hlist_bl_unlock(head);
446 return 1;
447 }
448
449
450 /*
451 * Get service by {netns, proto,addr,port} in the service table.
452 */
453 static inline struct ip_vs_service *
454 __ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol,
455 const union nf_inet_addr *vaddr, __be16 vport)
456 {
457 DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();
458 struct hlist_bl_head *head;
459 struct ip_vs_service *svc;
460 struct ip_vs_rht *t, *p;
461 struct hlist_bl_node *e;
462 u32 hash, hash_key;
463
464 ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, p) {
465 /* Check for "full" addressed entries */
466 hash = ip_vs_svc_hashval(t, af, protocol, vaddr, vport);
467
468 hash_key = ip_vs_rht_build_hash_key(t, hash);
469 ip_vs_rht_walk_bucket_rcu(t, hash_key, head) {
470 hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
471 if (READ_ONCE(svc->hash_key) == hash_key &&
472 svc->af == af &&
473 ip_vs_addr_equal(af, &svc->addr, vaddr) &&
474 svc->port == vport &&
475 svc->protocol == protocol && !svc->fwmark) {
476 /* HIT */
477 return svc;
478 }
479 }
480 }
481 }
482
483 return NULL;
484 }
485
486
487 /*
488 * Get service by {fwmark} in the service table.
489 */
490 static inline struct ip_vs_service *
491 __ip_vs_svc_fwm_find(struct netns_ipvs *ipvs, int af, __u32 fwmark)
492 {
493 DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();
494 struct hlist_bl_head *head;
495 struct ip_vs_service *svc;
496 struct ip_vs_rht *t, *p;
497 struct hlist_bl_node *e;
498 u32 hash, hash_key;
499
500 ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, p) {
501 /* Check for fwmark addressed entries */
502 hash = ip_vs_svc_fwm_hashval(t, af, fwmark);
503
504 hash_key = ip_vs_rht_build_hash_key(t, hash);
505 ip_vs_rht_walk_bucket_rcu(t, hash_key, head) {
506 hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
507 if (READ_ONCE(svc->hash_key) == hash_key &&
508 svc->fwmark == fwmark && svc->af == af) {
509 /* HIT */
510 return svc;
511 }
512 }
513 }
514 }
515
516 return NULL;
517 }
518
519 /* Find service, called under RCU lock */
520 struct ip_vs_service *
521 ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol,
522 const union nf_inet_addr *vaddr, __be16 vport)
523 {
524 struct ip_vs_service *svc = NULL;
525 int af_id = ip_vs_af_index(af);
526
527 /*
528 * Check the table hashed by fwmark first
529 */
530 if (fwmark && atomic_read(&ipvs->fwm_services[af_id])) {
531 svc = __ip_vs_svc_fwm_find(ipvs, af, fwmark);
532 if (svc)
533 goto out;
534 }
535
536 if (!atomic_read(&ipvs->nonfwm_services[af_id]))
537 goto out;
538
539 /*
540 * Check the table hashed by <protocol,addr,port>
541 * for "full" addressed entries
542 */
543 svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport);
544 if (svc)
545 goto out;
546
547 if (protocol == IPPROTO_TCP &&
548 atomic_read(&ipvs->ftpsvc_counter[af_id]) &&
549 (vport == FTPDATA || !inet_port_requires_bind_service(ipvs->net, ntohs(vport)))) {
550 /*
551 * Check if ftp service entry exists, the packet
552 * might belong to FTP data connections.
553 */
554 svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, FTPPORT);
555 if (svc)
556 goto out;
557 }
558
559 if (atomic_read(&ipvs->nullsvc_counter[af_id])) {
560 /*
561 * Check if the catch-all port (port zero) exists
562 */
563 svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, 0);
564 }
565
566 out:
567 IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
568 fwmark, ip_vs_proto_name(protocol),
569 IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
570 svc ? "hit" : "not hit");
571
572 return svc;
573 }
574
575 /* Return the number of registered services */
576 static int ip_vs_get_num_services(struct netns_ipvs *ipvs)
577 {
578 int ns = 0, ni = IP_VS_AF_MAX;
579
580 while (--ni >= 0)
581 ns += atomic_read(&ipvs->num_services[ni]);
582 return ns;
583 }
584
585 /* Get default load factor to map num_services/u_thresh to t->size */
586 static int ip_vs_svc_default_load_factor(struct netns_ipvs *ipvs)
587 {
588 int factor;
589
590 if (net_eq(ipvs->net, &init_net))
591 factor = -3; /* grow if load is above 12.5% */
592 else
593 factor = -2; /* grow if load is above 25% */
594 return factor;
595 }
596
597 /* Get the desired svc_table size */
598 static int ip_vs_svc_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t,
599 int lfactor)
600 {
601 return ip_vs_rht_desired_size(ipvs, t, ip_vs_get_num_services(ipvs),
602 lfactor, IP_VS_SVC_TAB_MIN_BITS,
603 IP_VS_SVC_TAB_MAX_BITS);
604 }
605
606 /* Allocate svc_table */
607 static struct ip_vs_rht *ip_vs_svc_table_alloc(struct netns_ipvs *ipvs,
608 int buckets, int lfactor)
609 {
610 struct ip_vs_rht *t;
611 int scounts, locks;
612
613 /* No frequent lookups to race with resizing, so use max of 64
614 * seqcounts. Only resizer moves entries, so use 0 locks.
615 */
616 scounts = clamp(buckets >> 4, 1, 64);
617 locks = 0;
618
619 t = ip_vs_rht_alloc(buckets, scounts, locks);
620 if (!t)
621 return NULL;
622 t->lfactor = lfactor;
623 ip_vs_rht_set_thresholds(t, t->size, lfactor, IP_VS_SVC_TAB_MIN_BITS,
624 IP_VS_SVC_TAB_MAX_BITS);
625 return t;
626 }
627
628 /* svc_table resizer work */
629 static void svc_resize_work_handler(struct work_struct *work)
630 {
631 struct hlist_bl_head *head, *head2;
632 struct ip_vs_rht *t_free = NULL;
633 unsigned int resched_score = 0;
634 struct hlist_bl_node *cn, *nn;
635 struct ip_vs_rht *t, *t_new;
636 struct ip_vs_service *svc;
637 struct netns_ipvs *ipvs;
638 bool more_work = true;
639 seqcount_t *sc;
640 int limit = 0;
641 int new_size;
642 int lfactor;
643 u32 bucket;
644
645 ipvs = container_of(work, struct netns_ipvs, svc_resize_work.work);
646
647 if (!down_write_trylock(&ipvs->svc_resize_sem))
648 goto out;
649 if (!mutex_trylock(&ipvs->service_mutex))
650 goto unlock_sem;
651 more_work = false;
652 clear_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags);
653 if (!READ_ONCE(ipvs->enable) ||
654 test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
655 goto unlock_m;
656 t = rcu_dereference_protected(ipvs->svc_table, 1);
657 /* Do nothing if table is removed */
658 if (!t)
659 goto unlock_m;
660 /* New table needs to be registered? BUG! */
661 if (t != rcu_dereference_protected(t->new_tbl, 1))
662 goto unlock_m;
663
664 lfactor = sysctl_svc_lfactor(ipvs);
665 /* Should we resize ? */
666 new_size = ip_vs_svc_desired_size(ipvs, t, lfactor);
667 if (new_size == t->size && lfactor == t->lfactor)
668 goto unlock_m;
669
670 t_new = ip_vs_svc_table_alloc(ipvs, new_size, lfactor);
671 if (!t_new) {
672 more_work = true;
673 goto unlock_m;
674 }
675 /* Flip the table_id */
676 t_new->table_id = t->table_id ^ IP_VS_RHT_TABLE_ID_MASK;
677
678 rcu_assign_pointer(t->new_tbl, t_new);
679 /* Allow add/del to new_tbl while moving from old table */
680 mutex_unlock(&ipvs->service_mutex);
681
682 ip_vs_rht_for_each_bucket(t, bucket, head) {
683 same_bucket:
684 if (++limit >= 16) {
685 if (!READ_ONCE(ipvs->enable) ||
686 test_bit(IP_VS_WORK_SVC_NORESIZE,
687 &ipvs->work_flags))
688 goto unlock_sem;
689 if (resched_score >= 100) {
690 resched_score = 0;
691 cond_resched();
692 }
693 limit = 0;
694 }
695 if (hlist_bl_empty(head)) {
696 resched_score++;
697 continue;
698 }
699 /* Preemption calls ahead... */
700 resched_score = 0;
701
702 sc = &t->seqc[bucket & t->seqc_mask];
703 /* seqcount_t usage considering PREEMPT_RT rules:
704 * - we are the only writer => preemption can be allowed
705 * - readers (SoftIRQ) => disable BHs
706 * - readers (processes) => preemption should be disabled
707 */
708 local_bh_disable();
709 preempt_disable_nested();
710 write_seqcount_begin(sc);
711 hlist_bl_lock(head);
712
713 hlist_bl_for_each_entry_safe(svc, cn, nn, head, s_list) {
714 u32 hash;
715
716 /* New hash for the new table */
717 if (svc->fwmark == 0) {
718 /* Hash it by <protocol,addr,port> */
719 hash = ip_vs_svc_hashval(t_new, svc->af,
720 svc->protocol,
721 &svc->addr, svc->port);
722 } else {
723 /* Hash it by fwmark */
724 hash = ip_vs_svc_fwm_hashval(t_new, svc->af,
725 svc->fwmark);
726 }
727 hlist_bl_del_rcu(&svc->s_list);
728 head2 = t_new->buckets + (hash & t_new->mask);
729
730 hlist_bl_lock(head2);
731 WRITE_ONCE(svc->hash_key,
732 ip_vs_rht_build_hash_key(t_new, hash));
733 /* t_new->seqc are not used at this stage, we race
734 * only with add/del, so only lock the bucket.
735 */
736 hlist_bl_add_head_rcu(&svc->s_list, head2);
737 hlist_bl_unlock(head2);
738 /* Too long chain? Do it in steps */
739 if (++limit >= 64)
740 break;
741 }
742
743 hlist_bl_unlock(head);
744 write_seqcount_end(sc);
745 preempt_enable_nested();
746 local_bh_enable();
747 if (limit >= 64)
748 goto same_bucket;
749 }
750
751 /* Tables can be switched only under service_mutex */
752 while (!mutex_trylock(&ipvs->service_mutex)) {
753 cond_resched();
754 if (!READ_ONCE(ipvs->enable) ||
755 test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
756 goto unlock_sem;
757 }
758 if (!READ_ONCE(ipvs->enable) ||
759 test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
760 goto unlock_m;
761
762 rcu_assign_pointer(ipvs->svc_table, t_new);
763 /* Inform readers that new table is installed */
764 smp_mb__before_atomic();
765 atomic_inc(&ipvs->svc_table_changes);
766 t_free = t;
767
768 unlock_m:
769 mutex_unlock(&ipvs->service_mutex);
770
771 unlock_sem:
772 up_write(&ipvs->svc_resize_sem);
773
774 if (t_free) {
775 /* RCU readers should not see more than two tables in a chain.
776 * To prevent a new table from being attached, wait here instead
777 * of freeing the old table in an RCU callback.
778 */
779 synchronize_rcu();
780 ip_vs_rht_free(t_free);
781 }
782
783 out:
784 if (!READ_ONCE(ipvs->enable) || !more_work ||
785 test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
786 return;
787 queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, 1);
788 }
789
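/* Bind a dest to its service and take a reference to the service */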
790 static inline void
791 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
792 {
793 atomic_inc(&svc->refcnt);
794 rcu_assign_pointer(dest->svc, svc);
795 }
796
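/* Release the stats of a service and free it */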
797 static void ip_vs_service_free(struct ip_vs_service *svc)
798 {
799 ip_vs_stats_release(&svc->stats);
800 kfree(svc);
801 }
802
803 static void ip_vs_service_rcu_free(struct rcu_head *head)
804 {
805 struct ip_vs_service *svc;
806
807 svc = container_of(head, struct ip_vs_service, rcu_head);
808 ip_vs_service_free(svc);
809 }
810
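/* Drop a reference to the service; free it via RCU when the last one goes */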
811 static void __ip_vs_svc_put(struct ip_vs_service *svc)
812 {
813 if (atomic_dec_and_test(&svc->refcnt)) {
814 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
815 svc->fwmark,
816 IP_VS_DBG_ADDR(svc->af, &svc->addr),
817 ntohs(svc->port));
818 call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
819 }
820 }
821
822
823 /*
824 * Returns hash value for real service
825 */
826 static inline unsigned int ip_vs_rs_hashkey(int af,
827 const union nf_inet_addr *addr,
828 __be16 port)
829 {
830 unsigned int porth = ntohs(port);
831 __be32 addr_fold = addr->ip;
832
833 #ifdef CONFIG_IP_VS_IPV6
834 if (af == AF_INET6)
835 addr_fold = addr->ip6[0]^addr->ip6[1]^
836 addr->ip6[2]^addr->ip6[3];
837 #endif
838
839 return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth)
840 & IP_VS_RTAB_MASK;
841 }
842
843 /* Hash ip_vs_dest in rs_table by <proto,addr,port>. */
844 static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
845 {
846 unsigned int hash;
847 __be16 port;
848
849 if (dest->in_rs_table)
850 return;
851
852 switch (IP_VS_DFWD_METHOD(dest)) {
853 case IP_VS_CONN_F_MASQ:
854 port = dest->port;
855 break;
856 case IP_VS_CONN_F_TUNNEL:
857 switch (dest->tun_type) {
858 case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
859 port = dest->tun_port;
860 break;
861 case IP_VS_CONN_F_TUNNEL_TYPE_IPIP:
862 case IP_VS_CONN_F_TUNNEL_TYPE_GRE:
863 port = 0;
864 break;
865 default:
866 return;
867 }
868 break;
869 default:
870 return;
871 }
872
873 /*
874 * Hash by proto,addr,port,
875 * which are the parameters of the real service.
876 */
877 hash = ip_vs_rs_hashkey(dest->af, &dest->addr, port);
878
879 hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]);
880 dest->in_rs_table = 1;
881 }
882
883 /* Unhash ip_vs_dest from rs_table. */
884 static void ip_vs_rs_unhash(struct ip_vs_dest *dest)
885 {
886 /*
887 * Remove it from the rs_table table.
888 */
889 if (dest->in_rs_table) {
890 hlist_del_rcu(&dest->d_list);
891 dest->in_rs_table = 0;
892 }
893 }
894
895 /* Check if real service by <proto,addr,port> is present */
896 bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol,
897 const union nf_inet_addr *daddr, __be16 dport)
898 {
899 unsigned int hash;
900 struct ip_vs_dest *dest;
901
902 /* Check for "full" addressed entries */
903 hash = ip_vs_rs_hashkey(af, daddr, dport);
904
905 hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
906 if (dest->port == dport &&
907 dest->af == af &&
908 ip_vs_addr_equal(af, &dest->addr, daddr) &&
909 (dest->protocol == protocol || dest->vfwmark) &&
910 IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) {
911 /* HIT */
912 return true;
913 }
914 }
915
916 return false;
917 }
918
919 /* Find real service record by <proto,addr,port>.
920 * In case of multiple records with the same <proto,addr,port>, only
921 * the first found record is returned.
922 *
923 * To be called under RCU lock.
924 */
925 struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af,
926 __u16 protocol,
927 const union nf_inet_addr *daddr,
928 __be16 dport)
929 {
930 unsigned int hash;
931 struct ip_vs_dest *dest;
932
933 /* Check for "full" addressed entries */
934 hash = ip_vs_rs_hashkey(af, daddr, dport);
935
936 hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
937 if (dest->port == dport &&
938 dest->af == af &&
939 ip_vs_addr_equal(af, &dest->addr, daddr) &&
940 (dest->protocol == protocol || dest->vfwmark) &&
941 IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) {
942 /* HIT */
943 return dest;
944 }
945 }
946
947 return NULL;
948 }
949
950 /* Find real service record by <af,addr,tun_port>.
951 * In case of multiple records with the same <af,addr,tun_port>, only
952 * the first found record is returned.
953 *
954 * To be called under RCU lock.
955 */
956 struct ip_vs_dest *ip_vs_find_tunnel(struct netns_ipvs *ipvs, int af,
957 const union nf_inet_addr *daddr,
958 __be16 tun_port)
959 {
960 struct ip_vs_dest *dest;
961 unsigned int hash;
962
963 /* Check for "full" addressed entries */
964 hash = ip_vs_rs_hashkey(af, daddr, tun_port);
965
966 hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
967 if (dest->tun_port == tun_port &&
968 dest->af == af &&
969 ip_vs_addr_equal(af, &dest->addr, daddr) &&
970 IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_TUNNEL) {
971 /* HIT */
972 return dest;
973 }
974 }
975
976 return NULL;
977 }
978
979 /* Lookup destination by {addr,port} in the given service
980 * Called under RCU lock.
981 */
982 static struct ip_vs_dest *
983 ip_vs_lookup_dest(struct ip_vs_service *svc, int dest_af,
984 const union nf_inet_addr *daddr, __be16 dport)
985 {
986 struct ip_vs_dest *dest;
987
988 /*
989 * Find the destination for the given service
990 */
991 list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
992 if ((dest->af == dest_af) &&
993 ip_vs_addr_equal(dest_af, &dest->addr, daddr) &&
994 (dest->port == dport)) {
995 /* HIT */
996 return dest;
997 }
998 }
999
1000 return NULL;
1001 }
1002
1003 /*
1004 * Find destination by {daddr,dport,vaddr,protocol}
1005 * Created to be used in ip_vs_process_message() in
1006 * the backup synchronization daemon. It finds the
1007 * destination to be bound to the received connection
1008 * on the backup.
1009 * Called under RCU lock, no refcnt is returned.
1010 */
1011 struct ip_vs_dest *ip_vs_find_dest(struct netns_ipvs *ipvs, int svc_af, int dest_af,
1012 const union nf_inet_addr *daddr,
1013 __be16 dport,
1014 const union nf_inet_addr *vaddr,
1015 __be16 vport, __u16 protocol, __u32 fwmark,
1016 __u32 flags)
1017 {
1018 struct ip_vs_dest *dest;
1019 struct ip_vs_service *svc;
1020 __be16 port = dport;
1021
1022 svc = ip_vs_service_find(ipvs, svc_af, fwmark, protocol, vaddr, vport);
1023 if (!svc)
1024 return NULL;
1025 if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ)
1026 port = 0;
1027 dest = ip_vs_lookup_dest(svc, dest_af, daddr, port);
1028 if (!dest)
1029 dest = ip_vs_lookup_dest(svc, dest_af, daddr, port ^ dport);
1030 return dest;
1031 }
1032
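/* RCU callback: release the cached dst entry and free the dest_dst holder */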
1033 void ip_vs_dest_dst_rcu_free(struct rcu_head *head)
1034 {
1035 struct ip_vs_dest_dst *dest_dst = container_of(head,
1036 struct ip_vs_dest_dst,
1037 rcu_head);
1038
1039 dst_release(dest_dst->dst_cache);
1040 kfree(dest_dst);
1041 }
1042
1043 /* Release dest_dst and dst_cache for dest in user context */
1044 static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest)
1045 {
1046 struct ip_vs_dest_dst *old;
1047
1048 old = rcu_dereference_protected(dest->dest_dst, 1);
1049 if (old) {
1050 RCU_INIT_POINTER(dest->dest_dst, NULL);
1051 call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
1052 }
1053 }
1054
1055 /*
1056 * Lookup dest by {svc,addr,port} in the destination trash.
1057 * The destination trash is used to hold the destinations that are removed
1058 * from the service table but are still referenced by some conn entries.
1059 * The trash exists because, when a dest is temporarily taken down
1060 * (either by the administrator or by a monitor program), it can be
1061 * picked back from the trash, the remaining connections to the dest can
1062 * continue, and the counting information of the dest is also useful for
1063 * scheduling.
1064 */
1065 static struct ip_vs_dest *
1066 ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af,
1067 const union nf_inet_addr *daddr, __be16 dport)
1068 {
1069 struct ip_vs_dest *dest;
1070 struct netns_ipvs *ipvs = svc->ipvs;
1071
1072 /*
1073 * Find the destination in trash
1074 */
1075 spin_lock_bh(&ipvs->dest_trash_lock);
1076 list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
1077 IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
1078 "dest->refcnt=%d\n",
1079 dest->vfwmark,
1080 IP_VS_DBG_ADDR(dest->af, &dest->addr),
1081 ntohs(dest->port),
1082 refcount_read(&dest->refcnt));
1083 if (dest->af == dest_af &&
1084 ip_vs_addr_equal(dest_af, &dest->addr, daddr) &&
1085 dest->port == dport &&
1086 dest->vfwmark == svc->fwmark &&
1087 dest->protocol == svc->protocol &&
1088 (svc->fwmark ||
1089 (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
1090 dest->vport == svc->port))) {
1091 /* HIT */
1092 list_del(&dest->t_list);
1093 goto out;
1094 }
1095 }
1096
1097 dest = NULL;
1098
1099 out:
1100 spin_unlock_bh(&ipvs->dest_trash_lock);
1101
1102 return dest;
1103 }
1104
1105 static void ip_vs_dest_rcu_free(struct rcu_head *head)
1106 {
1107 struct ip_vs_dest *dest;
1108
1109 dest = container_of(head, struct ip_vs_dest, rcu_head);
1110 ip_vs_stats_release(&dest->stats);
1111 ip_vs_dest_put_and_free(dest);
1112 }
1113
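/* Drop the svc reference held by the dest and free the dest via RCU */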
1114 static void ip_vs_dest_free(struct ip_vs_dest *dest)
1115 {
1116 struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1);
1117
1118 __ip_vs_svc_put(svc);
1119 call_rcu(&dest->rcu_head, ip_vs_dest_rcu_free);
1120 }
1121
1122 /*
1123 * Clean up all the destinations in the trash
1124 * Called by ip_vs_control_cleanup()
1125 *
1126 * When ip_vs_control_cleanup() is activated by ipvs module exit,
1127 * the service tables must have been flushed and all the connections
1128 * are expired, and the refcnt of each destination in the trash must
1129 * be 1, so we simply release them here.
1130 */
1131 static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs)
1132 {
1133 struct ip_vs_dest *dest, *nxt;
1134
1135 timer_delete_sync(&ipvs->dest_trash_timer);
1136 /* No need to use dest_trash_lock */
1137 list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) {
1138 list_del(&dest->t_list);
1139 ip_vs_dest_free(dest);
1140 }
1141 }
1142
1143 static void ip_vs_stats_rcu_free(struct rcu_head *head)
1144 {
1145 struct ip_vs_stats_rcu *rs = container_of(head,
1146 struct ip_vs_stats_rcu,
1147 rcu_head);
1148
1149 ip_vs_stats_release(&rs->s);
1150 kfree(rs);
1151 }
1152
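/* Snapshot the counters relative to the last zeroing and read current rates */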
1153 static void
1154 ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src)
1155 {
1156 #define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c
1157
1158 spin_lock(&src->lock);
1159
1160 IP_VS_SHOW_STATS_COUNTER(conns);
1161 IP_VS_SHOW_STATS_COUNTER(inpkts);
1162 IP_VS_SHOW_STATS_COUNTER(outpkts);
1163 IP_VS_SHOW_STATS_COUNTER(inbytes);
1164 IP_VS_SHOW_STATS_COUNTER(outbytes);
1165
1166 ip_vs_read_estimator(dst, src);
1167
1168 spin_unlock(&src->lock);
1169 }
1170
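/* Convert 64-bit kernel stats to the user-space format (rates as u32) */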
1171 static void
1172 ip_vs_export_stats_user(struct ip_vs_stats_user *dst, struct ip_vs_kstats *src)
1173 {
1174 dst->conns = (u32)src->conns;
1175 dst->inpkts = (u32)src->inpkts;
1176 dst->outpkts = (u32)src->outpkts;
1177 dst->inbytes = src->inbytes;
1178 dst->outbytes = src->outbytes;
1179 dst->cps = (u32)src->cps;
1180 dst->inpps = (u32)src->inpps;
1181 dst->outpps = (u32)src->outpps;
1182 dst->inbps = (u32)src->inbps;
1183 dst->outbps = (u32)src->outbps;
1184 }
1185
1186 static void
1187 ip_vs_zero_stats(struct ip_vs_stats *stats)
1188 {
1189 spin_lock(&stats->lock);
1190
1191 /* get current counters as zero point, rates are zeroed */
1192
1193 #define IP_VS_ZERO_STATS_COUNTER(c) stats->kstats0.c = stats->kstats.c
1194
1195 IP_VS_ZERO_STATS_COUNTER(conns);
1196 IP_VS_ZERO_STATS_COUNTER(inpkts);
1197 IP_VS_ZERO_STATS_COUNTER(outpkts);
1198 IP_VS_ZERO_STATS_COUNTER(inbytes);
1199 IP_VS_ZERO_STATS_COUNTER(outbytes);
1200
1201 ip_vs_zero_estimator(stats);
1202
1203 spin_unlock(&stats->lock);
1204 }
1205
1206 /* Allocate fields after kzalloc */
1207 int ip_vs_stats_init_alloc(struct ip_vs_stats *s)
1208 {
1209 int i;
1210
1211 spin_lock_init(&s->lock);
1212 s->cpustats = alloc_percpu(struct ip_vs_cpu_stats);
1213 if (!s->cpustats)
1214 return -ENOMEM;
1215
1216 for_each_possible_cpu(i) {
1217 struct ip_vs_cpu_stats *cs = per_cpu_ptr(s->cpustats, i);
1218
1219 u64_stats_init(&cs->syncp);
1220 }
1221 return 0;
1222 }
1223
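/* Allocate and initialize a stats structure, NULL on allocation failure */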
1224 struct ip_vs_stats *ip_vs_stats_alloc(void)
1225 {
1226 struct ip_vs_stats *s = kzalloc_obj(*s);
1227
1228 if (s && ip_vs_stats_init_alloc(s) >= 0)
1229 return s;
1230 kfree(s);
1231 return NULL;
1232 }
1233
1234 void ip_vs_stats_release(struct ip_vs_stats *stats)
1235 {
1236 free_percpu(stats->cpustats);
1237 }
1238
1239 void ip_vs_stats_free(struct ip_vs_stats *stats)
1240 {
1241 if (stats) {
1242 ip_vs_stats_release(stats);
1243 kfree(stats);
1244 }
1245 }
1246
1247 /*
1248 * Update a destination in the given service
1249 */
1250 static void
1251 __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
1252 struct ip_vs_dest_user_kern *udest, int add)
1253 {
1254 struct netns_ipvs *ipvs = svc->ipvs;
1255 struct ip_vs_service *old_svc;
1256 struct ip_vs_scheduler *sched;
1257 int conn_flags;
1258
1259 /* We cannot modify an address and change the address family */
1260 BUG_ON(!add && udest->af != dest->af);
1261
1262 if (add && udest->af != svc->af)
1263 ipvs->mixed_address_family_dests++;
1264
1265 /* keep the last_weight with latest non-0 weight */
1266 if (add || udest->weight != 0)
1267 atomic_set(&dest->last_weight, udest->weight);
1268
1269 /* set the weight and the flags */
1270 atomic_set(&dest->weight, udest->weight);
1271 conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
1272 conn_flags |= IP_VS_CONN_F_INACTIVE;
1273
1274 /* Need to rehash? */
1275 if ((udest->conn_flags & IP_VS_CONN_F_FWD_MASK) !=
1276 IP_VS_DFWD_METHOD(dest) ||
1277 udest->tun_type != dest->tun_type ||
1278 udest->tun_port != dest->tun_port)
1279 ip_vs_rs_unhash(dest);
1280
1281 /* set the tunnel info */
1282 dest->tun_type = udest->tun_type;
1283 dest->tun_port = udest->tun_port;
1284 dest->tun_flags = udest->tun_flags;
1285
1286 /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
1287 if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
1288 conn_flags |= IP_VS_CONN_F_NOOUTPUT;
1289 } else {
1290 /* FTP-NAT requires conntrack for mangling */
1291 if (svc->port == FTPPORT)
1292 ip_vs_register_conntrack(svc);
1293 }
1294 atomic_set(&dest->conn_flags, conn_flags);
1295 /* Put the real service in rs_table if not present. */
1296 ip_vs_rs_hash(ipvs, dest);
1297
1298 /* bind the service */
1299 old_svc = rcu_dereference_protected(dest->svc, 1);
1300 if (!old_svc) {
1301 __ip_vs_bind_svc(dest, svc);
1302 } else {
1303 if (old_svc != svc) {
1304 ip_vs_zero_stats(&dest->stats);
1305 __ip_vs_bind_svc(dest, svc);
1306 __ip_vs_svc_put(old_svc);
1307 }
1308 }
1309
1310 /* set the dest status flags */
1311 dest->flags |= IP_VS_DEST_F_AVAILABLE;
1312
1313 if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
1314 dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
1315 dest->u_threshold = udest->u_threshold;
1316 dest->l_threshold = udest->l_threshold;
1317
1318 dest->af = udest->af;
1319
1320 if (add) {
1321 list_add_rcu(&dest->n_list, &svc->destinations);
1322 svc->num_dests++;
1323 sched = rcu_dereference_protected(svc->scheduler, 1);
1324 if (sched && sched->add_dest)
1325 sched->add_dest(svc, dest);
1326 } else {
1327 spin_lock_bh(&dest->dst_lock);
1328 __ip_vs_dst_cache_reset(dest);
1329 spin_unlock_bh(&dest->dst_lock);
1330
1331 sched = rcu_dereference_protected(svc->scheduler, 1);
1332 if (sched && sched->upd_dest)
1333 sched->upd_dest(svc, dest);
1334 }
1335 }
1336
1337
1338 /*
1339 * Create a destination for the given service
1340 */
1341 static int
1342 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1343 {
1344 struct ip_vs_dest *dest;
1345 unsigned int atype;
1346 int ret;
1347
1348 #ifdef CONFIG_IP_VS_IPV6
1349 if (udest->af == AF_INET6) {
1350 atype = ipv6_addr_type(&udest->addr.in6);
1351 if ((!(atype & IPV6_ADDR_UNICAST) ||
1352 atype & IPV6_ADDR_LINKLOCAL) &&
1353 !__ip_vs_addr_is_local_v6(svc->ipvs->net, &udest->addr.in6))
1354 return -EINVAL;
1355
1356 ret = nf_defrag_ipv6_enable(svc->ipvs->net);
1357 if (ret)
1358 return ret;
1359 } else
1360 #endif
1361 {
1362 atype = inet_addr_type(svc->ipvs->net, udest->addr.ip);
1363 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
1364 return -EINVAL;
1365 }
1366
1367 dest = kzalloc_obj(struct ip_vs_dest);
1368 if (dest == NULL)
1369 return -ENOMEM;
1370
1371 ret = ip_vs_stats_init_alloc(&dest->stats);
1372 if (ret < 0)
1373 goto err_alloc;
1374
1375 ret = ip_vs_start_estimator(svc->ipvs, &dest->stats);
1376 if (ret < 0)
1377 goto err_stats;
1378
1379 dest->af = udest->af;
1380 dest->protocol = svc->protocol;
1381 dest->vaddr = svc->addr;
1382 dest->vport = svc->port;
1383 dest->vfwmark = svc->fwmark;
1384 ip_vs_addr_copy(udest->af, &dest->addr, &udest->addr);
1385 dest->port = udest->port;
1386
1387 atomic_set(&dest->activeconns, 0);
1388 atomic_set(&dest->inactconns, 0);
1389 atomic_set(&dest->persistconns, 0);
1390 refcount_set(&dest->refcnt, 1);
1391
1392 INIT_HLIST_NODE(&dest->d_list);
1393 spin_lock_init(&dest->dst_lock);
1394 __ip_vs_update_dest(svc, dest, udest, 1);
1395
1396 return 0;
1397
1398 err_stats:
1399 ip_vs_stats_release(&dest->stats);
1400
1401 err_alloc:
1402 kfree(dest);
1403 return ret;
1404 }
1405
1406
1407 /*
1408 * Add a destination into an existing service
1409 */
1410 static int
1411 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1412 {
1413 struct ip_vs_dest *dest;
1414 union nf_inet_addr daddr;
1415 __be16 dport = udest->port;
1416 int ret;
1417
1418 if (udest->weight < 0) {
1419 pr_err("%s(): server weight less than zero\n", __func__);
1420 return -ERANGE;
1421 }
1422
1423 if (udest->l_threshold > udest->u_threshold) {
1424 pr_err("%s(): lower threshold is higher than upper threshold\n",
1425 __func__);
1426 return -ERANGE;
1427 }
1428
1429 if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
1430 if (udest->tun_port == 0) {
1431 pr_err("%s(): tunnel port is zero\n", __func__);
1432 return -EINVAL;
1433 }
1434 }
1435
1436 ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
1437
1438 /* We use function that requires RCU lock */
1439 rcu_read_lock();
1440 dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport);
1441 rcu_read_unlock();
1442
1443 if (dest != NULL) {
1444 IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
1445 return -EEXIST;
1446 }
1447
1448 /*
1449 * Check if the dest already exists in the trash and
1450 * is from the same service
1451 */
1452 dest = ip_vs_trash_get_dest(svc, udest->af, &daddr, dport);
1453
1454 if (dest != NULL) {
1455 IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
1456 "dest->refcnt=%d, service %u/%s:%u\n",
1457 IP_VS_DBG_ADDR(udest->af, &daddr), ntohs(dport),
1458 refcount_read(&dest->refcnt),
1459 dest->vfwmark,
1460 IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
1461 ntohs(dest->vport));
1462
1463 ret = ip_vs_start_estimator(svc->ipvs, &dest->stats);
1464 if (ret < 0)
1465 return ret;
1466 __ip_vs_update_dest(svc, dest, udest, 1);
1467 } else {
1468 /*
1469 * Allocate and initialize the dest structure
1470 */
1471 ret = ip_vs_new_dest(svc, udest);
1472 }
1473
1474 return ret;
1475 }
1476
1477
1478 /*
1479 * Edit a destination in the given service
1480 */
1481 static int
1482 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1483 {
1484 struct ip_vs_dest *dest;
1485 union nf_inet_addr daddr;
1486 __be16 dport = udest->port;
1487
1488 if (udest->weight < 0) {
1489 pr_err("%s(): server weight less than zero\n", __func__);
1490 return -ERANGE;
1491 }
1492
1493 if (udest->l_threshold > udest->u_threshold) {
1494 pr_err("%s(): lower threshold is higher than upper threshold\n",
1495 __func__);
1496 return -ERANGE;
1497 }
1498
1499 if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
1500 if (udest->tun_port == 0) {
1501 pr_err("%s(): tunnel port is zero\n", __func__);
1502 return -EINVAL;
1503 }
1504 }
1505
1506 ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
1507
1508 /* We use function that requires RCU lock */
1509 rcu_read_lock();
1510 dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport);
1511 rcu_read_unlock();
1512
1513 if (dest == NULL) {
1514 IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__);
1515 return -ENOENT;
1516 }
1517
1518 __ip_vs_update_dest(svc, dest, udest, 0);
1519
1520 return 0;
1521 }
1522
1523 /*
1524 * Delete a destination (must be already unlinked from the service)
1525 */
1526 static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest,
1527 bool cleanup)
1528 {
1529 ip_vs_stop_estimator(ipvs, &dest->stats);
1530
1531 /*
1532 * Remove it from the d-linked list with the real services.
1533 */
1534 ip_vs_rs_unhash(dest);
1535
1536 spin_lock_bh(&ipvs->dest_trash_lock);
1537 IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n",
1538 IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
1539 refcount_read(&dest->refcnt));
1540 if (list_empty(&ipvs->dest_trash) && !cleanup)
1541 mod_timer(&ipvs->dest_trash_timer,
1542 jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
1543 /* dest lives in trash with reference */
1544 list_add(&dest->t_list, &ipvs->dest_trash);
1545 dest->idle_start = 0;
1546 spin_unlock_bh(&ipvs->dest_trash_lock);
1547
1548 /* Queue up delayed work to expire all no destination connections.
1549 * No-op when CONFIG_SYSCTL is disabled.
1550 */
1551 if (!cleanup)
1552 ip_vs_enqueue_expire_nodest_conns(ipvs);
1553 }
1554
1555
1556 /*
1557 * Unlink a destination from the given service
1558 */
1559 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
1560 struct ip_vs_dest *dest,
1561 int svcupd)
1562 {
1563 dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
1564
1565 spin_lock_bh(&dest->dst_lock);
1566 __ip_vs_dst_cache_reset(dest);
1567 spin_unlock_bh(&dest->dst_lock);
1568
1569 /*
1570 * Remove it from the d-linked destination list.
1571 */
1572 list_del_rcu(&dest->n_list);
1573 svc->num_dests--;
1574
1575 if (dest->af != svc->af)
1576 svc->ipvs->mixed_address_family_dests--;
1577
1578 if (svcupd) {
1579 struct ip_vs_scheduler *sched;
1580
1581 sched = rcu_dereference_protected(svc->scheduler, 1);
1582 if (sched && sched->del_dest)
1583 sched->del_dest(svc, dest);
1584 }
1585 }
1586
1587
1588 /*
1589 * Delete a destination server in the given service
1590 */
1591 static int
1592 ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1593 {
1594 struct ip_vs_dest *dest;
1595 __be16 dport = udest->port;
1596
1597 /* We use function that requires RCU lock */
1598 rcu_read_lock();
1599 dest = ip_vs_lookup_dest(svc, udest->af, &udest->addr, dport);
1600 rcu_read_unlock();
1601
1602 if (dest == NULL) {
1603 IP_VS_DBG(1, "%s(): destination not found!\n", __func__);
1604 return -ENOENT;
1605 }
1606
1607 /*
1608 * Unlink dest from the service
1609 */
1610 __ip_vs_unlink_dest(svc, dest, 1);
1611
1612 /*
1613 * Delete the destination
1614 */
1615 __ip_vs_del_dest(svc->ipvs, dest, false);
1616
1617 return 0;
1618 }
1619
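/* Timer handler: free trash destinations that stayed idle for one trash
 * period and are no longer referenced; re-arm while the trash is not empty.
 */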
1620 static void ip_vs_dest_trash_expire(struct timer_list *t)
1621 {
1622 struct netns_ipvs *ipvs = timer_container_of(ipvs, t,
1623 dest_trash_timer);
1624 struct ip_vs_dest *dest, *next;
1625 unsigned long now = jiffies;
1626
1627 spin_lock(&ipvs->dest_trash_lock);
1628 list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) {
1629 if (refcount_read(&dest->refcnt) > 1)
1630 continue;
1631 if (dest->idle_start) {
1632 if (time_before(now, dest->idle_start +
1633 IP_VS_DEST_TRASH_PERIOD))
1634 continue;
1635 } else {
1636 dest->idle_start = max(1UL, now);
1637 continue;
1638 }
1639 IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n",
1640 dest->vfwmark,
1641 IP_VS_DBG_ADDR(dest->af, &dest->addr),
1642 ntohs(dest->port));
1643 list_del(&dest->t_list);
1644 ip_vs_dest_free(dest);
1645 }
1646 if (!list_empty(&ipvs->dest_trash))
1647 mod_timer(&ipvs->dest_trash_timer,
1648 jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
1649 spin_unlock(&ipvs->dest_trash_lock);
1650 }
1651
1652 /*
1653 * Add a service into the service hash table
1654 */
1655 static int
1656 ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
1657 struct ip_vs_service **svc_p)
1658 {
1659 struct ip_vs_scheduler *sched = NULL;
1660 struct ip_vs_rht *tc_new = NULL;
1661 struct ip_vs_rht *t, *t_new = NULL;
1662 int af_id = ip_vs_af_index(u->af);
1663 struct ip_vs_service *svc = NULL;
1664 struct ip_vs_pe *pe = NULL;
1665 int ret_hooks = -1;
1666 int ret = 0;
1667
1668 /* increase the module use count */
1669 if (!ip_vs_use_count_inc())
1670 return -ENOPROTOOPT;
1671
1672 /* Lookup the scheduler by 'u->sched_name' */
1673 if (strcmp(u->sched_name, "none")) {
1674 sched = ip_vs_scheduler_get(u->sched_name);
1675 if (!sched) {
1676 pr_info("Scheduler module ip_vs_%s not found\n",
1677 u->sched_name);
1678 ret = -ENOENT;
1679 goto out_err;
1680 }
1681 }
1682
1683 if (u->pe_name && *u->pe_name) {
1684 pe = ip_vs_pe_getbyname(u->pe_name);
1685 if (pe == NULL) {
1686 pr_info("persistence engine module ip_vs_pe_%s "
1687 "not found\n", u->pe_name);
1688 ret = -ENOENT;
1689 goto out_err;
1690 }
1691 }
1692
1693 #ifdef CONFIG_IP_VS_IPV6
1694 if (u->af == AF_INET6) {
1695 __u32 plen = (__force __u32) u->netmask;
1696
1697 if (plen < 1 || plen > 128) {
1698 ret = -EINVAL;
1699 goto out_err;
1700 }
1701
1702 ret = nf_defrag_ipv6_enable(ipvs->net);
1703 if (ret)
1704 goto out_err;
1705 }
1706 #endif
1707
1708 t = rcu_dereference_protected(ipvs->svc_table, 1);
1709 if (!t) {
1710 int lfactor = sysctl_svc_lfactor(ipvs);
1711 int new_size = ip_vs_svc_desired_size(ipvs, NULL, lfactor);
1712
1713 t_new = ip_vs_svc_table_alloc(ipvs, new_size, lfactor);
1714 if (!t_new) {
1715 ret = -ENOMEM;
1716 goto out_err;
1717 }
1718 }
1719
1720 if (!rcu_dereference_protected(ipvs->conn_tab, 1)) {
1721 int lfactor = sysctl_conn_lfactor(ipvs);
1722 int new_size = ip_vs_conn_desired_size(ipvs, NULL, lfactor);
1723
1724 tc_new = ip_vs_conn_tab_alloc(ipvs, new_size, lfactor);
1725 if (!tc_new) {
1726 ret = -ENOMEM;
1727 goto out_err;
1728 }
1729 }
1730
1731 if (!atomic_read(&ipvs->num_services[af_id])) {
1732 ret = ip_vs_register_hooks(ipvs, u->af);
1733 if (ret < 0)
1734 goto out_err;
1735 ret_hooks = ret;
1736 }
1737
1738 svc = kzalloc_obj(struct ip_vs_service);
1739 if (svc == NULL) {
1740 IP_VS_DBG(1, "%s(): no memory\n", __func__);
1741 ret = -ENOMEM;
1742 goto out_err;
1743 }
1744 ret = ip_vs_stats_init_alloc(&svc->stats);
1745 if (ret < 0)
1746 goto out_err;
1747
1748 /* I'm the first user of the service */
1749 atomic_set(&svc->refcnt, 0);
1750
1751 svc->af = u->af;
1752 svc->protocol = u->protocol;
1753 ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
1754 svc->port = u->port;
1755 svc->fwmark = u->fwmark;
1756 svc->flags = u->flags & ~IP_VS_SVC_F_HASHED;
1757 svc->timeout = u->timeout * HZ;
1758 svc->netmask = u->netmask;
1759 svc->ipvs = ipvs;
1760
1761 INIT_LIST_HEAD(&svc->destinations);
1762 spin_lock_init(&svc->sched_lock);
1763
1764 /* Bind the scheduler */
1765 if (sched) {
1766 ret = ip_vs_bind_scheduler(svc, sched);
1767 if (ret)
1768 goto out_err;
1769 }
1770
1771 ret = ip_vs_start_estimator(ipvs, &svc->stats);
1772 if (ret < 0)
1773 goto out_err;
1774
1775 if (t_new) {
1776 clear_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags);
1777 rcu_assign_pointer(ipvs->svc_table, t_new);
1778 t_new = NULL;
1779 }
1780 if (tc_new) {
1781 rcu_assign_pointer(ipvs->conn_tab, tc_new);
1782 tc_new = NULL;
1783 }
1784
1785 /* Update the virtual service counters */
1786 if (svc->port == FTPPORT)
1787 atomic_inc(&ipvs->ftpsvc_counter[af_id]);
1788 else if (!svc->port && !svc->fwmark)
1789 atomic_inc(&ipvs->nullsvc_counter[af_id]);
1790 if (pe && pe->conn_out)
1791 atomic_inc(&ipvs->conn_out_counter[af_id]);
1792
1793 /* Bind the ct retriever */
1794 RCU_INIT_POINTER(svc->pe, pe);
1795 pe = NULL;
1796
1797 if (svc->fwmark)
1798 atomic_inc(&ipvs->fwm_services[af_id]);
1799 else
1800 atomic_inc(&ipvs->nonfwm_services[af_id]);
1801 atomic_inc(&ipvs->num_services[af_id]);
1802
1803 /* Hash the service into the service table */
1804 ip_vs_svc_hash(svc);
1805
1806 /* Schedule resize work */
1807 if (t && ip_vs_get_num_services(ipvs) > t->u_thresh &&
1808 !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags))
1809 queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work,
1810 1);
1811
1812 *svc_p = svc;
1813
1814 if (!READ_ONCE(ipvs->enable)) {
1815 /* Now there is a service - full throttle */
1816 WRITE_ONCE(ipvs->enable, 1);
1817
1818 /* Start estimation for first time */
1819 ip_vs_est_reload_start(ipvs);
1820 }
1821
1822 return 0;
1823
1824
1825 out_err:
1826 if (tc_new)
1827 ip_vs_rht_free(tc_new);
1828 if (t_new)
1829 ip_vs_rht_free(t_new);
1830 if (ret_hooks >= 0)
1831 ip_vs_unregister_hooks(ipvs, u->af);
1832 if (svc != NULL) {
1833 ip_vs_unbind_scheduler(svc, sched);
1834 ip_vs_service_free(svc);
1835 }
1836 ip_vs_scheduler_put(sched);
1837 ip_vs_pe_put(pe);
1838
1839 /* decrease the module use count */
1840 ip_vs_use_count_dec();
1841
1842 return ret;
1843 }
1844
1845
1846 /*
1847 * Edit a service and bind it with a new scheduler
1848 */
1849 static int
1850 ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
1851 {
1852 struct ip_vs_scheduler *sched = NULL, *old_sched;
1853 struct ip_vs_pe *pe = NULL, *old_pe = NULL;
1854 int ret = 0;
1855 bool new_pe_conn_out, old_pe_conn_out;
1856 struct netns_ipvs *ipvs = svc->ipvs;
1857 int af_id = ip_vs_af_index(svc->af);
1858
1859 /*
1860 * Lookup the scheduler, by 'u->sched_name'
1861 */
1862 if (strcmp(u->sched_name, "none")) {
1863 sched = ip_vs_scheduler_get(u->sched_name);
1864 if (!sched) {
1865 pr_info("Scheduler module ip_vs_%s not found\n",
1866 u->sched_name);
1867 return -ENOENT;
1868 }
1869 }
1870 old_sched = sched;
1871
1872 if (u->pe_name && *u->pe_name) {
1873 pe = ip_vs_pe_getbyname(u->pe_name);
1874 if (pe == NULL) {
1875 pr_info("persistence engine module ip_vs_pe_%s "
1876 "not found\n", u->pe_name);
1877 ret = -ENOENT;
1878 goto out;
1879 }
1880 old_pe = pe;
1881 }
1882
1883 #ifdef CONFIG_IP_VS_IPV6
1884 if (u->af == AF_INET6) {
1885 __u32 plen = (__force __u32) u->netmask;
1886
1887 if (plen < 1 || plen > 128) {
1888 ret = -EINVAL;
1889 goto out;
1890 }
1891 }
1892 #endif
1893
1894 old_sched = rcu_dereference_protected(svc->scheduler, 1);
1895 if (sched != old_sched) {
1896 if (old_sched) {
1897 ip_vs_unbind_scheduler(svc, old_sched);
1898 RCU_INIT_POINTER(svc->scheduler, NULL);
1899 /* Wait all svc->sched_data users */
1900 synchronize_rcu();
1901 }
1902 /* Bind the new scheduler */
1903 if (sched) {
1904 ret = ip_vs_bind_scheduler(svc, sched);
1905 if (ret) {
1906 ip_vs_scheduler_put(sched);
1907 goto out;
1908 }
1909 }
1910 }
1911
1912 /*
1913 * Set the flags and timeout value
1914 */
1915 svc->flags = u->flags | IP_VS_SVC_F_HASHED;
1916 svc->timeout = u->timeout * HZ;
1917 svc->netmask = u->netmask;
1918
1919 old_pe = rcu_dereference_protected(svc->pe, 1);
1920 if (pe != old_pe) {
1921 rcu_assign_pointer(svc->pe, pe);
1922 /* check for optional methods in new pe */
1923 new_pe_conn_out = (pe && pe->conn_out) ? true : false;
1924 old_pe_conn_out = (old_pe && old_pe->conn_out) ? true : false;
1925 if (new_pe_conn_out && !old_pe_conn_out)
1926 atomic_inc(&ipvs->conn_out_counter[af_id]);
1927 if (old_pe_conn_out && !new_pe_conn_out)
1928 atomic_dec(&ipvs->conn_out_counter[af_id]);
1929 }
1930
1931 out:
1932 ip_vs_scheduler_put(old_sched);
1933 ip_vs_pe_put(old_pe);
1934 return ret;
1935 }
1936
1937 /*
1938 * Delete a service from the service list
1939 * - The service must be unlinked, unlocked and not referenced!
1940 * - We are called under _bh lock
1941 */
1942 static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
1943 {
1944 struct ip_vs_dest *dest, *nxt;
1945 struct ip_vs_scheduler *old_sched;
1946 struct ip_vs_pe *old_pe;
1947 struct netns_ipvs *ipvs = svc->ipvs;
1948 int af_id = ip_vs_af_index(svc->af);
1949
1950 atomic_dec(&ipvs->num_services[af_id]);
1951 if (!atomic_read(&ipvs->num_services[af_id]))
1952 ip_vs_unregister_hooks(ipvs, svc->af);
1953 if (svc->fwmark)
1954 atomic_dec(&ipvs->fwm_services[af_id]);
1955 else
1956 atomic_dec(&ipvs->nonfwm_services[af_id]);
1957
1958 ip_vs_stop_estimator(svc->ipvs, &svc->stats);
1959
1960 /* Unbind scheduler */
1961 old_sched = rcu_dereference_protected(svc->scheduler, 1);
1962 ip_vs_unbind_scheduler(svc, old_sched);
1963 ip_vs_scheduler_put(old_sched);
1964
1965 /* Unbind persistence engine, keep svc->pe */
1966 old_pe = rcu_dereference_protected(svc->pe, 1);
1967 if (old_pe && old_pe->conn_out)
1968 atomic_dec(&ipvs->conn_out_counter[af_id]);
1969 ip_vs_pe_put(old_pe);
1970
1971 /*
1972 * Unlink the whole destination list
1973 */
1974 list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
1975 __ip_vs_unlink_dest(svc, dest, 0);
1976 __ip_vs_del_dest(svc->ipvs, dest, cleanup);
1977 }
1978
1979 /*
1980 * Update the virtual service counters
1981 */
1982 if (svc->port == FTPPORT)
1983 atomic_dec(&ipvs->ftpsvc_counter[af_id]);
1984 else if (!svc->port && !svc->fwmark)
1985 atomic_dec(&ipvs->nullsvc_counter[af_id]);
1986
1987 /*
1988 * Free the service if nobody refers to it
1989 */
1990 __ip_vs_svc_put(svc);
1991
1992 /* decrease the module use count */
1993 ip_vs_use_count_dec();
1994 }
1995
1996 /*
1997 * Unlink a service from list and try to delete it if its refcnt reached 0
1998 */
1999 static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup)
2000 {
2001 ip_vs_unregister_conntrack(svc);
2002 /* Hold svc to avoid double release from dest_trash */
2003 atomic_inc(&svc->refcnt);
2004 /*
2005 * Unhash it from the service table
2006 */
2007 ip_vs_svc_unhash(svc);
2008
2009 __ip_vs_del_service(svc, cleanup);
2010 }
2011
2012 /*
2013 * Delete a service from the service list
2014 */
2015 static int ip_vs_del_service(struct ip_vs_service *svc)
2016 {
2017 struct netns_ipvs *ipvs;
2018 struct ip_vs_rht *t, *p;
2019 int ns;
2020
2021 if (svc == NULL)
2022 return -EEXIST;
2023 ipvs = svc->ipvs;
2024 ip_vs_unlink_service(svc, false);
2025 t = rcu_dereference_protected(ipvs->svc_table, 1);
2026
2027 /* Drop the table if no more services */
2028 ns = ip_vs_get_num_services(ipvs);
2029 if (!ns) {
2030 /* Stop the resizer and drop the tables */
2031 set_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags);
2032 cancel_delayed_work_sync(&ipvs->svc_resize_work);
2033 if (t) {
2034 rcu_assign_pointer(ipvs->svc_table, NULL);
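/* Free the whole resize chain: follow the new_tbl links and release
 * each table after an RCU grace period, stopping once a table points
 * to itself.
 */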
2035 while (1) {
2036 p = rcu_dereference_protected(t->new_tbl, 1);
2037 call_rcu(&t->rcu_head, ip_vs_rht_rcu_free);
2038 if (p == t)
2039 break;
2040 t = p;
2041 }
2042 }
2043 } else if (ns <= t->l_thresh &&
2044 !test_and_set_bit(IP_VS_WORK_SVC_RESIZE,
2045 &ipvs->work_flags)) {
2046 queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work,
2047 1);
2048 }
2049 return 0;
2050 }
2051
2052
2053 /*
2054 * Flush all the virtual services
2055 */
2056 static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup)
2057 {
2058 DECLARE_IP_VS_RHT_WALK_BUCKETS();
2059 struct hlist_bl_head *head;
2060 struct ip_vs_service *svc;
2061 struct hlist_bl_node *ne;
2062 struct hlist_bl_node *e;
2063 struct ip_vs_rht *t, *p;
2064
2065 /* Stop the resizer and drop the tables */
2066 if (!test_and_set_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
2067 cancel_delayed_work_sync(&ipvs->svc_resize_work);
2068 /* No resizer, so now we have exclusive write access */
2069
2070 if (ip_vs_get_num_services(ipvs)) {
2071 ip_vs_rht_walk_buckets(ipvs->svc_table, head) {
2072 hlist_bl_for_each_entry_safe(svc, e, ne, head, s_list)
2073 ip_vs_unlink_service(svc, cleanup);
2074 }
2075 }
2076
2077 /* Unregister the hash table and release it after RCU grace period */
2078 t = rcu_dereference_protected(ipvs->svc_table, 1);
2079 if (t) {
2080 rcu_assign_pointer(ipvs->svc_table, NULL);
2081 while (1) {
2082 p = rcu_dereference_protected(t->new_tbl, 1);
2083 call_rcu(&t->rcu_head, ip_vs_rht_rcu_free);
2084 if (p == t)
2085 break;
2086 t = p;
2087 }
2088 }
2089 return 0;
2090 }
2091
2092 /*
2093 * Delete service by {netns} in the service table.
2094 * Called by __ip_vs_batch_cleanup()
2095 */
2096 void ip_vs_service_nets_cleanup(struct list_head *net_list)
2097 {
2098 struct netns_ipvs *ipvs;
2099 struct net *net;
2100
2101 /* Check for "full" addressed entries */
2102 list_for_each_entry(net, net_list, exit_list) {
2103 ipvs = net_ipvs(net);
2104 mutex_lock(&ipvs->service_mutex);
2105 ip_vs_flush(ipvs, true);
2106 mutex_unlock(&ipvs->service_mutex);
2107 }
2108 }
2109
2110 /* Put all references for device (dst_cache) */
2111 static inline void
2112 ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev)
2113 {
2114 struct ip_vs_dest_dst *dest_dst;
2115
2116 spin_lock_bh(&dest->dst_lock);
2117 dest_dst = rcu_dereference_protected(dest->dest_dst, 1);
2118 if (dest_dst && dest_dst->dst_cache->dev == dev) {
2119 IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n",
2120 dev->name,
2121 IP_VS_DBG_ADDR(dest->af, &dest->addr),
2122 ntohs(dest->port),
2123 refcount_read(&dest->refcnt));
2124 __ip_vs_dst_cache_reset(dest);
2125 }
2126 spin_unlock_bh(&dest->dst_lock);
2127
2128 }
2129 /* Netdev event receiver
2130 * Currently only NETDEV_DOWN is handled to release refs to cached dsts
2131 */
2132 static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
2133 void *ptr)
2134 {
2135 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
2136 struct net *net = dev_net(dev);
2137 struct netns_ipvs *ipvs = net_ipvs(net);
2138 DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU();
2139 unsigned int resched_score = 0;
2140 struct hlist_bl_head *head;
2141 struct ip_vs_service *svc;
2142 struct hlist_bl_node *e;
2143 struct ip_vs_dest *dest;
2144 int old_gen, new_gen;
2145
2146 if (event != NETDEV_DOWN || !ipvs)
2147 return NOTIFY_DONE;
2148 IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
2149
2150 old_gen = atomic_read(&ipvs->svc_table_changes);
2151
2152 rcu_read_lock();
2153
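/* Walk all destinations under RCU, periodically yielding the CPU.
 * If a resize installed a new svc_table while we slept, restart the
 * walk from the beginning.
 */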
2154 repeat:
2155 smp_rmb(); /* ipvs->svc_table and svc_table_changes */
2156 ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) {
2157 hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
2158 list_for_each_entry_rcu(dest, &svc->destinations,
2159 n_list) {
2160 ip_vs_forget_dev(dest, dev);
2161 resched_score += 10;
2162 }
2163 resched_score++;
2164 }
2165 resched_score++;
2166 if (resched_score >= 100) {
2167 resched_score = 0;
2168 cond_resched_rcu();
2169 new_gen = atomic_read(&ipvs->svc_table_changes);
2170 /* New table installed ? */
2171 if (old_gen != new_gen) {
2172 old_gen = new_gen;
2173 goto repeat;
2174 }
2175 }
2176 }
2177 rcu_read_unlock();
2178
2179 return NOTIFY_DONE;
2180 }
2181
2182 /*
2183 * Zero counters in a service or all services
2184 */
2185 static int ip_vs_zero_service(struct ip_vs_service *svc)
2186 {
2187 struct ip_vs_dest *dest;
2188
2189 list_for_each_entry(dest, &svc->destinations, n_list) {
2190 ip_vs_zero_stats(&dest->stats);
2191 }
2192 ip_vs_zero_stats(&svc->stats);
2193 return 0;
2194 }
2195
2196 static int ip_vs_zero_all(struct netns_ipvs *ipvs)
2197 {
2198 DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU();
2199 unsigned int resched_score = 0;
2200 struct hlist_bl_head *head;
2201 struct ip_vs_service *svc;
2202 struct hlist_bl_node *e;
2203
2204 rcu_read_lock();
2205
2206 ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) {
2207 hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
2208 ip_vs_zero_service(svc);
2209 resched_score += 10;
2210 }
2211 resched_score++;
2212 if (resched_score >= 100) {
2213 resched_score = 0;
2214 cond_resched_rcu();
2215 }
2216 }
2217
2218 rcu_read_unlock();
2219
2220 ip_vs_zero_stats(&ipvs->tot_stats->s);
2221 return 0;
2222 }
2223
2224 #ifdef CONFIG_SYSCTL
2225
2226 static int
2227 proc_do_defense_mode(const struct ctl_table *table, int write,
2228 void *buffer, size_t *lenp, loff_t *ppos)
2229 {
2230 struct netns_ipvs *ipvs = table->extra2;
2231 int *valp = table->data;
2232 int val = *valp;
2233 int rc;
2234
2235 struct ctl_table tmp = {
2236 .data = &val,
2237 .maxlen = sizeof(int),
2238 .mode = table->mode,
2239 };
2240
2241 rc = proc_dointvec(&tmp, write, buffer, lenp, ppos);
2242 if (write && (*valp != val)) {
2243 if (val < 0 || val > 3) {
2244 rc = -EINVAL;
2245 } else {
2246 *valp = val;
2247 update_defense_level(ipvs);
2248 }
2249 }
2250 return rc;
2251 }
2252
2253 static int
2254 proc_do_sync_threshold(const struct ctl_table *table, int write,
2255 void *buffer, size_t *lenp, loff_t *ppos)
2256 {
2257 struct netns_ipvs *ipvs = table->extra2;
2258 int *valp = table->data;
2259 int val[2];
2260 int rc;
2261 struct ctl_table tmp = {
2262 .data = &val,
2263 .maxlen = table->maxlen,
2264 .mode = table->mode,
2265 };
2266
2267 mutex_lock(&ipvs->sync_mutex);
2268 memcpy(val, valp, sizeof(val));
2269 rc = proc_dointvec(&tmp, write, buffer, lenp, ppos);
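/* sync_threshold must stay below sync_period unless the period is 0 */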
2270 if (write) {
2271 if (val[0] < 0 || val[1] < 0 ||
2272 (val[0] >= val[1] && val[1]))
2273 rc = -EINVAL;
2274 else
2275 memcpy(valp, val, sizeof(val));
2276 }
2277 mutex_unlock(&ipvs->sync_mutex);
2278 return rc;
2279 }
2280
2281 static int
2282 proc_do_sync_ports(const struct ctl_table *table, int write,
2283 void *buffer, size_t *lenp, loff_t *ppos)
2284 {
2285 int *valp = table->data;
2286 int val = *valp;
2287 int rc;
2288
2289 struct ctl_table tmp = {
2290 .data = &val,
2291 .maxlen = sizeof(int),
2292 .mode = table->mode,
2293 };
2294
2295 rc = proc_dointvec(&tmp, write, buffer, lenp, ppos);
2296 if (write && (*valp != val)) {
2297 if (val < 1 || !is_power_of_2(val))
2298 rc = -EINVAL;
2299 else
2300 *valp = val;
2301 }
2302 return rc;
2303 }
2304
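/* Writer for the est_cpulist sysctl (e.g. "echo 0-3 >
 * /proc/sys/net/ipv4/vs/est_cpulist"): parses a CPU list and
 * reconfigures the estimator kthreads.
 */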
2305 static int ipvs_proc_est_cpumask_set(const struct ctl_table *table,
2306 void *buffer)
2307 {
2308 struct netns_ipvs *ipvs = table->extra2;
2309 cpumask_var_t *valp = table->data;
2310 cpumask_var_t newmask;
2311 int ret;
2312
2313 if (!zalloc_cpumask_var(&newmask, GFP_KERNEL))
2314 return -ENOMEM;
2315
2316 ret = cpulist_parse(buffer, newmask);
2317 if (ret)
2318 goto out;
2319
2320 mutex_lock(&ipvs->est_mutex);
2321
2322 if (!ipvs->est_cpulist_valid) {
2323 if (!zalloc_cpumask_var(valp, GFP_KERNEL)) {
2324 ret = -ENOMEM;
2325 goto unlock;
2326 }
2327 ipvs->est_cpulist_valid = 1;
2328 }
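/* Restrict the requested mask to CPUs the writing task may run on */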
2329 cpumask_and(newmask, newmask, &current->cpus_mask);
2330 cpumask_copy(*valp, newmask);
2331 /* est_max_threads may depend on cpulist size */
2332 ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
2333 ipvs->est_calc_phase = 1;
2334 ip_vs_est_reload_start(ipvs);
2335
2336 unlock:
2337 mutex_unlock(&ipvs->est_mutex);
2338
2339 out:
2340 free_cpumask_var(newmask);
2341 return ret;
2342 }
2343
2344 static int ipvs_proc_est_cpumask_get(const struct ctl_table *table,
2345 void *buffer, size_t size)
2346 {
2347 struct netns_ipvs *ipvs = table->extra2;
2348 cpumask_var_t *valp = table->data;
2349 struct cpumask *mask;
2350 int ret;
2351
2352 mutex_lock(&ipvs->est_mutex);
2353
2354 if (ipvs->est_cpulist_valid)
2355 mask = *valp;
2356 else
2357 mask = (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD);
2358 ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask));
2359
2360 mutex_unlock(&ipvs->est_mutex);
2361
2362 return ret;
2363 }
2364
2365 static int ipvs_proc_est_cpulist(const struct ctl_table *table, int write,
2366 void *buffer, size_t *lenp, loff_t *ppos)
2367 {
2368 int ret;
2369
2370 /* Ignore both read and write(append) if *ppos not 0 */
2371 if (*ppos || !*lenp) {
2372 *lenp = 0;
2373 return 0;
2374 }
2375 if (write) {
2376 /* proc_sys_call_handler() appends terminator */
2377 ret = ipvs_proc_est_cpumask_set(table, buffer);
2378 if (ret >= 0)
2379 *ppos += *lenp;
2380 } else {
2381 /* proc_sys_call_handler() allocates 1 byte for terminator */
2382 ret = ipvs_proc_est_cpumask_get(table, buffer, *lenp + 1);
2383 if (ret >= 0) {
2384 *lenp = ret;
2385 *ppos += *lenp;
2386 ret = 0;
2387 }
2388 }
2389 return ret;
2390 }
2391
2392 static int ipvs_proc_est_nice(const struct ctl_table *table, int write,
2393 void *buffer, size_t *lenp, loff_t *ppos)
2394 {
2395 struct netns_ipvs *ipvs = table->extra2;
2396 int *valp = table->data;
2397 int val = *valp;
2398 int ret;
2399
2400 struct ctl_table tmp_table = {
2401 .data = &val,
2402 .maxlen = sizeof(int),
2403 .mode = table->mode,
2404 };
2405
2406 ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
2407 if (write && ret >= 0) {
2408 if (val < MIN_NICE || val > MAX_NICE) {
2409 ret = -EINVAL;
2410 } else {
2411 mutex_lock(&ipvs->est_mutex);
2412 if (*valp != val) {
2413 *valp = val;
2414 ip_vs_est_reload_start(ipvs);
2415 }
2416 mutex_unlock(&ipvs->est_mutex);
2417 }
2418 }
2419 return ret;
2420 }
2421
2422 static int ipvs_proc_run_estimation(const struct ctl_table *table, int write,
2423 void *buffer, size_t *lenp, loff_t *ppos)
2424 {
2425 struct netns_ipvs *ipvs = table->extra2;
2426 int *valp = table->data;
2427 int val = *valp;
2428 int ret;
2429
2430 struct ctl_table tmp_table = {
2431 .data = &val,
2432 .maxlen = sizeof(int),
2433 .mode = table->mode,
2434 };
2435
2436 ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
2437 if (write && ret >= 0) {
2438 mutex_lock(&ipvs->est_mutex);
2439 if (*valp != val) {
2440 *valp = val;
2441 ip_vs_est_reload_start(ipvs);
2442 }
2443 mutex_unlock(&ipvs->est_mutex);
2444 }
2445 return ret;
2446 }
2447
2448 static int ipvs_proc_conn_lfactor(const struct ctl_table *table, int write,
2449 void *buffer, size_t *lenp, loff_t *ppos)
2450 {
2451 struct netns_ipvs *ipvs = table->extra2;
2452 int *valp = table->data;
2453 int val = *valp;
2454 int ret;
2455
2456 struct ctl_table tmp_table = {
2457 .data = &val,
2458 .maxlen = sizeof(int),
2459 };
2460
2461 ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
2462 if (write && ret >= 0) {
2463 if (val < -8 || val > 8) {
2464 ret = -EINVAL;
2465 } else {
2466 *valp = val;
2467 if (rcu_access_pointer(ipvs->conn_tab))
2468 mod_delayed_work(system_unbound_wq,
2469 &ipvs->conn_resize_work, 0);
2470 }
2471 }
2472 return ret;
2473 }
2474
2475 static int ipvs_proc_svc_lfactor(const struct ctl_table *table, int write,
2476 void *buffer, size_t *lenp, loff_t *ppos)
2477 {
2478 struct netns_ipvs *ipvs = table->extra2;
2479 int *valp = table->data;
2480 int val = *valp;
2481 int ret;
2482
2483 struct ctl_table tmp_table = {
2484 .data = &val,
2485 .maxlen = sizeof(int),
2486 };
2487
2488 ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
2489 if (write && ret >= 0) {
2490 if (val < -8 || val > 8) {
2491 ret = -EINVAL;
2492 } else {
2493 *valp = val;
2494 if (rcu_access_pointer(ipvs->svc_table))
2495 mod_delayed_work(system_unbound_wq,
2496 &ipvs->svc_resize_work, 0);
2497 }
2498 }
2499 return ret;
2500 }
2501
2502 /*
2503 * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
2504 * Do not change the order or insert new entries without
2505 * aligning with the netns init in ip_vs_control_net_init()
2506 */
2507
2508 static struct ctl_table vs_vars[] = {
2509 {
2510 .procname = "amemthresh",
2511 .maxlen = sizeof(int),
2512 .mode = 0644,
2513 .proc_handler = proc_dointvec,
2514 },
2515 {
2516 .procname = "am_droprate",
2517 .maxlen = sizeof(int),
2518 .mode = 0644,
2519 .proc_handler = proc_dointvec,
2520 },
2521 {
2522 .procname = "drop_entry",
2523 .maxlen = sizeof(int),
2524 .mode = 0644,
2525 .proc_handler = proc_do_defense_mode,
2526 },
2527 {
2528 .procname = "drop_packet",
2529 .maxlen = sizeof(int),
2530 .mode = 0644,
2531 .proc_handler = proc_do_defense_mode,
2532 },
2533 #ifdef CONFIG_IP_VS_NFCT
2534 {
2535 .procname = "conntrack",
2536 .maxlen = sizeof(int),
2537 .mode = 0644,
2538 .proc_handler = &proc_dointvec,
2539 },
2540 #endif
2541 {
2542 .procname = "secure_tcp",
2543 .maxlen = sizeof(int),
2544 .mode = 0644,
2545 .proc_handler = proc_do_defense_mode,
2546 },
2547 {
2548 .procname = "snat_reroute",
2549 .maxlen = sizeof(int),
2550 .mode = 0644,
2551 .proc_handler = &proc_dointvec,
2552 },
2553 {
2554 .procname = "sync_version",
2555 .maxlen = sizeof(int),
2556 .mode = 0644,
2557 .proc_handler = proc_dointvec_minmax,
2558 .extra1 = SYSCTL_ZERO,
2559 .extra2 = SYSCTL_ONE,
2560 },
2561 {
2562 .procname = "sync_ports",
2563 .maxlen = sizeof(int),
2564 .mode = 0644,
2565 .proc_handler = proc_do_sync_ports,
2566 },
2567 {
2568 .procname = "sync_persist_mode",
2569 .maxlen = sizeof(int),
2570 .mode = 0644,
2571 .proc_handler = proc_dointvec,
2572 },
2573 {
2574 .procname = "sync_qlen_max",
2575 .maxlen = sizeof(unsigned long),
2576 .mode = 0644,
2577 .proc_handler = proc_doulongvec_minmax,
2578 },
2579 {
2580 .procname = "sync_sock_size",
2581 .maxlen = sizeof(int),
2582 .mode = 0644,
2583 .proc_handler = proc_dointvec,
2584 },
2585 {
2586 .procname = "cache_bypass",
2587 .maxlen = sizeof(int),
2588 .mode = 0644,
2589 .proc_handler = proc_dointvec,
2590 },
2591 {
2592 .procname = "expire_nodest_conn",
2593 .maxlen = sizeof(int),
2594 .mode = 0644,
2595 .proc_handler = proc_dointvec,
2596 },
2597 {
2598 .procname = "sloppy_tcp",
2599 .maxlen = sizeof(int),
2600 .mode = 0644,
2601 .proc_handler = proc_dointvec,
2602 },
2603 {
2604 .procname = "sloppy_sctp",
2605 .maxlen = sizeof(int),
2606 .mode = 0644,
2607 .proc_handler = proc_dointvec,
2608 },
2609 {
2610 .procname = "expire_quiescent_template",
2611 .maxlen = sizeof(int),
2612 .mode = 0644,
2613 .proc_handler = proc_dointvec,
2614 },
2615 {
2616 .procname = "sync_threshold",
2617 .maxlen =
2618 sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold),
2619 .mode = 0644,
2620 .proc_handler = proc_do_sync_threshold,
2621 },
2622 {
2623 .procname = "sync_refresh_period",
2624 .maxlen = sizeof(int),
2625 .mode = 0644,
2626 .proc_handler = proc_dointvec_jiffies,
2627 },
2628 {
2629 .procname = "sync_retries",
2630 .maxlen = sizeof(int),
2631 .mode = 0644,
2632 .proc_handler = proc_dointvec_minmax,
2633 .extra1 = SYSCTL_ZERO,
2634 .extra2 = SYSCTL_THREE,
2635 },
2636 {
2637 .procname = "nat_icmp_send",
2638 .maxlen = sizeof(int),
2639 .mode = 0644,
2640 .proc_handler = proc_dointvec,
2641 },
2642 {
2643 .procname = "pmtu_disc",
2644 .maxlen = sizeof(int),
2645 .mode = 0644,
2646 .proc_handler = proc_dointvec,
2647 },
2648 {
2649 .procname = "backup_only",
2650 .maxlen = sizeof(int),
2651 .mode = 0644,
2652 .proc_handler = proc_dointvec,
2653 },
2654 {
2655 .procname = "conn_reuse_mode",
2656 .maxlen = sizeof(int),
2657 .mode = 0644,
2658 .proc_handler = proc_dointvec,
2659 },
2660 {
2661 .procname = "schedule_icmp",
2662 .maxlen = sizeof(int),
2663 .mode = 0644,
2664 .proc_handler = proc_dointvec,
2665 },
2666 {
2667 .procname = "ignore_tunneled",
2668 .maxlen = sizeof(int),
2669 .mode = 0644,
2670 .proc_handler = proc_dointvec,
2671 },
2672 {
2673 .procname = "run_estimation",
2674 .maxlen = sizeof(int),
2675 .mode = 0644,
2676 .proc_handler = ipvs_proc_run_estimation,
2677 },
2678 {
2679 .procname = "est_cpulist",
2680 .maxlen = NR_CPUS, /* unused */
2681 .mode = 0644,
2682 .proc_handler = ipvs_proc_est_cpulist,
2683 },
2684 {
2685 .procname = "est_nice",
2686 .maxlen = sizeof(int),
2687 .mode = 0644,
2688 .proc_handler = ipvs_proc_est_nice,
2689 },
2690 {
2691 .procname = "conn_lfactor",
2692 .maxlen = sizeof(int),
2693 .mode = 0644,
2694 .proc_handler = ipvs_proc_conn_lfactor,
2695 },
2696 {
2697 .procname = "svc_lfactor",
2698 .maxlen = sizeof(int),
2699 .mode = 0644,
2700 .proc_handler = ipvs_proc_svc_lfactor,
2701 },
2702 #ifdef CONFIG_IP_VS_DEBUG
2703 {
2704 .procname = "debug_level",
2705 .data = &sysctl_ip_vs_debug_level,
2706 .maxlen = sizeof(int),
2707 .mode = 0644,
2708 .proc_handler = proc_dointvec,
2709 },
2710 #endif
2711 };
2712
2713 #endif
2714
2715 #ifdef CONFIG_PROC_FS
2716
2717 struct ip_vs_iter {
2718 struct seq_net_private p; /* Do not move this, netns depends upon it */
2719 struct ip_vs_rht *t;
2720 u32 bucket;
2721 };
2722
2723 /*
2724 * Write the contents of the VS rule table to a PROCfs file.
2725 * (It is kept just for backward compatibility)
2726 */
2727 static inline const char *ip_vs_fwd_name(unsigned int flags)
2728 {
2729 switch (flags & IP_VS_CONN_F_FWD_MASK) {
2730 case IP_VS_CONN_F_LOCALNODE:
2731 return "Local";
2732 case IP_VS_CONN_F_TUNNEL:
2733 return "Tunnel";
2734 case IP_VS_CONN_F_DROUTE:
2735 return "Route";
2736 default:
2737 return "Masq";
2738 }
2739 }
2740
2741 /* Do not expect consistent view during add, del and move(table resize).
2742 * We may miss entries and even show duplicates.
2743 */
2744 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
2745 {
2746 struct ip_vs_iter *iter = seq->private;
2747 struct ip_vs_rht *t = iter->t;
2748 struct ip_vs_service *svc;
2749 struct hlist_bl_node *e;
2750 int idx;
2751
2752 if (!t)
2753 return NULL;
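/* Skip 'pos' entries and return the one at that position,
 * remembering its bucket for the iterator.
 */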
2754 for (idx = 0; idx < t->size; idx++) {
2755 hlist_bl_for_each_entry_rcu(svc, e, &t->buckets[idx], s_list) {
2756 if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key)))
2757 break;
2758 if (pos-- == 0) {
2759 iter->bucket = idx;
2760 return svc;
2761 }
2762 }
2763 }
2764 return NULL;
2765 }
2766
2767 static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
2768 __acquires(RCU)
2769 {
2770 struct ip_vs_iter *iter = seq->private;
2771 struct net *net = seq_file_net(seq);
2772 struct netns_ipvs *ipvs = net_ipvs(net);
2773
2774 rcu_read_lock();
2775 iter->t = rcu_dereference(ipvs->svc_table);
2776 return *pos ? ip_vs_info_array(seq, *pos - 1) : SEQ_START_TOKEN;
2777 }
2778
2779
2780 static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2781 {
2782 struct ip_vs_service *svc;
2783 struct ip_vs_iter *iter;
2784 struct hlist_bl_node *e;
2785 struct ip_vs_rht *t;
2786
2787 ++*pos;
2788 if (v == SEQ_START_TOKEN)
2789 return ip_vs_info_array(seq,0);
2790
2791 svc = v;
2792 iter = seq->private;
2793 t = iter->t;
2794 if (!t)
2795 return NULL;
2796
2797 hlist_bl_for_each_entry_continue_rcu(svc, e, s_list) {
2798 /* Our cursor was moved to new table ? */
2799 if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key)))
2800 break;
2801 return svc;
2802 }
2803
2804 while (++iter->bucket < t->size) {
2805 hlist_bl_for_each_entry_rcu(svc, e, &t->buckets[iter->bucket],
2806 s_list) {
2807 if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key)))
2808 break;
2809 return svc;
2810 }
2811 }
2812 return NULL;
2813 }
2814
2815 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
2816 __releases(RCU)
2817 {
2818 rcu_read_unlock();
2819 }
2820
2821
2822 static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
2823 {
2824 struct net *net = seq_file_net(seq);
2825 struct netns_ipvs *ipvs = net_ipvs(net);
2826
2827 if (v == SEQ_START_TOKEN) {
2828 seq_printf(seq,
2829 "IP Virtual Server version %d.%d.%d (size=%d)\n",
2830 NVERSION(IP_VS_VERSION_CODE), get_conn_tab_size(ipvs));
2831 seq_puts(seq,
2832 "Prot LocalAddress:Port Scheduler Flags\n");
2833 seq_puts(seq,
2834 " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
2835 } else {
2836 const struct ip_vs_service *svc = v;
2837 const struct ip_vs_dest *dest;
2838 struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler);
2839 char *sched_name = sched ? sched->name : "none";
2840
2841 if (!svc->fwmark) {
2842 #ifdef CONFIG_IP_VS_IPV6
2843 if (svc->af == AF_INET6)
2844 seq_printf(seq, "%s [%pI6]:%04X %s ",
2845 ip_vs_proto_name(svc->protocol),
2846 &svc->addr.in6,
2847 ntohs(svc->port),
2848 sched_name);
2849 else
2850 #endif
2851 seq_printf(seq, "%s %08X:%04X %s %s ",
2852 ip_vs_proto_name(svc->protocol),
2853 ntohl(svc->addr.ip),
2854 ntohs(svc->port),
2855 sched_name,
2856 (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
2857 } else {
2858 seq_printf(seq, "FWM %08X %s %s",
2859 svc->fwmark, sched_name,
2860 (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
2861 }
2862
2863 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
2864 seq_printf(seq, "persistent %d %08X\n",
2865 svc->timeout,
2866 ntohl(svc->netmask));
2867 else
2868 seq_putc(seq, '\n');
2869
2870 list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
2871 #ifdef CONFIG_IP_VS_IPV6
2872 if (dest->af == AF_INET6)
2873 seq_printf(seq,
2874 " -> [%pI6]:%04X"
2875 " %-7s %-6d %-10d %-10d\n",
2876 &dest->addr.in6,
2877 ntohs(dest->port),
2878 ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
2879 atomic_read(&dest->weight),
2880 atomic_read(&dest->activeconns),
2881 atomic_read(&dest->inactconns));
2882 else
2883 #endif
2884 seq_printf(seq,
2885 " -> %08X:%04X "
2886 "%-7s %-6d %-10d %-10d\n",
2887 ntohl(dest->addr.ip),
2888 ntohs(dest->port),
2889 ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
2890 atomic_read(&dest->weight),
2891 atomic_read(&dest->activeconns),
2892 atomic_read(&dest->inactconns));
2893
2894 }
2895 }
2896 return 0;
2897 }
2898
2899 static const struct seq_operations ip_vs_info_seq_ops = {
2900 .start = ip_vs_info_seq_start,
2901 .next = ip_vs_info_seq_next,
2902 .stop = ip_vs_info_seq_stop,
2903 .show = ip_vs_info_seq_show,
2904 };
2905
2906 static int ip_vs_stats_show(struct seq_file *seq, void *v)
2907 {
2908 struct net *net = seq_file_single_net(seq);
2909 struct ip_vs_kstats show;
2910
2911 /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
2912 seq_puts(seq,
2913 " Total Incoming Outgoing Incoming Outgoing\n");
2914 seq_puts(seq,
2915 " Conns Packets Packets Bytes Bytes\n");
2916
2917 ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats->s);
2918 seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n",
2919 (unsigned long long)show.conns,
2920 (unsigned long long)show.inpkts,
2921 (unsigned long long)show.outpkts,
2922 (unsigned long long)show.inbytes,
2923 (unsigned long long)show.outbytes);
2924
2925 /* 01234567 01234567 01234567 0123456701234567 0123456701234567*/
2926 seq_puts(seq,
2927 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
2928 seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n",
2929 (unsigned long long)show.cps,
2930 (unsigned long long)show.inpps,
2931 (unsigned long long)show.outpps,
2932 (unsigned long long)show.inbps,
2933 (unsigned long long)show.outbps);
2934
2935 return 0;
2936 }
2937
2938 static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
2939 {
2940 struct net *net = seq_file_single_net(seq);
2941 struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats->s;
2942 struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats;
2943 struct ip_vs_kstats kstats;
2944 int i;
2945
2946 /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
2947 seq_puts(seq,
2948 " Total Incoming Outgoing Incoming Outgoing\n");
2949 seq_puts(seq,
2950 "CPU Conns Packets Packets Bytes Bytes\n");
2951
2952 for_each_possible_cpu(i) {
2953 struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i);
2954 unsigned int start;
2955 u64 conns, inpkts, outpkts, inbytes, outbytes;
2956
2957 do {
2958 start = u64_stats_fetch_begin(&u->syncp);
2959 conns = u64_stats_read(&u->cnt.conns);
2960 inpkts = u64_stats_read(&u->cnt.inpkts);
2961 outpkts = u64_stats_read(&u->cnt.outpkts);
2962 inbytes = u64_stats_read(&u->cnt.inbytes);
2963 outbytes = u64_stats_read(&u->cnt.outbytes);
2964 } while (u64_stats_fetch_retry(&u->syncp, start));
2965
2966 seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n",
2967 i, (u64)conns, (u64)inpkts,
2968 (u64)outpkts, (u64)inbytes,
2969 (u64)outbytes);
2970 }
2971
2972 ip_vs_copy_stats(&kstats, tot_stats);
2973
2974 seq_printf(seq, " ~ %8LX %8LX %8LX %16LX %16LX\n\n",
2975 (unsigned long long)kstats.conns,
2976 (unsigned long long)kstats.inpkts,
2977 (unsigned long long)kstats.outpkts,
2978 (unsigned long long)kstats.inbytes,
2979 (unsigned long long)kstats.outbytes);
2980
2981 /* ... 01234567 01234567 01234567 0123456701234567 0123456701234567 */
2982 seq_puts(seq,
2983 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
2984 seq_printf(seq, " %8LX %8LX %8LX %16LX %16LX\n",
2985 kstats.cps,
2986 kstats.inpps,
2987 kstats.outpps,
2988 kstats.inbps,
2989 kstats.outbps);
2990
2991 return 0;
2992 }
2993
2994 static int ip_vs_status_show(struct seq_file *seq, void *v)
2995 {
2996 struct net *net = seq_file_single_net(seq);
2997 struct netns_ipvs *ipvs = net_ipvs(net);
2998 unsigned int resched_score = 0;
2999 struct ip_vs_conn_hnode *hn;
3000 struct hlist_bl_head *head;
3001 struct ip_vs_service *svc;
3002 struct ip_vs_rht *t, *pt;
3003 struct hlist_bl_node *e;
3004 int old_gen, new_gen;
3005 u32 counts[8];
3006 u32 bucket;
3007 int count;
3008 u32 sum1;
3009 u32 sum;
3010 int i;
3011
3012 rcu_read_lock();
3013
3014 t = rcu_dereference(ipvs->conn_tab);
3015
3016 seq_printf(seq, "Conns:\t%d\n", atomic_read(&ipvs->conn_count));
3017 seq_printf(seq, "Conn buckets:\t%d (%d bits, lfactor %d)\n",
3018 t ? t->size : 0, t ? t->bits : 0, t ? t->lfactor : 0);
3019
3020 if (!atomic_read(&ipvs->conn_count))
3021 goto after_conns;
3022 old_gen = atomic_read(&ipvs->conn_tab_changes);
3023
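/* Build a histogram of connection bucket chain lengths: counts[i] is
 * the number of buckets holding i entries, with the last slot
 * aggregating longer chains. Restart if the table is replaced by a
 * resize while we yield.
 */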
3024 repeat_conn:
3025 smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */
3026 memset(counts, 0, sizeof(counts));
3027 ip_vs_rht_for_each_table_rcu(ipvs->conn_tab, t, pt) {
3028 for (bucket = 0; bucket < t->size; bucket++) {
3029 DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();
3030
3031 count = 0;
3032 resched_score++;
3033 ip_vs_rht_walk_bucket_rcu(t, bucket, head) {
3034 count = 0;
3035 hlist_bl_for_each_entry_rcu(hn, e, head, node)
3036 count++;
3037 }
3038 resched_score += count;
3039 if (resched_score >= 100) {
3040 resched_score = 0;
3041 cond_resched_rcu();
3042 new_gen = atomic_read(&ipvs->conn_tab_changes);
3043 /* New table installed ? */
3044 if (old_gen != new_gen) {
3045 old_gen = new_gen;
3046 goto repeat_conn;
3047 }
3048 }
3049 counts[min(count, (int)ARRAY_SIZE(counts) - 1)]++;
3050 }
3051 }
3052 for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++)
3053 sum += counts[i];
3054 sum1 = sum - counts[0];
3055 seq_printf(seq, "Conn buckets empty:\t%u (%lu%%)\n",
3056 counts[0], (unsigned long)counts[0] * 100 / max(sum, 1U));
3057 for (i = 1; i < ARRAY_SIZE(counts); i++) {
3058 if (!counts[i])
3059 continue;
3060 seq_printf(seq, "Conn buckets len-%d:\t%u (%lu%%)\n",
3061 i, counts[i],
3062 (unsigned long)counts[i] * 100 / max(sum1, 1U));
3063 }
3064
3065 after_conns:
3066 t = rcu_dereference(ipvs->svc_table);
3067
3068 count = ip_vs_get_num_services(ipvs);
3069 seq_printf(seq, "Services:\t%d\n", count);
3070 seq_printf(seq, "Service buckets:\t%d (%d bits, lfactor %d)\n",
3071 t ? t->size : 0, t ? t->bits : 0, t ? t->lfactor : 0);
3072
3073 if (!count)
3074 goto after_svc;
3075 old_gen = atomic_read(&ipvs->svc_table_changes);
3076
3077 repeat_svc:
3078 smp_rmb(); /* ipvs->svc_table and svc_table_changes */
3079 memset(counts, 0, sizeof(counts));
3080 ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, pt) {
3081 for (bucket = 0; bucket < t->size; bucket++) {
3082 DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();
3083
3084 count = 0;
3085 resched_score++;
3086 ip_vs_rht_walk_bucket_rcu(t, bucket, head) {
3087 count = 0;
3088 hlist_bl_for_each_entry_rcu(svc, e, head,
3089 s_list)
3090 count++;
3091 }
3092 resched_score += count;
3093 if (resched_score >= 100) {
3094 resched_score = 0;
3095 cond_resched_rcu();
3096 new_gen = atomic_read(&ipvs->svc_table_changes);
3097 /* New table installed ? */
3098 if (old_gen != new_gen) {
3099 old_gen = new_gen;
3100 goto repeat_svc;
3101 }
3102 }
3103 counts[min(count, (int)ARRAY_SIZE(counts) - 1)]++;
3104 }
3105 }
3106 for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++)
3107 sum += counts[i];
3108 sum1 = sum - counts[0];
3109 seq_printf(seq, "Service buckets empty:\t%u (%lu%%)\n",
3110 counts[0], (unsigned long)counts[0] * 100 / max(sum, 1U));
3111 for (i = 1; i < ARRAY_SIZE(counts); i++) {
3112 if (!counts[i])
3113 continue;
3114 seq_printf(seq, "Service buckets len-%d:\t%u (%lu%%)\n",
3115 i, counts[i],
3116 (unsigned long)counts[i] * 100 / max(sum1, 1U));
3117 }
3118
3119 after_svc:
3120 seq_printf(seq, "Stats thread slots:\t%d (max %lu)\n",
3121 ipvs->est_kt_count, ipvs->est_max_threads);
3122 seq_printf(seq, "Stats chain max len:\t%d\n", ipvs->est_chain_max);
3123 seq_printf(seq, "Stats thread ests:\t%d\n",
3124 ipvs->est_chain_max * IPVS_EST_CHAIN_FACTOR *
3125 IPVS_EST_NTICKS);
3126
3127 rcu_read_unlock();
3128 return 0;
3129 }
3130
3131 #endif
3132
3133 /*
3134 * Set timeout values for tcp tcpfin udp in the timeout_table.
3135 */
3136 static int ip_vs_set_timeout(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u)
3137 {
3138 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
3139 struct ip_vs_proto_data *pd;
3140 #endif
3141
3142 IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
3143 u->tcp_timeout,
3144 u->tcp_fin_timeout,
3145 u->udp_timeout);
3146
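/* Reject timeouts that would overflow when converted to jiffies */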
3147 #ifdef CONFIG_IP_VS_PROTO_TCP
3148 if (u->tcp_timeout < 0 || u->tcp_timeout > (INT_MAX / HZ) ||
3149 u->tcp_fin_timeout < 0 || u->tcp_fin_timeout > (INT_MAX / HZ)) {
3150 return -EINVAL;
3151 }
3152 #endif
3153
3154 #ifdef CONFIG_IP_VS_PROTO_UDP
3155 if (u->udp_timeout < 0 || u->udp_timeout > (INT_MAX / HZ))
3156 return -EINVAL;
3157 #endif
3158
3159 #ifdef CONFIG_IP_VS_PROTO_TCP
3160 if (u->tcp_timeout) {
3161 pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
3162 pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
3163 = u->tcp_timeout * HZ;
3164 }
3165
3166 if (u->tcp_fin_timeout) {
3167 pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
3168 pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]
3169 = u->tcp_fin_timeout * HZ;
3170 }
3171 #endif
3172
3173 #ifdef CONFIG_IP_VS_PROTO_UDP
3174 if (u->udp_timeout) {
3175 pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP);
3176 pd->timeout_table[IP_VS_UDP_S_NORMAL]
3177 = u->udp_timeout * HZ;
3178 }
3179 #endif
3180 return 0;
3181 }
3182
3183 #define CMDID(cmd) (cmd - IP_VS_BASE_CTL)
3184
3185 struct ip_vs_svcdest_user {
3186 struct ip_vs_service_user s;
3187 struct ip_vs_dest_user d;
3188 };
3189
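/* Expected sockopt payload size for each SET command, indexed by
 * command id relative to IP_VS_BASE_CTL (see CMDID()).
 */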
3190 static const unsigned char set_arglen[CMDID(IP_VS_SO_SET_MAX) + 1] = {
3191 [CMDID(IP_VS_SO_SET_ADD)] = sizeof(struct ip_vs_service_user),
3192 [CMDID(IP_VS_SO_SET_EDIT)] = sizeof(struct ip_vs_service_user),
3193 [CMDID(IP_VS_SO_SET_DEL)] = sizeof(struct ip_vs_service_user),
3194 [CMDID(IP_VS_SO_SET_ADDDEST)] = sizeof(struct ip_vs_svcdest_user),
3195 [CMDID(IP_VS_SO_SET_DELDEST)] = sizeof(struct ip_vs_svcdest_user),
3196 [CMDID(IP_VS_SO_SET_EDITDEST)] = sizeof(struct ip_vs_svcdest_user),
3197 [CMDID(IP_VS_SO_SET_TIMEOUT)] = sizeof(struct ip_vs_timeout_user),
3198 [CMDID(IP_VS_SO_SET_STARTDAEMON)] = sizeof(struct ip_vs_daemon_user),
3199 [CMDID(IP_VS_SO_SET_STOPDAEMON)] = sizeof(struct ip_vs_daemon_user),
3200 [CMDID(IP_VS_SO_SET_ZERO)] = sizeof(struct ip_vs_service_user),
3201 };
3202
3203 union ip_vs_set_arglen {
3204 struct ip_vs_service_user field_IP_VS_SO_SET_ADD;
3205 struct ip_vs_service_user field_IP_VS_SO_SET_EDIT;
3206 struct ip_vs_service_user field_IP_VS_SO_SET_DEL;
3207 struct ip_vs_svcdest_user field_IP_VS_SO_SET_ADDDEST;
3208 struct ip_vs_svcdest_user field_IP_VS_SO_SET_DELDEST;
3209 struct ip_vs_svcdest_user field_IP_VS_SO_SET_EDITDEST;
3210 struct ip_vs_timeout_user field_IP_VS_SO_SET_TIMEOUT;
3211 struct ip_vs_daemon_user field_IP_VS_SO_SET_STARTDAEMON;
3212 struct ip_vs_daemon_user field_IP_VS_SO_SET_STOPDAEMON;
3213 struct ip_vs_service_user field_IP_VS_SO_SET_ZERO;
3214 };
3215
3216 #define MAX_SET_ARGLEN sizeof(union ip_vs_set_arglen)
3217
3218 static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
3219 struct ip_vs_service_user *usvc_compat)
3220 {
3221 memset(usvc, 0, sizeof(*usvc));
3222
3223 usvc->af = AF_INET;
3224 usvc->protocol = usvc_compat->protocol;
3225 usvc->addr.ip = usvc_compat->addr;
3226 usvc->port = usvc_compat->port;
3227 usvc->fwmark = usvc_compat->fwmark;
3228
3229 /* Deep copy of sched_name is not needed here */
3230 usvc->sched_name = usvc_compat->sched_name;
3231
3232 usvc->flags = usvc_compat->flags;
3233 usvc->timeout = usvc_compat->timeout;
3234 usvc->netmask = usvc_compat->netmask;
3235 }
3236
3237 static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
3238 struct ip_vs_dest_user *udest_compat)
3239 {
3240 memset(udest, 0, sizeof(*udest));
3241
3242 udest->addr.ip = udest_compat->addr;
3243 udest->port = udest_compat->port;
3244 udest->conn_flags = udest_compat->conn_flags;
3245 udest->weight = udest_compat->weight;
3246 udest->u_threshold = udest_compat->u_threshold;
3247 udest->l_threshold = udest_compat->l_threshold;
3248 udest->af = AF_INET;
3249 udest->tun_type = IP_VS_CONN_F_TUNNEL_TYPE_IPIP;
3250 }
3251
3252 static int
3253 do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len)
3254 {
3255 struct net *net = sock_net(sk);
3256 int ret;
3257 unsigned char arg[MAX_SET_ARGLEN];
3258 struct ip_vs_service_user *usvc_compat;
3259 struct ip_vs_service_user_kern usvc;
3260 struct ip_vs_service *svc;
3261 struct ip_vs_dest_user *udest_compat;
3262 struct ip_vs_dest_user_kern udest;
3263 struct netns_ipvs *ipvs = net_ipvs(net);
3264
3265 BUILD_BUG_ON(sizeof(arg) > 255);
3266 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3267 return -EPERM;
3268
3269 if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
3270 return -EINVAL;
3271 if (len != set_arglen[CMDID(cmd)]) {
3272 IP_VS_DBG(1, "set_ctl: len %u != %u\n",
3273 len, set_arglen[CMDID(cmd)]);
3274 return -EINVAL;
3275 }
3276
3277 if (copy_from_sockptr(arg, ptr, len) != 0)
3278 return -EFAULT;
3279
3280 /* Handle daemons since they have another lock */
3281 if (cmd == IP_VS_SO_SET_STARTDAEMON ||
3282 cmd == IP_VS_SO_SET_STOPDAEMON) {
3283 struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;
3284
3285 if (cmd == IP_VS_SO_SET_STARTDAEMON) {
3286 struct ipvs_sync_daemon_cfg cfg;
3287
3288 memset(&cfg, 0, sizeof(cfg));
3289 ret = -EINVAL;
3290 if (strscpy(cfg.mcast_ifn, dm->mcast_ifn,
3291 sizeof(cfg.mcast_ifn)) <= 0)
3292 return ret;
3293 cfg.syncid = dm->syncid;
3294 ret = start_sync_thread(ipvs, &cfg, dm->state);
3295 } else {
3296 ret = stop_sync_thread(ipvs, dm->state);
3297 }
3298 return ret;
3299 }
3300
3301 mutex_lock(&ipvs->service_mutex);
3302 if (cmd == IP_VS_SO_SET_FLUSH) {
3303 /* Flush the virtual service */
3304 ret = ip_vs_flush(ipvs, false);
3305 goto out_unlock;
3306 } else if (cmd == IP_VS_SO_SET_TIMEOUT) {
3307 /* Set timeout values for (tcp tcpfin udp) */
3308 ret = ip_vs_set_timeout(ipvs, (struct ip_vs_timeout_user *)arg);
3309 goto out_unlock;
3310 } else if (!len) {
3311 /* No more commands with len == 0 below */
3312 ret = -EINVAL;
3313 goto out_unlock;
3314 }
3315
3316 usvc_compat = (struct ip_vs_service_user *)arg;
3317 udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);
3318
3319 /* We only use the new structs internally, so copy userspace compat
3320 * structs to extended internal versions */
3321 ip_vs_copy_usvc_compat(&usvc, usvc_compat);
3322 ip_vs_copy_udest_compat(&udest, udest_compat);
3323
3324 if (cmd == IP_VS_SO_SET_ZERO) {
3325 /* if no service address is set, zero counters in all */
3326 if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
3327 ret = ip_vs_zero_all(ipvs);
3328 goto out_unlock;
3329 }
3330 }
3331
3332 if ((cmd == IP_VS_SO_SET_ADD || cmd == IP_VS_SO_SET_EDIT) &&
3333 strnlen(usvc.sched_name, IP_VS_SCHEDNAME_MAXLEN) ==
3334 IP_VS_SCHEDNAME_MAXLEN) {
3335 ret = -EINVAL;
3336 goto out_unlock;
3337 }
3338
3339 /* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */
3340 if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP &&
3341 usvc.protocol != IPPROTO_SCTP) {
3342 pr_err("set_ctl: invalid protocol: %d %pI4:%d\n",
3343 usvc.protocol, &usvc.addr.ip,
3344 ntohs(usvc.port));
3345 ret = -EFAULT;
3346 goto out_unlock;
3347 }
3348
3349 /* Lookup the exact service by <protocol, addr, port> or fwmark */
3350 rcu_read_lock();
3351 if (usvc.fwmark == 0)
3352 svc = __ip_vs_service_find(ipvs, usvc.af, usvc.protocol,
3353 &usvc.addr, usvc.port);
3354 else
3355 svc = __ip_vs_svc_fwm_find(ipvs, usvc.af, usvc.fwmark);
3356 rcu_read_unlock();
3357
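/* All commands except ADD require an existing service with a
 * matching protocol.
 */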
3358 if (cmd != IP_VS_SO_SET_ADD
3359 && (svc == NULL || svc->protocol != usvc.protocol)) {
3360 ret = -ESRCH;
3361 goto out_unlock;
3362 }
3363
3364 switch (cmd) {
3365 case IP_VS_SO_SET_ADD:
3366 if (svc != NULL)
3367 ret = -EEXIST;
3368 else
3369 ret = ip_vs_add_service(ipvs, &usvc, &svc);
3370 break;
3371 case IP_VS_SO_SET_EDIT:
3372 ret = ip_vs_edit_service(svc, &usvc);
3373 break;
3374 case IP_VS_SO_SET_DEL:
3375 ret = ip_vs_del_service(svc);
3376 if (!ret)
3377 goto out_unlock;
3378 break;
3379 case IP_VS_SO_SET_ZERO:
3380 ret = ip_vs_zero_service(svc);
3381 break;
3382 case IP_VS_SO_SET_ADDDEST:
3383 ret = ip_vs_add_dest(svc, &udest);
3384 break;
3385 case IP_VS_SO_SET_EDITDEST:
3386 ret = ip_vs_edit_dest(svc, &udest);
3387 break;
3388 case IP_VS_SO_SET_DELDEST:
3389 ret = ip_vs_del_dest(svc, &udest);
3390 break;
3391 default:
3392 WARN_ON_ONCE(1);
3393 ret = -EINVAL;
3394 break;
3395 }
3396
3397 out_unlock:
3398 mutex_unlock(&ipvs->service_mutex);
3399 return ret;
3400 }
3401
3402
3403 static void
3404 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
3405 {
3406 struct ip_vs_scheduler *sched;
3407 struct ip_vs_kstats kstats;
3408 char *sched_name;
3409
3410 sched = rcu_dereference_protected(src->scheduler, 1);
3411 sched_name = sched ? sched->name : "none";
3412 dst->protocol = src->protocol;
3413 dst->addr = src->addr.ip;
3414 dst->port = src->port;
3415 dst->fwmark = src->fwmark;
3416 strscpy(dst->sched_name, sched_name, sizeof(dst->sched_name));
3417 dst->flags = src->flags;
3418 dst->timeout = src->timeout / HZ;
3419 dst->netmask = src->netmask;
3420 dst->num_dests = src->num_dests;
3421 ip_vs_copy_stats(&kstats, &src->stats);
3422 ip_vs_export_stats_user(&dst->stats, &kstats);
3423 }
3424
3425 static inline int
3426 __ip_vs_get_service_entries(struct netns_ipvs *ipvs,
3427 const struct ip_vs_get_services *get,
3428 struct ip_vs_get_services __user *uptr)
3429 {
3430 struct ip_vs_service_entry entry;
3431 DECLARE_IP_VS_RHT_WALK_BUCKETS();
3432 struct hlist_bl_head *head;
3433 struct ip_vs_service *svc;
3434 struct hlist_bl_node *e;
3435 int count = 0;
3436 int ret = 0;
3437
3438 lockdep_assert_held(&ipvs->svc_resize_sem);
3439 /* All service modifications are disabled, go ahead */
3440 ip_vs_rht_walk_buckets(ipvs->svc_table, head) {
3441 hlist_bl_for_each_entry(svc, e, head, s_list) {
3442 /* Only expose IPv4 entries to old interface */
3443 if (svc->af != AF_INET)
3444 continue;
3445
3446 if (count >= get->num_services)
3447 goto out;
3448 memset(&entry, 0, sizeof(entry));
3449 ip_vs_copy_service(&entry, svc);
3450 if (copy_to_user(&uptr->entrytable[count],
3451 &entry, sizeof(entry))) {
3452 ret = -EFAULT;
3453 goto out;
3454 }
3455 count++;
3456 }
3457 }
3458
3459 out:
3460 return ret;
3461 }
3462
3463 static inline int
3464 __ip_vs_get_dest_entries(struct netns_ipvs *ipvs, const struct ip_vs_get_dests *get,
3465 struct ip_vs_get_dests __user *uptr)
3466 {
3467 struct ip_vs_service *svc;
3468 union nf_inet_addr addr = { .ip = get->addr };
3469 int ret = 0;
3470
3471 rcu_read_lock();
3472 if (get->fwmark)
3473 svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, get->fwmark);
3474 else
3475 svc = __ip_vs_service_find(ipvs, AF_INET, get->protocol, &addr,
3476 get->port);
3477 rcu_read_unlock();
3478
3479 if (svc) {
3480 int count = 0;
3481 struct ip_vs_dest *dest;
3482 struct ip_vs_dest_entry entry;
3483 struct ip_vs_kstats kstats;
3484
3485 memset(&entry, 0, sizeof(entry));
3486 list_for_each_entry(dest, &svc->destinations, n_list) {
3487 if (count >= get->num_dests)
3488 break;
3489
3490 /* Cannot expose heterogeneous members via sockopt
3491 * interface
3492 */
3493 if (dest->af != svc->af)
3494 continue;
3495
3496 entry.addr = dest->addr.ip;
3497 entry.port = dest->port;
3498 entry.conn_flags = atomic_read(&dest->conn_flags);
3499 entry.weight = atomic_read(&dest->weight);
3500 entry.u_threshold = dest->u_threshold;
3501 entry.l_threshold = dest->l_threshold;
3502 entry.activeconns = atomic_read(&dest->activeconns);
3503 entry.inactconns = atomic_read(&dest->inactconns);
3504 entry.persistconns = atomic_read(&dest->persistconns);
3505 ip_vs_copy_stats(&kstats, &dest->stats);
3506 ip_vs_export_stats_user(&entry.stats, &kstats);
3507 if (copy_to_user(&uptr->entrytable[count],
3508 &entry, sizeof(entry))) {
3509 ret = -EFAULT;
3510 break;
3511 }
3512 count++;
3513 }
3514 } else
3515 ret = -ESRCH;
3516 return ret;
3517 }
3518
3519 static inline void
3520 __ip_vs_get_timeouts(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u)
3521 {
3522 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
3523 struct ip_vs_proto_data *pd;
3524 #endif
3525
3526 memset(u, 0, sizeof (*u));
3527
3528 #ifdef CONFIG_IP_VS_PROTO_TCP
3529 pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
3530 u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
3531 u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
3532 #endif
3533 #ifdef CONFIG_IP_VS_PROTO_UDP
3534 pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP);
3535 u->udp_timeout =
3536 pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
3537 #endif
3538 }
3539
3540 static const unsigned char get_arglen[CMDID(IP_VS_SO_GET_MAX) + 1] = {
3541 [CMDID(IP_VS_SO_GET_VERSION)] = 64,
3542 [CMDID(IP_VS_SO_GET_INFO)] = sizeof(struct ip_vs_getinfo),
3543 [CMDID(IP_VS_SO_GET_SERVICES)] = sizeof(struct ip_vs_get_services),
3544 [CMDID(IP_VS_SO_GET_SERVICE)] = sizeof(struct ip_vs_service_entry),
3545 [CMDID(IP_VS_SO_GET_DESTS)] = sizeof(struct ip_vs_get_dests),
3546 [CMDID(IP_VS_SO_GET_TIMEOUT)] = sizeof(struct ip_vs_timeout_user),
3547 [CMDID(IP_VS_SO_GET_DAEMON)] = 2 * sizeof(struct ip_vs_daemon_user),
3548 };
3549
3550 union ip_vs_get_arglen {
3551 char field_IP_VS_SO_GET_VERSION[64];
3552 struct ip_vs_getinfo field_IP_VS_SO_GET_INFO;
3553 struct ip_vs_get_services field_IP_VS_SO_GET_SERVICES;
3554 struct ip_vs_service_entry field_IP_VS_SO_GET_SERVICE;
3555 struct ip_vs_get_dests field_IP_VS_SO_GET_DESTS;
3556 struct ip_vs_timeout_user field_IP_VS_SO_GET_TIMEOUT;
3557 struct ip_vs_daemon_user field_IP_VS_SO_GET_DAEMON[2];
3558 };
3559
3560 #define MAX_GET_ARGLEN sizeof(union ip_vs_get_arglen)
3561
3562 static int
3563 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
3564 {
3565 unsigned char arg[MAX_GET_ARGLEN];
3566 int ret = 0;
3567 unsigned int copylen;
3568 struct net *net = sock_net(sk);
3569 struct netns_ipvs *ipvs = net_ipvs(net);
3570
3571 BUG_ON(!net);
3572 BUILD_BUG_ON(sizeof(arg) > 255);
3573 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3574 return -EPERM;
3575
3576 if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX)
3577 return -EINVAL;
3578
3579 copylen = get_arglen[CMDID(cmd)];
3580 if (*len < (int) copylen) {
3581 IP_VS_DBG(1, "get_ctl: len %d < %u\n", *len, copylen);
3582 return -EINVAL;
3583 }
3584
3585 if (copy_from_user(arg, user, copylen) != 0)
3586 return -EFAULT;
3587 /*
3588 * Handle daemons first since they have their own locking
3589 */
3590 if (cmd == IP_VS_SO_GET_DAEMON) {
3591 struct ip_vs_daemon_user d[2];
3592
3593 memset(&d, 0, sizeof(d));
3594 mutex_lock(&ipvs->sync_mutex);
3595 if (ipvs->sync_state & IP_VS_STATE_MASTER) {
3596 d[0].state = IP_VS_STATE_MASTER;
3597 strscpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn,
3598 sizeof(d[0].mcast_ifn));
3599 d[0].syncid = ipvs->mcfg.syncid;
3600 }
3601 if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
3602 d[1].state = IP_VS_STATE_BACKUP;
3603 strscpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn,
3604 sizeof(d[1].mcast_ifn));
3605 d[1].syncid = ipvs->bcfg.syncid;
3606 }
3607 if (copy_to_user(user, &d, sizeof(d)) != 0)
3608 ret = -EFAULT;
3609 mutex_unlock(&ipvs->sync_mutex);
3610 return ret;
3611 }
3612
3613 if (cmd == IP_VS_SO_GET_SERVICES) {
3614 struct ip_vs_get_services *get;
3615 size_t size;
3616
3617 get = (struct ip_vs_get_services *)arg;
3618 size = struct_size(get, entrytable, get->num_services);
3619 if (*len != size) {
3620 pr_err("length: %u != %zu\n", *len, size);
3621 return -EINVAL;
3622 }
3623 /* Protect against table resizer moving the entries.
3624 * Try reverse locking, so that we do not hold the mutex
3625 * while waiting for semaphore.
3626 */
3627 while (1) {
3628 ret = down_read_killable(&ipvs->svc_resize_sem);
3629 if (ret < 0)
3630 return ret;
3631 if (mutex_trylock(&ipvs->service_mutex))
3632 break;
3633 up_read(&ipvs->svc_resize_sem);
3634 cond_resched();
3635 }
3636 ret = __ip_vs_get_service_entries(ipvs, get, user);
3637 up_read(&ipvs->svc_resize_sem);
3638 mutex_unlock(&ipvs->service_mutex);
3639 return ret;
3640 }
3641
3642 mutex_lock(&ipvs->service_mutex);
3643 switch (cmd) {
3644 case IP_VS_SO_GET_VERSION:
3645 {
3646 char buf[64];
3647
3648 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
3649 NVERSION(IP_VS_VERSION_CODE), get_conn_tab_size(ipvs));
3650 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
3651 ret = -EFAULT;
3652 goto out;
3653 }
3654 *len = strlen(buf)+1;
3655 }
3656 break;
3657
3658 case IP_VS_SO_GET_INFO:
3659 {
3660 struct ip_vs_getinfo info;
3661
3662 info.version = IP_VS_VERSION_CODE;
3663 info.size = get_conn_tab_size(ipvs);
3664 info.num_services =
3665 atomic_read(&ipvs->num_services[IP_VS_AF_INET]);
3666 if (copy_to_user(user, &info, sizeof(info)) != 0)
3667 ret = -EFAULT;
3668 }
3669 break;
3670
3671 case IP_VS_SO_GET_SERVICE:
3672 {
3673 struct ip_vs_service_entry *entry;
3674 struct ip_vs_service *svc;
3675 union nf_inet_addr addr;
3676
3677 entry = (struct ip_vs_service_entry *)arg;
3678 addr.ip = entry->addr;
3679 rcu_read_lock();
3680 if (entry->fwmark)
3681 svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, entry->fwmark);
3682 else
3683 svc = __ip_vs_service_find(ipvs, AF_INET,
3684 entry->protocol, &addr,
3685 entry->port);
3686 rcu_read_unlock();
3687 if (svc) {
3688 ip_vs_copy_service(entry, svc);
3689 if (copy_to_user(user, entry, sizeof(*entry)) != 0)
3690 ret = -EFAULT;
3691 } else
3692 ret = -ESRCH;
3693 }
3694 break;
3695
3696 case IP_VS_SO_GET_DESTS:
3697 {
3698 struct ip_vs_get_dests *get;
3699 size_t size;
3700
3701 get = (struct ip_vs_get_dests *)arg;
3702 size = struct_size(get, entrytable, get->num_dests);
3703 if (*len != size) {
3704 pr_err("length: %u != %zu\n", *len, size);
3705 ret = -EINVAL;
3706 goto out;
3707 }
3708 ret = __ip_vs_get_dest_entries(ipvs, get, user);
3709 }
3710 break;
3711
3712 case IP_VS_SO_GET_TIMEOUT:
3713 {
3714 struct ip_vs_timeout_user t;
3715
3716 __ip_vs_get_timeouts(ipvs, &t);
3717 if (copy_to_user(user, &t, sizeof(t)) != 0)
3718 ret = -EFAULT;
3719 }
3720 break;
3721
3722 default:
3723 ret = -EINVAL;
3724 }
3725
3726 out:
3727 mutex_unlock(&ipvs->service_mutex);
3728 return ret;
3729 }
3730
3731
3732 static struct nf_sockopt_ops ip_vs_sockopts = {
3733 .pf = PF_INET,
3734 .set_optmin = IP_VS_BASE_CTL,
3735 .set_optmax = IP_VS_SO_SET_MAX+1,
3736 .set = do_ip_vs_set_ctl,
3737 .get_optmin = IP_VS_BASE_CTL,
3738 .get_optmax = IP_VS_SO_GET_MAX+1,
3739 .get = do_ip_vs_get_ctl,
3740 .owner = THIS_MODULE,
3741 };
3742
3743 /*
3744 * Generic Netlink interface
3745 */
3746
3747 /* IPVS genetlink family */
3748 static struct genl_family ip_vs_genl_family;
3749
3750 /* Policy used for first-level command attributes */
3751 static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
3752 [IPVS_CMD_ATTR_SERVICE] = { .type = NLA_NESTED },
3753 [IPVS_CMD_ATTR_DEST] = { .type = NLA_NESTED },
3754 [IPVS_CMD_ATTR_DAEMON] = { .type = NLA_NESTED },
3755 [IPVS_CMD_ATTR_TIMEOUT_TCP] = { .type = NLA_U32 },
3756 [IPVS_CMD_ATTR_TIMEOUT_TCP_FIN] = { .type = NLA_U32 },
3757 [IPVS_CMD_ATTR_TIMEOUT_UDP] = { .type = NLA_U32 },
3758 };
3759
3760 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON */
3761 static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
3762 [IPVS_DAEMON_ATTR_STATE] = { .type = NLA_U32 },
3763 [IPVS_DAEMON_ATTR_MCAST_IFN] = { .type = NLA_NUL_STRING,
3764 .len = IP_VS_IFNAME_MAXLEN - 1 },
3765 [IPVS_DAEMON_ATTR_SYNC_ID] = { .type = NLA_U32 },
3766 [IPVS_DAEMON_ATTR_SYNC_MAXLEN] = { .type = NLA_U16 },
3767 [IPVS_DAEMON_ATTR_MCAST_GROUP] = { .type = NLA_U32 },
3768 [IPVS_DAEMON_ATTR_MCAST_GROUP6] = { .len = sizeof(struct in6_addr) },
3769 [IPVS_DAEMON_ATTR_MCAST_PORT] = { .type = NLA_U16 },
3770 [IPVS_DAEMON_ATTR_MCAST_TTL] = { .type = NLA_U8 },
3771 };
3772
3773 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE */
3774 static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
3775 [IPVS_SVC_ATTR_AF] = { .type = NLA_U16 },
3776 [IPVS_SVC_ATTR_PROTOCOL] = { .type = NLA_U16 },
3777 [IPVS_SVC_ATTR_ADDR] = { .type = NLA_BINARY,
3778 .len = sizeof(union nf_inet_addr) },
3779 [IPVS_SVC_ATTR_PORT] = { .type = NLA_U16 },
3780 [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 },
3781 [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING,
3782 .len = IP_VS_SCHEDNAME_MAXLEN - 1 },
3783 [IPVS_SVC_ATTR_PE_NAME] = { .type = NLA_NUL_STRING,
3784 .len = IP_VS_PENAME_MAXLEN },
3785 [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY,
3786 .len = sizeof(struct ip_vs_flags) },
3787 [IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 },
3788 [IPVS_SVC_ATTR_NETMASK] = { .type = NLA_U32 },
3789 [IPVS_SVC_ATTR_STATS] = { .type = NLA_NESTED },
3790 };
3791
3792 /* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST */
3793 static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
3794 [IPVS_DEST_ATTR_ADDR] = { .type = NLA_BINARY,
3795 .len = sizeof(union nf_inet_addr) },
3796 [IPVS_DEST_ATTR_PORT] = { .type = NLA_U16 },
3797 [IPVS_DEST_ATTR_FWD_METHOD] = { .type = NLA_U32 },
3798 [IPVS_DEST_ATTR_WEIGHT] = { .type = NLA_U32 },
3799 [IPVS_DEST_ATTR_U_THRESH] = { .type = NLA_U32 },
3800 [IPVS_DEST_ATTR_L_THRESH] = { .type = NLA_U32 },
3801 [IPVS_DEST_ATTR_ACTIVE_CONNS] = { .type = NLA_U32 },
3802 [IPVS_DEST_ATTR_INACT_CONNS] = { .type = NLA_U32 },
3803 [IPVS_DEST_ATTR_PERSIST_CONNS] = { .type = NLA_U32 },
3804 [IPVS_DEST_ATTR_STATS] = { .type = NLA_NESTED },
3805 [IPVS_DEST_ATTR_ADDR_FAMILY] = { .type = NLA_U16 },
3806 [IPVS_DEST_ATTR_TUN_TYPE] = { .type = NLA_U8 },
3807 [IPVS_DEST_ATTR_TUN_PORT] = { .type = NLA_U16 },
3808 [IPVS_DEST_ATTR_TUN_FLAGS] = { .type = NLA_U16 },
3809 };
3810
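/*
 * Export statistics via netlink. The legacy stats container carries mostly
 * 32-bit counters (truncated from the kernel's 64-bit values), while
 * ip_vs_genl_fill_stats64() below exports the full 64-bit counters.
 */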
3811 static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
3812 struct ip_vs_kstats *kstats)
3813 {
3814 struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type);
3815
3816 if (!nl_stats)
3817 return -EMSGSIZE;
3818
3819 if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, (u32)kstats->conns) ||
3820 nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, (u32)kstats->inpkts) ||
3821 nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, (u32)kstats->outpkts) ||
3822 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes,
3823 IPVS_STATS_ATTR_PAD) ||
3824 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes,
3825 IPVS_STATS_ATTR_PAD) ||
3826 nla_put_u32(skb, IPVS_STATS_ATTR_CPS, (u32)kstats->cps) ||
3827 nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, (u32)kstats->inpps) ||
3828 nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, (u32)kstats->outpps) ||
3829 nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, (u32)kstats->inbps) ||
3830 nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, (u32)kstats->outbps))
3831 goto nla_put_failure;
3832 nla_nest_end(skb, nl_stats);
3833
3834 return 0;
3835
3836 nla_put_failure:
3837 nla_nest_cancel(skb, nl_stats);
3838 return -EMSGSIZE;
3839 }
3840
3841 static int ip_vs_genl_fill_stats64(struct sk_buff *skb, int container_type,
3842 struct ip_vs_kstats *kstats)
3843 {
3844 struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type);
3845
3846 if (!nl_stats)
3847 return -EMSGSIZE;
3848
3849 if (nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CONNS, kstats->conns,
3850 IPVS_STATS_ATTR_PAD) ||
3851 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPKTS, kstats->inpkts,
3852 IPVS_STATS_ATTR_PAD) ||
3853 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPKTS, kstats->outpkts,
3854 IPVS_STATS_ATTR_PAD) ||
3855 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes,
3856 IPVS_STATS_ATTR_PAD) ||
3857 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes,
3858 IPVS_STATS_ATTR_PAD) ||
3859 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CPS, kstats->cps,
3860 IPVS_STATS_ATTR_PAD) ||
3861 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPPS, kstats->inpps,
3862 IPVS_STATS_ATTR_PAD) ||
3863 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPPS, kstats->outpps,
3864 IPVS_STATS_ATTR_PAD) ||
3865 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBPS, kstats->inbps,
3866 IPVS_STATS_ATTR_PAD) ||
3867 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBPS, kstats->outbps,
3868 IPVS_STATS_ATTR_PAD))
3869 goto nla_put_failure;
3870 nla_nest_end(skb, nl_stats);
3871
3872 return 0;
3873
3874 nla_put_failure:
3875 nla_nest_cancel(skb, nl_stats);
3876 return -EMSGSIZE;
3877 }
3878
3879 static int ip_vs_genl_fill_service(struct sk_buff *skb,
3880 struct ip_vs_service *svc)
3881 {
3882 struct ip_vs_scheduler *sched;
3883 struct ip_vs_pe *pe;
3884 struct nlattr *nl_service;
3885 struct ip_vs_flags flags = { .flags = svc->flags,
3886 .mask = ~0 };
3887 struct ip_vs_kstats kstats;
3888 char *sched_name;
3889
3890 nl_service = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_SERVICE);
3891 if (!nl_service)
3892 return -EMSGSIZE;
3893
3894 if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af))
3895 goto nla_put_failure;
3896 if (svc->fwmark) {
3897 if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark))
3898 goto nla_put_failure;
3899 } else {
3900 if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) ||
3901 nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) ||
3902 nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port))
3903 goto nla_put_failure;
3904 }
3905
3906 sched = rcu_dereference(svc->scheduler);
3907 sched_name = sched ? sched->name : "none";
3908 pe = rcu_dereference(svc->pe);
3909 if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched_name) ||
3910 (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) ||
3911 nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) ||
3912 nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) ||
3913 nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask))
3914 goto nla_put_failure;
3915 ip_vs_copy_stats(&kstats, &svc->stats);
3916 if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &kstats))
3917 goto nla_put_failure;
3918 if (ip_vs_genl_fill_stats64(skb, IPVS_SVC_ATTR_STATS64, &kstats))
3919 goto nla_put_failure;
3920
3921 nla_nest_end(skb, nl_service);
3922
3923 return 0;
3924
3925 nla_put_failure:
3926 nla_nest_cancel(skb, nl_service);
3927 return -EMSGSIZE;
3928 }
3929
3930 static int ip_vs_genl_dump_service(struct sk_buff *skb,
3931 struct ip_vs_service *svc,
3932 struct netlink_callback *cb)
3933 {
3934 void *hdr;
3935
3936 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
3937 &ip_vs_genl_family, NLM_F_MULTI,
3938 IPVS_CMD_NEW_SERVICE);
3939 if (!hdr)
3940 return -EMSGSIZE;
3941
3942 if (ip_vs_genl_fill_service(skb, svc) < 0)
3943 goto nla_put_failure;
3944
3945 genlmsg_end(skb, hdr);
3946 return 0;
3947
3948 nla_put_failure:
3949 genlmsg_cancel(skb, hdr);
3950 return -EMSGSIZE;
3951 }
3952
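/*
 * Dump all services: walk the service hash table under RCU with the resize
 * semaphore held for reading, resuming from cb->args[0] across successive
 * netlink dump callbacks.
 */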
3953 static int ip_vs_genl_dump_services(struct sk_buff *skb,
3954 struct netlink_callback *cb)
3955 {
3956 DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU();
3957 struct net *net = sock_net(skb->sk);
3958 struct netns_ipvs *ipvs = net_ipvs(net);
3959 struct hlist_bl_head *head;
3960 struct ip_vs_service *svc;
3961 struct hlist_bl_node *e;
3962 int start = cb->args[0];
3963 int idx = 0;
3964
3965 down_read(&ipvs->svc_resize_sem);
3966 rcu_read_lock();
3967 ip_vs_rht_walk_buckets_safe_rcu(ipvs->svc_table, head) {
3968 hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
3969 if (++idx <= start)
3970 continue;
3971 if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
3972 idx--;
3973 goto nla_put_failure;
3974 }
3975 }
3976 }
3977
3978 nla_put_failure:
3979 rcu_read_unlock();
3980 up_read(&ipvs->svc_resize_sem);
3981 cb->args[0] = idx;
3982
3983 return skb->len;
3984 }
3985
3986 static bool ip_vs_is_af_valid(int af)
3987 {
3988 if (af == AF_INET)
3989 return true;
3990 #ifdef CONFIG_IP_VS_IPV6
3991 if (af == AF_INET6 && ipv6_mod_enabled())
3992 return true;
3993 #endif
3994 return false;
3995 }
3996
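/*
 * Parse a nested IPVS_CMD_ATTR_SERVICE attribute. The identifying fields
 * (address family plus either a fwmark or protocol/address/port) are always
 * required; scheduler, flags, timeout and netmask are required only when
 * full_entry is set (add/edit). *ret_svc is set to a matching existing
 * service, if any.
 */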
3997 static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs,
3998 struct ip_vs_service_user_kern *usvc,
3999 struct nlattr *nla, bool full_entry,
4000 struct ip_vs_service **ret_svc)
4001 {
4002 struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
4003 struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
4004 struct ip_vs_service *svc;
4005
4006 /* Parse mandatory identifying service fields first */
4007 if (nla == NULL ||
4008 nla_parse_nested_deprecated(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy, NULL))
4009 return -EINVAL;
4010
4011 nla_af = attrs[IPVS_SVC_ATTR_AF];
4012 nla_protocol = attrs[IPVS_SVC_ATTR_PROTOCOL];
4013 nla_addr = attrs[IPVS_SVC_ATTR_ADDR];
4014 nla_port = attrs[IPVS_SVC_ATTR_PORT];
4015 nla_fwmark = attrs[IPVS_SVC_ATTR_FWMARK];
4016
4017 if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
4018 return -EINVAL;
4019
4020 memset(usvc, 0, sizeof(*usvc));
4021
4022 usvc->af = nla_get_u16(nla_af);
4023 if (!ip_vs_is_af_valid(usvc->af))
4024 return -EAFNOSUPPORT;
4025
4026 if (nla_fwmark) {
4027 usvc->protocol = IPPROTO_TCP;
4028 usvc->fwmark = nla_get_u32(nla_fwmark);
4029 } else {
4030 usvc->protocol = nla_get_u16(nla_protocol);
4031 nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
4032 usvc->port = nla_get_be16(nla_port);
4033 usvc->fwmark = 0;
4034 }
4035
4036 if (usvc->fwmark)
4037 svc = __ip_vs_svc_fwm_find(ipvs, usvc->af, usvc->fwmark);
4038 else
4039 svc = __ip_vs_service_find(ipvs, usvc->af, usvc->protocol,
4040 &usvc->addr, usvc->port);
4041 *ret_svc = svc;
4042
4043 /* If a full entry was requested, check for the additional fields */
4044 if (full_entry) {
4045 struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
4046 *nla_netmask;
4047 struct ip_vs_flags flags;
4048
4049 nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
4050 nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
4051 nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
4052 nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
4053 nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
4054
4055 if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
4056 return -EINVAL;
4057
4058 nla_memcpy(&flags, nla_flags, sizeof(flags));
4059
4060 /* prefill flags from service if it already exists */
4061 if (svc)
4062 usvc->flags = svc->flags;
4063
4064 /* set new flags from userland */
4065 usvc->flags = (usvc->flags & ~flags.mask) |
4066 (flags.flags & flags.mask);
4067 usvc->sched_name = nla_data(nla_sched);
4068 usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
4069 usvc->timeout = nla_get_u32(nla_timeout);
4070 usvc->netmask = nla_get_be32(nla_netmask);
4071 }
4072
4073 return 0;
4074 }
4075
4076 static struct ip_vs_service *ip_vs_genl_find_service(struct netns_ipvs *ipvs,
4077 struct nlattr *nla)
4078 {
4079 struct ip_vs_service_user_kern usvc;
4080 struct ip_vs_service *svc;
4081 int ret;
4082
4083 ret = ip_vs_genl_parse_service(ipvs, &usvc, nla, false, &svc);
4084 return ret ? ERR_PTR(ret) : svc;
4085 }
4086
4087 static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
4088 {
4089 struct nlattr *nl_dest;
4090 struct ip_vs_kstats kstats;
4091
4092 nl_dest = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DEST);
4093 if (!nl_dest)
4094 return -EMSGSIZE;
4095
4096 if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) ||
4097 nla_put_be16(skb, IPVS_DEST_ATTR_PORT, dest->port) ||
4098 nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD,
4099 (atomic_read(&dest->conn_flags) &
4100 IP_VS_CONN_F_FWD_MASK)) ||
4101 nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT,
4102 atomic_read(&dest->weight)) ||
4103 nla_put_u8(skb, IPVS_DEST_ATTR_TUN_TYPE,
4104 dest->tun_type) ||
4105 nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT,
4106 dest->tun_port) ||
4107 nla_put_u16(skb, IPVS_DEST_ATTR_TUN_FLAGS,
4108 dest->tun_flags) ||
4109 nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) ||
4110 nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) ||
4111 nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
4112 atomic_read(&dest->activeconns)) ||
4113 nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS,
4114 atomic_read(&dest->inactconns)) ||
4115 nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
4116 atomic_read(&dest->persistconns)) ||
4117 nla_put_u16(skb, IPVS_DEST_ATTR_ADDR_FAMILY, dest->af))
4118 goto nla_put_failure;
4119 ip_vs_copy_stats(&kstats, &dest->stats);
4120 if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &kstats))
4121 goto nla_put_failure;
4122 if (ip_vs_genl_fill_stats64(skb, IPVS_DEST_ATTR_STATS64, &kstats))
4123 goto nla_put_failure;
4124
4125 nla_nest_end(skb, nl_dest);
4126
4127 return 0;
4128
4129 nla_put_failure:
4130 nla_nest_cancel(skb, nl_dest);
4131 return -EMSGSIZE;
4132 }
4133
4134 static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
4135 struct netlink_callback *cb)
4136 {
4137 void *hdr;
4138
4139 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
4140 &ip_vs_genl_family, NLM_F_MULTI,
4141 IPVS_CMD_NEW_DEST);
4142 if (!hdr)
4143 return -EMSGSIZE;
4144
4145 if (ip_vs_genl_fill_dest(skb, dest) < 0)
4146 goto nla_put_failure;
4147
4148 genlmsg_end(skb, hdr);
4149 return 0;
4150
4151 nla_put_failure:
4152 genlmsg_cancel(skb, hdr);
4153 return -EMSGSIZE;
4154 }
4155
4156 static int ip_vs_genl_dump_dests(struct sk_buff *skb,
4157 struct netlink_callback *cb)
4158 {
4159 int idx = 0;
4160 int start = cb->args[0];
4161 struct ip_vs_service *svc;
4162 struct ip_vs_dest *dest;
4163 struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
4164 struct net *net = sock_net(skb->sk);
4165 struct netns_ipvs *ipvs = net_ipvs(net);
4166
4167 rcu_read_lock();
4168
4169 /* Try to find the service for which to dump destinations */
4170 if (nlmsg_parse_deprecated(cb->nlh, GENL_HDRLEN, attrs, IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy, cb->extack))
4171 goto out_err;
4172
4173
4174 svc = ip_vs_genl_find_service(ipvs, attrs[IPVS_CMD_ATTR_SERVICE]);
4175 if (IS_ERR_OR_NULL(svc))
4176 goto out_err;
4177
4178 /* Dump the destinations */
4179 list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
4180 if (++idx <= start)
4181 continue;
4182 if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
4183 idx--;
4184 goto nla_put_failure;
4185 }
4186 }
4187
4188 nla_put_failure:
4189 cb->args[0] = idx;
4190
4191 out_err:
4192 rcu_read_unlock();
4193
4194 return skb->len;
4195 }
4196
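/*
 * Parse a nested IPVS_CMD_ATTR_DEST attribute. Address and port identify the
 * destination; forwarding method, weight and thresholds are required only
 * when full_entry is set (add/edit), and the tunnel attributes are optional.
 */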
4197 static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
4198 struct nlattr *nla, bool full_entry)
4199 {
4200 struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];
4201 struct nlattr *nla_addr, *nla_port;
4202 struct nlattr *nla_addr_family;
4203
4204 /* Parse mandatory identifying destination fields first */
4205 if (nla == NULL ||
4206 nla_parse_nested_deprecated(attrs, IPVS_DEST_ATTR_MAX, nla, ip_vs_dest_policy, NULL))
4207 return -EINVAL;
4208
4209 nla_addr = attrs[IPVS_DEST_ATTR_ADDR];
4210 nla_port = attrs[IPVS_DEST_ATTR_PORT];
4211 nla_addr_family = attrs[IPVS_DEST_ATTR_ADDR_FAMILY];
4212
4213 if (!(nla_addr && nla_port))
4214 return -EINVAL;
4215
4216 memset(udest, 0, sizeof(*udest));
4217
4218 nla_memcpy(&udest->addr, nla_addr, sizeof(udest->addr));
4219 udest->port = nla_get_be16(nla_port);
4220
4221 udest->af = nla_get_u16_default(nla_addr_family, 0);
4222
4223 /* If a full entry was requested, check for the additional fields */
4224 if (full_entry) {
4225 struct nlattr *nla_fwd, *nla_weight, *nla_u_thresh,
4226 *nla_l_thresh, *nla_tun_type, *nla_tun_port,
4227 *nla_tun_flags;
4228
4229 nla_fwd = attrs[IPVS_DEST_ATTR_FWD_METHOD];
4230 nla_weight = attrs[IPVS_DEST_ATTR_WEIGHT];
4231 nla_u_thresh = attrs[IPVS_DEST_ATTR_U_THRESH];
4232 nla_l_thresh = attrs[IPVS_DEST_ATTR_L_THRESH];
4233 nla_tun_type = attrs[IPVS_DEST_ATTR_TUN_TYPE];
4234 nla_tun_port = attrs[IPVS_DEST_ATTR_TUN_PORT];
4235 nla_tun_flags = attrs[IPVS_DEST_ATTR_TUN_FLAGS];
4236
4237 if (!(nla_fwd && nla_weight && nla_u_thresh && nla_l_thresh))
4238 return -EINVAL;
4239
4240 udest->conn_flags = nla_get_u32(nla_fwd)
4241 & IP_VS_CONN_F_FWD_MASK;
4242 udest->weight = nla_get_u32(nla_weight);
4243 udest->u_threshold = nla_get_u32(nla_u_thresh);
4244 udest->l_threshold = nla_get_u32(nla_l_thresh);
4245
4246 if (nla_tun_type)
4247 udest->tun_type = nla_get_u8(nla_tun_type);
4248
4249 if (nla_tun_port)
4250 udest->tun_port = nla_get_be16(nla_tun_port);
4251
4252 if (nla_tun_flags)
4253 udest->tun_flags = nla_get_u16(nla_tun_flags);
4254 }
4255
4256 return 0;
4257 }
4258
4259 static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state,
4260 struct ipvs_sync_daemon_cfg *c)
4261 {
4262 struct nlattr *nl_daemon;
4263
4264 nl_daemon = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DAEMON);
4265 if (!nl_daemon)
4266 return -EMSGSIZE;
4267
4268 if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state) ||
4269 nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, c->mcast_ifn) ||
4270 nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, c->syncid) ||
4271 nla_put_u16(skb, IPVS_DAEMON_ATTR_SYNC_MAXLEN, c->sync_maxlen) ||
4272 nla_put_u16(skb, IPVS_DAEMON_ATTR_MCAST_PORT, c->mcast_port) ||
4273 nla_put_u8(skb, IPVS_DAEMON_ATTR_MCAST_TTL, c->mcast_ttl))
4274 goto nla_put_failure;
4275 #ifdef CONFIG_IP_VS_IPV6
4276 if (c->mcast_af == AF_INET6) {
4277 if (nla_put_in6_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP6,
4278 &c->mcast_group.in6))
4279 goto nla_put_failure;
4280 } else
4281 #endif
4282 if (c->mcast_af == AF_INET &&
4283 nla_put_in_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP,
4284 c->mcast_group.ip))
4285 goto nla_put_failure;
4286 nla_nest_end(skb, nl_daemon);
4287
4288 return 0;
4289
4290 nla_put_failure:
4291 nla_nest_cancel(skb, nl_daemon);
4292 return -EMSGSIZE;
4293 }
4294
4295 static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state,
4296 struct ipvs_sync_daemon_cfg *c,
4297 struct netlink_callback *cb)
4298 {
4299 void *hdr;
4300 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
4301 &ip_vs_genl_family, NLM_F_MULTI,
4302 IPVS_CMD_NEW_DAEMON);
4303 if (!hdr)
4304 return -EMSGSIZE;
4305
4306 if (ip_vs_genl_fill_daemon(skb, state, c))
4307 goto nla_put_failure;
4308
4309 genlmsg_end(skb, hdr);
4310 return 0;
4311
4312 nla_put_failure:
4313 genlmsg_cancel(skb, hdr);
4314 return -EMSGSIZE;
4315 }
4316
4317 static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
4318 struct netlink_callback *cb)
4319 {
4320 struct net *net = sock_net(skb->sk);
4321 struct netns_ipvs *ipvs = net_ipvs(net);
4322
4323 mutex_lock(&ipvs->sync_mutex);
4324 if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
4325 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
4326 &ipvs->mcfg, cb) < 0)
4327 goto nla_put_failure;
4328
4329 cb->args[0] = 1;
4330 }
4331
4332 if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
4333 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
4334 &ipvs->bcfg, cb) < 0)
4335 goto nla_put_failure;
4336
4337 cb->args[1] = 1;
4338 }
4339
4340 nla_put_failure:
4341 mutex_unlock(&ipvs->sync_mutex);
4342
4343 return skb->len;
4344 }
4345
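/*
 * Start a sync daemon from the nested IPVS_CMD_ATTR_DAEMON attributes.
 * State, multicast interface name and sync ID are mandatory; multicast
 * group, port, TTL and maximum sync message length are optional overrides.
 */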
4346 static int ip_vs_genl_new_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
4347 {
4348 struct ipvs_sync_daemon_cfg c;
4349 struct nlattr *a;
4350 int ret;
4351
4352 memset(&c, 0, sizeof(c));
4353 if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
4354 attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
4355 attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
4356 return -EINVAL;
4357 strscpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
4358 sizeof(c.mcast_ifn));
4359 c.syncid = nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]);
4360
4361 a = attrs[IPVS_DAEMON_ATTR_SYNC_MAXLEN];
4362 if (a)
4363 c.sync_maxlen = nla_get_u16(a);
4364
4365 a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP];
4366 if (a) {
4367 c.mcast_af = AF_INET;
4368 c.mcast_group.ip = nla_get_in_addr(a);
4369 if (!ipv4_is_multicast(c.mcast_group.ip))
4370 return -EINVAL;
4371 } else {
4372 a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP6];
4373 if (a) {
4374 #ifdef CONFIG_IP_VS_IPV6
4375 int addr_type;
4376
4377 c.mcast_af = AF_INET6;
4378 c.mcast_group.in6 = nla_get_in6_addr(a);
4379 addr_type = ipv6_addr_type(&c.mcast_group.in6);
4380 if (!(addr_type & IPV6_ADDR_MULTICAST))
4381 return -EINVAL;
4382 #else
4383 return -EAFNOSUPPORT;
4384 #endif
4385 }
4386 }
4387
4388 a = attrs[IPVS_DAEMON_ATTR_MCAST_PORT];
4389 if (a)
4390 c.mcast_port = nla_get_u16(a);
4391
4392 a = attrs[IPVS_DAEMON_ATTR_MCAST_TTL];
4393 if (a)
4394 c.mcast_ttl = nla_get_u8(a);
4395
4396 /* The synchronization protocol is incompatible with mixed family
4397 * services
4398 */
4399 if (ipvs->mixed_address_family_dests > 0)
4400 return -EINVAL;
4401
4402 ret = start_sync_thread(ipvs, &c,
4403 nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
4404 return ret;
4405 }
4406
4407 static int ip_vs_genl_del_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
4408 {
4409 int ret;
4410
4411 if (!attrs[IPVS_DAEMON_ATTR_STATE])
4412 return -EINVAL;
4413
4414 ret = stop_sync_thread(ipvs,
4415 nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
4416 return ret;
4417 }
4418
4419 static int ip_vs_genl_set_config(struct netns_ipvs *ipvs, struct nlattr **attrs)
4420 {
4421 struct ip_vs_timeout_user t;
4422
4423 __ip_vs_get_timeouts(ipvs, &t);
4424
4425 if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
4426 t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
4427
4428 if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
4429 t.tcp_fin_timeout =
4430 nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);
4431
4432 if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
4433 t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
4434
4435 return ip_vs_set_timeout(ipvs, &t);
4436 }
4437
4438 static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info)
4439 {
4440 int ret = -EINVAL, cmd;
4441 struct net *net = sock_net(skb->sk);
4442 struct netns_ipvs *ipvs = net_ipvs(net);
4443
4444 cmd = info->genlhdr->cmd;
4445
4446 if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) {
4447 struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
4448
4449 if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
4450 nla_parse_nested_deprecated(daemon_attrs, IPVS_DAEMON_ATTR_MAX, info->attrs[IPVS_CMD_ATTR_DAEMON], ip_vs_daemon_policy, info->extack))
4451 goto out;
4452
4453 if (cmd == IPVS_CMD_NEW_DAEMON)
4454 ret = ip_vs_genl_new_daemon(ipvs, daemon_attrs);
4455 else
4456 ret = ip_vs_genl_del_daemon(ipvs, daemon_attrs);
4457 }
4458
4459 out:
4460 return ret;
4461 }
4462
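/*
 * doit handler for all state-changing commands (services, destinations,
 * flush, zero, timeouts). Runs under service_mutex to serialize
 * configuration changes.
 */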
4463 static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
4464 {
4465 bool need_full_svc = false, need_full_dest = false;
4466 struct ip_vs_service *svc = NULL;
4467 struct ip_vs_service_user_kern usvc;
4468 struct ip_vs_dest_user_kern udest;
4469 int ret = 0, cmd;
4470 struct net *net = sock_net(skb->sk);
4471 struct netns_ipvs *ipvs = net_ipvs(net);
4472
4473 cmd = info->genlhdr->cmd;
4474
4475 mutex_lock(&ipvs->service_mutex);
4476
4477 if (cmd == IPVS_CMD_FLUSH) {
4478 ret = ip_vs_flush(ipvs, false);
4479 goto out;
4480 } else if (cmd == IPVS_CMD_SET_CONFIG) {
4481 ret = ip_vs_genl_set_config(ipvs, info->attrs);
4482 goto out;
4483 } else if (cmd == IPVS_CMD_ZERO &&
4484 !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
4485 ret = ip_vs_zero_all(ipvs);
4486 goto out;
4487 }
4488
4489 /* All following commands require a service argument, so check if we
4490 * received a valid one. We need a full service specification when
4491 * adding / editing a service. Only identifying members otherwise. */
4492 if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
4493 need_full_svc = true;
4494
4495 /* We use function that requires RCU lock (hlist_bl) */
4496 rcu_read_lock();
4497 ret = ip_vs_genl_parse_service(ipvs, &usvc,
4498 info->attrs[IPVS_CMD_ATTR_SERVICE],
4499 need_full_svc, &svc);
4500 rcu_read_unlock();
4501 if (ret)
4502 goto out;
4503
4504 /* Unless we're adding a new service, the service must already exist */
4505 if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
4506 ret = -ESRCH;
4507 goto out;
4508 }
4509
4510 /* Destination commands require a valid destination argument. For
4511 * adding / editing a destination, we need a full destination
4512 * specification. */
4513 if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
4514 cmd == IPVS_CMD_DEL_DEST) {
4515 if (cmd != IPVS_CMD_DEL_DEST)
4516 need_full_dest = true;
4517
4518 ret = ip_vs_genl_parse_dest(&udest,
4519 info->attrs[IPVS_CMD_ATTR_DEST],
4520 need_full_dest);
4521 if (ret)
4522 goto out;
4523
4524 /* Old protocols did not allow the user to specify address
4525 * family, so we set it to zero instead. We also didn't
4526 * allow heterogeneous pools in the old code, so it's safe
4527 * to assume that this will have the same address family as
4528 * the service.
4529 */
4530 if (udest.af == 0)
4531 udest.af = svc->af;
4532
4533 if (!ip_vs_is_af_valid(udest.af)) {
4534 ret = -EAFNOSUPPORT;
4535 goto out;
4536 }
4537
4538 if (udest.af != svc->af && cmd != IPVS_CMD_DEL_DEST) {
4539 /* The synchronization protocol is incompatible
4540 * with mixed family services
4541 */
4542 if (ipvs->sync_state) {
4543 ret = -EINVAL;
4544 goto out;
4545 }
4546
4547 /* Which connection types do we support? */
4548 switch (udest.conn_flags) {
4549 case IP_VS_CONN_F_TUNNEL:
4550 /* We are able to forward this */
4551 break;
4552 default:
4553 ret = -EINVAL;
4554 goto out;
4555 }
4556 }
4557 }
4558
4559 switch (cmd) {
4560 case IPVS_CMD_NEW_SERVICE:
4561 if (svc == NULL)
4562 ret = ip_vs_add_service(ipvs, &usvc, &svc);
4563 else
4564 ret = -EEXIST;
4565 break;
4566 case IPVS_CMD_SET_SERVICE:
4567 ret = ip_vs_edit_service(svc, &usvc);
4568 break;
4569 case IPVS_CMD_DEL_SERVICE:
4570 ret = ip_vs_del_service(svc);
4571 /* do not use svc, it can be freed */
4572 break;
4573 case IPVS_CMD_NEW_DEST:
4574 ret = ip_vs_add_dest(svc, &udest);
4575 break;
4576 case IPVS_CMD_SET_DEST:
4577 ret = ip_vs_edit_dest(svc, &udest);
4578 break;
4579 case IPVS_CMD_DEL_DEST:
4580 ret = ip_vs_del_dest(svc, &udest);
4581 break;
4582 case IPVS_CMD_ZERO:
4583 ret = ip_vs_zero_service(svc);
4584 break;
4585 default:
4586 ret = -EINVAL;
4587 }
4588
4589 out:
4590 mutex_unlock(&ipvs->service_mutex);
4591
4592 return ret;
4593 }
4594
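/*
 * doit handler for the non-dump GET commands (GET_SERVICE, GET_CONFIG,
 * GET_INFO): build a single reply message under rcu_read_lock().
 */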
4595 static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
4596 {
4597 struct sk_buff *msg;
4598 void *reply;
4599 int ret, cmd, reply_cmd;
4600 struct net *net = sock_net(skb->sk);
4601 struct netns_ipvs *ipvs = net_ipvs(net);
4602
4603 cmd = info->genlhdr->cmd;
4604
4605 if (cmd == IPVS_CMD_GET_SERVICE)
4606 reply_cmd = IPVS_CMD_NEW_SERVICE;
4607 else if (cmd == IPVS_CMD_GET_INFO)
4608 reply_cmd = IPVS_CMD_SET_INFO;
4609 else if (cmd == IPVS_CMD_GET_CONFIG)
4610 reply_cmd = IPVS_CMD_SET_CONFIG;
4611 else {
4612 pr_err("unknown Generic Netlink command\n");
4613 return -EINVAL;
4614 }
4615
4616 msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
4617 if (!msg)
4618 return -ENOMEM;
4619
4620 rcu_read_lock();
4621
4622 reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
4623 if (reply == NULL)
4624 goto nla_put_failure;
4625
4626 switch (cmd) {
4627 case IPVS_CMD_GET_SERVICE:
4628 {
4629 struct ip_vs_service *svc;
4630
4631 svc = ip_vs_genl_find_service(ipvs,
4632 info->attrs[IPVS_CMD_ATTR_SERVICE]);
4633 if (IS_ERR(svc)) {
4634 ret = PTR_ERR(svc);
4635 goto out_err;
4636 } else if (svc) {
4637 ret = ip_vs_genl_fill_service(msg, svc);
4638 if (ret)
4639 goto nla_put_failure;
4640 } else {
4641 ret = -ESRCH;
4642 goto out_err;
4643 }
4644
4645 break;
4646 }
4647
4648 case IPVS_CMD_GET_CONFIG:
4649 {
4650 struct ip_vs_timeout_user t;
4651
4652 __ip_vs_get_timeouts(ipvs, &t);
4653 #ifdef CONFIG_IP_VS_PROTO_TCP
4654 if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP,
4655 t.tcp_timeout) ||
4656 nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
4657 t.tcp_fin_timeout))
4658 goto nla_put_failure;
4659 #endif
4660 #ifdef CONFIG_IP_VS_PROTO_UDP
4661 if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout))
4662 goto nla_put_failure;
4663 #endif
4664
4665 break;
4666 }
4667
4668 case IPVS_CMD_GET_INFO:
4669 if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION,
4670 IP_VS_VERSION_CODE) ||
4671 nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
4672 get_conn_tab_size(ipvs)))
4673 goto nla_put_failure;
4674 break;
4675 }
4676
4677 genlmsg_end(msg, reply);
4678 ret = genlmsg_reply(msg, info);
4679 goto out;
4680
4681 nla_put_failure:
4682 pr_err("not enough space in Netlink message\n");
4683 ret = -EMSGSIZE;
4684
4685 out_err:
4686 nlmsg_free(msg);
4687 out:
4688 rcu_read_unlock();
4689
4690 return ret;
4691 }
4692
4693
4694 static const struct genl_small_ops ip_vs_genl_ops[] = {
4695 {
4696 .cmd = IPVS_CMD_NEW_SERVICE,
4697 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4698 .flags = GENL_ADMIN_PERM,
4699 .doit = ip_vs_genl_set_cmd,
4700 },
4701 {
4702 .cmd = IPVS_CMD_SET_SERVICE,
4703 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4704 .flags = GENL_ADMIN_PERM,
4705 .doit = ip_vs_genl_set_cmd,
4706 },
4707 {
4708 .cmd = IPVS_CMD_DEL_SERVICE,
4709 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4710 .flags = GENL_ADMIN_PERM,
4711 .doit = ip_vs_genl_set_cmd,
4712 },
4713 {
4714 .cmd = IPVS_CMD_GET_SERVICE,
4715 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4716 .flags = GENL_ADMIN_PERM,
4717 .doit = ip_vs_genl_get_cmd,
4718 .dumpit = ip_vs_genl_dump_services,
4719 },
4720 {
4721 .cmd = IPVS_CMD_NEW_DEST,
4722 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4723 .flags = GENL_ADMIN_PERM,
4724 .doit = ip_vs_genl_set_cmd,
4725 },
4726 {
4727 .cmd = IPVS_CMD_SET_DEST,
4728 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4729 .flags = GENL_ADMIN_PERM,
4730 .doit = ip_vs_genl_set_cmd,
4731 },
4732 {
4733 .cmd = IPVS_CMD_DEL_DEST,
4734 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4735 .flags = GENL_ADMIN_PERM,
4736 .doit = ip_vs_genl_set_cmd,
4737 },
4738 {
4739 .cmd = IPVS_CMD_GET_DEST,
4740 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4741 .flags = GENL_ADMIN_PERM,
4742 .dumpit = ip_vs_genl_dump_dests,
4743 },
4744 {
4745 .cmd = IPVS_CMD_NEW_DAEMON,
4746 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4747 .flags = GENL_ADMIN_PERM,
4748 .doit = ip_vs_genl_set_daemon,
4749 },
4750 {
4751 .cmd = IPVS_CMD_DEL_DAEMON,
4752 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4753 .flags = GENL_ADMIN_PERM,
4754 .doit = ip_vs_genl_set_daemon,
4755 },
4756 {
4757 .cmd = IPVS_CMD_GET_DAEMON,
4758 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4759 .flags = GENL_ADMIN_PERM,
4760 .dumpit = ip_vs_genl_dump_daemons,
4761 },
4762 {
4763 .cmd = IPVS_CMD_SET_CONFIG,
4764 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4765 .flags = GENL_ADMIN_PERM,
4766 .doit = ip_vs_genl_set_cmd,
4767 },
4768 {
4769 .cmd = IPVS_CMD_GET_CONFIG,
4770 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4771 .flags = GENL_ADMIN_PERM,
4772 .doit = ip_vs_genl_get_cmd,
4773 },
4774 {
4775 .cmd = IPVS_CMD_GET_INFO,
4776 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4777 .flags = GENL_ADMIN_PERM,
4778 .doit = ip_vs_genl_get_cmd,
4779 },
4780 {
4781 .cmd = IPVS_CMD_ZERO,
4782 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4783 .flags = GENL_ADMIN_PERM,
4784 .doit = ip_vs_genl_set_cmd,
4785 },
4786 {
4787 .cmd = IPVS_CMD_FLUSH,
4788 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4789 .flags = GENL_ADMIN_PERM,
4790 .doit = ip_vs_genl_set_cmd,
4791 },
4792 };
4793
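/*
 * parallel_ops is safe here: the doit/dumpit handlers serialize themselves
 * through their own locking (service_mutex, sync_mutex, RCU) rather than
 * relying on the genetlink mutex.
 */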
4794 static struct genl_family ip_vs_genl_family __ro_after_init = {
4795 .hdrsize = 0,
4796 .name = IPVS_GENL_NAME,
4797 .version = IPVS_GENL_VERSION,
4798 .maxattr = IPVS_CMD_ATTR_MAX,
4799 .policy = ip_vs_cmd_policy,
4800 .netnsok = true, /* Make ipvsadm work inside netns */
4801 .module = THIS_MODULE,
4802 .small_ops = ip_vs_genl_ops,
4803 .n_small_ops = ARRAY_SIZE(ip_vs_genl_ops),
4804 .resv_start_op = IPVS_CMD_FLUSH + 1,
4805 .parallel_ops = 1,
4806 };
4807
4808 static int __init ip_vs_genl_register(void)
4809 {
4810 return genl_register_family(&ip_vs_genl_family);
4811 }
4812
4813 static void ip_vs_genl_unregister(void)
4814 {
4815 genl_unregister_family(&ip_vs_genl_family);
4816 }
4817
4818 /* End of Generic Netlink interface definitions */
4819
4820 /*
4821 * per netns init/exit functions.
4822 */
4823 #ifdef CONFIG_SYSCTL
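/*
 * Register the per-netns sysctl table (net/ipv4/vs/*), seed the default
 * values, start the estimator for the global stats and schedule the
 * periodic defense work.
 */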
4824 static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
4825 {
4826 struct net *net = ipvs->net;
4827 struct ctl_table *tbl;
4828 int idx, ret;
4829 size_t ctl_table_size = ARRAY_SIZE(vs_vars);
4830 bool unpriv = net->user_ns != &init_user_ns;
4831
4832 atomic_set(&ipvs->dropentry, 0);
4833 spin_lock_init(&ipvs->dropentry_lock);
4834 spin_lock_init(&ipvs->droppacket_lock);
4835 spin_lock_init(&ipvs->securetcp_lock);
4836 INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
4837 INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work,
4838 expire_nodest_conn_handler);
4839 ipvs->est_stopped = 0;
4840
4841 if (!net_eq(net, &init_net)) {
4842 tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
4843 if (tbl == NULL)
4844 return -ENOMEM;
4845 } else
4846 tbl = vs_vars;
4847 /* Initialize sysctl defaults */
4848 for (idx = 0; idx < ARRAY_SIZE(vs_vars); idx++) {
4849 if (tbl[idx].proc_handler == proc_do_defense_mode)
4850 tbl[idx].extra2 = ipvs;
4851 }
4852 idx = 0;
4853 ipvs->sysctl_amemthresh = 1024;
4854 tbl[idx++].data = &ipvs->sysctl_amemthresh;
4855 ipvs->sysctl_am_droprate = 10;
4856 tbl[idx++].data = &ipvs->sysctl_am_droprate;
4857 tbl[idx++].data = &ipvs->sysctl_drop_entry;
4858 tbl[idx++].data = &ipvs->sysctl_drop_packet;
4859 #ifdef CONFIG_IP_VS_NFCT
4860 tbl[idx++].data = &ipvs->sysctl_conntrack;
4861 #endif
4862 tbl[idx++].data = &ipvs->sysctl_secure_tcp;
4863 ipvs->sysctl_snat_reroute = 1;
4864 tbl[idx++].data = &ipvs->sysctl_snat_reroute;
4865 ipvs->sysctl_sync_ver = 1;
4866 tbl[idx++].data = &ipvs->sysctl_sync_ver;
4867 ipvs->sysctl_sync_ports = 1;
4868 tbl[idx++].data = &ipvs->sysctl_sync_ports;
4869 tbl[idx++].data = &ipvs->sysctl_sync_persist_mode;
4870
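/* Some sysctls are writable only from the init user namespace; for other
 * namespaces they are registered read-only (0444).
 */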
4871 ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32;
4872 if (unpriv)
4873 tbl[idx].mode = 0444;
4874 tbl[idx++].data = &ipvs->sysctl_sync_qlen_max;
4875
4876 ipvs->sysctl_sync_sock_size = 0;
4877 if (unpriv)
4878 tbl[idx].mode = 0444;
4879 tbl[idx++].data = &ipvs->sysctl_sync_sock_size;
4880
4881 tbl[idx++].data = &ipvs->sysctl_cache_bypass;
4882 tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
4883 tbl[idx++].data = &ipvs->sysctl_sloppy_tcp;
4884 tbl[idx++].data = &ipvs->sysctl_sloppy_sctp;
4885 tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
4886 ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
4887 ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
4888 tbl[idx].data = &ipvs->sysctl_sync_threshold;
4889 tbl[idx].extra2 = ipvs;
4890 tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
4891 ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD;
4892 tbl[idx++].data = &ipvs->sysctl_sync_refresh_period;
4893 ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3);
4894 tbl[idx++].data = &ipvs->sysctl_sync_retries;
4895 tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
4896 ipvs->sysctl_pmtu_disc = 1;
4897 tbl[idx++].data = &ipvs->sysctl_pmtu_disc;
4898 tbl[idx++].data = &ipvs->sysctl_backup_only;
4899 ipvs->sysctl_conn_reuse_mode = 1;
4900 tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode;
4901 tbl[idx++].data = &ipvs->sysctl_schedule_icmp;
4902 tbl[idx++].data = &ipvs->sysctl_ignore_tunneled;
4903
4904 ipvs->sysctl_run_estimation = 1;
4905 if (unpriv)
4906 tbl[idx].mode = 0444;
4907 tbl[idx].extra2 = ipvs;
4908 tbl[idx++].data = &ipvs->sysctl_run_estimation;
4909
4910 ipvs->est_cpulist_valid = 0;
4911 if (unpriv)
4912 tbl[idx].mode = 0444;
4913 tbl[idx].extra2 = ipvs;
4914 tbl[idx++].data = &ipvs->sysctl_est_cpulist;
4915
4916 ipvs->sysctl_est_nice = IPVS_EST_NICE;
4917 if (unpriv)
4918 tbl[idx].mode = 0444;
4919 tbl[idx].extra2 = ipvs;
4920 tbl[idx++].data = &ipvs->sysctl_est_nice;
4921
4922 if (unpriv)
4923 tbl[idx].mode = 0444;
4924 tbl[idx].extra2 = ipvs;
4925 tbl[idx++].data = &ipvs->sysctl_conn_lfactor;
4926
4927 if (unpriv)
4928 tbl[idx].mode = 0444;
4929 tbl[idx].extra2 = ipvs;
4930 tbl[idx++].data = &ipvs->sysctl_svc_lfactor;
4931
4932 #ifdef CONFIG_IP_VS_DEBUG
4933 /* Global sysctls must be ro in non-init netns */
4934 if (!net_eq(net, &init_net))
4935 tbl[idx++].mode = 0444;
4936 #endif
4937
4938 ret = -ENOMEM;
4939 ipvs->sysctl_hdr = register_net_sysctl_sz(net, "net/ipv4/vs", tbl,
4940 ctl_table_size);
4941 if (!ipvs->sysctl_hdr)
4942 goto err;
4943 ipvs->sysctl_tbl = tbl;
4944
4945 ret = ip_vs_start_estimator(ipvs, &ipvs->tot_stats->s);
4946 if (ret < 0)
4947 goto err;
4948
4949 /* Schedule defense work */
4950 queue_delayed_work(system_long_wq, &ipvs->defense_work,
4951 DEFENSE_TIMER_PERIOD);
4952
4953 return 0;
4954
4955 err:
4956 unregister_net_sysctl_table(ipvs->sysctl_hdr);
4957 if (!net_eq(net, &init_net))
4958 kfree(tbl);
4959 return ret;
4960 }
4961
4962 static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs)
4963 {
4964 struct net *net = ipvs->net;
4965
4966 cancel_delayed_work_sync(&ipvs->expire_nodest_conn_work);
4967 cancel_delayed_work_sync(&ipvs->defense_work);
4968 cancel_work_sync(&ipvs->defense_work.work);
4969 unregister_net_sysctl_table(ipvs->sysctl_hdr);
4970 ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s);
4971
4972 if (ipvs->est_cpulist_valid)
4973 free_cpumask_var(ipvs->sysctl_est_cpulist);
4974
4975 if (!net_eq(net, &init_net))
4976 kfree(ipvs->sysctl_tbl);
4977 }
4978
4979 #else
4980
4981 static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) { return 0; }
4982 static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) { }
4983
4984 #endif
4985
4986 static struct notifier_block ip_vs_dst_notifier = {
4987 .notifier_call = ip_vs_dst_event,
4988 #ifdef CONFIG_IP_VS_IPV6
4989 .priority = ADDRCONF_NOTIFY_PRIORITY + 5,
4990 #endif
4991 };
4992
4993 int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
4994 {
4995 int ret = -ENOMEM;
4996 int idx;
4997
4998 /* Initialize service_mutex, svc_table per netns */
4999 __mutex_init(&ipvs->service_mutex, "ipvs->service_mutex", &__ipvs_service_key);
5000 init_rwsem(&ipvs->svc_resize_sem);
5001 INIT_DELAYED_WORK(&ipvs->svc_resize_work, svc_resize_work_handler);
5002 atomic_set(&ipvs->svc_table_changes, 0);
5003 RCU_INIT_POINTER(ipvs->svc_table, NULL);
5004
5005 /* Initialize rs_table */
5006 for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
5007 INIT_HLIST_HEAD(&ipvs->rs_table[idx]);
5008
5009 INIT_LIST_HEAD(&ipvs->dest_trash);
5010 spin_lock_init(&ipvs->dest_trash_lock);
5011 timer_setup(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire, 0);
5012 for (idx = 0; idx < IP_VS_AF_MAX; idx++) {
5013 atomic_set(&ipvs->num_services[idx], 0);
5014 atomic_set(&ipvs->fwm_services[idx], 0);
5015 atomic_set(&ipvs->nonfwm_services[idx], 0);
5016 atomic_set(&ipvs->ftpsvc_counter[idx], 0);
5017 atomic_set(&ipvs->nullsvc_counter[idx], 0);
5018 atomic_set(&ipvs->conn_out_counter[idx], 0);
5019 }
5020
5021 INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler);
5022 ipvs->sysctl_svc_lfactor = ip_vs_svc_default_load_factor(ipvs);
5023
5024 /* procfs stats */
5025 ipvs->tot_stats = kzalloc_obj(*ipvs->tot_stats);
5026 if (!ipvs->tot_stats)
5027 goto out;
5028 if (ip_vs_stats_init_alloc(&ipvs->tot_stats->s) < 0)
5029 goto err_tot_stats;
5030
5031 #ifdef CONFIG_PROC_FS
5032 if (!proc_create_net("ip_vs", 0, ipvs->net->proc_net,
5033 &ip_vs_info_seq_ops, sizeof(struct ip_vs_iter)))
5034 goto err_vs;
5035 if (!proc_create_net_single("ip_vs_stats", 0, ipvs->net->proc_net,
5036 ip_vs_stats_show, NULL))
5037 goto err_stats;
5038 if (!proc_create_net_single("ip_vs_stats_percpu", 0,
5039 ipvs->net->proc_net,
5040 ip_vs_stats_percpu_show, NULL))
5041 goto err_percpu;
5042 if (!proc_create_net_single("ip_vs_status", 0, ipvs->net->proc_net,
5043 ip_vs_status_show, NULL))
5044 goto err_status;
5045 #endif
5046
5047 ret = ip_vs_control_net_init_sysctl(ipvs);
5048 if (ret < 0)
5049 goto err;
5050
5051 return 0;
5052
5053 err:
5054 #ifdef CONFIG_PROC_FS
5055 remove_proc_entry("ip_vs_status", ipvs->net->proc_net);
5056
5057 err_status:
5058 remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);
5059
5060 err_percpu:
5061 remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);
5062
5063 err_stats:
5064 remove_proc_entry("ip_vs", ipvs->net->proc_net);
5065
5066 err_vs:
5067 #endif
5068 ip_vs_stats_release(&ipvs->tot_stats->s);
5069
5070 err_tot_stats:
5071 kfree(ipvs->tot_stats);
5072
5073 out:
5074 return ret;
5075 }
5076
5077 void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs)
5078 {
5079 ip_vs_trash_cleanup(ipvs);
5080 ip_vs_control_net_cleanup_sysctl(ipvs);
5081 cancel_delayed_work_sync(&ipvs->est_reload_work);
5082 #ifdef CONFIG_PROC_FS
5083 remove_proc_entry("ip_vs_status", ipvs->net->proc_net);
5084 remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);
5085 remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);
5086 remove_proc_entry("ip_vs", ipvs->net->proc_net);
5087 #endif
5088 call_rcu(&ipvs->tot_stats->rcu_head, ip_vs_stats_rcu_free);
5089 }
5090
5091 int __init ip_vs_register_nl_ioctl(void)
5092 {
5093 int ret;
5094
5095 ret = nf_register_sockopt(&ip_vs_sockopts);
5096 if (ret) {
5097 pr_err("cannot register sockopt.\n");
5098 goto err_sock;
5099 }
5100
5101 ret = ip_vs_genl_register();
5102 if (ret) {
5103 pr_err("cannot register Generic Netlink interface.\n");
5104 goto err_genl;
5105 }
5106 return 0;
5107
5108 err_genl:
5109 nf_unregister_sockopt(&ip_vs_sockopts);
5110 err_sock:
5111 return ret;
5112 }
5113
5114 void ip_vs_unregister_nl_ioctl(void)
5115 {
5116 ip_vs_genl_unregister();
5117 nf_unregister_sockopt(&ip_vs_sockopts);
5118 }
5119
5120 int __init ip_vs_control_init(void)
5121 {
5122 int ret;
5123
5124 ret = register_netdevice_notifier(&ip_vs_dst_notifier);
5125 if (ret < 0)
5126 return ret;
5127
5128 return 0;
5129 }
5130
5131
5132 void ip_vs_control_cleanup(void)
5133 {
5134 unregister_netdevice_notifier(&ip_vs_dst_notifier);
5135 /* relying on common rcu_barrier() in ip_vs_cleanup() */
5136 }
5137