1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * IPVS An implementation of the IP virtual server support for the
4 * LINUX operating system. IPVS is now implemented as a module
5 * over the NetFilter framework. IPVS can be used to build a
6 * high-performance and highly available server based on a
7 * cluster of servers.
8 *
9 * Authors: Wensong Zhang <wensong@linuxvirtualserver.org>
10 * Peter Kese <peter.kese@ijs.si>
11 * Julian Anastasov <ja@ssi.bg>
12 *
13 * Changes:
14 */
15
16 #define pr_fmt(fmt) "IPVS: " fmt
17
18 #include <linux/module.h>
19 #include <linux/init.h>
20 #include <linux/types.h>
21 #include <linux/capability.h>
22 #include <linux/fs.h>
23 #include <linux/sysctl.h>
24 #include <linux/proc_fs.h>
25 #include <linux/workqueue.h>
26 #include <linux/seq_file.h>
27 #include <linux/slab.h>
28
29 #include <linux/netfilter.h>
30 #include <linux/netfilter_ipv4.h>
31 #include <linux/mutex.h>
32 #include <linux/rcupdate_wait.h>
33
34 #include <net/net_namespace.h>
35 #include <linux/nsproxy.h>
36 #include <net/ip.h>
37 #ifdef CONFIG_IP_VS_IPV6
38 #include <net/ipv6.h>
39 #include <net/ip6_route.h>
40 #include <net/netfilter/ipv6/nf_defrag_ipv6.h>
41 #endif
42 #include <net/route.h>
43 #include <net/sock.h>
44 #include <net/genetlink.h>
45
46 #include <linux/uaccess.h>
47
48 #include <net/ip_vs.h>
49
/* Module alias for the IPVS generic netlink family name */
MODULE_ALIAS_GENL_FAMILY(IPVS_GENL_NAME);

/* NOTE(review): presumably the lockdep class used for a per-netns service
 * lock; its user is not visible in this part of the file - confirm.
 */
static struct lock_class_key __ipvs_service_key;
53
54 /* sysctl variables */
55
56 #ifdef CONFIG_IP_VS_DEBUG
/* Debug verbosity, read back through ip_vs_get_debug_level() */
static int sysctl_ip_vs_debug_level;

/* Return the debug level currently stored in sysctl_ip_vs_debug_level. */
int ip_vs_get_debug_level(void)
{
	const int level = sysctl_ip_vs_debug_level;

	return level;
}
63 #endif
64
65
/* Forward declarations */
static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup);
68
69
70 #ifdef CONFIG_IP_VS_IPV6
71 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */
__ip_vs_addr_is_local_v6(struct net * net,const struct in6_addr * addr)72 static bool __ip_vs_addr_is_local_v6(struct net *net,
73 const struct in6_addr *addr)
74 {
75 struct flowi6 fl6 = {
76 .daddr = *addr,
77 };
78 struct dst_entry *dst = ip6_route_output(net, NULL, &fl6);
79 bool is_local;
80
81 is_local = !dst->error && dst->dev && (dst->dev->flags & IFF_LOOPBACK);
82
83 dst_release(dst);
84 return is_local;
85 }
86 #endif
87
88 #ifdef CONFIG_SYSCTL
89 /*
90 * update_defense_level is called from keventd and from sysctl,
91 * so it needs to protect itself from softirqs
92 */
update_defense_level(struct netns_ipvs * ipvs)93 static void update_defense_level(struct netns_ipvs *ipvs)
94 {
95 struct sysinfo i;
96 int availmem;
97 int amemthresh;
98 int nomem;
99 int to_change = -1;
100
101 /* we only count free and buffered memory (in pages) */
102 si_meminfo(&i);
103 availmem = i.freeram + i.bufferram;
104 /* however in linux 2.5 the i.bufferram is total page cache size,
105 we need adjust it */
106 /* si_swapinfo(&i); */
107 /* availmem = availmem - (i.totalswap - i.freeswap); */
108
109 amemthresh = max(READ_ONCE(ipvs->sysctl_amemthresh), 0);
110 nomem = (availmem < amemthresh);
111
112 local_bh_disable();
113
114 /* drop_entry */
115 spin_lock(&ipvs->dropentry_lock);
116 switch (ipvs->sysctl_drop_entry) {
117 case 0:
118 atomic_set(&ipvs->dropentry, 0);
119 break;
120 case 1:
121 if (nomem) {
122 atomic_set(&ipvs->dropentry, 1);
123 ipvs->sysctl_drop_entry = 2;
124 } else {
125 atomic_set(&ipvs->dropentry, 0);
126 }
127 break;
128 case 2:
129 if (nomem) {
130 atomic_set(&ipvs->dropentry, 1);
131 } else {
132 atomic_set(&ipvs->dropentry, 0);
133 ipvs->sysctl_drop_entry = 1;
134 }
135 break;
136 case 3:
137 atomic_set(&ipvs->dropentry, 1);
138 break;
139 }
140 spin_unlock(&ipvs->dropentry_lock);
141
142 /* drop_packet */
143 spin_lock(&ipvs->droppacket_lock);
144 switch (ipvs->sysctl_drop_packet) {
145 case 0:
146 ipvs->drop_rate = 0;
147 break;
148 case 1:
149 if (nomem) {
150 ipvs->drop_counter = amemthresh / (amemthresh - availmem);
151 ipvs->drop_rate = ipvs->drop_counter;
152 ipvs->sysctl_drop_packet = 2;
153 } else {
154 ipvs->drop_rate = 0;
155 }
156 break;
157 case 2:
158 if (nomem) {
159 ipvs->drop_counter = amemthresh / (amemthresh - availmem);
160 ipvs->drop_rate = ipvs->drop_counter;
161 } else {
162 ipvs->drop_rate = 0;
163 ipvs->sysctl_drop_packet = 1;
164 }
165 break;
166 case 3:
167 ipvs->drop_rate = ipvs->sysctl_am_droprate;
168 break;
169 }
170 spin_unlock(&ipvs->droppacket_lock);
171
172 /* secure_tcp */
173 spin_lock(&ipvs->securetcp_lock);
174 switch (ipvs->sysctl_secure_tcp) {
175 case 0:
176 if (ipvs->old_secure_tcp >= 2)
177 to_change = 0;
178 break;
179 case 1:
180 if (nomem) {
181 if (ipvs->old_secure_tcp < 2)
182 to_change = 1;
183 ipvs->sysctl_secure_tcp = 2;
184 } else {
185 if (ipvs->old_secure_tcp >= 2)
186 to_change = 0;
187 }
188 break;
189 case 2:
190 if (nomem) {
191 if (ipvs->old_secure_tcp < 2)
192 to_change = 1;
193 } else {
194 if (ipvs->old_secure_tcp >= 2)
195 to_change = 0;
196 ipvs->sysctl_secure_tcp = 1;
197 }
198 break;
199 case 3:
200 if (ipvs->old_secure_tcp < 2)
201 to_change = 1;
202 break;
203 }
204 ipvs->old_secure_tcp = ipvs->sysctl_secure_tcp;
205 if (to_change >= 0)
206 ip_vs_protocol_timeout_change(ipvs,
207 ipvs->sysctl_secure_tcp > 1);
208 spin_unlock(&ipvs->securetcp_lock);
209
210 local_bh_enable();
211 }
212
213 /* Handler for delayed work for expiring no
214 * destination connections
215 */
expire_nodest_conn_handler(struct work_struct * work)216 static void expire_nodest_conn_handler(struct work_struct *work)
217 {
218 struct netns_ipvs *ipvs;
219
220 ipvs = container_of(work, struct netns_ipvs,
221 expire_nodest_conn_work.work);
222 ip_vs_expire_nodest_conn_flush(ipvs);
223 }
224
/*
 *	Timer for checking the defense: delay between two runs of the
 *	defense work. The expansion is parenthesized so that it stays a
 *	single operand wherever the macro is used inside an expression.
 */
#define DEFENSE_TIMER_PERIOD	(1 * HZ)
229
defense_work_handler(struct work_struct * work)230 static void defense_work_handler(struct work_struct *work)
231 {
232 struct netns_ipvs *ipvs =
233 container_of(work, struct netns_ipvs, defense_work.work);
234
235 update_defense_level(ipvs);
236 if (atomic_read(&ipvs->dropentry))
237 ip_vs_random_dropentry(ipvs);
238 queue_delayed_work(system_long_wq, &ipvs->defense_work,
239 DEFENSE_TIMER_PERIOD);
240 }
241 #endif
242
est_reload_work_handler(struct work_struct * work)243 static void est_reload_work_handler(struct work_struct *work)
244 {
245 struct netns_ipvs *ipvs =
246 container_of(work, struct netns_ipvs, est_reload_work.work);
247 int genid_done = atomic_read(&ipvs->est_genid_done);
248 unsigned long delay = HZ / 10; /* repeat startups after failure */
249 bool repeat = false;
250 int genid;
251 int id;
252
253 mutex_lock(&ipvs->est_mutex);
254 genid = atomic_read(&ipvs->est_genid);
255 for (id = 0; id < ipvs->est_kt_count; id++) {
256 struct ip_vs_est_kt_data *kd = ipvs->est_kt_arr[id];
257
258 /* netns clean up started, abort delayed work */
259 if (!READ_ONCE(ipvs->enable))
260 goto unlock;
261 if (!kd)
262 continue;
263 /* New config ? Stop kthread tasks */
264 if (genid != genid_done) {
265 if (!id) {
266 /* Only we can stop kt 0 but not under mutex */
267 mutex_unlock(&ipvs->est_mutex);
268 ip_vs_est_kthread_stop(kd);
269 mutex_lock(&ipvs->est_mutex);
270 if (!READ_ONCE(ipvs->enable))
271 goto unlock;
272 /* kd for kt 0 is never destroyed */
273 } else {
274 ip_vs_est_kthread_stop(kd);
275 }
276 }
277 if (!kd->task && !ip_vs_est_stopped(ipvs)) {
278 bool start;
279
280 /* Do not start kthreads above 0 in calc phase */
281 if (id)
282 start = !ipvs->est_calc_phase;
283 else
284 start = kd->needed;
285 if (start && ip_vs_est_kthread_start(ipvs, kd) < 0)
286 repeat = true;
287 }
288 }
289
290 atomic_set(&ipvs->est_genid_done, genid);
291
292 if (repeat)
293 queue_delayed_work(system_long_wq, &ipvs->est_reload_work,
294 delay);
295
296 unlock:
297 mutex_unlock(&ipvs->est_mutex);
298 }
299
get_conn_tab_size(struct netns_ipvs * ipvs)300 static int get_conn_tab_size(struct netns_ipvs *ipvs)
301 {
302 const struct ip_vs_rht *t;
303 int size = 0;
304
305 rcu_read_lock();
306 t = rcu_dereference(ipvs->conn_tab);
307 if (t)
308 size = t->size;
309 rcu_read_unlock();
310
311 return size;
312 }
313
314 int
ip_vs_use_count_inc(void)315 ip_vs_use_count_inc(void)
316 {
317 return try_module_get(THIS_MODULE);
318 }
319
320 void
ip_vs_use_count_dec(void)321 ip_vs_use_count_dec(void)
322 {
323 module_put(THIS_MODULE);
324 }
325
326
327 /* Service hashing:
328 * Operation Locking order
329 * ---------------------------------------------------------------------------
330 * add first table service_mutex
331 * attach new table service_mutex
332 * add/del service service_mutex, RCU, bit lock
333 * move between tables (rehash) svc_resize_sem(W), seqcount_t(W), bit lock
334 * replace old with attached svc_resize_sem(W), svc_replace_sem(W)
335 * find service RCU, seqcount_t(R)
336 * walk services(blocking) service_mutex, svc_resize_sem(R)
337 * walk services(non-blocking) RCU, seqcount_t(R)
338 * walk services(non-blocking) svc_resize_sem(R), RCU, seqcount_t(R)
339 * walk services(non-blocking) svc_replace_sem(R), RCU, seqcount_t(R)
340 * del table service_mutex after stopped work
341 *
342 * - new table is attached on resizing under service_mutex and all operations
343 * can run in parallel in 2 tables until the new table is registered as current
344 * one
345 * - two contexts can modify buckets: config and table resize (work), both in
346 * process context
347 * - only table resizer can move entries, so we do not protect t->seqc[]
348 * items with t->lock[]
349 * - lookups occur under RCU lock and seqcount reader lock to detect if
350 * services are moved to new table
351 * - move operations may disturb readers: find operation will not miss entries
352 * but walkers may see same entry twice if they are forced to retry chains
353 * or to walk the newly attached second table
354 * - walkers using cond_resched_rcu() on !PREEMPT_RCU may need to check
355 * svc_table_changes and repeat the RCU read section if new table is installed
356 * - walkers may serialize with the whole resizing process (svc_resize_sem)
357 * to prevent seeing same service twice or just with the svc_table
358 * replace (svc_replace_sem) when we can see entries twice but we
359 * prefer to run concurrently with the rehashing.
360 */
361
362 /*
363 * Returns hash value for virtual service
364 */
365 static inline u32
ip_vs_svc_hashval(struct ip_vs_rht * t,int af,unsigned int proto,const union nf_inet_addr * addr,__be16 port)366 ip_vs_svc_hashval(struct ip_vs_rht *t, int af, unsigned int proto,
367 const union nf_inet_addr *addr, __be16 port)
368 {
369 return ip_vs_rht_hash_linfo(t, af, addr, ntohs(port), proto);
370 }
371
372 /*
373 * Returns hash value of fwmark for virtual service lookup
374 */
ip_vs_svc_fwm_hashval(struct ip_vs_rht * t,int af,__u32 fwmark)375 static inline u32 ip_vs_svc_fwm_hashval(struct ip_vs_rht *t, int af,
376 __u32 fwmark)
377 {
378 return jhash_2words(fwmark, af, (u32)t->hash_key.key[0]);
379 }
380
381 /* Hashes a service in the svc_table by <proto,addr,port> or by fwmark */
ip_vs_svc_hash(struct ip_vs_service * svc)382 static int ip_vs_svc_hash(struct ip_vs_service *svc)
383 {
384 struct netns_ipvs *ipvs = svc->ipvs;
385 struct hlist_bl_head *head;
386 struct ip_vs_rht *t;
387 u32 hash;
388
389 if (svc->flags & IP_VS_SVC_F_HASHED) {
390 pr_err("%s(): request for already hashed, called from %pS\n",
391 __func__, __builtin_return_address(0));
392 return 0;
393 }
394
395 /* increase its refcnt because it is referenced by the svc table */
396 atomic_inc(&svc->refcnt);
397
398 /* We know if new table is attached under service_mutex but rely on
399 * RCU to hold the old table to be freed in resizer
400 */
401 rcu_read_lock();
402
403 /* This can be the old or the new table */
404 t = rcu_dereference(ipvs->svc_table);
405
406 /* New entries go into recent table */
407 t = rcu_dereference(t->new_tbl);
408
409 if (svc->fwmark == 0) {
410 /*
411 * Hash it by <protocol,addr,port>
412 */
413 hash = ip_vs_svc_hashval(t, svc->af, svc->protocol,
414 &svc->addr, svc->port);
415 } else {
416 /*
417 * Hash it by fwmark
418 */
419 hash = ip_vs_svc_fwm_hashval(t, svc->af, svc->fwmark);
420 }
421 head = t->buckets + (hash & t->mask);
422 hlist_bl_lock(head);
423 WRITE_ONCE(svc->hash_key, ip_vs_rht_build_hash_key(t, hash));
424 svc->flags |= IP_VS_SVC_F_HASHED;
425 hlist_bl_add_head_rcu(&svc->s_list, head);
426 hlist_bl_unlock(head);
427
428 rcu_read_unlock();
429
430 return 1;
431 }
432
433
434 /*
435 * Unhashes a service from svc_table.
436 * Should be called with locked tables.
437 */
ip_vs_svc_unhash(struct ip_vs_service * svc)438 static int ip_vs_svc_unhash(struct ip_vs_service *svc)
439 {
440 struct netns_ipvs *ipvs = svc->ipvs;
441 struct hlist_bl_head *head;
442 struct ip_vs_rht *t;
443 u32 hash_key2;
444 u32 hash_key;
445
446 if (!(svc->flags & IP_VS_SVC_F_HASHED)) {
447 pr_err("%s(): request for unhash flagged, called from %pS\n",
448 __func__, __builtin_return_address(0));
449 return 0;
450 }
451
452 /* We know if new table is attached under service_mutex but rely on
453 * RCU to hold the old table to be freed in resizer
454 */
455 rcu_read_lock();
456
457 /* This can be the old or the new table */
458 t = rcu_dereference(ipvs->svc_table);
459 hash_key = READ_ONCE(svc->hash_key);
460 /* We need to lock the bucket in the right table */
461 if (ip_vs_rht_same_table(t, hash_key)) {
462 head = t->buckets + (hash_key & t->mask);
463 hlist_bl_lock(head);
464 /* Ensure hash_key is read under lock */
465 hash_key2 = READ_ONCE(svc->hash_key);
466 /* Moved to new table ? */
467 if (hash_key != hash_key2) {
468 hlist_bl_unlock(head);
469 t = rcu_dereference(t->new_tbl);
470 head = t->buckets + (hash_key2 & t->mask);
471 hlist_bl_lock(head);
472 }
473 } else {
474 /* It is already moved to new table */
475 t = rcu_dereference(t->new_tbl);
476 head = t->buckets + (hash_key & t->mask);
477 hlist_bl_lock(head);
478 }
479 /* Remove it from svc_table */
480 hlist_bl_del_rcu(&svc->s_list);
481
482 svc->flags &= ~IP_VS_SVC_F_HASHED;
483 atomic_dec(&svc->refcnt);
484 hlist_bl_unlock(head);
485
486 rcu_read_unlock();
487 return 1;
488 }
489
490
491 /*
492 * Get service by {netns, proto,addr,port} in the service table.
493 */
494 static inline struct ip_vs_service *
__ip_vs_service_find(struct netns_ipvs * ipvs,int af,__u16 protocol,const union nf_inet_addr * vaddr,__be16 vport)495 __ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u16 protocol,
496 const union nf_inet_addr *vaddr, __be16 vport)
497 {
498 DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();
499 struct hlist_bl_head *head;
500 struct ip_vs_service *svc;
501 struct ip_vs_rht *t, *p;
502 struct hlist_bl_node *e;
503 u32 hash, hash_key;
504
505 ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, p) {
506 /* Check for "full" addressed entries */
507 hash = ip_vs_svc_hashval(t, af, protocol, vaddr, vport);
508
509 hash_key = ip_vs_rht_build_hash_key(t, hash);
510 ip_vs_rht_walk_bucket_rcu(t, hash_key, head) {
511 hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
512 if (READ_ONCE(svc->hash_key) == hash_key &&
513 svc->af == af &&
514 ip_vs_addr_equal(af, &svc->addr, vaddr) &&
515 svc->port == vport &&
516 svc->protocol == protocol && !svc->fwmark) {
517 /* HIT */
518 return svc;
519 }
520 }
521 }
522 }
523
524 return NULL;
525 }
526
527
528 /*
529 * Get service by {fwmark} in the service table.
530 */
531 static inline struct ip_vs_service *
__ip_vs_svc_fwm_find(struct netns_ipvs * ipvs,int af,__u32 fwmark)532 __ip_vs_svc_fwm_find(struct netns_ipvs *ipvs, int af, __u32 fwmark)
533 {
534 DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();
535 struct hlist_bl_head *head;
536 struct ip_vs_service *svc;
537 struct ip_vs_rht *t, *p;
538 struct hlist_bl_node *e;
539 u32 hash, hash_key;
540
541 ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, p) {
542 /* Check for fwmark addressed entries */
543 hash = ip_vs_svc_fwm_hashval(t, af, fwmark);
544
545 hash_key = ip_vs_rht_build_hash_key(t, hash);
546 ip_vs_rht_walk_bucket_rcu(t, hash_key, head) {
547 hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
548 if (READ_ONCE(svc->hash_key) == hash_key &&
549 svc->fwmark == fwmark && svc->af == af) {
550 /* HIT */
551 return svc;
552 }
553 }
554 }
555 }
556
557 return NULL;
558 }
559
560 /* Find service, called under RCU lock */
561 struct ip_vs_service *
ip_vs_service_find(struct netns_ipvs * ipvs,int af,__u32 fwmark,__u16 protocol,const union nf_inet_addr * vaddr,__be16 vport)562 ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol,
563 const union nf_inet_addr *vaddr, __be16 vport)
564 {
565 struct ip_vs_service *svc = NULL;
566 int af_id = ip_vs_af_index(af);
567
568 /*
569 * Check the table hashed by fwmark first
570 */
571 if (fwmark && atomic_read(&ipvs->fwm_services[af_id])) {
572 svc = __ip_vs_svc_fwm_find(ipvs, af, fwmark);
573 if (svc)
574 goto out;
575 }
576
577 if (!atomic_read(&ipvs->nonfwm_services[af_id]))
578 goto out;
579
580 /*
581 * Check the table hashed by <protocol,addr,port>
582 * for "full" addressed entries
583 */
584 svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, vport);
585 if (svc)
586 goto out;
587
588 if (protocol == IPPROTO_TCP &&
589 atomic_read(&ipvs->ftpsvc_counter[af_id]) &&
590 (vport == FTPDATA || !inet_port_requires_bind_service(ipvs->net, ntohs(vport)))) {
591 /*
592 * Check if ftp service entry exists, the packet
593 * might belong to FTP data connections.
594 */
595 svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, FTPPORT);
596 if (svc)
597 goto out;
598 }
599
600 if (atomic_read(&ipvs->nullsvc_counter[af_id])) {
601 /*
602 * Check if the catch-all port (port zero) exists
603 */
604 svc = __ip_vs_service_find(ipvs, af, protocol, vaddr, 0);
605 }
606
607 out:
608 IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n",
609 fwmark, ip_vs_proto_name(protocol),
610 IP_VS_DBG_ADDR(af, vaddr), ntohs(vport),
611 svc ? "hit" : "not hit");
612
613 return svc;
614 }
615
616 /* Return the number of registered services */
ip_vs_get_num_services(struct netns_ipvs * ipvs)617 static int ip_vs_get_num_services(struct netns_ipvs *ipvs)
618 {
619 int ns = 0, ni = IP_VS_AF_MAX;
620
621 while (--ni >= 0)
622 ns += atomic_read(&ipvs->num_services[ni]);
623 return ns;
624 }
625
626 /* Get default load factor to map num_services/u_thresh to t->size */
ip_vs_svc_default_load_factor(struct netns_ipvs * ipvs)627 static int ip_vs_svc_default_load_factor(struct netns_ipvs *ipvs)
628 {
629 int factor;
630
631 if (net_eq(ipvs->net, &init_net))
632 factor = -3; /* grow if load is above 12.5% */
633 else
634 factor = -2; /* grow if load is above 25% */
635 return factor;
636 }
637
638 /* Get the desired svc_table size */
ip_vs_svc_desired_size(struct netns_ipvs * ipvs,struct ip_vs_rht * t,int lfactor)639 static int ip_vs_svc_desired_size(struct netns_ipvs *ipvs, struct ip_vs_rht *t,
640 int lfactor)
641 {
642 return ip_vs_rht_desired_size(ipvs, t, ip_vs_get_num_services(ipvs),
643 lfactor, IP_VS_SVC_TAB_MIN_BITS,
644 IP_VS_SVC_TAB_MAX_BITS);
645 }
646
647 /* Allocate svc_table */
ip_vs_svc_table_alloc(struct netns_ipvs * ipvs,int buckets,int lfactor)648 static struct ip_vs_rht *ip_vs_svc_table_alloc(struct netns_ipvs *ipvs,
649 int buckets, int lfactor)
650 {
651 struct ip_vs_rht *t;
652 int scounts, locks;
653
654 /* No frequent lookups to race with resizing, so use max of 64
655 * seqcounts. Only resizer moves entries, so use 0 locks.
656 */
657 scounts = clamp(buckets >> 4, 1, 64);
658 locks = 0;
659
660 t = ip_vs_rht_alloc(buckets, scounts, locks);
661 if (!t)
662 return NULL;
663 t->lfactor = lfactor;
664 ip_vs_rht_set_thresholds(t, t->size, lfactor, IP_VS_SVC_TAB_MIN_BITS,
665 IP_VS_SVC_TAB_MAX_BITS);
666 return t;
667 }
668
/* svc_table resizer work
 *
 * Steps:
 * - take svc_resize_sem (write) and service_mutex with trylock; on
 *   contention more_work stays true and the work is queued again
 * - if the desired size or the load factor differs from the current
 *   table, allocate a new table and attach it as t->new_tbl, then release
 *   service_mutex so that add/del can run while entries are moved
 * - move the services bucket by bucket into the new table, in steps that
 *   are bounded by limit
 * - under svc_replace_sem install the new table as ipvs->svc_table and
 *   increment svc_table_changes
 * - wait for a RCU grace period and free the old table
 *
 * The work gives up without installing the new table when
 * IP_VS_WORK_SVC_NORESIZE is set.
 */
static void svc_resize_work_handler(struct work_struct *work)
{
	struct hlist_bl_head *head, *head2;
	struct ip_vs_rht *t_free = NULL;
	unsigned int resched_score = 0;
	struct hlist_bl_node *cn, *nn;
	struct ip_vs_rht *t, *t_new;
	struct ip_vs_service *svc;
	struct netns_ipvs *ipvs;
	bool more_work = true;
	seqcount_t *sc;
	int limit = 0;
	int new_size;
	int lfactor;
	u32 bucket;

	ipvs = container_of(work, struct netns_ipvs, svc_resize_work.work);

	/* Never block here: on contention retry later (more_work is true) */
	if (!down_write_trylock(&ipvs->svc_resize_sem))
		goto out;
	if (!mutex_trylock(&ipvs->service_mutex))
		goto unlock_sem;
	/* Both locks are held, from now on requeue only on alloc failure */
	more_work = false;
	clear_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags);
	if (!READ_ONCE(ipvs->enable))
		goto unlock_m;
	t = rcu_dereference_protected(ipvs->svc_table, 1);
	/* Do nothing if table is removed */
	if (!t)
		goto unlock_m;
	/* New table already attached? BUG! */
	if (t != rcu_access_pointer(t->new_tbl))
		goto unlock_m;

	lfactor = sysctl_svc_lfactor(ipvs);
	/* Should we resize ? */
	new_size = ip_vs_svc_desired_size(ipvs, t, lfactor);
	if (new_size == t->size && lfactor == t->lfactor)
		goto unlock_m;

	t_new = ip_vs_svc_table_alloc(ipvs, new_size, lfactor);
	if (!t_new) {
		/* Allocation failed, try again later */
		more_work = true;
		goto unlock_m;
	}
	/* Flip the table_id */
	t_new->table_id = t->table_id ^ IP_VS_RHT_TABLE_ID_MASK;

	/* Attach new table */
	rcu_assign_pointer(t->new_tbl, t_new);
	/* Allow add/del to new_tbl while moving from old table */
	mutex_unlock(&ipvs->service_mutex);

	ip_vs_rht_for_each_bucket(t, bucket, head) {
same_bucket:
		/* limit counts visited buckets and moved entries */
		if (++limit >= 16) {
			/* Check if work is stopped */
			if (test_bit(IP_VS_WORK_SVC_NORESIZE,
				     &ipvs->work_flags))
				goto unlock_sem;
			/* Reschedule after a long run of empty buckets */
			if (resched_score >= 100) {
				resched_score = 0;
				cond_resched();
			}
			limit = 0;
		}
		if (hlist_bl_empty(head)) {
			resched_score++;
			continue;
		}
		/* Preemption calls ahead... */
		resched_score = 0;

		sc = &t->seqc[bucket & t->seqc_mask];
		/* seqcount_t usage considering PREEMPT_RT rules:
		 * - we are the only writer => preemption can be allowed
		 * - readers (SoftIRQ) => disable BHs
		 * - readers (processes) => preemption should be disabled
		 */
		local_bh_disable();
		preempt_disable_nested();
		write_seqcount_begin(sc);
		hlist_bl_lock(head);

		hlist_bl_for_each_entry_safe(svc, cn, nn, head, s_list) {
			u32 hash;

			/* New hash for the new table */
			if (svc->fwmark == 0) {
				/*  Hash it by <protocol,addr,port> */
				hash = ip_vs_svc_hashval(t_new, svc->af,
							 svc->protocol,
							 &svc->addr, svc->port);
			} else {
				/* Hash it by fwmark */
				hash = ip_vs_svc_fwm_hashval(t_new, svc->af,
							     svc->fwmark);
			}
			hlist_bl_del_rcu(&svc->s_list);
			head2 = t_new->buckets + (hash & t_new->mask);

			hlist_bl_lock(head2);
			WRITE_ONCE(svc->hash_key,
				   ip_vs_rht_build_hash_key(t_new, hash));
			/* t_new->seqc are not used at this stage, we race
			 * only with add/del, so only lock the bucket.
			 */
			hlist_bl_add_head_rcu(&svc->s_list, head2);
			hlist_bl_unlock(head2);
			/* Too long chain? Do it in steps */
			if (++limit >= 64)
				break;
		}

		hlist_bl_unlock(head);
		write_seqcount_end(sc);
		preempt_enable_nested();
		local_bh_enable();
		/* Step limit reached: continue with the same bucket */
		if (limit >= 64)
			goto same_bucket;
	}

	/* Serialize with readers that don't like svc_table changes */
	down_write(&ipvs->svc_replace_sem);

	/* Check if work is stopped to avoid synchronize_rcu() */
	if (test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
		goto unlock_repl;

	rcu_assign_pointer(ipvs->svc_table, t_new);
	/* Inform readers that new table is installed */
	smp_mb__before_atomic();
	atomic_inc(&ipvs->svc_table_changes);
	t_free = t;

unlock_repl:
	up_write(&ipvs->svc_replace_sem);

unlock_sem:
	up_write(&ipvs->svc_resize_sem);

	if (t_free) {
		/* RCU readers should not see more than two tables in chain.
		 * To prevent new table to be attached wait here instead of
		 * freeing the old table in RCU callback.
		 */
		synchronize_rcu();
		ip_vs_rht_free(t_free);
	}

out:
	/* Requeue only if there is more work and resizing is still allowed */
	if (!READ_ONCE(ipvs->enable) || !more_work ||
	    test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
		return;
	queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work, 1);
	return;

unlock_m:
	mutex_unlock(&ipvs->service_mutex);
	goto unlock_sem;
}
831
832 static inline void
__ip_vs_bind_svc(struct ip_vs_dest * dest,struct ip_vs_service * svc)833 __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc)
834 {
835 atomic_inc(&svc->refcnt);
836 rcu_assign_pointer(dest->svc, svc);
837 }
838
ip_vs_service_free(struct ip_vs_service * svc)839 static void ip_vs_service_free(struct ip_vs_service *svc)
840 {
841 ip_vs_stats_release(&svc->stats);
842 kfree(svc);
843 }
844
ip_vs_service_rcu_free(struct rcu_head * head)845 static void ip_vs_service_rcu_free(struct rcu_head *head)
846 {
847 struct ip_vs_service *svc;
848
849 svc = container_of(head, struct ip_vs_service, rcu_head);
850 ip_vs_service_free(svc);
851 }
852
__ip_vs_svc_put(struct ip_vs_service * svc)853 static void __ip_vs_svc_put(struct ip_vs_service *svc)
854 {
855 if (atomic_dec_and_test(&svc->refcnt)) {
856 IP_VS_DBG_BUF(3, "Removing service %u/%s:%u\n",
857 svc->fwmark,
858 IP_VS_DBG_ADDR(svc->af, &svc->addr),
859 ntohs(svc->port));
860 call_rcu(&svc->rcu_head, ip_vs_service_rcu_free);
861 }
862 }
863
864
865 /*
866 * Returns hash value for real service
867 */
ip_vs_rs_hashkey(int af,const union nf_inet_addr * addr,__be16 port)868 static inline unsigned int ip_vs_rs_hashkey(int af,
869 const union nf_inet_addr *addr,
870 __be16 port)
871 {
872 unsigned int porth = ntohs(port);
873 __be32 addr_fold = addr->ip;
874
875 #ifdef CONFIG_IP_VS_IPV6
876 if (af == AF_INET6)
877 addr_fold = addr->ip6[0]^addr->ip6[1]^
878 addr->ip6[2]^addr->ip6[3];
879 #endif
880
881 return (ntohl(addr_fold)^(porth>>IP_VS_RTAB_BITS)^porth)
882 & IP_VS_RTAB_MASK;
883 }
884
885 /* Hash ip_vs_dest in rs_table by <proto,addr,port>. */
ip_vs_rs_hash(struct netns_ipvs * ipvs,struct ip_vs_dest * dest)886 static void ip_vs_rs_hash(struct netns_ipvs *ipvs, struct ip_vs_dest *dest)
887 {
888 unsigned int hash;
889 __be16 port;
890
891 if (dest->in_rs_table)
892 return;
893
894 switch (IP_VS_DFWD_METHOD(dest)) {
895 case IP_VS_CONN_F_MASQ:
896 port = dest->port;
897 break;
898 case IP_VS_CONN_F_TUNNEL:
899 switch (dest->tun_type) {
900 case IP_VS_CONN_F_TUNNEL_TYPE_GUE:
901 port = dest->tun_port;
902 break;
903 case IP_VS_CONN_F_TUNNEL_TYPE_IPIP:
904 case IP_VS_CONN_F_TUNNEL_TYPE_GRE:
905 port = 0;
906 break;
907 default:
908 return;
909 }
910 break;
911 default:
912 return;
913 }
914
915 /*
916 * Hash by proto,addr,port,
917 * which are the parameters of the real service.
918 */
919 hash = ip_vs_rs_hashkey(dest->af, &dest->addr, port);
920
921 hlist_add_head_rcu(&dest->d_list, &ipvs->rs_table[hash]);
922 dest->in_rs_table = 1;
923 }
924
925 /* Unhash ip_vs_dest from rs_table. */
ip_vs_rs_unhash(struct ip_vs_dest * dest)926 static void ip_vs_rs_unhash(struct ip_vs_dest *dest)
927 {
928 /*
929 * Remove it from the rs_table table.
930 */
931 if (dest->in_rs_table) {
932 hlist_del_rcu(&dest->d_list);
933 dest->in_rs_table = 0;
934 }
935 }
936
937 /* Check if real service by <proto,addr,port> is present */
ip_vs_has_real_service(struct netns_ipvs * ipvs,int af,__u16 protocol,const union nf_inet_addr * daddr,__be16 dport)938 bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol,
939 const union nf_inet_addr *daddr, __be16 dport)
940 {
941 unsigned int hash;
942 struct ip_vs_dest *dest;
943
944 /* Check for "full" addressed entries */
945 hash = ip_vs_rs_hashkey(af, daddr, dport);
946
947 hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
948 if (dest->port == dport &&
949 dest->af == af &&
950 ip_vs_addr_equal(af, &dest->addr, daddr) &&
951 (dest->protocol == protocol || dest->vfwmark) &&
952 IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) {
953 /* HIT */
954 return true;
955 }
956 }
957
958 return false;
959 }
960
961 /* Find real service record by <proto,addr,port>.
962 * In case of multiple records with the same <proto,addr,port>, only
963 * the first found record is returned.
964 *
965 * To be called under RCU lock.
966 */
ip_vs_find_real_service(struct netns_ipvs * ipvs,int af,__u16 protocol,const union nf_inet_addr * daddr,__be16 dport)967 struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af,
968 __u16 protocol,
969 const union nf_inet_addr *daddr,
970 __be16 dport)
971 {
972 unsigned int hash;
973 struct ip_vs_dest *dest;
974
975 /* Check for "full" addressed entries */
976 hash = ip_vs_rs_hashkey(af, daddr, dport);
977
978 hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
979 if (dest->port == dport &&
980 dest->af == af &&
981 ip_vs_addr_equal(af, &dest->addr, daddr) &&
982 (dest->protocol == protocol || dest->vfwmark) &&
983 IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_MASQ) {
984 /* HIT */
985 return dest;
986 }
987 }
988
989 return NULL;
990 }
991
992 /* Find real service record by <af,addr,tun_port>.
993 * In case of multiple records with the same <af,addr,tun_port>, only
994 * the first found record is returned.
995 *
996 * To be called under RCU lock.
997 */
ip_vs_find_tunnel(struct netns_ipvs * ipvs,int af,const union nf_inet_addr * daddr,__be16 tun_port)998 struct ip_vs_dest *ip_vs_find_tunnel(struct netns_ipvs *ipvs, int af,
999 const union nf_inet_addr *daddr,
1000 __be16 tun_port)
1001 {
1002 struct ip_vs_dest *dest;
1003 unsigned int hash;
1004
1005 /* Check for "full" addressed entries */
1006 hash = ip_vs_rs_hashkey(af, daddr, tun_port);
1007
1008 hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
1009 if (dest->tun_port == tun_port &&
1010 dest->af == af &&
1011 ip_vs_addr_equal(af, &dest->addr, daddr) &&
1012 IP_VS_DFWD_METHOD(dest) == IP_VS_CONN_F_TUNNEL) {
1013 /* HIT */
1014 return dest;
1015 }
1016 }
1017
1018 return NULL;
1019 }
1020
1021 /* Lookup destination by {addr,port} in the given service
1022 * Called under RCU lock.
1023 */
1024 static struct ip_vs_dest *
ip_vs_lookup_dest(struct ip_vs_service * svc,int dest_af,const union nf_inet_addr * daddr,__be16 dport)1025 ip_vs_lookup_dest(struct ip_vs_service *svc, int dest_af,
1026 const union nf_inet_addr *daddr, __be16 dport)
1027 {
1028 struct ip_vs_dest *dest;
1029
1030 /*
1031 * Find the destination for the given service
1032 */
1033 list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
1034 if ((dest->af == dest_af) &&
1035 ip_vs_addr_equal(dest_af, &dest->addr, daddr) &&
1036 (dest->port == dport)) {
1037 /* HIT */
1038 return dest;
1039 }
1040 }
1041
1042 return NULL;
1043 }
1044
1045 /*
1046 * Find destination by {daddr,dport,vaddr,protocol}
1047 * Created to be used in ip_vs_process_message() in
1048 * the backup synchronization daemon. It finds the
1049 * destination to be bound to the received connection
1050 * on the backup.
1051 * Called under RCU lock, no refcnt is returned.
1052 */
ip_vs_find_dest(struct netns_ipvs * ipvs,int svc_af,int dest_af,const union nf_inet_addr * daddr,__be16 dport,const union nf_inet_addr * vaddr,__be16 vport,__u16 protocol,__u32 fwmark,__u32 flags)1053 struct ip_vs_dest *ip_vs_find_dest(struct netns_ipvs *ipvs, int svc_af, int dest_af,
1054 const union nf_inet_addr *daddr,
1055 __be16 dport,
1056 const union nf_inet_addr *vaddr,
1057 __be16 vport, __u16 protocol, __u32 fwmark,
1058 __u32 flags)
1059 {
1060 struct ip_vs_dest *dest;
1061 struct ip_vs_service *svc;
1062 __be16 port = dport;
1063
1064 svc = ip_vs_service_find(ipvs, svc_af, fwmark, protocol, vaddr, vport);
1065 if (!svc)
1066 return NULL;
1067 if (fwmark && (flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ)
1068 port = 0;
1069 dest = ip_vs_lookup_dest(svc, dest_af, daddr, port);
1070 if (!dest)
1071 dest = ip_vs_lookup_dest(svc, dest_af, daddr, port ^ dport);
1072 return dest;
1073 }
1074
ip_vs_dest_dst_rcu_free(struct rcu_head * head)1075 void ip_vs_dest_dst_rcu_free(struct rcu_head *head)
1076 {
1077 struct ip_vs_dest_dst *dest_dst = container_of(head,
1078 struct ip_vs_dest_dst,
1079 rcu_head);
1080
1081 dst_release(dest_dst->dst_cache);
1082 kfree(dest_dst);
1083 }
1084
1085 /* Release dest_dst and dst_cache for dest in user context */
__ip_vs_dst_cache_reset(struct ip_vs_dest * dest)1086 static void __ip_vs_dst_cache_reset(struct ip_vs_dest *dest)
1087 {
1088 struct ip_vs_dest_dst *old;
1089
1090 old = rcu_dereference_protected(dest->dest_dst, 1);
1091 if (old) {
1092 RCU_INIT_POINTER(dest->dest_dst, NULL);
1093 call_rcu(&old->rcu_head, ip_vs_dest_dst_rcu_free);
1094 }
1095 }
1096
1097 /*
1098 * Lookup dest by {svc,addr,port} in the destination trash.
1099 * The destination trash is used to hold the destinations that are removed
1100 * from the service table but are still referenced by some conn entries.
1101 * The reason to add the destination trash is when the dest is temporary
1102 * down (either by administrator or by monitor program), the dest can be
1103 * picked back from the trash, the remaining connections to the dest can
1104 * continue, and the counting information of the dest is also useful for
1105 * scheduling.
1106 */
1107 static struct ip_vs_dest *
ip_vs_trash_get_dest(struct ip_vs_service * svc,int dest_af,const union nf_inet_addr * daddr,__be16 dport)1108 ip_vs_trash_get_dest(struct ip_vs_service *svc, int dest_af,
1109 const union nf_inet_addr *daddr, __be16 dport)
1110 {
1111 struct ip_vs_dest *dest;
1112 struct netns_ipvs *ipvs = svc->ipvs;
1113
1114 /*
1115 * Find the destination in trash
1116 */
1117 spin_lock_bh(&ipvs->dest_trash_lock);
1118 list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
1119 IP_VS_DBG_BUF(3, "Destination %u/%s:%u still in trash, "
1120 "dest->refcnt=%d\n",
1121 dest->vfwmark,
1122 IP_VS_DBG_ADDR(dest->af, &dest->addr),
1123 ntohs(dest->port),
1124 refcount_read(&dest->refcnt));
1125 if (dest->af == dest_af &&
1126 ip_vs_addr_equal(dest_af, &dest->addr, daddr) &&
1127 dest->port == dport &&
1128 dest->vfwmark == svc->fwmark &&
1129 dest->protocol == svc->protocol &&
1130 (svc->fwmark ||
1131 (ip_vs_addr_equal(svc->af, &dest->vaddr, &svc->addr) &&
1132 dest->vport == svc->port))) {
1133 /* HIT */
1134 list_del(&dest->t_list);
1135 goto out;
1136 }
1137 }
1138
1139 dest = NULL;
1140
1141 out:
1142 spin_unlock_bh(&ipvs->dest_trash_lock);
1143
1144 return dest;
1145 }
1146
1147 /* Put destination in trash */
ip_vs_trash_put_dest(struct netns_ipvs * ipvs,struct ip_vs_dest * dest,unsigned long istart,bool cleanup)1148 static void ip_vs_trash_put_dest(struct netns_ipvs *ipvs,
1149 struct ip_vs_dest *dest, unsigned long istart,
1150 bool cleanup)
1151 {
1152 spin_lock_bh(&ipvs->dest_trash_lock);
1153 IP_VS_DBG_BUF(3, "Moving dest %s:%u into trash, dest->refcnt=%d\n",
1154 IP_VS_DBG_ADDR(dest->af, &dest->addr), ntohs(dest->port),
1155 refcount_read(&dest->refcnt));
1156 if (list_empty(&ipvs->dest_trash) && !cleanup)
1157 mod_timer(&ipvs->dest_trash_timer,
1158 jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
1159 /* dest lives in trash with reference */
1160 list_add(&dest->t_list, &ipvs->dest_trash);
1161 dest->idle_start = istart;
1162 spin_unlock_bh(&ipvs->dest_trash_lock);
1163 }
1164
ip_vs_dest_rcu_free(struct rcu_head * head)1165 static void ip_vs_dest_rcu_free(struct rcu_head *head)
1166 {
1167 struct ip_vs_dest *dest;
1168
1169 dest = container_of(head, struct ip_vs_dest, rcu_head);
1170 ip_vs_stats_release(&dest->stats);
1171 ip_vs_dest_put_and_free(dest);
1172 }
1173
ip_vs_dest_free(struct ip_vs_dest * dest)1174 static void ip_vs_dest_free(struct ip_vs_dest *dest)
1175 {
1176 struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1);
1177
1178 __ip_vs_svc_put(svc);
1179 call_rcu(&dest->rcu_head, ip_vs_dest_rcu_free);
1180 }
1181
1182 /*
1183 * Clean up all the destinations in the trash
1184 * Called by the ip_vs_control_cleanup()
1185 *
1186 * When the ip_vs_control_clearup is activated by ipvs module exit,
1187 * the service tables must have been flushed and all the connections
1188 * are expired, and the refcnt of each destination in the trash must
1189 * be 1, so we simply release them here.
1190 */
ip_vs_trash_cleanup(struct netns_ipvs * ipvs)1191 static void ip_vs_trash_cleanup(struct netns_ipvs *ipvs)
1192 {
1193 struct ip_vs_dest *dest, *nxt;
1194
1195 timer_delete_sync(&ipvs->dest_trash_timer);
1196 /* No need to use dest_trash_lock */
1197 list_for_each_entry_safe(dest, nxt, &ipvs->dest_trash, t_list) {
1198 list_del(&dest->t_list);
1199 ip_vs_dest_free(dest);
1200 }
1201 }
1202
ip_vs_stats_rcu_free(struct rcu_head * head)1203 static void ip_vs_stats_rcu_free(struct rcu_head *head)
1204 {
1205 struct ip_vs_stats_rcu *rs = container_of(head,
1206 struct ip_vs_stats_rcu,
1207 rcu_head);
1208
1209 ip_vs_stats_release(&rs->s);
1210 kfree(rs);
1211 }
1212
1213 static void
ip_vs_copy_stats(struct ip_vs_kstats * dst,struct ip_vs_stats * src)1214 ip_vs_copy_stats(struct ip_vs_kstats *dst, struct ip_vs_stats *src)
1215 {
1216 #define IP_VS_SHOW_STATS_COUNTER(c) dst->c = src->kstats.c - src->kstats0.c
1217
1218 spin_lock(&src->lock);
1219
1220 IP_VS_SHOW_STATS_COUNTER(conns);
1221 IP_VS_SHOW_STATS_COUNTER(inpkts);
1222 IP_VS_SHOW_STATS_COUNTER(outpkts);
1223 IP_VS_SHOW_STATS_COUNTER(inbytes);
1224 IP_VS_SHOW_STATS_COUNTER(outbytes);
1225
1226 ip_vs_read_estimator(dst, src);
1227
1228 spin_unlock(&src->lock);
1229 }
1230
1231 static void
ip_vs_export_stats_user(struct ip_vs_stats_user * dst,struct ip_vs_kstats * src)1232 ip_vs_export_stats_user(struct ip_vs_stats_user *dst, struct ip_vs_kstats *src)
1233 {
1234 dst->conns = (u32)src->conns;
1235 dst->inpkts = (u32)src->inpkts;
1236 dst->outpkts = (u32)src->outpkts;
1237 dst->inbytes = src->inbytes;
1238 dst->outbytes = src->outbytes;
1239 dst->cps = (u32)src->cps;
1240 dst->inpps = (u32)src->inpps;
1241 dst->outpps = (u32)src->outpps;
1242 dst->inbps = (u32)src->inbps;
1243 dst->outbps = (u32)src->outbps;
1244 }
1245
1246 static void
ip_vs_zero_stats(struct ip_vs_stats * stats)1247 ip_vs_zero_stats(struct ip_vs_stats *stats)
1248 {
1249 spin_lock(&stats->lock);
1250
1251 /* get current counters as zero point, rates are zeroed */
1252
1253 #define IP_VS_ZERO_STATS_COUNTER(c) stats->kstats0.c = stats->kstats.c
1254
1255 IP_VS_ZERO_STATS_COUNTER(conns);
1256 IP_VS_ZERO_STATS_COUNTER(inpkts);
1257 IP_VS_ZERO_STATS_COUNTER(outpkts);
1258 IP_VS_ZERO_STATS_COUNTER(inbytes);
1259 IP_VS_ZERO_STATS_COUNTER(outbytes);
1260
1261 ip_vs_zero_estimator(stats);
1262
1263 spin_unlock(&stats->lock);
1264 }
1265
1266 /* Allocate fields after kzalloc */
ip_vs_stats_init_alloc(struct ip_vs_stats * s)1267 int ip_vs_stats_init_alloc(struct ip_vs_stats *s)
1268 {
1269 int i;
1270
1271 spin_lock_init(&s->lock);
1272 s->cpustats = alloc_percpu(struct ip_vs_cpu_stats);
1273 if (!s->cpustats)
1274 return -ENOMEM;
1275
1276 for_each_possible_cpu(i) {
1277 struct ip_vs_cpu_stats *cs = per_cpu_ptr(s->cpustats, i);
1278
1279 u64_stats_init(&cs->syncp);
1280 }
1281 return 0;
1282 }
1283
ip_vs_stats_alloc(void)1284 struct ip_vs_stats *ip_vs_stats_alloc(void)
1285 {
1286 struct ip_vs_stats *s = kzalloc_obj(*s);
1287
1288 if (s && ip_vs_stats_init_alloc(s) >= 0)
1289 return s;
1290 kfree(s);
1291 return NULL;
1292 }
1293
ip_vs_stats_release(struct ip_vs_stats * stats)1294 void ip_vs_stats_release(struct ip_vs_stats *stats)
1295 {
1296 free_percpu(stats->cpustats);
1297 }
1298
/* Release and free a stats object; a NULL pointer is accepted and ignored */
void ip_vs_stats_free(struct ip_vs_stats *stats)
{
	if (!stats)
		return;

	ip_vs_stats_release(stats);
	kfree(stats);
}
1306
/*
 *	Update a destination in the given service
 *
 *	Applies the settings from udest (weight, connection flags, tunnel
 *	parameters, thresholds, address family) to dest and binds dest to
 *	svc. With add != 0 the dest is also linked into svc->destinations and
 *	the scheduler's add_dest method, if any, is called. With add == 0 the
 *	dest's cached route is reset and the scheduler's upd_dest method, if
 *	any, is called instead.
 */
static void
__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
		    struct ip_vs_dest_user_kern *udest, int add)
{
	struct netns_ipvs *ipvs = svc->ipvs;
	struct ip_vs_service *old_svc;
	struct ip_vs_scheduler *sched;
	int conn_flags;

	/* We cannot modify an address and change the address family */
	BUG_ON(!add && udest->af != dest->af);

	/* Count dests whose family differs from their service's family */
	if (add && udest->af != svc->af)
		ipvs->mixed_address_family_dests++;

	/* keep the last_weight with latest non-0 weight */
	if (add || udest->weight != 0)
		atomic_set(&dest->last_weight, udest->weight);

	/* set the weight and the flags */
	atomic_set(&dest->weight, udest->weight);
	conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK;
	conn_flags |= IP_VS_CONN_F_INACTIVE;

	/* Need to rehash? Unhash while dest still holds the old forwarding
	 * method and tunnel settings; ip_vs_rs_hash() below runs after the
	 * new values are stored.
	 */
	if ((udest->conn_flags & IP_VS_CONN_F_FWD_MASK) !=
	    IP_VS_DFWD_METHOD(dest) ||
	    udest->tun_type != dest->tun_type ||
	    udest->tun_port != dest->tun_port)
		ip_vs_rs_unhash(dest);

	/* set the tunnel info */
	dest->tun_type = udest->tun_type;
	dest->tun_port = udest->tun_port;
	dest->tun_flags = udest->tun_flags;

	/* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */
	if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) {
		conn_flags |= IP_VS_CONN_F_NOOUTPUT;
	} else {
		/* FTP-NAT requires conntrack for mangling */
		if (svc->port == FTPPORT)
			ip_vs_register_conntrack(svc);
	}
	atomic_set(&dest->conn_flags, conn_flags);
	/* Put the real service in rs_table if not present. */
	ip_vs_rs_hash(ipvs, dest);

	/* bind the service */
	old_svc = rcu_dereference_protected(dest->svc, 1);
	if (!old_svc) {
		__ip_vs_bind_svc(dest, svc);
	} else {
		/* dest was bound to a different service: zero its stats,
		 * rebind it and drop the reference to the old service.
		 */
		if (old_svc != svc) {
			ip_vs_zero_stats(&dest->stats);
			__ip_vs_bind_svc(dest, svc);
			__ip_vs_svc_put(old_svc);
		}
	}

	/* set the dest status flags */
	dest->flags |= IP_VS_DEST_F_AVAILABLE;

	/* Clear OVERLOAD when the upper threshold is removed or raised */
	if (udest->u_threshold == 0 || udest->u_threshold > dest->u_threshold)
		dest->flags &= ~IP_VS_DEST_F_OVERLOAD;
	dest->u_threshold = udest->u_threshold;
	dest->l_threshold = udest->l_threshold;

	dest->af = udest->af;

	if (add) {
		/* Make the dest visible to RCU readers of svc->destinations */
		list_add_rcu(&dest->n_list, &svc->destinations);
		svc->num_dests++;
		sched = rcu_dereference_protected(svc->scheduler, 1);
		if (sched && sched->add_dest)
			sched->add_dest(svc, dest);
	} else {
		/* Settings changed: drop the cached route under dst_lock */
		spin_lock_bh(&dest->dst_lock);
		__ip_vs_dst_cache_reset(dest);
		spin_unlock_bh(&dest->dst_lock);

		sched = rcu_dereference_protected(svc->scheduler, 1);
		if (sched && sched->upd_dest)
			sched->upd_dest(svc, dest);
	}
}
1396
1397
1398 /*
1399 * Create a destination for the given service
1400 */
1401 static int
ip_vs_new_dest(struct ip_vs_service * svc,struct ip_vs_dest_user_kern * udest)1402 ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1403 {
1404 struct ip_vs_dest *dest;
1405 unsigned int atype;
1406 int ret;
1407
1408 #ifdef CONFIG_IP_VS_IPV6
1409 if (udest->af == AF_INET6) {
1410 atype = ipv6_addr_type(&udest->addr.in6);
1411 if ((!(atype & IPV6_ADDR_UNICAST) ||
1412 atype & IPV6_ADDR_LINKLOCAL) &&
1413 !__ip_vs_addr_is_local_v6(svc->ipvs->net, &udest->addr.in6))
1414 return -EINVAL;
1415
1416 ret = nf_defrag_ipv6_enable(svc->ipvs->net);
1417 if (ret)
1418 return ret;
1419 } else
1420 #endif
1421 {
1422 atype = inet_addr_type(svc->ipvs->net, udest->addr.ip);
1423 if (atype != RTN_LOCAL && atype != RTN_UNICAST)
1424 return -EINVAL;
1425 }
1426
1427 dest = kzalloc_obj(struct ip_vs_dest);
1428 if (dest == NULL)
1429 return -ENOMEM;
1430
1431 ret = ip_vs_stats_init_alloc(&dest->stats);
1432 if (ret < 0)
1433 goto err_alloc;
1434
1435 ret = ip_vs_start_estimator(svc->ipvs, &dest->stats);
1436 if (ret < 0)
1437 goto err_stats;
1438
1439 dest->af = udest->af;
1440 dest->protocol = svc->protocol;
1441 dest->vaddr = svc->addr;
1442 dest->vport = svc->port;
1443 dest->vfwmark = svc->fwmark;
1444 ip_vs_addr_copy(udest->af, &dest->addr, &udest->addr);
1445 dest->port = udest->port;
1446
1447 atomic_set(&dest->activeconns, 0);
1448 atomic_set(&dest->inactconns, 0);
1449 atomic_set(&dest->persistconns, 0);
1450 refcount_set(&dest->refcnt, 1);
1451
1452 INIT_HLIST_NODE(&dest->d_list);
1453 spin_lock_init(&dest->dst_lock);
1454 __ip_vs_update_dest(svc, dest, udest, 1);
1455
1456 return 0;
1457
1458 err_stats:
1459 ip_vs_stats_release(&dest->stats);
1460
1461 err_alloc:
1462 kfree(dest);
1463 return ret;
1464 }
1465
1466
1467 /*
1468 * Add a destination into an existing service
1469 */
1470 static int
ip_vs_add_dest(struct ip_vs_service * svc,struct ip_vs_dest_user_kern * udest)1471 ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1472 {
1473 struct ip_vs_dest *dest;
1474 union nf_inet_addr daddr;
1475 __be16 dport = udest->port;
1476 int ret;
1477
1478 if (udest->weight < 0) {
1479 pr_err("%s(): server weight less than zero\n", __func__);
1480 return -ERANGE;
1481 }
1482
1483 if (udest->l_threshold > udest->u_threshold) {
1484 pr_err("%s(): lower threshold is higher than upper threshold\n",
1485 __func__);
1486 return -ERANGE;
1487 }
1488
1489 if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
1490 if (udest->tun_port == 0) {
1491 pr_err("%s(): tunnel port is zero\n", __func__);
1492 return -EINVAL;
1493 }
1494 }
1495
1496 ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
1497
1498 /* We use function that requires RCU lock */
1499 rcu_read_lock();
1500 dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport);
1501 rcu_read_unlock();
1502
1503 if (dest != NULL) {
1504 IP_VS_DBG(1, "%s(): dest already exists\n", __func__);
1505 return -EEXIST;
1506 }
1507
1508 /*
1509 * Check if the dest already exists in the trash and
1510 * is from the same service
1511 */
1512 dest = ip_vs_trash_get_dest(svc, udest->af, &daddr, dport);
1513
1514 if (dest != NULL) {
1515 IP_VS_DBG_BUF(3, "Get destination %s:%u from trash, "
1516 "dest->refcnt=%d, service %u/%s:%u\n",
1517 IP_VS_DBG_ADDR(udest->af, &daddr), ntohs(dport),
1518 refcount_read(&dest->refcnt),
1519 dest->vfwmark,
1520 IP_VS_DBG_ADDR(svc->af, &dest->vaddr),
1521 ntohs(dest->vport));
1522
1523 ret = ip_vs_start_estimator(svc->ipvs, &dest->stats);
1524 /* On error put back dest into the trash */
1525 if (ret < 0)
1526 ip_vs_trash_put_dest(svc->ipvs, dest, dest->idle_start,
1527 false);
1528 else
1529 __ip_vs_update_dest(svc, dest, udest, 1);
1530 } else {
1531 /*
1532 * Allocate and initialize the dest structure
1533 */
1534 ret = ip_vs_new_dest(svc, udest);
1535 }
1536
1537 return ret;
1538 }
1539
1540
1541 /*
1542 * Edit a destination in the given service
1543 */
1544 static int
ip_vs_edit_dest(struct ip_vs_service * svc,struct ip_vs_dest_user_kern * udest)1545 ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1546 {
1547 struct ip_vs_dest *dest;
1548 union nf_inet_addr daddr;
1549 __be16 dport = udest->port;
1550
1551 if (udest->weight < 0) {
1552 pr_err("%s(): server weight less than zero\n", __func__);
1553 return -ERANGE;
1554 }
1555
1556 if (udest->l_threshold > udest->u_threshold) {
1557 pr_err("%s(): lower threshold is higher than upper threshold\n",
1558 __func__);
1559 return -ERANGE;
1560 }
1561
1562 if (udest->tun_type == IP_VS_CONN_F_TUNNEL_TYPE_GUE) {
1563 if (udest->tun_port == 0) {
1564 pr_err("%s(): tunnel port is zero\n", __func__);
1565 return -EINVAL;
1566 }
1567 }
1568
1569 ip_vs_addr_copy(udest->af, &daddr, &udest->addr);
1570
1571 /* We use function that requires RCU lock */
1572 rcu_read_lock();
1573 dest = ip_vs_lookup_dest(svc, udest->af, &daddr, dport);
1574 rcu_read_unlock();
1575
1576 if (dest == NULL) {
1577 IP_VS_DBG(1, "%s(): dest doesn't exist\n", __func__);
1578 return -ENOENT;
1579 }
1580
1581 __ip_vs_update_dest(svc, dest, udest, 0);
1582
1583 return 0;
1584 }
1585
1586 /*
1587 * Delete a destination (must be already unlinked from the service)
1588 */
__ip_vs_del_dest(struct netns_ipvs * ipvs,struct ip_vs_dest * dest,bool cleanup)1589 static void __ip_vs_del_dest(struct netns_ipvs *ipvs, struct ip_vs_dest *dest,
1590 bool cleanup)
1591 {
1592 ip_vs_stop_estimator(ipvs, &dest->stats);
1593
1594 /*
1595 * Remove it from the d-linked list with the real services.
1596 */
1597 ip_vs_rs_unhash(dest);
1598
1599 ip_vs_trash_put_dest(ipvs, dest, 0, cleanup);
1600
1601 /* Queue up delayed work to expire all no destination connections.
1602 * No-op when CONFIG_SYSCTL is disabled.
1603 */
1604 if (!cleanup)
1605 ip_vs_enqueue_expire_nodest_conns(ipvs);
1606 }
1607
1608
1609 /*
1610 * Unlink a destination from the given service
1611 */
__ip_vs_unlink_dest(struct ip_vs_service * svc,struct ip_vs_dest * dest,int svcupd)1612 static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
1613 struct ip_vs_dest *dest,
1614 int svcupd)
1615 {
1616 dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
1617
1618 spin_lock_bh(&dest->dst_lock);
1619 __ip_vs_dst_cache_reset(dest);
1620 spin_unlock_bh(&dest->dst_lock);
1621
1622 /*
1623 * Remove it from the d-linked destination list.
1624 */
1625 list_del_rcu(&dest->n_list);
1626 svc->num_dests--;
1627
1628 if (dest->af != svc->af)
1629 svc->ipvs->mixed_address_family_dests--;
1630
1631 if (svcupd) {
1632 struct ip_vs_scheduler *sched;
1633
1634 sched = rcu_dereference_protected(svc->scheduler, 1);
1635 if (sched && sched->del_dest)
1636 sched->del_dest(svc, dest);
1637 }
1638 }
1639
1640
1641 /*
1642 * Delete a destination server in the given service
1643 */
1644 static int
ip_vs_del_dest(struct ip_vs_service * svc,struct ip_vs_dest_user_kern * udest)1645 ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest)
1646 {
1647 struct ip_vs_dest *dest;
1648 __be16 dport = udest->port;
1649
1650 /* We use function that requires RCU lock */
1651 rcu_read_lock();
1652 dest = ip_vs_lookup_dest(svc, udest->af, &udest->addr, dport);
1653 rcu_read_unlock();
1654
1655 if (dest == NULL) {
1656 IP_VS_DBG(1, "%s(): destination not found!\n", __func__);
1657 return -ENOENT;
1658 }
1659
1660 /*
1661 * Unlink dest from the service
1662 */
1663 __ip_vs_unlink_dest(svc, dest, 1);
1664
1665 /*
1666 * Delete the destination
1667 */
1668 __ip_vs_del_dest(svc->ipvs, dest, false);
1669
1670 return 0;
1671 }
1672
ip_vs_dest_trash_expire(struct timer_list * t)1673 static void ip_vs_dest_trash_expire(struct timer_list *t)
1674 {
1675 struct netns_ipvs *ipvs = timer_container_of(ipvs, t,
1676 dest_trash_timer);
1677 struct ip_vs_dest *dest, *next;
1678 unsigned long now = jiffies;
1679
1680 spin_lock(&ipvs->dest_trash_lock);
1681 list_for_each_entry_safe(dest, next, &ipvs->dest_trash, t_list) {
1682 if (refcount_read(&dest->refcnt) > 1)
1683 continue;
1684 if (dest->idle_start) {
1685 if (time_before(now, dest->idle_start +
1686 IP_VS_DEST_TRASH_PERIOD))
1687 continue;
1688 } else {
1689 dest->idle_start = max(1UL, now);
1690 continue;
1691 }
1692 IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u from trash\n",
1693 dest->vfwmark,
1694 IP_VS_DBG_ADDR(dest->af, &dest->addr),
1695 ntohs(dest->port));
1696 list_del(&dest->t_list);
1697 ip_vs_dest_free(dest);
1698 }
1699 if (!list_empty(&ipvs->dest_trash))
1700 mod_timer(&ipvs->dest_trash_timer,
1701 jiffies + (IP_VS_DEST_TRASH_PERIOD >> 1));
1702 spin_unlock(&ipvs->dest_trash_lock);
1703 }
1704
/*
 *	Add a service into the service hash table
 *
 *	Creates a virtual service from u, hashes it into ipvs->svc_table and
 *	stores it in *svc_p. The service table and the connection table are
 *	allocated here when they do not exist yet, and the hooks for u->af
 *	are registered when this is the first service of that family.
 *
 *	Returns 0 on success. On failure a negative error is returned
 *	(-ENOPROTOOPT if the module use count cannot be taken, -ENOENT for
 *	an unknown scheduler or persistence engine, -EINVAL for a bad IPv6
 *	netmask, -ENOMEM, or the error of a failing helper) and everything
 *	acquired so far is released at out_err.
 */
static int
ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
		  struct ip_vs_service **svc_p)
{
	struct ip_vs_scheduler *sched = NULL;
	struct ip_vs_rht *tc_new = NULL;
	struct ip_vs_rht *t, *t_new = NULL;
	int af_id = ip_vs_af_index(u->af);
	struct ip_vs_service *svc = NULL;
	struct ip_vs_pe *pe = NULL;
	/* >= 0 only if this call registered the hooks (see out_err) */
	int ret_hooks = -1;
	int ret = 0;
	bool grow;

	/* increase the module use count */
	if (!ip_vs_use_count_inc())
		return -ENOPROTOOPT;

	/* Lookup the scheduler by 'u->sched_name' */
	if (strcmp(u->sched_name, "none")) {
		sched = ip_vs_scheduler_get(u->sched_name);
		if (!sched) {
			pr_info("Scheduler module ip_vs_%s not found\n",
				u->sched_name);
			ret = -ENOENT;
			goto out_err;
		}
	}

	/* A persistence engine is optional */
	if (u->pe_name && *u->pe_name) {
		pe = ip_vs_pe_getbyname(u->pe_name);
		if (pe == NULL) {
			pr_info("persistence engine module ip_vs_pe_%s "
				"not found\n", u->pe_name);
			ret = -ENOENT;
			goto out_err;
		}
	}

#ifdef CONFIG_IP_VS_IPV6
	if (u->af == AF_INET6) {
		/* For IPv6 the netmask field is range-checked as a value
		 * between 1 and 128.
		 */
		__u32 plen = (__force __u32) u->netmask;

		if (plen < 1 || plen > 128) {
			ret = -EINVAL;
			goto out_err;
		}

		ret = nf_defrag_ipv6_enable(ipvs->net);
		if (ret)
			goto out_err;
	}
#endif

	/* The old table can be freed, protect it with RCU */
	rcu_read_lock();
	t = rcu_dereference(ipvs->svc_table);
	if (!t) {
		int lfactor = sysctl_svc_lfactor(ipvs);
		int new_size = ip_vs_svc_desired_size(ipvs, NULL, lfactor);

		rcu_read_unlock();
		/* No table yet: allocate one, it is published further below */
		t_new = ip_vs_svc_table_alloc(ipvs, new_size, lfactor);
		if (!t_new) {
			ret = -ENOMEM;
			goto out_err;
		}
		grow = false;
	} else {
		/* Even the currently attached new table may need to grow */
		t = rcu_dereference(t->new_tbl);
		grow = ip_vs_get_num_services(ipvs) + 1 > t->u_thresh;
		rcu_read_unlock();
	}

	/* Allocate the connection table if there is none yet */
	if (!rcu_dereference_protected(ipvs->conn_tab, 1)) {
		int lfactor = sysctl_conn_lfactor(ipvs);
		int new_size = ip_vs_conn_desired_size(ipvs, NULL, lfactor);

		tc_new = ip_vs_conn_tab_alloc(ipvs, new_size, lfactor);
		if (!tc_new) {
			ret = -ENOMEM;
			goto out_err;
		}
	}

	/* First service for this address family: register the hooks */
	if (!atomic_read(&ipvs->num_services[af_id])) {
		ret = ip_vs_register_hooks(ipvs, u->af);
		if (ret < 0)
			goto out_err;
		ret_hooks = ret;
	}

	svc = kzalloc_obj(struct ip_vs_service);
	if (svc == NULL) {
		IP_VS_DBG(1, "%s(): no memory\n", __func__);
		ret = -ENOMEM;
		goto out_err;
	}
	ret = ip_vs_stats_init_alloc(&svc->stats);
	if (ret < 0)
		goto out_err;

	/* I'm the first user of the service */
	atomic_set(&svc->refcnt, 0);

	svc->af = u->af;
	svc->protocol = u->protocol;
	ip_vs_addr_copy(svc->af, &svc->addr, &u->addr);
	svc->port = u->port;
	svc->fwmark = u->fwmark;
	/* IP_VS_SVC_F_HASHED is never taken from the request */
	svc->flags = u->flags & ~IP_VS_SVC_F_HASHED;
	svc->timeout = u->timeout * HZ;
	svc->netmask = u->netmask;
	svc->ipvs = ipvs;

	INIT_LIST_HEAD(&svc->destinations);
	spin_lock_init(&svc->sched_lock);

	/* Bind the scheduler */
	if (sched) {
		ret = ip_vs_bind_scheduler(svc, sched);
		if (ret)
			goto out_err;
	}

	ret = ip_vs_start_estimator(ipvs, &svc->stats);
	if (ret < 0)
		goto out_err;

	/* No failure paths below this point. Publish the new tables and
	 * clear the local pointers.
	 */
	if (t_new) {
		/* Add table for first time */
		clear_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags);
		rcu_assign_pointer(ipvs->svc_table, t_new);
		t_new = NULL;
	}
	if (tc_new) {
		rcu_assign_pointer(ipvs->conn_tab, tc_new);
		tc_new = NULL;
	}

	/* Update the virtual service counters */
	if (svc->port == FTPPORT)
		atomic_inc(&ipvs->ftpsvc_counter[af_id]);
	else if (!svc->port && !svc->fwmark)
		atomic_inc(&ipvs->nullsvc_counter[af_id]);
	if (pe && pe->conn_out)
		atomic_inc(&ipvs->conn_out_counter[af_id]);

	/* Bind the ct retriever */
	RCU_INIT_POINTER(svc->pe, pe);
	pe = NULL;

	if (svc->fwmark)
		atomic_inc(&ipvs->fwm_services[af_id]);
	else
		atomic_inc(&ipvs->nonfwm_services[af_id]);
	atomic_inc(&ipvs->num_services[af_id]);

	/* Hash the service into the service table */
	ip_vs_svc_hash(svc);

	/* Schedule resize work */
	if (grow && !test_and_set_bit(IP_VS_WORK_SVC_RESIZE, &ipvs->work_flags))
		queue_delayed_work(system_unbound_wq, &ipvs->svc_resize_work,
				   1);

	*svc_p = svc;

	if (!READ_ONCE(ipvs->enable)) {
		mutex_lock(&ipvs->est_mutex);

		/* Now there is a service - full throttle */
		WRITE_ONCE(ipvs->enable, 1);

		ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);

		/* Start estimation for first time */
		ip_vs_est_reload_start(ipvs, true);
		mutex_unlock(&ipvs->est_mutex);
	}

	return 0;


 out_err:
	/* Undo whatever was acquired before the failure */
	if (tc_new)
		ip_vs_rht_free(tc_new);
	if (t_new)
		ip_vs_rht_free(t_new);
	if (ret_hooks >= 0)
		ip_vs_unregister_hooks(ipvs, u->af);
	if (svc != NULL) {
		ip_vs_unbind_scheduler(svc);
		ip_vs_service_free(svc);
	}
	ip_vs_scheduler_put(sched);
	ip_vs_pe_put(pe);

	/* decrease the module use count */
	ip_vs_use_count_dec();

	return ret;
}
1912
1913
/*
 *	Edit a service and bind it with a new scheduler
 *
 *	Replaces the scheduler and persistence engine of svc with the ones
 *	named in u and updates flags, timeout and netmask.
 *
 *	Returns 0 on success, -ENOENT for an unknown scheduler or persistence
 *	engine, -EINVAL for a bad IPv6 netmask, or the error returned by
 *	ip_vs_bind_scheduler().
 *
 *	old_sched and old_pe name the references that are put at "out": the
 *	newly looked up ones while validation can still fail, the replaced
 *	ones after a successful switch.
 */
static int
ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
{
	struct ip_vs_scheduler *sched = NULL, *old_sched;
	struct ip_vs_pe *pe = NULL, *old_pe = NULL;
	int ret = 0;
	bool new_pe_conn_out, old_pe_conn_out;
	struct netns_ipvs *ipvs = svc->ipvs;
	int af_id = ip_vs_af_index(svc->af);

	/*
	 * Lookup the scheduler, by 'u->sched_name'
	 */
	if (strcmp(u->sched_name, "none")) {
		sched = ip_vs_scheduler_get(u->sched_name);
		if (!sched) {
			pr_info("Scheduler module ip_vs_%s not found\n",
				u->sched_name);
			return -ENOENT;
		}
	}
	/* Until the switch below, an error must put the new scheduler */
	old_sched = sched;

	if (u->pe_name && *u->pe_name) {
		pe = ip_vs_pe_getbyname(u->pe_name);
		if (pe == NULL) {
			pr_info("persistence engine module ip_vs_pe_%s "
				"not found\n", u->pe_name);
			ret = -ENOENT;
			goto out;
		}
		/* Likewise, an error must put the new pe */
		old_pe = pe;
	}

#ifdef CONFIG_IP_VS_IPV6
	if (u->af == AF_INET6) {
		__u32 plen = (__force __u32) u->netmask;

		if (plen < 1 || plen > 128) {
			ret = -EINVAL;
			goto out;
		}
	}
#endif

	/* From here on old_sched is the scheduler currently bound to svc */
	old_sched = rcu_dereference_protected(svc->scheduler, 1);
	if (sched != old_sched) {
		if (old_sched) {
			ip_vs_unbind_scheduler(svc);
			/* Wait all svc->scheduler/sched_data users */
			synchronize_rcu();
		}
		/* Bind the new scheduler */
		if (sched) {
			ret = ip_vs_bind_scheduler(svc, sched);
			if (ret) {
				ip_vs_scheduler_put(sched);
				/* Try to restore the old_sched. If that
				 * works its reference stays with svc, so
				 * it must not be put at "out".
				 */
				if (old_sched &&
				    !ip_vs_bind_scheduler(svc, old_sched))
					old_sched = NULL;
				goto out;
			}
		}
	}

	/*
	 * Set the flags and timeout value
	 */
	svc->flags = u->flags | IP_VS_SVC_F_HASHED;
	svc->timeout = u->timeout * HZ;
	svc->netmask = u->netmask;

	/* From here on old_pe is the pe currently attached to svc */
	old_pe = rcu_dereference_protected(svc->pe, 1);
	if (pe != old_pe) {
		rcu_assign_pointer(svc->pe, pe);
		/* check for optional methods in new pe */
		new_pe_conn_out = (pe && pe->conn_out) ? true : false;
		old_pe_conn_out = (old_pe && old_pe->conn_out) ? true : false;
		/* Adjust conn_out_counter only when conn_out presence changes */
		if (new_pe_conn_out && !old_pe_conn_out)
			atomic_inc(&ipvs->conn_out_counter[af_id]);
		if (old_pe_conn_out && !new_pe_conn_out)
			atomic_dec(&ipvs->conn_out_counter[af_id]);
	}

out:
	ip_vs_scheduler_put(old_sched);
	ip_vs_pe_put(old_pe);
	return ret;
}
2007
/*
 *	Delete a service from the service list
 *	- The service must be unlinked, unlocked and not referenced!
 *	- We are called under _bh lock
 *	  NOTE(review): no _bh lock is taken or asserted in this function;
 *	  confirm whether this line still applies.
 *
 *	Updates the per-family service counters (unregistering the hooks when
 *	the last service of the family goes away), stops the estimator,
 *	unbinds the scheduler, puts the persistence engine, moves all
 *	destinations into the trash and finally puts the service and the
 *	module use count. cleanup is passed on to __ip_vs_del_dest().
 */
static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
{
	struct ip_vs_dest *dest, *nxt;
	struct ip_vs_scheduler *old_sched;
	struct ip_vs_pe *old_pe;
	struct netns_ipvs *ipvs = svc->ipvs;
	int af_id = ip_vs_af_index(svc->af);

	atomic_dec(&ipvs->num_services[af_id]);
	/* Last service of this address family: unregister its hooks */
	if (!atomic_read(&ipvs->num_services[af_id]))
		ip_vs_unregister_hooks(ipvs, svc->af);
	if (svc->fwmark)
		atomic_dec(&ipvs->fwm_services[af_id]);
	else
		atomic_dec(&ipvs->nonfwm_services[af_id]);

	ip_vs_stop_estimator(svc->ipvs, &svc->stats);

	/* Unbind scheduler */
	old_sched = rcu_dereference_protected(svc->scheduler, 1);
	ip_vs_unbind_scheduler(svc);
	ip_vs_scheduler_put(old_sched);

	/* Unbind persistence engine, keep svc->pe */
	old_pe = rcu_dereference_protected(svc->pe, 1);
	if (old_pe && old_pe->conn_out)
		atomic_dec(&ipvs->conn_out_counter[af_id]);
	ip_vs_pe_put(old_pe);

	/*
	 *    Unlink the whole destination list
	 */
	list_for_each_entry_safe(dest, nxt, &svc->destinations, n_list) {
		/* svcupd is 0: the scheduler's del_dest is not called */
		__ip_vs_unlink_dest(svc, dest, 0);
		__ip_vs_del_dest(svc->ipvs, dest, cleanup);
	}

	/*
	 *    Update the virtual service counters
	 */
	if (svc->port == FTPPORT)
		atomic_dec(&ipvs->ftpsvc_counter[af_id]);
	else if (!svc->port && !svc->fwmark)
		atomic_dec(&ipvs->nullsvc_counter[af_id]);

	/*
	 *    Free the service if nobody refers to it
	 */
	__ip_vs_svc_put(svc);

	/* decrease the module use count */
	ip_vs_use_count_dec();
}
2066
2067 /*
2068 * Unlink a service from list and try to delete it if its refcnt reached 0
2069 */
ip_vs_unlink_service(struct ip_vs_service * svc,bool cleanup)2070 static void ip_vs_unlink_service(struct ip_vs_service *svc, bool cleanup)
2071 {
2072 ip_vs_unregister_conntrack(svc);
2073 /* Hold svc to avoid double release from dest_trash */
2074 atomic_inc(&svc->refcnt);
2075 /*
2076 * Unhash it from the service table
2077 */
2078 ip_vs_svc_unhash(svc);
2079
2080 __ip_vs_del_service(svc, cleanup);
2081 }
2082
/*
 *	Delete a service from the service list
 *
 *	Unlinks svc. If no services remain afterwards, the resizer is stopped
 *	and the service table chain is detached and freed via call_rcu();
 *	otherwise a shrink of the table is scheduled when the number of
 *	services has dropped to the table's lower threshold.
 *
 *	Returns 0, or -EEXIST if svc is NULL.
 */
static int ip_vs_del_service(struct ip_vs_service *svc)
{
	struct netns_ipvs *ipvs;
	struct ip_vs_rht *t, *p;
	int ns;

	if (svc == NULL)
		return -EEXIST;
	/* Read ipvs before svc is unlinked and its reference is put */
	ipvs = svc->ipvs;
	ip_vs_unlink_service(svc, false);

	/* Drop the table if no more services */
	ns = ip_vs_get_num_services(ipvs);
	if (!ns) {
		/* Stop the resizer and drop the tables */
		set_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags);
		cancel_delayed_work_sync(&ipvs->svc_resize_work);
		t = rcu_dereference_protected(ipvs->svc_table, 1);
		if (t) {
			rcu_assign_pointer(ipvs->svc_table, NULL);
			/* Inform readers that table is removed */
			smp_mb__before_atomic();
			atomic_inc(&ipvs->svc_table_changes);
			/* Free every table in the new_tbl chain; the chain
			 * ends at a table whose new_tbl points to itself.
			 */
			while (1) {
				p = rcu_dereference_protected(t->new_tbl, 1);
				call_rcu(&t->rcu_head, ip_vs_rht_rcu_free);
				if (p == t)
					break;
				t = p;
			}
		}
	} else {
		bool shrink;

		rcu_read_lock();
		t = rcu_dereference(ipvs->svc_table);
		/* Even the currently attached new table may need to shrink */
		t = rcu_dereference(t->new_tbl);
		shrink = ns <= t->l_thresh;
		rcu_read_unlock();
		/* Queue the resize work unless it is already pending */
		if (shrink && !test_and_set_bit(IP_VS_WORK_SVC_RESIZE,
						&ipvs->work_flags))
			queue_delayed_work(system_unbound_wq,
					   &ipvs->svc_resize_work, 1);
	}
	return 0;
}
2133
2134
/*
 *	Flush all the virtual services
 *
 *	Stops the resizer, unlinks every service found in ipvs->svc_table,
 *	then detaches the table chain and frees it via call_rcu(). cleanup is
 *	passed on to ip_vs_unlink_service(); when it is set the tot_stats
 *	estimator is stopped as well. Always returns 0.
 */
static int ip_vs_flush(struct netns_ipvs *ipvs, bool cleanup)
{
	DECLARE_IP_VS_RHT_WALK_BUCKETS();
	struct hlist_bl_head *head;
	struct ip_vs_service *svc;
	struct hlist_bl_node *ne;
	struct hlist_bl_node *e;
	struct ip_vs_rht *t, *p;

	/* Stop the resizer and drop the tables */
	if (!test_and_set_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
		cancel_delayed_work_sync(&ipvs->svc_resize_work);
	/* No resizer, so now we have exclusive write access */

	if (ip_vs_get_num_services(ipvs)) {
		ip_vs_rht_walk_buckets(ipvs->svc_table, head) {
			/* _safe variant: unlinking unhashes svc from head */
			hlist_bl_for_each_entry_safe(svc, e, ne, head, s_list)
				ip_vs_unlink_service(svc, cleanup);
		}
	}

	/* Unregister the hash table and release it after RCU grace period */
	t = rcu_dereference_protected(ipvs->svc_table, 1);
	if (t) {
		rcu_assign_pointer(ipvs->svc_table, NULL);
		/* Inform readers that table is removed */
		smp_mb__before_atomic();
		atomic_inc(&ipvs->svc_table_changes);
		/* Free every table in the new_tbl chain; the chain ends at
		 * a table whose new_tbl points to itself.
		 */
		while (1) {
			p = rcu_dereference_protected(t->new_tbl, 1);
			call_rcu(&t->rcu_head, ip_vs_rht_rcu_free);
			if (p == t)
				break;
			t = p;
		}
	}
	/* Stop the tot_stats estimator early under service_mutex
	 * to avoid locking it again later.
	 */
	if (cleanup)
		ip_vs_stop_estimator_tot_stats(ipvs);
	return 0;
}
2181
2182 /*
2183 * Delete service by {netns} in the service table.
2184 * Called by __ip_vs_batch_cleanup()
2185 */
ip_vs_service_nets_cleanup(struct list_head * net_list)2186 void ip_vs_service_nets_cleanup(struct list_head *net_list)
2187 {
2188 struct netns_ipvs *ipvs;
2189 struct net *net;
2190
2191 /* Check for "full" addressed entries */
2192 list_for_each_entry(net, net_list, exit_list) {
2193 ipvs = net_ipvs(net);
2194 mutex_lock(&ipvs->service_mutex);
2195 ip_vs_flush(ipvs, true);
2196 mutex_unlock(&ipvs->service_mutex);
2197 }
2198 }
2199
2200 /* Put all references for device (dst_cache) */
2201 static inline void
ip_vs_forget_dev(struct ip_vs_dest * dest,struct net_device * dev)2202 ip_vs_forget_dev(struct ip_vs_dest *dest, struct net_device *dev)
2203 {
2204 struct ip_vs_dest_dst *dest_dst;
2205
2206 spin_lock_bh(&dest->dst_lock);
2207 dest_dst = rcu_dereference_protected(dest->dest_dst, 1);
2208 if (dest_dst && dest_dst->dst_cache->dev == dev) {
2209 IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n",
2210 dev->name,
2211 IP_VS_DBG_ADDR(dest->af, &dest->addr),
2212 ntohs(dest->port),
2213 refcount_read(&dest->refcnt));
2214 __ip_vs_dst_cache_reset(dest);
2215 }
2216 spin_unlock_bh(&dest->dst_lock);
2217
2218 }
2219 /* Netdev event receiver
2220 * Currently only NETDEV_DOWN is handled to release refs to cached dsts
2221 */
ip_vs_dst_event(struct notifier_block * this,unsigned long event,void * ptr)2222 static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
2223 void *ptr)
2224 {
2225 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
2226 struct net *net = dev_net(dev);
2227 struct netns_ipvs *ipvs = net_ipvs(net);
2228 DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU();
2229 unsigned int resched_score = 0;
2230 struct hlist_bl_head *head;
2231 struct ip_vs_service *svc;
2232 struct hlist_bl_node *e;
2233 struct ip_vs_dest *dest;
2234 int old_gen;
2235
2236 if (event != NETDEV_DOWN || !ipvs)
2237 return NOTIFY_DONE;
2238 IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name);
2239
2240 /* Allow concurrent rehashing on resize but to avoid loop
2241 * serialize with installing the new table.
2242 */
2243 down_read(&ipvs->svc_replace_sem);
2244
2245 old_gen = atomic_read(&ipvs->svc_table_changes);
2246
2247 rcu_read_lock();
2248
2249 smp_rmb(); /* ipvs->svc_table and svc_table_changes */
2250 ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) {
2251 hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
2252 list_for_each_entry_rcu(dest, &svc->destinations,
2253 n_list) {
2254 ip_vs_forget_dev(dest, dev);
2255 resched_score += 10;
2256 }
2257 resched_score++;
2258 }
2259 resched_score++;
2260 if (resched_score >= 100) {
2261 cond_resched_rcu();
2262 /* Flushed? So no more dev refs */
2263 if (atomic_read(&ipvs->svc_table_changes) != old_gen)
2264 goto done;
2265 resched_score = 0;
2266 }
2267 }
2268
2269 done:
2270 rcu_read_unlock();
2271 up_read(&ipvs->svc_replace_sem);
2272
2273 return NOTIFY_DONE;
2274 }
2275
2276 /*
2277 * Zero counters in a service or all services
2278 */
ip_vs_zero_service(struct ip_vs_service * svc)2279 static int ip_vs_zero_service(struct ip_vs_service *svc)
2280 {
2281 struct ip_vs_dest *dest;
2282
2283 list_for_each_entry(dest, &svc->destinations, n_list) {
2284 ip_vs_zero_stats(&dest->stats);
2285 }
2286 ip_vs_zero_stats(&svc->stats);
2287 return 0;
2288 }
2289
ip_vs_zero_all(struct netns_ipvs * ipvs)2290 static int ip_vs_zero_all(struct netns_ipvs *ipvs)
2291 {
2292 DECLARE_IP_VS_RHT_WALK_BUCKETS_RCU();
2293 unsigned int resched_score = 0;
2294 struct hlist_bl_head *head;
2295 struct ip_vs_service *svc;
2296 struct hlist_bl_node *e;
2297
2298 /* svc_table can not be replaced (svc_replace_sem) or
2299 * removed (service_mutex)
2300 */
2301 down_read(&ipvs->svc_replace_sem);
2302 rcu_read_lock();
2303
2304 ip_vs_rht_walk_buckets_rcu(ipvs->svc_table, head) {
2305 hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
2306 ip_vs_zero_service(svc);
2307 resched_score += 10;
2308 }
2309 resched_score++;
2310 if (resched_score >= 100) {
2311 resched_score = 0;
2312 cond_resched_rcu();
2313 }
2314 }
2315
2316 rcu_read_unlock();
2317 up_read(&ipvs->svc_replace_sem);
2318
2319 ip_vs_zero_stats(&ipvs->tot_stats->s);
2320 return 0;
2321 }
2322
2323 #ifdef CONFIG_SYSCTL
2324
2325 static int
proc_do_defense_mode(const struct ctl_table * table,int write,void * buffer,size_t * lenp,loff_t * ppos)2326 proc_do_defense_mode(const struct ctl_table *table, int write,
2327 void *buffer, size_t *lenp, loff_t *ppos)
2328 {
2329 struct netns_ipvs *ipvs = table->extra2;
2330 int *valp = table->data;
2331 int val = *valp;
2332 int rc;
2333
2334 struct ctl_table tmp = {
2335 .data = &val,
2336 .maxlen = sizeof(int),
2337 .mode = table->mode,
2338 };
2339
2340 rc = proc_dointvec(&tmp, write, buffer, lenp, ppos);
2341 if (write && (*valp != val)) {
2342 if (val < 0 || val > 3) {
2343 rc = -EINVAL;
2344 } else {
2345 *valp = val;
2346 update_defense_level(ipvs);
2347 }
2348 }
2349 return rc;
2350 }
2351
2352 static int
proc_do_sync_threshold(const struct ctl_table * table,int write,void * buffer,size_t * lenp,loff_t * ppos)2353 proc_do_sync_threshold(const struct ctl_table *table, int write,
2354 void *buffer, size_t *lenp, loff_t *ppos)
2355 {
2356 struct netns_ipvs *ipvs = table->extra2;
2357 int *valp = table->data;
2358 int val[2];
2359 int rc;
2360 struct ctl_table tmp = {
2361 .data = &val,
2362 .maxlen = table->maxlen,
2363 .mode = table->mode,
2364 };
2365
2366 mutex_lock(&ipvs->sync_mutex);
2367 memcpy(val, valp, sizeof(val));
2368 rc = proc_dointvec(&tmp, write, buffer, lenp, ppos);
2369 if (write) {
2370 if (val[0] < 0 || val[1] < 0 ||
2371 (val[0] >= val[1] && val[1]))
2372 rc = -EINVAL;
2373 else
2374 memcpy(valp, val, sizeof(val));
2375 }
2376 mutex_unlock(&ipvs->sync_mutex);
2377 return rc;
2378 }
2379
2380 static int
proc_do_sync_ports(const struct ctl_table * table,int write,void * buffer,size_t * lenp,loff_t * ppos)2381 proc_do_sync_ports(const struct ctl_table *table, int write,
2382 void *buffer, size_t *lenp, loff_t *ppos)
2383 {
2384 int *valp = table->data;
2385 int val = *valp;
2386 int rc;
2387
2388 struct ctl_table tmp = {
2389 .data = &val,
2390 .maxlen = sizeof(int),
2391 .mode = table->mode,
2392 };
2393
2394 rc = proc_dointvec(&tmp, write, buffer, lenp, ppos);
2395 if (write && (*valp != val)) {
2396 if (val < 1 || !is_power_of_2(val))
2397 rc = -EINVAL;
2398 else
2399 *valp = val;
2400 }
2401 return rc;
2402 }
2403
ipvs_proc_est_cpumask_set(const struct ctl_table * table,void * buffer)2404 static int ipvs_proc_est_cpumask_set(const struct ctl_table *table,
2405 void *buffer)
2406 {
2407 struct netns_ipvs *ipvs = table->extra2;
2408 cpumask_var_t *valp = table->data;
2409 cpumask_var_t newmask;
2410 int ret;
2411
2412 if (!zalloc_cpumask_var(&newmask, GFP_KERNEL))
2413 return -ENOMEM;
2414
2415 ret = cpulist_parse(buffer, newmask);
2416 if (ret)
2417 goto out;
2418
2419 mutex_lock(&ipvs->est_mutex);
2420
2421 if (!ipvs->est_cpulist_valid) {
2422 if (!zalloc_cpumask_var(valp, GFP_KERNEL)) {
2423 ret = -ENOMEM;
2424 goto unlock;
2425 }
2426 ipvs->est_cpulist_valid = 1;
2427 }
2428 cpumask_and(newmask, newmask, ¤t->cpus_mask);
2429 cpumask_copy(*valp, newmask);
2430 /* est_max_threads may depend on cpulist size */
2431 ipvs->est_max_threads = ip_vs_est_max_threads(ipvs);
2432 ipvs->est_calc_phase = 1;
2433 ip_vs_est_reload_start(ipvs, true);
2434
2435 unlock:
2436 mutex_unlock(&ipvs->est_mutex);
2437
2438 out:
2439 free_cpumask_var(newmask);
2440 return ret;
2441 }
2442
/* Format the estimator CPU list into @buffer (at most @size bytes).
 *
 * If no list was configured (est_cpulist_valid is not set) the
 * HK_TYPE_KTHREAD housekeeping cpumask is printed instead.
 * Returns the value of scnprintf().
 */
static int ipvs_proc_est_cpumask_get(const struct ctl_table *table,
				     void *buffer, size_t size)
{
	struct netns_ipvs *ipvs = table->extra2;
	cpumask_var_t *valp = table->data;
	struct cpumask *mask;
	int ret;

	mutex_lock(&ipvs->est_mutex);

	/* HK_TYPE_KTHREAD cpumask needs RCU protection */
	scoped_guard(rcu) {
		mask = ipvs->est_cpulist_valid ?
		       *valp :
		       (struct cpumask *)housekeeping_cpumask(HK_TYPE_KTHREAD);
		ret = scnprintf(buffer, size, "%*pbl\n", cpumask_pr_args(mask));
	}

	mutex_unlock(&ipvs->est_mutex);

	return ret;
}
2466
/* sysctl handler for est_cpulist.
 *
 * Only accesses at file position 0 with a non-zero length are served,
 * everything else is reported as zero bytes transferred.
 */
static int ipvs_proc_est_cpulist(const struct ctl_table *table, int write,
				 void *buffer, size_t *lenp, loff_t *ppos)
{
	int ret;

	/* Ignore both read and write(append) if *ppos not 0 */
	if (*ppos || !*lenp) {
		*lenp = 0;
		return 0;
	}

	if (write) {
		/* proc_sys_call_handler() appends terminator */
		ret = ipvs_proc_est_cpumask_set(table, buffer);
		if (ret < 0)
			return ret;
		*ppos += *lenp;
		return ret;
	}

	/* proc_sys_call_handler() allocates 1 byte for terminator */
	ret = ipvs_proc_est_cpumask_get(table, buffer, *lenp + 1);
	if (ret < 0)
		return ret;

	/* Report the number of bytes produced and advance the position */
	*lenp = ret;
	*ppos += *lenp;
	return 0;
}
2493
/* sysctl handler for est_nice.
 *
 * A written value must be within MIN_NICE..MAX_NICE, otherwise -EINVAL
 * is returned. A changed value is stored under est_mutex and the
 * estimators are asked to reload.
 */
static int ipvs_proc_est_nice(const struct ctl_table *table, int write,
			      void *buffer, size_t *lenp, loff_t *ppos)
{
	struct netns_ipvs *ipvs = table->extra2;
	int *valp = table->data;
	int val = *valp;
	struct ctl_table tmp_table = {
		.data = &val,
		.maxlen = sizeof(int),
		.mode = table->mode,
	};
	int ret;

	ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
	if (!write || ret < 0)
		return ret;

	if (val < MIN_NICE || val > MAX_NICE)
		return -EINVAL;

	mutex_lock(&ipvs->est_mutex);
	if (*valp != val) {
		*valp = val;
		ip_vs_est_reload_start(ipvs, true);
	}
	mutex_unlock(&ipvs->est_mutex);

	return ret;
}
2523
/* sysctl handler for run_estimation.
 *
 * A changed value is stored under est_mutex and the estimators are
 * asked to reload. No range check is applied to the written value.
 */
static int ipvs_proc_run_estimation(const struct ctl_table *table, int write,
				    void *buffer, size_t *lenp, loff_t *ppos)
{
	struct netns_ipvs *ipvs = table->extra2;
	int *valp = table->data;
	int val = *valp;
	struct ctl_table tmp_table = {
		.data = &val,
		.maxlen = sizeof(int),
		.mode = table->mode,
	};
	int ret;

	ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
	if (!write || ret < 0)
		return ret;

	mutex_lock(&ipvs->est_mutex);
	if (*valp != val) {
		*valp = val;
		ip_vs_est_reload_start(ipvs, true);
	}
	mutex_unlock(&ipvs->est_mutex);

	return ret;
}
2549
/* sysctl handler for conn_lfactor.
 *
 * A written value must be within -8..8, otherwise -EINVAL is returned.
 * After storing it, the connection table resize work is kicked
 * immediately if a connection table exists.
 */
static int ipvs_proc_conn_lfactor(const struct ctl_table *table, int write,
				  void *buffer, size_t *lenp, loff_t *ppos)
{
	struct netns_ipvs *ipvs = table->extra2;
	int *valp = table->data;
	int val = *valp;
	struct ctl_table tmp_table = {
		.data = &val,
		.maxlen = sizeof(int),
	};
	int ret;

	ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
	if (!write || ret < 0)
		return ret;

	if (val < -8 || val > 8)
		return -EINVAL;

	WRITE_ONCE(*valp, val);
	if (rcu_access_pointer(ipvs->conn_tab))
		mod_delayed_work(system_unbound_wq,
				 &ipvs->conn_resize_work, 0);

	return ret;
}
2576
/* sysctl handler for svc_lfactor.
 *
 * A written value must be within -8..8, otherwise -EINVAL is returned.
 * The value is stored under service_mutex. The service table resize
 * work is kicked immediately only if a service table exists, IPVS is
 * enabled and resizing is not blocked by IP_VS_WORK_SVC_NORESIZE.
 */
static int ipvs_proc_svc_lfactor(const struct ctl_table *table, int write,
				 void *buffer, size_t *lenp, loff_t *ppos)
{
	struct netns_ipvs *ipvs = table->extra2;
	int *valp = table->data;
	int val = *valp;
	struct ctl_table tmp_table = {
		.data = &val,
		.maxlen = sizeof(int),
	};
	int ret;

	ret = proc_dointvec(&tmp_table, write, buffer, lenp, ppos);
	if (!write || ret < 0)
		return ret;

	if (val < -8 || val > 8)
		return -EINVAL;

	mutex_lock(&ipvs->service_mutex);
	WRITE_ONCE(*valp, val);
	/* Make sure the services are present */
	if (rcu_access_pointer(ipvs->svc_table) &&
	    READ_ONCE(ipvs->enable) &&
	    !test_bit(IP_VS_WORK_SVC_NORESIZE, &ipvs->work_flags))
		mod_delayed_work(system_unbound_wq,
				 &ipvs->svc_resize_work, 0);
	mutex_unlock(&ipvs->service_mutex);

	return ret;
}
2609
2610 /*
2611 * IPVS sysctl table (under the /proc/sys/net/ipv4/vs/)
2612 * Do not change order or insert new entries without
2613 * align with netns init in ip_vs_control_net_init()
2614 */
2615
2616 static struct ctl_table vs_vars[] = {
2617 {
2618 .procname = "amemthresh",
2619 .maxlen = sizeof(int),
2620 .mode = 0644,
2621 .proc_handler = proc_dointvec,
2622 },
2623 {
2624 .procname = "am_droprate",
2625 .maxlen = sizeof(int),
2626 .mode = 0644,
2627 .proc_handler = proc_dointvec,
2628 },
2629 {
2630 .procname = "drop_entry",
2631 .maxlen = sizeof(int),
2632 .mode = 0644,
2633 .proc_handler = proc_do_defense_mode,
2634 },
2635 {
2636 .procname = "drop_packet",
2637 .maxlen = sizeof(int),
2638 .mode = 0644,
2639 .proc_handler = proc_do_defense_mode,
2640 },
2641 #ifdef CONFIG_IP_VS_NFCT
2642 {
2643 .procname = "conntrack",
2644 .maxlen = sizeof(int),
2645 .mode = 0644,
2646 .proc_handler = &proc_dointvec,
2647 },
2648 #endif
2649 {
2650 .procname = "secure_tcp",
2651 .maxlen = sizeof(int),
2652 .mode = 0644,
2653 .proc_handler = proc_do_defense_mode,
2654 },
2655 {
2656 .procname = "snat_reroute",
2657 .maxlen = sizeof(int),
2658 .mode = 0644,
2659 .proc_handler = &proc_dointvec,
2660 },
2661 {
2662 .procname = "sync_version",
2663 .maxlen = sizeof(int),
2664 .mode = 0644,
2665 .proc_handler = proc_dointvec_minmax,
2666 .extra1 = SYSCTL_ZERO,
2667 .extra2 = SYSCTL_ONE,
2668 },
2669 {
2670 .procname = "sync_ports",
2671 .maxlen = sizeof(int),
2672 .mode = 0644,
2673 .proc_handler = proc_do_sync_ports,
2674 },
2675 {
2676 .procname = "sync_persist_mode",
2677 .maxlen = sizeof(int),
2678 .mode = 0644,
2679 .proc_handler = proc_dointvec,
2680 },
2681 {
2682 .procname = "sync_qlen_max",
2683 .maxlen = sizeof(unsigned long),
2684 .mode = 0644,
2685 .proc_handler = proc_doulongvec_minmax,
2686 },
2687 {
2688 .procname = "sync_sock_size",
2689 .maxlen = sizeof(int),
2690 .mode = 0644,
2691 .proc_handler = proc_dointvec,
2692 },
2693 {
2694 .procname = "cache_bypass",
2695 .maxlen = sizeof(int),
2696 .mode = 0644,
2697 .proc_handler = proc_dointvec,
2698 },
2699 {
2700 .procname = "expire_nodest_conn",
2701 .maxlen = sizeof(int),
2702 .mode = 0644,
2703 .proc_handler = proc_dointvec,
2704 },
2705 {
2706 .procname = "sloppy_tcp",
2707 .maxlen = sizeof(int),
2708 .mode = 0644,
2709 .proc_handler = proc_dointvec,
2710 },
2711 {
2712 .procname = "sloppy_sctp",
2713 .maxlen = sizeof(int),
2714 .mode = 0644,
2715 .proc_handler = proc_dointvec,
2716 },
2717 {
2718 .procname = "expire_quiescent_template",
2719 .maxlen = sizeof(int),
2720 .mode = 0644,
2721 .proc_handler = proc_dointvec,
2722 },
2723 {
2724 .procname = "sync_threshold",
2725 .maxlen =
2726 sizeof(((struct netns_ipvs *)0)->sysctl_sync_threshold),
2727 .mode = 0644,
2728 .proc_handler = proc_do_sync_threshold,
2729 },
2730 {
2731 .procname = "sync_refresh_period",
2732 .maxlen = sizeof(int),
2733 .mode = 0644,
2734 .proc_handler = proc_dointvec_jiffies,
2735 },
2736 {
2737 .procname = "sync_retries",
2738 .maxlen = sizeof(int),
2739 .mode = 0644,
2740 .proc_handler = proc_dointvec_minmax,
2741 .extra1 = SYSCTL_ZERO,
2742 .extra2 = SYSCTL_THREE,
2743 },
2744 {
2745 .procname = "nat_icmp_send",
2746 .maxlen = sizeof(int),
2747 .mode = 0644,
2748 .proc_handler = proc_dointvec,
2749 },
2750 {
2751 .procname = "pmtu_disc",
2752 .maxlen = sizeof(int),
2753 .mode = 0644,
2754 .proc_handler = proc_dointvec,
2755 },
2756 {
2757 .procname = "backup_only",
2758 .maxlen = sizeof(int),
2759 .mode = 0644,
2760 .proc_handler = proc_dointvec,
2761 },
2762 {
2763 .procname = "conn_reuse_mode",
2764 .maxlen = sizeof(int),
2765 .mode = 0644,
2766 .proc_handler = proc_dointvec,
2767 },
2768 {
2769 .procname = "schedule_icmp",
2770 .maxlen = sizeof(int),
2771 .mode = 0644,
2772 .proc_handler = proc_dointvec,
2773 },
2774 {
2775 .procname = "ignore_tunneled",
2776 .maxlen = sizeof(int),
2777 .mode = 0644,
2778 .proc_handler = proc_dointvec,
2779 },
2780 {
2781 .procname = "run_estimation",
2782 .maxlen = sizeof(int),
2783 .mode = 0644,
2784 .proc_handler = ipvs_proc_run_estimation,
2785 },
2786 {
2787 .procname = "est_cpulist",
2788 .maxlen = NR_CPUS, /* unused */
2789 .mode = 0644,
2790 .proc_handler = ipvs_proc_est_cpulist,
2791 },
2792 {
2793 .procname = "est_nice",
2794 .maxlen = sizeof(int),
2795 .mode = 0644,
2796 .proc_handler = ipvs_proc_est_nice,
2797 },
2798 {
2799 .procname = "conn_lfactor",
2800 .maxlen = sizeof(int),
2801 .mode = 0644,
2802 .proc_handler = ipvs_proc_conn_lfactor,
2803 },
2804 {
2805 .procname = "svc_lfactor",
2806 .maxlen = sizeof(int),
2807 .mode = 0644,
2808 .proc_handler = ipvs_proc_svc_lfactor,
2809 },
2810 #ifdef CONFIG_IP_VS_DEBUG
2811 {
2812 .procname = "debug_level",
2813 .data = &sysctl_ip_vs_debug_level,
2814 .maxlen = sizeof(int),
2815 .mode = 0644,
2816 .proc_handler = proc_dointvec,
2817 },
2818 #endif
2819 };
2820
2821 #endif
2822
2823 #ifdef CONFIG_PROC_FS
2824
2825 struct ip_vs_iter {
2826 struct seq_net_private p; /* Do not move this, netns depends upon it*/
2827 struct ip_vs_rht *t;
2828 u32 bucket;
2829 };
2830
2831 /*
2832 * Write the contents of the VS rule table to a PROCfs file.
2833 * (It is kept just for backward compatibility)
2834 */
ip_vs_fwd_name(unsigned int flags)2835 static inline const char *ip_vs_fwd_name(unsigned int flags)
2836 {
2837 switch (flags & IP_VS_CONN_F_FWD_MASK) {
2838 case IP_VS_CONN_F_LOCALNODE:
2839 return "Local";
2840 case IP_VS_CONN_F_TUNNEL:
2841 return "Tunnel";
2842 case IP_VS_CONN_F_DROUTE:
2843 return "Route";
2844 default:
2845 return "Masq";
2846 }
2847 }
2848
2849 /* Do not expect consistent view during add, del and move(table resize).
2850 * We may miss entries and even show duplicates.
2851 */
ip_vs_info_array(struct seq_file * seq,loff_t pos)2852 static struct ip_vs_service *ip_vs_info_array(struct seq_file *seq, loff_t pos)
2853 {
2854 struct ip_vs_iter *iter = seq->private;
2855 struct ip_vs_rht *t = iter->t;
2856 struct ip_vs_service *svc;
2857 struct hlist_bl_node *e;
2858 int idx;
2859
2860 if (!t)
2861 return NULL;
2862 for (idx = 0; idx < t->size; idx++) {
2863 hlist_bl_for_each_entry_rcu(svc, e, &t->buckets[idx], s_list) {
2864 if (!ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key)))
2865 break;
2866 if (pos-- == 0) {
2867 iter->bucket = idx;
2868 return svc;
2869 }
2870 }
2871 }
2872 return NULL;
2873 }
2874
/* seq_file ->start(): take the RCU read lock, remember the current
 * service table and return the header token for position 0 or the
 * service at position *pos - 1.
 */
static void *ip_vs_info_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(RCU)
{
	struct netns_ipvs *ipvs = net_ipvs(seq_file_net(seq));
	struct ip_vs_iter *iter = seq->private;

	rcu_read_lock();
	iter->t = rcu_dereference(ipvs->svc_table);

	if (!*pos)
		return SEQ_START_TOKEN;
	return ip_vs_info_array(seq, *pos - 1);
}
2886
2887
/* seq_file ->next(): advance *pos and return the service that follows
 * @v, first within the same bucket and then in the following buckets.
 * Returns NULL when there is no table or no further service.
 */
static void *ip_vs_info_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct ip_vs_iter *iter = seq->private;
	struct ip_vs_service *svc = v;
	struct hlist_bl_node *e;
	struct ip_vs_rht *t;

	++*pos;
	if (v == SEQ_START_TOKEN)
		return ip_vs_info_array(seq, 0);

	t = iter->t;
	if (!t)
		return NULL;

	/* Rest of the current bucket */
	hlist_bl_for_each_entry_continue_rcu(svc, e, s_list) {
		/* Our cursor was moved to new table ? */
		if (ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key)))
			return svc;
		break;
	}

	/* First usable entry of the following buckets */
	for (iter->bucket++; iter->bucket < t->size; iter->bucket++) {
		hlist_bl_for_each_entry_rcu(svc, e, &t->buckets[iter->bucket],
					    s_list) {
			if (ip_vs_rht_same_table(t, READ_ONCE(svc->hash_key)))
				return svc;
			break;
		}
	}
	return NULL;
}
2922
ip_vs_info_seq_stop(struct seq_file * seq,void * v)2923 static void ip_vs_info_seq_stop(struct seq_file *seq, void *v)
2924 __releases(RCU)
2925 {
2926 rcu_read_unlock();
2927 }
2928
2929
ip_vs_info_seq_show(struct seq_file * seq,void * v)2930 static int ip_vs_info_seq_show(struct seq_file *seq, void *v)
2931 {
2932 struct net *net = seq_file_net(seq);
2933 struct netns_ipvs *ipvs = net_ipvs(net);
2934
2935 if (v == SEQ_START_TOKEN) {
2936 seq_printf(seq,
2937 "IP Virtual Server version %d.%d.%d (size=%d)\n",
2938 NVERSION(IP_VS_VERSION_CODE), get_conn_tab_size(ipvs));
2939 seq_puts(seq,
2940 "Prot LocalAddress:Port Scheduler Flags\n");
2941 seq_puts(seq,
2942 " -> RemoteAddress:Port Forward Weight ActiveConn InActConn\n");
2943 } else {
2944 const struct ip_vs_service *svc = v;
2945 const struct ip_vs_dest *dest;
2946 struct ip_vs_scheduler *sched = rcu_dereference(svc->scheduler);
2947 char *sched_name = sched ? sched->name : "none";
2948
2949 if (!svc->fwmark) {
2950 #ifdef CONFIG_IP_VS_IPV6
2951 if (svc->af == AF_INET6)
2952 seq_printf(seq, "%s [%pI6]:%04X %s ",
2953 ip_vs_proto_name(svc->protocol),
2954 &svc->addr.in6,
2955 ntohs(svc->port),
2956 sched_name);
2957 else
2958 #endif
2959 seq_printf(seq, "%s %08X:%04X %s %s ",
2960 ip_vs_proto_name(svc->protocol),
2961 ntohl(svc->addr.ip),
2962 ntohs(svc->port),
2963 sched_name,
2964 (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
2965 } else {
2966 seq_printf(seq, "FWM %08X %s %s",
2967 svc->fwmark, sched_name,
2968 (svc->flags & IP_VS_SVC_F_ONEPACKET)?"ops ":"");
2969 }
2970
2971 if (svc->flags & IP_VS_SVC_F_PERSISTENT)
2972 seq_printf(seq, "persistent %d %08X\n",
2973 svc->timeout,
2974 ntohl(svc->netmask));
2975 else
2976 seq_putc(seq, '\n');
2977
2978 list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
2979 #ifdef CONFIG_IP_VS_IPV6
2980 if (dest->af == AF_INET6)
2981 seq_printf(seq,
2982 " -> [%pI6]:%04X"
2983 " %-7s %-6d %-10d %-10d\n",
2984 &dest->addr.in6,
2985 ntohs(dest->port),
2986 ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
2987 atomic_read(&dest->weight),
2988 atomic_read(&dest->activeconns),
2989 atomic_read(&dest->inactconns));
2990 else
2991 #endif
2992 seq_printf(seq,
2993 " -> %08X:%04X "
2994 "%-7s %-6d %-10d %-10d\n",
2995 ntohl(dest->addr.ip),
2996 ntohs(dest->port),
2997 ip_vs_fwd_name(atomic_read(&dest->conn_flags)),
2998 atomic_read(&dest->weight),
2999 atomic_read(&dest->activeconns),
3000 atomic_read(&dest->inactconns));
3001
3002 }
3003 }
3004 return 0;
3005 }
3006
3007 static const struct seq_operations ip_vs_info_seq_ops = {
3008 .start = ip_vs_info_seq_start,
3009 .next = ip_vs_info_seq_next,
3010 .stop = ip_vs_info_seq_stop,
3011 .show = ip_vs_info_seq_show,
3012 };
3013
ip_vs_stats_show(struct seq_file * seq,void * v)3014 static int ip_vs_stats_show(struct seq_file *seq, void *v)
3015 {
3016 struct net *net = seq_file_single_net(seq);
3017 struct ip_vs_kstats show;
3018
3019 /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
3020 seq_puts(seq,
3021 " Total Incoming Outgoing Incoming Outgoing\n");
3022 seq_puts(seq,
3023 " Conns Packets Packets Bytes Bytes\n");
3024
3025 ip_vs_copy_stats(&show, &net_ipvs(net)->tot_stats->s);
3026 seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n\n",
3027 (unsigned long long)show.conns,
3028 (unsigned long long)show.inpkts,
3029 (unsigned long long)show.outpkts,
3030 (unsigned long long)show.inbytes,
3031 (unsigned long long)show.outbytes);
3032
3033 /* 01234567 01234567 01234567 0123456701234567 0123456701234567*/
3034 seq_puts(seq,
3035 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
3036 seq_printf(seq, "%8LX %8LX %8LX %16LX %16LX\n",
3037 (unsigned long long)show.cps,
3038 (unsigned long long)show.inpps,
3039 (unsigned long long)show.outpps,
3040 (unsigned long long)show.inbps,
3041 (unsigned long long)show.outbps);
3042
3043 return 0;
3044 }
3045
ip_vs_stats_percpu_show(struct seq_file * seq,void * v)3046 static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v)
3047 {
3048 struct net *net = seq_file_single_net(seq);
3049 struct ip_vs_stats *tot_stats = &net_ipvs(net)->tot_stats->s;
3050 struct ip_vs_cpu_stats __percpu *cpustats = tot_stats->cpustats;
3051 struct ip_vs_kstats kstats;
3052 int i;
3053
3054 /* 01234567 01234567 01234567 0123456701234567 0123456701234567 */
3055 seq_puts(seq,
3056 " Total Incoming Outgoing Incoming Outgoing\n");
3057 seq_puts(seq,
3058 "CPU Conns Packets Packets Bytes Bytes\n");
3059
3060 for_each_possible_cpu(i) {
3061 struct ip_vs_cpu_stats *u = per_cpu_ptr(cpustats, i);
3062 unsigned int start;
3063 u64 conns, inpkts, outpkts, inbytes, outbytes;
3064
3065 do {
3066 start = u64_stats_fetch_begin(&u->syncp);
3067 conns = u64_stats_read(&u->cnt.conns);
3068 inpkts = u64_stats_read(&u->cnt.inpkts);
3069 outpkts = u64_stats_read(&u->cnt.outpkts);
3070 inbytes = u64_stats_read(&u->cnt.inbytes);
3071 outbytes = u64_stats_read(&u->cnt.outbytes);
3072 } while (u64_stats_fetch_retry(&u->syncp, start));
3073
3074 seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n",
3075 i, (u64)conns, (u64)inpkts,
3076 (u64)outpkts, (u64)inbytes,
3077 (u64)outbytes);
3078 }
3079
3080 ip_vs_copy_stats(&kstats, tot_stats);
3081
3082 seq_printf(seq, " ~ %8LX %8LX %8LX %16LX %16LX\n\n",
3083 (unsigned long long)kstats.conns,
3084 (unsigned long long)kstats.inpkts,
3085 (unsigned long long)kstats.outpkts,
3086 (unsigned long long)kstats.inbytes,
3087 (unsigned long long)kstats.outbytes);
3088
3089 /* ... 01234567 01234567 01234567 0123456701234567 0123456701234567 */
3090 seq_puts(seq,
3091 " Conns/s Pkts/s Pkts/s Bytes/s Bytes/s\n");
3092 seq_printf(seq, " %8LX %8LX %8LX %16LX %16LX\n",
3093 kstats.cps,
3094 kstats.inpps,
3095 kstats.outpps,
3096 kstats.inbps,
3097 kstats.outbps);
3098
3099 return 0;
3100 }
3101
ip_vs_status_show(struct seq_file * seq,void * v)3102 static int ip_vs_status_show(struct seq_file *seq, void *v)
3103 {
3104 struct net *net = seq_file_single_net(seq);
3105 struct netns_ipvs *ipvs = net_ipvs(net);
3106 unsigned int resched_score = 0;
3107 struct ip_vs_conn_hnode *hn;
3108 struct hlist_bl_head *head;
3109 struct ip_vs_service *svc;
3110 struct ip_vs_rht *t, *pt;
3111 struct hlist_bl_node *e;
3112 int old_gen, new_gen;
3113 u32 counts[8];
3114 u32 bucket;
3115 u32 count;
3116 int loops;
3117 u32 sum1;
3118 u32 sum;
3119 int i;
3120
3121 /* Info for conns */
3122 rcu_read_lock();
3123
3124 t = rcu_dereference(ipvs->conn_tab);
3125
3126 seq_printf(seq, "Conns:\t%d\n", atomic_read(&ipvs->conn_count));
3127 seq_printf(seq, "Conn buckets:\t%d (%d bits, lfactor %d)\n",
3128 t ? t->size : 0, t ? t->bits : 0, t ? t->lfactor : 0);
3129
3130 if (!atomic_read(&ipvs->conn_count))
3131 goto after_conns;
3132 old_gen = atomic_read(&ipvs->conn_tab_changes);
3133 loops = 0;
3134
3135 repeat_conn:
3136 smp_rmb(); /* ipvs->conn_tab and conn_tab_changes */
3137 memset(counts, 0, sizeof(counts));
3138 ip_vs_rht_for_each_table_rcu(ipvs->conn_tab, t, pt) {
3139 for (bucket = 0; bucket < t->size; bucket++) {
3140 DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();
3141
3142 count = 0;
3143 resched_score++;
3144 ip_vs_rht_walk_bucket_rcu(t, bucket, head) {
3145 count = 0;
3146 hlist_bl_for_each_entry_rcu(hn, e, head, node) {
3147 count++;
3148 if (count >= ARRAY_SIZE(counts) - 1)
3149 break;
3150 }
3151 }
3152 resched_score += count;
3153 if (resched_score >= 100) {
3154 resched_score = 0;
3155 cond_resched_rcu();
3156 new_gen = atomic_read(&ipvs->conn_tab_changes);
3157 /* New table installed ? */
3158 if (old_gen != new_gen) {
3159 /* Too many changes? */
3160 if (++loops >= 5)
3161 goto after_conns;
3162 old_gen = new_gen;
3163 goto repeat_conn;
3164 }
3165 }
3166 counts[count]++;
3167 }
3168 }
3169 for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++)
3170 sum += counts[i];
3171 sum1 = sum - counts[0];
3172 seq_printf(seq, "Conn buckets empty:\t%u (%llu%%)\n",
3173 counts[0], div_u64((u64)counts[0] * 100U, max(sum, 1U)));
3174 for (i = 1; i < ARRAY_SIZE(counts); i++) {
3175 if (!counts[i])
3176 continue;
3177 seq_printf(seq, "Conn buckets len-%d:\t%u (%llu%%)\n",
3178 i, counts[i],
3179 div_u64((u64)counts[i] * 100U, max(sum1, 1U)));
3180 }
3181
3182 after_conns:
3183 rcu_read_unlock();
3184
3185 /* Info for services */
3186 down_read(&ipvs->svc_replace_sem);
3187 rcu_read_lock();
3188
3189 t = rcu_dereference(ipvs->svc_table);
3190
3191 count = ip_vs_get_num_services(ipvs);
3192 seq_printf(seq, "Services:\t%u\n", count);
3193 seq_printf(seq, "Service buckets:\t%d (%d bits, lfactor %d)\n",
3194 t ? t->size : 0, t ? t->bits : 0, t ? t->lfactor : 0);
3195
3196 if (!count)
3197 goto after_svc;
3198 old_gen = atomic_read(&ipvs->svc_table_changes);
3199
3200 smp_rmb(); /* ipvs->svc_table and svc_table_changes */
3201 memset(counts, 0, sizeof(counts));
3202 ip_vs_rht_for_each_table_rcu(ipvs->svc_table, t, pt) {
3203 for (bucket = 0; bucket < t->size; bucket++) {
3204 DECLARE_IP_VS_RHT_WALK_BUCKET_RCU();
3205
3206 count = 0;
3207 resched_score++;
3208 ip_vs_rht_walk_bucket_rcu(t, bucket, head) {
3209 count = 0;
3210 hlist_bl_for_each_entry_rcu(svc, e, head,
3211 s_list) {
3212 count++;
3213 if (count >= ARRAY_SIZE(counts) - 1)
3214 break;
3215 }
3216 }
3217 resched_score += count;
3218 if (resched_score >= 100) {
3219 resched_score = 0;
3220 cond_resched_rcu();
3221 /* Flushed? */
3222 if (atomic_read(&ipvs->svc_table_changes) !=
3223 old_gen)
3224 goto after_svc;
3225 }
3226 counts[count]++;
3227 }
3228 }
3229 for (sum = 0, i = 0; i < ARRAY_SIZE(counts); i++)
3230 sum += counts[i];
3231 sum1 = sum - counts[0];
3232 seq_printf(seq, "Service buckets empty:\t%u (%llu%%)\n",
3233 counts[0], div_u64((u64)counts[0] * 100U, max(sum, 1U)));
3234 for (i = 1; i < ARRAY_SIZE(counts); i++) {
3235 if (!counts[i])
3236 continue;
3237 seq_printf(seq, "Service buckets len-%d:\t%u (%llu%%)\n",
3238 i, counts[i],
3239 div_u64((u64)counts[i] * 100U, max(sum1, 1U)));
3240 }
3241
3242 after_svc:
3243 rcu_read_unlock();
3244 up_read(&ipvs->svc_replace_sem);
3245
3246 seq_printf(seq, "Stats thread slots:\t%d (max %lu)\n",
3247 ipvs->est_kt_count, ipvs->est_max_threads);
3248 seq_printf(seq, "Stats chain max len:\t%d\n", ipvs->est_chain_max);
3249 seq_printf(seq, "Stats thread ests:\t%d\n",
3250 ipvs->est_chain_max * IPVS_EST_CHAIN_FACTOR *
3251 IPVS_EST_NTICKS);
3252
3253 return 0;
3254 }
3255
3256 #endif
3257
3258 /*
3259 * Set timeout values for tcp tcpfin udp in the timeout_table.
3260 */
ip_vs_set_timeout(struct netns_ipvs * ipvs,struct ip_vs_timeout_user * u)3261 static int ip_vs_set_timeout(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u)
3262 {
3263 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
3264 struct ip_vs_proto_data *pd;
3265 #endif
3266
3267 IP_VS_DBG(2, "Setting timeout tcp:%d tcpfin:%d udp:%d\n",
3268 u->tcp_timeout,
3269 u->tcp_fin_timeout,
3270 u->udp_timeout);
3271
3272 #ifdef CONFIG_IP_VS_PROTO_TCP
3273 if (u->tcp_timeout < 0 || u->tcp_timeout > (INT_MAX / HZ) ||
3274 u->tcp_fin_timeout < 0 || u->tcp_fin_timeout > (INT_MAX / HZ)) {
3275 return -EINVAL;
3276 }
3277 #endif
3278
3279 #ifdef CONFIG_IP_VS_PROTO_UDP
3280 if (u->udp_timeout < 0 || u->udp_timeout > (INT_MAX / HZ))
3281 return -EINVAL;
3282 #endif
3283
3284 #ifdef CONFIG_IP_VS_PROTO_TCP
3285 if (u->tcp_timeout) {
3286 pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
3287 pd->timeout_table[IP_VS_TCP_S_ESTABLISHED]
3288 = u->tcp_timeout * HZ;
3289 }
3290
3291 if (u->tcp_fin_timeout) {
3292 pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
3293 pd->timeout_table[IP_VS_TCP_S_FIN_WAIT]
3294 = u->tcp_fin_timeout * HZ;
3295 }
3296 #endif
3297
3298 #ifdef CONFIG_IP_VS_PROTO_UDP
3299 if (u->udp_timeout) {
3300 pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP);
3301 pd->timeout_table[IP_VS_UDP_S_NORMAL]
3302 = u->udp_timeout * HZ;
3303 }
3304 #endif
3305 return 0;
3306 }
3307
/*
 * Index of sockopt command @cmd relative to IP_VS_BASE_CTL, used to index
 * the per-command argument length tables. The argument is parenthesized so
 * that an expression (e.g. a conditional) can be passed safely.
 */
#define CMDID(cmd)	((cmd) - IP_VS_BASE_CTL)
3309
/*
 * Argument layout of the sockopt commands that operate on a destination
 * (ADDDEST/DELDEST/EDITDEST in set_arglen[]): the service description
 * immediately followed by the destination description.
 */
struct ip_vs_svcdest_user {
	struct ip_vs_service_user	s;
	struct ip_vs_dest_user		d;
};
3314
/*
 * Exact argument length expected by each setsockopt command, indexed by
 * CMDID(cmd). Commands without an entry (IP_VS_SO_SET_FLUSH among them)
 * keep the implicit value 0, i.e. they take no argument.
 * Entries are unsigned char, so every length has to fit in 255 bytes.
 */
static const unsigned char set_arglen[CMDID(IP_VS_SO_SET_MAX) + 1] = {
	[CMDID(IP_VS_SO_SET_ADD)]         = sizeof(struct ip_vs_service_user),
	[CMDID(IP_VS_SO_SET_EDIT)]        = sizeof(struct ip_vs_service_user),
	[CMDID(IP_VS_SO_SET_DEL)]         = sizeof(struct ip_vs_service_user),
	[CMDID(IP_VS_SO_SET_ADDDEST)]     = sizeof(struct ip_vs_svcdest_user),
	[CMDID(IP_VS_SO_SET_DELDEST)]     = sizeof(struct ip_vs_svcdest_user),
	[CMDID(IP_VS_SO_SET_EDITDEST)]    = sizeof(struct ip_vs_svcdest_user),
	[CMDID(IP_VS_SO_SET_TIMEOUT)]     = sizeof(struct ip_vs_timeout_user),
	[CMDID(IP_VS_SO_SET_STARTDAEMON)] = sizeof(struct ip_vs_daemon_user),
	[CMDID(IP_VS_SO_SET_STOPDAEMON)]  = sizeof(struct ip_vs_daemon_user),
	[CMDID(IP_VS_SO_SET_ZERO)]        = sizeof(struct ip_vs_service_user),
};
3327
/*
 * One member per setsockopt command argument type, mirroring set_arglen[].
 * In the code visible here the union is only used for its size: sizeof()
 * of it is the largest argument any set command can carry.
 */
union ip_vs_set_arglen {
	struct ip_vs_service_user	field_IP_VS_SO_SET_ADD;
	struct ip_vs_service_user	field_IP_VS_SO_SET_EDIT;
	struct ip_vs_service_user	field_IP_VS_SO_SET_DEL;
	struct ip_vs_svcdest_user	field_IP_VS_SO_SET_ADDDEST;
	struct ip_vs_svcdest_user	field_IP_VS_SO_SET_DELDEST;
	struct ip_vs_svcdest_user	field_IP_VS_SO_SET_EDITDEST;
	struct ip_vs_timeout_user	field_IP_VS_SO_SET_TIMEOUT;
	struct ip_vs_daemon_user	field_IP_VS_SO_SET_STARTDAEMON;
	struct ip_vs_daemon_user	field_IP_VS_SO_SET_STOPDAEMON;
	struct ip_vs_service_user	field_IP_VS_SO_SET_ZERO;
};

/* Size of the on-stack buffer needed to hold any set command argument */
#define MAX_SET_ARGLEN	sizeof(union ip_vs_set_arglen)
3342
ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern * usvc,struct ip_vs_service_user * usvc_compat)3343 static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc,
3344 struct ip_vs_service_user *usvc_compat)
3345 {
3346 memset(usvc, 0, sizeof(*usvc));
3347
3348 usvc->af = AF_INET;
3349 usvc->protocol = usvc_compat->protocol;
3350 usvc->addr.ip = usvc_compat->addr;
3351 usvc->port = usvc_compat->port;
3352 usvc->fwmark = usvc_compat->fwmark;
3353
3354 /* Deep copy of sched_name is not needed here */
3355 usvc->sched_name = usvc_compat->sched_name;
3356
3357 usvc->flags = usvc_compat->flags;
3358 usvc->timeout = usvc_compat->timeout;
3359 usvc->netmask = usvc_compat->netmask;
3360 }
3361
ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern * udest,struct ip_vs_dest_user * udest_compat)3362 static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest,
3363 struct ip_vs_dest_user *udest_compat)
3364 {
3365 memset(udest, 0, sizeof(*udest));
3366
3367 udest->addr.ip = udest_compat->addr;
3368 udest->port = udest_compat->port;
3369 udest->conn_flags = udest_compat->conn_flags;
3370 udest->weight = udest_compat->weight;
3371 udest->u_threshold = udest_compat->u_threshold;
3372 udest->l_threshold = udest_compat->l_threshold;
3373 udest->af = AF_INET;
3374 udest->tun_type = IP_VS_CONN_F_TUNNEL_TYPE_IPIP;
3375 }
3376
/*
 * Handler for the IPVS setsockopt() commands (old, IPv4-only interface).
 *
 * @sk:  socket the option is set on; its netns selects the IPVS instance
 * @cmd: IP_VS_SO_SET_* command
 * @ptr: command argument
 * @len: argument length, has to match set_arglen[] for @cmd exactly
 *
 * The caller needs CAP_NET_ADMIN in the user namespace of the socket's
 * netns. Daemon commands are handled without service_mutex, every other
 * command runs with service_mutex held.
 *
 * Returns the result of the requested operation, or a negative errno when
 * the request fails validation (-EPERM, -EINVAL, -EFAULT, -ESRCH, -EEXIST).
 */
static int
do_ip_vs_set_ctl(struct sock *sk, int cmd, sockptr_t ptr, unsigned int len)
{
	struct net *net = sock_net(sk);
	int ret;
	unsigned char arg[MAX_SET_ARGLEN];
	struct ip_vs_service_user *usvc_compat;
	struct ip_vs_service_user_kern usvc;
	struct ip_vs_service *svc;
	struct ip_vs_dest_user *udest_compat;
	struct ip_vs_dest_user_kern udest;
	struct netns_ipvs *ipvs = net_ipvs(net);

	/* set_arglen[] stores lengths as unsigned char */
	BUILD_BUG_ON(sizeof(arg) > 255);
	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	/* The range check also keeps the set_arglen[] index in bounds */
	if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_SET_MAX)
		return -EINVAL;
	if (len != set_arglen[CMDID(cmd)]) {
		IP_VS_DBG(1, "set_ctl: len %u != %u\n",
			  len, set_arglen[CMDID(cmd)]);
		return -EINVAL;
	}

	/* len was validated above, so it never exceeds sizeof(arg) */
	if (copy_from_sockptr(arg, ptr, len) != 0)
		return -EFAULT;

	/* Handle daemons since they have another lock */
	if (cmd == IP_VS_SO_SET_STARTDAEMON ||
	    cmd == IP_VS_SO_SET_STOPDAEMON) {
		struct ip_vs_daemon_user *dm = (struct ip_vs_daemon_user *)arg;

		if (cmd == IP_VS_SO_SET_STARTDAEMON) {
			struct ipvs_sync_daemon_cfg cfg;

			memset(&cfg, 0, sizeof(cfg));
			ret = -EINVAL;
			/* Reject an empty or over-long interface name */
			if (strscpy(cfg.mcast_ifn, dm->mcast_ifn,
				    sizeof(cfg.mcast_ifn)) <= 0)
				return ret;
			cfg.syncid = dm->syncid;
			ret = start_sync_thread(ipvs, &cfg, dm->state);
		} else {
			ret = stop_sync_thread(ipvs, dm->state);
		}
		return ret;
	}

	mutex_lock(&ipvs->service_mutex);
	if (cmd == IP_VS_SO_SET_FLUSH) {
		/* Flush the virtual service */
		ret = ip_vs_flush(ipvs, false);
		goto out_unlock;
	} else if (cmd == IP_VS_SO_SET_TIMEOUT) {
		/* Set timeout values for (tcp tcpfin udp) */
		ret = ip_vs_set_timeout(ipvs, (struct ip_vs_timeout_user *)arg);
		goto out_unlock;
	} else if (!len) {
		/* No more commands with len == 0 below */
		ret = -EINVAL;
		goto out_unlock;
	}

	/* The destination part, if any, directly follows the service part */
	usvc_compat = (struct ip_vs_service_user *)arg;
	udest_compat = (struct ip_vs_dest_user *)(usvc_compat + 1);

	/* We only use the new structs internally, so copy userspace compat
	 * structs to extended internal versions.
	 * For commands whose argument is only a struct ip_vs_service_user,
	 * the bytes behind usvc_compat were not copied from userspace;
	 * udest is still filled from them but it is only passed on by the
	 * *DEST commands below.
	 */
	ip_vs_copy_usvc_compat(&usvc, usvc_compat);
	ip_vs_copy_udest_compat(&udest, udest_compat);

	if (cmd == IP_VS_SO_SET_ZERO) {
		/* if no service address is set, zero counters in all */
		if (!usvc.fwmark && !usvc.addr.ip && !usvc.port) {
			ret = ip_vs_zero_all(ipvs);
			goto out_unlock;
		}
	}

	/* The scheduler name has to be NUL terminated within its buffer */
	if ((cmd == IP_VS_SO_SET_ADD || cmd == IP_VS_SO_SET_EDIT) &&
	    strnlen(usvc.sched_name, IP_VS_SCHEDNAME_MAXLEN) ==
	    IP_VS_SCHEDNAME_MAXLEN) {
		ret = -EINVAL;
		goto out_unlock;
	}

	/* Check for valid protocol: TCP or UDP or SCTP, even for fwmark!=0 */
	if (usvc.protocol != IPPROTO_TCP && usvc.protocol != IPPROTO_UDP &&
	    usvc.protocol != IPPROTO_SCTP) {
		pr_err("set_ctl: invalid protocol: %d %pI4:%d\n",
		       usvc.protocol, &usvc.addr.ip,
		       ntohs(usvc.port));
		ret = -EFAULT;
		goto out_unlock;
	}

	/* Lookup the exact service by <protocol, addr, port> or fwmark */
	rcu_read_lock();
	if (usvc.fwmark == 0)
		svc = __ip_vs_service_find(ipvs, usvc.af, usvc.protocol,
					   &usvc.addr, usvc.port);
	else
		svc = __ip_vs_svc_fwm_find(ipvs, usvc.af, usvc.fwmark);
	rcu_read_unlock();

	/* Everything except ADD needs an existing service of that protocol */
	if (cmd != IP_VS_SO_SET_ADD
	    && (svc == NULL || svc->protocol != usvc.protocol)) {
		ret = -ESRCH;
		goto out_unlock;
	}

	switch (cmd) {
	case IP_VS_SO_SET_ADD:
		if (svc != NULL)
			ret = -EEXIST;
		else
			ret = ip_vs_add_service(ipvs, &usvc, &svc);
		break;
	case IP_VS_SO_SET_EDIT:
		ret = ip_vs_edit_service(svc, &usvc);
		break;
	case IP_VS_SO_SET_DEL:
		ret = ip_vs_del_service(svc);
		if (!ret)
			goto out_unlock;
		break;
	case IP_VS_SO_SET_ZERO:
		ret = ip_vs_zero_service(svc);
		break;
	case IP_VS_SO_SET_ADDDEST:
		ret = ip_vs_add_dest(svc, &udest);
		break;
	case IP_VS_SO_SET_EDITDEST:
		ret = ip_vs_edit_dest(svc, &udest);
		break;
	case IP_VS_SO_SET_DELDEST:
		ret = ip_vs_del_dest(svc, &udest);
		break;
	default:
		/* All valid commands were dispatched above */
		WARN_ON_ONCE(1);
		ret = -EINVAL;
		break;
	}

  out_unlock:
	mutex_unlock(&ipvs->service_mutex);
	return ret;
}
3526
3527
3528 static void
ip_vs_copy_service(struct ip_vs_service_entry * dst,struct ip_vs_service * src)3529 ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src)
3530 {
3531 struct ip_vs_scheduler *sched;
3532 struct ip_vs_kstats kstats;
3533 char *sched_name;
3534
3535 sched = rcu_dereference_protected(src->scheduler, 1);
3536 sched_name = sched ? sched->name : "none";
3537 dst->protocol = src->protocol;
3538 dst->addr = src->addr.ip;
3539 dst->port = src->port;
3540 dst->fwmark = src->fwmark;
3541 strscpy(dst->sched_name, sched_name, sizeof(dst->sched_name));
3542 dst->flags = src->flags;
3543 dst->timeout = src->timeout / HZ;
3544 dst->netmask = src->netmask;
3545 dst->num_dests = src->num_dests;
3546 ip_vs_copy_stats(&kstats, &src->stats);
3547 ip_vs_export_stats_user(&dst->stats, &kstats);
3548 }
3549
3550 static inline int
__ip_vs_get_service_entries(struct netns_ipvs * ipvs,const struct ip_vs_get_services * get,struct ip_vs_get_services __user * uptr)3551 __ip_vs_get_service_entries(struct netns_ipvs *ipvs,
3552 const struct ip_vs_get_services *get,
3553 struct ip_vs_get_services __user *uptr)
3554 {
3555 struct ip_vs_service_entry entry;
3556 DECLARE_IP_VS_RHT_WALK_BUCKETS();
3557 struct hlist_bl_head *head;
3558 struct ip_vs_service *svc;
3559 struct hlist_bl_node *e;
3560 int count = 0;
3561 int ret = 0;
3562
3563 lockdep_assert_held(&ipvs->svc_resize_sem);
3564 /* All svc_table modifications are disabled, go ahead */
3565 ip_vs_rht_walk_buckets(ipvs->svc_table, head) {
3566 hlist_bl_for_each_entry(svc, e, head, s_list) {
3567 /* Only expose IPv4 entries to old interface */
3568 if (svc->af != AF_INET)
3569 continue;
3570
3571 if (count >= get->num_services)
3572 goto out;
3573 memset(&entry, 0, sizeof(entry));
3574 ip_vs_copy_service(&entry, svc);
3575 if (copy_to_user(&uptr->entrytable[count],
3576 &entry, sizeof(entry))) {
3577 ret = -EFAULT;
3578 goto out;
3579 }
3580 count++;
3581 }
3582 }
3583
3584 out:
3585 return ret;
3586 }
3587
3588 static inline int
__ip_vs_get_dest_entries(struct netns_ipvs * ipvs,const struct ip_vs_get_dests * get,struct ip_vs_get_dests __user * uptr)3589 __ip_vs_get_dest_entries(struct netns_ipvs *ipvs, const struct ip_vs_get_dests *get,
3590 struct ip_vs_get_dests __user *uptr)
3591 {
3592 struct ip_vs_service *svc;
3593 union nf_inet_addr addr = { .ip = get->addr };
3594 int ret = 0;
3595
3596 rcu_read_lock();
3597 if (get->fwmark)
3598 svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, get->fwmark);
3599 else
3600 svc = __ip_vs_service_find(ipvs, AF_INET, get->protocol, &addr,
3601 get->port);
3602 rcu_read_unlock();
3603
3604 if (svc) {
3605 int count = 0;
3606 struct ip_vs_dest *dest;
3607 struct ip_vs_dest_entry entry;
3608 struct ip_vs_kstats kstats;
3609
3610 memset(&entry, 0, sizeof(entry));
3611 list_for_each_entry(dest, &svc->destinations, n_list) {
3612 if (count >= get->num_dests)
3613 break;
3614
3615 /* Cannot expose heterogeneous members via sockopt
3616 * interface
3617 */
3618 if (dest->af != svc->af)
3619 continue;
3620
3621 entry.addr = dest->addr.ip;
3622 entry.port = dest->port;
3623 entry.conn_flags = atomic_read(&dest->conn_flags);
3624 entry.weight = atomic_read(&dest->weight);
3625 entry.u_threshold = dest->u_threshold;
3626 entry.l_threshold = dest->l_threshold;
3627 entry.activeconns = atomic_read(&dest->activeconns);
3628 entry.inactconns = atomic_read(&dest->inactconns);
3629 entry.persistconns = atomic_read(&dest->persistconns);
3630 ip_vs_copy_stats(&kstats, &dest->stats);
3631 ip_vs_export_stats_user(&entry.stats, &kstats);
3632 if (copy_to_user(&uptr->entrytable[count],
3633 &entry, sizeof(entry))) {
3634 ret = -EFAULT;
3635 break;
3636 }
3637 count++;
3638 }
3639 } else
3640 ret = -ESRCH;
3641 return ret;
3642 }
3643
3644 static inline void
__ip_vs_get_timeouts(struct netns_ipvs * ipvs,struct ip_vs_timeout_user * u)3645 __ip_vs_get_timeouts(struct netns_ipvs *ipvs, struct ip_vs_timeout_user *u)
3646 {
3647 #if defined(CONFIG_IP_VS_PROTO_TCP) || defined(CONFIG_IP_VS_PROTO_UDP)
3648 struct ip_vs_proto_data *pd;
3649 #endif
3650
3651 memset(u, 0, sizeof (*u));
3652
3653 #ifdef CONFIG_IP_VS_PROTO_TCP
3654 pd = ip_vs_proto_data_get(ipvs, IPPROTO_TCP);
3655 u->tcp_timeout = pd->timeout_table[IP_VS_TCP_S_ESTABLISHED] / HZ;
3656 u->tcp_fin_timeout = pd->timeout_table[IP_VS_TCP_S_FIN_WAIT] / HZ;
3657 #endif
3658 #ifdef CONFIG_IP_VS_PROTO_UDP
3659 pd = ip_vs_proto_data_get(ipvs, IPPROTO_UDP);
3660 u->udp_timeout =
3661 pd->timeout_table[IP_VS_UDP_S_NORMAL] / HZ;
3662 #endif
3663 }
3664
/*
 * Minimum argument length of each getsockopt command, indexed by
 * CMDID(cmd); this is also the number of bytes do_ip_vs_get_ctl() copies
 * in from userspace before dispatching. Commands without an entry keep the
 * implicit value 0.
 */
static const unsigned char get_arglen[CMDID(IP_VS_SO_GET_MAX) + 1] = {
	[CMDID(IP_VS_SO_GET_VERSION)]  = 64,
	[CMDID(IP_VS_SO_GET_INFO)]     = sizeof(struct ip_vs_getinfo),
	[CMDID(IP_VS_SO_GET_SERVICES)] = sizeof(struct ip_vs_get_services),
	[CMDID(IP_VS_SO_GET_SERVICE)]  = sizeof(struct ip_vs_service_entry),
	[CMDID(IP_VS_SO_GET_DESTS)]    = sizeof(struct ip_vs_get_dests),
	[CMDID(IP_VS_SO_GET_TIMEOUT)]  = sizeof(struct ip_vs_timeout_user),
	[CMDID(IP_VS_SO_GET_DAEMON)]   = 2 * sizeof(struct ip_vs_daemon_user),
};
3674
/*
 * One member per getsockopt command argument type, mirroring get_arglen[].
 * In the code visible here the union is only used for its size: sizeof()
 * of it is the largest fixed-size argument any get command can carry.
 */
union ip_vs_get_arglen {
	char				field_IP_VS_SO_GET_VERSION[64];
	struct ip_vs_getinfo		field_IP_VS_SO_GET_INFO;
	struct ip_vs_get_services	field_IP_VS_SO_GET_SERVICES;
	struct ip_vs_service_entry	field_IP_VS_SO_GET_SERVICE;
	struct ip_vs_get_dests		field_IP_VS_SO_GET_DESTS;
	struct ip_vs_timeout_user	field_IP_VS_SO_GET_TIMEOUT;
	struct ip_vs_daemon_user	field_IP_VS_SO_GET_DAEMON[2];
};

/* Size of the on-stack buffer needed to hold any get command argument */
#define MAX_GET_ARGLEN	sizeof(union ip_vs_get_arglen)
3686
3687 static int
do_ip_vs_get_ctl(struct sock * sk,int cmd,void __user * user,int * len)3688 do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
3689 {
3690 unsigned char arg[MAX_GET_ARGLEN];
3691 int ret = 0;
3692 unsigned int copylen;
3693 struct net *net = sock_net(sk);
3694 struct netns_ipvs *ipvs = net_ipvs(net);
3695
3696 BUG_ON(!net);
3697 BUILD_BUG_ON(sizeof(arg) > 255);
3698 if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
3699 return -EPERM;
3700
3701 if (cmd < IP_VS_BASE_CTL || cmd > IP_VS_SO_GET_MAX)
3702 return -EINVAL;
3703
3704 copylen = get_arglen[CMDID(cmd)];
3705 if (*len < (int) copylen) {
3706 IP_VS_DBG(1, "get_ctl: len %d < %u\n", *len, copylen);
3707 return -EINVAL;
3708 }
3709
3710 if (copy_from_user(arg, user, copylen) != 0)
3711 return -EFAULT;
3712 /*
3713 * Handle daemons first since it has its own locking
3714 */
3715 if (cmd == IP_VS_SO_GET_DAEMON) {
3716 struct ip_vs_daemon_user d[2];
3717
3718 memset(&d, 0, sizeof(d));
3719 mutex_lock(&ipvs->sync_mutex);
3720 if (ipvs->sync_state & IP_VS_STATE_MASTER) {
3721 d[0].state = IP_VS_STATE_MASTER;
3722 strscpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn,
3723 sizeof(d[0].mcast_ifn));
3724 d[0].syncid = ipvs->mcfg.syncid;
3725 }
3726 if (ipvs->sync_state & IP_VS_STATE_BACKUP) {
3727 d[1].state = IP_VS_STATE_BACKUP;
3728 strscpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn,
3729 sizeof(d[1].mcast_ifn));
3730 d[1].syncid = ipvs->bcfg.syncid;
3731 }
3732 if (copy_to_user(user, &d, sizeof(d)) != 0)
3733 ret = -EFAULT;
3734 mutex_unlock(&ipvs->sync_mutex);
3735 return ret;
3736 }
3737
3738 if (cmd == IP_VS_SO_GET_SERVICES) {
3739 struct ip_vs_get_services *get;
3740 size_t size;
3741
3742 get = (struct ip_vs_get_services *)arg;
3743 size = struct_size(get, entrytable, get->num_services);
3744 if (*len != size) {
3745 pr_err("length: %u != %zu\n", *len, size);
3746 return -EINVAL;
3747 }
3748 /* Prevent modifications to the list with services.
3749 * Try reverse locking, so that we do not hold the mutex
3750 * while waiting for semaphore.
3751 */
3752 while (1) {
3753 ret = down_read_killable(&ipvs->svc_resize_sem);
3754 if (ret < 0)
3755 return ret;
3756 if (mutex_trylock(&ipvs->service_mutex))
3757 break;
3758 up_read(&ipvs->svc_resize_sem);
3759 cond_resched();
3760 }
3761 ret = __ip_vs_get_service_entries(ipvs, get, user);
3762 up_read(&ipvs->svc_resize_sem);
3763 mutex_unlock(&ipvs->service_mutex);
3764 return ret;
3765 }
3766
3767 mutex_lock(&ipvs->service_mutex);
3768 switch (cmd) {
3769 case IP_VS_SO_GET_VERSION:
3770 {
3771 char buf[64];
3772
3773 sprintf(buf, "IP Virtual Server version %d.%d.%d (size=%d)",
3774 NVERSION(IP_VS_VERSION_CODE), get_conn_tab_size(ipvs));
3775 if (copy_to_user(user, buf, strlen(buf)+1) != 0) {
3776 ret = -EFAULT;
3777 goto out;
3778 }
3779 *len = strlen(buf)+1;
3780 }
3781 break;
3782
3783 case IP_VS_SO_GET_INFO:
3784 {
3785 struct ip_vs_getinfo info;
3786
3787 info.version = IP_VS_VERSION_CODE;
3788 info.size = get_conn_tab_size(ipvs);
3789 info.num_services =
3790 atomic_read(&ipvs->num_services[IP_VS_AF_INET]);
3791 if (copy_to_user(user, &info, sizeof(info)) != 0)
3792 ret = -EFAULT;
3793 }
3794 break;
3795
3796 case IP_VS_SO_GET_SERVICE:
3797 {
3798 struct ip_vs_service_entry *entry;
3799 struct ip_vs_service *svc;
3800 union nf_inet_addr addr;
3801
3802 entry = (struct ip_vs_service_entry *)arg;
3803 addr.ip = entry->addr;
3804 rcu_read_lock();
3805 if (entry->fwmark)
3806 svc = __ip_vs_svc_fwm_find(ipvs, AF_INET, entry->fwmark);
3807 else
3808 svc = __ip_vs_service_find(ipvs, AF_INET,
3809 entry->protocol, &addr,
3810 entry->port);
3811 rcu_read_unlock();
3812 if (svc) {
3813 ip_vs_copy_service(entry, svc);
3814 if (copy_to_user(user, entry, sizeof(*entry)) != 0)
3815 ret = -EFAULT;
3816 } else
3817 ret = -ESRCH;
3818 }
3819 break;
3820
3821 case IP_VS_SO_GET_DESTS:
3822 {
3823 struct ip_vs_get_dests *get;
3824 size_t size;
3825
3826 get = (struct ip_vs_get_dests *)arg;
3827 size = struct_size(get, entrytable, get->num_dests);
3828 if (*len != size) {
3829 pr_err("length: %u != %zu\n", *len, size);
3830 ret = -EINVAL;
3831 goto out;
3832 }
3833 ret = __ip_vs_get_dest_entries(ipvs, get, user);
3834 }
3835 break;
3836
3837 case IP_VS_SO_GET_TIMEOUT:
3838 {
3839 struct ip_vs_timeout_user t;
3840
3841 __ip_vs_get_timeouts(ipvs, &t);
3842 if (copy_to_user(user, &t, sizeof(t)) != 0)
3843 ret = -EFAULT;
3844 }
3845 break;
3846
3847 default:
3848 ret = -EINVAL;
3849 }
3850
3851 out:
3852 mutex_unlock(&ipvs->service_mutex);
3853 return ret;
3854 }
3855
3856
/*
 * Hooks the IPVS sockopt handlers into the netfilter sockopt machinery for
 * PF_INET sockets. Both option ranges start at IP_VS_BASE_CTL; the optmax
 * values are one past the last command (presumably an exclusive upper
 * bound -- confirm against the nf_sockopt code).
 */
static struct nf_sockopt_ops ip_vs_sockopts = {
	.pf		= PF_INET,
	.set_optmin	= IP_VS_BASE_CTL,
	.set_optmax	= IP_VS_SO_SET_MAX+1,
	.set		= do_ip_vs_set_ctl,
	.get_optmin	= IP_VS_BASE_CTL,
	.get_optmax	= IP_VS_SO_GET_MAX+1,
	.get		= do_ip_vs_get_ctl,
	.owner		= THIS_MODULE,
};
3867
/*
 * Generic Netlink interface
 */

/* IPVS genetlink family; declared here because the dump helpers below
 * reference it when building message headers.
 */
static struct genl_family ip_vs_genl_family;
3874
/* Policy used for first-level command attributes: the service, dest and
 * daemon descriptions are nested, the three timeouts are plain u32 values.
 */
static const struct nla_policy ip_vs_cmd_policy[IPVS_CMD_ATTR_MAX + 1] = {
	[IPVS_CMD_ATTR_SERVICE]		= { .type = NLA_NESTED },
	[IPVS_CMD_ATTR_DEST]		= { .type = NLA_NESTED },
	[IPVS_CMD_ATTR_DAEMON]		= { .type = NLA_NESTED },
	[IPVS_CMD_ATTR_TIMEOUT_TCP]	= { .type = NLA_U32 },
	[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]	= { .type = NLA_U32 },
	[IPVS_CMD_ATTR_TIMEOUT_UDP]	= { .type = NLA_U32 },
};
3884
/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DAEMON.
 * The interface name is limited to IP_VS_IFNAME_MAXLEN - 1 characters;
 * the IPv6 multicast group has no explicit type, only a length of
 * sizeof(struct in6_addr).
 */
static const struct nla_policy ip_vs_daemon_policy[IPVS_DAEMON_ATTR_MAX + 1] = {
	[IPVS_DAEMON_ATTR_STATE]	= { .type = NLA_U32 },
	[IPVS_DAEMON_ATTR_MCAST_IFN]	= { .type = NLA_NUL_STRING,
					    .len = IP_VS_IFNAME_MAXLEN - 1 },
	[IPVS_DAEMON_ATTR_SYNC_ID]	= { .type = NLA_U32 },
	[IPVS_DAEMON_ATTR_SYNC_MAXLEN]	= { .type = NLA_U16 },
	[IPVS_DAEMON_ATTR_MCAST_GROUP]	= { .type = NLA_U32 },
	[IPVS_DAEMON_ATTR_MCAST_GROUP6]	= { .len = sizeof(struct in6_addr) },
	[IPVS_DAEMON_ATTR_MCAST_PORT]	= { .type = NLA_U16 },
	[IPVS_DAEMON_ATTR_MCAST_TTL]	= { .type = NLA_U8 },
};
3897
/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_SERVICE.
 * Address and flags are binary blobs limited to the size of
 * union nf_inet_addr and struct ip_vs_flags respectively.
 */
static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = {
	[IPVS_SVC_ATTR_AF]		= { .type = NLA_U16 },
	[IPVS_SVC_ATTR_PROTOCOL]	= { .type = NLA_U16 },
	[IPVS_SVC_ATTR_ADDR]		= { .type = NLA_BINARY,
					    .len = sizeof(union nf_inet_addr) },
	[IPVS_SVC_ATTR_PORT]		= { .type = NLA_U16 },
	[IPVS_SVC_ATTR_FWMARK]		= { .type = NLA_U32 },
	[IPVS_SVC_ATTR_SCHED_NAME]	= { .type = NLA_NUL_STRING,
					    .len = IP_VS_SCHEDNAME_MAXLEN - 1 },
	[IPVS_SVC_ATTR_PE_NAME]		= { .type = NLA_NUL_STRING,
					    .len = IP_VS_PENAME_MAXLEN },
	[IPVS_SVC_ATTR_FLAGS]		= { .type = NLA_BINARY,
					    .len = sizeof(struct ip_vs_flags) },
	[IPVS_SVC_ATTR_TIMEOUT]		= { .type = NLA_U32 },
	[IPVS_SVC_ATTR_NETMASK]		= { .type = NLA_U32 },
	[IPVS_SVC_ATTR_STATS]		= { .type = NLA_NESTED },
};
3916
/* Policy used for attributes in nested attribute IPVS_CMD_ATTR_DEST.
 * The address is a binary blob limited to sizeof(union nf_inet_addr).
 */
static const struct nla_policy ip_vs_dest_policy[IPVS_DEST_ATTR_MAX + 1] = {
	[IPVS_DEST_ATTR_ADDR]		= { .type = NLA_BINARY,
					    .len = sizeof(union nf_inet_addr) },
	[IPVS_DEST_ATTR_PORT]		= { .type = NLA_U16 },
	[IPVS_DEST_ATTR_FWD_METHOD]	= { .type = NLA_U32 },
	[IPVS_DEST_ATTR_WEIGHT]		= { .type = NLA_U32 },
	[IPVS_DEST_ATTR_U_THRESH]	= { .type = NLA_U32 },
	[IPVS_DEST_ATTR_L_THRESH]	= { .type = NLA_U32 },
	[IPVS_DEST_ATTR_ACTIVE_CONNS]	= { .type = NLA_U32 },
	[IPVS_DEST_ATTR_INACT_CONNS]	= { .type = NLA_U32 },
	[IPVS_DEST_ATTR_PERSIST_CONNS]	= { .type = NLA_U32 },
	[IPVS_DEST_ATTR_STATS]		= { .type = NLA_NESTED },
	[IPVS_DEST_ATTR_ADDR_FAMILY]	= { .type = NLA_U16 },
	[IPVS_DEST_ATTR_TUN_TYPE]	= { .type = NLA_U8 },
	[IPVS_DEST_ATTR_TUN_PORT]	= { .type = NLA_U16 },
	[IPVS_DEST_ATTR_TUN_FLAGS]	= { .type = NLA_U16 },
};
3935
ip_vs_genl_fill_stats(struct sk_buff * skb,int container_type,struct ip_vs_kstats * kstats)3936 static int ip_vs_genl_fill_stats(struct sk_buff *skb, int container_type,
3937 struct ip_vs_kstats *kstats)
3938 {
3939 struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type);
3940
3941 if (!nl_stats)
3942 return -EMSGSIZE;
3943
3944 if (nla_put_u32(skb, IPVS_STATS_ATTR_CONNS, (u32)kstats->conns) ||
3945 nla_put_u32(skb, IPVS_STATS_ATTR_INPKTS, (u32)kstats->inpkts) ||
3946 nla_put_u32(skb, IPVS_STATS_ATTR_OUTPKTS, (u32)kstats->outpkts) ||
3947 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes,
3948 IPVS_STATS_ATTR_PAD) ||
3949 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes,
3950 IPVS_STATS_ATTR_PAD) ||
3951 nla_put_u32(skb, IPVS_STATS_ATTR_CPS, (u32)kstats->cps) ||
3952 nla_put_u32(skb, IPVS_STATS_ATTR_INPPS, (u32)kstats->inpps) ||
3953 nla_put_u32(skb, IPVS_STATS_ATTR_OUTPPS, (u32)kstats->outpps) ||
3954 nla_put_u32(skb, IPVS_STATS_ATTR_INBPS, (u32)kstats->inbps) ||
3955 nla_put_u32(skb, IPVS_STATS_ATTR_OUTBPS, (u32)kstats->outbps))
3956 goto nla_put_failure;
3957 nla_nest_end(skb, nl_stats);
3958
3959 return 0;
3960
3961 nla_put_failure:
3962 nla_nest_cancel(skb, nl_stats);
3963 return -EMSGSIZE;
3964 }
3965
ip_vs_genl_fill_stats64(struct sk_buff * skb,int container_type,struct ip_vs_kstats * kstats)3966 static int ip_vs_genl_fill_stats64(struct sk_buff *skb, int container_type,
3967 struct ip_vs_kstats *kstats)
3968 {
3969 struct nlattr *nl_stats = nla_nest_start_noflag(skb, container_type);
3970
3971 if (!nl_stats)
3972 return -EMSGSIZE;
3973
3974 if (nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CONNS, kstats->conns,
3975 IPVS_STATS_ATTR_PAD) ||
3976 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPKTS, kstats->inpkts,
3977 IPVS_STATS_ATTR_PAD) ||
3978 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPKTS, kstats->outpkts,
3979 IPVS_STATS_ATTR_PAD) ||
3980 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBYTES, kstats->inbytes,
3981 IPVS_STATS_ATTR_PAD) ||
3982 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBYTES, kstats->outbytes,
3983 IPVS_STATS_ATTR_PAD) ||
3984 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_CPS, kstats->cps,
3985 IPVS_STATS_ATTR_PAD) ||
3986 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INPPS, kstats->inpps,
3987 IPVS_STATS_ATTR_PAD) ||
3988 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTPPS, kstats->outpps,
3989 IPVS_STATS_ATTR_PAD) ||
3990 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_INBPS, kstats->inbps,
3991 IPVS_STATS_ATTR_PAD) ||
3992 nla_put_u64_64bit(skb, IPVS_STATS_ATTR_OUTBPS, kstats->outbps,
3993 IPVS_STATS_ATTR_PAD))
3994 goto nla_put_failure;
3995 nla_nest_end(skb, nl_stats);
3996
3997 return 0;
3998
3999 nla_put_failure:
4000 nla_nest_cancel(skb, nl_stats);
4001 return -EMSGSIZE;
4002 }
4003
ip_vs_genl_fill_service(struct sk_buff * skb,struct ip_vs_service * svc)4004 static int ip_vs_genl_fill_service(struct sk_buff *skb,
4005 struct ip_vs_service *svc)
4006 {
4007 struct ip_vs_scheduler *sched;
4008 struct ip_vs_pe *pe;
4009 struct nlattr *nl_service;
4010 struct ip_vs_flags flags = { .flags = svc->flags,
4011 .mask = ~0 };
4012 struct ip_vs_kstats kstats;
4013 char *sched_name;
4014
4015 nl_service = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_SERVICE);
4016 if (!nl_service)
4017 return -EMSGSIZE;
4018
4019 if (nla_put_u16(skb, IPVS_SVC_ATTR_AF, svc->af))
4020 goto nla_put_failure;
4021 if (svc->fwmark) {
4022 if (nla_put_u32(skb, IPVS_SVC_ATTR_FWMARK, svc->fwmark))
4023 goto nla_put_failure;
4024 } else {
4025 if (nla_put_u16(skb, IPVS_SVC_ATTR_PROTOCOL, svc->protocol) ||
4026 nla_put(skb, IPVS_SVC_ATTR_ADDR, sizeof(svc->addr), &svc->addr) ||
4027 nla_put_be16(skb, IPVS_SVC_ATTR_PORT, svc->port))
4028 goto nla_put_failure;
4029 }
4030
4031 sched = rcu_dereference(svc->scheduler);
4032 sched_name = sched ? sched->name : "none";
4033 pe = rcu_dereference(svc->pe);
4034 if (nla_put_string(skb, IPVS_SVC_ATTR_SCHED_NAME, sched_name) ||
4035 (pe && nla_put_string(skb, IPVS_SVC_ATTR_PE_NAME, pe->name)) ||
4036 nla_put(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags) ||
4037 nla_put_u32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ) ||
4038 nla_put_be32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask))
4039 goto nla_put_failure;
4040 ip_vs_copy_stats(&kstats, &svc->stats);
4041 if (ip_vs_genl_fill_stats(skb, IPVS_SVC_ATTR_STATS, &kstats))
4042 goto nla_put_failure;
4043 if (ip_vs_genl_fill_stats64(skb, IPVS_SVC_ATTR_STATS64, &kstats))
4044 goto nla_put_failure;
4045
4046 nla_nest_end(skb, nl_service);
4047
4048 return 0;
4049
4050 nla_put_failure:
4051 nla_nest_cancel(skb, nl_service);
4052 return -EMSGSIZE;
4053 }
4054
ip_vs_genl_dump_service(struct sk_buff * skb,struct ip_vs_service * svc,struct netlink_callback * cb)4055 static int ip_vs_genl_dump_service(struct sk_buff *skb,
4056 struct ip_vs_service *svc,
4057 struct netlink_callback *cb)
4058 {
4059 void *hdr;
4060
4061 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
4062 &ip_vs_genl_family, NLM_F_MULTI,
4063 IPVS_CMD_NEW_SERVICE);
4064 if (!hdr)
4065 return -EMSGSIZE;
4066
4067 if (ip_vs_genl_fill_service(skb, svc) < 0)
4068 goto nla_put_failure;
4069
4070 genlmsg_end(skb, hdr);
4071 return 0;
4072
4073 nla_put_failure:
4074 genlmsg_cancel(skb, hdr);
4075 return -EMSGSIZE;
4076 }
4077
/*
 * Netlink dump callback that emits one message per service.
 *
 * cb->args[0] holds the number of services already dumped by previous
 * invocations; they are skipped by counting. When a service does not fit
 * into @skb the walk stops and that service is retried on the next
 * invocation. Returns skb->len.
 */
static int ip_vs_genl_dump_services(struct sk_buff *skb,
				    struct netlink_callback *cb)
{
	DECLARE_IP_VS_RHT_WALK_BUCKETS_SAFE_RCU();
	struct net *net = sock_net(skb->sk);
	struct netns_ipvs *ipvs = net_ipvs(net);
	struct hlist_bl_head *head;
	struct ip_vs_service *svc;
	struct hlist_bl_node *e;
	int start = cb->args[0];
	int idx = 0;

	/* Make sure we do not see same service twice during resize */
	down_read(&ipvs->svc_resize_sem);
	rcu_read_lock();
	ip_vs_rht_walk_buckets_safe_rcu(ipvs->svc_table, head) {
		hlist_bl_for_each_entry_rcu(svc, e, head, s_list) {
			/* Skip what earlier invocations already dumped */
			if (++idx <= start)
				continue;
			if (ip_vs_genl_dump_service(skb, svc, cb) < 0) {
				/* Not dumped: do not count this service */
				idx--;
				goto nla_put_failure;
			}
		}
	}

	/* Reached both when the walk completes and when skb is full */
nla_put_failure:
	rcu_read_unlock();
	up_read(&ipvs->svc_resize_sem);
	cb->args[0] = idx;

	return skb->len;
}
4111
/*
 * Tell whether @af is an address family IPVS can work with: AF_INET always,
 * AF_INET6 only if IPVS is built with IPv6 support and ipv6_mod_enabled()
 * reports true.
 */
static bool ip_vs_is_af_valid(int af)
{
	switch (af) {
	case AF_INET:
		return true;
#ifdef CONFIG_IP_VS_IPV6
	case AF_INET6:
		return ipv6_mod_enabled();
#endif
	default:
		return false;
	}
}
4122
ip_vs_genl_parse_service(struct netns_ipvs * ipvs,struct ip_vs_service_user_kern * usvc,struct nlattr * nla,bool full_entry,struct ip_vs_service ** ret_svc)4123 static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs,
4124 struct ip_vs_service_user_kern *usvc,
4125 struct nlattr *nla, bool full_entry,
4126 struct ip_vs_service **ret_svc)
4127 {
4128 struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1];
4129 struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr;
4130 struct ip_vs_service *svc;
4131
4132 /* Parse mandatory identifying service fields first */
4133 if (nla == NULL ||
4134 nla_parse_nested_deprecated(attrs, IPVS_SVC_ATTR_MAX, nla, ip_vs_svc_policy, NULL))
4135 return -EINVAL;
4136
4137 nla_af = attrs[IPVS_SVC_ATTR_AF];
4138 nla_protocol = attrs[IPVS_SVC_ATTR_PROTOCOL];
4139 nla_addr = attrs[IPVS_SVC_ATTR_ADDR];
4140 nla_port = attrs[IPVS_SVC_ATTR_PORT];
4141 nla_fwmark = attrs[IPVS_SVC_ATTR_FWMARK];
4142
4143 if (!(nla_af && (nla_fwmark || (nla_port && nla_protocol && nla_addr))))
4144 return -EINVAL;
4145
4146 memset(usvc, 0, sizeof(*usvc));
4147
4148 usvc->af = nla_get_u16(nla_af);
4149 if (!ip_vs_is_af_valid(usvc->af))
4150 return -EAFNOSUPPORT;
4151
4152 if (nla_fwmark) {
4153 usvc->protocol = IPPROTO_TCP;
4154 usvc->fwmark = nla_get_u32(nla_fwmark);
4155 } else {
4156 usvc->protocol = nla_get_u16(nla_protocol);
4157 nla_memcpy(&usvc->addr, nla_addr, sizeof(usvc->addr));
4158 usvc->port = nla_get_be16(nla_port);
4159 usvc->fwmark = 0;
4160 }
4161
4162 if (usvc->fwmark)
4163 svc = __ip_vs_svc_fwm_find(ipvs, usvc->af, usvc->fwmark);
4164 else
4165 svc = __ip_vs_service_find(ipvs, usvc->af, usvc->protocol,
4166 &usvc->addr, usvc->port);
4167 *ret_svc = svc;
4168
4169 /* If a full entry was requested, check for the additional fields */
4170 if (full_entry) {
4171 struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout,
4172 *nla_netmask;
4173 struct ip_vs_flags flags;
4174
4175 nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME];
4176 nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME];
4177 nla_flags = attrs[IPVS_SVC_ATTR_FLAGS];
4178 nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT];
4179 nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK];
4180
4181 if (!(nla_sched && nla_flags && nla_timeout && nla_netmask))
4182 return -EINVAL;
4183
4184 nla_memcpy(&flags, nla_flags, sizeof(flags));
4185
4186 /* prefill flags from service if it already exists */
4187 if (svc)
4188 usvc->flags = svc->flags;
4189
4190 /* set new flags from userland */
4191 usvc->flags = (usvc->flags & ~flags.mask) |
4192 (flags.flags & flags.mask);
4193 usvc->sched_name = nla_data(nla_sched);
4194 usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL;
4195 usvc->timeout = nla_get_u32(nla_timeout);
4196 usvc->netmask = nla_get_be32(nla_netmask);
4197 }
4198
4199 return 0;
4200 }
4201
ip_vs_genl_find_service(struct netns_ipvs * ipvs,struct nlattr * nla)4202 static struct ip_vs_service *ip_vs_genl_find_service(struct netns_ipvs *ipvs,
4203 struct nlattr *nla)
4204 {
4205 struct ip_vs_service_user_kern usvc;
4206 struct ip_vs_service *svc;
4207 int ret;
4208
4209 ret = ip_vs_genl_parse_service(ipvs, &usvc, nla, false, &svc);
4210 return ret ? ERR_PTR(ret) : svc;
4211 }
4212
ip_vs_genl_fill_dest(struct sk_buff * skb,struct ip_vs_dest * dest)4213 static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest)
4214 {
4215 struct nlattr *nl_dest;
4216 struct ip_vs_kstats kstats;
4217
4218 nl_dest = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DEST);
4219 if (!nl_dest)
4220 return -EMSGSIZE;
4221
4222 if (nla_put(skb, IPVS_DEST_ATTR_ADDR, sizeof(dest->addr), &dest->addr) ||
4223 nla_put_be16(skb, IPVS_DEST_ATTR_PORT, dest->port) ||
4224 nla_put_u32(skb, IPVS_DEST_ATTR_FWD_METHOD,
4225 (atomic_read(&dest->conn_flags) &
4226 IP_VS_CONN_F_FWD_MASK)) ||
4227 nla_put_u32(skb, IPVS_DEST_ATTR_WEIGHT,
4228 atomic_read(&dest->weight)) ||
4229 nla_put_u8(skb, IPVS_DEST_ATTR_TUN_TYPE,
4230 dest->tun_type) ||
4231 nla_put_be16(skb, IPVS_DEST_ATTR_TUN_PORT,
4232 dest->tun_port) ||
4233 nla_put_u16(skb, IPVS_DEST_ATTR_TUN_FLAGS,
4234 dest->tun_flags) ||
4235 nla_put_u32(skb, IPVS_DEST_ATTR_U_THRESH, dest->u_threshold) ||
4236 nla_put_u32(skb, IPVS_DEST_ATTR_L_THRESH, dest->l_threshold) ||
4237 nla_put_u32(skb, IPVS_DEST_ATTR_ACTIVE_CONNS,
4238 atomic_read(&dest->activeconns)) ||
4239 nla_put_u32(skb, IPVS_DEST_ATTR_INACT_CONNS,
4240 atomic_read(&dest->inactconns)) ||
4241 nla_put_u32(skb, IPVS_DEST_ATTR_PERSIST_CONNS,
4242 atomic_read(&dest->persistconns)) ||
4243 nla_put_u16(skb, IPVS_DEST_ATTR_ADDR_FAMILY, dest->af))
4244 goto nla_put_failure;
4245 ip_vs_copy_stats(&kstats, &dest->stats);
4246 if (ip_vs_genl_fill_stats(skb, IPVS_DEST_ATTR_STATS, &kstats))
4247 goto nla_put_failure;
4248 if (ip_vs_genl_fill_stats64(skb, IPVS_DEST_ATTR_STATS64, &kstats))
4249 goto nla_put_failure;
4250
4251 nla_nest_end(skb, nl_dest);
4252
4253 return 0;
4254
4255 nla_put_failure:
4256 nla_nest_cancel(skb, nl_dest);
4257 return -EMSGSIZE;
4258 }
4259
ip_vs_genl_dump_dest(struct sk_buff * skb,struct ip_vs_dest * dest,struct netlink_callback * cb)4260 static int ip_vs_genl_dump_dest(struct sk_buff *skb, struct ip_vs_dest *dest,
4261 struct netlink_callback *cb)
4262 {
4263 void *hdr;
4264
4265 hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
4266 &ip_vs_genl_family, NLM_F_MULTI,
4267 IPVS_CMD_NEW_DEST);
4268 if (!hdr)
4269 return -EMSGSIZE;
4270
4271 if (ip_vs_genl_fill_dest(skb, dest) < 0)
4272 goto nla_put_failure;
4273
4274 genlmsg_end(skb, hdr);
4275 return 0;
4276
4277 nla_put_failure:
4278 genlmsg_cancel(skb, hdr);
4279 return -EMSGSIZE;
4280 }
4281
ip_vs_genl_dump_dests(struct sk_buff * skb,struct netlink_callback * cb)4282 static int ip_vs_genl_dump_dests(struct sk_buff *skb,
4283 struct netlink_callback *cb)
4284 {
4285 int idx = 0;
4286 int start = cb->args[0];
4287 struct ip_vs_service *svc;
4288 struct ip_vs_dest *dest;
4289 struct nlattr *attrs[IPVS_CMD_ATTR_MAX + 1];
4290 struct net *net = sock_net(skb->sk);
4291 struct netns_ipvs *ipvs = net_ipvs(net);
4292
4293 rcu_read_lock();
4294
4295 /* Try to find the service for which to dump destinations */
4296 if (nlmsg_parse_deprecated(cb->nlh, GENL_HDRLEN, attrs, IPVS_CMD_ATTR_MAX, ip_vs_cmd_policy, cb->extack))
4297 goto out_err;
4298
4299
4300 svc = ip_vs_genl_find_service(ipvs, attrs[IPVS_CMD_ATTR_SERVICE]);
4301 if (IS_ERR_OR_NULL(svc))
4302 goto out_err;
4303
4304 /* Dump the destinations */
4305 list_for_each_entry_rcu(dest, &svc->destinations, n_list) {
4306 if (++idx <= start)
4307 continue;
4308 if (ip_vs_genl_dump_dest(skb, dest, cb) < 0) {
4309 idx--;
4310 goto nla_put_failure;
4311 }
4312 }
4313
4314 nla_put_failure:
4315 cb->args[0] = idx;
4316
4317 out_err:
4318 rcu_read_unlock();
4319
4320 return skb->len;
4321 }
4322
/* Parse a nested destination attribute @nla into @udest.
 *
 * The address and port are always required. The address family is optional
 * and reads as 0 when absent. With @full_entry set, the forwarding method,
 * weight and both thresholds are required as well, while the tunnel type,
 * port and flags stay optional.
 *
 * @udest is zeroed before it is filled, but only once the mandatory
 * identifying attributes have been found.
 *
 * Returns 0 on success or -EINVAL on a missing or malformed attribute.
 */
static int ip_vs_genl_parse_dest(struct ip_vs_dest_user_kern *udest,
				 struct nlattr *nla, bool full_entry)
{
	struct nlattr *attrs[IPVS_DEST_ATTR_MAX + 1];

	/* Parse mandatory identifying destination fields first */
	if (!nla)
		return -EINVAL;
	if (nla_parse_nested_deprecated(attrs, IPVS_DEST_ATTR_MAX, nla,
					ip_vs_dest_policy, NULL))
		return -EINVAL;

	if (!attrs[IPVS_DEST_ATTR_ADDR] || !attrs[IPVS_DEST_ATTR_PORT])
		return -EINVAL;

	memset(udest, 0, sizeof(*udest));

	nla_memcpy(&udest->addr, attrs[IPVS_DEST_ATTR_ADDR],
		   sizeof(udest->addr));
	udest->port = nla_get_be16(attrs[IPVS_DEST_ATTR_PORT]);
	udest->af = nla_get_u16_default(attrs[IPVS_DEST_ATTR_ADDR_FAMILY], 0);

	if (!full_entry)
		return 0;

	/* A full entry was requested: check for the additional fields */
	if (!attrs[IPVS_DEST_ATTR_FWD_METHOD] ||
	    !attrs[IPVS_DEST_ATTR_WEIGHT] ||
	    !attrs[IPVS_DEST_ATTR_U_THRESH] ||
	    !attrs[IPVS_DEST_ATTR_L_THRESH])
		return -EINVAL;

	udest->conn_flags = nla_get_u32(attrs[IPVS_DEST_ATTR_FWD_METHOD]) &
			    IP_VS_CONN_F_FWD_MASK;
	udest->weight = nla_get_u32(attrs[IPVS_DEST_ATTR_WEIGHT]);
	udest->u_threshold = nla_get_u32(attrs[IPVS_DEST_ATTR_U_THRESH]);
	udest->l_threshold = nla_get_u32(attrs[IPVS_DEST_ATTR_L_THRESH]);

	/* Tunnel settings are optional and stay zero when not supplied */
	if (attrs[IPVS_DEST_ATTR_TUN_TYPE])
		udest->tun_type = nla_get_u8(attrs[IPVS_DEST_ATTR_TUN_TYPE]);

	if (attrs[IPVS_DEST_ATTR_TUN_PORT])
		udest->tun_port = nla_get_be16(attrs[IPVS_DEST_ATTR_TUN_PORT]);

	if (attrs[IPVS_DEST_ATTR_TUN_FLAGS])
		udest->tun_flags = nla_get_u16(attrs[IPVS_DEST_ATTR_TUN_FLAGS]);

	return 0;
}
4384
/* Append a sync daemon description to @skb as a nested
 * IPVS_CMD_ATTR_DAEMON attribute.
 *
 * @state is reported as IPVS_DAEMON_ATTR_STATE; the rest comes from @c.
 * The multicast group is added only when c->mcast_af is AF_INET, or
 * AF_INET6 with CONFIG_IP_VS_IPV6 enabled.
 *
 * Returns 0 on success, or -EMSGSIZE after cancelling the nest when an
 * attribute does not fit.
 */
static int ip_vs_genl_fill_daemon(struct sk_buff *skb, __u32 state,
				  struct ipvs_sync_daemon_cfg *c)
{
	struct nlattr *nest;

	nest = nla_nest_start_noflag(skb, IPVS_CMD_ATTR_DAEMON);
	if (!nest)
		return -EMSGSIZE;

	if (nla_put_u32(skb, IPVS_DAEMON_ATTR_STATE, state))
		goto cancel;
	if (nla_put_string(skb, IPVS_DAEMON_ATTR_MCAST_IFN, c->mcast_ifn))
		goto cancel;
	if (nla_put_u32(skb, IPVS_DAEMON_ATTR_SYNC_ID, c->syncid))
		goto cancel;
	if (nla_put_u16(skb, IPVS_DAEMON_ATTR_SYNC_MAXLEN, c->sync_maxlen))
		goto cancel;
	if (nla_put_u16(skb, IPVS_DAEMON_ATTR_MCAST_PORT, c->mcast_port))
		goto cancel;
	if (nla_put_u8(skb, IPVS_DAEMON_ATTR_MCAST_TTL, c->mcast_ttl))
		goto cancel;

	switch (c->mcast_af) {
#ifdef CONFIG_IP_VS_IPV6
	case AF_INET6:
		if (nla_put_in6_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP6,
				     &c->mcast_group.in6))
			goto cancel;
		break;
#endif
	case AF_INET:
		if (nla_put_in_addr(skb, IPVS_DAEMON_ATTR_MCAST_GROUP,
				    c->mcast_group.ip))
			goto cancel;
		break;
	default:
		/* No group attribute for any other family */
		break;
	}

	nla_nest_end(skb, nest);
	return 0;

cancel:
	nla_nest_cancel(skb, nest);
	return -EMSGSIZE;
}
4420
/* Emit one IPVS_CMD_NEW_DAEMON multipart message for the sync daemon
 * described by @state and @c.
 *
 * Returns 0 on success or -EMSGSIZE when the message does not fit; a
 * started message is cancelled in that case.
 */
static int ip_vs_genl_dump_daemon(struct sk_buff *skb, __u32 state,
				  struct ipvs_sync_daemon_cfg *c,
				  struct netlink_callback *cb)
{
	void *msg_hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
				    cb->nlh->nlmsg_seq, &ip_vs_genl_family,
				    NLM_F_MULTI, IPVS_CMD_NEW_DAEMON);

	if (!msg_hdr)
		return -EMSGSIZE;

	if (ip_vs_genl_fill_daemon(skb, state, c)) {
		genlmsg_cancel(skb, msg_hdr);
		return -EMSGSIZE;
	}

	genlmsg_end(skb, msg_hdr);
	return 0;
}
4442
ip_vs_genl_dump_daemons(struct sk_buff * skb,struct netlink_callback * cb)4443 static int ip_vs_genl_dump_daemons(struct sk_buff *skb,
4444 struct netlink_callback *cb)
4445 {
4446 struct net *net = sock_net(skb->sk);
4447 struct netns_ipvs *ipvs = net_ipvs(net);
4448
4449 mutex_lock(&ipvs->sync_mutex);
4450 if ((ipvs->sync_state & IP_VS_STATE_MASTER) && !cb->args[0]) {
4451 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_MASTER,
4452 &ipvs->mcfg, cb) < 0)
4453 goto nla_put_failure;
4454
4455 cb->args[0] = 1;
4456 }
4457
4458 if ((ipvs->sync_state & IP_VS_STATE_BACKUP) && !cb->args[1]) {
4459 if (ip_vs_genl_dump_daemon(skb, IP_VS_STATE_BACKUP,
4460 &ipvs->bcfg, cb) < 0)
4461 goto nla_put_failure;
4462
4463 cb->args[1] = 1;
4464 }
4465
4466 nla_put_failure:
4467 mutex_unlock(&ipvs->sync_mutex);
4468
4469 return skb->len;
4470 }
4471
ip_vs_genl_new_daemon(struct netns_ipvs * ipvs,struct nlattr ** attrs)4472 static int ip_vs_genl_new_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
4473 {
4474 struct ipvs_sync_daemon_cfg c;
4475 struct nlattr *a;
4476 int ret;
4477
4478 memset(&c, 0, sizeof(c));
4479 if (!(attrs[IPVS_DAEMON_ATTR_STATE] &&
4480 attrs[IPVS_DAEMON_ATTR_MCAST_IFN] &&
4481 attrs[IPVS_DAEMON_ATTR_SYNC_ID]))
4482 return -EINVAL;
4483 strscpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]),
4484 sizeof(c.mcast_ifn));
4485 c.syncid = nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]);
4486
4487 a = attrs[IPVS_DAEMON_ATTR_SYNC_MAXLEN];
4488 if (a)
4489 c.sync_maxlen = nla_get_u16(a);
4490
4491 a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP];
4492 if (a) {
4493 c.mcast_af = AF_INET;
4494 c.mcast_group.ip = nla_get_in_addr(a);
4495 if (!ipv4_is_multicast(c.mcast_group.ip))
4496 return -EINVAL;
4497 } else {
4498 a = attrs[IPVS_DAEMON_ATTR_MCAST_GROUP6];
4499 if (a) {
4500 #ifdef CONFIG_IP_VS_IPV6
4501 int addr_type;
4502
4503 c.mcast_af = AF_INET6;
4504 c.mcast_group.in6 = nla_get_in6_addr(a);
4505 addr_type = ipv6_addr_type(&c.mcast_group.in6);
4506 if (!(addr_type & IPV6_ADDR_MULTICAST))
4507 return -EINVAL;
4508 #else
4509 return -EAFNOSUPPORT;
4510 #endif
4511 }
4512 }
4513
4514 a = attrs[IPVS_DAEMON_ATTR_MCAST_PORT];
4515 if (a)
4516 c.mcast_port = nla_get_u16(a);
4517
4518 a = attrs[IPVS_DAEMON_ATTR_MCAST_TTL];
4519 if (a)
4520 c.mcast_ttl = nla_get_u8(a);
4521
4522 /* The synchronization protocol is incompatible with mixed family
4523 * services
4524 */
4525 if (ipvs->mixed_address_family_dests > 0)
4526 return -EINVAL;
4527
4528 ret = start_sync_thread(ipvs, &c,
4529 nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
4530 return ret;
4531 }
4532
ip_vs_genl_del_daemon(struct netns_ipvs * ipvs,struct nlattr ** attrs)4533 static int ip_vs_genl_del_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs)
4534 {
4535 int ret;
4536
4537 if (!attrs[IPVS_DAEMON_ATTR_STATE])
4538 return -EINVAL;
4539
4540 ret = stop_sync_thread(ipvs,
4541 nla_get_u32(attrs[IPVS_DAEMON_ATTR_STATE]));
4542 return ret;
4543 }
4544
ip_vs_genl_set_config(struct netns_ipvs * ipvs,struct nlattr ** attrs)4545 static int ip_vs_genl_set_config(struct netns_ipvs *ipvs, struct nlattr **attrs)
4546 {
4547 struct ip_vs_timeout_user t;
4548
4549 __ip_vs_get_timeouts(ipvs, &t);
4550
4551 if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP])
4552 t.tcp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP]);
4553
4554 if (attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN])
4555 t.tcp_fin_timeout =
4556 nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_TCP_FIN]);
4557
4558 if (attrs[IPVS_CMD_ATTR_TIMEOUT_UDP])
4559 t.udp_timeout = nla_get_u32(attrs[IPVS_CMD_ATTR_TIMEOUT_UDP]);
4560
4561 return ip_vs_set_timeout(ipvs, &t);
4562 }
4563
ip_vs_genl_set_daemon(struct sk_buff * skb,struct genl_info * info)4564 static int ip_vs_genl_set_daemon(struct sk_buff *skb, struct genl_info *info)
4565 {
4566 int ret = -EINVAL, cmd;
4567 struct net *net = sock_net(skb->sk);
4568 struct netns_ipvs *ipvs = net_ipvs(net);
4569
4570 cmd = info->genlhdr->cmd;
4571
4572 if (cmd == IPVS_CMD_NEW_DAEMON || cmd == IPVS_CMD_DEL_DAEMON) {
4573 struct nlattr *daemon_attrs[IPVS_DAEMON_ATTR_MAX + 1];
4574
4575 if (!info->attrs[IPVS_CMD_ATTR_DAEMON] ||
4576 nla_parse_nested_deprecated(daemon_attrs, IPVS_DAEMON_ATTR_MAX, info->attrs[IPVS_CMD_ATTR_DAEMON], ip_vs_daemon_policy, info->extack))
4577 goto out;
4578
4579 if (cmd == IPVS_CMD_NEW_DAEMON)
4580 ret = ip_vs_genl_new_daemon(ipvs, daemon_attrs);
4581 else
4582 ret = ip_vs_genl_del_daemon(ipvs, daemon_attrs);
4583 }
4584
4585 out:
4586 return ret;
4587 }
4588
/* doit handler for the state-changing service/destination commands:
 * IPVS_CMD_FLUSH, IPVS_CMD_SET_CONFIG, IPVS_CMD_ZERO and the
 * NEW/SET/DEL variants of SERVICE and DEST.
 *
 * The whole handler runs under ipvs->service_mutex. The service lookup is
 * additionally wrapped in rcu_read_lock().
 *
 * Returns 0 on success or a negative errno, in particular:
 *   -ESRCH        the service does not exist (any command but NEW_SERVICE)
 *   -EEXIST       NEW_SERVICE for a service that already exists
 *   -EAFNOSUPPORT the destination address family is not valid
 *   -EINVAL       unknown command, or a mixed-family destination that is
 *                 not a tunnel or is requested while a sync daemon runs
 * plus whatever the parse helpers and the ip_vs_* operations return.
 */
static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info)
{
	bool need_full_svc = false, need_full_dest = false;
	struct ip_vs_service *svc = NULL;
	struct ip_vs_service_user_kern usvc;
	struct ip_vs_dest_user_kern udest;
	int ret = 0, cmd;
	struct net *net = sock_net(skb->sk);
	struct netns_ipvs *ipvs = net_ipvs(net);

	cmd = info->genlhdr->cmd;

	mutex_lock(&ipvs->service_mutex);

	/* Commands that do not operate on one specific service */
	if (cmd == IPVS_CMD_FLUSH) {
		ret = ip_vs_flush(ipvs, false);
		goto out;
	} else if (cmd == IPVS_CMD_SET_CONFIG) {
		ret = ip_vs_genl_set_config(ipvs, info->attrs);
		goto out;
	} else if (cmd == IPVS_CMD_ZERO &&
		   !info->attrs[IPVS_CMD_ATTR_SERVICE]) {
		/* ZERO without a service attribute applies to everything */
		ret = ip_vs_zero_all(ipvs);
		goto out;
	}

	/* All following commands require a service argument, so check if we
	 * received a valid one. We need a full service specification when
	 * adding / editing a service. Only identifying members otherwise. */
	if (cmd == IPVS_CMD_NEW_SERVICE || cmd == IPVS_CMD_SET_SERVICE)
		need_full_svc = true;

	/* We use function that requires RCU lock (hlist_bl) */
	rcu_read_lock();
	ret = ip_vs_genl_parse_service(ipvs, &usvc,
				       info->attrs[IPVS_CMD_ATTR_SERVICE],
				       need_full_svc, &svc);
	rcu_read_unlock();
	if (ret)
		goto out;

	/* Unless we're adding a new service, the service must already exist */
	if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) {
		ret = -ESRCH;
		goto out;
	}

	/* Destination commands require a valid destination argument. For
	 * adding / editing a destination, we need a full destination
	 * specification. */
	if (cmd == IPVS_CMD_NEW_DEST || cmd == IPVS_CMD_SET_DEST ||
	    cmd == IPVS_CMD_DEL_DEST) {
		if (cmd != IPVS_CMD_DEL_DEST)
			need_full_dest = true;

		ret = ip_vs_genl_parse_dest(&udest,
					    info->attrs[IPVS_CMD_ATTR_DEST],
					    need_full_dest);
		if (ret)
			goto out;

		/* Old protocols did not allow the user to specify address
		 * family, so we set it to zero instead.  We also didn't
		 * allow heterogeneous pools in the old code, so it's safe
		 * to assume that this will have the same address family as
		 * the service.
		 */
		if (udest.af == 0)
			udest.af = svc->af;

		if (!ip_vs_is_af_valid(udest.af)) {
			ret = -EAFNOSUPPORT;
			goto out;
		}

		/* Extra restrictions when adding/editing a destination whose
		 * family differs from the service's
		 */
		if (udest.af != svc->af && cmd != IPVS_CMD_DEL_DEST) {
			/* The synchronization protocol is incompatible
			 * with mixed family services
			 */
			if (ipvs->sync_state) {
				ret = -EINVAL;
				goto out;
			}

			/* Which connection types do we support? */
			switch (udest.conn_flags) {
			case IP_VS_CONN_F_TUNNEL:
				/* We are able to forward this */
				break;
			default:
				ret = -EINVAL;
				goto out;
			}
		}
	}

	/* Dispatch to the operation; svc is non-NULL here for every command
	 * except NEW_SERVICE (checked above)
	 */
	switch (cmd) {
	case IPVS_CMD_NEW_SERVICE:
		if (svc == NULL)
			ret = ip_vs_add_service(ipvs, &usvc, &svc);
		else
			ret = -EEXIST;
		break;
	case IPVS_CMD_SET_SERVICE:
		ret = ip_vs_edit_service(svc, &usvc);
		break;
	case IPVS_CMD_DEL_SERVICE:
		ret = ip_vs_del_service(svc);
		/* do not use svc, it can be freed */
		break;
	case IPVS_CMD_NEW_DEST:
		ret = ip_vs_add_dest(svc, &udest);
		break;
	case IPVS_CMD_SET_DEST:
		ret = ip_vs_edit_dest(svc, &udest);
		break;
	case IPVS_CMD_DEL_DEST:
		ret = ip_vs_del_dest(svc, &udest);
		break;
	case IPVS_CMD_ZERO:
		ret = ip_vs_zero_service(svc);
		break;
	default:
		ret = -EINVAL;
	}

out:
	mutex_unlock(&ipvs->service_mutex);

	return ret;
}
4720
/* doit handler for IPVS_CMD_GET_SERVICE, IPVS_CMD_GET_INFO and
 * IPVS_CMD_GET_CONFIG.
 *
 * Builds a single reply message whose command is the matching
 * NEW_SERVICE / SET_INFO / SET_CONFIG and sends it with genlmsg_reply().
 * The reply is assembled and sent under rcu_read_lock().
 *
 * Returns the result of genlmsg_reply() on success, otherwise:
 *   -EINVAL   unknown command
 *   -ENOMEM   reply message could not be allocated
 *   -ESRCH    GET_SERVICE for a service that does not exist
 *   -EMSGSIZE the reply did not fit into the message
 * or the error from the service lookup.
 */
static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info)
{
	struct sk_buff *msg;
	void *reply;
	int ret, cmd, reply_cmd;
	struct net *net = sock_net(skb->sk);
	struct netns_ipvs *ipvs = net_ipvs(net);

	cmd = info->genlhdr->cmd;

	/* Map the request to the command used in the reply header */
	if (cmd == IPVS_CMD_GET_SERVICE)
		reply_cmd = IPVS_CMD_NEW_SERVICE;
	else if (cmd == IPVS_CMD_GET_INFO)
		reply_cmd = IPVS_CMD_SET_INFO;
	else if (cmd == IPVS_CMD_GET_CONFIG)
		reply_cmd = IPVS_CMD_SET_CONFIG;
	else {
		pr_err("unknown Generic Netlink command\n");
		return -EINVAL;
	}

	msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
	if (!msg)
		return -ENOMEM;

	rcu_read_lock();

	reply = genlmsg_put_reply(msg, info, &ip_vs_genl_family, 0, reply_cmd);
	if (reply == NULL)
		goto nla_put_failure;

	switch (cmd) {
	case IPVS_CMD_GET_SERVICE:
	{
		struct ip_vs_service *svc;

		svc = ip_vs_genl_find_service(ipvs,
					      info->attrs[IPVS_CMD_ATTR_SERVICE]);
		if (IS_ERR(svc)) {
			ret = PTR_ERR(svc);
			goto out_err;
		} else if (svc) {
			ret = ip_vs_genl_fill_service(msg, svc);
			if (ret)
				goto nla_put_failure;
		} else {
			ret = -ESRCH;
			goto out_err;
		}

		break;
	}

	case IPVS_CMD_GET_CONFIG:
	{
		struct ip_vs_timeout_user t;

		__ip_vs_get_timeouts(ipvs, &t);
		/* Timeouts are reported only for protocols that are built in */
#ifdef CONFIG_IP_VS_PROTO_TCP
		if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP,
				t.tcp_timeout) ||
		    nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_TCP_FIN,
				t.tcp_fin_timeout))
			goto nla_put_failure;
#endif
#ifdef CONFIG_IP_VS_PROTO_UDP
		if (nla_put_u32(msg, IPVS_CMD_ATTR_TIMEOUT_UDP, t.udp_timeout))
			goto nla_put_failure;
#endif

		break;
	}

	case IPVS_CMD_GET_INFO:
		if (nla_put_u32(msg, IPVS_INFO_ATTR_VERSION,
				IP_VS_VERSION_CODE) ||
		    nla_put_u32(msg, IPVS_INFO_ATTR_CONN_TAB_SIZE,
				get_conn_tab_size(ipvs)))
			goto nla_put_failure;
		break;
	}

	genlmsg_end(msg, reply);
	ret = genlmsg_reply(msg, info);
	/* msg was handed to genlmsg_reply(): skip the nlmsg_free() below */
	goto out;

nla_put_failure:
	pr_err("not enough space in Netlink message\n");
	/* Overrides any error stored in ret by the failing helper */
	ret = -EMSGSIZE;

out_err:
	nlmsg_free(msg);
out:
	rcu_read_unlock();

	return ret;
}
4818
4819
4820 static const struct genl_small_ops ip_vs_genl_ops[] = {
4821 {
4822 .cmd = IPVS_CMD_NEW_SERVICE,
4823 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4824 .flags = GENL_ADMIN_PERM,
4825 .doit = ip_vs_genl_set_cmd,
4826 },
4827 {
4828 .cmd = IPVS_CMD_SET_SERVICE,
4829 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4830 .flags = GENL_ADMIN_PERM,
4831 .doit = ip_vs_genl_set_cmd,
4832 },
4833 {
4834 .cmd = IPVS_CMD_DEL_SERVICE,
4835 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4836 .flags = GENL_ADMIN_PERM,
4837 .doit = ip_vs_genl_set_cmd,
4838 },
4839 {
4840 .cmd = IPVS_CMD_GET_SERVICE,
4841 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4842 .flags = GENL_ADMIN_PERM,
4843 .doit = ip_vs_genl_get_cmd,
4844 .dumpit = ip_vs_genl_dump_services,
4845 },
4846 {
4847 .cmd = IPVS_CMD_NEW_DEST,
4848 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4849 .flags = GENL_ADMIN_PERM,
4850 .doit = ip_vs_genl_set_cmd,
4851 },
4852 {
4853 .cmd = IPVS_CMD_SET_DEST,
4854 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4855 .flags = GENL_ADMIN_PERM,
4856 .doit = ip_vs_genl_set_cmd,
4857 },
4858 {
4859 .cmd = IPVS_CMD_DEL_DEST,
4860 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4861 .flags = GENL_ADMIN_PERM,
4862 .doit = ip_vs_genl_set_cmd,
4863 },
4864 {
4865 .cmd = IPVS_CMD_GET_DEST,
4866 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4867 .flags = GENL_ADMIN_PERM,
4868 .dumpit = ip_vs_genl_dump_dests,
4869 },
4870 {
4871 .cmd = IPVS_CMD_NEW_DAEMON,
4872 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4873 .flags = GENL_ADMIN_PERM,
4874 .doit = ip_vs_genl_set_daemon,
4875 },
4876 {
4877 .cmd = IPVS_CMD_DEL_DAEMON,
4878 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4879 .flags = GENL_ADMIN_PERM,
4880 .doit = ip_vs_genl_set_daemon,
4881 },
4882 {
4883 .cmd = IPVS_CMD_GET_DAEMON,
4884 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4885 .flags = GENL_ADMIN_PERM,
4886 .dumpit = ip_vs_genl_dump_daemons,
4887 },
4888 {
4889 .cmd = IPVS_CMD_SET_CONFIG,
4890 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4891 .flags = GENL_ADMIN_PERM,
4892 .doit = ip_vs_genl_set_cmd,
4893 },
4894 {
4895 .cmd = IPVS_CMD_GET_CONFIG,
4896 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4897 .flags = GENL_ADMIN_PERM,
4898 .doit = ip_vs_genl_get_cmd,
4899 },
4900 {
4901 .cmd = IPVS_CMD_GET_INFO,
4902 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4903 .flags = GENL_ADMIN_PERM,
4904 .doit = ip_vs_genl_get_cmd,
4905 },
4906 {
4907 .cmd = IPVS_CMD_ZERO,
4908 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4909 .flags = GENL_ADMIN_PERM,
4910 .doit = ip_vs_genl_set_cmd,
4911 },
4912 {
4913 .cmd = IPVS_CMD_FLUSH,
4914 .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
4915 .flags = GENL_ADMIN_PERM,
4916 .doit = ip_vs_genl_set_cmd,
4917 },
4918 };
4919
4920 static struct genl_family ip_vs_genl_family __ro_after_init = {
4921 .hdrsize = 0,
4922 .name = IPVS_GENL_NAME,
4923 .version = IPVS_GENL_VERSION,
4924 .maxattr = IPVS_CMD_ATTR_MAX,
4925 .policy = ip_vs_cmd_policy,
4926 .netnsok = true, /* Make ipvsadm to work on netns */
4927 .module = THIS_MODULE,
4928 .small_ops = ip_vs_genl_ops,
4929 .n_small_ops = ARRAY_SIZE(ip_vs_genl_ops),
4930 .resv_start_op = IPVS_CMD_FLUSH + 1,
4931 .parallel_ops = 1,
4932 };
4933
ip_vs_genl_register(void)4934 static int __init ip_vs_genl_register(void)
4935 {
4936 return genl_register_family(&ip_vs_genl_family);
4937 }
4938
ip_vs_genl_unregister(void)4939 static void ip_vs_genl_unregister(void)
4940 {
4941 genl_unregister_family(&ip_vs_genl_family);
4942 }
4943
4944 /* End of Generic Netlink interface definitions */
4945
/*
 *	Per-netns init/exit functions.
 */
4949 #ifdef CONFIG_SYSCTL
/* Per-netns sysctl setup (CONFIG_SYSCTL enabled).
 *
 * Initializes the defense locks and delayed work, sets the default values
 * of the per-netns sysctl variables, points each entry of the sysctl table
 * at its per-netns storage and registers the table under "net/ipv4/vs".
 * It then starts the estimator for the total stats and schedules the
 * defense work.
 *
 * The init netns uses vs_vars directly; any other netns gets its own
 * kmemdup() copy, which is freed again on failure.
 *
 * Returns 0 on success, -ENOMEM when the table copy or the sysctl
 * registration fails, or the error from ip_vs_start_estimator().
 */
static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
{
	struct net *net = ipvs->net;
	struct ctl_table *tbl;
	int idx, ret;
	size_t ctl_table_size = ARRAY_SIZE(vs_vars);
	/* Netns owned by a non-initial user namespace */
	bool unpriv = net->user_ns != &init_user_ns;

	atomic_set(&ipvs->dropentry, 0);
	spin_lock_init(&ipvs->dropentry_lock);
	spin_lock_init(&ipvs->droppacket_lock);
	spin_lock_init(&ipvs->securetcp_lock);
	INIT_DELAYED_WORK(&ipvs->defense_work, defense_work_handler);
	INIT_DELAYED_WORK(&ipvs->expire_nodest_conn_work,
			  expire_nodest_conn_handler);
	ipvs->est_stopped = 0;

	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(vs_vars, sizeof(vs_vars), GFP_KERNEL);
		if (tbl == NULL)
			return -ENOMEM;
	} else
		tbl = vs_vars;
	/* Initialize sysctl defaults */
	for (idx = 0; idx < ARRAY_SIZE(vs_vars); idx++) {
		/* Defense-mode entries get the netns passed via extra2 */
		if (tbl[idx].proc_handler == proc_do_defense_mode)
			tbl[idx].extra2 = ipvs;
	}
	/* Walk the table by position from here on.
	 * NOTE(review): the assignments below presumably have to follow the
	 * entry order of vs_vars, which is defined outside this function -
	 * verify when adding or reordering entries.
	 */
	idx = 0;
	ipvs->sysctl_amemthresh = 1024;
	tbl[idx++].data = &ipvs->sysctl_amemthresh;
	ipvs->sysctl_am_droprate = 10;
	tbl[idx++].data = &ipvs->sysctl_am_droprate;
	tbl[idx++].data = &ipvs->sysctl_drop_entry;
	tbl[idx++].data = &ipvs->sysctl_drop_packet;
#ifdef CONFIG_IP_VS_NFCT
	tbl[idx++].data = &ipvs->sysctl_conntrack;
#endif
	tbl[idx++].data = &ipvs->sysctl_secure_tcp;
	ipvs->sysctl_snat_reroute = 1;
	tbl[idx++].data = &ipvs->sysctl_snat_reroute;
	ipvs->sysctl_sync_ver = 1;
	tbl[idx++].data = &ipvs->sysctl_sync_ver;
	ipvs->sysctl_sync_ports = 1;
	tbl[idx++].data = &ipvs->sysctl_sync_ports;
	tbl[idx++].data = &ipvs->sysctl_sync_persist_mode;

	/* The entries guarded by "unpriv" are made read-only (0444) for a
	 * netns that does not belong to the initial user namespace
	 */
	ipvs->sysctl_sync_qlen_max = nr_free_buffer_pages() / 32;
	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx++].data = &ipvs->sysctl_sync_qlen_max;

	ipvs->sysctl_sync_sock_size = 0;
	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx++].data = &ipvs->sysctl_sync_sock_size;

	tbl[idx++].data = &ipvs->sysctl_cache_bypass;
	tbl[idx++].data = &ipvs->sysctl_expire_nodest_conn;
	tbl[idx++].data = &ipvs->sysctl_sloppy_tcp;
	tbl[idx++].data = &ipvs->sysctl_sloppy_sctp;
	tbl[idx++].data = &ipvs->sysctl_expire_quiescent_template;
	ipvs->sysctl_sync_threshold[0] = DEFAULT_SYNC_THRESHOLD;
	ipvs->sysctl_sync_threshold[1] = DEFAULT_SYNC_PERIOD;
	/* sync_threshold is an array: set data, extra2 and maxlen on the
	 * same entry before moving on
	 */
	tbl[idx].data = &ipvs->sysctl_sync_threshold;
	tbl[idx].extra2 = ipvs;
	tbl[idx++].maxlen = sizeof(ipvs->sysctl_sync_threshold);
	ipvs->sysctl_sync_refresh_period = DEFAULT_SYNC_REFRESH_PERIOD;
	tbl[idx++].data = &ipvs->sysctl_sync_refresh_period;
	ipvs->sysctl_sync_retries = clamp_t(int, DEFAULT_SYNC_RETRIES, 0, 3);
	tbl[idx++].data = &ipvs->sysctl_sync_retries;
	tbl[idx++].data = &ipvs->sysctl_nat_icmp_send;
	ipvs->sysctl_pmtu_disc = 1;
	tbl[idx++].data = &ipvs->sysctl_pmtu_disc;
	tbl[idx++].data = &ipvs->sysctl_backup_only;
	ipvs->sysctl_conn_reuse_mode = 1;
	tbl[idx++].data = &ipvs->sysctl_conn_reuse_mode;
	tbl[idx++].data = &ipvs->sysctl_schedule_icmp;
	tbl[idx++].data = &ipvs->sysctl_ignore_tunneled;

	ipvs->sysctl_run_estimation = 1;
	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx].extra2 = ipvs;
	tbl[idx++].data = &ipvs->sysctl_run_estimation;

	ipvs->est_cpulist_valid = 0;
	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx].extra2 = ipvs;
	tbl[idx++].data = &ipvs->sysctl_est_cpulist;

	ipvs->sysctl_est_nice = IPVS_EST_NICE;
	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx].extra2 = ipvs;
	tbl[idx++].data = &ipvs->sysctl_est_nice;

	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx].extra2 = ipvs;
	tbl[idx++].data = &ipvs->sysctl_conn_lfactor;

	if (unpriv)
		tbl[idx].mode = 0444;
	tbl[idx].extra2 = ipvs;
	tbl[idx++].data = &ipvs->sysctl_svc_lfactor;

#ifdef CONFIG_IP_VS_DEBUG
	/* Global sysctls must be ro in non-init netns */
	if (!net_eq(net, &init_net))
		tbl[idx++].mode = 0444;
#endif

	ret = -ENOMEM;
	ipvs->sysctl_hdr = register_net_sysctl_sz(net, "net/ipv4/vs", tbl,
						  ctl_table_size);
	if (!ipvs->sysctl_hdr)
		goto err;
	ipvs->sysctl_tbl = tbl;

	ret = ip_vs_start_estimator(ipvs, &ipvs->tot_stats->s);
	if (ret < 0)
		goto err;

	/* Schedule defense work */
	queue_delayed_work(system_long_wq, &ipvs->defense_work,
			   DEFENSE_TIMER_PERIOD);

	return 0;

err:
	/* Reached with sysctl_hdr either NULL (registration failed) or
	 * registered (estimator start failed)
	 */
	unregister_net_sysctl_table(ipvs->sysctl_hdr);
	if (!net_eq(net, &init_net))
		kfree(tbl);
	return ret;
}
5087
/* Per-netns sysctl teardown (CONFIG_SYSCTL enabled).
 *
 * Cancels the delayed works set up by the init function, unregisters the
 * sysctl table, stops the total-stats estimator if it is still registered,
 * and frees the cpulist mask and the per-netns copy of the sysctl table.
 */
static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs)
{
	struct net *net = ipvs->net;

	cancel_delayed_work_sync(&ipvs->expire_nodest_conn_work);
	cancel_delayed_work_sync(&ipvs->defense_work);
	cancel_work_sync(&ipvs->defense_work.work);
	unregister_net_sysctl_table(ipvs->sysctl_hdr);
	/* NOTE(review): ktid == -2 presumably marks an estimator that was
	 * already stopped - confirm against the estimator code.
	 */
	if (ipvs->tot_stats->s.est.ktid != -2) {
		/* Not stopped yet? This happens only on netns init error and
		 * we even do not need to lock the service_mutex for this case.
		 *
		 * NOTE(review): the mutex is taken below anyway, which does
		 * not match the sentence above - confirm which is intended.
		 */
		mutex_lock(&ipvs->service_mutex);
		ip_vs_stop_estimator(ipvs, &ipvs->tot_stats->s);
		mutex_unlock(&ipvs->service_mutex);
	}

	if (ipvs->est_cpulist_valid)
		free_cpumask_var(ipvs->sysctl_est_cpulist);

	/* Only non-init netns own a kmemdup()'ed copy of the table */
	if (!net_eq(net, &init_net))
		kfree(ipvs->sysctl_tbl);
}
5111
5112 #else
5113
ip_vs_control_net_init_sysctl(struct netns_ipvs * ipvs)5114 static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs) { return 0; }
ip_vs_control_net_cleanup_sysctl(struct netns_ipvs * ipvs)5115 static void __net_exit ip_vs_control_net_cleanup_sysctl(struct netns_ipvs *ipvs) { }
5116
5117 #endif
5118
5119 static struct notifier_block ip_vs_dst_notifier = {
5120 .notifier_call = ip_vs_dst_event,
5121 #ifdef CONFIG_IP_VS_IPV6
5122 .priority = ADDRCONF_NOTIFY_PRIORITY + 5,
5123 #endif
5124 };
5125
/* Per-netns initialization of the IPVS control state.
 *
 * Sets up the service mutex, the service table state, the real-server
 * table, the destination trash list and its timer, and the per-family
 * counters. It then allocates the total stats, creates the procfs entries
 * (CONFIG_PROC_FS) and runs the sysctl setup.
 *
 * Returns 0 on success. On failure everything created so far is undone in
 * reverse order, and -ENOMEM or the error from the sysctl setup is
 * returned.
 */
int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
{
	/* Default error for the allocation and procfs failures below */
	int ret = -ENOMEM;
	int idx;

	/* Initialize service_mutex, svc_table per netns */
	__mutex_init(&ipvs->service_mutex, "ipvs->service_mutex", &__ipvs_service_key);
	init_rwsem(&ipvs->svc_resize_sem);
	init_rwsem(&ipvs->svc_replace_sem);
	INIT_DELAYED_WORK(&ipvs->svc_resize_work, svc_resize_work_handler);
	atomic_set(&ipvs->svc_table_changes, 0);
	RCU_INIT_POINTER(ipvs->svc_table, NULL);

	/* Initialize rs_table */
	for (idx = 0; idx < IP_VS_RTAB_SIZE; idx++)
		INIT_HLIST_HEAD(&ipvs->rs_table[idx]);

	INIT_LIST_HEAD(&ipvs->dest_trash);
	spin_lock_init(&ipvs->dest_trash_lock);
	timer_setup(&ipvs->dest_trash_timer, ip_vs_dest_trash_expire, 0);
	/* Reset the per-address-family counters */
	for (idx = 0; idx < IP_VS_AF_MAX; idx++) {
		atomic_set(&ipvs->num_services[idx], 0);
		atomic_set(&ipvs->fwm_services[idx], 0);
		atomic_set(&ipvs->nonfwm_services[idx], 0);
		atomic_set(&ipvs->ftpsvc_counter[idx], 0);
		atomic_set(&ipvs->nullsvc_counter[idx], 0);
		atomic_set(&ipvs->conn_out_counter[idx], 0);
	}

	INIT_DELAYED_WORK(&ipvs->est_reload_work, est_reload_work_handler);
	ipvs->sysctl_svc_lfactor = ip_vs_svc_default_load_factor(ipvs);

	/* procfs stats */
	ipvs->tot_stats = kzalloc_obj(*ipvs->tot_stats);
	if (!ipvs->tot_stats)
		goto out;
	if (ip_vs_stats_init_alloc(&ipvs->tot_stats->s) < 0)
		goto err_tot_stats;

#ifdef CONFIG_PROC_FS
	if (!proc_create_net("ip_vs", 0, ipvs->net->proc_net,
			     &ip_vs_info_seq_ops, sizeof(struct ip_vs_iter)))
		goto err_vs;
	if (!proc_create_net_single("ip_vs_stats", 0, ipvs->net->proc_net,
				    ip_vs_stats_show, NULL))
		goto err_stats;
	if (!proc_create_net_single("ip_vs_stats_percpu", 0,
				    ipvs->net->proc_net,
				    ip_vs_stats_percpu_show, NULL))
		goto err_percpu;
	if (!proc_create_net_single("ip_vs_status", 0440, ipvs->net->proc_net,
				    ip_vs_status_show, NULL))
		goto err_status;
#endif

	ret = ip_vs_control_net_init_sysctl(ipvs);
	if (ret < 0)
		goto err;

	return 0;

	/* Unwind ladder: each label removes what was created before the
	 * step that failed, in reverse order of creation
	 */
err:
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ip_vs_status", ipvs->net->proc_net);

err_status:
	remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);

err_percpu:
	remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);

err_stats:
	remove_proc_entry("ip_vs", ipvs->net->proc_net);

err_vs:
#endif
	ip_vs_stats_release(&ipvs->tot_stats->s);

err_tot_stats:
	kfree(ipvs->tot_stats);

out:
	return ret;
}
5210
/* Per-netns teardown of the IPVS control state.
 *
 * Cleans up the destination trash, runs the sysctl teardown, cancels the
 * estimator reload work, removes the procfs entries (CONFIG_PROC_FS) and
 * queues the total stats for release through call_rcu().
 */
void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs)
{
	ip_vs_trash_cleanup(ipvs);
	ip_vs_control_net_cleanup_sysctl(ipvs);
	cancel_delayed_work_sync(&ipvs->est_reload_work);
#ifdef CONFIG_PROC_FS
	/* Removed in the reverse order of their creation */
	remove_proc_entry("ip_vs_status", ipvs->net->proc_net);
	remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);
	remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);
	remove_proc_entry("ip_vs", ipvs->net->proc_net);
#endif
	/* tot_stats is freed from an RCU callback, not synchronously */
	call_rcu(&ipvs->tot_stats->rcu_head, ip_vs_stats_rcu_free);
}
5224
ip_vs_register_nl_ioctl(void)5225 int __init ip_vs_register_nl_ioctl(void)
5226 {
5227 int ret;
5228
5229 ret = nf_register_sockopt(&ip_vs_sockopts);
5230 if (ret) {
5231 pr_err("cannot register sockopt.\n");
5232 goto err_sock;
5233 }
5234
5235 ret = ip_vs_genl_register();
5236 if (ret) {
5237 pr_err("cannot register Generic Netlink interface.\n");
5238 goto err_genl;
5239 }
5240 return 0;
5241
5242 err_genl:
5243 nf_unregister_sockopt(&ip_vs_sockopts);
5244 err_sock:
5245 return ret;
5246 }
5247
/* Unregister the Generic Netlink family and the sockopt handlers, in the
 * reverse order of their registration in ip_vs_register_nl_ioctl().
 */
void ip_vs_unregister_nl_ioctl(void)
{
	ip_vs_genl_unregister();
	nf_unregister_sockopt(&ip_vs_sockopts);
}
5253
ip_vs_control_init(void)5254 int __init ip_vs_control_init(void)
5255 {
5256 int ret;
5257
5258 ret = register_netdevice_notifier(&ip_vs_dst_notifier);
5259 if (ret < 0)
5260 return ret;
5261
5262 return 0;
5263 }
5264
5265
/* Module-level cleanup of the control code: unregister the netdevice
 * notifier registered by ip_vs_control_init().
 */
void ip_vs_control_cleanup(void)
{
	unregister_netdevice_notifier(&ip_vs_dst_notifier);
	/* relying on common rcu_barrier() in ip_vs_cleanup() */
}
5271