/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD;
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/prefetch.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly	= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;
static int redirect_genid;

static struct delayed_work expires_work;
static unsigned long expires_ljiffies;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);

	peer = rt->peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else {
			if (rt->fi) {
				fib_info_put(rt->fi);
				rt->fi = NULL;
			}
		}
	}
	return p;
}
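/*
 * Illustrative sketch (not a new interface): writers do not poke at
 * dst->_metrics directly.  dst_metric_set() in <net/dst.h> goes through
 * dst_metrics_write_ptr(), which invokes the ->cow_metrics() hook above
 * whenever the current metrics array is still the shared read-only one:
 *
 *	u32 *p = dst_metrics_write_ptr(dst);	// may call ipv4_cow_metrics()
 *	if (p)					// p now points at a private,
 *		p[RTAX_MTU - 1] = new_mtu;	// peer-owned copy
 */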
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
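/*
 * The table above is indexed by the legacy TOS bits shifted right once;
 * rt_tos2priority() in <net/route.h> does the lookup:
 *
 *	static inline char rt_tos2priority(u8 tos)
 *	{
 *		return ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *	}
 *
 * e.g. IPTOS_LOWDELAY (0x10) yields index 8, i.e. TC_PRIO_INTERACTIVE.
 */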
/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */
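/*
 * A minimal sketch of the reader side under this scheme (this is in
 * essence what the lookup paths below do before taking a reference
 * with dst_use()):
 *
 *	rcu_read_lock_bh();
 *	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference_bh(rth->dst.rt_next)) {
 *		if (compare_keys(rth, candidate) && !rt_is_expired(rth))
 *			break;			// found a usable entry
 *	}
 *	rcu_read_unlock_bh();
 */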
struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of
 * spinlocks.  The size of this table is a power of two and depends on the
 * number of CPUs.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
			GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif

static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}

static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}
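/*
 * Example: a cached input route lands in the bucket
 *
 *	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
 *
 * Because the per-netns generation count is mixed into the hash and also
 * stored in each entry, bumping it (rt_cache_invalidate() below) makes
 * every existing entry fail the rt_is_expired() check and scatter to
 * dead buckets, which is how the whole cache is invalidated in O(1).
 */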
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		struct neighbour *n;
		int len, HHUptod;

		rcu_read_lock();
		n = dst_get_neighbour_noref(&r->dst);
		HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
		rcu_read_unlock();

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			r->dst.dev ? r->dst.dev->name : "*",
			(__force u32)r->rt_dst,
			(__force u32)r->rt_gateway,
			r->rt_flags, atomic_read(&r->dst.__refcnt),
			r->dst.__use, 0, (__force u32)r->rt_src,
			dst_metric_advmss(&r->dst) + 40,
			dst_metric(&r->dst, RTAX_WINDOW),
			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->dst, RTAX_RTTVAR)),
			r->rt_key_tos,
			-1,
			HHUptod,
			r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */
static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rth->peer && rth->peer->pmtu_expires);
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->dst.__refcnt))
		goto out;

	age = jiffies - rth->dst.lastuse;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}

static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}

static inline bool compare_hash_inputs(const struct rtable *rt1,
				       const struct rtable *rt2)
{
	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
}

static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_mark ^ rt2->rt_mark) |
		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
}

static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}

/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		rth = rcu_access_pointer(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}

/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This is to have an estimate of rt_chain_length_max:
 * rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
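/*
 * Worked example of this fixed-point format: with FRACT_BITS == 3,
 * ONE == 8 represents a chain length of 1.0, so an accumulated average
 * of 20 means 20/8 = 2.5 real entries per chain, and the final
 * (avg + 4*sd) >> FRACT_BITS in rt_check_expire() converts back to a
 * plain integer bound for rt_chain_length_max.
 */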
/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif)
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(aux, rth))
			return 0;
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}

static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		if (rcu_dereference_raw(*rthp) == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(i)))) != NULL) {
			prefetch(rth->dst.rt_next);
			if (rt_is_expired(rth)) {
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->dst.expires)) {
nofree:
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					/*
					 * We only count entries on
					 * a chain with equal hash inputs once
					 * so that entries for different QOS
					 * levels, and other non-hash input
					 * attributes don't unfairly skew
					 * the length computation
					 */
					length += has_noalias(rt_hash_table[i].chain, rth);
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* Cleanup aged off entries. */
			*rthp = rth->dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					ip_rt_gc_elasticity,
					(avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}

/*
 * rt_worker_func() is run in process context.
 * we call rt_check_expire() to scan part of the hash table
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}
/*
 * Perturbation of rt_genid by a small quantity [1..256]
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without repeating a recent rt_genid.
 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
	redirect_genid++;
}
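/*
 * Sketch of why [1..256] is enough: the genid is a full 32-bit counter,
 * so at least 2^32 / 256 = 2^24 invalidations must happen before a
 * previously used value can come around again, and rt_hash() runs the
 * genid through jhash_3words(), so even a +1 step relocates every
 * cached entry to a different bucket chain.
 */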
/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}

/* Flush previous cache invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}

static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		printk(KERN_WARNING "Route hash chain too long!\n");
	rt_cache_invalidate(net);
}

/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if the network is idle,
   expire is large enough to keep enough warm entries, and when load
   increases it shrinks to limit the cache size.
 */

static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:

		   - expire has been reduced to zero; otherwise, expire is halved.
		   - the table is not full.
		   - we are called from interrupt context.
		   - the jiffies check is just a fallback/debug loop breaker.
		     We will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return 0;
}
/*
 * Returns number of entries in a hash chain that have different hash_inputs
 */
static int slow_chain_length(const struct rtable *head)
{
	int length = 0;
	const struct rtable *rth = head;

	while (rth) {
		length += has_noalias(head, rth);
		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
	}
	return length >> FRACT_BITS;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
{
	static const __be32 inaddr_any = 0;
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;
	struct neighbour *n;

	rt = (const struct rtable *) dst;

	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
		pkey = &inaddr_any;
	else if (rt->rt_gateway)
		pkey = (const __be32 *) &rt->rt_gateway;

	n = __ipv4_neigh_lookup(&arp_tbl, dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(&arp_tbl, pkey, dev);
}

static int rt_bind_neighbour(struct rtable *rt)
{
	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n))
		return PTR_ERR(n);
	dst_set_neighbour(&rt->dst, n);

	return 0;
}

static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable	*rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long	now;
	u32		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey)
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = rt_bind_neighbour(rt);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				ip_rt_put(rt);
				return ERR_PTR(err);
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = rt_bind_neighbour(rt);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return ERR_PTR(err);
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			rt_drop(rt);
			return ERR_PTR(-ENOBUFS);
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}
static atomic_t __rt_peer_genid = ATOMIC_INIT(0);

static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}

void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(daddr, create);

	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);
	else
		rt->rt_peer_genid = rt_peer_genid();
}
/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chance to
 * select an ID that stays unique for a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt && !(rt->dst.flags & DST_NOPEER)) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, rt->rt_dst, 1);

		/* If peer is attached to destination, it is never detached,
		   so we do not need to grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else if (!rt)
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);
static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}

static void check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
{
	struct rtable *rt = (struct rtable *) dst;
	__be32 orig_gw = rt->rt_gateway;
	struct neighbour *n, *old_n;

	dst_confirm(&rt->dst);

	rt->rt_gateway = peer->redirect_learned.a4;

	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n)) {
		rt->rt_gateway = orig_gw;
		return;
	}
	old_n = xchg(&rt->dst._neighbour, n);
	if (old_n)
		neigh_release(old_n);
	if (!(n->nud_state & NUD_VALID)) {
		neigh_event_send(n, NULL);
	} else {
		rt->rt_flags |= RTCF_REDIRECTED;
		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
	}
}

/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int s, i;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	__be32 skeys[2] = { saddr, 0 };
	int    ikeys[2] = { dev->ifindex, 0 };
	struct inet_peer *peer;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (s = 0; s < 2; s++) {
		for (i = 0; i < 2; i++) {
			unsigned int hash;
			struct rtable __rcu **rthp;
			struct rtable *rt;

			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			while ((rt = rcu_dereference(*rthp)) != NULL) {
				rthp = &rt->dst.rt_next;

				if (rt->rt_key_dst != daddr ||
				    rt->rt_key_src != skeys[s] ||
				    rt->rt_oif != ikeys[i] ||
				    rt_is_input_route(rt) ||
				    rt_is_expired(rt) ||
				    !net_eq(dev_net(rt->dst.dev), net) ||
				    rt->dst.error ||
				    rt->dst.dev != dev ||
				    rt->rt_gateway != old_gw)
					continue;

				if (!rt->peer)
					rt_bind_peer(rt, rt->rt_dst, 1);

				peer = rt->peer;
				if (peer) {
					if (peer->redirect_learned.a4 != new_gw ||
					    peer->redirect_genid != redirect_genid) {
						peer->redirect_learned.a4 = new_gw;
						peer->redirect_genid = redirect_genid;
						atomic_inc(&__rt_peer_genid);
					}
					check_peer_redir(&rt->dst, peer);
				}
			}
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	;
}

static bool peer_pmtu_expired(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       time_after_eq(jiffies, orig) &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

static bool peer_pmtu_cleaned(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if (rt->rt_flags & RTCF_REDIRECTED) {
			unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
						rt->rt_oif,
						rt_genid(dev_net(dst->dev)));
			rt_del(hash, rt);
			ret = NULL;
		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   has forgotten the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
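/*
 * Worked example with the default tunables above: ip_rt_redirect_load
 * is HZ/50 (20ms) and the interval doubles with each token, so
 * successive redirects to one peer go out after roughly 20ms, 40ms,
 * ..., 20ms << 8 (~5s); once ip_rt_redirect_number (9) have been sent
 * we stay silent until ip_rt_redirect_silence ((HZ/50) << 10, ~20s)
 * elapses with no packets that would need redirecting.
 */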
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything.
	 * Set rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
			       &ip_hdr(skb)->saddr, rt->rt_iif,
			       &rt->rt_dst, &rt->rt_gateway);
#endif
	}
}

static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	bool send;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
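/*
 * The peer-based limiter above is a plain token bucket: tokens refill
 * at one per jiffy, are capped at ip_rt_error_burst (5*HZ), and each
 * ICMP error sent costs ip_rt_error_cost (HZ).  With the defaults that
 * is a sustained rate of one error per second, with bursts of up to
 * five back-to-back.
 */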
/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}
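/*
 * Example: a broken "frag needed" that reports MTU 0 for a packet with
 * tot_len 1500 ends up in guess_mtu(1500), which walks the table and
 * returns 1492, the first plateau strictly below 1500; anything at or
 * below 128 falls through to the protocol minimum of 68.
 */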
unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	unsigned short old_mtu = ntohs(iph->tot_len);
	unsigned short est_mtu = 0;
	struct inet_peer *peer;

	peer = inet_getpeer_v4(iph->daddr, 1);
	if (peer) {
		unsigned short mtu = new_mtu;

		if (new_mtu < 68 || new_mtu >= old_mtu) {
			/* BSD 4.2 derived systems incorrectly adjust
			 * tot_len by the IP header length, and report
			 * a zero MTU in the ICMP message.
			 */
			if (mtu == 0 &&
			    old_mtu >= 68 + (iph->ihl << 2))
				old_mtu -= iph->ihl << 2;
			mtu = guess_mtu(old_mtu);
		}

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
			unsigned long pmtu_expires;

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			est_mtu = mtu;
			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;
			atomic_inc(&__rt_peer_genid);
		}

		inet_putpeer(peer);
	}
	return est_mtu ? : new_mtu;
}

static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
{
	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);

	if (!expires)
		return;
	if (time_before(jiffies, expires)) {
		u32 orig_dst_mtu = dst_mtu(dst);
		if (peer->pmtu_learned < orig_dst_mtu) {
			if (!peer->pmtu_orig)
				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
		}
	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;

	dst_confirm(dst);

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (peer) {
		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!pmtu_expires || mtu < peer->pmtu_learned) {

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;

			atomic_inc(&__rt_peer_genid);
			rt->rt_peer_genid = rt_peer_genid();
		}
		check_peer_pmtu(dst, peer);
	}
}
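/*
 * Life cycle of a learned PMTU as implemented above: a smaller MTU is
 * stored in the peer (pmtu_learned) with pmtu_expires set to
 * jiffies + ip_rt_mtu_expires (10 minutes by default).  While it is
 * valid, check_peer_pmtu() overrides RTAX_MTU on the dst and stashes
 * the original value in pmtu_orig; after expiry, whichever caller wins
 * the cmpxchg() on pmtu_expires restores the original metric.
 */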
%s\n", 1852 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, 1853 skb->dev ? skb->dev->name : "?"); 1854 kfree_skb(skb); 1855 WARN_ON(1); 1856 return 0; 1857 } 1858 1859 /* 1860 We do not cache source address of outgoing interface, 1861 because it is used only by IP RR, TS and SRR options, 1862 so that it out of fast path. 1863 1864 BTW remember: "addr" is allowed to be not aligned 1865 in IP options! 1866 */ 1867 1868 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt) 1869 { 1870 __be32 src; 1871 1872 if (rt_is_output_route(rt)) 1873 src = ip_hdr(skb)->saddr; 1874 else { 1875 struct fib_result res; 1876 struct flowi4 fl4; 1877 struct iphdr *iph; 1878 1879 iph = ip_hdr(skb); 1880 1881 memset(&fl4, 0, sizeof(fl4)); 1882 fl4.daddr = iph->daddr; 1883 fl4.saddr = iph->saddr; 1884 fl4.flowi4_tos = RT_TOS(iph->tos); 1885 fl4.flowi4_oif = rt->dst.dev->ifindex; 1886 fl4.flowi4_iif = skb->dev->ifindex; 1887 fl4.flowi4_mark = skb->mark; 1888 1889 rcu_read_lock(); 1890 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0) 1891 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res); 1892 else 1893 src = inet_select_addr(rt->dst.dev, rt->rt_gateway, 1894 RT_SCOPE_UNIVERSE); 1895 rcu_read_unlock(); 1896 } 1897 memcpy(addr, &src, 4); 1898 } 1899 1900 #ifdef CONFIG_IP_ROUTE_CLASSID 1901 static void set_class_tag(struct rtable *rt, u32 tag) 1902 { 1903 if (!(rt->dst.tclassid & 0xFFFF)) 1904 rt->dst.tclassid |= tag & 0xFFFF; 1905 if (!(rt->dst.tclassid & 0xFFFF0000)) 1906 rt->dst.tclassid |= tag & 0xFFFF0000; 1907 } 1908 #endif 1909 1910 static unsigned int ipv4_default_advmss(const struct dst_entry *dst) 1911 { 1912 unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS); 1913 1914 if (advmss == 0) { 1915 advmss = max_t(unsigned int, dst->dev->mtu - 40, 1916 ip_rt_min_advmss); 1917 if (advmss > 65535 - 40) 1918 advmss = 65535 - 40; 1919 } 1920 return advmss; 1921 } 1922 1923 static unsigned int ipv4_mtu(const struct dst_entry *dst) 1924 { 1925 const struct rtable *rt = (const struct rtable *) dst; 1926 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); 1927 1928 if (mtu && rt_is_output_route(rt)) 1929 return mtu; 1930 1931 mtu = dst->dev->mtu; 1932 1933 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { 1934 1935 if (rt->rt_gateway != rt->rt_dst && mtu > 576) 1936 mtu = 576; 1937 } 1938 1939 if (mtu > IP_MAX_MTU) 1940 mtu = IP_MAX_MTU; 1941 1942 return mtu; 1943 } 1944 1945 static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, 1946 struct fib_info *fi) 1947 { 1948 struct inet_peer *peer; 1949 int create = 0; 1950 1951 /* If a peer entry exists for this destination, we must hook 1952 * it up in order to get at cached metrics. 
static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
			    struct fib_info *fi)
{
	struct inet_peer *peer;
	int create = 0;

	/* If a peer entry exists for this destination, we must hook
	 * it up in order to get at cached metrics.
	 */
	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
		create = 1;

	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
	if (peer) {
		rt->rt_peer_genid = rt_peer_genid();
		if (inet_metrics_new(peer))
			memcpy(peer->metrics, fi->fib_metrics,
			       sizeof(u32) * RTAX_MAX);
		dst_init_metrics(&rt->dst, peer->metrics, false);

		check_peer_pmtu(&rt->dst, peer);
		if (peer->redirect_genid != redirect_genid)
			peer->redirect_learned.a4 = 0;
		if (peer->redirect_learned.a4 &&
		    peer->redirect_learned.a4 != rt->rt_gateway) {
			rt->rt_gateway = peer->redirect_learned.a4;
			rt->rt_flags |= RTCF_REDIRECTED;
		}
	} else {
		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
			rt->fi = fi;
			atomic_inc(&fi->fib_clntref);
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
	}
}

static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
			   const struct fib_result *res,
			   struct fib_info *fi, u16 type, u32 itag)
{
	struct dst_entry *dst = &rt->dst;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt_init_metrics(rt, fl4, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	}

	if (dst_mtu(dst) > IP_MAX_MTU)
		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
}

static struct rtable *rt_dst_alloc(struct net_device *dev,
				   bool nopolicy, bool noxfrm)
{
	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
			 DST_HOST |
			 (nopolicy ? DST_NOPOLICY : 0) |
			 (noxfrm ? DST_NOXFRM : 0));
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
				u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(init_net.loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark	= skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (our) {
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
	return IS_ERR(rth) ? PTR_ERR(rth) : 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}

static struct rtable *rt_dst_alloc(struct net_device *dev,
				   bool nopolicy, bool noxfrm)
{
	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
			 DST_HOST |
			 (nopolicy ? DST_NOPOLICY : 0) |
			 (noxfrm ? DST_NOXFRM : 0));
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(init_net.loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark	= skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst = spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (our) {
		rth->dst.input = ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
	return IS_ERR(rth) ? PTR_ERR(rth) : 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}


static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 * RFC1812 recommendation: if the source is martian,
		 * the only hint we can log is the MAC header.
		 */
		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}
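
/*
 * For reference, a made-up example of what the logging above produces on
 * an Ethernet device (addresses and bytes are illustrative only):
 *
 *	martian source 224.0.0.5 from 10.0.0.7, on dev eth0
 *	ll header: 00:11:22:33:44:55:66:77:88:99:aa:bb:08:00
 */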

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input_slow(). "
					 "Please report.\n");
		return -EINVAL;
	}


	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back out the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM));
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid	= rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags	= flags;
	rth->rt_type	= res->type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = in_dev->dev->ifindex;
	rth->rt_iif	= in_dev->dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark	= skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst = spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);

	*result = rth;
	err = 0;
 cleanup:
	return err;
}

static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
		       rt_genid(dev_net(rth->dst.dev)));
	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
	if (IS_ERR(rth))
		return PTR_ERR(rth);
	return 0;
}
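
/*
 * A note on rt_intern_hash() as used above (sketch, inferred from its use
 * in this file): it may hand back a different rtable than the candidate
 * that was passed in, e.g. an equivalent entry that won a race, or an
 * ERR_PTR() on failure, and the candidate may already have been freed, so
 * callers must continue only with the returned pointer.
 */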

/*
 *	NOTE. We drop all packets that have a local source
 *	address, because every properly looped-back packet
 *	must have the correct destination already attached by the
 *	output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *
 *	called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4	fl4;
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable	*rth;
	unsigned	hash;
	__be32		spec_dst;
	int		err = -EINVAL;
	struct net    *net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which cannot be
	   detected by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only for limited broadcast;
	 * it is unclear whether this needs fixing. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route the packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}
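
	/*
	 * From here on, res.type drives the dispatch: RTN_BROADCAST goes to
	 * brd_input, RTN_LOCAL to local_input, RTN_UNICAST is forwarded via
	 * ip_mkroute_input(), and anything else is a martian destination.
	 */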

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, &spec_dst, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

	rth->dst.input = ip_local_deliver;
	rth->dst.output = ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid	= rt_genid(net);
	rth->rt_flags	= flags|RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark	= skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst = spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input = ip_error;
		rth->dst.error = -err;
		rth->rt_flags	&= ~RTCF_LOCAL;
	}
	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
	err = 0;
	if (IS_ERR(rth))
		err = PTR_ERR(rth);
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto out;

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}

int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev, bool noref)
{
	struct rtable	*rth;
	unsigned	hash;
	int iif = dev->ifindex;
	struct net *net;
	int res;

	net = dev_net(dev);

	rcu_read_lock();

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
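
	/*
	 * The loop below compares all four key fields branch-free: each
	 * pair is XORed (zero iff equal) and the results are ORed, so the
	 * whole expression is zero only when every field matches.
	 */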
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->dst.rt_next)) {
		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
		     (rth->rt_route_iif ^ iif) |
		     (rth->rt_key_tos ^ tos)) == 0 &&
		    rth->rt_mark == skb->mark &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			ipv4_validate_peer(rth);
			if (noref) {
				dst_use_noref(&rth->dst, jiffies);
				skb_dst_set_noref(skb, &rth->dst);
			} else {
				dst_use(&rth->dst, jiffies);
				skb_dst_set(skb, &rth->dst);
			}
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}

skip_cache:
	/* Multicast recognition logic was moved from the route cache to
	   here. The problem was that too many Ethernet cards have
	   broken/missing hardware multicast filters :-( As a result, a
	   host on a multicast network acquires a lot of useless route
	   cache entries, e.g. for SDR messages from all over the world.
	   Now we try to get rid of them. Really, provided the software IP
	   multicast filter is organized reasonably (at least, hashed), it
	   does not result in a slowdown compared with route cache reject
	   entries.
	   Note that multicast routers are not affected, because a
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_common);
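
/*
 * Hypothetical caller, for illustration only (the real receive path lives
 * in ip_input.c): a receive path resolves an input route roughly as
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err = ip_route_input(skb, iph->daddr, iph->saddr,
 *				 iph->tos, skb->dev);
 *	if (unlikely(err))
 *		goto drop;
 *	// skb_dst(skb) now points at the cached or freshly built rtable
 */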

/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4,
				       __be32 orig_daddr, __be32 orig_saddr,
				       int orig_oif, __u8 orig_rtos,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;

	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
		return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		/* If a multicast route does not exist, use the
		 * default one, but do not gateway in this case.
		 * Yes, it is a hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM));
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	rth->rt_key_dst	= orig_daddr;
	rth->rt_key_src	= orig_saddr;
	rth->rt_genid	= rt_genid(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_key_tos	= orig_rtos;
	rth->rt_dst	= fl4->daddr;
	rth->rt_src	= fl4->saddr;
	rth->rt_route_iif = 0;
	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
	rth->rt_oif	= orig_oif;
	rth->rt_mark	= fl4->flowi4_mark;
	rth->rt_gateway	= fl4->daddr;
	rth->rt_spec_dst = fl4->saddr;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl4->daddr;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl4->saddr;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4, res, fi, type, 0);

	return rth;
}
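
/*
 * Note (explanatory, not from the original file): the cache entry built
 * above is keyed on the caller's original tuple (orig_daddr, orig_saddr,
 * orig_oif, orig_rtos), while rt_dst/rt_src/rt_gateway hold the values
 * the resolver actually chose; __ip_route_output_key() below matches
 * lookups against those original key fields.
 */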

/*
 * Major route resolver routine.
 * called with rcu_read_lock();
 */

static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
{
	struct net_device *dev_out = NULL;
	__u8 tos = RT_FL_TOS(fl4);
	unsigned int flags = 0;
	struct fib_result res;
	struct rtable *rth;
	__be32 orig_daddr;
	__be32 orig_saddr;
	int orig_oif;

	res.fi		= NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r		= NULL;
#endif

	orig_daddr = fl4->daddr;
	orig_saddr = fl4->saddr;
	orig_oif = fl4->flowi4_oif;

	fl4->flowi4_iif = net->loopback_dev->ifindex;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;

		/* I removed the check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return the wrong iface, if
		      saddr is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (dev_out == NULL)
				goto out;

			/* Special hack: a user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind a socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are
			   broken, because we are not allowed to build a
			   multicast path with a loopback source address
			   (look: the routing cache cannot know that the ttl
			   is zero, so the packet will not leave this host
			   and the route is valid).
			   Luckily, this hack is a good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}
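
	/* (Background note: FLOWI_FLAG_ANYSRC is set e.g. for
	 * IP_TRANSPARENT sockets, which are allowed to use a non-local
	 * source address, so the local-address check above is skipped
	 * for them.)
	 */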

	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr)) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, fl4, &res)) {
		res.fi = NULL;
		if (fl4->flowi4_oif) {
			/* Apparently, the routing tables are wrong. Assume
			   that the destination is on-link.

			   WHY? DW.
			   Because we are allowed to send to an iface even
			   if it has NO routes and NO assigned addresses.
			   When oif is specified, the routing tables are
			   looked up with only one purpose: to catch whether
			   the destination is gatewayed, rather than direct.
			   Moreover, if MSG_DONTROUTE is set, we send the
			   packet ignoring both the routing tables and the
			   ifaddr state. --ANK

			   We could do this even when oif is unknown
			   (IPv6 likely does), but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(-ENETUNREACH);
		goto out;
	}

	if (res.type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res.fi->fib_prefsrc)
				fl4->saddr = res.fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = dev_out->ifindex;
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
		fib_select_multipath(&res);
	else
#endif
	if (!res.prefixlen &&
	    res.table->tb_num_default > 1 &&
	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
		fib_select_default(&res);

	if (!fl4->saddr)
		fl4->saddr = FIB_RES_PREFSRC(net, res);

	dev_out = FIB_RES_DEV(res);
	fl4->flowi4_oif = dev_out->ifindex;


make_route:
	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
			       tos, dev_out, flags);
	if (!IS_ERR(rth)) {
		unsigned int hash;

		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
			       rt_genid(dev_net(dev_out)));
		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
	}

out:
	rcu_read_unlock();
	return rth;
}

struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
{
	struct rtable *rth;
	unsigned int hash;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
		rth = rcu_dereference_bh(rth->dst.rt_next)) {
		if (rth->rt_key_dst == flp4->daddr &&
		    rth->rt_key_src == flp4->saddr &&
		    rt_is_output_route(rth) &&
		    rth->rt_oif == flp4->flowi4_oif &&
		    rth->rt_mark == flp4->flowi4_mark &&
		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			ipv4_validate_peer(rth);
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			if (!flp4->saddr)
				flp4->saddr = rth->rt_src;
			if (!flp4->daddr)
				flp4->daddr = rth->rt_dst;
			return rth;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	return ip_route_output_slow(net, flp4);
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);
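
/*
 * Minimal usage sketch (illustrative; dst_ip and tos are placeholders,
 * not names from this file):
 *
 *	struct flowi4 fl4 = {
 *		.daddr = dst_ip,
 *		.flowi4_tos = RT_TOS(tos),
 *	};
 *	struct rtable *rt = ip_route_output_key(net, &fl4);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	// ...transmit via rt->dst...
 *	ip_rt_put(rt);
 */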

static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			= AF_INET,
	.protocol		= cpu_to_be16(ETH_P_IP),
	.destroy		= ipv4_dst_destroy,
	.check			= ipv4_blackhole_dst_check,
	.mtu			= ipv4_blackhole_mtu,
	.default_advmss		= ipv4_default_advmss,
	.update_pmtu		= ipv4_rt_blackhole_update_pmtu,
	.cow_metrics		= ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		= ipv4_neigh_lookup,
};

struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
	struct rtable *ort = (struct rtable *) dst_orig;

	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;
		dst_copy_metrics(new, &ort->dst);

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->rt_key_dst = ort->rt_key_dst;
		rt->rt_key_src = ort->rt_key_src;
		rt->rt_key_tos = ort->rt_key_tos;
		rt->rt_route_iif = ort->rt_route_iif;
		rt->rt_iif = ort->rt_iif;
		rt->rt_oif = ort->rt_oif;
		rt->rt_mark = ort->rt_mark;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);
		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
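
/*
 * Background note (an assumption based on the callers, which are not in
 * this file): the blackhole dst above is handed out e.g. by the xfrm code
 * while an IPsec SA is still being resolved, so a non-blocking sender
 * gets a valid-looking route whose input/output handlers silently discard
 * packets instead of failing the send.
 */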
&rt->dst : ERR_PTR(-ENOMEM); 2947 } 2948 2949 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, 2950 struct sock *sk) 2951 { 2952 struct rtable *rt = __ip_route_output_key(net, flp4); 2953 2954 if (IS_ERR(rt)) 2955 return rt; 2956 2957 if (flp4->flowi4_proto) 2958 rt = (struct rtable *) xfrm_lookup(net, &rt->dst, 2959 flowi4_to_flowi(flp4), 2960 sk, 0); 2961 2962 return rt; 2963 } 2964 EXPORT_SYMBOL_GPL(ip_route_output_flow); 2965 2966 static int rt_fill_info(struct net *net, 2967 struct sk_buff *skb, u32 pid, u32 seq, int event, 2968 int nowait, unsigned int flags) 2969 { 2970 struct rtable *rt = skb_rtable(skb); 2971 struct rtmsg *r; 2972 struct nlmsghdr *nlh; 2973 unsigned long expires = 0; 2974 const struct inet_peer *peer = rt->peer; 2975 u32 id = 0, ts = 0, tsage = 0, error; 2976 2977 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags); 2978 if (nlh == NULL) 2979 return -EMSGSIZE; 2980 2981 r = nlmsg_data(nlh); 2982 r->rtm_family = AF_INET; 2983 r->rtm_dst_len = 32; 2984 r->rtm_src_len = 0; 2985 r->rtm_tos = rt->rt_key_tos; 2986 r->rtm_table = RT_TABLE_MAIN; 2987 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN); 2988 r->rtm_type = rt->rt_type; 2989 r->rtm_scope = RT_SCOPE_UNIVERSE; 2990 r->rtm_protocol = RTPROT_UNSPEC; 2991 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; 2992 if (rt->rt_flags & RTCF_NOTIFY) 2993 r->rtm_flags |= RTM_F_NOTIFY; 2994 2995 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst); 2996 2997 if (rt->rt_key_src) { 2998 r->rtm_src_len = 32; 2999 NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src); 3000 } 3001 if (rt->dst.dev) 3002 NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex); 3003 #ifdef CONFIG_IP_ROUTE_CLASSID 3004 if (rt->dst.tclassid) 3005 NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid); 3006 #endif 3007 if (rt_is_input_route(rt)) 3008 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst); 3009 else if (rt->rt_src != rt->rt_key_src) 3010 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src); 3011 3012 if (rt->rt_dst != rt->rt_gateway) 3013 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway); 3014 3015 if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0) 3016 goto nla_put_failure; 3017 3018 if (rt->rt_mark) 3019 NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark); 3020 3021 error = rt->dst.error; 3022 if (peer) { 3023 inet_peer_refcheck(rt->peer); 3024 id = atomic_read(&peer->ip_id_count) & 0xffff; 3025 if (peer->tcp_ts_stamp) { 3026 ts = peer->tcp_ts; 3027 tsage = get_seconds() - peer->tcp_ts_stamp; 3028 } 3029 expires = ACCESS_ONCE(peer->pmtu_expires); 3030 if (expires) { 3031 if (time_before(jiffies, expires)) 3032 expires -= jiffies; 3033 else 3034 expires = 0; 3035 } 3036 } 3037 3038 if (rt_is_input_route(rt)) { 3039 #ifdef CONFIG_IP_MROUTE 3040 __be32 dst = rt->rt_dst; 3041 3042 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && 3043 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { 3044 int err = ipmr_get_route(net, skb, 3045 rt->rt_src, rt->rt_dst, 3046 r, nowait); 3047 if (err <= 0) { 3048 if (!nowait) { 3049 if (err == 0) 3050 return 0; 3051 goto nla_put_failure; 3052 } else { 3053 if (err == -EMSGSIZE) 3054 goto nla_put_failure; 3055 error = err; 3056 } 3057 } 3058 } else 3059 #endif 3060 NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif); 3061 } 3062 3063 if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage, 3064 expires, error) < 0) 3065 goto nla_put_failure; 3066 3067 return nlmsg_end(skb, nlh); 3068 3069 nla_put_failure: 3070 nlmsg_cancel(skb, nlh); 3071 return -EMSGSIZE; 3072 } 3073 3074 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) 

static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	const struct inet_peer *peer = rt->peer;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	= AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->rt_key_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->rt_key_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
	}
	if (rt->dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
	if (rt_is_input_route(rt))
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->rt_key_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	if (rt->rt_mark)
		NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);

	error = rt->dst.error;
	if (peer) {
		inet_peer_refcheck(rt->peer);
		id = atomic_read(&peer->ip_id_count) & 0xffff;
		if (peer->tcp_ts_stamp) {
			ts = peer->tcp_ts;
			tsage = get_seconds() - peer->tcp_ts_stamp;
		}
		expires = ACCESS_ONCE(peer->pmtu_expires);
		if (expires) {
			if (time_before(jiffies, expires))
				expires -= jiffies;
			else
				expires = 0;
		}
	}

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 rt->rt_src, rt->rt_dst,
						 r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers; this skb can pass
	   through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		struct flowi4 fl4 = {
			.daddr = dst,
			.saddr = src,
			.flowi4_tos = rtm->rtm_tos,
			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.flowi4_mark = mark,
		};
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}

int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}
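
/*
 * Note on ip_rt_dump() above: netlink dumps are delivered in chunks, so
 * the callback saves its position (hash bucket in cb->args[0], chain
 * index in cb->args[1]) and resumes from there on the next invocation.
 */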

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}

#ifdef CONFIG_SYSCTL
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
					void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		ctl_table ctl;
		struct net *net;

		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}
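
/*
 * Illustrative use (given the registration below under net.ipv4.route):
 * writing an integer flush delay to the per-netns file invalidates the
 * route cache, e.g.
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 */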

static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static struct ctl_table empty[1];

static struct ctl_table ipv4_skeleton[] = {
	{ .procname = "route",
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh",
	  .mode = 0555, .child = empty},
	{ }
};

static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};

static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr =
		register_net_sysctl_table(net, ipv4_route_path, tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif

static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};


#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);

int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
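	/*
	 * (The table size can be forced with the "rhash_entries=" boot
	 * parameter, parsed by set_rhash_entries() above; otherwise the
	 * allocator picks a size from available memory, bounded by the
	 * final limit argument.)
	 */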
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	return rc;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif