/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 * 	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define RT_FL_TOS(oldflp) \
	((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int ip_rt_secret_interval __read_mostly	= 10 * 60 * HZ;
static int rt_chain_length_max __read_mostly	= 20;

static struct delayed_work expires_work;
static unsigned long expires_ljiffies;
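/*
 * Worked example of the redirect back-off tunables above (illustrative,
 * assuming HZ == 1000): ip_rt_redirect_load = HZ/50 = 20 jiffies, and
 * ip_rt_redirect_silence = (HZ/50) << (9+1) = 20480 jiffies, i.e. roughly
 * 20 seconds of silence before the redirect counter is reset.  The shift
 * count matches ip_rt_redirect_number + 1, so the silence window covers
 * the whole exponential back-off sequence used in ip_rt_send_redirect().
 */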
/*
 * Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static void		 ipv4_dst_ifdown(struct dst_entry *dst,
					 struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);
static void rt_emergency_hash_rebuild(struct net *net);


static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.entries =		ATOMIC_INIT(0),
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};


/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

struct rt_hash_bucket {
	struct rtable	*chain;
};
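/*
 * Illustrative note on the striped locking below: hash buckets are folded
 * onto a small table of spinlocks, so for example with RT_HASH_LOCK_SZ of
 * 256, buckets 5, 261 and 517 all map to rt_hash_locks[5] via
 * rt_hash_lock_addr(slot) = &rt_hash_locks[slot & 255].  Readers never
 * take these locks at all (they rely on RCU, per the scheme above); the
 * stripes only bound writer-side lock memory while spreading contention.
 */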
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
 * The size of this table is a power of two and depends on the number of CPUs.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif

static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) \
	(__raw_get_cpu_var(rt_cache_stat).field++)

/*
 * The per-namespace generation id is mixed into the hash, so a cache
 * flush only needs to bump rt_genid: entries made with an old genid
 * simply become unreachable and are reaped lazily.
 */
static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)(__be32)(daddr),
			    (__force u32)(__be32)(saddr),
			    idx, genid)
		& rt_hash_mask;
}

static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rt_hash_table[st->bucket].chain)
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference(r->u.dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = r->u.dst.rt_next;
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rt_hash_table[st->bucket].chain);
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
	}
	return rcu_dereference(r);
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->u.dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		int len;

		seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
			      "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			r->u.dst.dev ? r->u.dst.dev->name : "*",
			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
			r->u.dst.__use, 0, (unsigned long)r->rt_src,
			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
			dst_metric(&r->u.dst, RTAX_WINDOW),
			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
			r->fl.fl4_tos,
			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
				       dev_queue_xmit) : 0,
			r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   atomic_read(&ipv4_dst_ops.entries),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_NET_CLS_ROUTE
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
			&rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_NET_CLS_ROUTE
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_NET_CLS_ROUTE
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
	remove_proc_entry("rt_acct", net->proc_net);
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =	{
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}

static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}
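/*
 * Both helpers above defer the actual free until after an RCU grace
 * period (call_rcu_bh), since lockless readers may still be walking a
 * hash chain.  The typical unlink pattern in this file is, as a sketch:
 *
 *	*rthp = rth->u.dst.rt_next;	-- unlink under the bucket lock
 *	rt_free(rth);			-- freed once BH readers drain
 *
 * rt_drop() additionally drops the caller's own reference first.
 */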
static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rth->fl.iif && rth->u.dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->u.dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->u.dst.__refcnt))
		goto out;

	ret = 1;
	if (rth->u.dst.expires &&
	    time_after_eq(jiffies, rth->u.dst.expires))
		goto out;

	age = jiffies - rth->u.dst.lastuse;
	ret = 0;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->u.dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (!rt->fl.iif ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}

static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}

static inline bool compare_hash_inputs(const struct flowi *fl1,
				       const struct flowi *fl2)
{
	return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
		(fl1->iif ^ fl2->iif)) == 0);
}

static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
		(fl1->mark ^ fl2->mark) |
		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
		(fl1->oif ^ fl2->oif) |
		(fl1->iif ^ fl2->iif)) == 0;
}

static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
}

/*
 * Perform a full scan of the hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;
	struct rtable *tail;

	for (i = 0; i <= rt_hash_mask; i++) {
		if (process_context && need_resched())
			cond_resched();
		rth = rt_hash_table[i].chain;
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));
#ifdef CONFIG_NET_NS
		{
		struct rtable **prev, *p;

		rth = rt_hash_table[i].chain;

		/* defer releasing the head of the list after spin_unlock */
		for (tail = rth; tail; tail = tail->u.dst.rt_next)
			if (!rt_is_expired(tail))
				break;
		if (rth != tail)
			rt_hash_table[i].chain = tail;

		/* call rt_free on entries after the tail requiring flush */
		prev = &rt_hash_table[i].chain;
		for (p = *prev; p; p = next) {
			next = p->u.dst.rt_next;
			if (!rt_is_expired(p)) {
				prev = &p->u.dst.rt_next;
			} else {
				*prev = next;
				rt_free(p);
			}
		}
		}
#else
		rth = rt_hash_table[i].chain;
		rt_hash_table[i].chain = NULL;
		tail = NULL;
#endif
		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; rth != tail; rth = next) {
			next = rth->u.dst.rt_next;
			rt_free(rth);
		}
	}
}

/*
 * While freeing expired entries, we compute the average chain length
 * and its standard deviation, using fixed-point arithmetic.
 * This gives an estimate for rt_chain_length_max:
 *	rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
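/*
 * Worked example of the fixed-point accounting (illustrative): with
 * FRACT_BITS == 3, ONE == 8, so a chain of length 2 contributes 16 to
 * the running sum.  If the scan sees an average of 2.0 entries per
 * bucket with a standard deviation of 1.0, then avg == 16, sd == 8, and
 * (avg + 4*sd) >> FRACT_BITS == 48 >> 3 == 6; rt_chain_length_max then
 * becomes max(ip_rt_gc_elasticity, 6) == 8 with the defaults above.
 */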
static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth, *aux, **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		if (*rthp == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = *rthp) != NULL) {
			prefetch(rth->u.dst.rt_next);
			if (rt_is_expired(rth)) {
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->u.dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->u.dst.expires)) {
nofree:
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					/*
					 * We only count entries on
					 * a chain with equal hash inputs once
					 * so that entries for different QOS
					 * levels, and other non-hash input
					 * attributes don't unfairly skew
					 * the length computation
					 */
					for (aux = rt_hash_table[i].chain;;) {
						if (aux == rth) {
							length += ONE;
							break;
						}
						if (compare_hash_inputs(&aux->fl, &rth->fl))
							break;
						aux = aux->u.dst.rt_next;
					}
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* Clean up aged-off entries. */
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					    ip_rt_gc_elasticity,
					    (avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}

/*
 * rt_worker_func() is run in process context.
 * We call rt_check_expire() to scan part of the hash table.
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}

/*
 * Perturbation of rt_genid by a small quantity [1..256].
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without reusing a recent rt_genid.
 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
}

/*
 * delay < 0  : invalidate cache (fast: entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(!in_softirq());
}

/* Flush previously cache-invalidated entries from the cache */
void rt_cache_flush_batch(void)
{
	rt_do_flush(!in_softirq());
}

/*
 * We change rt_genid and let gc do the cleanup
 */
static void rt_secret_rebuild(unsigned long __net)
{
	struct net *net = (struct net *)__net;
	rt_cache_invalidate(net);
	mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
}

static void rt_secret_rebuild_oneshot(struct net *net)
{
	del_timer_sync(&net->ipv4.rt_secret_timer);
	rt_cache_invalidate(net);
	if (ip_rt_secret_interval) {
		net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
		add_timer(&net->ipv4.rt_secret_timer);
	}
}

static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit()) {
		printk(KERN_WARNING "Route hash chain too long!\n");
		printk(KERN_WARNING "Adjust your secret_interval!\n");
	}

	rt_secret_rebuild_oneshot(net);
}

/*
   Short description of GC goals.

   We want to build an algorithm which keeps the routing cache
   at some equilibrium point, where the number of aged-off entries
   stays approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that when the network
   is idle, expire is large enough to keep enough warm entries,
   and when load increases, it shrinks to limit the cache size.
 */

static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;

	/*
	 * Garbage collection is pretty expensive,
	 * do not run it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	/* Calculate the number of entries we want to expire now. */
	goal = atomic_read(&ipv4_dst_ops.entries) -
		(ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		}
	} else {
		/* We are in a dangerous area.  Try to reduce the cache
		 * really aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = *rthp) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					continue;
				}
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved.  We stop the process if:

		   - expire is reduced to zero; otherwise, expire is halved.
		   - the table is not full.
		   - we are called from interrupt context.
		   - the jiffies check is just a fallback/debug loop breaker;
		     we will not spin here for a long time in any case.
		 */
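		/*
		 * Illustrative sizing note (the concrete numbers depend on
		 * boot-time hash sizing, so treat this as an assumption):
		 * with rt_hash_log == 17 (128k buckets) and the default
		 * ip_rt_gc_elasticity of 8, the first "goal" computation
		 * above only goes positive once the cache holds more than
		 * 8 << 17 == 1048576 entries; below that, the equilibrium
		 * logic trims toward gc_thresh instead.
		 */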
		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:	return 0;
}

static int rt_intern_hash(unsigned hash, struct rtable *rt,
			  struct rtable **rp, struct sk_buff *skb)
{
	struct rtable	*rth, **rthp;
	unsigned long	now;
	struct rtable *cand, **candp;
	u32 		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->u.dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey).
		 * Note also that rt_free uses call_rcu.  We don't actually
		 * need rcu protection here; this is just our path to get
		 * on the route gc list.
		 */

		if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
			int err = arp_bind_neighbour(&rt->u.dst);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				rt_drop(rt);
				return err;
			}
		}

		rt_free(rt);
		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = *rthp) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->u.dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->u.dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->u.dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (rp)
				*rp = rth;
			else
				skb_dst_set(skb, &rth->u.dst);
			return 0;
		}

		if (!atomic_read(&rth->u.dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->u.dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be the average chain length;
		 * when it is exceeded, gc becomes really aggressive.
		 *
		 * The second limit is less certain.  At the moment it allows
		 * only 2 entries per bucket.  We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->u.dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max) {
			struct net *net = dev_net(rt->u.dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(dev_net(rt->u.dst.dev))) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->u.dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
		}
	}

	/* Try to bind the route to a neighbour only if it is an output
	   route or a unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
		int err = arp_bind_neighbour(&rt->u.dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* Neighbour tables are full and nothing
			   can be released.  Try to shrink the route cache;
			   it most likely holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->u.dst.rt_next = rt_hash_table[hash].chain;

#if RT_CACHE_DEBUG >= 2
	if (rt->u.dst.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %pI4",
		       hash, &rt->rt_dst);
		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
			printk(" . %pI4", &trt->rt_dst);
		printk("\n");
	}
#endif
	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUs.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (rp)
		*rp = rt;
	else
		skb_dst_set(skb, &rt->u.dst);
	return 0;
}

void rt_bind_peer(struct rtable *rt, int create)
{
	static DEFINE_SPINLOCK(rt_peer_lock);
	struct inet_peer *peer;

	peer = inet_getpeer(rt->rt_dst, create);

	spin_lock_bh(&rt_peer_lock);
	if (rt->peer == NULL) {
		rt->peer = peer;
		peer = NULL;
	}
	spin_unlock_bh(&rt_peer_lock);
	if (peer)
		inet_putpeer(peer);
}
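/*
 * rt_bind_peer() uses the classic allocate-then-publish pattern: the
 * inet_peer is looked up (and possibly created) outside the lock, the
 * rt->peer pointer is published under rt_peer_lock, and if another CPU
 * won the race the surplus peer reference is simply dropped.  A caller
 * sketch (this is effectively what __ip_select_ident() below does):
 *
 *	if (rt->peer == NULL)
 *		rt_bind_peer(rt, 1);
 *	if (rt->peer)
 *		... use rt->peer without further locking ...
 */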
/*
 * Peer allocation may fail only in serious out-of-memory conditions.
 * However, we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance
 * to select an ID that is unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, 1);

		/* If a peer is attached to the destination, it is never
		   detached, so we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}

static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable **rthp, *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = *rthp) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->u.dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->u.dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}

void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	__be32  skeys[2] = { saddr, 0 };
	int  ikeys[2] = { dev->ifindex, 0 };
	struct netevent_redirect netevent;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!rt_caching(net))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
						rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			rcu_read_lock();
			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0 ||
				    rt_is_expired(rth) ||
				    !net_eq(dev_net(rth->u.dst.dev), net)) {
					rthp = &rth->u.dst.rt_next;
					continue;
				}

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				dst_hold(&rth->u.dst);
				rcu_read_unlock();

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
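					/* Allocation failed: drop the
					 * references we hold and give up
					 * on this redirect. */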
					in_dev_put(in_dev);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				rt->u.dst.__use		= 1;
				atomic_set(&rt->u.dst.__refcnt, 1);
				rt->u.dst.child		= NULL;
				if (rt->u.dst.dev)
					dev_hold(rt->u.dst.dev);
				if (rt->idev)
					in_dev_hold(rt->idev);
				rt->u.dst.obsolete	= 0;
				rt->u.dst.lastuse	= jiffies;
				rt->u.dst.path		= &rt->u.dst;
				rt->u.dst.neighbour	= NULL;
				rt->u.dst.hh		= NULL;
#ifdef CONFIG_XFRM
				rt->u.dst.xfrm		= NULL;
#endif
				rt->rt_genid		= rt_genid(net);
				rt->rt_flags		|= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway		= new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->u.dst);

				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);

				if (arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				netevent.old = &rth->u.dst;
				netevent.new = &rt->u.dst;
				call_netevent_notifiers(NETEVENT_REDIRECT,
							&netevent);

				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt, NULL))
					ip_rt_put(rt);
				goto do_next;
			}
			rcu_read_unlock();
		do_next:
			;
		}
	}
	in_dev_put(in_dev);
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	in_dev_put(in_dev);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->u.dst.expires) {
			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
						rt->fl.oif,
						rt_genid(dev_net(dst->dev)));
#if RT_CACHE_DEBUG >= 1
			printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
				&rt->rt_dst, rt->fl.fl4_tos);
#endif
			rt_del(hash, rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
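/*
 * Back-off timing example (illustrative, assuming HZ == 1000):
 * ip_rt_redirect_load is 20 jiffies, so after the first redirect,
 * successive ones are spaced by (20 << rate_tokens) jiffies, i.e.
 * 40ms, 80ms, ... up to ~5.1s before the 9th.  Once rate_tokens reaches
 * ip_rt_redirect_number (9) we go quiet, and only ip_rt_redirect_silence
 * (~20s) without triggering packets resets rate_tokens so the cycle can
 * restart.
 */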
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->u.dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
		rt->u.dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything;
	 * set u.dst.rate_last to the last seen redirected packet.
	 */
	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
		rt->u.dst.rate_last = jiffies;
		return;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (rt->u.dst.rate_tokens == 0 ||
	    time_after(jiffies,
		       (rt->u.dst.rate_last +
			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->u.dst.rate_last = jiffies;
		++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
				&rt->rt_src, rt->rt_iif,
				&rt->rt_dst, &rt->rt_gateway);
#endif
	}
}

static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	unsigned long now;
	int code;

	switch (rt->u.dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	now = jiffies;
	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
		rt->u.dst.rate_tokens = ip_rt_error_burst;
	rt->u.dst.rate_last = now;
	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
		rt->u.dst.rate_tokens -= ip_rt_error_cost;
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
	}

out:	kfree_skb(skb);
	return 0;
}

/*
 * The last two values are not from the RFC but
 * are needed for AMPRnet AX.25 paths.
 */
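/*
 * The table below loosely follows the PMTU plateau idea of RFC 1191:
 * when a Fragmentation Needed message arrives without a usable next-hop
 * MTU, fall back to the next plateau strictly below the failed size.
 * For example, guess_mtu(1500) returns 1492 and guess_mtu(576) returns
 * 296; anything at or below 128 falls through to the IPv4 minimum of 68.
 */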
static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}

unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	int i, k;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	int  ikeys[2] = { dev->ifindex, 0 };
	__be32  skeys[2] = { iph->saddr, 0, };
	__be32  daddr = iph->daddr;
	unsigned short est_mtu = 0;

	for (k = 0; k < 2; k++) {
		for (i = 0; i < 2; i++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
						rt_genid(net));

			rcu_read_lock();
			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
			     rth = rcu_dereference(rth->u.dst.rt_next)) {
				unsigned short mtu = new_mtu;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->rt_dst != daddr ||
				    rth->rt_src != iph->saddr ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0 ||
				    dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
				    !net_eq(dev_net(rth->u.dst.dev), net) ||
				    rt_is_expired(rth))
					continue;

				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 &&
					    old_mtu >= dst_mtu(&rth->u.dst) &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= dst_mtu(&rth->u.dst)) {
					if (mtu < dst_mtu(&rth->u.dst)) {
						dst_confirm(&rth->u.dst);
						if (mtu < ip_rt_min_pmtu) {
							mtu = ip_rt_min_pmtu;
							rth->u.dst.metrics[RTAX_LOCK-1] |=
								(1 << RTAX_MTU);
						}
						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
						dst_set_expires(&rth->u.dst,
							ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
			rcu_read_unlock();
		}
	}
	return est_mtu ? : new_mtu;
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	if (dst_mtu(dst) > mtu && mtu >= 68 &&
	    !(dst_metric_locked(dst, RTAX_MTU))) {
		if (mtu < ip_rt_min_pmtu) {
			mtu = ip_rt_min_pmtu;
			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
		}
		dst->metrics[RTAX_MTU-1] = mtu;
		dst_set_expires(dst, ip_rt_mtu_expires);
		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
	}
}

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;
	struct in_device *idev = rt->idev;

	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}

	if (idev) {
		rt->idev = NULL;
		in_dev_put(idev);
	}
}

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
	struct rtable *rt = (struct rtable *) dst;
	struct in_device *idev = rt->idev;
	if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
		struct in_device *loopback_idev =
			in_dev_get(dev_net(dev)->loopback_dev);
		if (loopback_idev) {
			rt->idev = loopback_idev;
			in_dev_put(idev);
		}
	}
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->u.dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	return 0;
}

/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by IP RR, TS and SRR options,
 * so it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be unaligned
 * in IP options!
 */
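/*
 * Because "addr" may point into the middle of an IP options block, the
 * helper below stores the 4-byte address with memcpy() instead of a
 * direct __be32 assignment; on strict-alignment architectures the latter
 * could fault.  Sketch of the difference:
 *
 *	*(__be32 *)addr = src;		-- wrong: assumes alignment
 *	memcpy(addr, &src, 4);		-- safe for any byte offset
 */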
void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
	__be32 src;
	struct fib_result res;

	if (rt->fl.iif == 0)
		src = rt->rt_src;
	else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
		src = FIB_RES_PREFSRC(res);
		fib_res_put(&res);
	} else
		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
				       RT_SCOPE_UNIVERSE);
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->u.dst.tclassid & 0xFFFF))
		rt->u.dst.tclassid |= tag & 0xFFFF;
	if (!(rt->u.dst.tclassid & 0xFFFF0000))
		rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct fib_info *fi = res->fi;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		memcpy(rt->u.dst.metrics, fi->fib_metrics,
		       sizeof(rt->u.dst.metrics));
		if (fi->fib_mtu == 0) {
			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
			if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
			    rt->rt_gateway != rt->rt_dst &&
			    rt->u.dst.dev->mtu > 576)
				rt->u.dst.metrics[RTAX_MTU-1] = 576;
		}
#ifdef CONFIG_NET_CLS_ROUTE
		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	} else
		rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;

	if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
	if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
							 ip_rt_min_advmss);
	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}
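/*
 * Metric defaulting example for rt_set_nexthop() (illustrative): on a
 * plain Ethernet device with dev->mtu == 1500 and no FIB metrics, the
 * route ends up with RTAX_MTU = 1500 and
 * RTAX_ADVMSS = max(1500 - 40, ip_rt_min_advmss) = 1460, i.e. the MTU
 * minus 40 bytes for minimal IPv4 + TCP headers, floored at 256.
 */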
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = in_dev_get(dev);
	u32 itag = 0;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else if (fib_validate_source(saddr, 0, tos, 0,
				       dev, &spec_dst, &itag, 0) < 0)
		goto e_inval;

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output = ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark	= skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= init_net.loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif	= 0;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst = spec_dst;
	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	if (our) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->u.dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	in_dev_put(in_dev);
	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	return rt_intern_hash(hash, rth, NULL, skb);

e_nobufs:
	in_dev_put(in_dev);
	return -ENOBUFS;

e_inval:
	in_dev_put(in_dev);
	return -EINVAL;
}


static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 * RFC1812 recommendation: if the source is martian,
		 * the only hint is the MAC header.
		 */
		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}

static int __mkroute_input(struct sk_buff *skb,
			   struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = in_dev_get(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input_slow(). "
					 "Please, report\n");
		return -EINVAL;
	}


	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag, skb->mark);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		err = -EINVAL;
		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP).  Do not create a route if it is
		 * invalid for proxy arp.  DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back on the same interface, to support
		 * Private VLAN switch technologies.  See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}


	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
		rth->u.dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark	= skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
	rth->rt_gateway	= daddr;
	rth->rt_iif	=
	rth->fl.iif	= in_dev->dev->ifindex;
	rth->u.dst.dev	= (out_dev)->dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->fl.oif	= 0;
	rth->rt_spec_dst = spec_dst;

	rth->u.dst.input = ip_forward;
	rth->u.dst.output = ip_output;
	rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));

	rt_set_nexthop(rth, res, itag);

	rth->rt_flags = flags;

	*result = rth;
	err = 0;
cleanup:
	/* release the working reference to the output device */
	in_dev_put(out_dev);
	return err;
}

static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi *fl,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
		fib_select_multipath(fl, res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl->iif,
		       rt_genid(dev_net(rth->u.dst.dev)));
	return rt_intern_hash(hash, rth, NULL, skb);
}

/*
 * NOTE. We drop all the packets that have a local source
 * address, because every properly looped back packet
 * must have the correct destination already attached by the output routine.
 *
 * Such an approach solves two big problems:
 * 1. Non-simplex devices are handled properly.
 * 2. IP spoofing attempts are filtered with a 100% guarantee.
 */
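/*
 * Rough shape of the slow path below (a summary of the code, not extra
 * policy): reject obviously martian sources, short-circuit limited
 * broadcast, then fib_lookup().  RTN_BROADCAST and RTN_LOCAL results are
 * turned into local-delivery cache entries, RTN_UNICAST goes through
 * ip_mkroute_input() to build a forwarding entry, and everything else is
 * either a martian destination or a (cached) unreachable route.
 */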
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = in_dev_get(dev);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = daddr,
					.saddr = saddr,
					.tos = tos,
					.scope = RT_SCOPE_UNIVERSE,
				      } },
			    .mark = skb->mark,
			    .iif = dev->ifindex };
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable	*rth;
	unsigned	hash;
	__be32		spec_dst;
	int		err = -EINVAL;
	int		free_res = 0;
	struct net	*net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the weirdest martians, which cannot be detected
	 * by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * it is unclear whether this needs fixing.  Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
	    ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route the packet.
	 */
	if ((err = fib_lookup(net, &fl, &res)) != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}
	free_res = 1;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		int result;
		result = fib_validate_source(saddr, daddr, tos,
					     net->loopback_dev->ifindex,
					     dev, &spec_dst, &itag, skb->mark);
		if (result < 0)
			goto martian_source;
		if (result)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
done:
	in_dev_put(in_dev);
	if (free_res)
		fib_res_put(&res);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
					  &itag, skb->mark);
		if (err < 0)
			goto martian_source;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output = ip_rt_bug;
	rth->rt_genid = rt_genid(net);

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst	= daddr;
	rth->rt_dst	= daddr;
	rth->fl.fl4_tos	= tos;
	rth->fl.mark	= skb->mark;
	rth->fl.fl4_src	= saddr;
	rth->rt_src	= saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif	=
	rth->fl.iif	= dev->ifindex;
	rth->u.dst.dev	= net->loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev	= in_dev_get(rth->u.dst.dev);
	rth->rt_gateway	= daddr;
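	/* A route for local input deliberately points u.dst.dev at the
	 * loopback device, and u.dst.output stays ip_rt_bug: a route
	 * created for local delivery must never appear on the output path.
	 */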
	rth->rt_spec_dst = spec_dst;
	rth->u.dst.input = ip_local_deliver;
	rth->rt_flags	= flags | RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		rth->u.dst.input = ip_error;
		rth->u.dst.error = -err;
		rth->rt_flags	&= ~RTCF_LOCAL;
	}
	rth->rt_type	= res.type;
	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
	err = rt_intern_hash(hash, rth, NULL, skb);
	goto done;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto done;

e_inval:
	err = -EINVAL;
	goto done;

e_nobufs:
	err = -ENOBUFS;
	goto done;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto e_inval;
}

int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		   u8 tos, struct net_device *dev)
{
	struct rtable *rth;
	unsigned hash;
	int iif = dev->ifindex;
	struct net *net;

	net = dev_net(dev);

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	rcu_read_lock();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.dst.rt_next)) {
		if (((rth->fl.fl4_dst ^ daddr) |
		     (rth->fl.fl4_src ^ saddr) |
		     (rth->fl.iif ^ iif) |
		     rth->fl.oif |
		     (rth->fl.fl4_tos ^ tos)) == 0 &&
		    rth->fl.mark == skb->mark &&
		    net_eq(dev_net(rth->u.dst.dev), net) &&
		    !rt_is_expired(rth)) {
			dst_use(&rth->u.dst, jiffies);
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			skb_dst_set(skb, &rth->u.dst);
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}
	rcu_read_unlock();

skip_cache:
	/* Multicast recognition logic was moved from the route cache to
	 * here.  The problem was that too many Ethernet cards have
	 * broken/missing hardware multicast filters :-(  As a result, a
	 * host on a multicast network could acquire a lot of useless route
	 * cache entries, e.g. for SDR messages from all over the world.
	 * Now we try to get rid of them.  Provided the software IP
	 * multicast filter is organized reasonably (at least, hashed),
	 * this does not cause a slowdown compared with route cache reject
	 * entries.  Note that multicast routers are not affected, because
	 * a route cache entry is created for them eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev;

		rcu_read_lock();
		if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
			int our = ip_check_mc(in_dev, daddr, saddr,
					      ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
			    ||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			    ) {
				rcu_read_unlock();
				return ip_route_input_mc(skb, daddr, saddr,
							 tos, dev, our);
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}

static int __mkroute_output(struct rtable **result,
			    struct fib_result *res,
			    const struct flowi *fl,
			    const struct flowi *oldflp,
			    struct net_device *dev_out,
			    unsigned flags)
{
	struct rtable *rth;
	struct in_device *in_dev;
	u32 tos = RT_FL_TOS(oldflp);
	int err = 0;

	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK))
		return -EINVAL;

	if (fl->fl4_dst == htonl(0xFFFFFFFF))
		res->type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl->fl4_dst))
		res->type = RTN_MULTICAST;
	else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
		return -EINVAL;

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	/* get a working reference to the inet device */
	in_dev = in_dev_get(dev_out);
	if (!in_dev)
		return -EINVAL;

	if (res->type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		if (res->fi) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	} else if (res->type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
				 oldflp->proto))
			flags &= ~RTCF_LOCAL;
		/* If a multicast route does not exist, use the default
		 * route, but do not use a gateway in that case.
		 * Yes, it is a hack.
		 */
		if (res->fi && res->prefixlen < 4) {
			fib_info_put(res->fi);
			res->fi = NULL;
		}
	}

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOXFRM))
		rth->u.dst.flags |= DST_NOXFRM;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;

	rth->fl.fl4_dst	= oldflp->fl4_dst;
	rth->fl.fl4_tos	= tos;
	rth->fl.fl4_src	= oldflp->fl4_src;
	rth->fl.oif	= oldflp->oif;
	rth->fl.mark	= oldflp->mark;
	rth->rt_dst	= fl->fl4_dst;
	rth->rt_src	= fl->fl4_src;
	rth->rt_iif	= oldflp->oif ? : dev_out->ifindex;
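	/* Note: "a ? : b" is the GNU C conditional with omitted middle
	 * operand, equivalent to "a ? a : b"; rt_iif falls back to the
	 * output device index when the caller did not specify an oif.
	 */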
	/* get references to the devices that are to be held by the
	 * routing cache entry */
	rth->u.dst.dev	= dev_out;
	dev_hold(dev_out);
	rth->idev	= in_dev_get(dev_out);
	rth->rt_gateway = fl->fl4_dst;
	rth->rt_spec_dst = fl->fl4_src;

	rth->u.dst.output = ip_output;
	rth->rt_genid = rt_genid(dev_net(dev_out));

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl->fl4_dst;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl->fl4_src;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->u.dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (res->type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
				rth->u.dst.input = ip_mr_input;
				rth->u.dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, res, 0);

	rth->rt_flags = flags;

	*result = rth;
 cleanup:
	/* release the working reference to the inet device */
	in_dev_put(in_dev);

	return err;
}

static int ip_mkroute_output(struct rtable **rp,
			     struct fib_result *res,
			     const struct flowi *fl,
			     const struct flowi *oldflp,
			     struct net_device *dev_out,
			     unsigned flags)
{
	struct rtable *rth = NULL;
	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
	unsigned hash;
	if (err == 0) {
		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
			       rt_genid(dev_net(dev_out)));
		err = rt_intern_hash(hash, rth, rp, NULL);
	}

	return err;
}

/*
 * Major route resolver routine.
 */

static int ip_route_output_slow(struct net *net, struct rtable **rp,
				const struct flowi *oldflp)
{
	u32 tos	= RT_FL_TOS(oldflp);
	struct flowi fl = { .nl_u = { .ip4_u =
				      { .daddr = oldflp->fl4_dst,
					.saddr = oldflp->fl4_src,
					.tos = tos & IPTOS_RT_MASK,
					.scope = ((tos & RTO_ONLINK) ?
						  RT_SCOPE_LINK :
						  RT_SCOPE_UNIVERSE),
				      } },
			    .mark = oldflp->mark,
			    .iif = net->loopback_dev->ifindex,
			    .oif = oldflp->oif };
	struct fib_result res;
	unsigned flags = 0;
	struct net_device *dev_out = NULL;
	int free_res = 0;
	int err;


	res.fi		= NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r		= NULL;
#endif

	if (oldflp->fl4_src) {
		err = -EINVAL;
		if (ipv4_is_multicast(oldflp->fl4_src) ||
		    ipv4_is_lbcast(oldflp->fl4_src) ||
		    ipv4_is_zeronet(oldflp->fl4_src))
			goto out;

		/* The check for oif == dev_out->oif was removed here.
		 * It was wrong for two reasons:
		 * 1. ip_dev_find(net, saddr) can return the wrong iface
		 *    if saddr is assigned to multiple interfaces.
		 * 2. Moreover, we are allowed to send packets with the
		 *    saddr of another iface. --ANK
		 */

		if (oldflp->oif == 0 &&
		    (ipv4_is_multicast(oldflp->fl4_dst) ||
		     oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = ip_dev_find(net, oldflp->fl4_src);
			if (dev_out == NULL)
				goto out;

			/* Special hack: user can direct multicasts
			 * and limited broadcast via the necessary interface
			 * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			 * This hack is not just for fun; it allows
			 * vic, vat and friends to work.
			 * They bind a socket to loopback, set ttl to zero
			 * and expect that it will work.
			 * From the viewpoint of the routing cache they are
			 * broken, because we are not allowed to build a
			 * multicast path with a loopback source addr (the
			 * routing cache cannot know that ttl is zero, so
			 * the packet will not leave this host and the route
			 * is valid).
			 * Luckily, this hack is a good workaround.
			 */

			fl.oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = ip_dev_find(net, oldflp->fl4_src);
			if (dev_out == NULL)
				goto out;
			dev_put(dev_out);
			dev_out = NULL;
		}
	}


	if (oldflp->oif) {
		dev_out = dev_get_by_index(net, oldflp->oif);
		err = -ENODEV;
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (__in_dev_get_rtnl(dev_out) == NULL) {
			dev_put(dev_out);
			goto out;	/* Wrong error code */
		}

		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
		    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
			if (!fl.fl4_src)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl.fl4_src) {
			if (ipv4_is_multicast(oldflp->fl4_dst))
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      fl.fl4_scope);
			else if (!oldflp->fl4_dst)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl.fl4_dst) {
		fl.fl4_dst = fl.fl4_src;
		if (!fl.fl4_dst)
			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
		if (dev_out)
			dev_put(dev_out);
		dev_out = net->loopback_dev;
		dev_hold(dev_out);
		fl.oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, &fl, &res)) {
		res.fi = NULL;
		if (oldflp->oif) {
			/* Apparently, the routing tables are wrong.
			 * Assume that the destination is on-link.
			 *
			 * WHY? DW.
			 * Because we are allowed to send to an iface even
			 * if it has NO routes and NO assigned addresses.
			 * When oif is specified, the routing tables are
			 * looked up with only one purpose: to catch whether
			 * the destination is gatewayed, rather than direct.
			 * Moreover, if MSG_DONTROUTE is set, we send the
			 * packet ignoring both routing tables and ifaddr
			 * state. --ANK
			 *
			 * We could make this work for an unknown oif too,
			 * likely as IPv6 does, but we do not.
			 */

			if (fl.fl4_src == 0)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		if (dev_out)
			dev_put(dev_out);
		err = -ENETUNREACH;
		goto out;
	}
	free_res = 1;

	if (res.type == RTN_LOCAL) {
		if (!fl.fl4_src)
			fl.fl4_src = fl.fl4_dst;
		if (dev_out)
			dev_put(dev_out);
		dev_out = net->loopback_dev;
		dev_hold(dev_out);
		fl.oif = dev_out->ifindex;
		if (res.fi)
			fib_info_put(res.fi);
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl.oif == 0)
		fib_select_multipath(&fl, &res);
	else
#endif
	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
		fib_select_default(net, &fl, &res);

	if (!fl.fl4_src)
		fl.fl4_src = FIB_RES_PREFSRC(res);

	if (dev_out)
		dev_put(dev_out);
	dev_out = FIB_RES_DEV(res);
	dev_hold(dev_out);
	fl.oif = dev_out->ifindex;


make_route:
	err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);


	if (free_res)
		fib_res_put(&res);
	if (dev_out)
		dev_put(dev_out);
out:	return err;
}

int __ip_route_output_key(struct net *net, struct rtable **rp,
			  const struct flowi *flp)
{
	unsigned hash;
	struct rtable *rth;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.dst.rt_next)) {
		if (rth->fl.fl4_dst == flp->fl4_dst &&
		    rth->fl.fl4_src == flp->fl4_src &&
		    rth->fl.iif == 0 &&
		    rth->fl.oif == flp->oif &&
		    rth->fl.mark == flp->mark &&
		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->u.dst.dev), net) &&
		    !rt_is_expired(rth)) {
			dst_use(&rth->u.dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			*rp = rth;
			return 0;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	return ip_route_output_slow(net, rp, flp);
}

EXPORT_SYMBOL_GPL(__ip_route_output_key);

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.protocol		=	cpu_to_be16(ETH_P_IP),
	.destroy		=	ipv4_dst_destroy,
	.check			=	ipv4_dst_check,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.entries		=	ATOMIC_INIT(0),
};


static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
{
	struct rtable *ort = *rp;
	struct rtable *rt = (struct rtable *)
		dst_alloc(&ipv4_dst_blackhole_ops);

	if (rt) {
		struct dst_entry *new = &rt->u.dst;

		atomic_set(&new->__refcnt, 1);
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;
		memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX * sizeof(u32));

		new->dev = ort->u.dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->fl = ort->fl;

		rt->idev = ort->idev;
		if (rt->idev)
			in_dev_hold(rt->idev);
		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_iif = ort->rt_iif;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);

		dst_free(new);
	}

	dst_release(&(*rp)->u.dst);
	*rp = rt;
	return (rt ? 0 : -ENOMEM);
}
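
/*
 * ip_route_output_flow() layers transformer (xfrm) lookup on top of
 * __ip_route_output_key(): when flp->proto is set, the route is passed
 * through __xfrm_lookup(), and an -EREMOTE answer is converted by
 * ipv4_dst_blackhole() above into a dst that keeps the flow's identity
 * but discards every packet via dst_discard.
 */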
int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
			 struct sock *sk, int flags)
{
	int err;

	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
		return err;

	if (flp->proto) {
		if (!flp->fl4_src)
			flp->fl4_src = (*rp)->rt_src;
		if (!flp->fl4_dst)
			flp->fl4_dst = (*rp)->rt_dst;
		err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
				    flags ? XFRM_LOOKUP_WAIT : 0);
		if (err == -EREMOTE)
			err = ipv4_dst_blackhole(net, rp, flp);

		return err;
	}

	return 0;
}

EXPORT_SYMBOL_GPL(ip_route_output_flow);

int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
{
	return ip_route_output_flow(net, rp, flp, NULL, 0);
}

static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	long expires;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	= AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->fl.fl4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->fl.fl4_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
	}
	if (rt->u.dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
#ifdef CONFIG_NET_CLS_ROUTE
	if (rt->u.dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
#endif
	if (rt->fl.iif)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->fl.fl4_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
		goto nla_put_failure;

	error = rt->u.dst.error;
	expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
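	/* The bound inetpeer, when present, supplies the peer's IP ID
	 * and cached TCP timestamp, which are reported back to userspace
	 * in the cache info attribute below.
	 */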
	if (rt->peer) {
		id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
		if (rt->peer->tcp_ts_stamp) {
			ts = rt->peer->tcp_ts;
			tsage = get_seconds() - rt->peer->tcp_ts_stamp;
		}
	}

	if (rt->fl.iif) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb, r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers; this skb can pass
	 * through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->u.dst.error)
			err = -rt->u.dst.error;
	} else {
		struct flowi fl = {
			.nl_u = {
				.ip4_u = {
					.daddr = dst,
					.saddr = src,
					.tos = rtm->rtm_tos,
				},
			},
			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
		};
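		/* No RTA_IIF given: resolve an output route from the
		 * netlink request, mirroring what a local sender with
		 * these flow parameters would get.
		 */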
		err = ip_route_output_key(net, &rt, &fl);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->u.dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}

int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			skb_dst_set(skb, dst_clone(&rt->u.dst));
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}

#ifdef CONFIG_SYSCTL
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
					void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		ctl_table ctl;
		struct net *net;

		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}

static void rt_secret_reschedule(int old)
{
	struct net *net;
	int new = ip_rt_secret_interval;
	int diff = new - old;

	if (!diff)
		return;

	rtnl_lock();
	for_each_net(net) {
		int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);

		if (!new)
			continue;

		if (deleted) {
			long time = net->ipv4.rt_secret_timer.expires - jiffies;

			if (time <= 0 || (time += diff) <= 0)
				time = 0;

			net->ipv4.rt_secret_timer.expires = time;
		} else
			net->ipv4.rt_secret_timer.expires = new;

		net->ipv4.rt_secret_timer.expires += jiffies;
		add_timer(&net->ipv4.rt_secret_timer);
	}
	rtnl_unlock();
}

static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
					  void __user *buffer, size_t *lenp,
					  loff_t *ppos)
{
	int old = ip_rt_secret_interval;
	int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);

	rt_secret_reschedule(old);

	return ret;
}

static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
"max_size", 3120 .data = &ip_rt_max_size, 3121 .maxlen = sizeof(int), 3122 .mode = 0644, 3123 .proc_handler = proc_dointvec, 3124 }, 3125 { 3126 /* Deprecated. Use gc_min_interval_ms */ 3127 3128 .procname = "gc_min_interval", 3129 .data = &ip_rt_gc_min_interval, 3130 .maxlen = sizeof(int), 3131 .mode = 0644, 3132 .proc_handler = proc_dointvec_jiffies, 3133 }, 3134 { 3135 .procname = "gc_min_interval_ms", 3136 .data = &ip_rt_gc_min_interval, 3137 .maxlen = sizeof(int), 3138 .mode = 0644, 3139 .proc_handler = proc_dointvec_ms_jiffies, 3140 }, 3141 { 3142 .procname = "gc_timeout", 3143 .data = &ip_rt_gc_timeout, 3144 .maxlen = sizeof(int), 3145 .mode = 0644, 3146 .proc_handler = proc_dointvec_jiffies, 3147 }, 3148 { 3149 .procname = "gc_interval", 3150 .data = &ip_rt_gc_interval, 3151 .maxlen = sizeof(int), 3152 .mode = 0644, 3153 .proc_handler = proc_dointvec_jiffies, 3154 }, 3155 { 3156 .procname = "redirect_load", 3157 .data = &ip_rt_redirect_load, 3158 .maxlen = sizeof(int), 3159 .mode = 0644, 3160 .proc_handler = proc_dointvec, 3161 }, 3162 { 3163 .procname = "redirect_number", 3164 .data = &ip_rt_redirect_number, 3165 .maxlen = sizeof(int), 3166 .mode = 0644, 3167 .proc_handler = proc_dointvec, 3168 }, 3169 { 3170 .procname = "redirect_silence", 3171 .data = &ip_rt_redirect_silence, 3172 .maxlen = sizeof(int), 3173 .mode = 0644, 3174 .proc_handler = proc_dointvec, 3175 }, 3176 { 3177 .procname = "error_cost", 3178 .data = &ip_rt_error_cost, 3179 .maxlen = sizeof(int), 3180 .mode = 0644, 3181 .proc_handler = proc_dointvec, 3182 }, 3183 { 3184 .procname = "error_burst", 3185 .data = &ip_rt_error_burst, 3186 .maxlen = sizeof(int), 3187 .mode = 0644, 3188 .proc_handler = proc_dointvec, 3189 }, 3190 { 3191 .procname = "gc_elasticity", 3192 .data = &ip_rt_gc_elasticity, 3193 .maxlen = sizeof(int), 3194 .mode = 0644, 3195 .proc_handler = proc_dointvec, 3196 }, 3197 { 3198 .procname = "mtu_expires", 3199 .data = &ip_rt_mtu_expires, 3200 .maxlen = sizeof(int), 3201 .mode = 0644, 3202 .proc_handler = proc_dointvec_jiffies, 3203 }, 3204 { 3205 .procname = "min_pmtu", 3206 .data = &ip_rt_min_pmtu, 3207 .maxlen = sizeof(int), 3208 .mode = 0644, 3209 .proc_handler = proc_dointvec, 3210 }, 3211 { 3212 .procname = "min_adv_mss", 3213 .data = &ip_rt_min_advmss, 3214 .maxlen = sizeof(int), 3215 .mode = 0644, 3216 .proc_handler = proc_dointvec, 3217 }, 3218 { 3219 .procname = "secret_interval", 3220 .data = &ip_rt_secret_interval, 3221 .maxlen = sizeof(int), 3222 .mode = 0644, 3223 .proc_handler = ipv4_sysctl_rt_secret_interval, 3224 }, 3225 { } 3226 }; 3227 3228 static struct ctl_table empty[1]; 3229 3230 static struct ctl_table ipv4_skeleton[] = 3231 { 3232 { .procname = "route", 3233 .mode = 0555, .child = ipv4_route_table}, 3234 { .procname = "neigh", 3235 .mode = 0555, .child = empty}, 3236 { } 3237 }; 3238 3239 static __net_initdata struct ctl_path ipv4_path[] = { 3240 { .procname = "net", }, 3241 { .procname = "ipv4", }, 3242 { }, 3243 }; 3244 3245 static struct ctl_table ipv4_route_flush_table[] = { 3246 { 3247 .procname = "flush", 3248 .maxlen = sizeof(int), 3249 .mode = 0200, 3250 .proc_handler = ipv4_sysctl_rtcache_flush, 3251 }, 3252 { }, 3253 }; 3254 3255 static __net_initdata struct ctl_path ipv4_route_path[] = { 3256 { .procname = "net", }, 3257 { .procname = "ipv4", }, 3258 { .procname = "route", }, 3259 { }, 3260 }; 3261 3262 static __net_init int sysctl_route_net_init(struct net *net) 3263 { 3264 struct ctl_table *tbl; 3265 3266 tbl = ipv4_route_flush_table; 3267 if (!net_eq(net, 
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr =
		register_net_sysctl_table(net, ipv4_route_path, tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif


static __net_init int rt_secret_timer_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid,
			(int) ((num_physpages ^ (num_physpages>>8)) ^
			(jiffies ^ (jiffies >> 7))));

	net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
	net->ipv4.rt_secret_timer.data = (unsigned long)net;
	init_timer_deferrable(&net->ipv4.rt_secret_timer);

	if (ip_rt_secret_interval) {
		net->ipv4.rt_secret_timer.expires =
			jiffies + net_random() % ip_rt_secret_interval +
			ip_rt_secret_interval;
		add_timer(&net->ipv4.rt_secret_timer);
	}
	return 0;
}

static __net_exit void rt_secret_timer_exit(struct net *net)
{
	del_timer_sync(&net->ipv4.rt_secret_timer);
}

static __net_initdata struct pernet_operations rt_secret_timer_ops = {
	.init = rt_secret_timer_init,
	.exit = rt_secret_timer_exit,
};


#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);

int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_NET_CLS_ROUTE
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct),
				    __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	/* All the timers started at system startup tend
	 * to synchronize.  Perturb it a bit.
	 */
	INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
	expires_ljiffies = jiffies;
	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

	if (register_pernet_subsys(&rt_secret_timer_ops))
		printk(KERN_ERR "Unable to setup rt_secret_timer\n");

	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	return rc;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif

EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);
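
/*
 * Illustrative sketch only (not part of the original file): a typical
 * in-kernel caller resolves and releases an output route roughly like
 * this, assuming hypothetical net/daddr/saddr variables in scope:
 *
 *	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = daddr,
 *						 .saddr = saddr } } };
 *	struct rtable *rt;
 *
 *	if (ip_route_output_key(net, &rt, &fl) == 0) {
 *		... use rt->u.dst, e.g. dst_mtu(&rt->u.dst) ...
 *		ip_rt_put(rt);
 *	}
 */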