/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

#define RT_FL_TOS(oldflp) \
	((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int ip_rt_secret_interval __read_mostly	= 10 * 60 * HZ;
static int rt_chain_length_max __read_mostly	= 20;

static struct delayed_work expires_work;
static unsigned long expires_ljiffies;
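/*
 * The ip_rt_* values above are tuning knobs for route cache garbage
 * collection and for ICMP redirect/error rate limiting; values built
 * from HZ are expressed in jiffies.
 */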
/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static void ipv4_dst_destroy(struct dst_entry *dst);
static void ipv4_dst_ifdown(struct dst_entry *dst,
			    struct net_device *dev, int how);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void ipv4_link_failure(struct sk_buff *skb);
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);
static void rt_emergency_hash_rebuild(struct net *net);


static struct dst_ops ipv4_dst_ops = {
	.family = AF_INET,
	.protocol = cpu_to_be16(ETH_P_IP),
	.gc = rt_garbage_collect,
	.check = ipv4_dst_check,
	.destroy = ipv4_dst_destroy,
	.ifdown = ipv4_dst_ifdown,
	.negative_advice = ipv4_negative_advice,
	.link_failure = ipv4_link_failure,
	.update_pmtu = ip_rt_update_pmtu,
	.local_out = __ip_local_out,
	.entries = ATOMIC_INIT(0),
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(FILLER),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
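/*
 * Note: ip_tos2prio[] maps the IPv4 TOS field onto a packet scheduler
 * priority band.  ECN_OR_COST(class) expands to TC_PRIO_##class, so two
 * slots differing only in the low (ECN/cost) bit of the index resolve
 * to the same band.
 */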
/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

struct rt_hash_bucket {
	struct rtable	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 * The size of this table is a power of two and depends on the number of CPUS.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif

static struct rt_hash_bucket	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) \
	(__raw_get_cpu_var(rt_cache_stat).field++)

static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)(__be32)(daddr),
			    (__force u32)(__be32)(saddr),
			    idx, genid)
		& rt_hash_mask;
}

static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rt_hash_table[st->bucket].chain)
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference(r->u.dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = r->u.dst.rt_next;
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rt_hash_table[st->bucket].chain);
		rcu_read_lock_bh();
		r = rt_hash_table[st->bucket].chain;
	}
	return rcu_dereference(r);
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->u.dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}
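/*
 * Note on the iterators above: rt_cache_get_first() returns with
 * rcu_read_lock_bh() held on a non-empty bucket, __rt_cache_get_next()
 * re-acquires it when it moves to another bucket, and the matching
 * unlock happens either there or in rt_cache_seq_stop() below, so the
 * /proc/net/rt_cache walk never touches a chain outside an RCU-BH
 * read-side section.
 */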
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		int len;

		seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
			   "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			r->u.dst.dev ? r->u.dst.dev->name : "*",
			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
			r->u.dst.__use, 0, (unsigned long)r->rt_src,
			(dst_metric(&r->u.dst, RTAX_ADVMSS) ?
			     (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
			dst_metric(&r->u.dst, RTAX_WINDOW),
			(int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->u.dst, RTAX_RTTVAR)),
			r->fl.fl4_tos,
			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
			r->u.dst.hh ? (r->u.dst.hh->hh_output ==
				       dev_queue_xmit) : 0,
			r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start = rt_cache_seq_start,
	.next = rt_cache_seq_next,
	.stop = rt_cache_seq_stop,
	.show = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
" %08x %08x %08x %08x %08x %08x %08x %08x %08x \n", 471 atomic_read(&ipv4_dst_ops.entries), 472 st->in_hit, 473 st->in_slow_tot, 474 st->in_slow_mc, 475 st->in_no_route, 476 st->in_brd, 477 st->in_martian_dst, 478 st->in_martian_src, 479 480 st->out_hit, 481 st->out_slow_tot, 482 st->out_slow_mc, 483 484 st->gc_total, 485 st->gc_ignored, 486 st->gc_goal_miss, 487 st->gc_dst_overflow, 488 st->in_hlist_search, 489 st->out_hlist_search 490 ); 491 return 0; 492 } 493 494 static const struct seq_operations rt_cpu_seq_ops = { 495 .start = rt_cpu_seq_start, 496 .next = rt_cpu_seq_next, 497 .stop = rt_cpu_seq_stop, 498 .show = rt_cpu_seq_show, 499 }; 500 501 502 static int rt_cpu_seq_open(struct inode *inode, struct file *file) 503 { 504 return seq_open(file, &rt_cpu_seq_ops); 505 } 506 507 static const struct file_operations rt_cpu_seq_fops = { 508 .owner = THIS_MODULE, 509 .open = rt_cpu_seq_open, 510 .read = seq_read, 511 .llseek = seq_lseek, 512 .release = seq_release, 513 }; 514 515 #ifdef CONFIG_NET_CLS_ROUTE 516 static int ip_rt_acct_read(char *buffer, char **start, off_t offset, 517 int length, int *eof, void *data) 518 { 519 unsigned int i; 520 521 if ((offset & 3) || (length & 3)) 522 return -EIO; 523 524 if (offset >= sizeof(struct ip_rt_acct) * 256) { 525 *eof = 1; 526 return 0; 527 } 528 529 if (offset + length >= sizeof(struct ip_rt_acct) * 256) { 530 length = sizeof(struct ip_rt_acct) * 256 - offset; 531 *eof = 1; 532 } 533 534 offset /= sizeof(u32); 535 536 if (length > 0) { 537 u32 *dst = (u32 *) buffer; 538 539 *start = buffer; 540 memset(dst, 0, length); 541 542 for_each_possible_cpu(i) { 543 unsigned int j; 544 u32 *src; 545 546 src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset; 547 for (j = 0; j < length/4; j++) 548 dst[j] += src[j]; 549 } 550 } 551 return length; 552 } 553 #endif 554 555 static int __net_init ip_rt_do_proc_init(struct net *net) 556 { 557 struct proc_dir_entry *pde; 558 559 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO, 560 &rt_cache_seq_fops); 561 if (!pde) 562 goto err1; 563 564 pde = proc_create("rt_cache", S_IRUGO, 565 net->proc_net_stat, &rt_cpu_seq_fops); 566 if (!pde) 567 goto err2; 568 569 #ifdef CONFIG_NET_CLS_ROUTE 570 pde = create_proc_read_entry("rt_acct", 0, net->proc_net, 571 ip_rt_acct_read, NULL); 572 if (!pde) 573 goto err3; 574 #endif 575 return 0; 576 577 #ifdef CONFIG_NET_CLS_ROUTE 578 err3: 579 remove_proc_entry("rt_cache", net->proc_net_stat); 580 #endif 581 err2: 582 remove_proc_entry("rt_cache", net->proc_net); 583 err1: 584 return -ENOMEM; 585 } 586 587 static void __net_exit ip_rt_do_proc_exit(struct net *net) 588 { 589 remove_proc_entry("rt_cache", net->proc_net_stat); 590 remove_proc_entry("rt_cache", net->proc_net); 591 remove_proc_entry("rt_acct", net->proc_net); 592 } 593 594 static struct pernet_operations ip_rt_proc_ops __net_initdata = { 595 .init = ip_rt_do_proc_init, 596 .exit = ip_rt_do_proc_exit, 597 }; 598 599 static int __init ip_rt_proc_init(void) 600 { 601 return register_pernet_subsys(&ip_rt_proc_ops); 602 } 603 604 #else 605 static inline int ip_rt_proc_init(void) 606 { 607 return 0; 608 } 609 #endif /* CONFIG_PROC_FS */ 610 611 static inline void rt_free(struct rtable *rt) 612 { 613 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); 614 } 615 616 static inline void rt_drop(struct rtable *rt) 617 { 618 ip_rt_put(rt); 619 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); 620 } 621 622 static inline int rt_fast_clean(struct rtable *rth) 623 { 624 /* Kill broadcast/multicast entries very aggresively, if they 625 
static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rth->fl.iif && rth->u.dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		rth->u.dst.expires;
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->u.dst.__refcnt))
		goto out;

	ret = 1;
	if (rth->u.dst.expires &&
	    time_after_eq(jiffies, rth->u.dst.expires))
		goto out;

	age = jiffies - rth->u.dst.lastuse;
	ret = 0;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->u.dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (!rt->fl.iif ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}

static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}

static inline bool compare_hash_inputs(const struct flowi *fl1,
					const struct flowi *fl2)
{
	return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
		(fl1->iif ^ fl2->iif)) == 0);
}

static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
{
	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
		(fl1->mark ^ fl2->mark) |
		(*(u16 *)&fl1->nl_u.ip4_u.tos ^
		 *(u16 *)&fl2->nl_u.ip4_u.tos) |
		(fl1->oif ^ fl2->oif) |
		(fl1->iif ^ fl2->iif)) == 0;
}

static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
}
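/*
 * Note: compare_hash_inputs() checks only the fields that feed rt_hash()
 * (daddr, saddr, iif), i.e. whether two flows necessarily share a hash
 * chain, while compare_keys() matches the full flow key (plus mark, TOS
 * and oif) and decides a real cache hit.  rt_caching() reports whether
 * caching is still enabled for the namespace, that is, the emergency
 * rebuild counter has not yet passed the sysctl limit.
 */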
/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;
	struct rtable *tail;

	for (i = 0; i <= rt_hash_mask; i++) {
		if (process_context && need_resched())
			cond_resched();
		rth = rt_hash_table[i].chain;
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));
#ifdef CONFIG_NET_NS
		{
		struct rtable **prev, *p;

		rth = rt_hash_table[i].chain;

		/* defer releasing the head of the list after spin_unlock */
		for (tail = rth; tail; tail = tail->u.dst.rt_next)
			if (!rt_is_expired(tail))
				break;
		if (rth != tail)
			rt_hash_table[i].chain = tail;

		/* call rt_free on entries after the tail requiring flush */
		prev = &rt_hash_table[i].chain;
		for (p = *prev; p; p = next) {
			next = p->u.dst.rt_next;
			if (!rt_is_expired(p)) {
				prev = &p->u.dst.rt_next;
			} else {
				*prev = next;
				rt_free(p);
			}
		}
		}
#else
		rth = rt_hash_table[i].chain;
		rt_hash_table[i].chain = NULL;
		tail = NULL;
#endif
		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; rth != tail; rth = next) {
			next = rth->u.dst.rt_next;
			rt_free(rth);
		}
	}
}

/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This is to have an estimation of rt_chain_length_max:
 *   rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for fractional part, and 29 (or 61) for magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)
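/*
 * Worked example of the fixed-point estimate used by rt_check_expire():
 * with FRACT_BITS = 3, a chain counted as length 1 contributes ONE = 8
 * and a chain of length 3 contributes 24.  For those two samples,
 * sum = 32 and sum2 = 640, so avg = 16, sd = int_sqrt(320 - 256) = 8 and
 * (avg + 4*sd) >> FRACT_BITS = 6; rt_chain_length_max then stays at
 * max(ip_rt_gc_elasticity, 6) = 8 with the default elasticity of 8.
 */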
static void rt_check_expire(void)
{
	static unsigned int rover;
	unsigned int i = rover, goal;
	struct rtable *rth, *aux, **rthp;
	unsigned long samples = 0;
	unsigned long sum = 0, sum2 = 0;
	unsigned long delta;
	u64 mult;

	delta = jiffies - expires_ljiffies;
	expires_ljiffies = jiffies;
	mult = ((u64)delta) << rt_hash_log;
	if (ip_rt_gc_timeout > 1)
		do_div(mult, ip_rt_gc_timeout);
	goal = (unsigned int)mult;
	if (goal > rt_hash_mask)
		goal = rt_hash_mask + 1;
	for (; goal > 0; goal--) {
		unsigned long tmo = ip_rt_gc_timeout;
		unsigned long length;

		i = (i + 1) & rt_hash_mask;
		rthp = &rt_hash_table[i].chain;

		if (need_resched())
			cond_resched();

		samples++;

		if (*rthp == NULL)
			continue;
		length = 0;
		spin_lock_bh(rt_hash_lock_addr(i));
		while ((rth = *rthp) != NULL) {
			prefetch(rth->u.dst.rt_next);
			if (rt_is_expired(rth)) {
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				continue;
			}
			if (rth->u.dst.expires) {
				/* Entry is expired even if it is in use */
				if (time_before_eq(jiffies, rth->u.dst.expires)) {
nofree:
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					/*
					 * We only count entries on
					 * a chain with equal hash inputs once
					 * so that entries for different QOS
					 * levels, and other non-hash input
					 * attributes don't unfairly skew
					 * the length computation
					 */
					for (aux = rt_hash_table[i].chain;;) {
						if (aux == rth) {
							length += ONE;
							break;
						}
						if (compare_hash_inputs(&aux->fl, &rth->fl))
							break;
						aux = aux->u.dst.rt_next;
					}
					continue;
				}
			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
				goto nofree;

			/* Cleanup aged off entries. */
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
		}
		spin_unlock_bh(rt_hash_lock_addr(i));
		sum += length;
		sum2 += length*length;
	}
	if (samples) {
		unsigned long avg = sum / samples;
		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
		rt_chain_length_max = max_t(unsigned long,
					ip_rt_gc_elasticity,
					(avg + 4*sd) >> FRACT_BITS);
	}
	rover = i;
}

/*
 * rt_worker_func() is run in process context.
 * we call rt_check_expire() to scan part of the hash table
 */
static void rt_worker_func(struct work_struct *work)
{
	rt_check_expire();
	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
}

/*
 * Perturbation of rt_genid by a small quantity [1..256]
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without giving recent rt_genid.
 * Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
}

/*
 * delay < 0  : invalidate cache (fast : entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(!in_softirq());
}

/*
 * We change rt_genid and let gc do the cleanup
 */
static void rt_secret_rebuild(unsigned long __net)
{
	struct net *net = (struct net *)__net;
	rt_cache_invalidate(net);
	mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
}

static void rt_secret_rebuild_oneshot(struct net *net)
{
	del_timer_sync(&net->ipv4.rt_secret_timer);
	rt_cache_invalidate(net);
	if (ip_rt_secret_interval) {
		net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
		add_timer(&net->ipv4.rt_secret_timer);
	}
}

static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit()) {
		printk(KERN_WARNING "Route hash chain too long!\n");
		printk(KERN_WARNING "Adjust your secret_interval!\n");
	}

	rt_secret_rebuild_oneshot(net);
}

/*
   Short description of GC goals.

   We want to build an algorithm which will keep the routing cache
   at some equilibrium point, where the number of aged off entries
   is kept approximately equal to newly generated ones.

   Current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle expire is large enough to keep enough warm entries,
   and when load increases it reduces to limit cache size.
 */
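/*
 * Concretely, in rt_garbage_collect() below "expire" is halved each time
 * a pass over the table fails to free the requested number of entries,
 * and on success (work_done) it grows again by ip_rt_gc_min_interval,
 * being reset to ip_rt_gc_timeout once it exceeds that bound or the
 * cache shrinks below gc_thresh.
 */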
static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth, **rthp;
	unsigned long now = jiffies;
	int goal;

	/*
	 * Garbage collection is pretty expensive,
	 * do not make it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	/* Calculate number of entries, which we want to expire now. */
	goal = atomic_read(&ipv4_dst_ops.entries) -
		(ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = *rthp) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->u.dst.rt_next;
					continue;
				}
				*rthp = rth->u.dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop process if:

		   - expire reduced to zero. Otherwise, expire is halved.
		   - table is not full.
		   - we are called from interrupt.
		   - jiffies check is just fallback/debug loop breaker.
		     We will not spin here for long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;
#if RT_CACHE_DEBUG >= 2
		printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
				atomic_read(&ipv4_dst_ops.entries), goal, i);
#endif

		if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
#if RT_CACHE_DEBUG >= 2
	printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
			atomic_read(&ipv4_dst_ops.entries), goal, rover);
#endif
out:	return 0;
}
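/*
 * rt_intern_hash() below inserts a freshly built route into its hash
 * chain.  If an equivalent entry is already cached, that entry is moved
 * to the head of the chain and returned instead; otherwise the
 * lowest-scoring unreferenced entry may be evicted once the chain grows
 * past ip_rt_gc_elasticity, and a chain longer than rt_chain_length_max
 * triggers the emergency hash rebuild.
 */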
static int rt_intern_hash(unsigned hash, struct rtable *rt,
			  struct rtable **rp, struct sk_buff *skb)
{
	struct rtable *rth, **rthp;
	unsigned long now;
	struct rtable *cand, **candp;
	u32 min_score;
	int chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->u.dst.dev))) {
		rt_drop(rt);
		return 0;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = *rthp) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->u.dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->u.dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->u.dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->u.dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (rp)
				*rp = rth;
			else
				skb_dst_set(skb, &rth->u.dst);
			return 0;
		}

		if (!atomic_read(&rth->u.dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->u.dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be average length of chain
		 * length, when exceeded gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->u.dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max) {
			struct net *net = dev_net(rt->u.dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(dev_net(rt->u.dst.dev))) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->u.dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
		}
	}

	/* Try to bind route to arp only if it is output
	   route or unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
		int err = arp_bind_neighbour(&rt->u.dst);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return err;
			}

			/* Neighbour tables are full and nothing
			   can be released. Try to shrink route cache,
			   it is most likely it holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity = 1;
				ip_rt_gc_min_interval = 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval = saved_int;
				ip_rt_gc_elasticity = saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "Neighbour table overflow.\n");
			rt_drop(rt);
			return -ENOBUFS;
		}
	}

	rt->u.dst.rt_next = rt_hash_table[hash].chain;

#if RT_CACHE_DEBUG >= 2
	if (rt->u.dst.rt_next) {
		struct rtable *trt;
		printk(KERN_DEBUG "rt_cache @%02x: %pI4", hash, &rt->rt_dst);
		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
			printk(" . %pI4", &trt->rt_dst);
		printk("\n");
	}
#endif
	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUs.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));
	if (rp)
		*rp = rt;
	else
		skb_dst_set(skb, &rt->u.dst);
	return 0;
}

void rt_bind_peer(struct rtable *rt, int create)
{
	static DEFINE_SPINLOCK(rt_peer_lock);
	struct inet_peer *peer;

	peer = inet_getpeer(rt->rt_dst, create);

	spin_lock_bh(&rt_peer_lock);
	if (rt->peer == NULL) {
		rt->peer = peer;
		peer = NULL;
	}
	spin_unlock_bh(&rt_peer_lock);
	if (peer)
		inet_putpeer(peer);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we still can generate some output.
 * Random ID selection looks a bit dangerous because we have no chances to
 * select ID being unique in a reasonable period of time.
 * But broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, 1);

		/* If peer is attached to destination, it is never detached,
		   so that we need not to grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}

static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable **rthp, *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = *rthp) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->u.dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->u.dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}
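/*
 * ip_rt_redirect() below handles an incoming ICMP redirect: the new
 * gateway is rejected if it is a multicast, limited-broadcast or zero
 * address, if it is not on-link (unless the medium is shared) or if
 * caching is disabled for the namespace; otherwise every matching cache
 * entry is cloned with rt_gateway replaced, bound to a neighbour entry,
 * and the old entry is removed via rt_del().
 */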
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int i, k;
	struct in_device *in_dev = in_dev_get(dev);
	struct rtable *rth, **rthp;
	__be32 skeys[2] = { saddr, 0 };
	int ikeys[2] = { dev->ifindex, 0 };
	struct netevent_redirect netevent;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
	    || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
	    || ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!rt_caching(net))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (i = 0; i < 2; i++) {
		for (k = 0; k < 2; k++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
						rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			rcu_read_lock();
			while ((rth = rcu_dereference(*rthp)) != NULL) {
				struct rtable *rt;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0 ||
				    rt_is_expired(rth) ||
				    !net_eq(dev_net(rth->u.dst.dev), net)) {
					rthp = &rth->u.dst.rt_next;
					continue;
				}

				if (rth->rt_dst != daddr ||
				    rth->rt_src != saddr ||
				    rth->u.dst.error ||
				    rth->rt_gateway != old_gw ||
				    rth->u.dst.dev != dev)
					break;

				dst_hold(&rth->u.dst);
				rcu_read_unlock();

				rt = dst_alloc(&ipv4_dst_ops);
				if (rt == NULL) {
					ip_rt_put(rth);
					in_dev_put(in_dev);
					return;
				}

				/* Copy all the information. */
				*rt = *rth;
				rt->u.dst.__use = 1;
				atomic_set(&rt->u.dst.__refcnt, 1);
				rt->u.dst.child = NULL;
				if (rt->u.dst.dev)
					dev_hold(rt->u.dst.dev);
				if (rt->idev)
					in_dev_hold(rt->idev);
				rt->u.dst.obsolete = 0;
				rt->u.dst.lastuse = jiffies;
				rt->u.dst.path = &rt->u.dst;
				rt->u.dst.neighbour = NULL;
				rt->u.dst.hh = NULL;
#ifdef CONFIG_XFRM
				rt->u.dst.xfrm = NULL;
#endif
				rt->rt_genid = rt_genid(net);
				rt->rt_flags |= RTCF_REDIRECTED;

				/* Gateway is different ... */
				rt->rt_gateway = new_gw;

				/* Redirect received -> path was valid */
				dst_confirm(&rth->u.dst);

				if (rt->peer)
					atomic_inc(&rt->peer->refcnt);

				if (arp_bind_neighbour(&rt->u.dst) ||
				    !(rt->u.dst.neighbour->nud_state &
					    NUD_VALID)) {
					if (rt->u.dst.neighbour)
						neigh_event_send(rt->u.dst.neighbour, NULL);
					ip_rt_put(rth);
					rt_drop(rt);
					goto do_next;
				}

				netevent.old = &rth->u.dst;
				netevent.new = &rt->u.dst;
				call_netevent_notifiers(NETEVENT_REDIRECT,
							&netevent);

				rt_del(hash, rth);
				if (!rt_intern_hash(hash, rt, &rt, NULL))
					ip_rt_put(rt);
				goto do_next;
			}
			rcu_read_unlock();
		do_next:
			;
		}
	}
	in_dev_put(in_dev);
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			" Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	in_dev_put(in_dev);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->u.dst.expires) {
			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
						rt->fl.oif,
						rt_genid(dev_net(dst->dev)));
#if RT_CACHE_DEBUG >= 1
			printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
				&rt->rt_dst, rt->fl.fl4_tos);
#endif
			rt_del(hash, rt);
			ret = NULL;
		}
	}
	return ret;
}
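/*
 * Note: ipv4_negative_advice() implements the dst_ops->negative_advice
 * hook.  An obsolete entry is simply released, while an entry created
 * by a redirect or carrying an expiry is unhashed so that the next
 * lookup takes the slow path again.
 */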
/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);

	if (!in_dev)
		return;

	if (!IN_DEV_TX_REDIRECTS(in_dev))
		goto out;

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
		rt->u.dst.rate_tokens = 0;

	/* Too many ignored redirects; do not send anything
	 * set u.dst.rate_last to the last seen redirected packet.
	 */
	if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
		rt->u.dst.rate_last = jiffies;
		goto out;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (rt->u.dst.rate_tokens == 0 ||
	    time_after(jiffies,
		       (rt->u.dst.rate_last +
			(ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		rt->u.dst.rate_last = jiffies;
		++rt->u.dst.rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (IN_DEV_LOG_MARTIANS(in_dev) &&
		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
				&rt->rt_src, rt->rt_iif,
				&rt->rt_dst, &rt->rt_gateway);
#endif
	}
out:
	in_dev_put(in_dev);
}

static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	unsigned long now;
	int code;

	switch (rt->u.dst.error) {
		case EINVAL:
		default:
			goto out;
		case EHOSTUNREACH:
			code = ICMP_HOST_UNREACH;
			break;
		case ENETUNREACH:
			code = ICMP_NET_UNREACH;
			IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
					IPSTATS_MIB_INNOROUTES);
			break;
		case EACCES:
			code = ICMP_PKT_FILTERED;
			break;
	}

	now = jiffies;
	rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
	if (rt->u.dst.rate_tokens > ip_rt_error_burst)
		rt->u.dst.rate_tokens = ip_rt_error_burst;
	rt->u.dst.rate_last = now;
	if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
		rt->u.dst.rate_tokens -= ip_rt_error_cost;
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
	}

out:	kfree_skb(skb);
	return 0;
}
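/*
 * The rate_tokens handling in ip_error() above is a token bucket kept
 * in jiffies: tokens accrue with elapsed time up to ip_rt_error_burst
 * (5 * HZ by default), and each ICMP_DEST_UNREACH sent costs
 * ip_rt_error_cost (HZ), so once the burst is spent roughly one error
 * per second is emitted per cached route.
 */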
/*
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}

unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	int i, k;
	unsigned short old_mtu = ntohs(iph->tot_len);
	struct rtable *rth;
	int ikeys[2] = { dev->ifindex, 0 };
	__be32 skeys[2] = { iph->saddr, 0, };
	__be32 daddr = iph->daddr;
	unsigned short est_mtu = 0;

	if (ipv4_config.no_pmtu_disc)
		return 0;

	for (k = 0; k < 2; k++) {
		for (i = 0; i < 2; i++) {
			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
						rt_genid(net));

			rcu_read_lock();
			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
			     rth = rcu_dereference(rth->u.dst.rt_next)) {
				unsigned short mtu = new_mtu;

				if (rth->fl.fl4_dst != daddr ||
				    rth->fl.fl4_src != skeys[i] ||
				    rth->rt_dst != daddr ||
				    rth->rt_src != iph->saddr ||
				    rth->fl.oif != ikeys[k] ||
				    rth->fl.iif != 0 ||
				    dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
				    !net_eq(dev_net(rth->u.dst.dev), net) ||
				    rt_is_expired(rth))
					continue;

				if (new_mtu < 68 || new_mtu >= old_mtu) {

					/* BSD 4.2 compatibility hack :-( */
					if (mtu == 0 &&
					    old_mtu >= dst_mtu(&rth->u.dst) &&
					    old_mtu >= 68 + (iph->ihl << 2))
						old_mtu -= iph->ihl << 2;

					mtu = guess_mtu(old_mtu);
				}
				if (mtu <= dst_mtu(&rth->u.dst)) {
					if (mtu < dst_mtu(&rth->u.dst)) {
						dst_confirm(&rth->u.dst);
						if (mtu < ip_rt_min_pmtu) {
							mtu = ip_rt_min_pmtu;
							rth->u.dst.metrics[RTAX_LOCK-1] |=
								(1 << RTAX_MTU);
						}
						rth->u.dst.metrics[RTAX_MTU-1] = mtu;
						dst_set_expires(&rth->u.dst,
							ip_rt_mtu_expires);
					}
					est_mtu = mtu;
				}
			}
			rcu_read_unlock();
		}
	}
	return est_mtu ? : new_mtu;
}
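/*
 * Note: when a Fragmentation Needed message carries no usable next-hop
 * MTU (new_mtu of 0 or not below the original packet length),
 * guess_mtu() above falls back to the next lower plateau.  The learned
 * value is clamped below by ip_rt_min_pmtu, in which case the MTU
 * metric is locked so later ICMP cannot shrink it further, and it
 * expires after ip_rt_mtu_expires.
 */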
static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	if (dst_mtu(dst) > mtu && mtu >= 68 &&
	    !(dst_metric_locked(dst, RTAX_MTU))) {
		if (mtu < ip_rt_min_pmtu) {
			mtu = ip_rt_min_pmtu;
			dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
		}
		dst->metrics[RTAX_MTU-1] = mtu;
		dst_set_expires(dst, ip_rt_mtu_expires);
		call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
	}
}

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;
	struct in_device *idev = rt->idev;

	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}

	if (idev) {
		rt->idev = NULL;
		in_dev_put(idev);
	}
}

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
	struct rtable *rt = (struct rtable *) dst;
	struct in_device *idev = rt->idev;
	if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
		struct in_device *loopback_idev =
			in_dev_get(dev_net(dev)->loopback_dev);
		if (loopback_idev) {
			rt->idev = loopback_idev;
			in_dev_put(idev);
		}
	}
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->u.dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	return 0;
}

/*
   We do not cache source address of outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be not aligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct rtable *rt)
{
	__be32 src;
	struct fib_result res;

	if (rt->fl.iif == 0)
		src = rt->rt_src;
	else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
		src = FIB_RES_PREFSRC(res);
		fib_res_put(&res);
	} else
		src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
					RT_SCOPE_UNIVERSE);
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_NET_CLS_ROUTE
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->u.dst.tclassid & 0xFFFF))
		rt->u.dst.tclassid |= tag & 0xFFFF;
	if (!(rt->u.dst.tclassid & 0xFFFF0000))
		rt->u.dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
{
	struct fib_info *fi = res->fi;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		memcpy(rt->u.dst.metrics, fi->fib_metrics,
		       sizeof(rt->u.dst.metrics));
		if (fi->fib_mtu == 0) {
			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
			if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
			    rt->rt_gateway != rt->rt_dst &&
			    rt->u.dst.dev->mtu > 576)
				rt->u.dst.metrics[RTAX_MTU-1] = 576;
		}
#ifdef CONFIG_NET_CLS_ROUTE
		rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	} else
		rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;

	if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
	if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
				       ip_rt_min_advmss);
	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;

#ifdef CONFIG_NET_CLS_ROUTE
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
	rt->rt_type = res->type;
}
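/*
 * Summary of the defaults chosen by rt_set_nexthop(): FIB metrics are
 * copied verbatim; an unset MTU falls back to the output device MTU
 * (capped at IP_MAX_MTU, and dropped to 576 for locked-MTU gatewayed
 * routes); the hop limit defaults to sysctl_ip_default_ttl; and the
 * advertised MSS defaults to dev->mtu - 40, bounded below by
 * ip_rt_min_advmss and above by 65535 - 40.
 */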
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
				u8 tos, struct net_device *dev, int our)
{
	unsigned hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = in_dev_get(dev);
	u32 itag = 0;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else if (fib_validate_source(saddr, 0, tos, 0,
					dev, &spec_dst, &itag) < 0)
		goto e_inval;

	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth)
		goto e_nobufs;

	rth->u.dst.output = ip_rt_bug;

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	rth->fl.fl4_dst = daddr;
	rth->rt_dst = daddr;
	rth->fl.fl4_tos = tos;
	rth->fl.mark = skb->mark;
	rth->fl.fl4_src = saddr;
	rth->rt_src = saddr;
#ifdef CONFIG_NET_CLS_ROUTE
	rth->u.dst.tclassid = itag;
#endif
	rth->rt_iif = rth->fl.iif = dev->ifindex;
	rth->u.dst.dev = init_net.loopback_dev;
	dev_hold(rth->u.dst.dev);
	rth->idev = in_dev_get(rth->u.dst.dev);
	rth->fl.oif = 0;
	rth->rt_gateway = daddr;
	rth->rt_spec_dst = spec_dst;
	rth->rt_genid = rt_genid(dev_net(dev));
	rth->rt_flags = RTCF_MULTICAST;
	rth->rt_type = RTN_MULTICAST;
	if (our) {
		rth->u.dst.input = ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->u.dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	in_dev_put(in_dev);
	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	return rt_intern_hash(hash, rth, NULL, skb);

e_nobufs:
	in_dev_put(in_dev);
	return -ENOBUFS;

e_inval:
	in_dev_put(in_dev);
	return -EINVAL;
}


static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation, if source is martian,
		 *	the only hint is MAC header.
		 */
		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}

static int __mkroute_input(struct sk_buff *skb,
			   struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{

	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = in_dev_get(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please, report\n");
		return -EINVAL;
	}

	err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		err = -EINVAL;
		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 */
		if (out_dev == in_dev) {
			err = -EINVAL;
			goto cleanup;
		}
	}


	rth = dst_alloc(&ipv4_dst_ops);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	atomic_set(&rth->u.dst.__refcnt, 1);
	rth->u.dst.flags = DST_HOST;
	if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
		rth->u.dst.flags |= DST_NOPOLICY;
	if (IN_DEV_CONF_GET(out_dev, NOXFRM))
		rth->u.dst.flags |= DST_NOXFRM;
	rth->fl.fl4_dst = daddr;
	rth->rt_dst = daddr;
	rth->fl.fl4_tos = tos;
	rth->fl.mark = skb->mark;
	rth->fl.fl4_src = saddr;
	rth->rt_src = saddr;
	rth->rt_gateway = daddr;
	rth->rt_iif = rth->fl.iif = in_dev->dev->ifindex;
	rth->u.dst.dev = (out_dev)->dev;
	dev_hold(rth->u.dst.dev);
	rth->idev = in_dev_get(rth->u.dst.dev);
	rth->fl.oif = 0;
	rth->rt_spec_dst = spec_dst;

	rth->u.dst.input = ip_forward;
	rth->u.dst.output = ip_output;
	rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));

	rt_set_nexthop(rth, res, itag);

	rth->rt_flags = flags;

	*result = rth;
	err = 0;
 cleanup:
	/* release the working reference to the output device */
	in_dev_put(out_dev);
	return err;
}

static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi *fl,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
		fib_select_multipath(fl, res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl->iif,
		       rt_genid(dev_net(rth->u.dst.dev)));
	return rt_intern_hash(hash, rth, NULL, skb);
}

/*
 *	NOTE. We drop all the packets that have local source
 *	addresses, because every properly looped back packet
 *	must have correct destination already attached by output routine.
 *
 *	Such approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with 100% guarantee.
2043 */ 2044 2045 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2046 u8 tos, struct net_device *dev) 2047 { 2048 struct fib_result res; 2049 struct in_device *in_dev = in_dev_get(dev); 2050 struct flowi fl = { .nl_u = { .ip4_u = 2051 { .daddr = daddr, 2052 .saddr = saddr, 2053 .tos = tos, 2054 .scope = RT_SCOPE_UNIVERSE, 2055 } }, 2056 .mark = skb->mark, 2057 .iif = dev->ifindex }; 2058 unsigned flags = 0; 2059 u32 itag = 0; 2060 struct rtable * rth; 2061 unsigned hash; 2062 __be32 spec_dst; 2063 int err = -EINVAL; 2064 int free_res = 0; 2065 struct net * net = dev_net(dev); 2066 2067 /* IP on this device is disabled. */ 2068 2069 if (!in_dev) 2070 goto out; 2071 2072 /* Check for the most weird martians, which can be not detected 2073 by fib_lookup. 2074 */ 2075 2076 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) || 2077 ipv4_is_loopback(saddr)) 2078 goto martian_source; 2079 2080 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0)) 2081 goto brd_input; 2082 2083 /* Accept zero addresses only to limited broadcast; 2084 * I even do not know to fix it or not. Waiting for complains :-) 2085 */ 2086 if (ipv4_is_zeronet(saddr)) 2087 goto martian_source; 2088 2089 if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) || 2090 ipv4_is_loopback(daddr)) 2091 goto martian_destination; 2092 2093 /* 2094 * Now we are ready to route packet. 2095 */ 2096 if ((err = fib_lookup(net, &fl, &res)) != 0) { 2097 if (!IN_DEV_FORWARD(in_dev)) 2098 goto e_hostunreach; 2099 goto no_route; 2100 } 2101 free_res = 1; 2102 2103 RT_CACHE_STAT_INC(in_slow_tot); 2104 2105 if (res.type == RTN_BROADCAST) 2106 goto brd_input; 2107 2108 if (res.type == RTN_LOCAL) { 2109 int result; 2110 result = fib_validate_source(saddr, daddr, tos, 2111 net->loopback_dev->ifindex, 2112 dev, &spec_dst, &itag); 2113 if (result < 0) 2114 goto martian_source; 2115 if (result) 2116 flags |= RTCF_DIRECTSRC; 2117 spec_dst = daddr; 2118 goto local_input; 2119 } 2120 2121 if (!IN_DEV_FORWARD(in_dev)) 2122 goto e_hostunreach; 2123 if (res.type != RTN_UNICAST) 2124 goto martian_destination; 2125 2126 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos); 2127 done: 2128 in_dev_put(in_dev); 2129 if (free_res) 2130 fib_res_put(&res); 2131 out: return err; 2132 2133 brd_input: 2134 if (skb->protocol != htons(ETH_P_IP)) 2135 goto e_inval; 2136 2137 if (ipv4_is_zeronet(saddr)) 2138 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); 2139 else { 2140 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, 2141 &itag); 2142 if (err < 0) 2143 goto martian_source; 2144 if (err) 2145 flags |= RTCF_DIRECTSRC; 2146 } 2147 flags |= RTCF_BROADCAST; 2148 res.type = RTN_BROADCAST; 2149 RT_CACHE_STAT_INC(in_brd); 2150 2151 local_input: 2152 rth = dst_alloc(&ipv4_dst_ops); 2153 if (!rth) 2154 goto e_nobufs; 2155 2156 rth->u.dst.output= ip_rt_bug; 2157 rth->rt_genid = rt_genid(net); 2158 2159 atomic_set(&rth->u.dst.__refcnt, 1); 2160 rth->u.dst.flags= DST_HOST; 2161 if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) 2162 rth->u.dst.flags |= DST_NOPOLICY; 2163 rth->fl.fl4_dst = daddr; 2164 rth->rt_dst = daddr; 2165 rth->fl.fl4_tos = tos; 2166 rth->fl.mark = skb->mark; 2167 rth->fl.fl4_src = saddr; 2168 rth->rt_src = saddr; 2169 #ifdef CONFIG_NET_CLS_ROUTE 2170 rth->u.dst.tclassid = itag; 2171 #endif 2172 rth->rt_iif = 2173 rth->fl.iif = dev->ifindex; 2174 rth->u.dst.dev = net->loopback_dev; 2175 dev_hold(rth->u.dst.dev); 2176 rth->idev = in_dev_get(rth->u.dst.dev); 2177 rth->rt_gateway = daddr; 2178 rth->rt_spec_dst= 
spec_dst;
	rth->u.dst.input = ip_local_deliver;
	rth->rt_flags	= flags|RTCF_LOCAL;
	if (res.type == RTN_UNREACHABLE) {
		rth->u.dst.input = ip_error;
		rth->u.dst.error = -err;
		rth->rt_flags	&= ~RTCF_LOCAL;
	}
	rth->rt_type	= res.type;
	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
	err = rt_intern_hash(hash, rth, NULL, skb);
	goto done;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto done;

e_inval:
	err = -EINVAL;
	goto done;

e_nobufs:
	err = -ENOBUFS;
	goto done;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto e_inval;
}

int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		   u8 tos, struct net_device *dev)
{
	struct rtable *rth;
	unsigned hash;
	int iif = dev->ifindex;
	struct net *net;

	net = dev_net(dev);

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	rcu_read_lock();
	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->u.dst.rt_next)) {
		if (((rth->fl.fl4_dst ^ daddr) |
		     (rth->fl.fl4_src ^ saddr) |
		     (rth->fl.iif ^ iif) |
		     rth->fl.oif |
		     (rth->fl.fl4_tos ^ tos)) == 0 &&
		    rth->fl.mark == skb->mark &&
		    net_eq(dev_net(rth->u.dst.dev), net) &&
		    !rt_is_expired(rth)) {
			dst_use(&rth->u.dst, jiffies);
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			skb_dst_set(skb, &rth->u.dst);
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}
	rcu_read_unlock();

skip_cache:
	/* Multicast recognition logic was moved from the route cache to
	   here.  The problem is that too many Ethernet cards have broken
	   or missing hardware multicast filters :-( As a result, a host on
	   a multicast network accumulates a lot of useless route cache
	   entries, e.g. for SDR session announcements from all over the
	   world.  Now we try to get rid of them.  Provided the software IP
	   multicast filter is organized reasonably (at least hashed), this
	   is no slower than keeping reject entries in the route cache.
	   Note that multicast routers are not affected, because a route
	   cache entry is created for them eventually.
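	   The check below is that software filter: ip_check_mc() asks
	   whether the group has actually been joined on this device, and,
	   under CONFIG_IP_MROUTE, IN_DEV_MFORWARD() additionally accepts
	   non-link-local groups for the multicast router.  Only packets
	   that pass it reach ip_route_input_mc(); everything else is
	   dropped with -EINVAL before any cache entry can be created.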
2275 */ 2276 if (ipv4_is_multicast(daddr)) { 2277 struct in_device *in_dev; 2278 2279 rcu_read_lock(); 2280 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) { 2281 int our = ip_check_mc(in_dev, daddr, saddr, 2282 ip_hdr(skb)->protocol); 2283 if (our 2284 #ifdef CONFIG_IP_MROUTE 2285 || (!ipv4_is_local_multicast(daddr) && 2286 IN_DEV_MFORWARD(in_dev)) 2287 #endif 2288 ) { 2289 rcu_read_unlock(); 2290 return ip_route_input_mc(skb, daddr, saddr, 2291 tos, dev, our); 2292 } 2293 } 2294 rcu_read_unlock(); 2295 return -EINVAL; 2296 } 2297 return ip_route_input_slow(skb, daddr, saddr, tos, dev); 2298 } 2299 2300 static int __mkroute_output(struct rtable **result, 2301 struct fib_result *res, 2302 const struct flowi *fl, 2303 const struct flowi *oldflp, 2304 struct net_device *dev_out, 2305 unsigned flags) 2306 { 2307 struct rtable *rth; 2308 struct in_device *in_dev; 2309 u32 tos = RT_FL_TOS(oldflp); 2310 int err = 0; 2311 2312 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK)) 2313 return -EINVAL; 2314 2315 if (fl->fl4_dst == htonl(0xFFFFFFFF)) 2316 res->type = RTN_BROADCAST; 2317 else if (ipv4_is_multicast(fl->fl4_dst)) 2318 res->type = RTN_MULTICAST; 2319 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst)) 2320 return -EINVAL; 2321 2322 if (dev_out->flags & IFF_LOOPBACK) 2323 flags |= RTCF_LOCAL; 2324 2325 /* get work reference to inet device */ 2326 in_dev = in_dev_get(dev_out); 2327 if (!in_dev) 2328 return -EINVAL; 2329 2330 if (res->type == RTN_BROADCAST) { 2331 flags |= RTCF_BROADCAST | RTCF_LOCAL; 2332 if (res->fi) { 2333 fib_info_put(res->fi); 2334 res->fi = NULL; 2335 } 2336 } else if (res->type == RTN_MULTICAST) { 2337 flags |= RTCF_MULTICAST|RTCF_LOCAL; 2338 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, 2339 oldflp->proto)) 2340 flags &= ~RTCF_LOCAL; 2341 /* If multicast route do not exist use 2342 default one, but do not gateway in this case. 2343 Yes, it is hack. 2344 */ 2345 if (res->fi && res->prefixlen < 4) { 2346 fib_info_put(res->fi); 2347 res->fi = NULL; 2348 } 2349 } 2350 2351 2352 rth = dst_alloc(&ipv4_dst_ops); 2353 if (!rth) { 2354 err = -ENOBUFS; 2355 goto cleanup; 2356 } 2357 2358 atomic_set(&rth->u.dst.__refcnt, 1); 2359 rth->u.dst.flags= DST_HOST; 2360 if (IN_DEV_CONF_GET(in_dev, NOXFRM)) 2361 rth->u.dst.flags |= DST_NOXFRM; 2362 if (IN_DEV_CONF_GET(in_dev, NOPOLICY)) 2363 rth->u.dst.flags |= DST_NOPOLICY; 2364 2365 rth->fl.fl4_dst = oldflp->fl4_dst; 2366 rth->fl.fl4_tos = tos; 2367 rth->fl.fl4_src = oldflp->fl4_src; 2368 rth->fl.oif = oldflp->oif; 2369 rth->fl.mark = oldflp->mark; 2370 rth->rt_dst = fl->fl4_dst; 2371 rth->rt_src = fl->fl4_src; 2372 rth->rt_iif = oldflp->oif ? 
: dev_out->ifindex; 2373 /* get references to the devices that are to be hold by the routing 2374 cache entry */ 2375 rth->u.dst.dev = dev_out; 2376 dev_hold(dev_out); 2377 rth->idev = in_dev_get(dev_out); 2378 rth->rt_gateway = fl->fl4_dst; 2379 rth->rt_spec_dst= fl->fl4_src; 2380 2381 rth->u.dst.output=ip_output; 2382 rth->rt_genid = rt_genid(dev_net(dev_out)); 2383 2384 RT_CACHE_STAT_INC(out_slow_tot); 2385 2386 if (flags & RTCF_LOCAL) { 2387 rth->u.dst.input = ip_local_deliver; 2388 rth->rt_spec_dst = fl->fl4_dst; 2389 } 2390 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { 2391 rth->rt_spec_dst = fl->fl4_src; 2392 if (flags & RTCF_LOCAL && 2393 !(dev_out->flags & IFF_LOOPBACK)) { 2394 rth->u.dst.output = ip_mc_output; 2395 RT_CACHE_STAT_INC(out_slow_mc); 2396 } 2397 #ifdef CONFIG_IP_MROUTE 2398 if (res->type == RTN_MULTICAST) { 2399 if (IN_DEV_MFORWARD(in_dev) && 2400 !ipv4_is_local_multicast(oldflp->fl4_dst)) { 2401 rth->u.dst.input = ip_mr_input; 2402 rth->u.dst.output = ip_mc_output; 2403 } 2404 } 2405 #endif 2406 } 2407 2408 rt_set_nexthop(rth, res, 0); 2409 2410 rth->rt_flags = flags; 2411 2412 *result = rth; 2413 cleanup: 2414 /* release work reference to inet device */ 2415 in_dev_put(in_dev); 2416 2417 return err; 2418 } 2419 2420 static int ip_mkroute_output(struct rtable **rp, 2421 struct fib_result *res, 2422 const struct flowi *fl, 2423 const struct flowi *oldflp, 2424 struct net_device *dev_out, 2425 unsigned flags) 2426 { 2427 struct rtable *rth = NULL; 2428 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags); 2429 unsigned hash; 2430 if (err == 0) { 2431 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif, 2432 rt_genid(dev_net(dev_out))); 2433 err = rt_intern_hash(hash, rth, rp, NULL); 2434 } 2435 2436 return err; 2437 } 2438 2439 /* 2440 * Major route resolver routine. 2441 */ 2442 2443 static int ip_route_output_slow(struct net *net, struct rtable **rp, 2444 const struct flowi *oldflp) 2445 { 2446 u32 tos = RT_FL_TOS(oldflp); 2447 struct flowi fl = { .nl_u = { .ip4_u = 2448 { .daddr = oldflp->fl4_dst, 2449 .saddr = oldflp->fl4_src, 2450 .tos = tos & IPTOS_RT_MASK, 2451 .scope = ((tos & RTO_ONLINK) ? 2452 RT_SCOPE_LINK : 2453 RT_SCOPE_UNIVERSE), 2454 } }, 2455 .mark = oldflp->mark, 2456 .iif = net->loopback_dev->ifindex, 2457 .oif = oldflp->oif }; 2458 struct fib_result res; 2459 unsigned flags = 0; 2460 struct net_device *dev_out = NULL; 2461 int free_res = 0; 2462 int err; 2463 2464 2465 res.fi = NULL; 2466 #ifdef CONFIG_IP_MULTIPLE_TABLES 2467 res.r = NULL; 2468 #endif 2469 2470 if (oldflp->fl4_src) { 2471 err = -EINVAL; 2472 if (ipv4_is_multicast(oldflp->fl4_src) || 2473 ipv4_is_lbcast(oldflp->fl4_src) || 2474 ipv4_is_zeronet(oldflp->fl4_src)) 2475 goto out; 2476 2477 /* I removed check for oif == dev_out->oif here. 2478 It was wrong for two reasons: 2479 1. ip_dev_find(net, saddr) can return wrong iface, if saddr 2480 is assigned to multiple interfaces. 2481 2. Moreover, we are allowed to send packets with saddr 2482 of another iface. --ANK 2483 */ 2484 2485 if (oldflp->oif == 0 2486 && (ipv4_is_multicast(oldflp->fl4_dst) || 2487 oldflp->fl4_dst == htonl(0xFFFFFFFF))) { 2488 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2489 dev_out = ip_dev_find(net, oldflp->fl4_src); 2490 if (dev_out == NULL) 2491 goto out; 2492 2493 /* Special hack: user can direct multicasts 2494 and limited broadcast via necessary interface 2495 without fiddling with IP_MULTICAST_IF or IP_PKTINFO. 
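			   For example, a sender that wants its announcements
			   to leave via one particular NIC (say, eth1) can
			   simply bind to that NIC's address and leave oif
			   unset; ip_dev_find() below maps the source address
			   back to its device and fl.oif is forced to it.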
			   This hack is not just for fun; it is what allows
			   vic, vat and friends to work.
			   They bind their socket to loopback, set the TTL
			   to zero and expect that to work.
			   From the viewpoint of the routing cache they are
			   broken, because we are not allowed to build a
			   multicast path with a loopback source address
			   (the routing cache cannot know that the TTL is
			   zero, so that the packet will never leave this
			   host and the route is in fact valid).
			   Luckily, this hack is a good enough workaround.
			 */

			fl.oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = ip_dev_find(net, oldflp->fl4_src);
			if (dev_out == NULL)
				goto out;
			dev_put(dev_out);
			dev_out = NULL;
		}
	}


	if (oldflp->oif) {
		dev_out = dev_get_by_index(net, oldflp->oif);
		err = -ENODEV;
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (__in_dev_get_rtnl(dev_out) == NULL) {
			dev_put(dev_out);
			goto out;	/* Wrong error code */
		}

		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
		    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
			if (!fl.fl4_src)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl.fl4_src) {
			if (ipv4_is_multicast(oldflp->fl4_dst))
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      fl.fl4_scope);
			else if (!oldflp->fl4_dst)
				fl.fl4_src = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl.fl4_dst) {
		fl.fl4_dst = fl.fl4_src;
		if (!fl.fl4_dst)
			fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
		if (dev_out)
			dev_put(dev_out);
		dev_out = net->loopback_dev;
		dev_hold(dev_out);
		fl.oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, &fl, &res)) {
		res.fi = NULL;
		if (oldflp->oif) {
			/* Apparently, the routing tables are wrong.
			   Assume that the destination is on-link.

			   WHY? DW.
			   Because we are allowed to send to an interface
			   even if it has NO routes and NO assigned
			   addresses.  When oif is specified, the routing
			   tables are consulted for only one purpose: to
			   catch the case where the destination is gatewayed
			   rather than directly reachable.  Moreover, if
			   MSG_DONTROUTE is set, we send the packet ignoring
			   both the routing tables and the ifaddr state. --ANK


			   We could do this even when oif is unknown,
			   as IPv6 likely does, but we do not.
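			   Concretely: a socket tied to an interface with
			   SO_BINDTODEVICE, even one with no routes and no
			   address of its own, still gets an on-link
			   RTN_UNICAST route built below, with a link-scope
			   source picked by inet_select_addr().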
2585 */ 2586 2587 if (fl.fl4_src == 0) 2588 fl.fl4_src = inet_select_addr(dev_out, 0, 2589 RT_SCOPE_LINK); 2590 res.type = RTN_UNICAST; 2591 goto make_route; 2592 } 2593 if (dev_out) 2594 dev_put(dev_out); 2595 err = -ENETUNREACH; 2596 goto out; 2597 } 2598 free_res = 1; 2599 2600 if (res.type == RTN_LOCAL) { 2601 if (!fl.fl4_src) 2602 fl.fl4_src = fl.fl4_dst; 2603 if (dev_out) 2604 dev_put(dev_out); 2605 dev_out = net->loopback_dev; 2606 dev_hold(dev_out); 2607 fl.oif = dev_out->ifindex; 2608 if (res.fi) 2609 fib_info_put(res.fi); 2610 res.fi = NULL; 2611 flags |= RTCF_LOCAL; 2612 goto make_route; 2613 } 2614 2615 #ifdef CONFIG_IP_ROUTE_MULTIPATH 2616 if (res.fi->fib_nhs > 1 && fl.oif == 0) 2617 fib_select_multipath(&fl, &res); 2618 else 2619 #endif 2620 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif) 2621 fib_select_default(net, &fl, &res); 2622 2623 if (!fl.fl4_src) 2624 fl.fl4_src = FIB_RES_PREFSRC(res); 2625 2626 if (dev_out) 2627 dev_put(dev_out); 2628 dev_out = FIB_RES_DEV(res); 2629 dev_hold(dev_out); 2630 fl.oif = dev_out->ifindex; 2631 2632 2633 make_route: 2634 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); 2635 2636 2637 if (free_res) 2638 fib_res_put(&res); 2639 if (dev_out) 2640 dev_put(dev_out); 2641 out: return err; 2642 } 2643 2644 int __ip_route_output_key(struct net *net, struct rtable **rp, 2645 const struct flowi *flp) 2646 { 2647 unsigned hash; 2648 struct rtable *rth; 2649 2650 if (!rt_caching(net)) 2651 goto slow_output; 2652 2653 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net)); 2654 2655 rcu_read_lock_bh(); 2656 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; 2657 rth = rcu_dereference(rth->u.dst.rt_next)) { 2658 if (rth->fl.fl4_dst == flp->fl4_dst && 2659 rth->fl.fl4_src == flp->fl4_src && 2660 rth->fl.iif == 0 && 2661 rth->fl.oif == flp->oif && 2662 rth->fl.mark == flp->mark && 2663 !((rth->fl.fl4_tos ^ flp->fl4_tos) & 2664 (IPTOS_RT_MASK | RTO_ONLINK)) && 2665 net_eq(dev_net(rth->u.dst.dev), net) && 2666 !rt_is_expired(rth)) { 2667 dst_use(&rth->u.dst, jiffies); 2668 RT_CACHE_STAT_INC(out_hit); 2669 rcu_read_unlock_bh(); 2670 *rp = rth; 2671 return 0; 2672 } 2673 RT_CACHE_STAT_INC(out_hlist_search); 2674 } 2675 rcu_read_unlock_bh(); 2676 2677 slow_output: 2678 return ip_route_output_slow(net, rp, flp); 2679 } 2680 2681 EXPORT_SYMBOL_GPL(__ip_route_output_key); 2682 2683 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) 2684 { 2685 } 2686 2687 static struct dst_ops ipv4_dst_blackhole_ops = { 2688 .family = AF_INET, 2689 .protocol = cpu_to_be16(ETH_P_IP), 2690 .destroy = ipv4_dst_destroy, 2691 .check = ipv4_dst_check, 2692 .update_pmtu = ipv4_rt_blackhole_update_pmtu, 2693 .entries = ATOMIC_INIT(0), 2694 }; 2695 2696 2697 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp) 2698 { 2699 struct rtable *ort = *rp; 2700 struct rtable *rt = (struct rtable *) 2701 dst_alloc(&ipv4_dst_blackhole_ops); 2702 2703 if (rt) { 2704 struct dst_entry *new = &rt->u.dst; 2705 2706 atomic_set(&new->__refcnt, 1); 2707 new->__use = 1; 2708 new->input = dst_discard; 2709 new->output = dst_discard; 2710 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32)); 2711 2712 new->dev = ort->u.dst.dev; 2713 if (new->dev) 2714 dev_hold(new->dev); 2715 2716 rt->fl = ort->fl; 2717 2718 rt->idev = ort->idev; 2719 if (rt->idev) 2720 in_dev_hold(rt->idev); 2721 rt->rt_genid = rt_genid(net); 2722 rt->rt_flags = ort->rt_flags; 2723 rt->rt_type = ort->rt_type; 2724 rt->rt_dst = 
ort->rt_dst; 2725 rt->rt_src = ort->rt_src; 2726 rt->rt_iif = ort->rt_iif; 2727 rt->rt_gateway = ort->rt_gateway; 2728 rt->rt_spec_dst = ort->rt_spec_dst; 2729 rt->peer = ort->peer; 2730 if (rt->peer) 2731 atomic_inc(&rt->peer->refcnt); 2732 2733 dst_free(new); 2734 } 2735 2736 dst_release(&(*rp)->u.dst); 2737 *rp = rt; 2738 return (rt ? 0 : -ENOMEM); 2739 } 2740 2741 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp, 2742 struct sock *sk, int flags) 2743 { 2744 int err; 2745 2746 if ((err = __ip_route_output_key(net, rp, flp)) != 0) 2747 return err; 2748 2749 if (flp->proto) { 2750 if (!flp->fl4_src) 2751 flp->fl4_src = (*rp)->rt_src; 2752 if (!flp->fl4_dst) 2753 flp->fl4_dst = (*rp)->rt_dst; 2754 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk, 2755 flags ? XFRM_LOOKUP_WAIT : 0); 2756 if (err == -EREMOTE) 2757 err = ipv4_dst_blackhole(net, rp, flp); 2758 2759 return err; 2760 } 2761 2762 return 0; 2763 } 2764 2765 EXPORT_SYMBOL_GPL(ip_route_output_flow); 2766 2767 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp) 2768 { 2769 return ip_route_output_flow(net, rp, flp, NULL, 0); 2770 } 2771 2772 static int rt_fill_info(struct net *net, 2773 struct sk_buff *skb, u32 pid, u32 seq, int event, 2774 int nowait, unsigned int flags) 2775 { 2776 struct rtable *rt = skb_rtable(skb); 2777 struct rtmsg *r; 2778 struct nlmsghdr *nlh; 2779 long expires; 2780 u32 id = 0, ts = 0, tsage = 0, error; 2781 2782 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags); 2783 if (nlh == NULL) 2784 return -EMSGSIZE; 2785 2786 r = nlmsg_data(nlh); 2787 r->rtm_family = AF_INET; 2788 r->rtm_dst_len = 32; 2789 r->rtm_src_len = 0; 2790 r->rtm_tos = rt->fl.fl4_tos; 2791 r->rtm_table = RT_TABLE_MAIN; 2792 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN); 2793 r->rtm_type = rt->rt_type; 2794 r->rtm_scope = RT_SCOPE_UNIVERSE; 2795 r->rtm_protocol = RTPROT_UNSPEC; 2796 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; 2797 if (rt->rt_flags & RTCF_NOTIFY) 2798 r->rtm_flags |= RTM_F_NOTIFY; 2799 2800 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst); 2801 2802 if (rt->fl.fl4_src) { 2803 r->rtm_src_len = 32; 2804 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src); 2805 } 2806 if (rt->u.dst.dev) 2807 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex); 2808 #ifdef CONFIG_NET_CLS_ROUTE 2809 if (rt->u.dst.tclassid) 2810 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid); 2811 #endif 2812 if (rt->fl.iif) 2813 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst); 2814 else if (rt->rt_src != rt->fl.fl4_src) 2815 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src); 2816 2817 if (rt->rt_dst != rt->rt_gateway) 2818 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway); 2819 2820 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0) 2821 goto nla_put_failure; 2822 2823 error = rt->u.dst.error; 2824 expires = rt->u.dst.expires ? 
rt->u.dst.expires - jiffies : 0; 2825 if (rt->peer) { 2826 id = rt->peer->ip_id_count; 2827 if (rt->peer->tcp_ts_stamp) { 2828 ts = rt->peer->tcp_ts; 2829 tsage = get_seconds() - rt->peer->tcp_ts_stamp; 2830 } 2831 } 2832 2833 if (rt->fl.iif) { 2834 #ifdef CONFIG_IP_MROUTE 2835 __be32 dst = rt->rt_dst; 2836 2837 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && 2838 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { 2839 int err = ipmr_get_route(net, skb, r, nowait); 2840 if (err <= 0) { 2841 if (!nowait) { 2842 if (err == 0) 2843 return 0; 2844 goto nla_put_failure; 2845 } else { 2846 if (err == -EMSGSIZE) 2847 goto nla_put_failure; 2848 error = err; 2849 } 2850 } 2851 } else 2852 #endif 2853 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif); 2854 } 2855 2856 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage, 2857 expires, error) < 0) 2858 goto nla_put_failure; 2859 2860 return nlmsg_end(skb, nlh); 2861 2862 nla_put_failure: 2863 nlmsg_cancel(skb, nlh); 2864 return -EMSGSIZE; 2865 } 2866 2867 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) 2868 { 2869 struct net *net = sock_net(in_skb->sk); 2870 struct rtmsg *rtm; 2871 struct nlattr *tb[RTA_MAX+1]; 2872 struct rtable *rt = NULL; 2873 __be32 dst = 0; 2874 __be32 src = 0; 2875 u32 iif; 2876 int err; 2877 struct sk_buff *skb; 2878 2879 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy); 2880 if (err < 0) 2881 goto errout; 2882 2883 rtm = nlmsg_data(nlh); 2884 2885 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 2886 if (skb == NULL) { 2887 err = -ENOBUFS; 2888 goto errout; 2889 } 2890 2891 /* Reserve room for dummy headers, this skb can pass 2892 through good chunk of routing engine. 2893 */ 2894 skb_reset_mac_header(skb); 2895 skb_reset_network_header(skb); 2896 2897 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */ 2898 ip_hdr(skb)->protocol = IPPROTO_ICMP; 2899 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); 2900 2901 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0; 2902 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0; 2903 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; 2904 2905 if (iif) { 2906 struct net_device *dev; 2907 2908 dev = __dev_get_by_index(net, iif); 2909 if (dev == NULL) { 2910 err = -ENODEV; 2911 goto errout_free; 2912 } 2913 2914 skb->protocol = htons(ETH_P_IP); 2915 skb->dev = dev; 2916 local_bh_disable(); 2917 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); 2918 local_bh_enable(); 2919 2920 rt = skb_rtable(skb); 2921 if (err == 0 && rt->u.dst.error) 2922 err = -rt->u.dst.error; 2923 } else { 2924 struct flowi fl = { 2925 .nl_u = { 2926 .ip4_u = { 2927 .daddr = dst, 2928 .saddr = src, 2929 .tos = rtm->rtm_tos, 2930 }, 2931 }, 2932 .oif = tb[RTA_OIF] ? 
nla_get_u32(tb[RTA_OIF]) : 0, 2933 }; 2934 err = ip_route_output_key(net, &rt, &fl); 2935 } 2936 2937 if (err) 2938 goto errout_free; 2939 2940 skb_dst_set(skb, &rt->u.dst); 2941 if (rtm->rtm_flags & RTM_F_NOTIFY) 2942 rt->rt_flags |= RTCF_NOTIFY; 2943 2944 err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, 2945 RTM_NEWROUTE, 0, 0); 2946 if (err <= 0) 2947 goto errout_free; 2948 2949 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid); 2950 errout: 2951 return err; 2952 2953 errout_free: 2954 kfree_skb(skb); 2955 goto errout; 2956 } 2957 2958 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) 2959 { 2960 struct rtable *rt; 2961 int h, s_h; 2962 int idx, s_idx; 2963 struct net *net; 2964 2965 net = sock_net(skb->sk); 2966 2967 s_h = cb->args[0]; 2968 if (s_h < 0) 2969 s_h = 0; 2970 s_idx = idx = cb->args[1]; 2971 for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) { 2972 if (!rt_hash_table[h].chain) 2973 continue; 2974 rcu_read_lock_bh(); 2975 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt; 2976 rt = rcu_dereference(rt->u.dst.rt_next), idx++) { 2977 if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx) 2978 continue; 2979 if (rt_is_expired(rt)) 2980 continue; 2981 skb_dst_set(skb, dst_clone(&rt->u.dst)); 2982 if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid, 2983 cb->nlh->nlmsg_seq, RTM_NEWROUTE, 2984 1, NLM_F_MULTI) <= 0) { 2985 skb_dst_drop(skb); 2986 rcu_read_unlock_bh(); 2987 goto done; 2988 } 2989 skb_dst_drop(skb); 2990 } 2991 rcu_read_unlock_bh(); 2992 } 2993 2994 done: 2995 cb->args[0] = h; 2996 cb->args[1] = idx; 2997 return skb->len; 2998 } 2999 3000 void ip_rt_multicast_event(struct in_device *in_dev) 3001 { 3002 rt_cache_flush(dev_net(in_dev->dev), 0); 3003 } 3004 3005 #ifdef CONFIG_SYSCTL 3006 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, 3007 struct file *filp, void __user *buffer, 3008 size_t *lenp, loff_t *ppos) 3009 { 3010 if (write) { 3011 int flush_delay; 3012 ctl_table ctl; 3013 struct net *net; 3014 3015 memcpy(&ctl, __ctl, sizeof(ctl)); 3016 ctl.data = &flush_delay; 3017 proc_dointvec(&ctl, write, filp, buffer, lenp, ppos); 3018 3019 net = (struct net *)__ctl->extra1; 3020 rt_cache_flush(net, flush_delay); 3021 return 0; 3022 } 3023 3024 return -EINVAL; 3025 } 3026 3027 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, 3028 void __user *oldval, 3029 size_t __user *oldlenp, 3030 void __user *newval, 3031 size_t newlen) 3032 { 3033 int delay; 3034 struct net *net; 3035 if (newlen != sizeof(int)) 3036 return -EINVAL; 3037 if (get_user(delay, (int __user *)newval)) 3038 return -EFAULT; 3039 net = (struct net *)table->extra1; 3040 rt_cache_flush(net, delay); 3041 return 0; 3042 } 3043 3044 static void rt_secret_reschedule(int old) 3045 { 3046 struct net *net; 3047 int new = ip_rt_secret_interval; 3048 int diff = new - old; 3049 3050 if (!diff) 3051 return; 3052 3053 rtnl_lock(); 3054 for_each_net(net) { 3055 int deleted = del_timer_sync(&net->ipv4.rt_secret_timer); 3056 3057 if (!new) 3058 continue; 3059 3060 if (deleted) { 3061 long time = net->ipv4.rt_secret_timer.expires - jiffies; 3062 3063 if (time <= 0 || (time += diff) <= 0) 3064 time = 0; 3065 3066 net->ipv4.rt_secret_timer.expires = time; 3067 } else 3068 net->ipv4.rt_secret_timer.expires = new; 3069 3070 net->ipv4.rt_secret_timer.expires += jiffies; 3071 add_timer(&net->ipv4.rt_secret_timer); 3072 } 3073 rtnl_unlock(); 3074 } 3075 3076 static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write, 3077 struct file *filp, 
3078 void __user *buffer, size_t *lenp, 3079 loff_t *ppos) 3080 { 3081 int old = ip_rt_secret_interval; 3082 int ret = proc_dointvec_jiffies(ctl, write, filp, buffer, lenp, ppos); 3083 3084 rt_secret_reschedule(old); 3085 3086 return ret; 3087 } 3088 3089 static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table, 3090 void __user *oldval, 3091 size_t __user *oldlenp, 3092 void __user *newval, 3093 size_t newlen) 3094 { 3095 int old = ip_rt_secret_interval; 3096 int ret = sysctl_jiffies(table, oldval, oldlenp, newval, newlen); 3097 3098 rt_secret_reschedule(old); 3099 3100 return ret; 3101 } 3102 3103 static ctl_table ipv4_route_table[] = { 3104 { 3105 .ctl_name = NET_IPV4_ROUTE_GC_THRESH, 3106 .procname = "gc_thresh", 3107 .data = &ipv4_dst_ops.gc_thresh, 3108 .maxlen = sizeof(int), 3109 .mode = 0644, 3110 .proc_handler = proc_dointvec, 3111 }, 3112 { 3113 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE, 3114 .procname = "max_size", 3115 .data = &ip_rt_max_size, 3116 .maxlen = sizeof(int), 3117 .mode = 0644, 3118 .proc_handler = proc_dointvec, 3119 }, 3120 { 3121 /* Deprecated. Use gc_min_interval_ms */ 3122 3123 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL, 3124 .procname = "gc_min_interval", 3125 .data = &ip_rt_gc_min_interval, 3126 .maxlen = sizeof(int), 3127 .mode = 0644, 3128 .proc_handler = proc_dointvec_jiffies, 3129 .strategy = sysctl_jiffies, 3130 }, 3131 { 3132 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, 3133 .procname = "gc_min_interval_ms", 3134 .data = &ip_rt_gc_min_interval, 3135 .maxlen = sizeof(int), 3136 .mode = 0644, 3137 .proc_handler = proc_dointvec_ms_jiffies, 3138 .strategy = sysctl_ms_jiffies, 3139 }, 3140 { 3141 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT, 3142 .procname = "gc_timeout", 3143 .data = &ip_rt_gc_timeout, 3144 .maxlen = sizeof(int), 3145 .mode = 0644, 3146 .proc_handler = proc_dointvec_jiffies, 3147 .strategy = sysctl_jiffies, 3148 }, 3149 { 3150 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL, 3151 .procname = "gc_interval", 3152 .data = &ip_rt_gc_interval, 3153 .maxlen = sizeof(int), 3154 .mode = 0644, 3155 .proc_handler = proc_dointvec_jiffies, 3156 .strategy = sysctl_jiffies, 3157 }, 3158 { 3159 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD, 3160 .procname = "redirect_load", 3161 .data = &ip_rt_redirect_load, 3162 .maxlen = sizeof(int), 3163 .mode = 0644, 3164 .proc_handler = proc_dointvec, 3165 }, 3166 { 3167 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER, 3168 .procname = "redirect_number", 3169 .data = &ip_rt_redirect_number, 3170 .maxlen = sizeof(int), 3171 .mode = 0644, 3172 .proc_handler = proc_dointvec, 3173 }, 3174 { 3175 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE, 3176 .procname = "redirect_silence", 3177 .data = &ip_rt_redirect_silence, 3178 .maxlen = sizeof(int), 3179 .mode = 0644, 3180 .proc_handler = proc_dointvec, 3181 }, 3182 { 3183 .ctl_name = NET_IPV4_ROUTE_ERROR_COST, 3184 .procname = "error_cost", 3185 .data = &ip_rt_error_cost, 3186 .maxlen = sizeof(int), 3187 .mode = 0644, 3188 .proc_handler = proc_dointvec, 3189 }, 3190 { 3191 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST, 3192 .procname = "error_burst", 3193 .data = &ip_rt_error_burst, 3194 .maxlen = sizeof(int), 3195 .mode = 0644, 3196 .proc_handler = proc_dointvec, 3197 }, 3198 { 3199 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY, 3200 .procname = "gc_elasticity", 3201 .data = &ip_rt_gc_elasticity, 3202 .maxlen = sizeof(int), 3203 .mode = 0644, 3204 .proc_handler = proc_dointvec, 3205 }, 3206 { 3207 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES, 3208 .procname = "mtu_expires", 3209 .data = &ip_rt_mtu_expires, 3210 
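		/* interpreted as jiffies by proc_dointvec_jiffies below;
		   bounds how long a learned PMTU is honoured */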
.maxlen = sizeof(int), 3211 .mode = 0644, 3212 .proc_handler = proc_dointvec_jiffies, 3213 .strategy = sysctl_jiffies, 3214 }, 3215 { 3216 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU, 3217 .procname = "min_pmtu", 3218 .data = &ip_rt_min_pmtu, 3219 .maxlen = sizeof(int), 3220 .mode = 0644, 3221 .proc_handler = proc_dointvec, 3222 }, 3223 { 3224 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS, 3225 .procname = "min_adv_mss", 3226 .data = &ip_rt_min_advmss, 3227 .maxlen = sizeof(int), 3228 .mode = 0644, 3229 .proc_handler = proc_dointvec, 3230 }, 3231 { 3232 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL, 3233 .procname = "secret_interval", 3234 .data = &ip_rt_secret_interval, 3235 .maxlen = sizeof(int), 3236 .mode = 0644, 3237 .proc_handler = ipv4_sysctl_rt_secret_interval, 3238 .strategy = ipv4_sysctl_rt_secret_interval_strategy, 3239 }, 3240 { .ctl_name = 0 } 3241 }; 3242 3243 static struct ctl_table empty[1]; 3244 3245 static struct ctl_table ipv4_skeleton[] = 3246 { 3247 { .procname = "route", .ctl_name = NET_IPV4_ROUTE, 3248 .mode = 0555, .child = ipv4_route_table}, 3249 { .procname = "neigh", .ctl_name = NET_IPV4_NEIGH, 3250 .mode = 0555, .child = empty}, 3251 { } 3252 }; 3253 3254 static __net_initdata struct ctl_path ipv4_path[] = { 3255 { .procname = "net", .ctl_name = CTL_NET, }, 3256 { .procname = "ipv4", .ctl_name = NET_IPV4, }, 3257 { }, 3258 }; 3259 3260 static struct ctl_table ipv4_route_flush_table[] = { 3261 { 3262 .ctl_name = NET_IPV4_ROUTE_FLUSH, 3263 .procname = "flush", 3264 .maxlen = sizeof(int), 3265 .mode = 0200, 3266 .proc_handler = ipv4_sysctl_rtcache_flush, 3267 .strategy = ipv4_sysctl_rtcache_flush_strategy, 3268 }, 3269 { .ctl_name = 0 }, 3270 }; 3271 3272 static __net_initdata struct ctl_path ipv4_route_path[] = { 3273 { .procname = "net", .ctl_name = CTL_NET, }, 3274 { .procname = "ipv4", .ctl_name = NET_IPV4, }, 3275 { .procname = "route", .ctl_name = NET_IPV4_ROUTE, }, 3276 { }, 3277 }; 3278 3279 static __net_init int sysctl_route_net_init(struct net *net) 3280 { 3281 struct ctl_table *tbl; 3282 3283 tbl = ipv4_route_flush_table; 3284 if (net != &init_net) { 3285 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL); 3286 if (tbl == NULL) 3287 goto err_dup; 3288 } 3289 tbl[0].extra1 = net; 3290 3291 net->ipv4.route_hdr = 3292 register_net_sysctl_table(net, ipv4_route_path, tbl); 3293 if (net->ipv4.route_hdr == NULL) 3294 goto err_reg; 3295 return 0; 3296 3297 err_reg: 3298 if (tbl != ipv4_route_flush_table) 3299 kfree(tbl); 3300 err_dup: 3301 return -ENOMEM; 3302 } 3303 3304 static __net_exit void sysctl_route_net_exit(struct net *net) 3305 { 3306 struct ctl_table *tbl; 3307 3308 tbl = net->ipv4.route_hdr->ctl_table_arg; 3309 unregister_net_sysctl_table(net->ipv4.route_hdr); 3310 BUG_ON(tbl == ipv4_route_flush_table); 3311 kfree(tbl); 3312 } 3313 3314 static __net_initdata struct pernet_operations sysctl_route_ops = { 3315 .init = sysctl_route_net_init, 3316 .exit = sysctl_route_net_exit, 3317 }; 3318 #endif 3319 3320 3321 static __net_init int rt_secret_timer_init(struct net *net) 3322 { 3323 atomic_set(&net->ipv4.rt_genid, 3324 (int) ((num_physpages ^ (num_physpages>>8)) ^ 3325 (jiffies ^ (jiffies >> 7)))); 3326 3327 net->ipv4.rt_secret_timer.function = rt_secret_rebuild; 3328 net->ipv4.rt_secret_timer.data = (unsigned long)net; 3329 init_timer_deferrable(&net->ipv4.rt_secret_timer); 3330 3331 if (ip_rt_secret_interval) { 3332 net->ipv4.rt_secret_timer.expires = 3333 jiffies + net_random() % ip_rt_secret_interval + 3334 ip_rt_secret_interval; 3335 
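		/* first rebuild fires between one and two intervals from
		 * now, so the per-namespace secret timers do not all
		 * expire at the same moment */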
add_timer(&net->ipv4.rt_secret_timer); 3336 } 3337 return 0; 3338 } 3339 3340 static __net_exit void rt_secret_timer_exit(struct net *net) 3341 { 3342 del_timer_sync(&net->ipv4.rt_secret_timer); 3343 } 3344 3345 static __net_initdata struct pernet_operations rt_secret_timer_ops = { 3346 .init = rt_secret_timer_init, 3347 .exit = rt_secret_timer_exit, 3348 }; 3349 3350 3351 #ifdef CONFIG_NET_CLS_ROUTE 3352 struct ip_rt_acct *ip_rt_acct __read_mostly; 3353 #endif /* CONFIG_NET_CLS_ROUTE */ 3354 3355 static __initdata unsigned long rhash_entries; 3356 static int __init set_rhash_entries(char *str) 3357 { 3358 if (!str) 3359 return 0; 3360 rhash_entries = simple_strtoul(str, &str, 0); 3361 return 1; 3362 } 3363 __setup("rhash_entries=", set_rhash_entries); 3364 3365 int __init ip_rt_init(void) 3366 { 3367 int rc = 0; 3368 3369 #ifdef CONFIG_NET_CLS_ROUTE 3370 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); 3371 if (!ip_rt_acct) 3372 panic("IP: failed to allocate ip_rt_acct\n"); 3373 #endif 3374 3375 ipv4_dst_ops.kmem_cachep = 3376 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0, 3377 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 3378 3379 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; 3380 3381 rt_hash_table = (struct rt_hash_bucket *) 3382 alloc_large_system_hash("IP route cache", 3383 sizeof(struct rt_hash_bucket), 3384 rhash_entries, 3385 (num_physpages >= 128 * 1024) ? 3386 15 : 17, 3387 0, 3388 &rt_hash_log, 3389 &rt_hash_mask, 3390 rhash_entries ? 0 : 512 * 1024); 3391 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket)); 3392 rt_hash_lock_init(); 3393 3394 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); 3395 ip_rt_max_size = (rt_hash_mask + 1) * 16; 3396 3397 devinet_init(); 3398 ip_fib_init(); 3399 3400 /* All the timers, started at system startup tend 3401 to synchronize. Perturb it a bit. 3402 */ 3403 INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func); 3404 expires_ljiffies = jiffies; 3405 schedule_delayed_work(&expires_work, 3406 net_random() % ip_rt_gc_interval + ip_rt_gc_interval); 3407 3408 if (register_pernet_subsys(&rt_secret_timer_ops)) 3409 printk(KERN_ERR "Unable to setup rt_secret_timer\n"); 3410 3411 if (ip_rt_proc_init()) 3412 printk(KERN_ERR "Unable to create route proc files\n"); 3413 #ifdef CONFIG_XFRM 3414 xfrm_init(); 3415 xfrm4_init(); 3416 #endif 3417 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL); 3418 3419 #ifdef CONFIG_SYSCTL 3420 register_pernet_subsys(&sysctl_route_ops); 3421 #endif 3422 return rc; 3423 } 3424 3425 #ifdef CONFIG_SYSCTL 3426 /* 3427 * We really need to sanitize the damn ipv4 init order, then all 3428 * this nonsense will go away. 3429 */ 3430 void __init ip_static_sysctl_init(void) 3431 { 3432 register_sysctl_paths(ipv4_path, ipv4_skeleton); 3433 } 3434 #endif 3435 3436 EXPORT_SYMBOL(__ip_select_ident); 3437 EXPORT_SYMBOL(ip_route_input); 3438 EXPORT_SYMBOL(ip_route_output_key); 3439
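
/*
 * A minimal sketch of how a caller typically uses the output-route
 * interface exported above.  The function and variable names here are
 * purely illustrative (real callers are e.g. the UDP and TCP connect
 * paths); error handling beyond the lookup itself is omitted:
 *
 *	static int example_get_route(struct net *net, __be32 dst, __be32 src,
 *				     int oif, struct rtable **rtp)
 *	{
 *		struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst,
 *							 .saddr = src } },
 *				    .oif = oif };
 *
 *		return ip_route_output_key(net, rtp, &fl);
 *	}
 *
 * On success *rtp holds a referenced rtable: the caller transmits via
 * (*rtp)->u.dst and must eventually drop the reference, e.g. with
 * dst_release(&(*rtp)->u.dst) or the ip_rt_put() helper from net/route.h.
 * For the input direction, ip_route_input() above attaches the route to
 * the skb itself via skb_dst_set(), so freeing the skb releases it.
 */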