1 /* 2 * INET An implementation of the TCP/IP protocol suite for the LINUX 3 * operating system. INET is implemented using the BSD Socket 4 * interface as the means of communication with the user level. 5 * 6 * ROUTE - implementation of the IP router. 7 * 8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $ 9 * 10 * Authors: Ross Biro 11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> 12 * Alan Cox, <gw4pts@gw4pts.ampr.org> 13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi> 14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru> 15 * 16 * Fixes: 17 * Alan Cox : Verify area fixes. 18 * Alan Cox : cli() protects routing changes 19 * Rui Oliveira : ICMP routing table updates 20 * (rco@di.uminho.pt) Routing table insertion and update 21 * Linus Torvalds : Rewrote bits to be sensible 22 * Alan Cox : Added BSD route gw semantics 23 * Alan Cox : Super /proc >4K 24 * Alan Cox : MTU in route table 25 * Alan Cox : MSS actually. Also added the window 26 * clamper. 27 * Sam Lantinga : Fixed route matching in rt_del() 28 * Alan Cox : Routing cache support. 29 * Alan Cox : Removed compatibility cruft. 30 * Alan Cox : RTF_REJECT support. 31 * Alan Cox : TCP irtt support. 32 * Jonathan Naylor : Added Metric support. 33 * Miquel van Smoorenburg : BSD API fixes. 34 * Miquel van Smoorenburg : Metrics. 35 * Alan Cox : Use __u32 properly 36 * Alan Cox : Aligned routing errors more closely with BSD 37 * our system is still very different. 38 * Alan Cox : Faster /proc handling 39 * Alexey Kuznetsov : Massive rework to support tree based routing, 40 * routing caches and better behaviour. 41 * 42 * Olaf Erb : irtt wasn't being copied right. 43 * Bjorn Ekwall : Kerneld route support. 44 * Alan Cox : Multicast fixed (I hope) 45 * Pavel Krauz : Limited broadcast fixed 46 * Mike McLagan : Routing by source 47 * Alexey Kuznetsov : End of old history. Split to fib.c and 48 * route.c and rewritten from scratch. 49 * Andi Kleen : Load-limit warning messages. 50 * Vitaly E. Lavrov : Transparent proxy revived after year coma. 51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow. 52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow. 53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful. 54 * Marc Boucher : routing by fwmark 55 * Robert Olsson : Added rt_cache statistics 56 * Arnaldo C. Melo : Convert proc stuff to seq_file 57 * 58 * This program is free software; you can redistribute it and/or 59 * modify it under the terms of the GNU General Public License 60 * as published by the Free Software Foundation; either version 61 * 2 of the License, or (at your option) any later version. 
62 */ 63 64 #include <linux/config.h> 65 #include <linux/module.h> 66 #include <asm/uaccess.h> 67 #include <asm/system.h> 68 #include <linux/bitops.h> 69 #include <linux/types.h> 70 #include <linux/kernel.h> 71 #include <linux/sched.h> 72 #include <linux/mm.h> 73 #include <linux/string.h> 74 #include <linux/socket.h> 75 #include <linux/sockios.h> 76 #include <linux/errno.h> 77 #include <linux/in.h> 78 #include <linux/inet.h> 79 #include <linux/netdevice.h> 80 #include <linux/proc_fs.h> 81 #include <linux/init.h> 82 #include <linux/skbuff.h> 83 #include <linux/rtnetlink.h> 84 #include <linux/inetdevice.h> 85 #include <linux/igmp.h> 86 #include <linux/pkt_sched.h> 87 #include <linux/mroute.h> 88 #include <linux/netfilter_ipv4.h> 89 #include <linux/random.h> 90 #include <linux/jhash.h> 91 #include <linux/rcupdate.h> 92 #include <linux/times.h> 93 #include <net/protocol.h> 94 #include <net/ip.h> 95 #include <net/route.h> 96 #include <net/inetpeer.h> 97 #include <net/sock.h> 98 #include <net/ip_fib.h> 99 #include <net/arp.h> 100 #include <net/tcp.h> 101 #include <net/icmp.h> 102 #include <net/xfrm.h> 103 #include <net/ip_mp_alg.h> 104 #ifdef CONFIG_SYSCTL 105 #include <linux/sysctl.h> 106 #endif 107 108 #define RT_FL_TOS(oldflp) \ 109 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK))) 110 111 #define IP_MAX_MTU 0xFFF0 112 113 #define RT_GC_TIMEOUT (300*HZ) 114 115 static int ip_rt_min_delay = 2 * HZ; 116 static int ip_rt_max_delay = 10 * HZ; 117 static int ip_rt_max_size; 118 static int ip_rt_gc_timeout = RT_GC_TIMEOUT; 119 static int ip_rt_gc_interval = 60 * HZ; 120 static int ip_rt_gc_min_interval = HZ / 2; 121 static int ip_rt_redirect_number = 9; 122 static int ip_rt_redirect_load = HZ / 50; 123 static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1)); 124 static int ip_rt_error_cost = HZ; 125 static int ip_rt_error_burst = 5 * HZ; 126 static int ip_rt_gc_elasticity = 8; 127 static int ip_rt_mtu_expires = 10 * 60 * HZ; 128 static int ip_rt_min_pmtu = 512 + 20 + 20; 129 static int ip_rt_min_advmss = 256; 130 static int ip_rt_secret_interval = 10 * 60 * HZ; 131 static unsigned long rt_deadline; 132 133 #define RTprint(a...) printk(KERN_DEBUG a) 134 135 static struct timer_list rt_flush_timer; 136 static struct timer_list rt_periodic_timer; 137 static struct timer_list rt_secret_timer; 138 139 /* 140 * Interface to generic destination cache. 
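 *
 *	Roughly, these are the callbacks the generic dst cache uses for
 *	IPv4: rt_garbage_collect() when dst_alloc() finds the cache grown
 *	past gc_thresh, ipv4_dst_check() when a socket revalidates a cached
 *	route, ipv4_dst_ifdown() when the underlying device goes away, and
 *	ipv4_negative_advice() / ipv4_link_failure() / ip_rt_update_pmtu()
 *	when upper layers report problems on the path.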
141 */ 142 143 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie); 144 static void ipv4_dst_destroy(struct dst_entry *dst); 145 static void ipv4_dst_ifdown(struct dst_entry *dst, 146 struct net_device *dev, int how); 147 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); 148 static void ipv4_link_failure(struct sk_buff *skb); 149 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); 150 static int rt_garbage_collect(void); 151 152 153 static struct dst_ops ipv4_dst_ops = { 154 .family = AF_INET, 155 .protocol = __constant_htons(ETH_P_IP), 156 .gc = rt_garbage_collect, 157 .check = ipv4_dst_check, 158 .destroy = ipv4_dst_destroy, 159 .ifdown = ipv4_dst_ifdown, 160 .negative_advice = ipv4_negative_advice, 161 .link_failure = ipv4_link_failure, 162 .update_pmtu = ip_rt_update_pmtu, 163 .entry_size = sizeof(struct rtable), 164 }; 165 166 #define ECN_OR_COST(class) TC_PRIO_##class 167 168 __u8 ip_tos2prio[16] = { 169 TC_PRIO_BESTEFFORT, 170 ECN_OR_COST(FILLER), 171 TC_PRIO_BESTEFFORT, 172 ECN_OR_COST(BESTEFFORT), 173 TC_PRIO_BULK, 174 ECN_OR_COST(BULK), 175 TC_PRIO_BULK, 176 ECN_OR_COST(BULK), 177 TC_PRIO_INTERACTIVE, 178 ECN_OR_COST(INTERACTIVE), 179 TC_PRIO_INTERACTIVE, 180 ECN_OR_COST(INTERACTIVE), 181 TC_PRIO_INTERACTIVE_BULK, 182 ECN_OR_COST(INTERACTIVE_BULK), 183 TC_PRIO_INTERACTIVE_BULK, 184 ECN_OR_COST(INTERACTIVE_BULK) 185 }; 186 187 188 /* 189 * Route cache. 190 */ 191 192 /* The locking scheme is rather straight forward: 193 * 194 * 1) Read-Copy Update protects the buckets of the central route hash. 195 * 2) Only writers remove entries, and they hold the lock 196 * as they look at rtable reference counts. 197 * 3) Only readers acquire references to rtable entries, 198 * they do so with atomic increments and with the 199 * lock held. 200 */ 201 202 struct rt_hash_bucket { 203 struct rtable *chain; 204 spinlock_t lock; 205 } __attribute__((__aligned__(8))); 206 207 static struct rt_hash_bucket *rt_hash_table; 208 static unsigned rt_hash_mask; 209 static int rt_hash_log; 210 static unsigned int rt_hash_rnd; 211 212 struct rt_cache_stat *rt_cache_stat; 213 214 static int rt_intern_hash(unsigned hash, struct rtable *rth, 215 struct rtable **res); 216 217 static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos) 218 { 219 return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd) 220 & rt_hash_mask); 221 } 222 223 #ifdef CONFIG_PROC_FS 224 struct rt_cache_iter_state { 225 int bucket; 226 }; 227 228 static struct rtable *rt_cache_get_first(struct seq_file *seq) 229 { 230 struct rtable *r = NULL; 231 struct rt_cache_iter_state *st = seq->private; 232 233 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) { 234 rcu_read_lock_bh(); 235 r = rt_hash_table[st->bucket].chain; 236 if (r) 237 break; 238 rcu_read_unlock_bh(); 239 } 240 return r; 241 } 242 243 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r) 244 { 245 struct rt_cache_iter_state *st = rcu_dereference(seq->private); 246 247 r = r->u.rt_next; 248 while (!r) { 249 rcu_read_unlock_bh(); 250 if (--st->bucket < 0) 251 break; 252 rcu_read_lock_bh(); 253 r = rt_hash_table[st->bucket].chain; 254 } 255 return r; 256 } 257 258 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos) 259 { 260 struct rtable *r = rt_cache_get_first(seq); 261 262 if (r) 263 while (pos && (r = rt_cache_get_next(seq, r))) 264 --pos; 265 return pos ? 
NULL : r; 266 } 267 268 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos) 269 { 270 return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; 271 } 272 273 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos) 274 { 275 struct rtable *r = NULL; 276 277 if (v == SEQ_START_TOKEN) 278 r = rt_cache_get_first(seq); 279 else 280 r = rt_cache_get_next(seq, v); 281 ++*pos; 282 return r; 283 } 284 285 static void rt_cache_seq_stop(struct seq_file *seq, void *v) 286 { 287 if (v && v != SEQ_START_TOKEN) 288 rcu_read_unlock_bh(); 289 } 290 291 static int rt_cache_seq_show(struct seq_file *seq, void *v) 292 { 293 if (v == SEQ_START_TOKEN) 294 seq_printf(seq, "%-127s\n", 295 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t" 296 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t" 297 "HHUptod\tSpecDst"); 298 else { 299 struct rtable *r = v; 300 char temp[256]; 301 302 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t" 303 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X", 304 r->u.dst.dev ? r->u.dst.dev->name : "*", 305 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway, 306 r->rt_flags, atomic_read(&r->u.dst.__refcnt), 307 r->u.dst.__use, 0, (unsigned long)r->rt_src, 308 (dst_metric(&r->u.dst, RTAX_ADVMSS) ? 309 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0), 310 dst_metric(&r->u.dst, RTAX_WINDOW), 311 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) + 312 dst_metric(&r->u.dst, RTAX_RTTVAR)), 313 r->fl.fl4_tos, 314 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1, 315 r->u.dst.hh ? (r->u.dst.hh->hh_output == 316 dev_queue_xmit) : 0, 317 r->rt_spec_dst); 318 seq_printf(seq, "%-127s\n", temp); 319 } 320 return 0; 321 } 322 323 static struct seq_operations rt_cache_seq_ops = { 324 .start = rt_cache_seq_start, 325 .next = rt_cache_seq_next, 326 .stop = rt_cache_seq_stop, 327 .show = rt_cache_seq_show, 328 }; 329 330 static int rt_cache_seq_open(struct inode *inode, struct file *file) 331 { 332 struct seq_file *seq; 333 int rc = -ENOMEM; 334 struct rt_cache_iter_state *s = kmalloc(sizeof(*s), GFP_KERNEL); 335 336 if (!s) 337 goto out; 338 rc = seq_open(file, &rt_cache_seq_ops); 339 if (rc) 340 goto out_kfree; 341 seq = file->private_data; 342 seq->private = s; 343 memset(s, 0, sizeof(*s)); 344 out: 345 return rc; 346 out_kfree: 347 kfree(s); 348 goto out; 349 } 350 351 static struct file_operations rt_cache_seq_fops = { 352 .owner = THIS_MODULE, 353 .open = rt_cache_seq_open, 354 .read = seq_read, 355 .llseek = seq_lseek, 356 .release = seq_release_private, 357 }; 358 359 360 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos) 361 { 362 int cpu; 363 364 if (*pos == 0) 365 return SEQ_START_TOKEN; 366 367 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) { 368 if (!cpu_possible(cpu)) 369 continue; 370 *pos = cpu+1; 371 return per_cpu_ptr(rt_cache_stat, cpu); 372 } 373 return NULL; 374 } 375 376 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) 377 { 378 int cpu; 379 380 for (cpu = *pos; cpu < NR_CPUS; ++cpu) { 381 if (!cpu_possible(cpu)) 382 continue; 383 *pos = cpu+1; 384 return per_cpu_ptr(rt_cache_stat, cpu); 385 } 386 return NULL; 387 388 } 389 390 static void rt_cpu_seq_stop(struct seq_file *seq, void *v) 391 { 392 393 } 394 395 static int rt_cpu_seq_show(struct seq_file *seq, void *v) 396 { 397 struct rt_cache_stat *st = v; 398 399 if (v == SEQ_START_TOKEN) { 400 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored 
gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n"); 401 return 0; 402 } 403 404 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x " 405 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n", 406 atomic_read(&ipv4_dst_ops.entries), 407 st->in_hit, 408 st->in_slow_tot, 409 st->in_slow_mc, 410 st->in_no_route, 411 st->in_brd, 412 st->in_martian_dst, 413 st->in_martian_src, 414 415 st->out_hit, 416 st->out_slow_tot, 417 st->out_slow_mc, 418 419 st->gc_total, 420 st->gc_ignored, 421 st->gc_goal_miss, 422 st->gc_dst_overflow, 423 st->in_hlist_search, 424 st->out_hlist_search 425 ); 426 return 0; 427 } 428 429 static struct seq_operations rt_cpu_seq_ops = { 430 .start = rt_cpu_seq_start, 431 .next = rt_cpu_seq_next, 432 .stop = rt_cpu_seq_stop, 433 .show = rt_cpu_seq_show, 434 }; 435 436 437 static int rt_cpu_seq_open(struct inode *inode, struct file *file) 438 { 439 return seq_open(file, &rt_cpu_seq_ops); 440 } 441 442 static struct file_operations rt_cpu_seq_fops = { 443 .owner = THIS_MODULE, 444 .open = rt_cpu_seq_open, 445 .read = seq_read, 446 .llseek = seq_lseek, 447 .release = seq_release, 448 }; 449 450 #endif /* CONFIG_PROC_FS */ 451 452 static __inline__ void rt_free(struct rtable *rt) 453 { 454 multipath_remove(rt); 455 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); 456 } 457 458 static __inline__ void rt_drop(struct rtable *rt) 459 { 460 multipath_remove(rt); 461 ip_rt_put(rt); 462 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free); 463 } 464 465 static __inline__ int rt_fast_clean(struct rtable *rth) 466 { 467 /* Kill broadcast/multicast entries very aggresively, if they 468 collide in hash table with more useful entries */ 469 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) && 470 rth->fl.iif && rth->u.rt_next; 471 } 472 473 static __inline__ int rt_valuable(struct rtable *rth) 474 { 475 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) || 476 rth->u.dst.expires; 477 } 478 479 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2) 480 { 481 unsigned long age; 482 int ret = 0; 483 484 if (atomic_read(&rth->u.dst.__refcnt)) 485 goto out; 486 487 ret = 1; 488 if (rth->u.dst.expires && 489 time_after_eq(jiffies, rth->u.dst.expires)) 490 goto out; 491 492 age = jiffies - rth->u.dst.lastuse; 493 ret = 0; 494 if ((age <= tmo1 && !rt_fast_clean(rth)) || 495 (age <= tmo2 && rt_valuable(rth))) 496 goto out; 497 ret = 1; 498 out: return ret; 499 } 500 501 /* Bits of score are: 502 * 31: very valuable 503 * 30: not quite useless 504 * 29..0: usage counter 505 */ 506 static inline u32 rt_score(struct rtable *rt) 507 { 508 u32 score = jiffies - rt->u.dst.lastuse; 509 510 score = ~score & ~(3<<30); 511 512 if (rt_valuable(rt)) 513 score |= (1<<31); 514 515 if (!rt->fl.iif || 516 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL))) 517 score |= (1<<30); 518 519 return score; 520 } 521 522 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) 523 { 524 return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 && 525 fl1->oif == fl2->oif && 526 fl1->iif == fl2->iif; 527 } 528 529 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED 530 static struct rtable **rt_remove_balanced_route(struct rtable **chain_head, 531 struct rtable *expentry, 532 int *removed_count) 533 { 534 int passedexpired = 0; 535 struct rtable **nextstep = NULL; 536 struct rtable **rthp = chain_head; 537 struct rtable *rth; 538 539 if (removed_count) 540 *removed_count = 0; 541 542 while ((rth = *rthp) != NULL) { 543 if (rth == expentry) 
544 passedexpired = 1; 545 546 if (((*rthp)->u.dst.flags & DST_BALANCED) != 0 && 547 compare_keys(&(*rthp)->fl, &expentry->fl)) { 548 if (*rthp == expentry) { 549 *rthp = rth->u.rt_next; 550 continue; 551 } else { 552 *rthp = rth->u.rt_next; 553 rt_free(rth); 554 if (removed_count) 555 ++(*removed_count); 556 } 557 } else { 558 if (!((*rthp)->u.dst.flags & DST_BALANCED) && 559 passedexpired && !nextstep) 560 nextstep = &rth->u.rt_next; 561 562 rthp = &rth->u.rt_next; 563 } 564 } 565 566 rt_free(expentry); 567 if (removed_count) 568 ++(*removed_count); 569 570 return nextstep; 571 } 572 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ 573 574 575 /* This runs via a timer and thus is always in BH context. */ 576 static void rt_check_expire(unsigned long dummy) 577 { 578 static int rover; 579 int i = rover, t; 580 struct rtable *rth, **rthp; 581 unsigned long now = jiffies; 582 583 for (t = ip_rt_gc_interval << rt_hash_log; t >= 0; 584 t -= ip_rt_gc_timeout) { 585 unsigned long tmo = ip_rt_gc_timeout; 586 587 i = (i + 1) & rt_hash_mask; 588 rthp = &rt_hash_table[i].chain; 589 590 spin_lock(&rt_hash_table[i].lock); 591 while ((rth = *rthp) != NULL) { 592 if (rth->u.dst.expires) { 593 /* Entry is expired even if it is in use */ 594 if (time_before_eq(now, rth->u.dst.expires)) { 595 tmo >>= 1; 596 rthp = &rth->u.rt_next; 597 continue; 598 } 599 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { 600 tmo >>= 1; 601 rthp = &rth->u.rt_next; 602 continue; 603 } 604 605 /* Cleanup aged off entries. */ 606 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED 607 /* remove all related balanced entries if necessary */ 608 if (rth->u.dst.flags & DST_BALANCED) { 609 rthp = rt_remove_balanced_route( 610 &rt_hash_table[i].chain, 611 rth, NULL); 612 if (!rthp) 613 break; 614 } else { 615 *rthp = rth->u.rt_next; 616 rt_free(rth); 617 } 618 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ 619 *rthp = rth->u.rt_next; 620 rt_free(rth); 621 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ 622 } 623 spin_unlock(&rt_hash_table[i].lock); 624 625 /* Fallback loop breaker. */ 626 if (time_after(jiffies, now)) 627 break; 628 } 629 rover = i; 630 mod_timer(&rt_periodic_timer, now + ip_rt_gc_interval); 631 } 632 633 /* This can run from both BH and non-BH contexts, the latter 634 * in the case of a forced flush event. 635 */ 636 static void rt_run_flush(unsigned long dummy) 637 { 638 int i; 639 struct rtable *rth, *next; 640 641 rt_deadline = 0; 642 643 get_random_bytes(&rt_hash_rnd, 4); 644 645 for (i = rt_hash_mask; i >= 0; i--) { 646 spin_lock_bh(&rt_hash_table[i].lock); 647 rth = rt_hash_table[i].chain; 648 if (rth) 649 rt_hash_table[i].chain = NULL; 650 spin_unlock_bh(&rt_hash_table[i].lock); 651 652 for (; rth; rth = next) { 653 next = rth->u.rt_next; 654 rt_free(rth); 655 } 656 } 657 } 658 659 static DEFINE_SPINLOCK(rt_flush_lock); 660 661 void rt_cache_flush(int delay) 662 { 663 unsigned long now = jiffies; 664 int user_mode = !in_softirq(); 665 666 if (delay < 0) 667 delay = ip_rt_min_delay; 668 669 /* flush existing multipath state*/ 670 multipath_flush(); 671 672 spin_lock_bh(&rt_flush_lock); 673 674 if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) { 675 long tmo = (long)(rt_deadline - now); 676 677 /* If flush timer is already running 678 and flush request is not immediate (delay > 0): 679 680 if deadline is not achieved, prolongate timer to "delay", 681 otherwise fire it at deadline time. 
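
	   For illustration with the defaults above (ip_rt_min_delay == 2*HZ,
	   ip_rt_max_delay == 10*HZ): the first delayed request arms the timer
	   and records a deadline ten seconds out; later delayed requests may
	   move the timer, but never beyond that deadline.  A request arriving
	   from process context when less than eight seconds (max minus min)
	   remain to the deadline collapses to an immediate flush.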
682 */ 683 684 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay) 685 tmo = 0; 686 687 if (delay > tmo) 688 delay = tmo; 689 } 690 691 if (delay <= 0) { 692 spin_unlock_bh(&rt_flush_lock); 693 rt_run_flush(0); 694 return; 695 } 696 697 if (rt_deadline == 0) 698 rt_deadline = now + ip_rt_max_delay; 699 700 mod_timer(&rt_flush_timer, now+delay); 701 spin_unlock_bh(&rt_flush_lock); 702 } 703 704 static void rt_secret_rebuild(unsigned long dummy) 705 { 706 unsigned long now = jiffies; 707 708 rt_cache_flush(0); 709 mod_timer(&rt_secret_timer, now + ip_rt_secret_interval); 710 } 711 712 /* 713 Short description of GC goals. 714 715 We want to build algorithm, which will keep routing cache 716 at some equilibrium point, when number of aged off entries 717 is kept approximately equal to newly generated ones. 718 719 Current expiration strength is variable "expire". 720 We try to adjust it dynamically, so that if networking 721 is idle expires is large enough to keep enough of warm entries, 722 and when load increases it reduces to limit cache size. 723 */ 724 725 static int rt_garbage_collect(void) 726 { 727 static unsigned long expire = RT_GC_TIMEOUT; 728 static unsigned long last_gc; 729 static int rover; 730 static int equilibrium; 731 struct rtable *rth, **rthp; 732 unsigned long now = jiffies; 733 int goal; 734 735 /* 736 * Garbage collection is pretty expensive, 737 * do not make it too frequently. 738 */ 739 740 RT_CACHE_STAT_INC(gc_total); 741 742 if (now - last_gc < ip_rt_gc_min_interval && 743 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) { 744 RT_CACHE_STAT_INC(gc_ignored); 745 goto out; 746 } 747 748 /* Calculate number of entries, which we want to expire now. */ 749 goal = atomic_read(&ipv4_dst_ops.entries) - 750 (ip_rt_gc_elasticity << rt_hash_log); 751 if (goal <= 0) { 752 if (equilibrium < ipv4_dst_ops.gc_thresh) 753 equilibrium = ipv4_dst_ops.gc_thresh; 754 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; 755 if (goal > 0) { 756 equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1); 757 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; 758 } 759 } else { 760 /* We are in dangerous area. Try to reduce cache really 761 * aggressively. 
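		 *
		 * A rough worked example, assuming the default
		 * ip_rt_gc_elasticity of 8 and, say, a 65536 bucket hash
		 * (rt_hash_log == 16): the soft limit above is
		 * 8 << 16 == 524288 entries.  With 600000 cached routes the
		 * initial goal is 75712; it is then raised to
		 * max(75712 / 2, rt_hash_mask + 1) == 65536, roughly one
		 * eviction per bucket, and equilibrium is set to
		 * 600000 - 65536 == 534464.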
762 */ 763 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1); 764 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal; 765 } 766 767 if (now - last_gc >= ip_rt_gc_min_interval) 768 last_gc = now; 769 770 if (goal <= 0) { 771 equilibrium += goal; 772 goto work_done; 773 } 774 775 do { 776 int i, k; 777 778 for (i = rt_hash_mask, k = rover; i >= 0; i--) { 779 unsigned long tmo = expire; 780 781 k = (k + 1) & rt_hash_mask; 782 rthp = &rt_hash_table[k].chain; 783 spin_lock_bh(&rt_hash_table[k].lock); 784 while ((rth = *rthp) != NULL) { 785 if (!rt_may_expire(rth, tmo, expire)) { 786 tmo >>= 1; 787 rthp = &rth->u.rt_next; 788 continue; 789 } 790 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED 791 /* remove all related balanced entries 792 * if necessary 793 */ 794 if (rth->u.dst.flags & DST_BALANCED) { 795 int r; 796 797 rthp = rt_remove_balanced_route( 798 &rt_hash_table[i].chain, 799 rth, 800 &r); 801 goal -= r; 802 if (!rthp) 803 break; 804 } else { 805 *rthp = rth->u.rt_next; 806 rt_free(rth); 807 goal--; 808 } 809 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ 810 *rthp = rth->u.rt_next; 811 rt_free(rth); 812 goal--; 813 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ 814 } 815 spin_unlock_bh(&rt_hash_table[k].lock); 816 if (goal <= 0) 817 break; 818 } 819 rover = k; 820 821 if (goal <= 0) 822 goto work_done; 823 824 /* Goal is not achieved. We stop process if: 825 826 - if expire reduced to zero. Otherwise, expire is halfed. 827 - if table is not full. 828 - if we are called from interrupt. 829 - jiffies check is just fallback/debug loop breaker. 830 We will not spin here for long time in any case. 831 */ 832 833 RT_CACHE_STAT_INC(gc_goal_miss); 834 835 if (expire == 0) 836 break; 837 838 expire >>= 1; 839 #if RT_CACHE_DEBUG >= 2 840 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire, 841 atomic_read(&ipv4_dst_ops.entries), goal, i); 842 #endif 843 844 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) 845 goto out; 846 } while (!in_softirq() && time_before_eq(jiffies, now)); 847 848 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) 849 goto out; 850 if (net_ratelimit()) 851 printk(KERN_WARNING "dst cache overflow\n"); 852 RT_CACHE_STAT_INC(gc_dst_overflow); 853 return 1; 854 855 work_done: 856 expire += ip_rt_gc_min_interval; 857 if (expire > ip_rt_gc_timeout || 858 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh) 859 expire = ip_rt_gc_timeout; 860 #if RT_CACHE_DEBUG >= 2 861 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire, 862 atomic_read(&ipv4_dst_ops.entries), goal, rover); 863 #endif 864 out: return 0; 865 } 866 867 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp) 868 { 869 struct rtable *rth, **rthp; 870 unsigned long now; 871 struct rtable *cand, **candp; 872 u32 min_score; 873 int chain_length; 874 int attempts = !in_softirq(); 875 876 restart: 877 chain_length = 0; 878 min_score = ~(u32)0; 879 cand = NULL; 880 candp = NULL; 881 now = jiffies; 882 883 rthp = &rt_hash_table[hash].chain; 884 885 spin_lock_bh(&rt_hash_table[hash].lock); 886 while ((rth = *rthp) != NULL) { 887 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED 888 if (!(rth->u.dst.flags & DST_BALANCED) && 889 compare_keys(&rth->fl, &rt->fl)) { 890 #else 891 if (compare_keys(&rth->fl, &rt->fl)) { 892 #endif 893 /* Put it first */ 894 *rthp = rth->u.rt_next; 895 /* 896 * Since lookup is lockfree, the deletion 897 * must be visible to another weakly ordered CPU before 898 * the insertion at the start of the hash chain. 
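			 *
			 * rcu_assign_pointer() below supplies the write barrier
			 * that gives this ordering; here the per-bucket spinlock
			 * is taken only by writers.  A lockless reader, in
			 * outline, does no more than
			 *
			 *	rcu_read_lock();
			 *	for (r = rcu_dereference(rt_hash_table[hash].chain);
			 *	     r; r = rcu_dereference(r->u.rt_next))
			 *		... compare keys ...
			 *	rcu_read_unlock();
			 *
			 * as ip_route_input() does, so every pointer it follows
			 * must already point at a fully published entry.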
899 */ 900 rcu_assign_pointer(rth->u.rt_next, 901 rt_hash_table[hash].chain); 902 /* 903 * Since lookup is lockfree, the update writes 904 * must be ordered for consistency on SMP. 905 */ 906 rcu_assign_pointer(rt_hash_table[hash].chain, rth); 907 908 rth->u.dst.__use++; 909 dst_hold(&rth->u.dst); 910 rth->u.dst.lastuse = now; 911 spin_unlock_bh(&rt_hash_table[hash].lock); 912 913 rt_drop(rt); 914 *rp = rth; 915 return 0; 916 } 917 918 if (!atomic_read(&rth->u.dst.__refcnt)) { 919 u32 score = rt_score(rth); 920 921 if (score <= min_score) { 922 cand = rth; 923 candp = rthp; 924 min_score = score; 925 } 926 } 927 928 chain_length++; 929 930 rthp = &rth->u.rt_next; 931 } 932 933 if (cand) { 934 /* ip_rt_gc_elasticity used to be average length of chain 935 * length, when exceeded gc becomes really aggressive. 936 * 937 * The second limit is less certain. At the moment it allows 938 * only 2 entries per bucket. We will see. 939 */ 940 if (chain_length > ip_rt_gc_elasticity) { 941 *candp = cand->u.rt_next; 942 rt_free(cand); 943 } 944 } 945 946 /* Try to bind route to arp only if it is output 947 route or unicast forwarding path. 948 */ 949 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { 950 int err = arp_bind_neighbour(&rt->u.dst); 951 if (err) { 952 spin_unlock_bh(&rt_hash_table[hash].lock); 953 954 if (err != -ENOBUFS) { 955 rt_drop(rt); 956 return err; 957 } 958 959 /* Neighbour tables are full and nothing 960 can be released. Try to shrink route cache, 961 it is most likely it holds some neighbour records. 962 */ 963 if (attempts-- > 0) { 964 int saved_elasticity = ip_rt_gc_elasticity; 965 int saved_int = ip_rt_gc_min_interval; 966 ip_rt_gc_elasticity = 1; 967 ip_rt_gc_min_interval = 0; 968 rt_garbage_collect(); 969 ip_rt_gc_min_interval = saved_int; 970 ip_rt_gc_elasticity = saved_elasticity; 971 goto restart; 972 } 973 974 if (net_ratelimit()) 975 printk(KERN_WARNING "Neighbour table overflow.\n"); 976 rt_drop(rt); 977 return -ENOBUFS; 978 } 979 } 980 981 rt->u.rt_next = rt_hash_table[hash].chain; 982 #if RT_CACHE_DEBUG >= 2 983 if (rt->u.rt_next) { 984 struct rtable *trt; 985 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash, 986 NIPQUAD(rt->rt_dst)); 987 for (trt = rt->u.rt_next; trt; trt = trt->u.rt_next) 988 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst)); 989 printk("\n"); 990 } 991 #endif 992 rt_hash_table[hash].chain = rt; 993 spin_unlock_bh(&rt_hash_table[hash].lock); 994 *rp = rt; 995 return 0; 996 } 997 998 void rt_bind_peer(struct rtable *rt, int create) 999 { 1000 static DEFINE_SPINLOCK(rt_peer_lock); 1001 struct inet_peer *peer; 1002 1003 peer = inet_getpeer(rt->rt_dst, create); 1004 1005 spin_lock_bh(&rt_peer_lock); 1006 if (rt->peer == NULL) { 1007 rt->peer = peer; 1008 peer = NULL; 1009 } 1010 spin_unlock_bh(&rt_peer_lock); 1011 if (peer) 1012 inet_putpeer(peer); 1013 } 1014 1015 /* 1016 * Peer allocation may fail only in serious out-of-memory conditions. However 1017 * we still can generate some output. 1018 * Random ID selection looks a bit dangerous because we have no chances to 1019 * select ID being unique in a reasonable period of time. 1020 * But broken packet identifier may be better than no packet at all. 
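 *
 * In the common case __ip_select_ident() below takes the ID from the
 * inet_peer entry bound to the route, which behaves essentially as a
 * per-destination counter.  Only when no route is attached or the peer
 * cannot be allocated do we fall back to
 *
 *	salt = secure_ip_id(ip_fallback_id ^ iph->daddr);
 *	iph->id = htons(salt & 0xFFFF);
 *
 * which makes IDs hard to predict but, as noted, not provably unique.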
1021 */ 1022 static void ip_select_fb_ident(struct iphdr *iph) 1023 { 1024 static DEFINE_SPINLOCK(ip_fb_id_lock); 1025 static u32 ip_fallback_id; 1026 u32 salt; 1027 1028 spin_lock_bh(&ip_fb_id_lock); 1029 salt = secure_ip_id(ip_fallback_id ^ iph->daddr); 1030 iph->id = htons(salt & 0xFFFF); 1031 ip_fallback_id = salt; 1032 spin_unlock_bh(&ip_fb_id_lock); 1033 } 1034 1035 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more) 1036 { 1037 struct rtable *rt = (struct rtable *) dst; 1038 1039 if (rt) { 1040 if (rt->peer == NULL) 1041 rt_bind_peer(rt, 1); 1042 1043 /* If peer is attached to destination, it is never detached, 1044 so that we need not to grab a lock to dereference it. 1045 */ 1046 if (rt->peer) { 1047 iph->id = htons(inet_getid(rt->peer, more)); 1048 return; 1049 } 1050 } else 1051 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n", 1052 __builtin_return_address(0)); 1053 1054 ip_select_fb_ident(iph); 1055 } 1056 1057 static void rt_del(unsigned hash, struct rtable *rt) 1058 { 1059 struct rtable **rthp; 1060 1061 spin_lock_bh(&rt_hash_table[hash].lock); 1062 ip_rt_put(rt); 1063 for (rthp = &rt_hash_table[hash].chain; *rthp; 1064 rthp = &(*rthp)->u.rt_next) 1065 if (*rthp == rt) { 1066 *rthp = rt->u.rt_next; 1067 rt_free(rt); 1068 break; 1069 } 1070 spin_unlock_bh(&rt_hash_table[hash].lock); 1071 } 1072 1073 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw, 1074 u32 saddr, u8 tos, struct net_device *dev) 1075 { 1076 int i, k; 1077 struct in_device *in_dev = in_dev_get(dev); 1078 struct rtable *rth, **rthp; 1079 u32 skeys[2] = { saddr, 0 }; 1080 int ikeys[2] = { dev->ifindex, 0 }; 1081 1082 tos &= IPTOS_RT_MASK; 1083 1084 if (!in_dev) 1085 return; 1086 1087 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) 1088 || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw)) 1089 goto reject_redirect; 1090 1091 if (!IN_DEV_SHARED_MEDIA(in_dev)) { 1092 if (!inet_addr_onlink(in_dev, new_gw, old_gw)) 1093 goto reject_redirect; 1094 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev)) 1095 goto reject_redirect; 1096 } else { 1097 if (inet_addr_type(new_gw) != RTN_UNICAST) 1098 goto reject_redirect; 1099 } 1100 1101 for (i = 0; i < 2; i++) { 1102 for (k = 0; k < 2; k++) { 1103 unsigned hash = rt_hash_code(daddr, 1104 skeys[i] ^ (ikeys[k] << 5), 1105 tos); 1106 1107 rthp=&rt_hash_table[hash].chain; 1108 1109 rcu_read_lock(); 1110 while ((rth = rcu_dereference(*rthp)) != NULL) { 1111 struct rtable *rt; 1112 1113 if (rth->fl.fl4_dst != daddr || 1114 rth->fl.fl4_src != skeys[i] || 1115 rth->fl.fl4_tos != tos || 1116 rth->fl.oif != ikeys[k] || 1117 rth->fl.iif != 0) { 1118 rthp = &rth->u.rt_next; 1119 continue; 1120 } 1121 1122 if (rth->rt_dst != daddr || 1123 rth->rt_src != saddr || 1124 rth->u.dst.error || 1125 rth->rt_gateway != old_gw || 1126 rth->u.dst.dev != dev) 1127 break; 1128 1129 dst_hold(&rth->u.dst); 1130 rcu_read_unlock(); 1131 1132 rt = dst_alloc(&ipv4_dst_ops); 1133 if (rt == NULL) { 1134 ip_rt_put(rth); 1135 in_dev_put(in_dev); 1136 return; 1137 } 1138 1139 /* Copy all the information. 
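
			   The structure copy below is followed by re-taking every
			   reference the clone now shares (output device, inet
			   device, inet_peer) and by resetting the per-entry state
			   (refcount, use count, rcu head, child, neighbour, hh and
			   xfrm pointers); only then is the clone, with the new
			   gateway filled in, bound to a neighbour and swapped into
			   the hash bucket in place of the old entry.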
*/ 1140 *rt = *rth; 1141 INIT_RCU_HEAD(&rt->u.dst.rcu_head); 1142 rt->u.dst.__use = 1; 1143 atomic_set(&rt->u.dst.__refcnt, 1); 1144 rt->u.dst.child = NULL; 1145 if (rt->u.dst.dev) 1146 dev_hold(rt->u.dst.dev); 1147 if (rt->idev) 1148 in_dev_hold(rt->idev); 1149 rt->u.dst.obsolete = 0; 1150 rt->u.dst.lastuse = jiffies; 1151 rt->u.dst.path = &rt->u.dst; 1152 rt->u.dst.neighbour = NULL; 1153 rt->u.dst.hh = NULL; 1154 rt->u.dst.xfrm = NULL; 1155 1156 rt->rt_flags |= RTCF_REDIRECTED; 1157 1158 /* Gateway is different ... */ 1159 rt->rt_gateway = new_gw; 1160 1161 /* Redirect received -> path was valid */ 1162 dst_confirm(&rth->u.dst); 1163 1164 if (rt->peer) 1165 atomic_inc(&rt->peer->refcnt); 1166 1167 if (arp_bind_neighbour(&rt->u.dst) || 1168 !(rt->u.dst.neighbour->nud_state & 1169 NUD_VALID)) { 1170 if (rt->u.dst.neighbour) 1171 neigh_event_send(rt->u.dst.neighbour, NULL); 1172 ip_rt_put(rth); 1173 rt_drop(rt); 1174 goto do_next; 1175 } 1176 1177 rt_del(hash, rth); 1178 if (!rt_intern_hash(hash, rt, &rt)) 1179 ip_rt_put(rt); 1180 goto do_next; 1181 } 1182 rcu_read_unlock(); 1183 do_next: 1184 ; 1185 } 1186 } 1187 in_dev_put(in_dev); 1188 return; 1189 1190 reject_redirect: 1191 #ifdef CONFIG_IP_ROUTE_VERBOSE 1192 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) 1193 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about " 1194 "%u.%u.%u.%u ignored.\n" 1195 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, " 1196 "tos %02x\n", 1197 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw), 1198 NIPQUAD(saddr), NIPQUAD(daddr), tos); 1199 #endif 1200 in_dev_put(in_dev); 1201 } 1202 1203 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) 1204 { 1205 struct rtable *rt = (struct rtable*)dst; 1206 struct dst_entry *ret = dst; 1207 1208 if (rt) { 1209 if (dst->obsolete) { 1210 ip_rt_put(rt); 1211 ret = NULL; 1212 } else if ((rt->rt_flags & RTCF_REDIRECTED) || 1213 rt->u.dst.expires) { 1214 unsigned hash = rt_hash_code(rt->fl.fl4_dst, 1215 rt->fl.fl4_src ^ 1216 (rt->fl.oif << 5), 1217 rt->fl.fl4_tos); 1218 #if RT_CACHE_DEBUG >= 1 1219 printk(KERN_DEBUG "ip_rt_advice: redirect to " 1220 "%u.%u.%u.%u/%02x dropped\n", 1221 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos); 1222 #endif 1223 rt_del(hash, rt); 1224 ret = NULL; 1225 } 1226 } 1227 return ret; 1228 } 1229 1230 /* 1231 * Algorithm: 1232 * 1. The first ip_rt_redirect_number redirects are sent 1233 * with exponential backoff, then we stop sending them at all, 1234 * assuming that the host ignores our redirects. 1235 * 2. If we did not see packets requiring redirects 1236 * during ip_rt_redirect_silence, we assume that the host 1237 * forgot redirected route and start to send redirects again. 1238 * 1239 * This algorithm is much cheaper and more intelligent than dumb load limiting 1240 * in icmp.c. 1241 * 1242 * NOTE. Do not forget to inhibit load limiting for redirects (redundant) 1243 * and "frag. need" (breaks PMTU discovery) in icmp.c. 1244 */ 1245 1246 void ip_rt_send_redirect(struct sk_buff *skb) 1247 { 1248 struct rtable *rt = (struct rtable*)skb->dst; 1249 struct in_device *in_dev = in_dev_get(rt->u.dst.dev); 1250 1251 if (!in_dev) 1252 return; 1253 1254 if (!IN_DEV_TX_REDIRECTS(in_dev)) 1255 goto out; 1256 1257 /* No redirected packets during ip_rt_redirect_silence; 1258 * reset the algorithm. 1259 */ 1260 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence)) 1261 rt->u.dst.rate_tokens = 0; 1262 1263 /* Too many ignored redirects; do not send anything 1264 * set u.dst.rate_last to the last seen redirected packet. 
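	 *
	 * For illustration with the defaults (ip_rt_redirect_load == HZ/50,
	 * ip_rt_redirect_number == 9, ip_rt_redirect_silence == (HZ/50) << 10):
	 * the gap enforced below is ip_rt_redirect_load << rate_tokens, i.e.
	 * it doubles from 40ms after the first redirect to about 5.1s before
	 * the ninth; after nine unanswered redirects nothing more is sent
	 * until roughly 20s of silence have passed and the token count is
	 * reset above.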
1265 */ 1266 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) { 1267 rt->u.dst.rate_last = jiffies; 1268 goto out; 1269 } 1270 1271 /* Check for load limit; set rate_last to the latest sent 1272 * redirect. 1273 */ 1274 if (time_after(jiffies, 1275 (rt->u.dst.rate_last + 1276 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) { 1277 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway); 1278 rt->u.dst.rate_last = jiffies; 1279 ++rt->u.dst.rate_tokens; 1280 #ifdef CONFIG_IP_ROUTE_VERBOSE 1281 if (IN_DEV_LOG_MARTIANS(in_dev) && 1282 rt->u.dst.rate_tokens == ip_rt_redirect_number && 1283 net_ratelimit()) 1284 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores " 1285 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n", 1286 NIPQUAD(rt->rt_src), rt->rt_iif, 1287 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway)); 1288 #endif 1289 } 1290 out: 1291 in_dev_put(in_dev); 1292 } 1293 1294 static int ip_error(struct sk_buff *skb) 1295 { 1296 struct rtable *rt = (struct rtable*)skb->dst; 1297 unsigned long now; 1298 int code; 1299 1300 switch (rt->u.dst.error) { 1301 case EINVAL: 1302 default: 1303 goto out; 1304 case EHOSTUNREACH: 1305 code = ICMP_HOST_UNREACH; 1306 break; 1307 case ENETUNREACH: 1308 code = ICMP_NET_UNREACH; 1309 break; 1310 case EACCES: 1311 code = ICMP_PKT_FILTERED; 1312 break; 1313 } 1314 1315 now = jiffies; 1316 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last; 1317 if (rt->u.dst.rate_tokens > ip_rt_error_burst) 1318 rt->u.dst.rate_tokens = ip_rt_error_burst; 1319 rt->u.dst.rate_last = now; 1320 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) { 1321 rt->u.dst.rate_tokens -= ip_rt_error_cost; 1322 icmp_send(skb, ICMP_DEST_UNREACH, code, 0); 1323 } 1324 1325 out: kfree_skb(skb); 1326 return 0; 1327 } 1328 1329 /* 1330 * The last two values are not from the RFC but 1331 * are needed for AMPRnet AX.25 paths. 
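 *
 * The values follow the plateau idea of classical PMTU discovery
 * (RFC 1191): when a Fragmentation Needed message carries no usable
 * next-hop MTU, guess_mtu() drops to the next plateau strictly below
 * the old MTU, e.g. 1500 becomes 1492 and 296 becomes 216, bottoming
 * out at the 68 byte IPv4 minimum.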
1332 */ 1333 1334 static unsigned short mtu_plateau[] = 1335 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 }; 1336 1337 static __inline__ unsigned short guess_mtu(unsigned short old_mtu) 1338 { 1339 int i; 1340 1341 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++) 1342 if (old_mtu > mtu_plateau[i]) 1343 return mtu_plateau[i]; 1344 return 68; 1345 } 1346 1347 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu) 1348 { 1349 int i; 1350 unsigned short old_mtu = ntohs(iph->tot_len); 1351 struct rtable *rth; 1352 u32 skeys[2] = { iph->saddr, 0, }; 1353 u32 daddr = iph->daddr; 1354 u8 tos = iph->tos & IPTOS_RT_MASK; 1355 unsigned short est_mtu = 0; 1356 1357 if (ipv4_config.no_pmtu_disc) 1358 return 0; 1359 1360 for (i = 0; i < 2; i++) { 1361 unsigned hash = rt_hash_code(daddr, skeys[i], tos); 1362 1363 rcu_read_lock(); 1364 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; 1365 rth = rcu_dereference(rth->u.rt_next)) { 1366 if (rth->fl.fl4_dst == daddr && 1367 rth->fl.fl4_src == skeys[i] && 1368 rth->rt_dst == daddr && 1369 rth->rt_src == iph->saddr && 1370 rth->fl.fl4_tos == tos && 1371 rth->fl.iif == 0 && 1372 !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) { 1373 unsigned short mtu = new_mtu; 1374 1375 if (new_mtu < 68 || new_mtu >= old_mtu) { 1376 1377 /* BSD 4.2 compatibility hack :-( */ 1378 if (mtu == 0 && 1379 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] && 1380 old_mtu >= 68 + (iph->ihl << 2)) 1381 old_mtu -= iph->ihl << 2; 1382 1383 mtu = guess_mtu(old_mtu); 1384 } 1385 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) { 1386 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) { 1387 dst_confirm(&rth->u.dst); 1388 if (mtu < ip_rt_min_pmtu) { 1389 mtu = ip_rt_min_pmtu; 1390 rth->u.dst.metrics[RTAX_LOCK-1] |= 1391 (1 << RTAX_MTU); 1392 } 1393 rth->u.dst.metrics[RTAX_MTU-1] = mtu; 1394 dst_set_expires(&rth->u.dst, 1395 ip_rt_mtu_expires); 1396 } 1397 est_mtu = mtu; 1398 } 1399 } 1400 } 1401 rcu_read_unlock(); 1402 } 1403 return est_mtu ? 
: new_mtu; 1404 } 1405 1406 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu) 1407 { 1408 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 && 1409 !(dst_metric_locked(dst, RTAX_MTU))) { 1410 if (mtu < ip_rt_min_pmtu) { 1411 mtu = ip_rt_min_pmtu; 1412 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU); 1413 } 1414 dst->metrics[RTAX_MTU-1] = mtu; 1415 dst_set_expires(dst, ip_rt_mtu_expires); 1416 } 1417 } 1418 1419 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) 1420 { 1421 return NULL; 1422 } 1423 1424 static void ipv4_dst_destroy(struct dst_entry *dst) 1425 { 1426 struct rtable *rt = (struct rtable *) dst; 1427 struct inet_peer *peer = rt->peer; 1428 struct in_device *idev = rt->idev; 1429 1430 if (peer) { 1431 rt->peer = NULL; 1432 inet_putpeer(peer); 1433 } 1434 1435 if (idev) { 1436 rt->idev = NULL; 1437 in_dev_put(idev); 1438 } 1439 } 1440 1441 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev, 1442 int how) 1443 { 1444 struct rtable *rt = (struct rtable *) dst; 1445 struct in_device *idev = rt->idev; 1446 if (dev != &loopback_dev && idev && idev->dev == dev) { 1447 struct in_device *loopback_idev = in_dev_get(&loopback_dev); 1448 if (loopback_idev) { 1449 rt->idev = loopback_idev; 1450 in_dev_put(idev); 1451 } 1452 } 1453 } 1454 1455 static void ipv4_link_failure(struct sk_buff *skb) 1456 { 1457 struct rtable *rt; 1458 1459 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); 1460 1461 rt = (struct rtable *) skb->dst; 1462 if (rt) 1463 dst_set_expires(&rt->u.dst, 0); 1464 } 1465 1466 static int ip_rt_bug(struct sk_buff *skb) 1467 { 1468 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n", 1469 NIPQUAD(skb->nh.iph->saddr), NIPQUAD(skb->nh.iph->daddr), 1470 skb->dev ? skb->dev->name : "?"); 1471 kfree_skb(skb); 1472 return 0; 1473 } 1474 1475 /* 1476 We do not cache source address of outgoing interface, 1477 because it is used only by IP RR, TS and SRR options, 1478 so that it out of fast path. 1479 1480 BTW remember: "addr" is allowed to be not aligned 1481 in IP options! 
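
   That is also why ip_rt_get_source() copies the selected address with
   memcpy() rather than storing through a u32 pointer: "addr" points into
   the option area of the packet and can sit at any byte offset.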
1482 */ 1483 1484 void ip_rt_get_source(u8 *addr, struct rtable *rt) 1485 { 1486 u32 src; 1487 struct fib_result res; 1488 1489 if (rt->fl.iif == 0) 1490 src = rt->rt_src; 1491 else if (fib_lookup(&rt->fl, &res) == 0) { 1492 src = FIB_RES_PREFSRC(res); 1493 fib_res_put(&res); 1494 } else 1495 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway, 1496 RT_SCOPE_UNIVERSE); 1497 memcpy(addr, &src, 4); 1498 } 1499 1500 #ifdef CONFIG_NET_CLS_ROUTE 1501 static void set_class_tag(struct rtable *rt, u32 tag) 1502 { 1503 if (!(rt->u.dst.tclassid & 0xFFFF)) 1504 rt->u.dst.tclassid |= tag & 0xFFFF; 1505 if (!(rt->u.dst.tclassid & 0xFFFF0000)) 1506 rt->u.dst.tclassid |= tag & 0xFFFF0000; 1507 } 1508 #endif 1509 1510 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag) 1511 { 1512 struct fib_info *fi = res->fi; 1513 1514 if (fi) { 1515 if (FIB_RES_GW(*res) && 1516 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) 1517 rt->rt_gateway = FIB_RES_GW(*res); 1518 memcpy(rt->u.dst.metrics, fi->fib_metrics, 1519 sizeof(rt->u.dst.metrics)); 1520 if (fi->fib_mtu == 0) { 1521 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu; 1522 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) && 1523 rt->rt_gateway != rt->rt_dst && 1524 rt->u.dst.dev->mtu > 576) 1525 rt->u.dst.metrics[RTAX_MTU-1] = 576; 1526 } 1527 #ifdef CONFIG_NET_CLS_ROUTE 1528 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid; 1529 #endif 1530 } else 1531 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu; 1532 1533 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0) 1534 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl; 1535 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU) 1536 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU; 1537 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0) 1538 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40, 1539 ip_rt_min_advmss); 1540 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40) 1541 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40; 1542 1543 #ifdef CONFIG_NET_CLS_ROUTE 1544 #ifdef CONFIG_IP_MULTIPLE_TABLES 1545 set_class_tag(rt, fib_rules_tclass(res)); 1546 #endif 1547 set_class_tag(rt, itag); 1548 #endif 1549 rt->rt_type = res->type; 1550 } 1551 1552 static int ip_route_input_mc(struct sk_buff *skb, u32 daddr, u32 saddr, 1553 u8 tos, struct net_device *dev, int our) 1554 { 1555 unsigned hash; 1556 struct rtable *rth; 1557 u32 spec_dst; 1558 struct in_device *in_dev = in_dev_get(dev); 1559 u32 itag = 0; 1560 1561 /* Primary sanity checks. 
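
	   The source must be a plausible unicast address (not multicast, not
	   class E, not loopback) and the frame must really carry IPv4.  A
	   source inside 0.0.0.0/8 is tolerated only for link-local multicast
	   destinations, for which a link-scope address of the receiving
	   interface is picked as the specific destination; anything else is
	   validated against the FIB by fib_validate_source().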
*/ 1562 1563 if (in_dev == NULL) 1564 return -EINVAL; 1565 1566 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) || 1567 skb->protocol != htons(ETH_P_IP)) 1568 goto e_inval; 1569 1570 if (ZERONET(saddr)) { 1571 if (!LOCAL_MCAST(daddr)) 1572 goto e_inval; 1573 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); 1574 } else if (fib_validate_source(saddr, 0, tos, 0, 1575 dev, &spec_dst, &itag) < 0) 1576 goto e_inval; 1577 1578 rth = dst_alloc(&ipv4_dst_ops); 1579 if (!rth) 1580 goto e_nobufs; 1581 1582 rth->u.dst.output= ip_rt_bug; 1583 1584 atomic_set(&rth->u.dst.__refcnt, 1); 1585 rth->u.dst.flags= DST_HOST; 1586 if (in_dev->cnf.no_policy) 1587 rth->u.dst.flags |= DST_NOPOLICY; 1588 rth->fl.fl4_dst = daddr; 1589 rth->rt_dst = daddr; 1590 rth->fl.fl4_tos = tos; 1591 #ifdef CONFIG_IP_ROUTE_FWMARK 1592 rth->fl.fl4_fwmark= skb->nfmark; 1593 #endif 1594 rth->fl.fl4_src = saddr; 1595 rth->rt_src = saddr; 1596 #ifdef CONFIG_NET_CLS_ROUTE 1597 rth->u.dst.tclassid = itag; 1598 #endif 1599 rth->rt_iif = 1600 rth->fl.iif = dev->ifindex; 1601 rth->u.dst.dev = &loopback_dev; 1602 dev_hold(rth->u.dst.dev); 1603 rth->idev = in_dev_get(rth->u.dst.dev); 1604 rth->fl.oif = 0; 1605 rth->rt_gateway = daddr; 1606 rth->rt_spec_dst= spec_dst; 1607 rth->rt_type = RTN_MULTICAST; 1608 rth->rt_flags = RTCF_MULTICAST; 1609 if (our) { 1610 rth->u.dst.input= ip_local_deliver; 1611 rth->rt_flags |= RTCF_LOCAL; 1612 } 1613 1614 #ifdef CONFIG_IP_MROUTE 1615 if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev)) 1616 rth->u.dst.input = ip_mr_input; 1617 #endif 1618 RT_CACHE_STAT_INC(in_slow_mc); 1619 1620 in_dev_put(in_dev); 1621 hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos); 1622 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst); 1623 1624 e_nobufs: 1625 in_dev_put(in_dev); 1626 return -ENOBUFS; 1627 1628 e_inval: 1629 in_dev_put(in_dev); 1630 return -EINVAL; 1631 } 1632 1633 1634 static void ip_handle_martian_source(struct net_device *dev, 1635 struct in_device *in_dev, 1636 struct sk_buff *skb, 1637 u32 daddr, 1638 u32 saddr) 1639 { 1640 RT_CACHE_STAT_INC(in_martian_src); 1641 #ifdef CONFIG_IP_ROUTE_VERBOSE 1642 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) { 1643 /* 1644 * RFC1812 recommendation, if source is martian, 1645 * the only hint is MAC header. 1646 */ 1647 printk(KERN_WARNING "martian source %u.%u.%u.%u from " 1648 "%u.%u.%u.%u, on dev %s\n", 1649 NIPQUAD(daddr), NIPQUAD(saddr), dev->name); 1650 if (dev->hard_header_len) { 1651 int i; 1652 unsigned char *p = skb->mac.raw; 1653 printk(KERN_WARNING "ll header: "); 1654 for (i = 0; i < dev->hard_header_len; i++, p++) { 1655 printk("%02x", *p); 1656 if (i < (dev->hard_header_len - 1)) 1657 printk(":"); 1658 } 1659 printk("\n"); 1660 } 1661 } 1662 #endif 1663 } 1664 1665 static inline int __mkroute_input(struct sk_buff *skb, 1666 struct fib_result* res, 1667 struct in_device *in_dev, 1668 u32 daddr, u32 saddr, u32 tos, 1669 struct rtable **result) 1670 { 1671 1672 struct rtable *rth; 1673 int err; 1674 struct in_device *out_dev; 1675 unsigned flags = 0; 1676 u32 spec_dst, itag; 1677 1678 /* get a working reference to the output device */ 1679 out_dev = in_dev_get(FIB_RES_DEV(*res)); 1680 if (out_dev == NULL) { 1681 if (net_ratelimit()) 1682 printk(KERN_CRIT "Bug in ip_route_input" \ 1683 "_slow(). 
Please, report\n"); 1684 return -EINVAL; 1685 } 1686 1687 1688 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res), 1689 in_dev->dev, &spec_dst, &itag); 1690 if (err < 0) { 1691 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, 1692 saddr); 1693 1694 err = -EINVAL; 1695 goto cleanup; 1696 } 1697 1698 if (err) 1699 flags |= RTCF_DIRECTSRC; 1700 1701 if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) && 1702 (IN_DEV_SHARED_MEDIA(out_dev) || 1703 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res)))) 1704 flags |= RTCF_DOREDIRECT; 1705 1706 if (skb->protocol != htons(ETH_P_IP)) { 1707 /* Not IP (i.e. ARP). Do not create route, if it is 1708 * invalid for proxy arp. DNAT routes are always valid. 1709 */ 1710 if (out_dev == in_dev && !(flags & RTCF_DNAT)) { 1711 err = -EINVAL; 1712 goto cleanup; 1713 } 1714 } 1715 1716 1717 rth = dst_alloc(&ipv4_dst_ops); 1718 if (!rth) { 1719 err = -ENOBUFS; 1720 goto cleanup; 1721 } 1722 1723 rth->u.dst.flags= DST_HOST; 1724 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED 1725 if (res->fi->fib_nhs > 1) 1726 rth->u.dst.flags |= DST_BALANCED; 1727 #endif 1728 if (in_dev->cnf.no_policy) 1729 rth->u.dst.flags |= DST_NOPOLICY; 1730 if (in_dev->cnf.no_xfrm) 1731 rth->u.dst.flags |= DST_NOXFRM; 1732 rth->fl.fl4_dst = daddr; 1733 rth->rt_dst = daddr; 1734 rth->fl.fl4_tos = tos; 1735 #ifdef CONFIG_IP_ROUTE_FWMARK 1736 rth->fl.fl4_fwmark= skb->nfmark; 1737 #endif 1738 rth->fl.fl4_src = saddr; 1739 rth->rt_src = saddr; 1740 rth->rt_gateway = daddr; 1741 rth->rt_iif = 1742 rth->fl.iif = in_dev->dev->ifindex; 1743 rth->u.dst.dev = (out_dev)->dev; 1744 dev_hold(rth->u.dst.dev); 1745 rth->idev = in_dev_get(rth->u.dst.dev); 1746 rth->fl.oif = 0; 1747 rth->rt_spec_dst= spec_dst; 1748 1749 rth->u.dst.input = ip_forward; 1750 rth->u.dst.output = ip_output; 1751 1752 rt_set_nexthop(rth, res, itag); 1753 1754 rth->rt_flags = flags; 1755 1756 *result = rth; 1757 err = 0; 1758 cleanup: 1759 /* release the working reference to the output device */ 1760 in_dev_put(out_dev); 1761 return err; 1762 } 1763 1764 static inline int ip_mkroute_input_def(struct sk_buff *skb, 1765 struct fib_result* res, 1766 const struct flowi *fl, 1767 struct in_device *in_dev, 1768 u32 daddr, u32 saddr, u32 tos) 1769 { 1770 struct rtable* rth; 1771 int err; 1772 unsigned hash; 1773 1774 #ifdef CONFIG_IP_ROUTE_MULTIPATH 1775 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0) 1776 fib_select_multipath(fl, res); 1777 #endif 1778 1779 /* create a routing cache entry */ 1780 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth); 1781 if (err) 1782 return err; 1783 atomic_set(&rth->u.dst.__refcnt, 1); 1784 1785 /* put it into the cache */ 1786 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos); 1787 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst); 1788 } 1789 1790 static inline int ip_mkroute_input(struct sk_buff *skb, 1791 struct fib_result* res, 1792 const struct flowi *fl, 1793 struct in_device *in_dev, 1794 u32 daddr, u32 saddr, u32 tos) 1795 { 1796 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED 1797 struct rtable* rth; 1798 unsigned char hop, hopcount, lasthop; 1799 int err = -EINVAL; 1800 unsigned int hash; 1801 1802 if (res->fi) 1803 hopcount = res->fi->fib_nhs; 1804 else 1805 hopcount = 1; 1806 1807 lasthop = hopcount - 1; 1808 1809 /* distinguish between multipath and singlepath */ 1810 if (hopcount < 2) 1811 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, 1812 saddr, tos); 1813 1814 /* add all alternatives to the routing cache */ 1815 for (hop = 
0; hop < hopcount; hop++) { 1816 res->nh_sel = hop; 1817 1818 /* create a routing cache entry */ 1819 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, 1820 &rth); 1821 if (err) 1822 return err; 1823 1824 /* put it into the cache */ 1825 hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos); 1826 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst); 1827 if (err) 1828 return err; 1829 1830 /* forward hop information to multipath impl. */ 1831 multipath_set_nhinfo(rth, 1832 FIB_RES_NETWORK(*res), 1833 FIB_RES_NETMASK(*res), 1834 res->prefixlen, 1835 &FIB_RES_NH(*res)); 1836 1837 /* only for the last hop the reference count is handled 1838 * outside 1839 */ 1840 if (hop == lasthop) 1841 atomic_set(&(skb->dst->__refcnt), 1); 1842 } 1843 return err; 1844 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ 1845 return ip_mkroute_input_def(skb, res, fl, in_dev, daddr, saddr, tos); 1846 #endif /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ 1847 } 1848 1849 1850 /* 1851 * NOTE. We drop all the packets that has local source 1852 * addresses, because every properly looped back packet 1853 * must have correct destination already attached by output routine. 1854 * 1855 * Such approach solves two big problems: 1856 * 1. Not simplex devices are handled properly. 1857 * 2. IP spoofing attempts are filtered with 100% of guarantee. 1858 */ 1859 1860 static int ip_route_input_slow(struct sk_buff *skb, u32 daddr, u32 saddr, 1861 u8 tos, struct net_device *dev) 1862 { 1863 struct fib_result res; 1864 struct in_device *in_dev = in_dev_get(dev); 1865 struct flowi fl = { .nl_u = { .ip4_u = 1866 { .daddr = daddr, 1867 .saddr = saddr, 1868 .tos = tos, 1869 .scope = RT_SCOPE_UNIVERSE, 1870 #ifdef CONFIG_IP_ROUTE_FWMARK 1871 .fwmark = skb->nfmark 1872 #endif 1873 } }, 1874 .iif = dev->ifindex }; 1875 unsigned flags = 0; 1876 u32 itag = 0; 1877 struct rtable * rth; 1878 unsigned hash; 1879 u32 spec_dst; 1880 int err = -EINVAL; 1881 int free_res = 0; 1882 1883 /* IP on this device is disabled. */ 1884 1885 if (!in_dev) 1886 goto out; 1887 1888 /* Check for the most weird martians, which can be not detected 1889 by fib_lookup. 1890 */ 1891 1892 if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr)) 1893 goto martian_source; 1894 1895 if (daddr == 0xFFFFFFFF || (saddr == 0 && daddr == 0)) 1896 goto brd_input; 1897 1898 /* Accept zero addresses only to limited broadcast; 1899 * I even do not know to fix it or not. Waiting for complains :-) 1900 */ 1901 if (ZERONET(saddr)) 1902 goto martian_source; 1903 1904 if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr)) 1905 goto martian_destination; 1906 1907 /* 1908 * Now we are ready to route packet. 
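 *
 *	In outline: fib_lookup() classifies the destination; broadcasts
 *	take the brd_input path, local addresses get their source checked
 *	and become local_input entries, unicast forwarding goes through
 *	ip_mkroute_input(), and a failed lookup on a forwarding interface
 *	is cached as an RTN_UNREACHABLE entry whose input handler is
 *	ip_error(), so the resulting ICMP errors are rate limited per
 *	destination.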
1909 */ 1910 if ((err = fib_lookup(&fl, &res)) != 0) { 1911 if (!IN_DEV_FORWARD(in_dev)) 1912 goto e_inval; 1913 goto no_route; 1914 } 1915 free_res = 1; 1916 1917 RT_CACHE_STAT_INC(in_slow_tot); 1918 1919 if (res.type == RTN_BROADCAST) 1920 goto brd_input; 1921 1922 if (res.type == RTN_LOCAL) { 1923 int result; 1924 result = fib_validate_source(saddr, daddr, tos, 1925 loopback_dev.ifindex, 1926 dev, &spec_dst, &itag); 1927 if (result < 0) 1928 goto martian_source; 1929 if (result) 1930 flags |= RTCF_DIRECTSRC; 1931 spec_dst = daddr; 1932 goto local_input; 1933 } 1934 1935 if (!IN_DEV_FORWARD(in_dev)) 1936 goto e_inval; 1937 if (res.type != RTN_UNICAST) 1938 goto martian_destination; 1939 1940 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos); 1941 if (err == -ENOBUFS) 1942 goto e_nobufs; 1943 if (err == -EINVAL) 1944 goto e_inval; 1945 1946 done: 1947 in_dev_put(in_dev); 1948 if (free_res) 1949 fib_res_put(&res); 1950 out: return err; 1951 1952 brd_input: 1953 if (skb->protocol != htons(ETH_P_IP)) 1954 goto e_inval; 1955 1956 if (ZERONET(saddr)) 1957 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); 1958 else { 1959 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, 1960 &itag); 1961 if (err < 0) 1962 goto martian_source; 1963 if (err) 1964 flags |= RTCF_DIRECTSRC; 1965 } 1966 flags |= RTCF_BROADCAST; 1967 res.type = RTN_BROADCAST; 1968 RT_CACHE_STAT_INC(in_brd); 1969 1970 local_input: 1971 rth = dst_alloc(&ipv4_dst_ops); 1972 if (!rth) 1973 goto e_nobufs; 1974 1975 rth->u.dst.output= ip_rt_bug; 1976 1977 atomic_set(&rth->u.dst.__refcnt, 1); 1978 rth->u.dst.flags= DST_HOST; 1979 if (in_dev->cnf.no_policy) 1980 rth->u.dst.flags |= DST_NOPOLICY; 1981 rth->fl.fl4_dst = daddr; 1982 rth->rt_dst = daddr; 1983 rth->fl.fl4_tos = tos; 1984 #ifdef CONFIG_IP_ROUTE_FWMARK 1985 rth->fl.fl4_fwmark= skb->nfmark; 1986 #endif 1987 rth->fl.fl4_src = saddr; 1988 rth->rt_src = saddr; 1989 #ifdef CONFIG_NET_CLS_ROUTE 1990 rth->u.dst.tclassid = itag; 1991 #endif 1992 rth->rt_iif = 1993 rth->fl.iif = dev->ifindex; 1994 rth->u.dst.dev = &loopback_dev; 1995 dev_hold(rth->u.dst.dev); 1996 rth->idev = in_dev_get(rth->u.dst.dev); 1997 rth->rt_gateway = daddr; 1998 rth->rt_spec_dst= spec_dst; 1999 rth->u.dst.input= ip_local_deliver; 2000 rth->rt_flags = flags|RTCF_LOCAL; 2001 if (res.type == RTN_UNREACHABLE) { 2002 rth->u.dst.input= ip_error; 2003 rth->u.dst.error= -err; 2004 rth->rt_flags &= ~RTCF_LOCAL; 2005 } 2006 rth->rt_type = res.type; 2007 hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos); 2008 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst); 2009 goto done; 2010 2011 no_route: 2012 RT_CACHE_STAT_INC(in_no_route); 2013 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE); 2014 res.type = RTN_UNREACHABLE; 2015 goto local_input; 2016 2017 /* 2018 * Do not cache martian addresses: they should be logged (RFC1812) 2019 */ 2020 martian_destination: 2021 RT_CACHE_STAT_INC(in_martian_dst); 2022 #ifdef CONFIG_IP_ROUTE_VERBOSE 2023 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) 2024 printk(KERN_WARNING "martian destination %u.%u.%u.%u from " 2025 "%u.%u.%u.%u, dev %s\n", 2026 NIPQUAD(daddr), NIPQUAD(saddr), dev->name); 2027 #endif 2028 e_inval: 2029 err = -EINVAL; 2030 goto done; 2031 2032 e_nobufs: 2033 err = -ENOBUFS; 2034 goto done; 2035 2036 martian_source: 2037 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); 2038 goto e_inval; 2039 } 2040 2041 int ip_route_input(struct sk_buff *skb, u32 daddr, u32 saddr, 2042 u8 tos, struct net_device *dev) 2043 { 2044 
struct rtable * rth; 2045 unsigned hash; 2046 int iif = dev->ifindex; 2047 2048 tos &= IPTOS_RT_MASK; 2049 hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos); 2050 2051 rcu_read_lock(); 2052 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; 2053 rth = rcu_dereference(rth->u.rt_next)) { 2054 if (rth->fl.fl4_dst == daddr && 2055 rth->fl.fl4_src == saddr && 2056 rth->fl.iif == iif && 2057 rth->fl.oif == 0 && 2058 #ifdef CONFIG_IP_ROUTE_FWMARK 2059 rth->fl.fl4_fwmark == skb->nfmark && 2060 #endif 2061 rth->fl.fl4_tos == tos) { 2062 rth->u.dst.lastuse = jiffies; 2063 dst_hold(&rth->u.dst); 2064 rth->u.dst.__use++; 2065 RT_CACHE_STAT_INC(in_hit); 2066 rcu_read_unlock(); 2067 skb->dst = (struct dst_entry*)rth; 2068 return 0; 2069 } 2070 RT_CACHE_STAT_INC(in_hlist_search); 2071 } 2072 rcu_read_unlock(); 2073 2074 /* Multicast recognition logic is moved from route cache to here. 2075 The problem was that too many Ethernet cards have broken/missing 2076 hardware multicast filters :-( As result the host on multicasting 2077 network acquires a lot of useless route cache entries, sort of 2078 SDR messages from all the world. Now we try to get rid of them. 2079 Really, provided software IP multicast filter is organized 2080 reasonably (at least, hashed), it does not result in a slowdown 2081 comparing with route cache reject entries. 2082 Note, that multicast routers are not affected, because 2083 route cache entry is created eventually. 2084 */ 2085 if (MULTICAST(daddr)) { 2086 struct in_device *in_dev; 2087 2088 rcu_read_lock(); 2089 if ((in_dev = __in_dev_get(dev)) != NULL) { 2090 int our = ip_check_mc(in_dev, daddr, saddr, 2091 skb->nh.iph->protocol); 2092 if (our 2093 #ifdef CONFIG_IP_MROUTE 2094 || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev)) 2095 #endif 2096 ) { 2097 rcu_read_unlock(); 2098 return ip_route_input_mc(skb, daddr, saddr, 2099 tos, dev, our); 2100 } 2101 } 2102 rcu_read_unlock(); 2103 return -EINVAL; 2104 } 2105 return ip_route_input_slow(skb, daddr, saddr, tos, dev); 2106 } 2107 2108 static inline int __mkroute_output(struct rtable **result, 2109 struct fib_result* res, 2110 const struct flowi *fl, 2111 const struct flowi *oldflp, 2112 struct net_device *dev_out, 2113 unsigned flags) 2114 { 2115 struct rtable *rth; 2116 struct in_device *in_dev; 2117 u32 tos = RT_FL_TOS(oldflp); 2118 int err = 0; 2119 2120 if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK)) 2121 return -EINVAL; 2122 2123 if (fl->fl4_dst == 0xFFFFFFFF) 2124 res->type = RTN_BROADCAST; 2125 else if (MULTICAST(fl->fl4_dst)) 2126 res->type = RTN_MULTICAST; 2127 else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst)) 2128 return -EINVAL; 2129 2130 if (dev_out->flags & IFF_LOOPBACK) 2131 flags |= RTCF_LOCAL; 2132 2133 /* get work reference to inet device */ 2134 in_dev = in_dev_get(dev_out); 2135 if (!in_dev) 2136 return -EINVAL; 2137 2138 if (res->type == RTN_BROADCAST) { 2139 flags |= RTCF_BROADCAST | RTCF_LOCAL; 2140 if (res->fi) { 2141 fib_info_put(res->fi); 2142 res->fi = NULL; 2143 } 2144 } else if (res->type == RTN_MULTICAST) { 2145 flags |= RTCF_MULTICAST|RTCF_LOCAL; 2146 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, 2147 oldflp->proto)) 2148 flags &= ~RTCF_LOCAL; 2149 /* If multicast route do not exist use 2150 default one, but do not gateway in this case. 2151 Yes, it is hack. 
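
			   In effect a prefix length below 4 means the lookup did
			   not hit a real multicast route (224.0.0.0/4 or narrower)
			   but fell through to something broader such as the default
			   route, so the fib info is dropped and the packet goes
			   straight out of the device instead of via that route's
			   gateway or metrics.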
2152 */ 2153 if (res->fi && res->prefixlen < 4) { 2154 fib_info_put(res->fi); 2155 res->fi = NULL; 2156 } 2157 } 2158 2159 2160 rth = dst_alloc(&ipv4_dst_ops); 2161 if (!rth) { 2162 err = -ENOBUFS; 2163 goto cleanup; 2164 } 2165 2166 rth->u.dst.flags= DST_HOST; 2167 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED 2168 if (res->fi) { 2169 rth->rt_multipath_alg = res->fi->fib_mp_alg; 2170 if (res->fi->fib_nhs > 1) 2171 rth->u.dst.flags |= DST_BALANCED; 2172 } 2173 #endif 2174 if (in_dev->cnf.no_xfrm) 2175 rth->u.dst.flags |= DST_NOXFRM; 2176 if (in_dev->cnf.no_policy) 2177 rth->u.dst.flags |= DST_NOPOLICY; 2178 2179 rth->fl.fl4_dst = oldflp->fl4_dst; 2180 rth->fl.fl4_tos = tos; 2181 rth->fl.fl4_src = oldflp->fl4_src; 2182 rth->fl.oif = oldflp->oif; 2183 #ifdef CONFIG_IP_ROUTE_FWMARK 2184 rth->fl.fl4_fwmark= oldflp->fl4_fwmark; 2185 #endif 2186 rth->rt_dst = fl->fl4_dst; 2187 rth->rt_src = fl->fl4_src; 2188 rth->rt_iif = oldflp->oif ? : dev_out->ifindex; 2189 /* get references to the devices that are to be hold by the routing 2190 cache entry */ 2191 rth->u.dst.dev = dev_out; 2192 dev_hold(dev_out); 2193 rth->idev = in_dev_get(dev_out); 2194 rth->rt_gateway = fl->fl4_dst; 2195 rth->rt_spec_dst= fl->fl4_src; 2196 2197 rth->u.dst.output=ip_output; 2198 2199 RT_CACHE_STAT_INC(out_slow_tot); 2200 2201 if (flags & RTCF_LOCAL) { 2202 rth->u.dst.input = ip_local_deliver; 2203 rth->rt_spec_dst = fl->fl4_dst; 2204 } 2205 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { 2206 rth->rt_spec_dst = fl->fl4_src; 2207 if (flags & RTCF_LOCAL && 2208 !(dev_out->flags & IFF_LOOPBACK)) { 2209 rth->u.dst.output = ip_mc_output; 2210 RT_CACHE_STAT_INC(out_slow_mc); 2211 } 2212 #ifdef CONFIG_IP_MROUTE 2213 if (res->type == RTN_MULTICAST) { 2214 if (IN_DEV_MFORWARD(in_dev) && 2215 !LOCAL_MCAST(oldflp->fl4_dst)) { 2216 rth->u.dst.input = ip_mr_input; 2217 rth->u.dst.output = ip_mc_output; 2218 } 2219 } 2220 #endif 2221 } 2222 2223 rt_set_nexthop(rth, res, 0); 2224 2225 rth->rt_flags = flags; 2226 2227 *result = rth; 2228 cleanup: 2229 /* release work reference to inet device */ 2230 in_dev_put(in_dev); 2231 2232 return err; 2233 } 2234 2235 static inline int ip_mkroute_output_def(struct rtable **rp, 2236 struct fib_result* res, 2237 const struct flowi *fl, 2238 const struct flowi *oldflp, 2239 struct net_device *dev_out, 2240 unsigned flags) 2241 { 2242 struct rtable *rth; 2243 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags); 2244 unsigned hash; 2245 if (err == 0) { 2246 u32 tos = RT_FL_TOS(oldflp); 2247 2248 atomic_set(&rth->u.dst.__refcnt, 1); 2249 2250 hash = rt_hash_code(oldflp->fl4_dst, 2251 oldflp->fl4_src ^ (oldflp->oif << 5), tos); 2252 err = rt_intern_hash(hash, rth, rp); 2253 } 2254 2255 return err; 2256 } 2257 2258 static inline int ip_mkroute_output(struct rtable** rp, 2259 struct fib_result* res, 2260 const struct flowi *fl, 2261 const struct flowi *oldflp, 2262 struct net_device *dev_out, 2263 unsigned flags) 2264 { 2265 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED 2266 u32 tos = RT_FL_TOS(oldflp); 2267 unsigned char hop; 2268 unsigned hash; 2269 int err = -EINVAL; 2270 struct rtable *rth; 2271 2272 if (res->fi && res->fi->fib_nhs > 1) { 2273 unsigned char hopcount = res->fi->fib_nhs; 2274 2275 for (hop = 0; hop < hopcount; hop++) { 2276 struct net_device *dev2nexthop; 2277 2278 res->nh_sel = hop; 2279 2280 /* hold a work reference to the output device */ 2281 dev2nexthop = FIB_RES_DEV(*res); 2282 dev_hold(dev2nexthop); 2283 2284 err = __mkroute_output(&rth, res, fl, oldflp, 2285 dev2nexthop, flags); 
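			/* A separate cached rtable is built for every next hop;
			   they are all interned below under the same hash key so
			   that multipath_select_route() can later choose among
			   them on the output fast path. */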
2286 2287 if (err != 0) 2288 goto cleanup; 2289 2290 hash = rt_hash_code(oldflp->fl4_dst, 2291 oldflp->fl4_src ^ 2292 (oldflp->oif << 5), tos); 2293 err = rt_intern_hash(hash, rth, rp); 2294 2295 /* forward hop information to multipath impl. */ 2296 multipath_set_nhinfo(rth, 2297 FIB_RES_NETWORK(*res), 2298 FIB_RES_NETMASK(*res), 2299 res->prefixlen, 2300 &FIB_RES_NH(*res)); 2301 cleanup: 2302 /* release work reference to output device */ 2303 dev_put(dev2nexthop); 2304 2305 if (err != 0) 2306 return err; 2307 } 2308 atomic_set(&(*rp)->u.dst.__refcnt, 1); 2309 return err; 2310 } else { 2311 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, 2312 flags); 2313 } 2314 #else /* CONFIG_IP_ROUTE_MULTIPATH_CACHED */ 2315 return ip_mkroute_output_def(rp, res, fl, oldflp, dev_out, flags); 2316 #endif 2317 } 2318 2319 /* 2320 * Major route resolver routine. 2321 */ 2322 2323 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) 2324 { 2325 u32 tos = RT_FL_TOS(oldflp); 2326 struct flowi fl = { .nl_u = { .ip4_u = 2327 { .daddr = oldflp->fl4_dst, 2328 .saddr = oldflp->fl4_src, 2329 .tos = tos & IPTOS_RT_MASK, 2330 .scope = ((tos & RTO_ONLINK) ? 2331 RT_SCOPE_LINK : 2332 RT_SCOPE_UNIVERSE), 2333 #ifdef CONFIG_IP_ROUTE_FWMARK 2334 .fwmark = oldflp->fl4_fwmark 2335 #endif 2336 } }, 2337 .iif = loopback_dev.ifindex, 2338 .oif = oldflp->oif }; 2339 struct fib_result res; 2340 unsigned flags = 0; 2341 struct net_device *dev_out = NULL; 2342 int free_res = 0; 2343 int err; 2344 2345 2346 res.fi = NULL; 2347 #ifdef CONFIG_IP_MULTIPLE_TABLES 2348 res.r = NULL; 2349 #endif 2350 2351 if (oldflp->fl4_src) { 2352 err = -EINVAL; 2353 if (MULTICAST(oldflp->fl4_src) || 2354 BADCLASS(oldflp->fl4_src) || 2355 ZERONET(oldflp->fl4_src)) 2356 goto out; 2357 2358 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2359 dev_out = ip_dev_find(oldflp->fl4_src); 2360 if (dev_out == NULL) 2361 goto out; 2362 2363 /* I removed check for oif == dev_out->oif here. 2364 It was wrong for two reasons: 2365 1. ip_dev_find(saddr) can return wrong iface, if saddr is 2366 assigned to multiple interfaces. 2367 2. Moreover, we are allowed to send packets with saddr 2368 of another iface. --ANK 2369 */ 2370 2371 if (oldflp->oif == 0 2372 && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF)) { 2373 /* Special hack: user can direct multicasts 2374 and limited broadcast via necessary interface 2375 without fiddling with IP_MULTICAST_IF or IP_PKTINFO. 2376 This hack is not just for fun, it allows 2377 vic,vat and friends to work. 2378 They bind socket to loopback, set ttl to zero 2379 and expect that it will work. 2380 From the viewpoint of routing cache they are broken, 2381 because we are not allowed to build multicast path 2382 with loopback source addr (look, routing cache 2383 cannot know, that ttl is zero, so that packet 2384 will not leave this host and route is valid). 2385 Luckily, this hack is good workaround. 
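	   In effect, a multicast or limited-broadcast destination with no
	   oif is routed out through the interface that owns the given
	   source address, i.e. the one ip_dev_find() returned above.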
2386 */ 2387 2388 fl.oif = dev_out->ifindex; 2389 goto make_route; 2390 } 2391 if (dev_out) 2392 dev_put(dev_out); 2393 dev_out = NULL; 2394 } 2395 2396 2397 if (oldflp->oif) { 2398 dev_out = dev_get_by_index(oldflp->oif); 2399 err = -ENODEV; 2400 if (dev_out == NULL) 2401 goto out; 2402 if (__in_dev_get(dev_out) == NULL) { 2403 dev_put(dev_out); 2404 goto out; /* Wrong error code */ 2405 } 2406 2407 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == 0xFFFFFFFF) { 2408 if (!fl.fl4_src) 2409 fl.fl4_src = inet_select_addr(dev_out, 0, 2410 RT_SCOPE_LINK); 2411 goto make_route; 2412 } 2413 if (!fl.fl4_src) { 2414 if (MULTICAST(oldflp->fl4_dst)) 2415 fl.fl4_src = inet_select_addr(dev_out, 0, 2416 fl.fl4_scope); 2417 else if (!oldflp->fl4_dst) 2418 fl.fl4_src = inet_select_addr(dev_out, 0, 2419 RT_SCOPE_HOST); 2420 } 2421 } 2422 2423 if (!fl.fl4_dst) { 2424 fl.fl4_dst = fl.fl4_src; 2425 if (!fl.fl4_dst) 2426 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); 2427 if (dev_out) 2428 dev_put(dev_out); 2429 dev_out = &loopback_dev; 2430 dev_hold(dev_out); 2431 fl.oif = loopback_dev.ifindex; 2432 res.type = RTN_LOCAL; 2433 flags |= RTCF_LOCAL; 2434 goto make_route; 2435 } 2436 2437 if (fib_lookup(&fl, &res)) { 2438 res.fi = NULL; 2439 if (oldflp->oif) { 2440 /* Apparently, routing tables are wrong. Assume, 2441 that the destination is on link. 2442 2443 WHY? DW. 2444 Because we are allowed to send to iface 2445 even if it has NO routes and NO assigned 2446 addresses. When oif is specified, routing 2447 tables are looked up with only one purpose: 2448 to catch if destination is gatewayed, rather than 2449 direct. Moreover, if MSG_DONTROUTE is set, 2450 we send packet, ignoring both routing tables 2451 and ifaddr state. --ANK 2452 2453 2454 We could make it even if oif is unknown, 2455 likely IPv6, but we do not. 
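	   So, with an oif given and no matching route, the destination is
	   treated as directly reachable on that interface: res.type is
	   forced to RTN_UNICAST and a link-scope source address is chosen
	   just below.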
2456 */ 2457 2458 if (fl.fl4_src == 0) 2459 fl.fl4_src = inet_select_addr(dev_out, 0, 2460 RT_SCOPE_LINK); 2461 res.type = RTN_UNICAST; 2462 goto make_route; 2463 } 2464 if (dev_out) 2465 dev_put(dev_out); 2466 err = -ENETUNREACH; 2467 goto out; 2468 } 2469 free_res = 1; 2470 2471 if (res.type == RTN_LOCAL) { 2472 if (!fl.fl4_src) 2473 fl.fl4_src = fl.fl4_dst; 2474 if (dev_out) 2475 dev_put(dev_out); 2476 dev_out = &loopback_dev; 2477 dev_hold(dev_out); 2478 fl.oif = dev_out->ifindex; 2479 if (res.fi) 2480 fib_info_put(res.fi); 2481 res.fi = NULL; 2482 flags |= RTCF_LOCAL; 2483 goto make_route; 2484 } 2485 2486 #ifdef CONFIG_IP_ROUTE_MULTIPATH 2487 if (res.fi->fib_nhs > 1 && fl.oif == 0) 2488 fib_select_multipath(&fl, &res); 2489 else 2490 #endif 2491 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif) 2492 fib_select_default(&fl, &res); 2493 2494 if (!fl.fl4_src) 2495 fl.fl4_src = FIB_RES_PREFSRC(res); 2496 2497 if (dev_out) 2498 dev_put(dev_out); 2499 dev_out = FIB_RES_DEV(res); 2500 dev_hold(dev_out); 2501 fl.oif = dev_out->ifindex; 2502 2503 2504 make_route: 2505 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); 2506 2507 2508 if (free_res) 2509 fib_res_put(&res); 2510 if (dev_out) 2511 dev_put(dev_out); 2512 out: return err; 2513 } 2514 2515 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp) 2516 { 2517 unsigned hash; 2518 struct rtable *rth; 2519 2520 hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos); 2521 2522 rcu_read_lock_bh(); 2523 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth; 2524 rth = rcu_dereference(rth->u.rt_next)) { 2525 if (rth->fl.fl4_dst == flp->fl4_dst && 2526 rth->fl.fl4_src == flp->fl4_src && 2527 rth->fl.iif == 0 && 2528 rth->fl.oif == flp->oif && 2529 #ifdef CONFIG_IP_ROUTE_FWMARK 2530 rth->fl.fl4_fwmark == flp->fl4_fwmark && 2531 #endif 2532 !((rth->fl.fl4_tos ^ flp->fl4_tos) & 2533 (IPTOS_RT_MASK | RTO_ONLINK))) { 2534 2535 /* check for multipath routes and choose one if 2536 * necessary 2537 */ 2538 if (multipath_select_route(flp, rth, rp)) { 2539 dst_hold(&(*rp)->u.dst); 2540 RT_CACHE_STAT_INC(out_hit); 2541 rcu_read_unlock_bh(); 2542 return 0; 2543 } 2544 2545 rth->u.dst.lastuse = jiffies; 2546 dst_hold(&rth->u.dst); 2547 rth->u.dst.__use++; 2548 RT_CACHE_STAT_INC(out_hit); 2549 rcu_read_unlock_bh(); 2550 *rp = rth; 2551 return 0; 2552 } 2553 RT_CACHE_STAT_INC(out_hlist_search); 2554 } 2555 rcu_read_unlock_bh(); 2556 2557 return ip_route_output_slow(rp, flp); 2558 } 2559 2560 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags) 2561 { 2562 int err; 2563 2564 if ((err = __ip_route_output_key(rp, flp)) != 0) 2565 return err; 2566 2567 if (flp->proto) { 2568 if (!flp->fl4_src) 2569 flp->fl4_src = (*rp)->rt_src; 2570 if (!flp->fl4_dst) 2571 flp->fl4_dst = (*rp)->rt_dst; 2572 return xfrm_lookup((struct dst_entry **)rp, flp, sk, flags); 2573 } 2574 2575 return 0; 2576 } 2577 2578 int ip_route_output_key(struct rtable **rp, struct flowi *flp) 2579 { 2580 return ip_route_output_flow(rp, flp, NULL, 0); 2581 } 2582 2583 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, 2584 int nowait, unsigned int flags) 2585 { 2586 struct rtable *rt = (struct rtable*)skb->dst; 2587 struct rtmsg *r; 2588 struct nlmsghdr *nlh; 2589 unsigned char *b = skb->tail; 2590 struct rta_cacheinfo ci; 2591 #ifdef CONFIG_IP_MROUTE 2592 struct rtattr *eptr; 2593 #endif 2594 nlh = NLMSG_NEW(skb, pid, seq, event, sizeof(*r), flags); 2595 r = NLMSG_DATA(nlh); 
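	/*
	 * Fill the rtmsg header and the RTA_* attributes (RTA_DST, RTA_OIF,
	 * RTA_PREFSRC, RTA_GATEWAY, metrics, RTA_CACHEINFO, ...) describing
	 * this cached route; used by inet_rtm_getroute() and ip_rt_dump().
	 */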
2596 r->rtm_family = AF_INET; 2597 r->rtm_dst_len = 32; 2598 r->rtm_src_len = 0; 2599 r->rtm_tos = rt->fl.fl4_tos; 2600 r->rtm_table = RT_TABLE_MAIN; 2601 r->rtm_type = rt->rt_type; 2602 r->rtm_scope = RT_SCOPE_UNIVERSE; 2603 r->rtm_protocol = RTPROT_UNSPEC; 2604 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; 2605 if (rt->rt_flags & RTCF_NOTIFY) 2606 r->rtm_flags |= RTM_F_NOTIFY; 2607 RTA_PUT(skb, RTA_DST, 4, &rt->rt_dst); 2608 if (rt->fl.fl4_src) { 2609 r->rtm_src_len = 32; 2610 RTA_PUT(skb, RTA_SRC, 4, &rt->fl.fl4_src); 2611 } 2612 if (rt->u.dst.dev) 2613 RTA_PUT(skb, RTA_OIF, sizeof(int), &rt->u.dst.dev->ifindex); 2614 #ifdef CONFIG_NET_CLS_ROUTE 2615 if (rt->u.dst.tclassid) 2616 RTA_PUT(skb, RTA_FLOW, 4, &rt->u.dst.tclassid); 2617 #endif 2618 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED 2619 if (rt->rt_multipath_alg != IP_MP_ALG_NONE) { 2620 __u32 alg = rt->rt_multipath_alg; 2621 2622 RTA_PUT(skb, RTA_MP_ALGO, 4, &alg); 2623 } 2624 #endif 2625 if (rt->fl.iif) 2626 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_spec_dst); 2627 else if (rt->rt_src != rt->fl.fl4_src) 2628 RTA_PUT(skb, RTA_PREFSRC, 4, &rt->rt_src); 2629 if (rt->rt_dst != rt->rt_gateway) 2630 RTA_PUT(skb, RTA_GATEWAY, 4, &rt->rt_gateway); 2631 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0) 2632 goto rtattr_failure; 2633 ci.rta_lastuse = jiffies_to_clock_t(jiffies - rt->u.dst.lastuse); 2634 ci.rta_used = rt->u.dst.__use; 2635 ci.rta_clntref = atomic_read(&rt->u.dst.__refcnt); 2636 if (rt->u.dst.expires) 2637 ci.rta_expires = jiffies_to_clock_t(rt->u.dst.expires - jiffies); 2638 else 2639 ci.rta_expires = 0; 2640 ci.rta_error = rt->u.dst.error; 2641 ci.rta_id = ci.rta_ts = ci.rta_tsage = 0; 2642 if (rt->peer) { 2643 ci.rta_id = rt->peer->ip_id_count; 2644 if (rt->peer->tcp_ts_stamp) { 2645 ci.rta_ts = rt->peer->tcp_ts; 2646 ci.rta_tsage = xtime.tv_sec - rt->peer->tcp_ts_stamp; 2647 } 2648 } 2649 #ifdef CONFIG_IP_MROUTE 2650 eptr = (struct rtattr*)skb->tail; 2651 #endif 2652 RTA_PUT(skb, RTA_CACHEINFO, sizeof(ci), &ci); 2653 if (rt->fl.iif) { 2654 #ifdef CONFIG_IP_MROUTE 2655 u32 dst = rt->rt_dst; 2656 2657 if (MULTICAST(dst) && !LOCAL_MCAST(dst) && 2658 ipv4_devconf.mc_forwarding) { 2659 int err = ipmr_get_route(skb, r, nowait); 2660 if (err <= 0) { 2661 if (!nowait) { 2662 if (err == 0) 2663 return 0; 2664 goto nlmsg_failure; 2665 } else { 2666 if (err == -EMSGSIZE) 2667 goto nlmsg_failure; 2668 ((struct rta_cacheinfo*)RTA_DATA(eptr))->rta_error = err; 2669 } 2670 } 2671 } else 2672 #endif 2673 RTA_PUT(skb, RTA_IIF, sizeof(int), &rt->fl.iif); 2674 } 2675 2676 nlh->nlmsg_len = skb->tail - b; 2677 return skb->len; 2678 2679 nlmsg_failure: 2680 rtattr_failure: 2681 skb_trim(skb, b - skb->data); 2682 return -1; 2683 } 2684 2685 int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg) 2686 { 2687 struct rtattr **rta = arg; 2688 struct rtmsg *rtm = NLMSG_DATA(nlh); 2689 struct rtable *rt = NULL; 2690 u32 dst = 0; 2691 u32 src = 0; 2692 int iif = 0; 2693 int err = -ENOBUFS; 2694 struct sk_buff *skb; 2695 2696 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 2697 if (!skb) 2698 goto out; 2699 2700 /* Reserve room for dummy headers, this skb can pass 2701 through good chunk of routing engine. 
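	   Only headroom is reserved; no real headers are built, the skb
	   just has to be safe to hand to ip_route_input() below.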
2702 */ 2703 skb->mac.raw = skb->data; 2704 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); 2705 2706 if (rta[RTA_SRC - 1]) 2707 memcpy(&src, RTA_DATA(rta[RTA_SRC - 1]), 4); 2708 if (rta[RTA_DST - 1]) 2709 memcpy(&dst, RTA_DATA(rta[RTA_DST - 1]), 4); 2710 if (rta[RTA_IIF - 1]) 2711 memcpy(&iif, RTA_DATA(rta[RTA_IIF - 1]), sizeof(int)); 2712 2713 if (iif) { 2714 struct net_device *dev = __dev_get_by_index(iif); 2715 err = -ENODEV; 2716 if (!dev) 2717 goto out_free; 2718 skb->protocol = htons(ETH_P_IP); 2719 skb->dev = dev; 2720 local_bh_disable(); 2721 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); 2722 local_bh_enable(); 2723 rt = (struct rtable*)skb->dst; 2724 if (!err && rt->u.dst.error) 2725 err = -rt->u.dst.error; 2726 } else { 2727 struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst, 2728 .saddr = src, 2729 .tos = rtm->rtm_tos } } }; 2730 int oif = 0; 2731 if (rta[RTA_OIF - 1]) 2732 memcpy(&oif, RTA_DATA(rta[RTA_OIF - 1]), sizeof(int)); 2733 fl.oif = oif; 2734 err = ip_route_output_key(&rt, &fl); 2735 } 2736 if (err) 2737 goto out_free; 2738 2739 skb->dst = &rt->u.dst; 2740 if (rtm->rtm_flags & RTM_F_NOTIFY) 2741 rt->rt_flags |= RTCF_NOTIFY; 2742 2743 NETLINK_CB(skb).dst_pid = NETLINK_CB(in_skb).pid; 2744 2745 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, 2746 RTM_NEWROUTE, 0, 0); 2747 if (!err) 2748 goto out_free; 2749 if (err < 0) { 2750 err = -EMSGSIZE; 2751 goto out_free; 2752 } 2753 2754 err = netlink_unicast(rtnl, skb, NETLINK_CB(in_skb).pid, MSG_DONTWAIT); 2755 if (err > 0) 2756 err = 0; 2757 out: return err; 2758 2759 out_free: 2760 kfree_skb(skb); 2761 goto out; 2762 } 2763 2764 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) 2765 { 2766 struct rtable *rt; 2767 int h, s_h; 2768 int idx, s_idx; 2769 2770 s_h = cb->args[0]; 2771 s_idx = idx = cb->args[1]; 2772 for (h = 0; h <= rt_hash_mask; h++) { 2773 if (h < s_h) continue; 2774 if (h > s_h) 2775 s_idx = 0; 2776 rcu_read_lock_bh(); 2777 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt; 2778 rt = rcu_dereference(rt->u.rt_next), idx++) { 2779 if (idx < s_idx) 2780 continue; 2781 skb->dst = dst_clone(&rt->u.dst); 2782 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid, 2783 cb->nlh->nlmsg_seq, RTM_NEWROUTE, 2784 1, NLM_F_MULTI) <= 0) { 2785 dst_release(xchg(&skb->dst, NULL)); 2786 rcu_read_unlock_bh(); 2787 goto done; 2788 } 2789 dst_release(xchg(&skb->dst, NULL)); 2790 } 2791 rcu_read_unlock_bh(); 2792 } 2793 2794 done: 2795 cb->args[0] = h; 2796 cb->args[1] = idx; 2797 return skb->len; 2798 } 2799 2800 void ip_rt_multicast_event(struct in_device *in_dev) 2801 { 2802 rt_cache_flush(0); 2803 } 2804 2805 #ifdef CONFIG_SYSCTL 2806 static int flush_delay; 2807 2808 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write, 2809 struct file *filp, void __user *buffer, 2810 size_t *lenp, loff_t *ppos) 2811 { 2812 if (write) { 2813 proc_dointvec(ctl, write, filp, buffer, lenp, ppos); 2814 rt_cache_flush(flush_delay); 2815 return 0; 2816 } 2817 2818 return -EINVAL; 2819 } 2820 2821 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, 2822 int __user *name, 2823 int nlen, 2824 void __user *oldval, 2825 size_t __user *oldlenp, 2826 void __user *newval, 2827 size_t newlen, 2828 void **context) 2829 { 2830 int delay; 2831 if (newlen != sizeof(int)) 2832 return -EINVAL; 2833 if (get_user(delay, (int __user *)newval)) 2834 return -EFAULT; 2835 rt_cache_flush(delay); 2836 return 0; 2837 } 2838 2839 ctl_table ipv4_route_table[] = { 2840 { 2841 .ctl_name = 
NET_IPV4_ROUTE_FLUSH, 2842 .procname = "flush", 2843 .data = &flush_delay, 2844 .maxlen = sizeof(int), 2845 .mode = 0200, 2846 .proc_handler = &ipv4_sysctl_rtcache_flush, 2847 .strategy = &ipv4_sysctl_rtcache_flush_strategy, 2848 }, 2849 { 2850 .ctl_name = NET_IPV4_ROUTE_MIN_DELAY, 2851 .procname = "min_delay", 2852 .data = &ip_rt_min_delay, 2853 .maxlen = sizeof(int), 2854 .mode = 0644, 2855 .proc_handler = &proc_dointvec_jiffies, 2856 .strategy = &sysctl_jiffies, 2857 }, 2858 { 2859 .ctl_name = NET_IPV4_ROUTE_MAX_DELAY, 2860 .procname = "max_delay", 2861 .data = &ip_rt_max_delay, 2862 .maxlen = sizeof(int), 2863 .mode = 0644, 2864 .proc_handler = &proc_dointvec_jiffies, 2865 .strategy = &sysctl_jiffies, 2866 }, 2867 { 2868 .ctl_name = NET_IPV4_ROUTE_GC_THRESH, 2869 .procname = "gc_thresh", 2870 .data = &ipv4_dst_ops.gc_thresh, 2871 .maxlen = sizeof(int), 2872 .mode = 0644, 2873 .proc_handler = &proc_dointvec, 2874 }, 2875 { 2876 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE, 2877 .procname = "max_size", 2878 .data = &ip_rt_max_size, 2879 .maxlen = sizeof(int), 2880 .mode = 0644, 2881 .proc_handler = &proc_dointvec, 2882 }, 2883 { 2884 /* Deprecated. Use gc_min_interval_ms */ 2885 2886 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL, 2887 .procname = "gc_min_interval", 2888 .data = &ip_rt_gc_min_interval, 2889 .maxlen = sizeof(int), 2890 .mode = 0644, 2891 .proc_handler = &proc_dointvec_jiffies, 2892 .strategy = &sysctl_jiffies, 2893 }, 2894 { 2895 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, 2896 .procname = "gc_min_interval_ms", 2897 .data = &ip_rt_gc_min_interval, 2898 .maxlen = sizeof(int), 2899 .mode = 0644, 2900 .proc_handler = &proc_dointvec_ms_jiffies, 2901 .strategy = &sysctl_ms_jiffies, 2902 }, 2903 { 2904 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT, 2905 .procname = "gc_timeout", 2906 .data = &ip_rt_gc_timeout, 2907 .maxlen = sizeof(int), 2908 .mode = 0644, 2909 .proc_handler = &proc_dointvec_jiffies, 2910 .strategy = &sysctl_jiffies, 2911 }, 2912 { 2913 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL, 2914 .procname = "gc_interval", 2915 .data = &ip_rt_gc_interval, 2916 .maxlen = sizeof(int), 2917 .mode = 0644, 2918 .proc_handler = &proc_dointvec_jiffies, 2919 .strategy = &sysctl_jiffies, 2920 }, 2921 { 2922 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD, 2923 .procname = "redirect_load", 2924 .data = &ip_rt_redirect_load, 2925 .maxlen = sizeof(int), 2926 .mode = 0644, 2927 .proc_handler = &proc_dointvec, 2928 }, 2929 { 2930 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER, 2931 .procname = "redirect_number", 2932 .data = &ip_rt_redirect_number, 2933 .maxlen = sizeof(int), 2934 .mode = 0644, 2935 .proc_handler = &proc_dointvec, 2936 }, 2937 { 2938 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE, 2939 .procname = "redirect_silence", 2940 .data = &ip_rt_redirect_silence, 2941 .maxlen = sizeof(int), 2942 .mode = 0644, 2943 .proc_handler = &proc_dointvec, 2944 }, 2945 { 2946 .ctl_name = NET_IPV4_ROUTE_ERROR_COST, 2947 .procname = "error_cost", 2948 .data = &ip_rt_error_cost, 2949 .maxlen = sizeof(int), 2950 .mode = 0644, 2951 .proc_handler = &proc_dointvec, 2952 }, 2953 { 2954 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST, 2955 .procname = "error_burst", 2956 .data = &ip_rt_error_burst, 2957 .maxlen = sizeof(int), 2958 .mode = 0644, 2959 .proc_handler = &proc_dointvec, 2960 }, 2961 { 2962 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY, 2963 .procname = "gc_elasticity", 2964 .data = &ip_rt_gc_elasticity, 2965 .maxlen = sizeof(int), 2966 .mode = 0644, 2967 .proc_handler = &proc_dointvec, 2968 }, 2969 { 2970 .ctl_name = 
NET_IPV4_ROUTE_MTU_EXPIRES, 2971 .procname = "mtu_expires", 2972 .data = &ip_rt_mtu_expires, 2973 .maxlen = sizeof(int), 2974 .mode = 0644, 2975 .proc_handler = &proc_dointvec_jiffies, 2976 .strategy = &sysctl_jiffies, 2977 }, 2978 { 2979 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU, 2980 .procname = "min_pmtu", 2981 .data = &ip_rt_min_pmtu, 2982 .maxlen = sizeof(int), 2983 .mode = 0644, 2984 .proc_handler = &proc_dointvec, 2985 }, 2986 { 2987 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS, 2988 .procname = "min_adv_mss", 2989 .data = &ip_rt_min_advmss, 2990 .maxlen = sizeof(int), 2991 .mode = 0644, 2992 .proc_handler = &proc_dointvec, 2993 }, 2994 { 2995 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL, 2996 .procname = "secret_interval", 2997 .data = &ip_rt_secret_interval, 2998 .maxlen = sizeof(int), 2999 .mode = 0644, 3000 .proc_handler = &proc_dointvec_jiffies, 3001 .strategy = &sysctl_jiffies, 3002 }, 3003 { .ctl_name = 0 } 3004 }; 3005 #endif 3006 3007 #ifdef CONFIG_NET_CLS_ROUTE 3008 struct ip_rt_acct *ip_rt_acct; 3009 3010 /* This code sucks. But you should have seen it before! --RR */ 3011 3012 /* IP route accounting ptr for this logical cpu number. */ 3013 #define IP_RT_ACCT_CPU(i) (ip_rt_acct + i * 256) 3014 3015 #ifdef CONFIG_PROC_FS 3016 static int ip_rt_acct_read(char *buffer, char **start, off_t offset, 3017 int length, int *eof, void *data) 3018 { 3019 unsigned int i; 3020 3021 if ((offset & 3) || (length & 3)) 3022 return -EIO; 3023 3024 if (offset >= sizeof(struct ip_rt_acct) * 256) { 3025 *eof = 1; 3026 return 0; 3027 } 3028 3029 if (offset + length >= sizeof(struct ip_rt_acct) * 256) { 3030 length = sizeof(struct ip_rt_acct) * 256 - offset; 3031 *eof = 1; 3032 } 3033 3034 offset /= sizeof(u32); 3035 3036 if (length > 0) { 3037 u32 *src = ((u32 *) IP_RT_ACCT_CPU(0)) + offset; 3038 u32 *dst = (u32 *) buffer; 3039 3040 /* Copy first cpu. 
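	   The counters of the remaining cpus are then added into the same
	   buffer below, one 32-bit word at a time.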
*/ 3041 *start = buffer; 3042 memcpy(dst, src, length); 3043 3044 /* Add the other cpus in, one int at a time */ 3045 for_each_cpu(i) { 3046 unsigned int j; 3047 3048 src = ((u32 *) IP_RT_ACCT_CPU(i)) + offset; 3049 3050 for (j = 0; j < length/4; j++) 3051 dst[j] += src[j]; 3052 } 3053 } 3054 return length; 3055 } 3056 #endif /* CONFIG_PROC_FS */ 3057 #endif /* CONFIG_NET_CLS_ROUTE */ 3058 3059 static __initdata unsigned long rhash_entries; 3060 static int __init set_rhash_entries(char *str) 3061 { 3062 if (!str) 3063 return 0; 3064 rhash_entries = simple_strtoul(str, &str, 0); 3065 return 1; 3066 } 3067 __setup("rhash_entries=", set_rhash_entries); 3068 3069 int __init ip_rt_init(void) 3070 { 3071 int i, order, goal, rc = 0; 3072 3073 rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^ 3074 (jiffies ^ (jiffies >> 7))); 3075 3076 #ifdef CONFIG_NET_CLS_ROUTE 3077 for (order = 0; 3078 (PAGE_SIZE << order) < 256 * sizeof(struct ip_rt_acct) * NR_CPUS; order++) 3079 /* NOTHING */; 3080 ip_rt_acct = (struct ip_rt_acct *)__get_free_pages(GFP_KERNEL, order); 3081 if (!ip_rt_acct) 3082 panic("IP: failed to allocate ip_rt_acct\n"); 3083 memset(ip_rt_acct, 0, PAGE_SIZE << order); 3084 #endif 3085 3086 ipv4_dst_ops.kmem_cachep = kmem_cache_create("ip_dst_cache", 3087 sizeof(struct rtable), 3088 0, SLAB_HWCACHE_ALIGN, 3089 NULL, NULL); 3090 3091 if (!ipv4_dst_ops.kmem_cachep) 3092 panic("IP: failed to allocate ip_dst_cache\n"); 3093 3094 goal = num_physpages >> (26 - PAGE_SHIFT); 3095 if (rhash_entries) 3096 goal = (rhash_entries * sizeof(struct rt_hash_bucket)) >> PAGE_SHIFT; 3097 for (order = 0; (1UL << order) < goal; order++) 3098 /* NOTHING */; 3099 3100 do { 3101 rt_hash_mask = (1UL << order) * PAGE_SIZE / 3102 sizeof(struct rt_hash_bucket); 3103 while (rt_hash_mask & (rt_hash_mask - 1)) 3104 rt_hash_mask--; 3105 rt_hash_table = (struct rt_hash_bucket *) 3106 __get_free_pages(GFP_ATOMIC, order); 3107 } while (rt_hash_table == NULL && --order > 0); 3108 3109 if (!rt_hash_table) 3110 panic("Failed to allocate IP route cache hash table\n"); 3111 3112 printk(KERN_INFO "IP: routing cache hash table of %u buckets, %ldKbytes\n", 3113 rt_hash_mask, 3114 (long) (rt_hash_mask * sizeof(struct rt_hash_bucket)) / 1024); 3115 3116 for (rt_hash_log = 0; (1 << rt_hash_log) != rt_hash_mask; rt_hash_log++) 3117 /* NOTHING */; 3118 3119 rt_hash_mask--; 3120 for (i = 0; i <= rt_hash_mask; i++) { 3121 spin_lock_init(&rt_hash_table[i].lock); 3122 rt_hash_table[i].chain = NULL; 3123 } 3124 3125 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1); 3126 ip_rt_max_size = (rt_hash_mask + 1) * 16; 3127 3128 rt_cache_stat = alloc_percpu(struct rt_cache_stat); 3129 if (!rt_cache_stat) 3130 return -ENOMEM; 3131 3132 devinet_init(); 3133 ip_fib_init(); 3134 3135 init_timer(&rt_flush_timer); 3136 rt_flush_timer.function = rt_run_flush; 3137 init_timer(&rt_periodic_timer); 3138 rt_periodic_timer.function = rt_check_expire; 3139 init_timer(&rt_secret_timer); 3140 rt_secret_timer.function = rt_secret_rebuild; 3141 3142 /* All the timers, started at system startup tend 3143 to synchronize. Perturb it a bit. 
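	   Each expiry below is set to one full interval plus a random
	   fraction of that interval, so the periodic GC and the secret
	   rebuild do not keep firing together.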
3144 */ 3145 rt_periodic_timer.expires = jiffies + net_random() % ip_rt_gc_interval + 3146 ip_rt_gc_interval; 3147 add_timer(&rt_periodic_timer); 3148 3149 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval + 3150 ip_rt_secret_interval; 3151 add_timer(&rt_secret_timer); 3152 3153 #ifdef CONFIG_PROC_FS 3154 { 3155 struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */ 3156 if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) || 3157 !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO, 3158 proc_net_stat))) { 3159 free_percpu(rt_cache_stat); 3160 return -ENOMEM; 3161 } 3162 rtstat_pde->proc_fops = &rt_cpu_seq_fops; 3163 } 3164 #ifdef CONFIG_NET_CLS_ROUTE 3165 create_proc_read_entry("rt_acct", 0, proc_net, ip_rt_acct_read, NULL); 3166 #endif 3167 #endif 3168 #ifdef CONFIG_XFRM 3169 xfrm_init(); 3170 xfrm4_init(); 3171 #endif 3172 return rc; 3173 } 3174 3175 EXPORT_SYMBOL(__ip_select_ident); 3176 EXPORT_SYMBOL(ip_route_input); 3177 EXPORT_SYMBOL(ip_route_output_key); 3178
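/*
 * Illustrative sketch only (not built): the typical calling pattern for
 * the output routing API exported above.  example_output_route() is a
 * made-up name used purely for illustration; the flowi setup and error
 * handling mirror the code earlier in this file, and ip_rt_put() is the
 * release helper from <net/route.h>.
 */
#if 0
static int example_output_route(u32 daddr, u32 saddr, u8 tos)
{
	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = daddr,
						 .saddr = saddr,
						 .tos = tos } } };
	struct rtable *rt;
	int err;

	/* consults the route cache first, then ip_route_output_slow() */
	err = ip_route_output_key(&rt, &fl);
	if (err)
		return err;

	/* rt->u.dst.dev is the egress device, rt->rt_gateway the next hop */
	ip_rt_put(rt);		/* drop the reference the lookup took */
	return 0;
}
#endif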