/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD;
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#include <linux/module.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/jhash.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/atmclip.h>
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
	((u32)(oldflp4->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;
static int rt_chain_length_max __read_mostly	= 20;
static int redirect_genid;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static void		 ipv4_dst_destroy(struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
static int rt_garbage_collect(struct dst_ops *ops);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;
	u32 *p = NULL;

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);

	peer = rt->peer;
	if (peer) {
		u32 *old_p = __DST_METRICS_PTR(old);
		unsigned long prev, new;

		p = peer->metrics;
		if (inet_metrics_new(peer))
			memcpy(p, old_p, sizeof(u32) * RTAX_MAX);

		new = (unsigned long) p;
		prev = cmpxchg(&dst->_metrics, old, new);

		if (prev != old) {
			p = __DST_METRICS_PTR(prev);
			if (prev & DST_METRICS_READ_ONLY)
				p = NULL;
		} else {
			if (rt->fi) {
				fib_info_put(rt->fi);
				rt->fi = NULL;
			}
		}
	}
	return p;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.protocol =		cpu_to_be16(ETH_P_IP),
	.gc =			rt_garbage_collect,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.ifdown =		ipv4_dst_ifdown,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};


/*
 * Route cache.
 */

/* The locking scheme is rather straightforward:
 *
 * 1) Read-Copy Update protects the buckets of the central route hash.
 * 2) Only writers remove entries, and they hold the lock
 *    as they look at rtable reference counts.
 * 3) Only readers acquire references to rtable entries,
 *    they do so with atomic increments and with the
 *    lock held.
 */

struct rt_hash_bucket {
	struct rtable __rcu	*chain;
};

#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Instead of using one spinlock for each rt_hash_bucket, we use a table
 * of spinlocks. The size of this table is a power of two and depends on
 * the number of CPUs.
 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
				GFP_KERNEL);
	if (!rt_hash_locks)
		panic("IP: failed to allocate rt_hash_locks\n");

	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
# define rt_hash_lock_addr(slot) NULL

static inline void rt_hash_lock_init(void)
{
}
#endif
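/*
 * Illustrative example (a sketch, not part of the original code): a writer
 * touching bucket 'hash' takes the stripe lock covering that bucket, while
 * readers walk the chain under RCU only.  With RT_HASH_LOCK_SZ == 256,
 * buckets 0, 256, 512, ... all map to the same spinlock:
 *
 *	spin_lock_bh(rt_hash_lock_addr(hash));
 *	... unlink/insert entries in rt_hash_table[hash].chain ...
 *	spin_unlock_bh(rt_hash_lock_addr(hash));
 *
 * Striping bounds the memory spent on locks while keeping contention low:
 * two writers collide only when their buckets share a stripe.
 */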
static struct rt_hash_bucket 	*rt_hash_table __read_mostly;
static unsigned			rt_hash_mask __read_mostly;
static unsigned int		rt_hash_log  __read_mostly;

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
				   int genid)
{
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, genid)
		& rt_hash_mask;
}

static inline int rt_genid(struct net *net)
{
	return atomic_read(&net->ipv4.rt_genid);
}

#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
	struct seq_net_private p;
	int bucket;
	int genid;
};

static struct rtable *rt_cache_get_first(struct seq_file *seq)
{
	struct rt_cache_iter_state *st = seq->private;
	struct rtable *r = NULL;

	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
		if (!rcu_access_pointer(rt_hash_table[st->bucket].chain))
			continue;
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
		while (r) {
			if (dev_net(r->dst.dev) == seq_file_net(seq) &&
			    r->rt_genid == st->genid)
				return r;
			r = rcu_dereference_bh(r->dst.rt_next);
		}
		rcu_read_unlock_bh();
	}
	return r;
}

static struct rtable *__rt_cache_get_next(struct seq_file *seq,
					  struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;

	r = rcu_dereference_bh(r->dst.rt_next);
	while (!r) {
		rcu_read_unlock_bh();
		do {
			if (--st->bucket < 0)
				return NULL;
		} while (!rcu_access_pointer(rt_hash_table[st->bucket].chain));
		rcu_read_lock_bh();
		r = rcu_dereference_bh(rt_hash_table[st->bucket].chain);
	}
	return r;
}

static struct rtable *rt_cache_get_next(struct seq_file *seq,
					struct rtable *r)
{
	struct rt_cache_iter_state *st = seq->private;
	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
		if (dev_net(r->dst.dev) != seq_file_net(seq))
			continue;
		if (r->rt_genid == st->genid)
			break;
	}
	return r;
}

static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
{
	struct rtable *r = rt_cache_get_first(seq);

	if (r)
		while (pos && (r = rt_cache_get_next(seq, r)))
			--pos;
	return pos ? NULL : r;
}

static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct rt_cache_iter_state *st = seq->private;
	if (*pos)
		return rt_cache_get_idx(seq, *pos - 1);
	st->genid = rt_genid(seq_file_net(seq));
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct rtable *r;

	if (v == SEQ_START_TOKEN)
		r = rt_cache_get_first(seq);
	else
		r = rt_cache_get_next(seq, v);
	++*pos;
	return r;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
	if (v && v != SEQ_START_TOKEN)
		rcu_read_unlock_bh();
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	else {
		struct rtable *r = v;
		struct neighbour *n;
		int len, HHUptod;

		rcu_read_lock();
		n = dst_get_neighbour(&r->dst);
		HHUptod = (n && (n->nud_state & NUD_CONNECTED)) ? 1 : 0;
		rcu_read_unlock();

		seq_printf(seq, "%s\t%08X\t%08X\t%8X\t%d\t%u\t%d\t"
			      "%08X\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
			r->dst.dev ? r->dst.dev->name : "*",
			(__force u32)r->rt_dst,
			(__force u32)r->rt_gateway,
			r->rt_flags, atomic_read(&r->dst.__refcnt),
			r->dst.__use, 0, (__force u32)r->rt_src,
			dst_metric_advmss(&r->dst) + 40,
			dst_metric(&r->dst, RTAX_WINDOW),
			(int)((dst_metric(&r->dst, RTAX_RTT) >> 3) +
			      dst_metric(&r->dst, RTAX_RTTVAR)),
			r->rt_key_tos,
			-1,
			HHUptod,
			r->rt_spec_dst, &len);

		seq_printf(seq, "%*s\n", 127 - len, "");
	}
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &rt_cache_seq_ops,
			    sizeof(struct rt_cache_iter_state));
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release_net,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	  = THIS_MODULE,
	.open	  = rt_acct_proc_open,
	.read	  = seq_read,
	.llseek	  = seq_lseek,
	.release  = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
				   &rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =	{
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline void rt_free(struct rtable *rt)
{
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}
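/*
 * Illustrative note (a sketch, not part of the original code): readers
 * traverse hash chains under rcu_read_lock_bh() only, so a removed entry
 * must not be freed until every such reader has finished.  rt_free() and
 * rt_drop() therefore defer the actual free through call_rcu_bh():
 *
 *	unlink the entry from its chain (under the bucket stripe lock);
 *	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
 *
 * dst_rcu_free() then runs only after a BH grace period has elapsed.
 */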
static inline void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
}

static inline int rt_fast_clean(struct rtable *rth)
{
	/* Kill broadcast/multicast entries very aggressively, if they
	   collide in hash table with more useful entries */
	return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
		rt_is_input_route(rth) && rth->dst.rt_next;
}

static inline int rt_valuable(struct rtable *rth)
{
	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
		(rth->peer && rth->peer->pmtu_expires);
}

static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
{
	unsigned long age;
	int ret = 0;

	if (atomic_read(&rth->dst.__refcnt))
		goto out;

	age = jiffies - rth->dst.lastuse;
	if ((age <= tmo1 && !rt_fast_clean(rth)) ||
	    (age <= tmo2 && rt_valuable(rth)))
		goto out;
	ret = 1;
out:	return ret;
}

/* Bits of score are:
 * 31: very valuable
 * 30: not quite useless
 * 29..0: usage counter
 */
static inline u32 rt_score(struct rtable *rt)
{
	u32 score = jiffies - rt->dst.lastuse;

	score = ~score & ~(3<<30);

	if (rt_valuable(rt))
		score |= (1<<31);

	if (rt_is_output_route(rt) ||
	    !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
		score |= (1<<30);

	return score;
}

static inline bool rt_caching(const struct net *net)
{
	return net->ipv4.current_rt_cache_rebuild_count <=
		net->ipv4.sysctl_rt_cache_rebuild_count;
}

static inline bool compare_hash_inputs(const struct rtable *rt1,
				       const struct rtable *rt2)
{
	return ((((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif)) == 0);
}

static inline int compare_keys(struct rtable *rt1, struct rtable *rt2)
{
	return (((__force u32)rt1->rt_key_dst ^ (__force u32)rt2->rt_key_dst) |
		((__force u32)rt1->rt_key_src ^ (__force u32)rt2->rt_key_src) |
		(rt1->rt_mark ^ rt2->rt_mark) |
		(rt1->rt_key_tos ^ rt2->rt_key_tos) |
		(rt1->rt_route_iif ^ rt2->rt_route_iif) |
		(rt1->rt_oif ^ rt2->rt_oif)) == 0;
}

static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
{
	return net_eq(dev_net(rt1->dst.dev), dev_net(rt2->dst.dev));
}

static inline int rt_is_expired(struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}

/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
static void rt_do_flush(struct net *net, int process_context)
{
	unsigned int i;
	struct rtable *rth, *next;

	for (i = 0; i <= rt_hash_mask; i++) {
		struct rtable __rcu **pprev;
		struct rtable *list;

		if (process_context && need_resched())
			cond_resched();
		rth = rcu_access_pointer(rt_hash_table[i].chain);
		if (!rth)
			continue;

		spin_lock_bh(rt_hash_lock_addr(i));

		list = NULL;
		pprev = &rt_hash_table[i].chain;
		rth = rcu_dereference_protected(*pprev,
			lockdep_is_held(rt_hash_lock_addr(i)));

		while (rth) {
			next = rcu_dereference_protected(rth->dst.rt_next,
				lockdep_is_held(rt_hash_lock_addr(i)));

			if (!net ||
			    net_eq(dev_net(rth->dst.dev), net)) {
				rcu_assign_pointer(*pprev, next);
				rcu_assign_pointer(rth->dst.rt_next, list);
				list = rth;
			} else {
				pprev = &rth->dst.rt_next;
			}
			rth = next;
		}

		spin_unlock_bh(rt_hash_lock_addr(i));

		for (; list; list = next) {
			next = rcu_dereference_protected(list->dst.rt_next, 1);
			rt_free(list);
		}
	}
}

/*
 * While freeing expired entries, we compute average chain length
 * and standard deviation, using fixed-point arithmetic.
 * This is to get an estimate for rt_chain_length_max:
 *	rt_chain_length_max = max(elasticity, AVG + 4*SD)
 * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
 */

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)

/*
 * Given a hash chain and an item in this hash chain,
 * find if a previous entry has the same hash_inputs
 * (but differs on tos, mark or oif).
 * Returns 0 if an alias is found.
 * Returns ONE if rth has no alias before itself.
 */
static int has_noalias(const struct rtable *head, const struct rtable *rth)
{
	const struct rtable *aux = head;

	while (aux != rth) {
		if (compare_hash_inputs(aux, rth))
			return 0;
		aux = rcu_dereference_protected(aux->dst.rt_next, 1);
	}
	return ONE;
}

/*
 * Perturbation of rt_genid by a small quantity [1..256].
 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
 * many times (2^24) without returning a recent rt_genid.
 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
 */
static void rt_cache_invalidate(struct net *net)
{
	unsigned char shuffle;

	get_random_bytes(&shuffle, sizeof(shuffle));
	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
	redirect_genid++;
}

/*
 * delay < 0  : invalidate cache (fast: entries will be deleted later)
 * delay >= 0 : invalidate & flush cache (can be long)
 */
void rt_cache_flush(struct net *net, int delay)
{
	rt_cache_invalidate(net);
	if (delay >= 0)
		rt_do_flush(net, !in_softirq());
}

/* Flush previously cache-invalidated entries from the cache */
void rt_cache_flush_batch(struct net *net)
{
	rt_do_flush(net, !in_softirq());
}

static void rt_emergency_hash_rebuild(struct net *net)
{
	if (net_ratelimit())
		printk(KERN_WARNING "Route hash chain too long!\n");
	rt_cache_invalidate(net);
}
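/*
 * Illustrative sketch (not part of the original code): invalidation is
 * lazy because every cached entry records the generation it was created
 * in:
 *
 *	rth->rt_genid = rt_genid(net);		at insertion time
 *	if (rt_is_expired(rth)) ...		skip/free at lookup or GC
 *
 * so rt_cache_invalidate() only needs to bump net->ipv4.rt_genid; stale
 * entries are then reaped opportunistically by lookups, the garbage
 * collector, and rt_do_flush().
 */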
/*
 * Short description of GC goals.
 *
 * We want to build an algorithm which will keep the routing cache
 * at some equilibrium point, where the number of aged-off entries
 * is kept approximately equal to newly generated ones.
 *
 * Current expiration strength is the variable "expire".
 * We try to adjust it dynamically, so that when the network is idle
 * expire is large enough to keep enough warm entries, and when load
 * increases it shrinks to limit the cache size.
 */

static int rt_garbage_collect(struct dst_ops *ops)
{
	static unsigned long expire = RT_GC_TIMEOUT;
	static unsigned long last_gc;
	static int rover;
	static int equilibrium;
	struct rtable *rth;
	struct rtable __rcu **rthp;
	unsigned long now = jiffies;
	int goal;
	int entries = dst_entries_get_fast(&ipv4_dst_ops);

	/*
	 * Garbage collection is pretty expensive,
	 * so do not run it too frequently.
	 */

	RT_CACHE_STAT_INC(gc_total);

	if (now - last_gc < ip_rt_gc_min_interval &&
	    entries < ip_rt_max_size) {
		RT_CACHE_STAT_INC(gc_ignored);
		goto out;
	}

	entries = dst_entries_get_slow(&ipv4_dst_ops);
	/* Calculate number of entries, which we want to expire now. */
	goal = entries - (ip_rt_gc_elasticity << rt_hash_log);
	if (goal <= 0) {
		if (equilibrium < ipv4_dst_ops.gc_thresh)
			equilibrium = ipv4_dst_ops.gc_thresh;
		goal = entries - equilibrium;
		if (goal > 0) {
			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
			goal = entries - equilibrium;
		}
	} else {
		/* We are in dangerous area. Try to reduce cache really
		 * aggressively.
		 */
		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
		equilibrium = entries - goal;
	}

	if (now - last_gc >= ip_rt_gc_min_interval)
		last_gc = now;

	if (goal <= 0) {
		equilibrium += goal;
		goto work_done;
	}

	do {
		int i, k;

		for (i = rt_hash_mask, k = rover; i >= 0; i--) {
			unsigned long tmo = expire;

			k = (k + 1) & rt_hash_mask;
			rthp = &rt_hash_table[k].chain;
			spin_lock_bh(rt_hash_lock_addr(k));
			while ((rth = rcu_dereference_protected(*rthp,
					lockdep_is_held(rt_hash_lock_addr(k)))) != NULL) {
				if (!rt_is_expired(rth) &&
					!rt_may_expire(rth, tmo, expire)) {
					tmo >>= 1;
					rthp = &rth->dst.rt_next;
					continue;
				}
				*rthp = rth->dst.rt_next;
				rt_free(rth);
				goal--;
			}
			spin_unlock_bh(rt_hash_lock_addr(k));
			if (goal <= 0)
				break;
		}
		rover = k;

		if (goal <= 0)
			goto work_done;

		/* Goal is not achieved. We stop the process if:
		 *
		 * - expire has been reduced to zero, otherwise expire is halved.
		 * - the table is not full.
		 * - we are called from interrupt context.
		 * - the jiffies check is just a fallback/debug loop breaker.
		 *   We will not spin here for a long time in any case.
		 */

		RT_CACHE_STAT_INC(gc_goal_miss);

		if (expire == 0)
			break;

		expire >>= 1;

		if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
			goto out;
	} while (!in_softirq() && time_before_eq(jiffies, now));

	if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size)
		goto out;
	if (net_ratelimit())
		printk(KERN_WARNING "dst cache overflow\n");
	RT_CACHE_STAT_INC(gc_dst_overflow);
	return 1;

work_done:
	expire += ip_rt_gc_min_interval;
	if (expire > ip_rt_gc_timeout ||
	    dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh ||
	    dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh)
		expire = ip_rt_gc_timeout;
out:	return 0;
}

/*
 * Returns number of entries in a hash chain that have different hash_inputs
 */
static int slow_chain_length(const struct rtable *head)
{
	int length = 0;
	const struct rtable *rth = head;

	while (rth) {
		length += has_noalias(head, rth);
		rth = rcu_dereference_protected(rth->dst.rt_next, 1);
	}
	return length >> FRACT_BITS;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst, const void *daddr)
{
	struct neigh_table *tbl = &arp_tbl;
	static const __be32 inaddr_any = 0;
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	struct neighbour *n;

#if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE)
	if (dev->type == ARPHRD_ATM)
		tbl = clip_tbl_hook;
#endif
	if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))
		pkey = &inaddr_any;

	n = __ipv4_neigh_lookup(tbl, dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(tbl, pkey, dev);
}

static int rt_bind_neighbour(struct rtable *rt)
{
	struct neighbour *n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n))
		return PTR_ERR(n);
	dst_set_neighbour(&rt->dst, n);

	return 0;
}
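/*
 * Illustrative note (a sketch, not part of the original code): a cached
 * route resolves its next hop once and keeps the result.  rt_gateway
 * holds the gateway address for routes via a gateway and the destination
 * itself for directly connected ones, so
 *
 *	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
 *	dst_set_neighbour(&rt->dst, n);
 *
 * lets packet transmission go straight to the ARP entry without a
 * per-packet neighbour table lookup.
 */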
static struct rtable *rt_intern_hash(unsigned hash, struct rtable *rt,
				     struct sk_buff *skb, int ifindex)
{
	struct rtable	*rth, *cand;
	struct rtable __rcu **rthp, **candp;
	unsigned long	now;
	u32 		min_score;
	int		chain_length;
	int attempts = !in_softirq();

restart:
	chain_length = 0;
	min_score = ~(u32)0;
	cand = NULL;
	candp = NULL;
	now = jiffies;

	if (!rt_caching(dev_net(rt->dst.dev))) {
		/*
		 * If we're not caching, just tell the caller we
		 * were successful and don't touch the route.  The
		 * caller holds the sole reference to the cache entry, and
		 * it will be released when the caller is done with it.
		 * If we drop it here, the callers have no way to resolve routes
		 * when we're not caching.  Instead, just point *rp at rt, so
		 * the caller gets a single use out of the route.
		 * Note that we do rt_free on this new route entry, so that
		 * once its refcount hits zero, we are still able to reap it
		 * (Thanks Alexey)
		 * Note: To avoid expensive rcu stuff for this uncached dst,
		 * we set DST_NOCACHE so that dst_release() can free dst without
		 * waiting a grace period.
		 */

		rt->dst.flags |= DST_NOCACHE;
		if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
			int err = rt_bind_neighbour(rt);
			if (err) {
				if (net_ratelimit())
					printk(KERN_WARNING
					    "Neighbour table failure & not caching routes.\n");
				ip_rt_put(rt);
				return ERR_PTR(err);
			}
		}

		goto skip_hashing;
	}

	rthp = &rt_hash_table[hash].chain;

	spin_lock_bh(rt_hash_lock_addr(hash));
	while ((rth = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (rt_is_expired(rth)) {
			*rthp = rth->dst.rt_next;
			rt_free(rth);
			continue;
		}
		if (compare_keys(rth, rt) && compare_netns(rth, rt)) {
			/* Put it first */
			*rthp = rth->dst.rt_next;
			/*
			 * Since lookup is lockfree, the deletion
			 * must be visible to another weakly ordered CPU before
			 * the insertion at the start of the hash chain.
			 */
			rcu_assign_pointer(rth->dst.rt_next,
					   rt_hash_table[hash].chain);
			/*
			 * Since lookup is lockfree, the update writes
			 * must be ordered for consistency on SMP.
			 */
			rcu_assign_pointer(rt_hash_table[hash].chain, rth);

			dst_use(&rth->dst, now);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			rt_drop(rt);
			if (skb)
				skb_dst_set(skb, &rth->dst);
			return rth;
		}

		if (!atomic_read(&rth->dst.__refcnt)) {
			u32 score = rt_score(rth);

			if (score <= min_score) {
				cand = rth;
				candp = rthp;
				min_score = score;
			}
		}

		chain_length++;

		rthp = &rth->dst.rt_next;
	}

	if (cand) {
		/* ip_rt_gc_elasticity used to be the average chain length;
		 * once it is exceeded, gc becomes really aggressive.
		 *
		 * The second limit is less certain. At the moment it allows
		 * only 2 entries per bucket. We will see.
		 */
		if (chain_length > ip_rt_gc_elasticity) {
			*candp = cand->dst.rt_next;
			rt_free(cand);
		}
	} else {
		if (chain_length > rt_chain_length_max &&
		    slow_chain_length(rt_hash_table[hash].chain) > rt_chain_length_max) {
			struct net *net = dev_net(rt->dst.dev);
			int num = ++net->ipv4.current_rt_cache_rebuild_count;
			if (!rt_caching(net)) {
				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
					rt->dst.dev->name, num);
			}
			rt_emergency_hash_rebuild(net);
			spin_unlock_bh(rt_hash_lock_addr(hash));

			hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
					ifindex, rt_genid(net));
			goto restart;
		}
	}

	/* Try to bind route to arp only if it is an output
	 * route or a unicast forwarding path.
	 */
	if (rt->rt_type == RTN_UNICAST || rt_is_output_route(rt)) {
		int err = rt_bind_neighbour(rt);
		if (err) {
			spin_unlock_bh(rt_hash_lock_addr(hash));

			if (err != -ENOBUFS) {
				rt_drop(rt);
				return ERR_PTR(err);
			}

			/* Neighbour tables are full and nothing
			 * can be released. Try to shrink the route cache;
			 * it most likely holds some neighbour records.
			 */
			if (attempts-- > 0) {
				int saved_elasticity = ip_rt_gc_elasticity;
				int saved_int = ip_rt_gc_min_interval;
				ip_rt_gc_elasticity	= 1;
				ip_rt_gc_min_interval	= 0;
				rt_garbage_collect(&ipv4_dst_ops);
				ip_rt_gc_min_interval	= saved_int;
				ip_rt_gc_elasticity	= saved_elasticity;
				goto restart;
			}

			if (net_ratelimit())
				printk(KERN_WARNING "ipv4: Neighbour table overflow.\n");
			rt_drop(rt);
			return ERR_PTR(-ENOBUFS);
		}
	}

	rt->dst.rt_next = rt_hash_table[hash].chain;

	/*
	 * Since lookup is lockfree, we must make sure
	 * previous writes to rt are committed to memory
	 * before making rt visible to other CPUS.
	 */
	rcu_assign_pointer(rt_hash_table[hash].chain, rt);

	spin_unlock_bh(rt_hash_lock_addr(hash));

skip_hashing:
	if (skb)
		skb_dst_set(skb, &rt->dst);
	return rt;
}

static atomic_t __rt_peer_genid = ATOMIC_INIT(0);

static u32 rt_peer_genid(void)
{
	return atomic_read(&__rt_peer_genid);
}

void rt_bind_peer(struct rtable *rt, __be32 daddr, int create)
{
	struct inet_peer *peer;

	peer = inet_getpeer_v4(daddr, create);

	if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL)
		inet_putpeer(peer);
	else
		rt->rt_peer_genid = rt_peer_genid();
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance to
 * select an ID that is unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}
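/*
 * Illustrative sketch (not part of the original code): IP ID selection
 * prefers the per-destination counter kept in the inet_peer entry and
 * only falls back to the hashed global generator above when no peer is
 * available:
 *
 *	if (rt->peer)
 *		iph->id = htons(inet_getid(rt->peer, more));
 *	else
 *		ip_select_fb_ident(iph);
 *
 * A per-peer counter keeps IDs unique per destination for far longer
 * than a single shared counter would.
 */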
void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt) {
		if (rt->peer == NULL)
			rt_bind_peer(rt, rt->rt_dst, 1);

		/* If a peer is attached to the destination, it is never
		 * detached, so we need not grab a lock to dereference it.
		 */
		if (rt->peer) {
			iph->id = htons(inet_getid(rt->peer, more));
			return;
		}
	} else
		printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
		       __builtin_return_address(0));

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);

static void rt_del(unsigned hash, struct rtable *rt)
{
	struct rtable __rcu **rthp;
	struct rtable *aux;

	rthp = &rt_hash_table[hash].chain;
	spin_lock_bh(rt_hash_lock_addr(hash));
	ip_rt_put(rt);
	while ((aux = rcu_dereference_protected(*rthp,
			lockdep_is_held(rt_hash_lock_addr(hash)))) != NULL) {
		if (aux == rt || rt_is_expired(aux)) {
			*rthp = aux->dst.rt_next;
			rt_free(aux);
			continue;
		}
		rthp = &aux->dst.rt_next;
	}
	spin_unlock_bh(rt_hash_lock_addr(hash));
}

static int check_peer_redir(struct dst_entry *dst, struct inet_peer *peer)
{
	struct rtable *rt = (struct rtable *) dst;
	__be32 orig_gw = rt->rt_gateway;
	struct neighbour *n, *old_n;

	dst_confirm(&rt->dst);

	rt->rt_gateway = peer->redirect_learned.a4;

	n = ipv4_neigh_lookup(&rt->dst, &rt->rt_gateway);
	if (IS_ERR(n))
		return PTR_ERR(n);
	old_n = xchg(&rt->dst._neighbour, n);
	if (old_n)
		neigh_release(old_n);
	if (!n || !(n->nud_state & NUD_VALID)) {
		if (n)
			neigh_event_send(n, NULL);
		rt->rt_gateway = orig_gw;
		return -EAGAIN;
	} else {
		rt->rt_flags |= RTCF_REDIRECTED;
		call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
	}
	return 0;
}

/* called in rcu_read_lock() section */
void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
		    __be32 saddr, struct net_device *dev)
{
	int s, i;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	__be32 skeys[2] = { saddr, 0 };
	int    ikeys[2] = { dev->ifindex, 0 };
	struct inet_peer *peer;
	struct net *net;

	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	for (s = 0; s < 2; s++) {
		for (i = 0; i < 2; i++) {
			unsigned int hash;
			struct rtable __rcu **rthp;
			struct rtable *rt;

			hash = rt_hash(daddr, skeys[s], ikeys[i], rt_genid(net));

			rthp = &rt_hash_table[hash].chain;

			while ((rt = rcu_dereference(*rthp)) != NULL) {
				rthp = &rt->dst.rt_next;

				if (rt->rt_key_dst != daddr ||
				    rt->rt_key_src != skeys[s] ||
				    rt->rt_oif != ikeys[i] ||
				    rt_is_input_route(rt) ||
				    rt_is_expired(rt) ||
				    !net_eq(dev_net(rt->dst.dev), net) ||
				    rt->dst.error ||
				    rt->dst.dev != dev ||
				    rt->rt_gateway != old_gw)
					continue;

				if (!rt->peer)
					rt_bind_peer(rt, rt->rt_dst, 1);

				peer = rt->peer;
				if (peer) {
					if (peer->redirect_learned.a4 != new_gw ||
					    peer->redirect_genid != redirect_genid) {
						peer->redirect_learned.a4 = new_gw;
						peer->redirect_genid = redirect_genid;
						atomic_inc(&__rt_peer_genid);
					}
					check_peer_redir(&rt->dst, peer);
				}
			}
		}
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
			"  Advised path = %pI4 -> %pI4\n",
		       &old_gw, dev->name, &new_gw,
		       &saddr, &daddr);
#endif
	;
}

static bool peer_pmtu_expired(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       time_after_eq(jiffies, orig) &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

static bool peer_pmtu_cleaned(struct inet_peer *peer)
{
	unsigned long orig = ACCESS_ONCE(peer->pmtu_expires);

	return orig &&
	       cmpxchg(&peer->pmtu_expires, orig, 0) == orig;
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if (rt->rt_flags & RTCF_REDIRECTED) {
			unsigned hash = rt_hash(rt->rt_key_dst, rt->rt_key_src,
						rt->rt_oif,
						rt_genid(dev_net(dst->dev)));
			rt_del(hash, rt);
			ret = NULL;
		} else if (rt->peer && peer_pmtu_expired(rt->peer)) {
			dst_metric_set(dst, RTAX_MTU, rt->peer->pmtu_orig);
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything;
	 * set rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		return;
	}
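	/*
	 * Illustrative note (not part of the original code): the backoff
	 * below is exponential in the number of redirects already sent.
	 * The next redirect is allowed only after
	 *
	 *	rate_last + (ip_rt_redirect_load << rate_tokens)
	 *
	 * so with the default HZ/50 load the gap doubles with every
	 * redirect sent (40 ms, 80 ms, 160 ms, ...), until the budget of
	 * ip_rt_redirect_number redirects is exhausted.
	 */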
	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number &&
		    net_ratelimit())
			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
			       &ip_hdr(skb)->saddr, rt->rt_iif,
			       &rt->rt_dst, &rt->rt_gateway);
#endif
	}
}

static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	bool send;
	int code;

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(dev_net(rt->dst.dev),
				IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}

/*
 * The last two values are not from the RFC but
 * are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };

static inline unsigned short guess_mtu(unsigned short old_mtu)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
		if (old_mtu > mtu_plateau[i])
			return mtu_plateau[i];
	return 68;
}

unsigned short ip_rt_frag_needed(struct net *net, const struct iphdr *iph,
				 unsigned short new_mtu,
				 struct net_device *dev)
{
	unsigned short old_mtu = ntohs(iph->tot_len);
	unsigned short est_mtu = 0;
	struct inet_peer *peer;

	peer = inet_getpeer_v4(iph->daddr, 1);
	if (peer) {
		unsigned short mtu = new_mtu;

		if (new_mtu < 68 || new_mtu >= old_mtu) {
			/* BSD 4.2 derived systems incorrectly adjust
			 * tot_len by the IP header length, and report
			 * a zero MTU in the ICMP message.
			 */
			if (mtu == 0 &&
			    old_mtu >= 68 + (iph->ihl << 2))
				old_mtu -= iph->ihl << 2;
			mtu = guess_mtu(old_mtu);
		}

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!peer->pmtu_expires || mtu < peer->pmtu_learned) {
			unsigned long pmtu_expires;

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			est_mtu = mtu;
			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;
			atomic_inc(&__rt_peer_genid);
		}

		inet_putpeer(peer);
	}
	return est_mtu ? : new_mtu;
}
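/*
 * Worked example (illustrative, not part of the original code):
 * guess_mtu() walks the plateau table for the first value strictly
 * below the failed MTU, in the spirit of RFC 1191.  A broken "frag
 * needed" reporting mtu == 0 for a 1500-byte packet yields
 * guess_mtu(1500) == 1492, the next plateau down; anything smaller
 * than all plateaus falls back to 68, the minimal IPv4 MTU.
 */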
static void check_peer_pmtu(struct dst_entry *dst, struct inet_peer *peer)
{
	unsigned long expires = ACCESS_ONCE(peer->pmtu_expires);

	if (!expires)
		return;
	if (time_before(jiffies, expires)) {
		u32 orig_dst_mtu = dst_mtu(dst);
		if (peer->pmtu_learned < orig_dst_mtu) {
			if (!peer->pmtu_orig)
				peer->pmtu_orig = dst_metric_raw(dst, RTAX_MTU);
			dst_metric_set(dst, RTAX_MTU, peer->pmtu_learned);
		}
	} else if (cmpxchg(&peer->pmtu_expires, expires, 0) == expires)
		dst_metric_set(dst, RTAX_MTU, peer->pmtu_orig);
}

static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer;

	dst_confirm(dst);

	if (!rt->peer)
		rt_bind_peer(rt, rt->rt_dst, 1);
	peer = rt->peer;
	if (peer) {
		unsigned long pmtu_expires = ACCESS_ONCE(peer->pmtu_expires);

		if (mtu < ip_rt_min_pmtu)
			mtu = ip_rt_min_pmtu;
		if (!pmtu_expires || mtu < peer->pmtu_learned) {

			pmtu_expires = jiffies + ip_rt_mtu_expires;
			if (!pmtu_expires)
				pmtu_expires = 1UL;

			peer->pmtu_learned = mtu;
			peer->pmtu_expires = pmtu_expires;

			atomic_inc(&__rt_peer_genid);
			rt->rt_peer_genid = rt_peer_genid();
		}
		check_peer_pmtu(dst, peer);
	}
}


static struct rtable *ipv4_validate_peer(struct rtable *rt)
{
	if (rt->rt_peer_genid != rt_peer_genid()) {
		struct inet_peer *peer;

		if (!rt->peer)
			rt_bind_peer(rt, rt->rt_dst, 0);

		peer = rt->peer;
		if (peer) {
			check_peer_pmtu(&rt->dst, peer);

			if (peer->redirect_genid != redirect_genid)
				peer->redirect_learned.a4 = 0;
			if (peer->redirect_learned.a4 &&
			    peer->redirect_learned.a4 != rt->rt_gateway) {
				if (check_peer_redir(&rt->dst, peer))
					return NULL;
			}
		}

		rt->rt_peer_genid = rt_peer_genid();
	}
	return rt;
}

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	if (rt_is_expired(rt))
		return NULL;
	dst = (struct dst_entry *) ipv4_validate_peer(rt);
	return dst;
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;
	struct inet_peer *peer = rt->peer;

	if (rt->fi) {
		fib_info_put(rt->fi);
		rt->fi = NULL;
	}
	if (peer) {
		rt->peer = NULL;
		inet_putpeer(peer);
	}
}


static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt && rt->peer && peer_pmtu_cleaned(rt->peer))
		dst_metric_set(&rt->dst, RTAX_MTU, rt->peer->pmtu_orig);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}

/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by IP RR, TS and SRR options,
 * so it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be unaligned
 * in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev, rt->rt_gateway,
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

	if (advmss == 0) {
		advmss = max_t(unsigned int, dst->dev->mtu - 40,
			       ip_rt_min_advmss);
		if (advmss > 65535 - 40)
			advmss = 65535 - 40;
	}
	return advmss;
}

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu && rt_is_output_route(rt))
		return mtu;

	mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {

		if (rt->rt_gateway != rt->rt_dst && mtu > 576)
			mtu = 576;
	}

	if (mtu > IP_MAX_MTU)
		mtu = IP_MAX_MTU;

	return mtu;
}
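/*
 * Illustrative note (not part of the original code): the 40-byte slack
 * in ipv4_default_advmss() is a standard IPv4 header plus TCP header
 * (20 + 20 bytes), so a device with a 1500-byte MTU advertises an MSS
 * of 1460 unless RTAX_ADVMSS was set explicitly.  The 576 clamp in
 * ipv4_mtu() applies the historical default MTU to MTU-locked routes
 * that go via a gateway.
 */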
static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4,
			    struct fib_info *fi)
{
	struct inet_peer *peer;
	int create = 0;

	/* If a peer entry exists for this destination, we must hook
	 * it up in order to get at cached metrics.
	 */
	if (fl4 && (fl4->flowi4_flags & FLOWI_FLAG_PRECOW_METRICS))
		create = 1;

	rt->peer = peer = inet_getpeer_v4(rt->rt_dst, create);
	if (peer) {
		rt->rt_peer_genid = rt_peer_genid();
		if (inet_metrics_new(peer))
			memcpy(peer->metrics, fi->fib_metrics,
			       sizeof(u32) * RTAX_MAX);
		dst_init_metrics(&rt->dst, peer->metrics, false);

		check_peer_pmtu(&rt->dst, peer);
		if (peer->redirect_genid != redirect_genid)
			peer->redirect_learned.a4 = 0;
		if (peer->redirect_learned.a4 &&
		    peer->redirect_learned.a4 != rt->rt_gateway) {
			rt->rt_gateway = peer->redirect_learned.a4;
			rt->rt_flags |= RTCF_REDIRECTED;
		}
	} else {
		if (fi->fib_metrics != (u32 *) dst_default_metrics) {
			rt->fi = fi;
			atomic_inc(&fi->fib_clntref);
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
	}
}

static void rt_set_nexthop(struct rtable *rt, const struct flowi4 *fl4,
			   const struct fib_result *res,
			   struct fib_info *fi, u16 type, u32 itag)
{
	struct dst_entry *dst = &rt->dst;

	if (fi) {
		if (FIB_RES_GW(*res) &&
		    FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = FIB_RES_GW(*res);
		rt_init_metrics(rt, fl4, fi);
#ifdef CONFIG_IP_ROUTE_CLASSID
		dst->tclassid = FIB_RES_NH(*res).nh_tclassid;
#endif
	}

	if (dst_mtu(dst) > IP_MAX_MTU)
		dst_metric_set(dst, RTAX_MTU, IP_MAX_MTU);
	if (dst_metric_raw(dst, RTAX_ADVMSS) > 65535 - 40)
		dst_metric_set(dst, RTAX_ADVMSS, 65535 - 40);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, fib_rules_tclass(res));
#endif
	set_class_tag(rt, itag);
#endif
}

static struct rtable *rt_dst_alloc(struct net_device *dev,
				   bool nopolicy, bool noxfrm)
{
	return dst_alloc(&ipv4_dst_ops, dev, 1, -1,
			 DST_HOST |
			 (nopolicy ? DST_NOPOLICY : 0) |
			 (noxfrm ? DST_NOXFRM : 0));
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	unsigned int hash;
	struct rtable *rth;
	__be32 spec_dst;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks.
	 */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(init_net.loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark    = skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst= spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (our) {
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
	rth = rt_intern_hash(hash, rth, skb, dev->ifindex);
	return IS_ERR(rth) ? PTR_ERR(rth) : 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}


static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 * RFC1812 recommendation, if source is martian,
		 * the only hint is MAC header.
		 */
		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}
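/*
 * Illustrative note (a sketch, not part of the original code): the
 * forwarding slow path below builds one cache entry per flow.  In
 * outline:
 *
 *	fib_validate_source()	reverse-path check of the source;
 *	rt_dst_alloc()		allocate the rtable;
 *	dst.input = ip_forward, dst.output = ip_output;
 *	rt_set_nexthop()	gateway, metrics, classid;
 *
 * after which rt_intern_hash() makes the entry visible, so later
 * packets of the same flow hit the cached fast path.
 */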
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 * RFC1812 recommendation: if the source is martian,
		 * the only hint is the MAC header.
		 */
		printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
		       &daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			int i;
			const unsigned char *p = skb_mac_header(skb);
			printk(KERN_WARNING "ll header: ");
			for (i = 0; i < dev->hard_header_len; i++, p++) {
				printk("%02x", *p);
				if (i < (dev->hard_header_len - 1))
					printk(":");
			}
			printk("\n");
		}
	}
#endif
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos,
			   struct rtable **result)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	__be32 spec_dst;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		if (net_ratelimit())
			printk(KERN_CRIT "Bug in ip_route_input" \
			       "_slow(). Please report\n");
		return -EINVAL;
	}


	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, &spec_dst, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	if (err)
		flags |= RTCF_DIRECTSRC;

	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back out the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM));
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid	= rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags	= flags;
	rth->rt_type	= res->type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = in_dev->dev->ifindex;
	rth->rt_iif	= in_dev->dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark	= skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst = spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, NULL, res, res->fi, res->type, itag);

	*result = rth;
	err = 0;
 cleanup:
	return err;
}

static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth = NULL;
	int err;
	unsigned hash;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
	if (err)
		return err;

	/* put it into the cache */
	hash = rt_hash(daddr, saddr, fl4->flowi4_iif,
		       rt_genid(dev_net(rth->dst.dev)));
	rth = rt_intern_hash(hash, rth, skb, fl4->flowi4_iif);
	if (IS_ERR(rth))
		return PTR_ERR(rth);
	return 0;
}
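/*
 * For illustration: fib_select_multipath() above picks one nexthop of
 * a multipath route, e.g. one configured with (iproute2, addresses are
 * examples only):
 *
 *	# ip route add default nexthop via 10.0.0.1 dev eth0 weight 1 \
 *				nexthop via 10.0.1.1 dev eth1 weight 1
 *
 * Each cache entry created here stays bound to the nexthop that was
 * chosen for it.
 */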
/*
 *	NOTE. We drop all packets that have a local source address,
 *	because every properly looped-back packet must already have the
 *	correct destination attached by the output routine.
 *
 *	This approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *
 *	called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4	fl4;
	unsigned	flags = 0;
	u32		itag = 0;
	struct rtable	*rth;
	unsigned	hash;
	__be32		spec_dst;
	int		err = -EINVAL;
	struct net	*net = dev_net(dev);

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which cannot be detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    ipv4_is_loopback(saddr))
		goto martian_source;

	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only for the limited broadcast;
	 * I do not even know whether to fix this or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr))
		goto martian_destination;

	/*
	 *	Now we are ready to route the packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			goto e_hostunreach;
		goto no_route;
	}

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, &spec_dst, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
		spec_dst = daddr;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto e_hostunreach;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (ipv4_is_zeronet(saddr))
		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
	else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev, &spec_dst,
					  &itag);
		if (err < 0)
			goto martian_source_keep_err;
		if (err)
			flags |= RTCF_DIRECTSRC;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

	rth->dst.input = ip_local_deliver;
	rth->dst.output = ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_key_dst	= daddr;
	rth->rt_key_src	= saddr;
	rth->rt_genid	= rt_genid(net);
	rth->rt_flags	= flags|RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= daddr;
	rth->rt_src	= saddr;
	rth->rt_route_iif = dev->ifindex;
	rth->rt_iif	= dev->ifindex;
	rth->rt_oif	= 0;
	rth->rt_mark	= skb->mark;
	rth->rt_gateway	= daddr;
	rth->rt_spec_dst = spec_dst;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input = ip_error;
		rth->dst.error = -err;
		rth->rt_flags &= ~RTCF_LOCAL;
	}
	hash = rt_hash(daddr, saddr, fl4.flowi4_iif, rt_genid(net));
	rth = rt_intern_hash(hash, rth, skb, fl4.flowi4_iif);
	err = 0;
	if (IS_ERR(rth))
		err = PTR_ERR(rth);
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
			&daddr, &saddr, dev->name);
#endif

e_hostunreach:
	err = -EHOSTUNREACH;
	goto out;

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
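/*
 * Sketch of the typical caller: the inline wrappers in <net/route.h>
 * funnel every input packet into ip_route_input_common() below, e.g.
 * from ip_rcv_finish(), roughly:
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err;
 *
 *	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				   iph->tos, skb->dev);
 *	if (unlikely(err))
 *		goto drop;
 *
 * The noref variant attaches the cached dst without taking a reference,
 * which is only safe while the caller remains in its RCU section.
 */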
int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev, bool noref)
{
	struct rtable	*rth;
	unsigned	hash;
	int iif = dev->ifindex;
	struct net *net;
	int res;

	net = dev_net(dev);

	rcu_read_lock();

	if (!rt_caching(net))
		goto skip_cache;

	tos &= IPTOS_RT_MASK;
	hash = rt_hash(daddr, saddr, iif, rt_genid(net));

	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference(rth->dst.rt_next)) {
		if ((((__force u32)rth->rt_key_dst ^ (__force u32)daddr) |
		     ((__force u32)rth->rt_key_src ^ (__force u32)saddr) |
		     (rth->rt_route_iif ^ iif) |
		     (rth->rt_key_tos ^ tos)) == 0 &&
		    rth->rt_mark == skb->mark &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			rth = ipv4_validate_peer(rth);
			if (!rth)
				continue;
			if (noref) {
				dst_use_noref(&rth->dst, jiffies);
				skb_dst_set_noref(skb, &rth->dst);
			} else {
				dst_use(&rth->dst, jiffies);
				skb_dst_set(skb, &rth->dst);
			}
			RT_CACHE_STAT_INC(in_hit);
			rcu_read_unlock();
			return 0;
		}
		RT_CACHE_STAT_INC(in_hlist_search);
	}

skip_cache:
	/* Multicast recognition logic is moved from the route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result, a host on a multicast
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all over the world. Now we try to get rid of them.
	   Really, provided the software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   compared with route cache reject entries.
	   Note that multicast routers are not affected, because a route
	   cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_common);
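/*
 * Aside on the cache lookup above: the OR-of-XORs test compares all
 * four key fields with a single branch; it is zero iff every pair
 * matches, i.e. it is equivalent to
 *
 *	rth->rt_key_dst == daddr && rth->rt_key_src == saddr &&
 *	rth->rt_route_iif == iif && rth->rt_key_tos == tos
 *
 * but compiles to straight-line code on the hot path.
 */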
/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4,
				       __be32 orig_daddr, __be32 orig_saddr,
				       int orig_oif, struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	u32 tos = RT_FL_TOS(fl4);
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;

	if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
		return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		/* If a multicast route does not exist, use the
		 * default one, but do not gateway in this case.
		 * Yes, it is a hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM));
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	rth->rt_key_dst	= orig_daddr;
	rth->rt_key_src	= orig_saddr;
	rth->rt_genid	= rt_genid(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_key_tos	= tos;
	rth->rt_dst	= fl4->daddr;
	rth->rt_src	= fl4->saddr;
	rth->rt_route_iif = 0;
	rth->rt_iif	= orig_oif ? : dev_out->ifindex;
	rth->rt_oif	= orig_oif;
	rth->rt_mark	= fl4->flowi4_mark;
	rth->rt_gateway	= fl4->daddr;
	rth->rt_spec_dst = fl4->saddr;
	rth->rt_peer_genid = 0;
	rth->peer = NULL;
	rth->fi = NULL;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL) {
		rth->dst.input = ip_local_deliver;
		rth->rt_spec_dst = fl4->daddr;
	}
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		rth->rt_spec_dst = fl4->saddr;
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4, res, fi, type, 0);

	return rth;
}
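/*
 * A note on the tos bits for the resolver below: RT_FL_TOS() keeps
 * RTO_ONLINK alongside the real IPTOS_RT_MASK bits, and
 * ip_route_output_slow() turns it into a scope restriction:
 *
 *	fl4->flowi4_scope = (tos & RTO_ONLINK) ?
 *			    RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
 *
 * so a caller passing RTO_ONLINK in flowi4_tos only ever matches
 * on-link routes.
 */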
/*
 * Major route resolver routine.
 * called with rcu_read_lock();
 */

static struct rtable *ip_route_output_slow(struct net *net, struct flowi4 *fl4)
{
	struct net_device *dev_out = NULL;
	u32 tos	= RT_FL_TOS(fl4);
	unsigned int flags = 0;
	struct fib_result res;
	struct rtable *rth;
	__be32 orig_daddr;
	__be32 orig_saddr;
	int orig_oif;

	res.fi		= NULL;
#ifdef CONFIG_IP_MULTIPLE_TABLES
	res.r		= NULL;
#endif

	orig_daddr = fl4->daddr;
	orig_saddr = fl4->saddr;
	orig_oif = fl4->flowi4_oif;

	fl4->flowi4_iif = net->loopback_dev->ifindex;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			     RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;

		/* I removed the check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return the wrong iface, if
		      saddr is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (dev_out == NULL)
				goto out;

			/* Special hack: the user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind a socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are broken,
			   because we are not allowed to build a multicast path
			   with a loopback source addr (look, the routing cache
			   cannot know that ttl is zero, so that the packet
			   will not leave this host and the route is valid).
			   Luckily, this hack is a good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}
		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}


	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr)) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, fl4, &res)) {
		res.fi = NULL;
		if (fl4->flowi4_oif) {
			/* Apparently, the routing tables are wrong. Assume
			   that the destination is on-link.

			   WHY? DW.
			   Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, the routing
			   tables are looked up with only one purpose:
			   to catch whether the destination is gatewayed,
			   rather than direct. Moreover, if MSG_DONTROUTE
			   is set, we send the packet, ignoring both the
			   routing tables and ifaddr state. --ANK


			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(-ENETUNREACH);
		goto out;
	}

	if (res.type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res.fi->fib_prefsrc)
				fl4->saddr = res.fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = dev_out->ifindex;
		res.fi = NULL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
		fib_select_multipath(&res);
	else
#endif
	if (!res.prefixlen &&
	    res.table->tb_num_default > 1 &&
	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
		fib_select_default(&res);

	if (!fl4->saddr)
		fl4->saddr = FIB_RES_PREFSRC(net, res);

	dev_out = FIB_RES_DEV(res);
	fl4->flowi4_oif = dev_out->ifindex;


make_route:
	rth = __mkroute_output(&res, fl4, orig_daddr, orig_saddr, orig_oif,
			       dev_out, flags);
	if (!IS_ERR(rth)) {
		unsigned int hash;

		hash = rt_hash(orig_daddr, orig_saddr, orig_oif,
			       rt_genid(dev_net(dev_out)));
		rth = rt_intern_hash(hash, rth, NULL, orig_oif);
	}

out:
	rcu_read_unlock();
	return rth;
}
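/*
 * A minimal output lookup through the entry points below, in the style
 * used elsewhere in the stack (cf. inet_rtm_getroute() further down;
 * the field values are placeholders):
 *
 *	struct flowi4 fl4 = {
 *		.daddr = daddr,
 *		.saddr = saddr,
 *		.flowi4_tos = tos,
 *		.flowi4_oif = oif,
 *	};
 *	struct rtable *rt = ip_route_output_key(net, &fl4);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 *
 * ip_route_output_key() is the <net/route.h> wrapper that reaches
 * __ip_route_output_key() below; on success the caller owns a
 * reference that must be dropped with ip_rt_put().
 */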
struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *flp4)
{
	struct rtable *rth;
	unsigned int hash;

	if (!rt_caching(net))
		goto slow_output;

	hash = rt_hash(flp4->daddr, flp4->saddr, flp4->flowi4_oif, rt_genid(net));

	rcu_read_lock_bh();
	for (rth = rcu_dereference_bh(rt_hash_table[hash].chain); rth;
	     rth = rcu_dereference_bh(rth->dst.rt_next)) {
		if (rth->rt_key_dst == flp4->daddr &&
		    rth->rt_key_src == flp4->saddr &&
		    rt_is_output_route(rth) &&
		    rth->rt_oif == flp4->flowi4_oif &&
		    rth->rt_mark == flp4->flowi4_mark &&
		    !((rth->rt_key_tos ^ flp4->flowi4_tos) &
			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
		    net_eq(dev_net(rth->dst.dev), net) &&
		    !rt_is_expired(rth)) {
			rth = ipv4_validate_peer(rth);
			if (!rth)
				continue;
			dst_use(&rth->dst, jiffies);
			RT_CACHE_STAT_INC(out_hit);
			rcu_read_unlock_bh();
			if (!flp4->saddr)
				flp4->saddr = rth->rt_src;
			if (!flp4->daddr)
				flp4->daddr = rth->rt_dst;
			return rth;
		}
		RT_CACHE_STAT_INC(out_hlist_search);
	}
	rcu_read_unlock_bh();

slow_output:
	return ip_route_output_slow(net, flp4);
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);

static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}

static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}
static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			= AF_INET,
	.protocol		= cpu_to_be16(ETH_P_IP),
	.destroy		= ipv4_dst_destroy,
	.check			= ipv4_blackhole_dst_check,
	.mtu			= ipv4_blackhole_mtu,
	.default_advmss		= ipv4_default_advmss,
	.update_pmtu		= ipv4_rt_blackhole_update_pmtu,
	.cow_metrics		= ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		= ipv4_neigh_lookup,
};

struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, 0, 0);
	struct rtable *ort = (struct rtable *) dst_orig;

	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;
		dst_copy_metrics(new, &ort->dst);

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->rt_key_dst = ort->rt_key_dst;
		rt->rt_key_src = ort->rt_key_src;
		rt->rt_key_tos = ort->rt_key_tos;
		rt->rt_route_iif = ort->rt_route_iif;
		rt->rt_iif = ort->rt_iif;
		rt->rt_oif = ort->rt_oif;
		rt->rt_mark = ort->rt_mark;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_dst = ort->rt_dst;
		rt->rt_src = ort->rt_src;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_spec_dst = ort->rt_spec_dst;
		rt->peer = ort->peer;
		if (rt->peer)
			atomic_inc(&rt->peer->refcnt);
		rt->fi = ort->fi;
		if (rt->fi)
			atomic_inc(&rt->fi->fib_clntref);

		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
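/*
 * The blackhole route above is handed out (cf. make_blackhole() in
 * xfrm_policy.c) when xfrm resolution cannot complete without
 * blocking: the copy keeps the keys and metrics of the original but
 * discards every packet, so callers see a valid dst instead of an
 * error while the SAs are still being negotiated.
 */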
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    struct sock *sk)
{
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (IS_ERR(rt))
		return rt;

	if (flp4->flowi4_proto)
		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
						   flowi4_to_flowi(flp4),
						   sk, 0);

	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);

static int rt_fill_info(struct net *net,
			struct sk_buff *skb, u32 pid, u32 seq, int event,
			int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	const struct inet_peer *peer = rt->peer;
	u32 id = 0, ts = 0, tsage = 0, error;

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= rt->rt_key_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);

	if (rt->rt_key_src) {
		r->rtm_src_len = 32;
		NLA_PUT_BE32(skb, RTA_SRC, rt->rt_key_src);
	}
	if (rt->dst.dev)
		NLA_PUT_U32(skb, RTA_OIF, rt->dst.dev->ifindex);
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid)
		NLA_PUT_U32(skb, RTA_FLOW, rt->dst.tclassid);
#endif
	if (rt_is_input_route(rt))
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
	else if (rt->rt_src != rt->rt_key_src)
		NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);

	if (rt->rt_dst != rt->rt_gateway)
		NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);

	if (rtnetlink_put_metrics(skb, dst_metrics_ptr(&rt->dst)) < 0)
		goto nla_put_failure;

	if (rt->rt_mark)
		NLA_PUT_BE32(skb, RTA_MARK, rt->rt_mark);

	error = rt->dst.error;
	if (peer) {
		inet_peer_refcheck(rt->peer);
		id = atomic_read(&peer->ip_id_count) & 0xffff;
		if (peer->tcp_ts_stamp) {
			ts = peer->tcp_ts;
			tsage = get_seconds() - peer->tcp_ts_stamp;
		}
		expires = ACCESS_ONCE(peer->pmtu_expires);
		if (expires) {
			if (time_before(jiffies, expires))
				expires -= jiffies;
			else
				expires = 0;
		}
	}

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		__be32 dst = rt->rt_dst;

		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 rt->rt_src, rt->rt_dst,
						 r, nowait);
			if (err <= 0) {
				if (!nowait) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				} else {
					if (err == -EMSGSIZE)
						goto nla_put_failure;
					error = err;
				}
			}
		} else
#endif
			NLA_PUT_U32(skb, RTA_IIF, rt->rt_iif);
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, id, ts, tsage,
			       expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
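/*
 * The handler below backs "ip route get", e.g. (sample output only;
 * exact attributes depend on the route and the kernel configuration):
 *
 *	# ip route get 8.8.8.8
 *	8.8.8.8 via 192.168.0.1 dev eth0  src 192.168.0.2
 *	    cache  mtu 1500 advmss 1460
 *
 * iproute2 sends an RTM_GETROUTE request, the route is resolved
 * through the input or output path above, and the answer is
 * serialized by rt_fill_info().
 */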
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		struct flowi4 fl4 = {
			.daddr = dst,
			.saddr = src,
			.flowi4_tos = rtm->rtm_tos,
			.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
			.flowi4_mark = mark,
		};
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct rtable *rt;
	int h, s_h;
	int idx, s_idx;
	struct net *net;

	net = sock_net(skb->sk);

	s_h = cb->args[0];
	if (s_h < 0)
		s_h = 0;
	s_idx = idx = cb->args[1];
	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
		if (!rt_hash_table[h].chain)
			continue;
		rcu_read_lock_bh();
		for (rt = rcu_dereference_bh(rt_hash_table[h].chain), idx = 0; rt;
		     rt = rcu_dereference_bh(rt->dst.rt_next), idx++) {
			if (!net_eq(dev_net(rt->dst.dev), net) || idx < s_idx)
				continue;
			if (rt_is_expired(rt))
				continue;
			skb_dst_set_noref(skb, &rt->dst);
			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
					 1, NLM_F_MULTI) <= 0) {
				skb_dst_drop(skb);
				rcu_read_unlock_bh();
				goto done;
			}
			skb_dst_drop(skb);
		}
		rcu_read_unlock_bh();
	}

done:
	cb->args[0] = h;
	cb->args[1] = idx;
	return skb->len;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev), 0);
}

#ifdef CONFIG_SYSCTL
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
					void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	if (write) {
		int flush_delay;
		ctl_table ctl;
		struct net *net;

		memcpy(&ctl, __ctl, sizeof(ctl));
		ctl.data = &flush_delay;
		proc_dointvec(&ctl, write, buffer, lenp, ppos);

		net = (struct net *)__ctl->extra1;
		rt_cache_flush(net, flush_delay);
		return 0;
	}

	return -EINVAL;
}
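/*
 * The handler above is reachable as net.ipv4.route.flush (mode 0200,
 * registered per netns through ipv4_route_flush_table below), so the
 * cache can be dropped from userspace:
 *
 *	# echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * The written value is passed to rt_cache_flush() as the delay
 * argument.
 */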
static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static struct ctl_table empty[1];

static struct ctl_table ipv4_skeleton[] =
{
	{ .procname = "route",
	  .mode = 0555, .child = ipv4_route_table},
	{ .procname = "neigh",
	  .mode = 0555, .child = empty},
	{ }
};

static __net_initdata struct ctl_path ipv4_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ },
};

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

static __net_initdata struct ctl_path ipv4_route_path[] = {
	{ .procname = "net", },
	{ .procname = "ipv4", },
	{ .procname = "route", },
	{ },
};

static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr =
		register_net_sysctl_table(net, ipv4_route_path, tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif

static __net_init int rt_genid_init(struct net *net)
{
	get_random_bytes(&net->ipv4.rt_genid,
			 sizeof(net->ipv4.rt_genid));
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};


#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
	if (!str)
		return 0;
	rhash_entries = simple_strtoul(str, &str, 0);
	return 1;
}
__setup("rhash_entries=", set_rhash_entries);
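/*
 * "rhash_entries=" above is a boot-time parameter sizing the route
 * cache hash table allocated in ip_rt_init() below, e.g. booting with
 *
 *	rhash_entries=262144
 *
 * requests 2^18 entries instead of the memory-scaled default chosen by
 * alloc_large_system_hash().
 */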
int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	rt_hash_table = (struct rt_hash_bucket *)
		alloc_large_system_hash("IP route cache",
					sizeof(struct rt_hash_bucket),
					rhash_entries,
					(totalram_pages >= 128 * 1024) ?
					15 : 17,
					0,
					&rt_hash_log,
					&rt_hash_mask,
					rhash_entries ? 0 : 512 * 1024);
	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
	rt_hash_lock_init();

	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
	ip_rt_max_size = (rt_hash_mask + 1) * 16;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	return rc;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif