/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <net/dst.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>

#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define IP_MAX_MTU	0xFFF0

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void ipv4_link_failure(struct sk_buff *skb);
static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu);
static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
			   struct sk_buff *skb);
static void ipv4_dst_destroy(struct dst_entry *dst);

static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			    int how)
{
}

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family			= AF_INET,
	.protocol		= cpu_to_be16(ETH_P_IP),
	.check			= ipv4_dst_check,
	.default_advmss		= ipv4_default_advmss,
	.mtu			= ipv4_mtu,
	.cow_metrics		= ipv4_cow_metrics,
	.destroy		= ipv4_dst_destroy,
	.ifdown			= ipv4_dst_ifdown,
	.negative_advice	= ipv4_negative_advice,
	.link_failure		= ipv4_link_failure,
	.update_pmtu		= ip_rt_update_pmtu,
	.redirect		= ip_do_redirect,
	.local_out		= __ip_local_out,
	.neigh_lookup		= ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) __this_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos)
		return NULL;
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

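/*
 * /proc/net/stat/rt_cache: the iterators below walk the per-cpu
 * rt_cache_stat counters, visiting every possible CPU once and encoding
 * the next CPU index in *pos so the seq_file core can resume the walk.
 */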
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   st->in_hit,
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   st->out_hit,
		   st->out_slow_tot,
		   st->out_slow_mc,

		   st->gc_total,
		   st->gc_ignored,
		   st->gc_goal_miss,
		   st->gc_dst_overflow,
		   st->in_hlist_search,
		   st->out_hlist_search
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
				   &rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
	return rth->rt_genid != rt_genid(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
	rt_genid_bump(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;
	struct neighbour *n;

	rt = (const struct rtable *) dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *) &rt->rt_gateway;
	else if (skb)
		pkey = &ip_hdr(skb)->daddr;

	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(&arp_tbl, pkey, dev);
}

/*
 * Peer allocation may fail only in serious out-of-memory conditions.  However
 * we can still generate some output.
 * Random ID selection looks a bit dangerous because we have no chance of
 * selecting an ID that is unique over a reasonable period of time.
 * But a broken packet identifier may be better than no packet at all.
 */
static void ip_select_fb_ident(struct iphdr *iph)
{
	static DEFINE_SPINLOCK(ip_fb_id_lock);
	static u32 ip_fallback_id;
	u32 salt;

	spin_lock_bh(&ip_fb_id_lock);
	salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
	iph->id = htons(salt & 0xFFFF);
	ip_fallback_id = salt;
	spin_unlock_bh(&ip_fb_id_lock);
}

void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
{
	struct net *net = dev_net(dst->dev);
	struct inet_peer *peer;

	peer = inet_getpeer_v4(net->ipv4.peers, iph->daddr, 1);
	if (peer) {
		iph->id = htons(inet_getid(peer, more));
		inet_putpeer(peer);
		return;
	}

	ip_select_fb_ident(iph);
}
EXPORT_SYMBOL(__ip_select_ident);

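/*
 * Flow key helpers: fill a struct flowi4 for route (re)validation, either
 * from the IP header of the skb being handled or, when no packet is
 * available, from the socket state (see build_sk_flow_key() below).
 */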
static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		mark = sk->sk_mark;
		tos = RT_CONN_FLAGS(sk);
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	}
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0);
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
			       const struct sock *sk)
{
	const struct iphdr *iph = ip_hdr(skb);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	__build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0);
	rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (skb)
		build_skb_flow_key(fl4, skb, sk);
	else
		build_sk_flow_key(fl4, sk);
}

static inline void rt_free(struct rtable *rt)
{
	call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

static DEFINE_SPINLOCK(fnhe_lock);

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
	struct fib_nh_exception *fnhe, *oldest;
	struct rtable *orig;

	oldest = rcu_dereference(hash->chain);
	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
			oldest = fnhe;
	}
	orig = rcu_dereference(oldest->fnhe_rth);
	if (orig) {
		RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
		rt_free(orig);
	}
	return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
	u32 hval;

	hval = (__force u32) daddr;
	hval ^= (hval >> 11) ^ (hval >> 22);

	return hval & (FNHE_HASH_SIZE - 1);
}

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
				  u32 pmtu, unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	int depth;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = nh->nh_exceptions;
	if (!hash) {
		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		nh->nh_exceptions = hash;
	}

	hash += hval;

	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_expires = expires;
		}
	} else {
		if (depth > FNHE_RECLAIM_DEPTH)
			fnhe = fnhe_oldest(hash);
		else {
			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
			if (!fnhe)
				goto out_unlock;

			fnhe->fnhe_next = hash->chain;
			rcu_assign_pointer(hash->chain, fnhe);
		}
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_expires = expires;
	}

	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
	return;
}

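/*
 * ICMP redirect handling. A learned gateway is not written back into the
 * FIB; it is recorded as a next-hop exception via update_or_create_fnhe()
 * above and picked up later when a route is bound to that exception.
 */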
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	if (rt->rt_gateway != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw);
	if (n) {
		if (!(n->nud_state & NUD_VALID)) {
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res) == 0) {
				struct fib_nh *nh = &FIB_RES_NH(res);

				update_or_create_fnhe(nh, fl4->daddr, new_gw,
						      0, 0);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     "  Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct rtable *rt;
	struct flowi4 fl4;

	rt = (struct rtable *) dst;

	ip_rt_build_flow_key(&fl4, sk, skb);
	__ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->dst.expires) {
			ip_rt_put(rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
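 *
 * With the defaults above (ip_rt_redirect_load = HZ/50,
 * ip_rt_redirect_number = 9): after the k-th redirect has been sent, the
 * next one is delayed by at least (HZ/50) << k jiffies, and once all 9
 * have been ignored we stay silent until ip_rt_redirect_silence elapses.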
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything;
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &rt->rt_gateway);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, 1);

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
		inet_putpeer(peer);
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}

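/*
 * PMTU handling: a learned path MTU is clamped to ip_rt_min_pmtu and
 * stored as a next-hop exception that expires after ip_rt_mtu_expires.
 */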
static u32 __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct fib_result res;

	if (mtu < ip_rt_min_pmtu)
		mtu = ip_rt_min_pmtu;

	rcu_read_lock();
	if (fib_lookup(dev_net(rt->dst.dev), fl4, &res) == 0) {
		struct fib_nh *nh = &FIB_RES_NH(res);

		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
	return mtu;
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct flowi4 fl4;

	ip_rt_build_flow_key(&fl4, sk, skb);
	mtu = __ip_rt_update_pmtu(rt, &fl4, mtu);

	if (!rt->rt_pmtu) {
		dst->obsolete = DST_OBSOLETE_KILL;
	} else {
		rt->rt_pmtu = mtu;
		rt->dst.expires = max(1UL, jiffies + ip_rt_mtu_expires);
	}
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
		   int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	/* All IPV4 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 *
	 * When a PMTU/redirect information update invalidates a
	 * route, this is indicated by setting obsolete to
	 * DST_OBSOLETE_KILL.
	 */
	if (dst->obsolete == DST_OBSOLETE_KILL || rt_is_expired(rt))
		return NULL;
	return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so that it is out of the fast path.

   BTW remember: "addr" is allowed to be not aligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

	if (advmss == 0) {
		advmss = max_t(unsigned int, dst->dev->mtu - 40,
			       ip_rt_min_advmss);
		if (advmss > 65535 - 40)
			advmss = 65535 - 40;
	}
	return advmss;
}

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = rt->rt_pmtu;

	if (mtu && time_after_eq(jiffies, rt->dst.expires))
		mtu = 0;

	if (!mtu)
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu && rt_is_output_route(rt))
		return mtu;

	mtu = dst->dev->mtu;

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
		if (rt->rt_gateway && mtu > 576)
			mtu = 576;
	}

	if (mtu > IP_MAX_MTU)
		mtu = IP_MAX_MTU;

	return mtu;
}

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash = nh->nh_exceptions;
	struct fib_nh_exception *fnhe;
	u32 hval;

	if (!hash)
		return NULL;

	hval = fnhe_hashfun(daddr);

	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			return fnhe;
	}
	return NULL;
}

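/*
 * Bind a route to a matching next-hop exception: copy the learned PMTU
 * and/or gateway into the rtable and make it the cached fnhe route.
 */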
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	if (daddr == fnhe->fnhe_daddr) {
		struct rtable *orig;

		if (fnhe->fnhe_pmtu) {
			unsigned long expires = fnhe->fnhe_expires;
			unsigned long diff = expires - jiffies;

			if (time_before(jiffies, expires)) {
				rt->rt_pmtu = fnhe->fnhe_pmtu;
				dst_set_expires(&rt->dst, diff);
			}
		}
		if (fnhe->fnhe_gw) {
			rt->rt_flags |= RTCF_REDIRECTED;
			rt->rt_gateway = fnhe->fnhe_gw;
		}

		orig = rcu_dereference(fnhe->fnhe_rth);
		rcu_assign_pointer(fnhe->fnhe_rth, rt);
		if (orig)
			rt_free(orig);

		fnhe->fnhe_stamp = jiffies;
		ret = true;
	} else {
		/* Routes we intend to cache in nexthop exception have
		 * the DST_NOCACHE bit clear.  However, if we are
		 * unsuccessful at storing this route into the cache
		 * we really need to set it.
		 */
		rt->dst.flags |= DST_NOCACHE;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}

static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nh->nh_rth_input;
	} else {
		if (!nh->nh_pcpu_rth_output)
			goto nocache;
		p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
	}
	orig = *p;

	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		if (orig)
			rt_free(orig);
	} else {
		/* Routes we intend to cache in the FIB nexthop have
		 * the DST_NOCACHE bit clear.  However, if we are
		 * unsuccessful at storing this route into the cache
		 * we really need to set it.
		 */
nocache:
		rt->dst.flags |= DST_NOCACHE;
		ret = false;
	}

	return ret;
}

static DEFINE_SPINLOCK(rt_uncached_lock);
static LIST_HEAD(rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
	spin_lock_bh(&rt_uncached_lock);
	list_add_tail(&rt->rt_uncached, &rt_uncached_list);
	spin_unlock_bh(&rt_uncached_lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *) dst;

	if (!list_empty(&rt->rt_uncached)) {
		spin_lock_bh(&rt_uncached_lock);
		list_del(&rt->rt_uncached);
		spin_unlock_bh(&rt_uncached_lock);
	}
}

void rt_flush_dev(struct net_device *dev)
{
	if (!list_empty(&rt_uncached_list)) {
		struct net *net = dev_net(dev);
		struct rtable *rt;

		spin_lock_bh(&rt_uncached_lock);
		list_for_each_entry(rt, &rt_uncached_list, rt_uncached) {
			if (rt->dst.dev != dev)
				continue;
			rt->dst.dev = net->loopback_dev;
			dev_hold(rt->dst.dev);
			dev_put(dev);
		}
		spin_unlock_bh(&rt_uncached_lock);
	}
}

static bool rt_cache_valid(const struct rtable *rt)
{
	return	rt &&
		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
		!rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag)
{
	bool cached = false;

	if (fi) {
		struct fib_nh *nh = &FIB_RES_NH(*res);

		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK)
			rt->rt_gateway = nh->nh_gw;
		dst_init_metrics(&rt->dst, fi->fib_metrics, true);
#ifdef CONFIG_IP_ROUTE_CLASSID
		rt->dst.tclassid = nh->nh_tclassid;
#endif
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr);
		else if (!(rt->dst.flags & DST_NOCACHE))
			cached = rt_cache_route(nh, rt);
	}
	if (unlikely(!cached))
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}

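/*
 * Allocate a dst/rtable. Routes we do not intend to cache are created
 * with DST_HOST | DST_NOCACHE already set.
 */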
static struct rtable *rt_dst_alloc(struct net_device *dev,
				   bool nopolicy, bool noxfrm, bool will_cache)
{
	return dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
			 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
			 (nopolicy ? DST_NOPOLICY : 0) |
			 (noxfrm ? DST_NOXFRM : 0));
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	struct rtable *rth;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	u32 itag = 0;
	int err;

	/* Primary sanity checks. */

	if (in_dev == NULL)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(saddr))
			goto e_inval;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			goto e_inval;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto e_err;
	}
	rth = rt_dst_alloc(dev_net(dev)->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
	if (!rth)
		goto e_nobufs;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;

	rth->rt_genid	= rt_genid(dev_net(dev));
	rth->rt_flags	= RTCF_MULTICAST;
	rth->rt_type	= RTN_MULTICAST;
	rth->rt_is_input= 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	if (our) {
		rth->dst.input= ip_local_deliver;
		rth->rt_flags |= RTCF_LOCAL;
	}

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);
	return 0;

e_nobufs:
	return -ENOBUFS;
e_inval:
	return -EINVAL;
e_err:
	return err;
}


static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation, if source is martian,
		 *	the only hint is MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, true);
		}
	}
#endif
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	unsigned int flags = 0;
	bool do_cache;
	u32 itag;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (out_dev == NULL) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}


	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	if (out_dev == in_dev && err &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		flags |= RTCF_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp.  DNAT routes are always valid.
		 *
		 * The proxy ARP feature has been extended to allow ARP
		 * replies back on the same interface, to support
		 * Private VLAN switch technologies.  See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	do_cache = false;
	if (res->fi) {
		if (!itag) {
			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
			if (rt_cache_valid(rth)) {
				skb_dst_set_noref(skb, &rth->dst);
				goto out;
			}
			do_cache = true;
		}
	}

	rth = rt_dst_alloc(out_dev->dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_genid = rt_genid(dev_net(rth->dst.dev));
	rth->rt_flags = flags;
	rth->rt_type = res->type;
	rth->rt_is_input = 1;
	rth->rt_iif 	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	INIT_LIST_HEAD(&rth->rt_uncached);

	rth->dst.input = ip_forward;
	rth->dst.output = ip_output;

	rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
cleanup:
	return err;
}

static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    const struct flowi4 *fl4,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1)
		fib_select_multipath(res);
#endif

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}

/*
 *	NOTE. We drop all the packets that have local source
 *	addresses, because every properly looped back packet
 *	must have the correct destination already attached by the output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with 100% guarantee.
 *	called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev)
{
	struct fib_result res;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flowi4 fl4;
	unsigned int flags = 0;
	u32 itag = 0;
	struct rtable *rth;
	int err = -EINVAL;
	struct net *net = dev_net(dev);
	bool do_cache;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which cannot be detected
	   by fib_lookup.
	 */

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	res.fi = NULL;
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I do not even know whether to fix this or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) {
		if (ipv4_is_loopback(daddr))
			goto martian_destination;

		if (ipv4_is_loopback(saddr))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	err = fib_lookup(net, &fl4, &res);
	if (err != 0)
		goto no_route;

	RT_CACHE_STAT_INC(in_slow_tot);

	if (res.type == RTN_BROADCAST)
		goto brd_input;

	if (res.type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  net->loopback_dev->ifindex,
					  dev, in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev))
		goto no_route;
	if (res.type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source_keep_err;
	}
	flags |= RTCF_BROADCAST;
	res.type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	do_cache = false;
	if (res.fi) {
		if (!itag) {
			rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
			if (rt_cache_valid(rth)) {
				skb_dst_set_noref(skb, &rth->dst);
				err = 0;
				goto out;
			}
			do_cache = true;
		}
	}

	rth = rt_dst_alloc(net->loopback_dev,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
	if (!rth)
		goto e_nobufs;

	rth->dst.input= ip_local_deliver;
	rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif

	rth->rt_genid = rt_genid(net);
	rth->rt_flags	= flags|RTCF_LOCAL;
	rth->rt_type	= res.type;
	rth->rt_is_input = 1;
	rth->rt_iif	= 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway	= 0;
	INIT_LIST_HEAD(&rth->rt_uncached);
	if (res.type == RTN_UNREACHABLE) {
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags	&= ~RTCF_LOCAL;
	}
	if (do_cache)
		rt_cache_route(&FIB_RES_NH(res), rth);
	skb_dst_set(skb, &rth->dst);
	err = 0;
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res.type = RTN_UNREACHABLE;
	if (err == -ESRCH)
		err = -ENETUNREACH;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	err = -EINVAL;
martian_source_keep_err:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}

int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			 u8 tos, struct net_device *dev)
{
	int res;

	rcu_read_lock();

	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As a result, a host on a multicasting
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   compared with route cache reject entries.
	   Note, that multicast routers are not affected, because
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);

		if (in_dev) {
			int our = ip_check_mc_rcu(in_dev, daddr, saddr,
						  ip_hdr(skb)->protocol);
			if (our
#ifdef CONFIG_IP_MROUTE
				||
			    (!ipv4_is_local_multicast(daddr) &&
			     IN_DEV_MFORWARD(in_dev))
#endif
			   ) {
				int res = ip_route_input_mc(skb, daddr, saddr,
							    tos, dev, our);
				rcu_read_unlock();
				return res;
			}
		}
		rcu_read_unlock();
		return -EINVAL;
	}
	res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
	rcu_read_unlock();
	return res;
}
EXPORT_SYMBOL(ip_route_input_noref);

/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4, int orig_oif,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct fib_nh_exception *fnhe;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
			return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		/* If a multicast route does not exist use
		 * the default one, but do not gateway in this case.
		 * Yes, it is a hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	}

	fnhe = NULL;
	if (fi) {
		struct rtable __rcu **prth;

		fnhe = find_exception(&FIB_RES_NH(*res), fl4->daddr);
		if (fnhe)
			prth = &fnhe->fnhe_rth;
		else
			prth = __this_cpu_ptr(FIB_RES_NH(*res).nh_pcpu_rth_output);
		rth = rcu_dereference(*prth);
		if (rt_cache_valid(rth)) {
			dst_hold(&rth->dst);
			return rth;
		}
	}
	rth = rt_dst_alloc(dev_out,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM),
			   fi);
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->dst.output = ip_output;

	rth->rt_genid = rt_genid(dev_net(dev_out));
	rth->rt_flags	= flags;
	rth->rt_type	= type;
	rth->rt_is_input = 0;
	rth->rt_iif	= orig_oif ? : 0;
	rth->rt_pmtu	= 0;
	rth->rt_gateway = 0;
	INIT_LIST_HEAD(&rth->rt_uncached);

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & RTCF_LOCAL)
		rth->dst.input = ip_local_deliver;
	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);

	return rth;
}

/*
 * Major route resolver routine.
 */

struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4)
{
	struct net_device *dev_out = NULL;
	__u8 tos = RT_FL_TOS(fl4);
	unsigned int flags = 0;
	struct fib_result res;
	struct rtable *rth;
	int orig_oif;

	res.tclassid	= 0;
	res.fi		= NULL;
	res.table	= NULL;

	orig_oif = fl4->flowi4_oif;

	fl4->flowi4_iif = net->loopback_dev->ifindex;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;

		/* I removed check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
		      is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (dev_out == NULL)
				goto out;

			/* Special hack: user can direct multicasts
			   and limited broadcast via necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are broken,
			   because we are not allowed to build a multicast path
			   with a loopback source addr (look, the routing cache
			   cannot know that ttl is zero, so that the packet
			   will not leave this host and the route is valid).
			   Luckily, this hack is a good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}


	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (dev_out == NULL)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr)) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = net->loopback_dev->ifindex;
		res.type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	if (fib_lookup(net, fl4, &res)) {
		res.fi = NULL;
		res.table = NULL;
		if (fl4->flowi4_oif) {
			/* Apparently, routing tables are wrong. Assume
			   that the destination is on link.

			   WHY? DW.
			   Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if destination is gatewayed, rather than
			   direct. Moreover, if MSG_DONTROUTE is set,
			   we send packet, ignoring both routing tables
			   and ifaddr state. --ANK


			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res.type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(-ENETUNREACH);
		goto out;
	}

	if (res.type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res.fi->fib_prefsrc)
				fl4->saddr = res.fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = dev_out->ifindex;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res.fi->fib_nhs > 1 && fl4->flowi4_oif == 0)
		fib_select_multipath(&res);
	else
#endif
	if (!res.prefixlen &&
	    res.table->tb_num_default > 1 &&
	    res.type == RTN_UNICAST && !fl4->flowi4_oif)
		fib_select_default(&res);

	if (!fl4->saddr)
		fl4->saddr = FIB_RES_PREFSRC(net, res);

	dev_out = FIB_RES_DEV(res);
	fl4->flowi4_oif = dev_out->ifindex;


make_route:
	rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);

out:
	rcu_read_unlock();
	return rth;
}
EXPORT_SYMBOL_GPL(__ip_route_output_key);

static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					  struct sk_buff *skb, u32 mtu)
{
}

static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
}

static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			= AF_INET,
	.protocol		= cpu_to_be16(ETH_P_IP),
	.check			= ipv4_blackhole_dst_check,
	.mtu			= ipv4_blackhole_mtu,
	.default_advmss		= ipv4_default_advmss,
	.update_pmtu		= ipv4_rt_blackhole_update_pmtu,
	.redirect		= ipv4_rt_blackhole_redirect,
	.cow_metrics		= ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		= ipv4_neigh_lookup,
};

struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *ort = (struct rtable *) dst_orig;
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard;

		new->dev = ort->dst.dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->rt_is_input = ort->rt_is_input;
		rt->rt_iif = ort->rt_iif;
		rt->rt_pmtu = ort->rt_pmtu;

		rt->rt_genid = rt_genid(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_gateway = ort->rt_gateway;

		INIT_LIST_HEAD(&rt->rt_uncached);

		dst_free(new);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}

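/*
 * As __ip_route_output_key(), but additionally runs the result through
 * the xfrm (IPsec) lookup when a transport protocol is specified.
 */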

struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    struct sock *sk)
{
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (IS_ERR(rt))
		return rt;

	if (flp4->flowi4_proto)
		rt = (struct rtable *) xfrm_lookup(net, &rt->dst,
						   flowi4_to_flowi(flp4),
						   sk, 0);

	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
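
/* Example (sketch): a typical output-path caller builds a flowi4 and lets
 * ip_route_output_flow() do the FIB lookup plus the optional xfrm step above.
 * Here sk, daddr, saddr, dport and sport stand for the caller's own socket
 * and addressing state (placeholders, not defined in this file):
 *
 *	struct flowi4 fl4;
 *	struct rtable *rt;
 *
 *	flowi4_init_output(&fl4, 0, sk->sk_mark, RT_TOS(inet_sk(sk)->tos),
 *			   RT_SCOPE_UNIVERSE, IPPROTO_UDP, 0,
 *			   daddr, saddr, dport, sport);
 *	rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...	// transmit using rt, then drop the reference
 *	ip_rt_put(rt);
 */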

static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
			struct flowi4 *fl4, struct sk_buff *skb, u32 pid,
			u32 seq, int event, int nowait, unsigned int flags)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 error;
	u32 metrics[RTAX_MAX];

	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	= AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= fl4->flowi4_tos;
	r->rtm_table	= RT_TABLE_MAIN;
	if (nla_put_u32(skb, RTA_TABLE, RT_TABLE_MAIN))
		goto nla_put_failure;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;

	if (nla_put_be32(skb, RTA_DST, dst))
		goto nla_put_failure;
	if (src) {
		r->rtm_src_len = 32;
		if (nla_put_be32(skb, RTA_SRC, src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	if (!rt_is_input_route(rt) &&
	    fl4->saddr != src) {
		if (nla_put_be32(skb, RTA_PREFSRC, fl4->saddr))
			goto nla_put_failure;
	}
	if (rt->rt_gateway &&
	    nla_put_be32(skb, RTA_GATEWAY, rt->rt_gateway))
		goto nla_put_failure;

	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt_pmtu)
		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (fl4->flowi4_mark &&
	    nla_put_be32(skb, RTA_MARK, fl4->flowi4_mark))
		goto nla_put_failure;

	error = rt->dst.error;
	expires = rt->dst.expires;
	if (expires) {
		if (time_before(jiffies, expires))
			expires -= jiffies;
		else
			expires = 0;
	}

	if (rt_is_input_route(rt)) {
		if (nla_put_u32(skb, RTA_IIF, rt->rt_iif))
			goto nla_put_failure;
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, void *arg)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (skb == NULL) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	/* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
	ip_hdr(skb)->protocol = IPPROTO_ICMP;
	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;

	if (iif) {
		struct net_device *dev;

		dev = __dev_get_by_index(net, iif);
		if (dev == NULL) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		local_bh_disable();
		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
		local_bh_enable();

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		rt = ip_route_output_key(net, &fl4);

		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	err = rt_fill_info(net, dst, src, &fl4, skb,
			   NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
			   RTM_NEWROUTE, 0, 0);
	if (err <= 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}
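
/* Illustrative sketch of the wire protocol served by inet_rtm_getroute(): a
 * userspace program asks "which route would 8.8.8.8 take?" by sending an
 * RTM_GETROUTE request carrying an RTA_DST attribute and reading back the
 * RTM_NEWROUTE reply that rt_fill_info() builds above.  Hypothetical
 * userspace snippet; includes and error handling omitted:
 *
 *	struct {
 *		struct nlmsghdr nlh;
 *		struct rtmsg	rtm;
 *		char		attrs[64];
 *	} req = {
 *		.nlh = {
 *			.nlmsg_len   = NLMSG_LENGTH(sizeof(struct rtmsg)),
 *			.nlmsg_type  = RTM_GETROUTE,
 *			.nlmsg_flags = NLM_F_REQUEST,
 *		},
 *		.rtm = { .rtm_family = AF_INET },
 *	};
 *	struct rtattr *rta = (struct rtattr *)((char *)&req +
 *					       NLMSG_ALIGN(req.nlh.nlmsg_len));
 *	struct in_addr dst = { .s_addr = inet_addr("8.8.8.8") };
 *
 *	rta->rta_type = RTA_DST;
 *	rta->rta_len  = RTA_LENGTH(sizeof(dst));
 *	memcpy(RTA_DATA(rta), &dst, sizeof(dst));
 *	req.nlh.nlmsg_len = NLMSG_ALIGN(req.nlh.nlmsg_len) + rta->rta_len;
 *
 *	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
 *	send(fd, &req, req.nlh.nlmsg_len, 0);
 *	// recv() then yields an RTM_NEWROUTE message carrying RTA_DST,
 *	// RTA_OIF, RTA_PREFSRC, RTA_GATEWAY, ... as emitted by rt_fill_info().
 */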

int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	return skb->len;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev));
}

#ifdef CONFIG_SYSCTL
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
				     void __user *buffer,
				     size_t *lenp, loff_t *ppos)
{
	if (write) {
		rt_cache_flush((struct net *)__ctl->extra1);
		return 0;
	}

	return -EINVAL;
}

static ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (tbl == NULL)
			goto err_dup;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (net->ipv4.route_hdr == NULL)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif
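
/* Illustrative sketch: the flush table above gives each network namespace a
 * write-only /proc/sys/net/ipv4/route/flush file.  Any write to it calls
 * ipv4_sysctl_rtcache_flush() and flushes cached routes for that namespace;
 * reads fail with -EINVAL.  Hypothetical userspace snippet:
 *
 *	int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);
 *	if (fd >= 0) {
 *		write(fd, "1", 1);	// any write triggers the flush
 *		close(fd);
 *	}
 *
 * The read/write tunables (gc_timeout, min_pmtu, ...) from ipv4_route_table,
 * registered below by ip_static_sysctl_init(), appear in the same directory.
 */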

static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->rt_genid, 0);
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};

static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	= ipv4_inetpeer_init,
	.exit	= ipv4_inetpeer_exit,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

int __init ip_rt_init(void)
{
	int rc = 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init(ip_rt_max_size);
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return rc;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif