/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;

static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
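
/* Defaults for the route-related tunables.  These presumably back the
 * net.ipv4.route.* sysctls; within this file, ip_rt_gc_timeout is also
 * used as the lifetime of next-hop exceptions learned from redirects.
 */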

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		 ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);
static void		 ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.redirect =		ip_do_redirect,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
	.confirm_neigh =	ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos)
		return NULL;
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   0, /* st->in_hit */
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   0, /* st->out_hit */
		   st->out_slow_tot,
		   st->out_slow_mc,

		   0, /* st->gc_total */
		   0, /* st->gc_ignored */
		   0, /* st->gc_goal_miss */
		   0, /* st->gc_dst_overflow */
		   0, /* st->in_hlist_search */
		   0  /* st->out_hlist_search */
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.open		= rt_acct_proc_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};
#endif
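
/* Create the per-netns /proc entries: /proc/net/rt_cache (header-only,
 * kept for compatibility), /proc/net/stat/rt_cache (per-cpu statistics)
 * and, under CONFIG_IP_ROUTE_CLASSID, /proc/net/rt_acct.
 */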
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
			  &rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
	.async = true,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;
	struct neighbour *n;

	rt = (const struct rtable *) dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *) &rt->rt_gateway;
	else if (skb)
		pkey = &ip_hdr(skb)->daddr;

	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(&arp_tbl, pkey, dev);
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;

	rt = (const struct rtable *)dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *)&rt->rt_gateway;
	else if (!daddr ||
		 (rt->rt_flags &
		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
		return;

	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
	u32 old = READ_ONCE(*p_tstamp);
	u32 now = (u32)jiffies;
	u32 new, delta = 0;

	if (old != now && cmpxchg(p_tstamp, old, now) == old)
		delta = prandom_u32_max(now - old);

	/* Do not use atomic_add_return() as it makes UBSAN unhappy */
	do {
		old = (u32)atomic_read(p_id);
		new = old + delta + segs;
	} while (atomic_cmpxchg(p_id, old, new) != old);

	return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
	static u32 ip_idents_hashrnd __read_mostly;
	u32 hash, id;

	net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));

	hash = jhash_3words((__force u32)iph->daddr,
			    (__force u32)iph->saddr,
			    iph->protocol ^ net_hash_mix(net),
			    ip_idents_hashrnd);
	id = ip_idents_reserve(hash, segs);
	iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);
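
/* The helpers below build the flowi4 key used for FIB lookups on the
 * PMTU and redirect paths.  When a socket is supplied, its bound device,
 * mark, TOS and protocol take precedence over the packet headers.
 */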

static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
			     const struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		mark = sk->sk_mark;
		tos = RT_CONN_FLAGS(sk);
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	}
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0,
			   sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
			       const struct sock *sk)
{
	const struct net *net = dev_net(skb->dev);
	const struct iphdr *iph = ip_hdr(skb);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
	rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (skb)
		build_skb_flow_key(fl4, skb, sk);
	else
		build_sk_flow_key(fl4, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
	struct rtable *rt;

	rt = rcu_dereference(fnhe->fnhe_rth_input);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
	rt = rcu_dereference(fnhe->fnhe_rth_output);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
	struct fib_nh_exception *fnhe, *oldest;

	oldest = rcu_dereference(hash->chain);
	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
			oldest = fnhe;
	}
	fnhe_flush_routes(oldest);
	return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
	static u32 fnhe_hashrnd __read_mostly;
	u32 hval;

	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
	return hash_32(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
	rt->rt_pmtu = fnhe->fnhe_pmtu;
	rt->dst.expires = fnhe->fnhe_expires;

	if (fnhe->fnhe_gw) {
		rt->rt_flags |= RTCF_REDIRECTED;
		rt->rt_gateway = fnhe->fnhe_gw;
		rt->rt_uses_gateway = 1;
	}
}

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
				  u32 pmtu, unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	struct rtable *rt;
	u32 genid, hval;
	unsigned int i;
	int depth;

	genid = fnhe_genid(dev_net(nh->nh_dev));
	hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference(nh->nh_exceptions);
	if (!hash) {
		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		rcu_assign_pointer(nh->nh_exceptions, hash);
	}

	hash += hval;

	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		if (fnhe->fnhe_genid != genid)
			fnhe->fnhe_genid = genid;
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu)
			fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_expires = max(1UL, expires);
		/* Update all cached dsts too */
		rt = rcu_dereference(fnhe->fnhe_rth_input);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
		rt = rcu_dereference(fnhe->fnhe_rth_output);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
	} else {
		if (depth > FNHE_RECLAIM_DEPTH)
			fnhe = fnhe_oldest(hash);
		else {
			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
			if (!fnhe)
				goto out_unlock;

			fnhe->fnhe_next = hash->chain;
			rcu_assign_pointer(hash->chain, fnhe);
		}
		fnhe->fnhe_genid = genid;
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_expires = expires;

		/* Exception created; mark the cached routes for the nexthop
		 * stale, so anyone caching it rechecks if this exception
		 * applies to them.
		 */
		rt = rcu_dereference(nh->nh_rth_input);
		if (rt)
			rt->dst.obsolete = DST_OBSOLETE_KILL;

		for_each_possible_cpu(i) {
			struct rtable __rcu **prt;
			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
			rt = rcu_dereference(*prt);
			if (rt)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
		}
	}

	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	if (rt->rt_gateway != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
	if (!n)
		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
	if (!IS_ERR(n)) {
		if (!(n->nud_state & NUD_VALID)) {
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res, 0) == 0) {
				struct fib_nh *nh = &FIB_RES_NH(res);

				update_or_create_fnhe(nh, fl4->daddr, new_gw,
						0, jiffies + ip_rt_gc_timeout);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     "  Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct rtable *rt;
	struct flowi4 fl4;
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct net *net = dev_net(skb->dev);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	rt = (struct rtable *) dst;

	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
	__ip_do_redirect(rt, skb, &fl4, true);
}
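
/* Invoked via dst_negative_advice() when a socket suspects its cached
 * route is misbehaving: drop routes that are obsolete, redirected or
 * carry an expiry so the socket performs a fresh lookup on the next send.
 */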
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->dst.expires) {
			ip_rt_put(rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;
	int vif;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything
	 * and set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	/* IP on this device is disabled. */
	if (!in_dev)
		goto out;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
			       l3mdev_master_ifindex(skb->dev), 1);

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
		inet_putpeer(peer);
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	struct fib_result res;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (ipv4_mtu(dst) < mtu)
		return;

	if (mtu < ip_rt_min_pmtu)
		mtu = ip_rt_min_pmtu;

	if (rt->rt_pmtu == mtu &&
	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
		return;

	rcu_read_lock();
	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
		struct fib_nh *nh = &FIB_RES_NH(res);

		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct flowi4 fl4;

	ip_rt_build_flow_key(&fl4, sk, skb);
	__ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	if (!mark)
		mark = IP4_REPLY_MARK(net, skb->mark);

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

	if (!fl4.flowi4_mark)
		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
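
/* Socket variant of the PMTU update: under the socket lock, refresh the
 * socket's cached route (revalidating or re-looking it up as needed) so
 * the new path MTU is visible to subsequent transmissions.
 */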
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct dst_entry *odst = NULL;
	bool new = false;
	struct net *net = sock_net(sk);

	bh_lock_sock(sk);

	if (!ip_sk_accept_pmtu(sk))
		goto out;

	odst = sk_dst_get(sk);

	if (sock_owned_by_user(sk) || !odst) {
		__ipv4_sk_update_pmtu(skb, sk, mtu);
		goto out;
	}

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

	rt = (struct rtable *)odst;
	if (odst->obsolete && !odst->ops->check(odst, 0)) {
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	__ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);

	if (!dst_check(&rt->dst, 0)) {
		if (new)
			dst_release(&rt->dst);

		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	if (new)
		sk_dst_set(sk, &rt->dst);

out:
	bh_unlock_sock(sk);
	dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
		   int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct net *net = sock_net(sk);

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	/* All IPV4 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 *
	 * When a PMTU/redirect information update invalidates a route,
	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
	 * DST_OBSOLETE_DEAD by dst_free().
	 */
	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
		return NULL;
	return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}

/*
 *	We do not cache the source address of the outgoing interface,
 *	because it is used only by IP RR, TS and SRR options,
 *	so that it is out of the fast path.
 *
 *	BTW remember: "addr" is allowed to be not aligned
 *	in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
				    ip_rt_min_advmss);

	return min(advmss, IPV4_MAX_PMTU - header_size);
}

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = rt->rt_pmtu;

	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu)
		return mtu;

	mtu = READ_ONCE(dst->dev->mtu);

	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
		if (rt->rt_uses_gateway && mtu > 576)
			mtu = 576;
	}

	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
	struct fib_nh_exception *fnhe;
	u32 hval;

	if (!hash)
		return NULL;

	hval = fnhe_hashfun(daddr);

	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			return fnhe;
	}
	return NULL;
}
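
/* Bind a route to a next-hop exception entry: copy the exception's
 * PMTU/gateway data into the route and, if caching is allowed, park the
 * route in the exception so later lookups can reuse it.
 */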
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr, const bool do_cache)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	if (daddr == fnhe->fnhe_daddr) {
		struct rtable __rcu **porig;
		struct rtable *orig;
		int genid = fnhe_genid(dev_net(rt->dst.dev));

		if (rt_is_input_route(rt))
			porig = &fnhe->fnhe_rth_input;
		else
			porig = &fnhe->fnhe_rth_output;
		orig = rcu_dereference(*porig);

		if (fnhe->fnhe_genid != genid) {
			fnhe->fnhe_genid = genid;
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
			fnhe_flush_routes(fnhe);
			orig = NULL;
		}
		fill_route_from_fnhe(rt, fnhe);
		if (!rt->rt_gateway)
			rt->rt_gateway = daddr;

		if (do_cache) {
			dst_hold(&rt->dst);
			rcu_assign_pointer(*porig, rt);
			if (orig) {
				dst_dev_put(&orig->dst);
				dst_release(&orig->dst);
			}
			ret = true;
		}

		fnhe->fnhe_stamp = jiffies;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}

static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nh->nh_rth_input;
	} else {
		p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
	}
	orig = *p;

	/* hold dst before doing cmpxchg() to avoid race condition
	 * on this dst
	 */
	dst_hold(&rt->dst);
	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		if (orig) {
			dst_dev_put(&orig->dst);
			dst_release(&orig->dst);
		}
	} else {
		dst_release(&rt->dst);
		ret = false;
	}

	return ret;
}

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

	rt->rt_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
	struct rtable *rt = (struct rtable *) dst;

	if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
		kfree(p);

	if (!list_empty(&rt->rt_uncached)) {
		struct uncached_list *ul = rt->rt_uncached_list;

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt_uncached);
		spin_unlock_bh(&ul->lock);
	}
}

void rt_flush_dev(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct rtable *rt;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt_uncached) {
			if (rt->dst.dev != dev)
				continue;
			rt->dst.dev = net->loopback_dev;
			dev_hold(rt->dst.dev);
			dev_put(dev);
		}
		spin_unlock_bh(&ul->lock);
	}
}

static bool rt_cache_valid(const struct rtable *rt)
{
	return	rt &&
		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
		!rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag,
			   const bool do_cache)
{
	bool cached = false;

	if (fi) {
		struct fib_nh *nh = &FIB_RES_NH(*res);

		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
			rt->rt_gateway = nh->nh_gw;
			rt->rt_uses_gateway = 1;
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
		if (fi->fib_metrics != &dst_default_metrics) {
			rt->dst._metrics |= DST_METRICS_REFCOUNTED;
			refcount_inc(&fi->fib_metrics->refcnt);
		}
#ifdef CONFIG_IP_ROUTE_CLASSID
		rt->dst.tclassid = nh->nh_tclassid;
#endif
		rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
		else if (do_cache)
			cached = rt_cache_route(nh, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in nexthop exception or
			 * FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache we really need to set it.
			 */
			if (!rt->rt_gateway)
				rt->rt_gateway = daddr;
			rt_add_uncached_list(rt);
		}
	} else
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}

struct rtable *rt_dst_alloc(struct net_device *dev,
			    unsigned int flags, u16 type,
			    bool nopolicy, bool noxfrm, bool will_cache)
{
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
		       (will_cache ? 0 : DST_HOST) |
		       (nopolicy ? DST_NOPOLICY : 0) |
		       (noxfrm ? DST_NOXFRM : 0));

	if (rt) {
		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
		rt->rt_flags = flags;
		rt->rt_type = type;
		rt->rt_is_input = 0;
		rt->rt_iif = 0;
		rt->rt_pmtu = 0;
		rt->rt_gateway = 0;
		rt->rt_uses_gateway = 0;
		INIT_LIST_HEAD(&rt->rt_uncached);

		rt->dst.output = ip_output;
		if (flags & RTCF_LOCAL)
			rt->dst.input = ip_local_deliver;
	}

	return rt;
}
EXPORT_SYMBOL(rt_dst_alloc);

/* called in rcu_read_lock() section */
int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev,
			  struct in_device *in_dev, u32 *itag)
{
	int err;

	/* Primary sanity checks. */
	if (!in_dev)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		return -EINVAL;

	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
		return -EINVAL;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			return -EINVAL;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, itag);
		if (err < 0)
			return err;
	}
	return 0;
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	unsigned int flags = RTCF_MULTICAST;
	struct rtable *rth;
	u32 itag = 0;
	int err;

	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
	if (err)
		return err;

	if (our)
		flags |= RTCF_LOCAL;

	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
	if (!rth)
		return -ENOBUFS;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;
	rth->rt_is_input= 1;

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);
	return 0;
}


static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation: if the source is martian,
		 *	the only hint is the MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, true);
		}
	}
#endif
}
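
/* Unlink and free a single next-hop exception (and any routes cached in
 * it); used when a cached exception is found to have expired.
 */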
static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference_protected(nh->nh_exceptions,
					 lockdep_is_held(&fnhe_lock));
	hash += hval;

	fnhe_p = &hash->chain;
	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
	while (fnhe) {
		if (fnhe->fnhe_daddr == daddr) {
			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
			fnhe_flush_routes(fnhe);
			kfree_rcu(fnhe, rcu);
			break;
		}
		fnhe_p = &fnhe->fnhe_next;
		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
						 lockdep_is_held(&fnhe_lock));
	}

	spin_unlock_bh(&fnhe_lock);
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct fib_nh_exception *fnhe;
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	bool do_cache;
	u32 itag = 0;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (!out_dev) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	do_cache = res->fi && !itag;
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    skb->protocol == htons(ETH_P_IP) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		IPCB(skb)->flags |= IPSKB_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
	if (do_cache) {
		if (fnhe) {
			rth = rcu_dereference(fnhe->fnhe_rth_input);
			if (rth && rth->dst.expires &&
			    time_after(jiffies, rth->dst.expires)) {
				ip_del_fnhe(&FIB_RES_NH(*res), daddr);
				fnhe = NULL;
			} else {
				goto rt_cache;
			}
		}

		rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);

rt_cache:
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_is_input = 1;
	RT_CACHE_STAT_INC(in_slow_tot);

	rth->dst.input = ip_forward;

	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
		       do_cache);
	lwtunnel_set_redirect(&rth->dst);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
cleanup:
	return err;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
/* To make ICMP packets follow the right flow, the multipath hash is
 * calculated from the inner IP addresses.
 */
static void ip_multipath_l3_keys(const struct sk_buff *skb,
				 struct flow_keys *hash_keys)
{
	const struct iphdr *outer_iph = ip_hdr(skb);
	const struct iphdr *inner_iph;
	const struct icmphdr *icmph;
	struct iphdr _inner_iph;
	struct icmphdr _icmph;

	hash_keys->addrs.v4addrs.src = outer_iph->saddr;
	hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
	if (likely(outer_iph->protocol != IPPROTO_ICMP))
		return;

	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
		return;

	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
				   &_icmph);
	if (!icmph)
		return;

	if (icmph->type != ICMP_DEST_UNREACH &&
	    icmph->type != ICMP_REDIRECT &&
	    icmph->type != ICMP_TIME_EXCEEDED &&
	    icmph->type != ICMP_PARAMETERPROB)
		return;

	inner_iph = skb_header_pointer(skb,
				       outer_iph->ihl * 4 + sizeof(_icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		return;
	hash_keys->addrs.v4addrs.src = inner_iph->saddr;
	hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
}
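
/* Multipath hash policy 0 hashes on the L3 addresses (using the inner
 * header for ICMP errors, see above); policy 1 hashes on the full
 * 5-tuple when an skb or flow key supplies ports and protocol.
 */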
/* if skb is set it will be used and fl4 can be NULL */
int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
		       const struct sk_buff *skb)
{
	struct net *net = fi->fib_net;
	struct flow_keys hash_keys;
	u32 mhash;

	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
		if (skb) {
			ip_multipath_l3_keys(skb, &hash_keys);
		} else {
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
		}
		break;
	case 1:
		/* skb is currently provided only when forwarding */
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;
			memset(&hash_keys, 0, sizeof(hash_keys));
			skb_flow_dissect_flow_keys(skb, &keys, flag);

			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
			hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
			hash_keys.ports.src = keys.ports.src;
			hash_keys.ports.dst = keys.ports.dst;
			hash_keys.basic.ip_proto = keys.basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
			hash_keys.ports.src = fl4->fl4_sport;
			hash_keys.ports.dst = fl4->fl4_dport;
			hash_keys.basic.ip_proto = fl4->flowi4_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH */

static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1) {
		int h = fib_multipath_hash(res->fi, NULL, skb);

		fib_select_multipath(res, h);
	}
#endif

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}

/*
 *	NOTE. We drop all the packets that have local source
 *	addresses, because every properly looped back packet
 *	must have the correct destination already attached by the output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with 100% guarantee.
 *	called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev,
			       struct fib_result *res)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct ip_tunnel_info *tun_info;
	struct flowi4	fl4;
	unsigned int	flags = 0;
	u32		itag = 0;
	struct rtable	*rth;
	int		err = -EINVAL;
	struct net    *net = dev_net(dev);
	bool do_cache;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which cannot be detected
	   by fib_lookup.
	 */

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
	else
		fl4.flowi4_tun_key.tun_id = 0;
	skb_dst_drop(skb);

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	res->fi = NULL;
	res->table = NULL;
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I am not even sure whether to fix this or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
	 * and calls it only once if daddr and/or saddr are loopback addresses
	 */
	if (ipv4_is_loopback(daddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_destination;
	} else if (ipv4_is_loopback(saddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route the packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.flowi4_flags = 0;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	fl4.flowi4_uid = sock_net_uid(net, NULL);
	err = fib_lookup(net, &fl4, res, 0);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			err = -EHOSTUNREACH;
		goto no_route;
	}

	if (res->type == RTN_BROADCAST)
		goto brd_input;

	if (res->type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  0, dev, in_dev, &itag);
		if (err < 0)
			goto martian_source;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev)) {
		err = -EHOSTUNREACH;
		goto no_route;
	}
	if (res->type != RTN_UNICAST)
		goto martian_destination;

	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source;
	}
	flags |= RTCF_BROADCAST;
	res->type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	do_cache = false;
	if (res->fi) {
		if (!itag) {
			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
			if (rt_cache_valid(rth)) {
				skb_dst_set_noref(skb, &rth->dst);
				err = 0;
				goto out;
			}
			do_cache = true;
		}
	}

	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
			   flags | RTCF_LOCAL, res->type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
	if (!rth)
		goto e_nobufs;

	rth->dst.output= ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_is_input = 1;

	RT_CACHE_STAT_INC(in_slow_tot);
	if (res->type == RTN_UNREACHABLE) {
		rth->dst.input= ip_error;
		rth->dst.error= -err;
		rth->rt_flags 	&= ~RTCF_LOCAL;
	}

	if (do_cache) {
		struct fib_nh *nh = &FIB_RES_NH(*res);

		rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
			WARN_ON(rth->dst.input == lwtunnel_input);
			rth->dst.lwtstate->orig_input = rth->dst.input;
			rth->dst.input = lwtunnel_input;
		}

		if (unlikely(!rt_cache_route(nh, rth)))
			rt_add_uncached_list(rth);
	}
	skb_dst_set(skb, &rth->dst);
	err = 0;
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res->type = RTN_UNREACHABLE;
	res->fi = NULL;
	res->table = NULL;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
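
/* Entry point for input route resolution.  Runs the lookup under RCU and
 * does not take a reference on the resulting dst, hence the _noref name.
 */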
rcu_read_lock(); 2067 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res); 2068 rcu_read_unlock(); 2069 2070 return err; 2071 } 2072 EXPORT_SYMBOL(ip_route_input_noref); 2073 2074 /* called with rcu_read_lock held */ 2075 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2076 u8 tos, struct net_device *dev, struct fib_result *res) 2077 { 2078 /* Multicast recognition logic is moved from route cache to here. 2079 The problem was that too many Ethernet cards have broken/missing 2080 hardware multicast filters :-( As result the host on multicasting 2081 network acquires a lot of useless route cache entries, sort of 2082 SDR messages from all the world. Now we try to get rid of them. 2083 Really, provided software IP multicast filter is organized 2084 reasonably (at least, hashed), it does not result in a slowdown 2085 comparing with route cache reject entries. 2086 Note, that multicast routers are not affected, because 2087 route cache entry is created eventually. 2088 */ 2089 if (ipv4_is_multicast(daddr)) { 2090 struct in_device *in_dev = __in_dev_get_rcu(dev); 2091 int our = 0; 2092 int err = -EINVAL; 2093 2094 if (in_dev) 2095 our = ip_check_mc_rcu(in_dev, daddr, saddr, 2096 ip_hdr(skb)->protocol); 2097 2098 /* check l3 master if no match yet */ 2099 if ((!in_dev || !our) && netif_is_l3_slave(dev)) { 2100 struct in_device *l3_in_dev; 2101 2102 l3_in_dev = __in_dev_get_rcu(skb->dev); 2103 if (l3_in_dev) 2104 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr, 2105 ip_hdr(skb)->protocol); 2106 } 2107 2108 if (our 2109 #ifdef CONFIG_IP_MROUTE 2110 || 2111 (!ipv4_is_local_multicast(daddr) && 2112 IN_DEV_MFORWARD(in_dev)) 2113 #endif 2114 ) { 2115 err = ip_route_input_mc(skb, daddr, saddr, 2116 tos, dev, our); 2117 } 2118 return err; 2119 } 2120 2121 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res); 2122 } 2123 2124 /* called with rcu_read_lock() */ 2125 static struct rtable *__mkroute_output(const struct fib_result *res, 2126 const struct flowi4 *fl4, int orig_oif, 2127 struct net_device *dev_out, 2128 unsigned int flags) 2129 { 2130 struct fib_info *fi = res->fi; 2131 struct fib_nh_exception *fnhe; 2132 struct in_device *in_dev; 2133 u16 type = res->type; 2134 struct rtable *rth; 2135 bool do_cache; 2136 2137 in_dev = __in_dev_get_rcu(dev_out); 2138 if (!in_dev) 2139 return ERR_PTR(-EINVAL); 2140 2141 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) 2142 if (ipv4_is_loopback(fl4->saddr) && 2143 !(dev_out->flags & IFF_LOOPBACK) && 2144 !netif_is_l3_master(dev_out)) 2145 return ERR_PTR(-EINVAL); 2146 2147 if (ipv4_is_lbcast(fl4->daddr)) 2148 type = RTN_BROADCAST; 2149 else if (ipv4_is_multicast(fl4->daddr)) 2150 type = RTN_MULTICAST; 2151 else if (ipv4_is_zeronet(fl4->daddr)) 2152 return ERR_PTR(-EINVAL); 2153 2154 if (dev_out->flags & IFF_LOOPBACK) 2155 flags |= RTCF_LOCAL; 2156 2157 do_cache = true; 2158 if (type == RTN_BROADCAST) { 2159 flags |= RTCF_BROADCAST | RTCF_LOCAL; 2160 fi = NULL; 2161 } else if (type == RTN_MULTICAST) { 2162 flags |= RTCF_MULTICAST | RTCF_LOCAL; 2163 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr, 2164 fl4->flowi4_proto)) 2165 flags &= ~RTCF_LOCAL; 2166 else 2167 do_cache = false; 2168 /* If multicast route do not exist use 2169 * default one, but do not gateway in this case. 2170 * Yes, it is hack. 
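		 * A prefix length below 4 means the lookup matched something
		 * broader than 224.0.0.0/4, i.e. only a default-like route,
		 * so its nexthop information is discarded below.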
2171 */ 2172 if (fi && res->prefixlen < 4) 2173 fi = NULL; 2174 } else if ((type == RTN_LOCAL) && (orig_oif != 0) && 2175 (orig_oif != dev_out->ifindex)) { 2176 /* For local routes that require a particular output interface 2177 * we do not want to cache the result. Caching the result 2178 * causes incorrect behaviour when there are multiple source 2179 * addresses on the interface, the end result being that if the 2180 * intended recipient is waiting on that interface for the 2181 * packet he won't receive it because it will be delivered on 2182 * the loopback interface and the IP_PKTINFO ipi_ifindex will 2183 * be set to the loopback interface as well. 2184 */ 2185 fi = NULL; 2186 } 2187 2188 fnhe = NULL; 2189 do_cache &= fi != NULL; 2190 if (do_cache) { 2191 struct rtable __rcu **prth; 2192 struct fib_nh *nh = &FIB_RES_NH(*res); 2193 2194 fnhe = find_exception(nh, fl4->daddr); 2195 if (fnhe) { 2196 prth = &fnhe->fnhe_rth_output; 2197 rth = rcu_dereference(*prth); 2198 if (rth && rth->dst.expires && 2199 time_after(jiffies, rth->dst.expires)) { 2200 ip_del_fnhe(nh, fl4->daddr); 2201 fnhe = NULL; 2202 } else { 2203 goto rt_cache; 2204 } 2205 } 2206 2207 if (unlikely(fl4->flowi4_flags & 2208 FLOWI_FLAG_KNOWN_NH && 2209 !(nh->nh_gw && 2210 nh->nh_scope == RT_SCOPE_LINK))) { 2211 do_cache = false; 2212 goto add; 2213 } 2214 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output); 2215 rth = rcu_dereference(*prth); 2216 2217 rt_cache: 2218 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst)) 2219 return rth; 2220 } 2221 2222 add: 2223 rth = rt_dst_alloc(dev_out, flags, type, 2224 IN_DEV_CONF_GET(in_dev, NOPOLICY), 2225 IN_DEV_CONF_GET(in_dev, NOXFRM), 2226 do_cache); 2227 if (!rth) 2228 return ERR_PTR(-ENOBUFS); 2229 2230 rth->rt_iif = orig_oif; 2231 2232 RT_CACHE_STAT_INC(out_slow_tot); 2233 2234 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { 2235 if (flags & RTCF_LOCAL && 2236 !(dev_out->flags & IFF_LOOPBACK)) { 2237 rth->dst.output = ip_mc_output; 2238 RT_CACHE_STAT_INC(out_slow_mc); 2239 } 2240 #ifdef CONFIG_IP_MROUTE 2241 if (type == RTN_MULTICAST) { 2242 if (IN_DEV_MFORWARD(in_dev) && 2243 !ipv4_is_local_multicast(fl4->daddr)) { 2244 rth->dst.input = ip_mr_input; 2245 rth->dst.output = ip_mc_output; 2246 } 2247 } 2248 #endif 2249 } 2250 2251 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache); 2252 lwtunnel_set_redirect(&rth->dst); 2253 2254 return rth; 2255 } 2256 2257 /* 2258 * Major route resolver routine. 2259 */ 2260 2261 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, 2262 const struct sk_buff *skb) 2263 { 2264 __u8 tos = RT_FL_TOS(fl4); 2265 struct fib_result res; 2266 struct rtable *rth; 2267 2268 res.tclassid = 0; 2269 res.fi = NULL; 2270 res.table = NULL; 2271 2272 fl4->flowi4_iif = LOOPBACK_IFINDEX; 2273 fl4->flowi4_tos = tos & IPTOS_RT_MASK; 2274 fl4->flowi4_scope = ((tos & RTO_ONLINK) ? 
2275 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); 2276 2277 rcu_read_lock(); 2278 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb); 2279 rcu_read_unlock(); 2280 2281 return rth; 2282 } 2283 EXPORT_SYMBOL_GPL(ip_route_output_key_hash); 2284 2285 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, 2286 struct fib_result *res, 2287 const struct sk_buff *skb) 2288 { 2289 struct net_device *dev_out = NULL; 2290 int orig_oif = fl4->flowi4_oif; 2291 unsigned int flags = 0; 2292 struct rtable *rth; 2293 int err = -ENETUNREACH; 2294 2295 if (fl4->saddr) { 2296 rth = ERR_PTR(-EINVAL); 2297 if (ipv4_is_multicast(fl4->saddr) || 2298 ipv4_is_lbcast(fl4->saddr) || 2299 ipv4_is_zeronet(fl4->saddr)) 2300 goto out; 2301 2302 /* I removed check for oif == dev_out->oif here. 2303 It was wrong for two reasons: 2304 1. ip_dev_find(net, saddr) can return wrong iface, if saddr 2305 is assigned to multiple interfaces. 2306 2. Moreover, we are allowed to send packets with saddr 2307 of another iface. --ANK 2308 */ 2309 2310 if (fl4->flowi4_oif == 0 && 2311 (ipv4_is_multicast(fl4->daddr) || 2312 ipv4_is_lbcast(fl4->daddr))) { 2313 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2314 dev_out = __ip_dev_find(net, fl4->saddr, false); 2315 if (!dev_out) 2316 goto out; 2317 2318 /* Special hack: user can direct multicasts 2319 and limited broadcast via necessary interface 2320 without fiddling with IP_MULTICAST_IF or IP_PKTINFO. 2321 This hack is not just for fun, it allows 2322 vic,vat and friends to work. 2323 They bind socket to loopback, set ttl to zero 2324 and expect that it will work. 2325 From the viewpoint of routing cache they are broken, 2326 because we are not allowed to build multicast path 2327 with loopback source addr (look, routing cache 2328 cannot know, that ttl is zero, so that packet 2329 will not leave this host and route is valid). 2330 Luckily, this hack is good workaround. 2331 */ 2332 2333 fl4->flowi4_oif = dev_out->ifindex; 2334 goto make_route; 2335 } 2336 2337 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) { 2338 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2339 if (!__ip_dev_find(net, fl4->saddr, false)) 2340 goto out; 2341 } 2342 } 2343 2344 2345 if (fl4->flowi4_oif) { 2346 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif); 2347 rth = ERR_PTR(-ENODEV); 2348 if (!dev_out) 2349 goto out; 2350 2351 /* RACE: Check return value of inet_select_addr instead. 
*/ 2352 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) { 2353 rth = ERR_PTR(-ENETUNREACH); 2354 goto out; 2355 } 2356 if (ipv4_is_local_multicast(fl4->daddr) || 2357 ipv4_is_lbcast(fl4->daddr) || 2358 fl4->flowi4_proto == IPPROTO_IGMP) { 2359 if (!fl4->saddr) 2360 fl4->saddr = inet_select_addr(dev_out, 0, 2361 RT_SCOPE_LINK); 2362 goto make_route; 2363 } 2364 if (!fl4->saddr) { 2365 if (ipv4_is_multicast(fl4->daddr)) 2366 fl4->saddr = inet_select_addr(dev_out, 0, 2367 fl4->flowi4_scope); 2368 else if (!fl4->daddr) 2369 fl4->saddr = inet_select_addr(dev_out, 0, 2370 RT_SCOPE_HOST); 2371 } 2372 } 2373 2374 if (!fl4->daddr) { 2375 fl4->daddr = fl4->saddr; 2376 if (!fl4->daddr) 2377 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); 2378 dev_out = net->loopback_dev; 2379 fl4->flowi4_oif = LOOPBACK_IFINDEX; 2380 res->type = RTN_LOCAL; 2381 flags |= RTCF_LOCAL; 2382 goto make_route; 2383 } 2384 2385 err = fib_lookup(net, fl4, res, 0); 2386 if (err) { 2387 res->fi = NULL; 2388 res->table = NULL; 2389 if (fl4->flowi4_oif && 2390 (ipv4_is_multicast(fl4->daddr) || 2391 !netif_index_is_l3_master(net, fl4->flowi4_oif))) { 2392 /* Apparently, routing tables are wrong. Assume, 2393 that the destination is on link. 2394 2395 WHY? DW. 2396 Because we are allowed to send to iface 2397 even if it has NO routes and NO assigned 2398 addresses. When oif is specified, routing 2399 tables are looked up with only one purpose: 2400 to catch if destination is gatewayed, rather than 2401 direct. Moreover, if MSG_DONTROUTE is set, 2402 we send packet, ignoring both routing tables 2403 and ifaddr state. --ANK 2404 2405 2406 We could make it even if oif is unknown, 2407 likely IPv6, but we do not. 2408 */ 2409 2410 if (fl4->saddr == 0) 2411 fl4->saddr = inet_select_addr(dev_out, 0, 2412 RT_SCOPE_LINK); 2413 res->type = RTN_UNICAST; 2414 goto make_route; 2415 } 2416 rth = ERR_PTR(err); 2417 goto out; 2418 } 2419 2420 if (res->type == RTN_LOCAL) { 2421 if (!fl4->saddr) { 2422 if (res->fi->fib_prefsrc) 2423 fl4->saddr = res->fi->fib_prefsrc; 2424 else 2425 fl4->saddr = fl4->daddr; 2426 } 2427 2428 /* L3 master device is the loopback for that domain */ 2429 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? : 2430 net->loopback_dev; 2431 2432 /* make sure orig_oif points to fib result device even 2433 * though packet rx/tx happens over loopback or l3mdev 2434 */ 2435 orig_oif = FIB_RES_OIF(*res); 2436 2437 fl4->flowi4_oif = dev_out->ifindex; 2438 flags |= RTCF_LOCAL; 2439 goto make_route; 2440 } 2441 2442 fib_select_path(net, res, fl4, skb); 2443 2444 dev_out = FIB_RES_DEV(*res); 2445 fl4->flowi4_oif = dev_out->ifindex; 2446 2447 2448 make_route: 2449 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags); 2450 2451 out: 2452 return rth; 2453 } 2454 2455 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) 2456 { 2457 return NULL; 2458 } 2459 2460 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst) 2461 { 2462 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); 2463 2464 return mtu ? 
: dst->dev->mtu; 2465 } 2466 2467 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, 2468 struct sk_buff *skb, u32 mtu) 2469 { 2470 } 2471 2472 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, 2473 struct sk_buff *skb) 2474 { 2475 } 2476 2477 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst, 2478 unsigned long old) 2479 { 2480 return NULL; 2481 } 2482 2483 static struct dst_ops ipv4_dst_blackhole_ops = { 2484 .family = AF_INET, 2485 .check = ipv4_blackhole_dst_check, 2486 .mtu = ipv4_blackhole_mtu, 2487 .default_advmss = ipv4_default_advmss, 2488 .update_pmtu = ipv4_rt_blackhole_update_pmtu, 2489 .redirect = ipv4_rt_blackhole_redirect, 2490 .cow_metrics = ipv4_rt_blackhole_cow_metrics, 2491 .neigh_lookup = ipv4_neigh_lookup, 2492 }; 2493 2494 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2495 { 2496 struct rtable *ort = (struct rtable *) dst_orig; 2497 struct rtable *rt; 2498 2499 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0); 2500 if (rt) { 2501 struct dst_entry *new = &rt->dst; 2502 2503 new->__use = 1; 2504 new->input = dst_discard; 2505 new->output = dst_discard_out; 2506 2507 new->dev = net->loopback_dev; 2508 if (new->dev) 2509 dev_hold(new->dev); 2510 2511 rt->rt_is_input = ort->rt_is_input; 2512 rt->rt_iif = ort->rt_iif; 2513 rt->rt_pmtu = ort->rt_pmtu; 2514 2515 rt->rt_genid = rt_genid_ipv4(net); 2516 rt->rt_flags = ort->rt_flags; 2517 rt->rt_type = ort->rt_type; 2518 rt->rt_gateway = ort->rt_gateway; 2519 rt->rt_uses_gateway = ort->rt_uses_gateway; 2520 2521 INIT_LIST_HEAD(&rt->rt_uncached); 2522 } 2523 2524 dst_release(dst_orig); 2525 2526 return rt ? &rt->dst : ERR_PTR(-ENOMEM); 2527 } 2528 2529 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, 2530 const struct sock *sk) 2531 { 2532 struct rtable *rt = __ip_route_output_key(net, flp4); 2533 2534 if (IS_ERR(rt)) 2535 return rt; 2536 2537 if (flp4->flowi4_proto) 2538 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst, 2539 flowi4_to_flowi(flp4), 2540 sk, 0); 2541 2542 return rt; 2543 } 2544 EXPORT_SYMBOL_GPL(ip_route_output_flow); 2545 2546 /* called with rcu_read_lock held */ 2547 static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id, 2548 struct flowi4 *fl4, struct sk_buff *skb, u32 portid, 2549 u32 seq) 2550 { 2551 struct rtable *rt = skb_rtable(skb); 2552 struct rtmsg *r; 2553 struct nlmsghdr *nlh; 2554 unsigned long expires = 0; 2555 u32 error; 2556 u32 metrics[RTAX_MAX]; 2557 2558 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0); 2559 if (!nlh) 2560 return -EMSGSIZE; 2561 2562 r = nlmsg_data(nlh); 2563 r->rtm_family = AF_INET; 2564 r->rtm_dst_len = 32; 2565 r->rtm_src_len = 0; 2566 r->rtm_tos = fl4->flowi4_tos; 2567 r->rtm_table = table_id < 256 ? 
table_id : RT_TABLE_COMPAT; 2568 if (nla_put_u32(skb, RTA_TABLE, table_id)) 2569 goto nla_put_failure; 2570 r->rtm_type = rt->rt_type; 2571 r->rtm_scope = RT_SCOPE_UNIVERSE; 2572 r->rtm_protocol = RTPROT_UNSPEC; 2573 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; 2574 if (rt->rt_flags & RTCF_NOTIFY) 2575 r->rtm_flags |= RTM_F_NOTIFY; 2576 if (IPCB(skb)->flags & IPSKB_DOREDIRECT) 2577 r->rtm_flags |= RTCF_DOREDIRECT; 2578 2579 if (nla_put_in_addr(skb, RTA_DST, dst)) 2580 goto nla_put_failure; 2581 if (src) { 2582 r->rtm_src_len = 32; 2583 if (nla_put_in_addr(skb, RTA_SRC, src)) 2584 goto nla_put_failure; 2585 } 2586 if (rt->dst.dev && 2587 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) 2588 goto nla_put_failure; 2589 #ifdef CONFIG_IP_ROUTE_CLASSID 2590 if (rt->dst.tclassid && 2591 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) 2592 goto nla_put_failure; 2593 #endif 2594 if (!rt_is_input_route(rt) && 2595 fl4->saddr != src) { 2596 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr)) 2597 goto nla_put_failure; 2598 } 2599 if (rt->rt_uses_gateway && 2600 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway)) 2601 goto nla_put_failure; 2602 2603 expires = rt->dst.expires; 2604 if (expires) { 2605 unsigned long now = jiffies; 2606 2607 if (time_before(now, expires)) 2608 expires -= now; 2609 else 2610 expires = 0; 2611 } 2612 2613 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); 2614 if (rt->rt_pmtu && expires) 2615 metrics[RTAX_MTU - 1] = rt->rt_pmtu; 2616 if (rtnetlink_put_metrics(skb, metrics) < 0) 2617 goto nla_put_failure; 2618 2619 if (fl4->flowi4_mark && 2620 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) 2621 goto nla_put_failure; 2622 2623 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) && 2624 nla_put_u32(skb, RTA_UID, 2625 from_kuid_munged(current_user_ns(), fl4->flowi4_uid))) 2626 goto nla_put_failure; 2627 2628 error = rt->dst.error; 2629 2630 if (rt_is_input_route(rt)) { 2631 #ifdef CONFIG_IP_MROUTE 2632 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && 2633 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { 2634 int err = ipmr_get_route(net, skb, 2635 fl4->saddr, fl4->daddr, 2636 r, portid); 2637 2638 if (err <= 0) { 2639 if (err == 0) 2640 return 0; 2641 goto nla_put_failure; 2642 } 2643 } else 2644 #endif 2645 if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex)) 2646 goto nla_put_failure; 2647 } 2648 2649 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) 2650 goto nla_put_failure; 2651 2652 nlmsg_end(skb, nlh); 2653 return 0; 2654 2655 nla_put_failure: 2656 nlmsg_cancel(skb, nlh); 2657 return -EMSGSIZE; 2658 } 2659 2660 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 2661 struct netlink_ext_ack *extack) 2662 { 2663 struct net *net = sock_net(in_skb->sk); 2664 struct rtmsg *rtm; 2665 struct nlattr *tb[RTA_MAX+1]; 2666 struct fib_result res = {}; 2667 struct rtable *rt = NULL; 2668 struct flowi4 fl4; 2669 __be32 dst = 0; 2670 __be32 src = 0; 2671 u32 iif; 2672 int err; 2673 int mark; 2674 struct sk_buff *skb; 2675 u32 table_id = RT_TABLE_MAIN; 2676 kuid_t uid; 2677 2678 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy, 2679 extack); 2680 if (err < 0) 2681 goto errout; 2682 2683 rtm = nlmsg_data(nlh); 2684 2685 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 2686 if (!skb) { 2687 err = -ENOBUFS; 2688 goto errout; 2689 } 2690 2691 /* Reserve room for dummy headers, this skb can pass 2692 through good chunk of routing engine. 
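	   This synthetic skb answers an RTM_GETROUTE request (what
	   "ip route get" sends), so only a minimal IP header is written
	   into it further down before the route lookup.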
2693 */ 2694 skb_reset_mac_header(skb); 2695 skb_reset_network_header(skb); 2696 2697 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0; 2698 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0; 2699 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; 2700 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; 2701 if (tb[RTA_UID]) 2702 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID])); 2703 else 2704 uid = (iif ? INVALID_UID : current_uid()); 2705 2706 /* Bugfix: need to give ip_route_input enough of an IP header to 2707 * not gag. 2708 */ 2709 ip_hdr(skb)->protocol = IPPROTO_UDP; 2710 ip_hdr(skb)->saddr = src; 2711 ip_hdr(skb)->daddr = dst; 2712 2713 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr)); 2714 2715 memset(&fl4, 0, sizeof(fl4)); 2716 fl4.daddr = dst; 2717 fl4.saddr = src; 2718 fl4.flowi4_tos = rtm->rtm_tos; 2719 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; 2720 fl4.flowi4_mark = mark; 2721 fl4.flowi4_uid = uid; 2722 2723 rcu_read_lock(); 2724 2725 if (iif) { 2726 struct net_device *dev; 2727 2728 dev = dev_get_by_index_rcu(net, iif); 2729 if (!dev) { 2730 err = -ENODEV; 2731 goto errout_free; 2732 } 2733 2734 skb->protocol = htons(ETH_P_IP); 2735 skb->dev = dev; 2736 skb->mark = mark; 2737 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos, 2738 dev, &res); 2739 2740 rt = skb_rtable(skb); 2741 if (err == 0 && rt->dst.error) 2742 err = -rt->dst.error; 2743 } else { 2744 fl4.flowi4_iif = LOOPBACK_IFINDEX; 2745 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb); 2746 err = 0; 2747 if (IS_ERR(rt)) 2748 err = PTR_ERR(rt); 2749 else 2750 skb_dst_set(skb, &rt->dst); 2751 } 2752 2753 if (err) 2754 goto errout_free; 2755 2756 if (rtm->rtm_flags & RTM_F_NOTIFY) 2757 rt->rt_flags |= RTCF_NOTIFY; 2758 2759 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE) 2760 table_id = res.table ? 
res.table->tb_id : 0; 2761 2762 if (rtm->rtm_flags & RTM_F_FIB_MATCH) { 2763 if (!res.fi) { 2764 err = fib_props[res.type].error; 2765 if (!err) 2766 err = -EHOSTUNREACH; 2767 goto errout_free; 2768 } 2769 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid, 2770 nlh->nlmsg_seq, RTM_NEWROUTE, table_id, 2771 rt->rt_type, res.prefix, res.prefixlen, 2772 fl4.flowi4_tos, res.fi, 0); 2773 } else { 2774 err = rt_fill_info(net, dst, src, table_id, &fl4, skb, 2775 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq); 2776 } 2777 if (err < 0) 2778 goto errout_free; 2779 2780 rcu_read_unlock(); 2781 2782 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 2783 errout: 2784 return err; 2785 2786 errout_free: 2787 rcu_read_unlock(); 2788 kfree_skb(skb); 2789 goto errout; 2790 } 2791 2792 void ip_rt_multicast_event(struct in_device *in_dev) 2793 { 2794 rt_cache_flush(dev_net(in_dev->dev)); 2795 } 2796 2797 #ifdef CONFIG_SYSCTL 2798 static int ip_rt_gc_interval __read_mostly = 60 * HZ; 2799 static int ip_rt_gc_min_interval __read_mostly = HZ / 2; 2800 static int ip_rt_gc_elasticity __read_mostly = 8; 2801 2802 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write, 2803 void __user *buffer, 2804 size_t *lenp, loff_t *ppos) 2805 { 2806 struct net *net = (struct net *)__ctl->extra1; 2807 2808 if (write) { 2809 rt_cache_flush(net); 2810 fnhe_genid_bump(net); 2811 return 0; 2812 } 2813 2814 return -EINVAL; 2815 } 2816 2817 static struct ctl_table ipv4_route_table[] = { 2818 { 2819 .procname = "gc_thresh", 2820 .data = &ipv4_dst_ops.gc_thresh, 2821 .maxlen = sizeof(int), 2822 .mode = 0644, 2823 .proc_handler = proc_dointvec, 2824 }, 2825 { 2826 .procname = "max_size", 2827 .data = &ip_rt_max_size, 2828 .maxlen = sizeof(int), 2829 .mode = 0644, 2830 .proc_handler = proc_dointvec, 2831 }, 2832 { 2833 /* Deprecated. 
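		   Both entries update ip_rt_gc_min_interval; this one is
		   interpreted in jiffies, the _ms variant in milliseconds.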
Use gc_min_interval_ms */ 2834 2835 .procname = "gc_min_interval", 2836 .data = &ip_rt_gc_min_interval, 2837 .maxlen = sizeof(int), 2838 .mode = 0644, 2839 .proc_handler = proc_dointvec_jiffies, 2840 }, 2841 { 2842 .procname = "gc_min_interval_ms", 2843 .data = &ip_rt_gc_min_interval, 2844 .maxlen = sizeof(int), 2845 .mode = 0644, 2846 .proc_handler = proc_dointvec_ms_jiffies, 2847 }, 2848 { 2849 .procname = "gc_timeout", 2850 .data = &ip_rt_gc_timeout, 2851 .maxlen = sizeof(int), 2852 .mode = 0644, 2853 .proc_handler = proc_dointvec_jiffies, 2854 }, 2855 { 2856 .procname = "gc_interval", 2857 .data = &ip_rt_gc_interval, 2858 .maxlen = sizeof(int), 2859 .mode = 0644, 2860 .proc_handler = proc_dointvec_jiffies, 2861 }, 2862 { 2863 .procname = "redirect_load", 2864 .data = &ip_rt_redirect_load, 2865 .maxlen = sizeof(int), 2866 .mode = 0644, 2867 .proc_handler = proc_dointvec, 2868 }, 2869 { 2870 .procname = "redirect_number", 2871 .data = &ip_rt_redirect_number, 2872 .maxlen = sizeof(int), 2873 .mode = 0644, 2874 .proc_handler = proc_dointvec, 2875 }, 2876 { 2877 .procname = "redirect_silence", 2878 .data = &ip_rt_redirect_silence, 2879 .maxlen = sizeof(int), 2880 .mode = 0644, 2881 .proc_handler = proc_dointvec, 2882 }, 2883 { 2884 .procname = "error_cost", 2885 .data = &ip_rt_error_cost, 2886 .maxlen = sizeof(int), 2887 .mode = 0644, 2888 .proc_handler = proc_dointvec, 2889 }, 2890 { 2891 .procname = "error_burst", 2892 .data = &ip_rt_error_burst, 2893 .maxlen = sizeof(int), 2894 .mode = 0644, 2895 .proc_handler = proc_dointvec, 2896 }, 2897 { 2898 .procname = "gc_elasticity", 2899 .data = &ip_rt_gc_elasticity, 2900 .maxlen = sizeof(int), 2901 .mode = 0644, 2902 .proc_handler = proc_dointvec, 2903 }, 2904 { 2905 .procname = "mtu_expires", 2906 .data = &ip_rt_mtu_expires, 2907 .maxlen = sizeof(int), 2908 .mode = 0644, 2909 .proc_handler = proc_dointvec_jiffies, 2910 }, 2911 { 2912 .procname = "min_pmtu", 2913 .data = &ip_rt_min_pmtu, 2914 .maxlen = sizeof(int), 2915 .mode = 0644, 2916 .proc_handler = proc_dointvec, 2917 }, 2918 { 2919 .procname = "min_adv_mss", 2920 .data = &ip_rt_min_advmss, 2921 .maxlen = sizeof(int), 2922 .mode = 0644, 2923 .proc_handler = proc_dointvec, 2924 }, 2925 { } 2926 }; 2927 2928 static struct ctl_table ipv4_route_flush_table[] = { 2929 { 2930 .procname = "flush", 2931 .maxlen = sizeof(int), 2932 .mode = 0200, 2933 .proc_handler = ipv4_sysctl_rtcache_flush, 2934 }, 2935 { }, 2936 }; 2937 2938 static __net_init int sysctl_route_net_init(struct net *net) 2939 { 2940 struct ctl_table *tbl; 2941 2942 tbl = ipv4_route_flush_table; 2943 if (!net_eq(net, &init_net)) { 2944 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL); 2945 if (!tbl) 2946 goto err_dup; 2947 2948 /* Don't export sysctls to unprivileged users */ 2949 if (net->user_ns != &init_user_ns) 2950 tbl[0].procname = NULL; 2951 } 2952 tbl[0].extra1 = net; 2953 2954 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl); 2955 if (!net->ipv4.route_hdr) 2956 goto err_reg; 2957 return 0; 2958 2959 err_reg: 2960 if (tbl != ipv4_route_flush_table) 2961 kfree(tbl); 2962 err_dup: 2963 return -ENOMEM; 2964 } 2965 2966 static __net_exit void sysctl_route_net_exit(struct net *net) 2967 { 2968 struct ctl_table *tbl; 2969 2970 tbl = net->ipv4.route_hdr->ctl_table_arg; 2971 unregister_net_sysctl_table(net->ipv4.route_hdr); 2972 BUG_ON(tbl == ipv4_route_flush_table); 2973 kfree(tbl); 2974 } 2975 2976 static __net_initdata struct pernet_operations sysctl_route_ops = { 2977 .init = 
sysctl_route_net_init, 2978 .exit = sysctl_route_net_exit, 2979 .async = true, 2980 }; 2981 #endif 2982 2983 static __net_init int rt_genid_init(struct net *net) 2984 { 2985 atomic_set(&net->ipv4.rt_genid, 0); 2986 atomic_set(&net->fnhe_genid, 0); 2987 atomic_set(&net->ipv4.dev_addr_genid, get_random_int()); 2988 return 0; 2989 } 2990 2991 static __net_initdata struct pernet_operations rt_genid_ops = { 2992 .init = rt_genid_init, 2993 .async = true, 2994 }; 2995 2996 static int __net_init ipv4_inetpeer_init(struct net *net) 2997 { 2998 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); 2999 3000 if (!bp) 3001 return -ENOMEM; 3002 inet_peer_base_init(bp); 3003 net->ipv4.peers = bp; 3004 return 0; 3005 } 3006 3007 static void __net_exit ipv4_inetpeer_exit(struct net *net) 3008 { 3009 struct inet_peer_base *bp = net->ipv4.peers; 3010 3011 net->ipv4.peers = NULL; 3012 inetpeer_invalidate_tree(bp); 3013 kfree(bp); 3014 } 3015 3016 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = { 3017 .init = ipv4_inetpeer_init, 3018 .exit = ipv4_inetpeer_exit, 3019 .async = true, 3020 }; 3021 3022 #ifdef CONFIG_IP_ROUTE_CLASSID 3023 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; 3024 #endif /* CONFIG_IP_ROUTE_CLASSID */ 3025 3026 int __init ip_rt_init(void) 3027 { 3028 int cpu; 3029 3030 ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL); 3031 if (!ip_idents) 3032 panic("IP: failed to allocate ip_idents\n"); 3033 3034 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); 3035 3036 ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL); 3037 if (!ip_tstamps) 3038 panic("IP: failed to allocate ip_tstamps\n"); 3039 3040 for_each_possible_cpu(cpu) { 3041 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); 3042 3043 INIT_LIST_HEAD(&ul->head); 3044 spin_lock_init(&ul->lock); 3045 } 3046 #ifdef CONFIG_IP_ROUTE_CLASSID 3047 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); 3048 if (!ip_rt_acct) 3049 panic("IP: failed to allocate ip_rt_acct\n"); 3050 #endif 3051 3052 ipv4_dst_ops.kmem_cachep = 3053 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0, 3054 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 3055 3056 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; 3057 3058 if (dst_entries_init(&ipv4_dst_ops) < 0) 3059 panic("IP: failed to allocate ipv4_dst_ops counter\n"); 3060 3061 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) 3062 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); 3063 3064 ipv4_dst_ops.gc_thresh = ~0; 3065 ip_rt_max_size = INT_MAX; 3066 3067 devinet_init(); 3068 ip_fib_init(); 3069 3070 if (ip_rt_proc_init()) 3071 pr_err("Unable to create route proc files\n"); 3072 #ifdef CONFIG_XFRM 3073 xfrm_init(); 3074 xfrm4_init(); 3075 #endif 3076 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, 3077 RTNL_FLAG_DOIT_UNLOCKED); 3078 3079 #ifdef CONFIG_SYSCTL 3080 register_pernet_subsys(&sysctl_route_ops); 3081 #endif 3082 register_pernet_subsys(&rt_genid_ops); 3083 register_pernet_subsys(&ipv4_inetpeer_ops); 3084 return 0; 3085 } 3086 3087 #ifdef CONFIG_SYSCTL 3088 /* 3089 * We really need to sanitize the damn ipv4 init order, then all 3090 * this nonsense will go away. 3091 */ 3092 void __init ip_static_sysctl_init(void) 3093 { 3094 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table); 3095 } 3096 #endif 3097
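
/*
 * A minimal sketch, assuming a caller that already holds a valid struct net,
 * of how the output-route resolver above is typically driven: build a flowi4
 * key, call ip_route_output_key_hash(), and check for an ERR_PTR() result.
 * The helper name and the UDP/daddr/saddr choices are illustrative
 * assumptions only and are not part of this file.
 */
#if 0
static struct rtable *example_output_lookup(struct net *net,
					    __be32 daddr, __be32 saddr)
{
	struct flowi4 fl4;
	struct rtable *rt;

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = daddr;		/* destination to reach */
	fl4.saddr = saddr;		/* 0 lets the resolver pick a source */
	fl4.flowi4_tos = RT_TOS(0);
	fl4.flowi4_proto = IPPROTO_UDP;

	rt = ip_route_output_key_hash(net, &fl4, NULL);
	if (IS_ERR(rt))
		return rt;		/* e.g. ERR_PTR(-ENETUNREACH) */

	/* ... transmit via rt->dst ... */
	ip_rt_put(rt);			/* drop the dst reference */
	return NULL;
}
#endif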