/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *		Miquel van Smoorenburg	:	BSD API fixes.
 *		Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *		Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *		Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *		Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *		Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *		Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *		Robert Olsson	:	Added rt_cache statistics
 *		Arnaldo C. Melo	:	Convert proc stuff to seq_file
 *		Eric Dumazet	:	hashed spinlocks and rt_check_expire() fixes.
 *		Ilia Sotnikov	:	Ignore TOS on PMTUD and Redirect
 *		Ilia Sotnikov	:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;

static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		 ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);
static void		 ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family			= AF_INET,
	.check			= ipv4_dst_check,
	.default_advmss		= ipv4_default_advmss,
	.mtu			= ipv4_mtu,
	.cow_metrics		= ipv4_cow_metrics,
	.destroy		= ipv4_dst_destroy,
	.negative_advice	= ipv4_negative_advice,
	.link_failure		= ipv4_link_failure,
	.update_pmtu		= ip_rt_update_pmtu,
	.redirect		= ip_do_redirect,
	.local_out		= __ip_local_out,
	.neigh_lookup		= ipv4_neigh_lookup,
	.confirm_neigh		= ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos)
		return NULL;
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start	= rt_cache_seq_start,
	.next	= rt_cache_seq_next,
	.stop	= rt_cache_seq_stop,
	.show	= rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   0, /* st->in_hit */
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   0, /* st->out_hit */
		   st->out_slow_tot,
		   st->out_slow_mc,

		   0, /* st->gc_total */
		   0, /* st->gc_ignored */
		   0, /* st->gc_goal_miss */
		   0, /* st->gc_dst_overflow */
		   0, /* st->in_hlist_search */
		   0  /* st->out_hlist_search */
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start	= rt_cpu_seq_start,
	.next	= rt_cpu_seq_next,
	.stop	= rt_cpu_seq_stop,
	.show	= rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_create("rt_cache", 0444, net->proc_net,
			  &rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", 0444,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create_single("rt_acct", 0, net->proc_net,
			rt_acct_proc_show);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =	{
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;
	struct neighbour *n;

	rt = (const struct rtable *) dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *) &rt->rt_gateway;
	else if (skb)
		pkey = &ip_hdr(skb)->daddr;

	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(&arp_tbl, pkey, dev);
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;

	rt = (const struct rtable *)dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *)&rt->rt_gateway;
	else if (!daddr ||
		 (rt->rt_flags &
		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
		return;

	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
	u32 old = READ_ONCE(*p_tstamp);
	u32 now = (u32)jiffies;
	u32 new, delta = 0;

	if (old != now && cmpxchg(p_tstamp, old, now) == old)
		delta = prandom_u32_max(now - old);

	/* Do not use atomic_add_return() as it makes UBSAN unhappy */
	do {
		old = (u32)atomic_read(p_id);
		new = old + delta + segs;
	} while (atomic_cmpxchg(p_id, old, new) != old);

	return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);

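/* Worked example of the perturbation above (illustrative only, not part of
 * the original source): suppose a bucket was last touched 100 jiffies ago
 * and a 3-segment GSO burst asks for IDs.  "delta" becomes a random value
 * in [0, 100), the burst gets IDs old+delta .. old+delta+2, and the first
 * of them is returned.  A rarely used bucket therefore advances by a random
 * amount rather than by exactly the number of packets sent, which is what
 * keeps the packet count hidden from an outside observer.
 */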

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
	u32 hash, id;

	/* Note the following code is not safe, but this is okay. */
	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
		get_random_bytes(&net->ipv4.ip_id_key,
				 sizeof(net->ipv4.ip_id_key));

	hash = siphash_3u32((__force u32)iph->daddr,
			    (__force u32)iph->saddr,
			    iph->protocol,
			    &net->ipv4.ip_id_key);
	id = ip_idents_reserve(hash, segs);
	iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
			     const struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		mark = sk->sk_mark;
		tos = RT_CONN_FLAGS(sk);
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	}
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0,
			   sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
			       const struct sock *sk)
{
	const struct net *net = dev_net(skb->dev);
	const struct iphdr *iph = ip_hdr(skb);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
	rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (skb)
		build_skb_flow_key(fl4, skb, sk);
	else
		build_sk_flow_key(fl4, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
	struct rtable *rt;

	rt = rcu_dereference(fnhe->fnhe_rth_input);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
	rt = rcu_dereference(fnhe->fnhe_rth_output);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
	struct fib_nh_exception *fnhe, *oldest;

	oldest = rcu_dereference(hash->chain);
	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
			oldest = fnhe;
	}
	fnhe_flush_routes(oldest);
	return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
	static u32 fnhe_hashrnd __read_mostly;
	u32 hval;

	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
	return hash_32(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
	rt->rt_pmtu = fnhe->fnhe_pmtu;
	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
	rt->dst.expires = fnhe->fnhe_expires;

	if (fnhe->fnhe_gw) {
		rt->rt_flags |= RTCF_REDIRECTED;
		rt->rt_gateway = fnhe->fnhe_gw;
		rt->rt_uses_gateway = 1;
	}
}

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
				  u32 pmtu, bool lock, unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	struct rtable *rt;
	u32 genid, hval;
	unsigned int i;
	int depth;

	genid = fnhe_genid(dev_net(nh->fib_nh_dev));
	hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference(nh->nh_exceptions);
	if (!hash) {
		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		rcu_assign_pointer(nh->nh_exceptions, hash);
	}

	hash += hval;

	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		if (fnhe->fnhe_genid != genid)
			fnhe->fnhe_genid = genid;
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_mtu_locked = lock;
		}
		fnhe->fnhe_expires = max(1UL, expires);
		/* Update all cached dsts too */
		rt = rcu_dereference(fnhe->fnhe_rth_input);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
		rt = rcu_dereference(fnhe->fnhe_rth_output);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
	} else {
		if (depth > FNHE_RECLAIM_DEPTH)
			fnhe = fnhe_oldest(hash);
		else {
			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
			if (!fnhe)
				goto out_unlock;

			fnhe->fnhe_next = hash->chain;
			rcu_assign_pointer(hash->chain, fnhe);
		}
		fnhe->fnhe_genid = genid;
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_mtu_locked = lock;
		fnhe->fnhe_expires = max(1UL, expires);

		/* Exception created; mark the cached routes for the nexthop
		 * stale, so anyone caching it rechecks if this exception
		 * applies to them.
		 */
		rt = rcu_dereference(nh->nh_rth_input);
		if (rt)
			rt->dst.obsolete = DST_OBSOLETE_KILL;

		for_each_possible_cpu(i) {
			struct rtable __rcu **prt;
			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
			rt = rcu_dereference(*prt);
			if (rt)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
		}
	}

	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	if (rt->rt_gateway != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
	if (!n)
		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
	if (!IS_ERR(n)) {
		if (!(n->nud_state & NUD_VALID)) {
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res, 0) == 0) {
				struct fib_nh_common *nhc = FIB_RES_NHC(res);
				struct fib_nh *nh;

				nh = container_of(nhc, struct fib_nh, nh_common);
				update_or_create_fnhe(nh, fl4->daddr, new_gw,
						0, false,
						jiffies + ip_rt_gc_timeout);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     "  Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct rtable *rt;
	struct flowi4 fl4;
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct net *net = dev_net(skb->dev);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	rt = (struct rtable *) dst;

	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
	__ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->dst.expires) {
			ip_rt_put(rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;
	int vif;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
		peer->rate_tokens = 0;
		peer->n_redirects = 0;
	}

	/* Too many ignored redirects; do not send anything.
	 * Set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->n_redirects >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
		++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}

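/* Illustrative timing sketch (not part of the original source), assuming the
 * defaults defined above (ip_rt_redirect_load = HZ/50, ip_rt_redirect_number
 * = 9): the first redirect to a peer goes out immediately, the next no sooner
 * than rate_last + (HZ/50) << 1, the one after that (HZ/50) << 2 later, and
 * so on.  Once nine redirects have been ignored (n_redirects >= 9) nothing
 * more is sent until the peer triggers no redirects for ip_rt_redirect_silence
 * ((HZ/50) << 10 jiffies), at which point the counters reset and the cycle
 * starts over.
 */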

static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	if (netif_is_l3_master(skb->dev)) {
		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
		if (!dev)
			goto out;
	}

	in_dev = __in_dev_get_rcu(dev);

	/* IP on this device is disabled. */
	if (!in_dev)
		goto out;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
			       l3mdev_master_ifindex(skb->dev), 1);

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
		inet_putpeer(peer);
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	u32 old_mtu = ipv4_mtu(dst);
	struct fib_result res;
	bool lock = false;

	if (ip_mtu_locked(dst))
		return;

	if (old_mtu < mtu)
		return;

	if (mtu < ip_rt_min_pmtu) {
		lock = true;
		mtu = min(old_mtu, ip_rt_min_pmtu);
	}

	if (rt->rt_pmtu == mtu && !lock &&
	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
		return;

	rcu_read_lock();
	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
		struct fib_nh_common *nhc = FIB_RES_NHC(res);
		struct fib_nh *nh;

		nh = container_of(nhc, struct fib_nh, nh_common);
		update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu)
{
	struct rtable *rt = (struct rtable *) dst;
	struct flowi4 fl4;

	ip_rt_build_flow_key(&fl4, sk, skb);
	__ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u8 protocol)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	u32 mark = IP4_REPLY_MARK(net, skb->mark);

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

	if (!fl4.flowi4_mark)
		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}

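/* Worked example of the clamping in __ip_rt_update_pmtu() above (purely
 * illustrative, assuming the default ip_rt_min_pmtu of 552): an ICMP
 * FRAG_NEEDED advertising an MTU of 300 is below the floor, so the nexthop
 * exception is created with mtu = min(old_mtu, 552) and marked locked, and
 * it expires ip_rt_mtu_expires (10 minutes by default) later.  An advertised
 * MTU of 1400 against a 1500-byte route is simply cached as an unlocked
 * 1400-byte exception for the same period.
 */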

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct dst_entry *odst = NULL;
	bool new = false;
	struct net *net = sock_net(sk);

	bh_lock_sock(sk);

	if (!ip_sk_accept_pmtu(sk))
		goto out;

	odst = sk_dst_get(sk);

	if (sock_owned_by_user(sk) || !odst) {
		__ipv4_sk_update_pmtu(skb, sk, mtu);
		goto out;
	}

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

	rt = (struct rtable *)odst;
	if (odst->obsolete && !odst->ops->check(odst, 0)) {
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	__ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);

	if (!dst_check(&rt->dst, 0)) {
		if (new)
			dst_release(&rt->dst);

		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	if (new)
		sk_dst_set(sk, &rt->dst);

out:
	bh_unlock_sock(sk);
	dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
		   int oif, u8 protocol)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct net *net = sock_net(sk);

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	/* All IPV4 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 *
	 * When a PMTU/redirect information update invalidates a route,
	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
	 * DST_OBSOLETE_DEAD.
	 */
	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
		return NULL;
	return dst;
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be not aligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct iphdr *iph = ip_hdr(skb);
		struct flowi4 fl4 = {
			.daddr = iph->daddr,
			.saddr = iph->saddr,
			.flowi4_tos = RT_TOS(iph->tos),
			.flowi4_oif = rt->dst.dev->ifindex,
			.flowi4_iif = skb->dev->ifindex,
			.flowi4_mark = skb->mark,
		};

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
		else
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
				    ip_rt_min_advmss);

	return min(advmss, IPV4_MAX_PMTU - header_size);
}

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = rt->rt_pmtu;

	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu)
		return mtu;

	mtu = READ_ONCE(dst->dev->mtu);

	if (unlikely(ip_mtu_locked(dst))) {
		if (rt->rt_uses_gateway && mtu > 576)
			mtu = 576;
	}

	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference_protected(nh->nh_exceptions,
					 lockdep_is_held(&fnhe_lock));
	hash += hval;

	fnhe_p = &hash->chain;
	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
	while (fnhe) {
		if (fnhe->fnhe_daddr == daddr) {
			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
			/* set fnhe_daddr to 0 to ensure it won't bind with
			 * new dsts in rt_bind_exception().
			 */
			fnhe->fnhe_daddr = 0;
			fnhe_flush_routes(fnhe);
			kfree_rcu(fnhe, rcu);
			break;
		}
		fnhe_p = &fnhe->fnhe_next;
		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
						 lockdep_is_held(&fnhe_lock));
	}

	spin_unlock_bh(&fnhe_lock);
}

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
	struct fib_nh_exception *fnhe;
	u32 hval;

	if (!hash)
		return NULL;

	hval = fnhe_hashfun(daddr);

	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr) {
			if (fnhe->fnhe_expires &&
			    time_after(jiffies, fnhe->fnhe_expires)) {
				ip_del_fnhe(nh, daddr);
				break;
			}
			return fnhe;
		}
	}
	return NULL;
}

/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 */

u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
	struct fib_nh_common *nhc = res->nhc;
	struct net_device *dev = nhc->nhc_dev;
	struct fib_info *fi = res->fi;
	u32 mtu = 0;

	if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
		mtu = fi->fib_mtu;

	if (likely(!mtu)) {
		struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common);
		struct fib_nh_exception *fnhe;

		fnhe = find_exception(nh, daddr);
		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
			mtu = fnhe->fnhe_pmtu;
	}

	if (likely(!mtu))
		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);

	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr, const bool do_cache)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	if (daddr == fnhe->fnhe_daddr) {
		struct rtable __rcu **porig;
		struct rtable *orig;
		int genid = fnhe_genid(dev_net(rt->dst.dev));

		if (rt_is_input_route(rt))
			porig = &fnhe->fnhe_rth_input;
		else
			porig = &fnhe->fnhe_rth_output;
		orig = rcu_dereference(*porig);

		if (fnhe->fnhe_genid != genid) {
			fnhe->fnhe_genid = genid;
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
			fnhe->fnhe_mtu_locked = false;
			fnhe_flush_routes(fnhe);
			orig = NULL;
		}
		fill_route_from_fnhe(rt, fnhe);
		if (!rt->rt_gateway)
			rt->rt_gateway = daddr;

		if (do_cache) {
			dst_hold(&rt->dst);
			rcu_assign_pointer(*porig, rt);
			if (orig) {
				dst_dev_put(&orig->dst);
				dst_release(&orig->dst);
			}
			ret = true;
		}

		fnhe->fnhe_stamp = jiffies;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}

static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nh->nh_rth_input;
	} else {
		p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
	}
	orig = *p;

	/* hold dst before doing cmpxchg() to avoid race condition
	 * on this dst
	 */
	dst_hold(&rt->dst);
	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		if (orig) {
			dst_dev_put(&orig->dst);
			dst_release(&orig->dst);
		}
	} else {
		dst_release(&rt->dst);
		ret = false;
	}

	return ret;
}

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

void rt_add_uncached_list(struct rtable *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

	rt->rt_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt_del_uncached_list(struct rtable *rt)
{
	if (!list_empty(&rt->rt_uncached)) {
		struct uncached_list *ul = rt->rt_uncached_list;

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt_uncached);
		spin_unlock_bh(&ul->lock);
	}
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;

	ip_dst_metrics_put(dst);
	rt_del_uncached_list(rt);
}

void rt_flush_dev(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct rtable *rt;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt_uncached) {
			if (rt->dst.dev != dev)
				continue;
			rt->dst.dev = net->loopback_dev;
			dev_hold(rt->dst.dev);
			dev_put(dev);
		}
		spin_unlock_bh(&ul->lock);
	}
}

static bool rt_cache_valid(const struct rtable *rt)
{
	return rt &&
	       rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	       !rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag,
			   const bool do_cache)
{
	bool cached = false;

	if (fi) {
		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
		struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common);

		if (nh->fib_nh_gw4 && nh->fib_nh_scope == RT_SCOPE_LINK) {
			rt->rt_gateway = nh->fib_nh_gw4;
			rt->rt_uses_gateway = 1;
		}
		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);

#ifdef CONFIG_IP_ROUTE_CLASSID
		rt->dst.tclassid = nh->nh_tclassid;
#endif
		rt->dst.lwtstate = lwtstate_get(nh->fib_nh_lws);
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
		else if (do_cache)
			cached = rt_cache_route(nh, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in nexthop exception or
			 * FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache we really need to set it.
			 */
			if (!rt->rt_gateway)
				rt->rt_gateway = daddr;
			rt_add_uncached_list(rt);
		}
	} else
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}

struct rtable *rt_dst_alloc(struct net_device *dev,
			    unsigned int flags, u16 type,
			    bool nopolicy, bool noxfrm, bool will_cache)
{
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
		       (will_cache ? 0 : DST_HOST) |
		       (nopolicy ? DST_NOPOLICY : 0) |
		       (noxfrm ? DST_NOXFRM : 0));

	if (rt) {
		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
		rt->rt_flags = flags;
		rt->rt_type = type;
		rt->rt_is_input = 0;
		rt->rt_iif = 0;
		rt->rt_pmtu = 0;
		rt->rt_mtu_locked = 0;
		rt->rt_gateway = 0;
		rt->rt_uses_gateway = 0;
		INIT_LIST_HEAD(&rt->rt_uncached);

		rt->dst.output = ip_output;
		if (flags & RTCF_LOCAL)
			rt->dst.input = ip_local_deliver;
	}

	return rt;
}
EXPORT_SYMBOL(rt_dst_alloc);

/* called in rcu_read_lock() section */
int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev,
			  struct in_device *in_dev, u32 *itag)
{
	int err;

	/* Primary sanity checks. */
	if (!in_dev)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		return -EINVAL;

	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
		return -EINVAL;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr) &&
		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
			return -EINVAL;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, itag);
		if (err < 0)
			return err;
	}
	return 0;
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	unsigned int flags = RTCF_MULTICAST;
	struct rtable *rth;
	u32 itag = 0;
	int err;

	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
	if (err)
		return err;

	if (our)
		flags |= RTCF_LOCAL;

	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
	if (!rth)
		return -ENOBUFS;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;
	rth->rt_is_input = 1;

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);
	return 0;
}


static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation, if source is martian,
		 *	the only hint is MAC header.
1689 */ 1690 pr_warn("martian source %pI4 from %pI4, on dev %s\n", 1691 &daddr, &saddr, dev->name); 1692 if (dev->hard_header_len && skb_mac_header_was_set(skb)) { 1693 print_hex_dump(KERN_WARNING, "ll header: ", 1694 DUMP_PREFIX_OFFSET, 16, 1, 1695 skb_mac_header(skb), 1696 dev->hard_header_len, false); 1697 } 1698 } 1699 #endif 1700 } 1701 1702 /* called in rcu_read_lock() section */ 1703 static int __mkroute_input(struct sk_buff *skb, 1704 const struct fib_result *res, 1705 struct in_device *in_dev, 1706 __be32 daddr, __be32 saddr, u32 tos) 1707 { 1708 struct fib_nh_common *nhc = FIB_RES_NHC(*res); 1709 struct net_device *dev = nhc->nhc_dev; 1710 struct fib_nh_exception *fnhe; 1711 struct rtable *rth; 1712 struct fib_nh *nh; 1713 int err; 1714 struct in_device *out_dev; 1715 bool do_cache; 1716 u32 itag = 0; 1717 1718 /* get a working reference to the output device */ 1719 out_dev = __in_dev_get_rcu(dev); 1720 if (!out_dev) { 1721 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n"); 1722 return -EINVAL; 1723 } 1724 1725 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res), 1726 in_dev->dev, in_dev, &itag); 1727 if (err < 0) { 1728 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, 1729 saddr); 1730 1731 goto cleanup; 1732 } 1733 1734 do_cache = res->fi && !itag; 1735 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) && 1736 skb->protocol == htons(ETH_P_IP)) { 1737 __be32 gw = nhc->nhc_family == AF_INET ? nhc->nhc_gw.ipv4 : 0; 1738 1739 if (IN_DEV_SHARED_MEDIA(out_dev) || 1740 inet_addr_onlink(out_dev, saddr, gw)) 1741 IPCB(skb)->flags |= IPSKB_DOREDIRECT; 1742 } 1743 1744 if (skb->protocol != htons(ETH_P_IP)) { 1745 /* Not IP (i.e. ARP). Do not create route, if it is 1746 * invalid for proxy arp. DNAT routes are always valid. 1747 * 1748 * Proxy arp feature have been extended to allow, ARP 1749 * replies back to the same interface, to support 1750 * Private VLAN switch technologies. See arp.c. 1751 */ 1752 if (out_dev == in_dev && 1753 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) { 1754 err = -EINVAL; 1755 goto cleanup; 1756 } 1757 } 1758 1759 nh = container_of(nhc, struct fib_nh, nh_common); 1760 fnhe = find_exception(nh, daddr); 1761 if (do_cache) { 1762 if (fnhe) 1763 rth = rcu_dereference(fnhe->fnhe_rth_input); 1764 else 1765 rth = rcu_dereference(nh->nh_rth_input); 1766 if (rt_cache_valid(rth)) { 1767 skb_dst_set_noref(skb, &rth->dst); 1768 goto out; 1769 } 1770 } 1771 1772 rth = rt_dst_alloc(out_dev->dev, 0, res->type, 1773 IN_DEV_CONF_GET(in_dev, NOPOLICY), 1774 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache); 1775 if (!rth) { 1776 err = -ENOBUFS; 1777 goto cleanup; 1778 } 1779 1780 rth->rt_is_input = 1; 1781 RT_CACHE_STAT_INC(in_slow_tot); 1782 1783 rth->dst.input = ip_forward; 1784 1785 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag, 1786 do_cache); 1787 lwtunnel_set_redirect(&rth->dst); 1788 skb_dst_set(skb, &rth->dst); 1789 out: 1790 err = 0; 1791 cleanup: 1792 return err; 1793 } 1794 1795 #ifdef CONFIG_IP_ROUTE_MULTIPATH 1796 /* To make ICMP packets follow the right flow, the multipath hash is 1797 * calculated from the inner IP addresses. 
 */
static void ip_multipath_l3_keys(const struct sk_buff *skb,
				 struct flow_keys *hash_keys)
{
	const struct iphdr *outer_iph = ip_hdr(skb);
	const struct iphdr *key_iph = outer_iph;
	const struct iphdr *inner_iph;
	const struct icmphdr *icmph;
	struct iphdr _inner_iph;
	struct icmphdr _icmph;

	if (likely(outer_iph->protocol != IPPROTO_ICMP))
		goto out;

	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
		goto out;

	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
				   &_icmph);
	if (!icmph)
		goto out;

	if (icmph->type != ICMP_DEST_UNREACH &&
	    icmph->type != ICMP_REDIRECT &&
	    icmph->type != ICMP_TIME_EXCEEDED &&
	    icmph->type != ICMP_PARAMETERPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       outer_iph->ihl * 4 + sizeof(_icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
out:
	hash_keys->addrs.v4addrs.src = key_iph->saddr;
	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
}

/* if skb is set it will be used and fl4 can be NULL */
int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
	struct flow_keys hash_keys;
	u32 mhash;

	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
		if (skb) {
			ip_multipath_l3_keys(skb, &hash_keys);
		} else {
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
		}
		break;
	case 1:
		/* skb is currently provided only when forwarding */
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}

			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
			hash_keys.ports.src = fl4->fl4_sport;
			hash_keys.ports.dst = fl4->fl4_dport;
			hash_keys.basic.ip_proto = fl4->flowi4_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	if (multipath_hash)
		mhash = jhash_2words(mhash, multipath_hash, 0);

	return mhash >> 1;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH */

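#ifdef CONFIG_IP_ROUTE_MULTIPATH
/* Illustrative note (not part of the original source): with
 * fib_multipath_hash_policy 0 only the source/destination addresses feed the
 * hash, so every flow between the same pair of hosts maps to the same
 * nexthop; with policy 1 the L4 ports and protocol are mixed in, so distinct
 * TCP/UDP flows between the same hosts can spread across nexthops.  Under
 * policy 0, ICMP errors still follow the flow that triggered them because
 * ip_multipath_l3_keys() hashes the addresses quoted in the inner header.
 */
#endif /* CONFIG_IP_ROUTE_MULTIPATH */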

static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos,
			    struct flow_keys *hkeys)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1) {
		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);

		fib_select_multipath(res, h);
	}
#endif

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}

/*
 *	NOTE. We drop all the packets that have local source
 *	addresses, because every properly looped back packet
 *	must have the correct destination already attached by the output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *	called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev,
			       struct fib_result *res)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flow_keys *flkeys = NULL, _flkeys;
	struct net    *net = dev_net(dev);
	struct ip_tunnel_info *tun_info;
	int		err = -EINVAL;
	unsigned int	flags = 0;
	u32		itag = 0;
	struct rtable	*rth;
	struct flowi4	fl4;
	bool do_cache;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	   by fib_lookup.
	 */

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
	else
		fl4.flowi4_tun_key.tun_id = 0;
	skb_dst_drop(skb);

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	res->fi = NULL;
	res->table = NULL;
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I am not even sure whether to fix it or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
	 * and calls it only once if daddr and/or saddr are loopback addresses
	 */
	if (ipv4_is_loopback(daddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_destination;
	} else if (ipv4_is_loopback(saddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route the packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.flowi4_flags = 0;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	fl4.flowi4_uid = sock_net_uid(net, NULL);

	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
		flkeys = &_flkeys;
	} else {
		fl4.flowi4_proto = 0;
		fl4.fl4_sport = 0;
		fl4.fl4_dport = 0;
	}

	err = fib_lookup(net, &fl4, res, 0);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			err = -EHOSTUNREACH;
		goto no_route;
	}

	if (res->type == RTN_BROADCAST) {
		if (IN_DEV_BFORWARD(in_dev))
			goto make_route;
		goto brd_input;
	}

	if (res->type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  0, dev, in_dev, &itag);
		if (err < 0)
			goto martian_source;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev)) {
		err = -EHOSTUNREACH;
		goto no_route;
	}
	if (res->type != RTN_UNICAST)
		goto martian_destination;

make_route:
	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source;
	}
	flags |= RTCF_BROADCAST;
	res->type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	do_cache = false;
	if (res->fi) {
		if (!itag) {
			struct fib_nh_common *nhc = FIB_RES_NHC(*res);
			struct fib_nh *nh;

			nh = container_of(nhc, struct fib_nh, nh_common);
			rth = rcu_dereference(nh->nh_rth_input);
			if (rt_cache_valid(rth)) {
				skb_dst_set_noref(skb, &rth->dst);
				err = 0;
				goto out;
			}
			do_cache = true;
		}
	}

: net->loopback_dev, 2074 flags | RTCF_LOCAL, res->type, 2075 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache); 2076 if (!rth) 2077 goto e_nobufs; 2078 2079 rth->dst.output= ip_rt_bug; 2080 #ifdef CONFIG_IP_ROUTE_CLASSID 2081 rth->dst.tclassid = itag; 2082 #endif 2083 rth->rt_is_input = 1; 2084 2085 RT_CACHE_STAT_INC(in_slow_tot); 2086 if (res->type == RTN_UNREACHABLE) { 2087 rth->dst.input= ip_error; 2088 rth->dst.error= -err; 2089 rth->rt_flags &= ~RTCF_LOCAL; 2090 } 2091 2092 if (do_cache) { 2093 struct fib_nh_common *nhc = FIB_RES_NHC(*res); 2094 struct fib_nh *nh; 2095 2096 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate); 2097 if (lwtunnel_input_redirect(rth->dst.lwtstate)) { 2098 WARN_ON(rth->dst.input == lwtunnel_input); 2099 rth->dst.lwtstate->orig_input = rth->dst.input; 2100 rth->dst.input = lwtunnel_input; 2101 } 2102 2103 nh = container_of(nhc, struct fib_nh, nh_common); 2104 if (unlikely(!rt_cache_route(nh, rth))) 2105 rt_add_uncached_list(rth); 2106 } 2107 skb_dst_set(skb, &rth->dst); 2108 err = 0; 2109 goto out; 2110 2111 no_route: 2112 RT_CACHE_STAT_INC(in_no_route); 2113 res->type = RTN_UNREACHABLE; 2114 res->fi = NULL; 2115 res->table = NULL; 2116 goto local_input; 2117 2118 /* 2119 * Do not cache martian addresses: they should be logged (RFC1812) 2120 */ 2121 martian_destination: 2122 RT_CACHE_STAT_INC(in_martian_dst); 2123 #ifdef CONFIG_IP_ROUTE_VERBOSE 2124 if (IN_DEV_LOG_MARTIANS(in_dev)) 2125 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n", 2126 &daddr, &saddr, dev->name); 2127 #endif 2128 2129 e_inval: 2130 err = -EINVAL; 2131 goto out; 2132 2133 e_nobufs: 2134 err = -ENOBUFS; 2135 goto out; 2136 2137 martian_source: 2138 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); 2139 goto out; 2140 } 2141 2142 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2143 u8 tos, struct net_device *dev) 2144 { 2145 struct fib_result res; 2146 int err; 2147 2148 tos &= IPTOS_RT_MASK; 2149 rcu_read_lock(); 2150 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res); 2151 rcu_read_unlock(); 2152 2153 return err; 2154 } 2155 EXPORT_SYMBOL(ip_route_input_noref); 2156 2157 /* called with rcu_read_lock held */ 2158 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2159 u8 tos, struct net_device *dev, struct fib_result *res) 2160 { 2161 /* Multicast recognition logic is moved from route cache to here. 2162 The problem was that too many Ethernet cards have broken/missing 2163 hardware multicast filters :-( As result the host on multicasting 2164 network acquires a lot of useless route cache entries, sort of 2165 SDR messages from all the world. Now we try to get rid of them. 2166 Really, provided software IP multicast filter is organized 2167 reasonably (at least, hashed), it does not result in a slowdown 2168 comparing with route cache reject entries. 2169 Note, that multicast routers are not affected, because 2170 route cache entry is created eventually. 
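
	   In short, the dispatch below is:

	     - multicast daddr, and either we are a member of the group
	       (ip_check_mc_rcu(), retried against the L3 master's
	       in_device when the ingress device is an l3mdev slave) or
	       multicast forwarding is enabled for a non link-local group
	         -> ip_route_input_mc()
	     - everything else
	         -> ip_route_input_slow()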
2171 */ 2172 if (ipv4_is_multicast(daddr)) { 2173 struct in_device *in_dev = __in_dev_get_rcu(dev); 2174 int our = 0; 2175 int err = -EINVAL; 2176 2177 if (!in_dev) 2178 return err; 2179 our = ip_check_mc_rcu(in_dev, daddr, saddr, 2180 ip_hdr(skb)->protocol); 2181 2182 /* check l3 master if no match yet */ 2183 if (!our && netif_is_l3_slave(dev)) { 2184 struct in_device *l3_in_dev; 2185 2186 l3_in_dev = __in_dev_get_rcu(skb->dev); 2187 if (l3_in_dev) 2188 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr, 2189 ip_hdr(skb)->protocol); 2190 } 2191 2192 if (our 2193 #ifdef CONFIG_IP_MROUTE 2194 || 2195 (!ipv4_is_local_multicast(daddr) && 2196 IN_DEV_MFORWARD(in_dev)) 2197 #endif 2198 ) { 2199 err = ip_route_input_mc(skb, daddr, saddr, 2200 tos, dev, our); 2201 } 2202 return err; 2203 } 2204 2205 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res); 2206 } 2207 2208 /* called with rcu_read_lock() */ 2209 static struct rtable *__mkroute_output(const struct fib_result *res, 2210 const struct flowi4 *fl4, int orig_oif, 2211 struct net_device *dev_out, 2212 unsigned int flags) 2213 { 2214 struct fib_info *fi = res->fi; 2215 struct fib_nh_exception *fnhe; 2216 struct in_device *in_dev; 2217 u16 type = res->type; 2218 struct rtable *rth; 2219 bool do_cache; 2220 2221 in_dev = __in_dev_get_rcu(dev_out); 2222 if (!in_dev) 2223 return ERR_PTR(-EINVAL); 2224 2225 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) 2226 if (ipv4_is_loopback(fl4->saddr) && 2227 !(dev_out->flags & IFF_LOOPBACK) && 2228 !netif_is_l3_master(dev_out)) 2229 return ERR_PTR(-EINVAL); 2230 2231 if (ipv4_is_lbcast(fl4->daddr)) 2232 type = RTN_BROADCAST; 2233 else if (ipv4_is_multicast(fl4->daddr)) 2234 type = RTN_MULTICAST; 2235 else if (ipv4_is_zeronet(fl4->daddr)) 2236 return ERR_PTR(-EINVAL); 2237 2238 if (dev_out->flags & IFF_LOOPBACK) 2239 flags |= RTCF_LOCAL; 2240 2241 do_cache = true; 2242 if (type == RTN_BROADCAST) { 2243 flags |= RTCF_BROADCAST | RTCF_LOCAL; 2244 fi = NULL; 2245 } else if (type == RTN_MULTICAST) { 2246 flags |= RTCF_MULTICAST | RTCF_LOCAL; 2247 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr, 2248 fl4->flowi4_proto)) 2249 flags &= ~RTCF_LOCAL; 2250 else 2251 do_cache = false; 2252 /* If multicast route do not exist use 2253 * default one, but do not gateway in this case. 2254 * Yes, it is hack. 2255 */ 2256 if (fi && res->prefixlen < 4) 2257 fi = NULL; 2258 } else if ((type == RTN_LOCAL) && (orig_oif != 0) && 2259 (orig_oif != dev_out->ifindex)) { 2260 /* For local routes that require a particular output interface 2261 * we do not want to cache the result. Caching the result 2262 * causes incorrect behaviour when there are multiple source 2263 * addresses on the interface, the end result being that if the 2264 * intended recipient is waiting on that interface for the 2265 * packet he won't receive it because it will be delivered on 2266 * the loopback interface and the IP_PKTINFO ipi_ifindex will 2267 * be set to the loopback interface as well. 
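		 *
		 * Hypothetical example: with 192.0.2.1 and 192.0.2.2 both
		 * on eth0, a sender that targets 192.0.2.2 through eth0
		 * explicitly (SO_BINDTODEVICE or IP_PKTINFO) expects the
		 * receiver's IP_PKTINFO ipi_ifindex to name eth0; a cached
		 * local route would instead report the loopback device.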
2268 */ 2269 do_cache = false; 2270 } 2271 2272 fnhe = NULL; 2273 do_cache &= fi != NULL; 2274 if (fi) { 2275 struct fib_nh_common *nhc = FIB_RES_NHC(*res); 2276 struct fib_nh *nh = container_of(nhc, struct fib_nh, nh_common); 2277 struct rtable __rcu **prth; 2278 2279 fnhe = find_exception(nh, fl4->daddr); 2280 if (!do_cache) 2281 goto add; 2282 if (fnhe) { 2283 prth = &fnhe->fnhe_rth_output; 2284 } else { 2285 if (unlikely(fl4->flowi4_flags & 2286 FLOWI_FLAG_KNOWN_NH && 2287 !(nhc->nhc_has_gw && 2288 nhc->nhc_scope == RT_SCOPE_LINK))) { 2289 do_cache = false; 2290 goto add; 2291 } 2292 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output); 2293 } 2294 rth = rcu_dereference(*prth); 2295 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst)) 2296 return rth; 2297 } 2298 2299 add: 2300 rth = rt_dst_alloc(dev_out, flags, type, 2301 IN_DEV_CONF_GET(in_dev, NOPOLICY), 2302 IN_DEV_CONF_GET(in_dev, NOXFRM), 2303 do_cache); 2304 if (!rth) 2305 return ERR_PTR(-ENOBUFS); 2306 2307 rth->rt_iif = orig_oif; 2308 2309 RT_CACHE_STAT_INC(out_slow_tot); 2310 2311 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { 2312 if (flags & RTCF_LOCAL && 2313 !(dev_out->flags & IFF_LOOPBACK)) { 2314 rth->dst.output = ip_mc_output; 2315 RT_CACHE_STAT_INC(out_slow_mc); 2316 } 2317 #ifdef CONFIG_IP_MROUTE 2318 if (type == RTN_MULTICAST) { 2319 if (IN_DEV_MFORWARD(in_dev) && 2320 !ipv4_is_local_multicast(fl4->daddr)) { 2321 rth->dst.input = ip_mr_input; 2322 rth->dst.output = ip_mc_output; 2323 } 2324 } 2325 #endif 2326 } 2327 2328 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache); 2329 lwtunnel_set_redirect(&rth->dst); 2330 2331 return rth; 2332 } 2333 2334 /* 2335 * Major route resolver routine. 2336 */ 2337 2338 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, 2339 const struct sk_buff *skb) 2340 { 2341 __u8 tos = RT_FL_TOS(fl4); 2342 struct fib_result res = { 2343 .type = RTN_UNSPEC, 2344 .fi = NULL, 2345 .table = NULL, 2346 .tclassid = 0, 2347 }; 2348 struct rtable *rth; 2349 2350 fl4->flowi4_iif = LOOPBACK_IFINDEX; 2351 fl4->flowi4_tos = tos & IPTOS_RT_MASK; 2352 fl4->flowi4_scope = ((tos & RTO_ONLINK) ? 2353 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); 2354 2355 rcu_read_lock(); 2356 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb); 2357 rcu_read_unlock(); 2358 2359 return rth; 2360 } 2361 EXPORT_SYMBOL_GPL(ip_route_output_key_hash); 2362 2363 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, 2364 struct fib_result *res, 2365 const struct sk_buff *skb) 2366 { 2367 struct net_device *dev_out = NULL; 2368 int orig_oif = fl4->flowi4_oif; 2369 unsigned int flags = 0; 2370 struct rtable *rth; 2371 int err = -ENETUNREACH; 2372 2373 if (fl4->saddr) { 2374 rth = ERR_PTR(-EINVAL); 2375 if (ipv4_is_multicast(fl4->saddr) || 2376 ipv4_is_lbcast(fl4->saddr) || 2377 ipv4_is_zeronet(fl4->saddr)) 2378 goto out; 2379 2380 /* I removed check for oif == dev_out->oif here. 2381 It was wrong for two reasons: 2382 1. ip_dev_find(net, saddr) can return wrong iface, if saddr 2383 is assigned to multiple interfaces. 2384 2. Moreover, we are allowed to send packets with saddr 2385 of another iface. 
--ANK 2386 */ 2387 2388 if (fl4->flowi4_oif == 0 && 2389 (ipv4_is_multicast(fl4->daddr) || 2390 ipv4_is_lbcast(fl4->daddr))) { 2391 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2392 dev_out = __ip_dev_find(net, fl4->saddr, false); 2393 if (!dev_out) 2394 goto out; 2395 2396 /* Special hack: user can direct multicasts 2397 and limited broadcast via necessary interface 2398 without fiddling with IP_MULTICAST_IF or IP_PKTINFO. 2399 This hack is not just for fun, it allows 2400 vic,vat and friends to work. 2401 They bind socket to loopback, set ttl to zero 2402 and expect that it will work. 2403 From the viewpoint of routing cache they are broken, 2404 because we are not allowed to build multicast path 2405 with loopback source addr (look, routing cache 2406 cannot know, that ttl is zero, so that packet 2407 will not leave this host and route is valid). 2408 Luckily, this hack is good workaround. 2409 */ 2410 2411 fl4->flowi4_oif = dev_out->ifindex; 2412 goto make_route; 2413 } 2414 2415 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) { 2416 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2417 if (!__ip_dev_find(net, fl4->saddr, false)) 2418 goto out; 2419 } 2420 } 2421 2422 2423 if (fl4->flowi4_oif) { 2424 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif); 2425 rth = ERR_PTR(-ENODEV); 2426 if (!dev_out) 2427 goto out; 2428 2429 /* RACE: Check return value of inet_select_addr instead. */ 2430 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) { 2431 rth = ERR_PTR(-ENETUNREACH); 2432 goto out; 2433 } 2434 if (ipv4_is_local_multicast(fl4->daddr) || 2435 ipv4_is_lbcast(fl4->daddr) || 2436 fl4->flowi4_proto == IPPROTO_IGMP) { 2437 if (!fl4->saddr) 2438 fl4->saddr = inet_select_addr(dev_out, 0, 2439 RT_SCOPE_LINK); 2440 goto make_route; 2441 } 2442 if (!fl4->saddr) { 2443 if (ipv4_is_multicast(fl4->daddr)) 2444 fl4->saddr = inet_select_addr(dev_out, 0, 2445 fl4->flowi4_scope); 2446 else if (!fl4->daddr) 2447 fl4->saddr = inet_select_addr(dev_out, 0, 2448 RT_SCOPE_HOST); 2449 } 2450 } 2451 2452 if (!fl4->daddr) { 2453 fl4->daddr = fl4->saddr; 2454 if (!fl4->daddr) 2455 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); 2456 dev_out = net->loopback_dev; 2457 fl4->flowi4_oif = LOOPBACK_IFINDEX; 2458 res->type = RTN_LOCAL; 2459 flags |= RTCF_LOCAL; 2460 goto make_route; 2461 } 2462 2463 err = fib_lookup(net, fl4, res, 0); 2464 if (err) { 2465 res->fi = NULL; 2466 res->table = NULL; 2467 if (fl4->flowi4_oif && 2468 (ipv4_is_multicast(fl4->daddr) || 2469 !netif_index_is_l3_master(net, fl4->flowi4_oif))) { 2470 /* Apparently, routing tables are wrong. Assume, 2471 that the destination is on link. 2472 2473 WHY? DW. 2474 Because we are allowed to send to iface 2475 even if it has NO routes and NO assigned 2476 addresses. When oif is specified, routing 2477 tables are looked up with only one purpose: 2478 to catch if destination is gatewayed, rather than 2479 direct. Moreover, if MSG_DONTROUTE is set, 2480 we send packet, ignoring both routing tables 2481 and ifaddr state. --ANK 2482 2483 2484 We could make it even if oif is unknown, 2485 likely IPv6, but we do not. 
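
			   Concretely, the fallback below treats the
			   destination as directly reachable on the
			   requested device: res->type is forced to
			   RTN_UNICAST and, if the caller gave no source,
			   a link-scoped address on dev_out is chosen via
			   inet_select_addr(dev_out, 0, RT_SCOPE_LINK).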
2486 */ 2487 2488 if (fl4->saddr == 0) 2489 fl4->saddr = inet_select_addr(dev_out, 0, 2490 RT_SCOPE_LINK); 2491 res->type = RTN_UNICAST; 2492 goto make_route; 2493 } 2494 rth = ERR_PTR(err); 2495 goto out; 2496 } 2497 2498 if (res->type == RTN_LOCAL) { 2499 if (!fl4->saddr) { 2500 if (res->fi->fib_prefsrc) 2501 fl4->saddr = res->fi->fib_prefsrc; 2502 else 2503 fl4->saddr = fl4->daddr; 2504 } 2505 2506 /* L3 master device is the loopback for that domain */ 2507 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? : 2508 net->loopback_dev; 2509 2510 /* make sure orig_oif points to fib result device even 2511 * though packet rx/tx happens over loopback or l3mdev 2512 */ 2513 orig_oif = FIB_RES_OIF(*res); 2514 2515 fl4->flowi4_oif = dev_out->ifindex; 2516 flags |= RTCF_LOCAL; 2517 goto make_route; 2518 } 2519 2520 fib_select_path(net, res, fl4, skb); 2521 2522 dev_out = FIB_RES_DEV(*res); 2523 fl4->flowi4_oif = dev_out->ifindex; 2524 2525 2526 make_route: 2527 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags); 2528 2529 out: 2530 return rth; 2531 } 2532 2533 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) 2534 { 2535 return NULL; 2536 } 2537 2538 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst) 2539 { 2540 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); 2541 2542 return mtu ? : dst->dev->mtu; 2543 } 2544 2545 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, 2546 struct sk_buff *skb, u32 mtu) 2547 { 2548 } 2549 2550 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, 2551 struct sk_buff *skb) 2552 { 2553 } 2554 2555 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst, 2556 unsigned long old) 2557 { 2558 return NULL; 2559 } 2560 2561 static struct dst_ops ipv4_dst_blackhole_ops = { 2562 .family = AF_INET, 2563 .check = ipv4_blackhole_dst_check, 2564 .mtu = ipv4_blackhole_mtu, 2565 .default_advmss = ipv4_default_advmss, 2566 .update_pmtu = ipv4_rt_blackhole_update_pmtu, 2567 .redirect = ipv4_rt_blackhole_redirect, 2568 .cow_metrics = ipv4_rt_blackhole_cow_metrics, 2569 .neigh_lookup = ipv4_neigh_lookup, 2570 }; 2571 2572 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2573 { 2574 struct rtable *ort = (struct rtable *) dst_orig; 2575 struct rtable *rt; 2576 2577 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0); 2578 if (rt) { 2579 struct dst_entry *new = &rt->dst; 2580 2581 new->__use = 1; 2582 new->input = dst_discard; 2583 new->output = dst_discard_out; 2584 2585 new->dev = net->loopback_dev; 2586 if (new->dev) 2587 dev_hold(new->dev); 2588 2589 rt->rt_is_input = ort->rt_is_input; 2590 rt->rt_iif = ort->rt_iif; 2591 rt->rt_pmtu = ort->rt_pmtu; 2592 rt->rt_mtu_locked = ort->rt_mtu_locked; 2593 2594 rt->rt_genid = rt_genid_ipv4(net); 2595 rt->rt_flags = ort->rt_flags; 2596 rt->rt_type = ort->rt_type; 2597 rt->rt_gateway = ort->rt_gateway; 2598 rt->rt_uses_gateway = ort->rt_uses_gateway; 2599 2600 INIT_LIST_HEAD(&rt->rt_uncached); 2601 } 2602 2603 dst_release(dst_orig); 2604 2605 return rt ? 
&rt->dst : ERR_PTR(-ENOMEM); 2606 } 2607 2608 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, 2609 const struct sock *sk) 2610 { 2611 struct rtable *rt = __ip_route_output_key(net, flp4); 2612 2613 if (IS_ERR(rt)) 2614 return rt; 2615 2616 if (flp4->flowi4_proto) 2617 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst, 2618 flowi4_to_flowi(flp4), 2619 sk, 0); 2620 2621 return rt; 2622 } 2623 EXPORT_SYMBOL_GPL(ip_route_output_flow); 2624 2625 /* called with rcu_read_lock held */ 2626 static int rt_fill_info(struct net *net, __be32 dst, __be32 src, 2627 struct rtable *rt, u32 table_id, struct flowi4 *fl4, 2628 struct sk_buff *skb, u32 portid, u32 seq) 2629 { 2630 struct rtmsg *r; 2631 struct nlmsghdr *nlh; 2632 unsigned long expires = 0; 2633 u32 error; 2634 u32 metrics[RTAX_MAX]; 2635 2636 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0); 2637 if (!nlh) 2638 return -EMSGSIZE; 2639 2640 r = nlmsg_data(nlh); 2641 r->rtm_family = AF_INET; 2642 r->rtm_dst_len = 32; 2643 r->rtm_src_len = 0; 2644 r->rtm_tos = fl4->flowi4_tos; 2645 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT; 2646 if (nla_put_u32(skb, RTA_TABLE, table_id)) 2647 goto nla_put_failure; 2648 r->rtm_type = rt->rt_type; 2649 r->rtm_scope = RT_SCOPE_UNIVERSE; 2650 r->rtm_protocol = RTPROT_UNSPEC; 2651 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; 2652 if (rt->rt_flags & RTCF_NOTIFY) 2653 r->rtm_flags |= RTM_F_NOTIFY; 2654 if (IPCB(skb)->flags & IPSKB_DOREDIRECT) 2655 r->rtm_flags |= RTCF_DOREDIRECT; 2656 2657 if (nla_put_in_addr(skb, RTA_DST, dst)) 2658 goto nla_put_failure; 2659 if (src) { 2660 r->rtm_src_len = 32; 2661 if (nla_put_in_addr(skb, RTA_SRC, src)) 2662 goto nla_put_failure; 2663 } 2664 if (rt->dst.dev && 2665 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) 2666 goto nla_put_failure; 2667 #ifdef CONFIG_IP_ROUTE_CLASSID 2668 if (rt->dst.tclassid && 2669 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) 2670 goto nla_put_failure; 2671 #endif 2672 if (!rt_is_input_route(rt) && 2673 fl4->saddr != src) { 2674 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr)) 2675 goto nla_put_failure; 2676 } 2677 if (rt->rt_uses_gateway && 2678 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway)) 2679 goto nla_put_failure; 2680 2681 expires = rt->dst.expires; 2682 if (expires) { 2683 unsigned long now = jiffies; 2684 2685 if (time_before(now, expires)) 2686 expires -= now; 2687 else 2688 expires = 0; 2689 } 2690 2691 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); 2692 if (rt->rt_pmtu && expires) 2693 metrics[RTAX_MTU - 1] = rt->rt_pmtu; 2694 if (rt->rt_mtu_locked && expires) 2695 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU); 2696 if (rtnetlink_put_metrics(skb, metrics) < 0) 2697 goto nla_put_failure; 2698 2699 if (fl4->flowi4_mark && 2700 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) 2701 goto nla_put_failure; 2702 2703 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) && 2704 nla_put_u32(skb, RTA_UID, 2705 from_kuid_munged(current_user_ns(), fl4->flowi4_uid))) 2706 goto nla_put_failure; 2707 2708 error = rt->dst.error; 2709 2710 if (rt_is_input_route(rt)) { 2711 #ifdef CONFIG_IP_MROUTE 2712 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && 2713 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { 2714 int err = ipmr_get_route(net, skb, 2715 fl4->saddr, fl4->daddr, 2716 r, portid); 2717 2718 if (err <= 0) { 2719 if (err == 0) 2720 return 0; 2721 goto nla_put_failure; 2722 } 2723 } else 2724 #endif 2725 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif)) 2726 goto nla_put_failure; 2727 } 
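	/* All per-route attributes are in place; attach the cache info
	 * (relative expiry and the dst error) via rtnl_put_cacheinfo()
	 * and close the message.
	 */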
2728 2729 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) 2730 goto nla_put_failure; 2731 2732 nlmsg_end(skb, nlh); 2733 return 0; 2734 2735 nla_put_failure: 2736 nlmsg_cancel(skb, nlh); 2737 return -EMSGSIZE; 2738 } 2739 2740 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst, 2741 u8 ip_proto, __be16 sport, 2742 __be16 dport) 2743 { 2744 struct sk_buff *skb; 2745 struct iphdr *iph; 2746 2747 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 2748 if (!skb) 2749 return NULL; 2750 2751 /* Reserve room for dummy headers, this skb can pass 2752 * through good chunk of routing engine. 2753 */ 2754 skb_reset_mac_header(skb); 2755 skb_reset_network_header(skb); 2756 skb->protocol = htons(ETH_P_IP); 2757 iph = skb_put(skb, sizeof(struct iphdr)); 2758 iph->protocol = ip_proto; 2759 iph->saddr = src; 2760 iph->daddr = dst; 2761 iph->version = 0x4; 2762 iph->frag_off = 0; 2763 iph->ihl = 0x5; 2764 skb_set_transport_header(skb, skb->len); 2765 2766 switch (iph->protocol) { 2767 case IPPROTO_UDP: { 2768 struct udphdr *udph; 2769 2770 udph = skb_put_zero(skb, sizeof(struct udphdr)); 2771 udph->source = sport; 2772 udph->dest = dport; 2773 udph->len = sizeof(struct udphdr); 2774 udph->check = 0; 2775 break; 2776 } 2777 case IPPROTO_TCP: { 2778 struct tcphdr *tcph; 2779 2780 tcph = skb_put_zero(skb, sizeof(struct tcphdr)); 2781 tcph->source = sport; 2782 tcph->dest = dport; 2783 tcph->doff = sizeof(struct tcphdr) / 4; 2784 tcph->rst = 1; 2785 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), 2786 src, dst, 0); 2787 break; 2788 } 2789 case IPPROTO_ICMP: { 2790 struct icmphdr *icmph; 2791 2792 icmph = skb_put_zero(skb, sizeof(struct icmphdr)); 2793 icmph->type = ICMP_ECHO; 2794 icmph->code = 0; 2795 } 2796 } 2797 2798 return skb; 2799 } 2800 2801 static int inet_rtm_valid_getroute_req(struct sk_buff *skb, 2802 const struct nlmsghdr *nlh, 2803 struct nlattr **tb, 2804 struct netlink_ext_ack *extack) 2805 { 2806 struct rtmsg *rtm; 2807 int i, err; 2808 2809 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { 2810 NL_SET_ERR_MSG(extack, 2811 "ipv4: Invalid header for route get request"); 2812 return -EINVAL; 2813 } 2814 2815 if (!netlink_strict_get_check(skb)) 2816 return nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, 2817 rtm_ipv4_policy, extack); 2818 2819 rtm = nlmsg_data(nlh); 2820 if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) || 2821 (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) || 2822 rtm->rtm_table || rtm->rtm_protocol || 2823 rtm->rtm_scope || rtm->rtm_type) { 2824 NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request"); 2825 return -EINVAL; 2826 } 2827 2828 if (rtm->rtm_flags & ~(RTM_F_NOTIFY | 2829 RTM_F_LOOKUP_TABLE | 2830 RTM_F_FIB_MATCH)) { 2831 NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request"); 2832 return -EINVAL; 2833 } 2834 2835 err = nlmsg_parse_strict(nlh, sizeof(*rtm), tb, RTA_MAX, 2836 rtm_ipv4_policy, extack); 2837 if (err) 2838 return err; 2839 2840 if ((tb[RTA_SRC] && !rtm->rtm_src_len) || 2841 (tb[RTA_DST] && !rtm->rtm_dst_len)) { 2842 NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4"); 2843 return -EINVAL; 2844 } 2845 2846 for (i = 0; i <= RTA_MAX; i++) { 2847 if (!tb[i]) 2848 continue; 2849 2850 switch (i) { 2851 case RTA_IIF: 2852 case RTA_OIF: 2853 case RTA_SRC: 2854 case RTA_DST: 2855 case RTA_IP_PROTO: 2856 case RTA_SPORT: 2857 case RTA_DPORT: 2858 case RTA_MARK: 2859 case RTA_UID: 2860 break; 2861 default: 2862 NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in 
route get request"); 2863 return -EINVAL; 2864 } 2865 } 2866 2867 return 0; 2868 } 2869 2870 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 2871 struct netlink_ext_ack *extack) 2872 { 2873 struct net *net = sock_net(in_skb->sk); 2874 struct nlattr *tb[RTA_MAX+1]; 2875 u32 table_id = RT_TABLE_MAIN; 2876 __be16 sport = 0, dport = 0; 2877 struct fib_result res = {}; 2878 u8 ip_proto = IPPROTO_UDP; 2879 struct rtable *rt = NULL; 2880 struct sk_buff *skb; 2881 struct rtmsg *rtm; 2882 struct flowi4 fl4 = {}; 2883 __be32 dst = 0; 2884 __be32 src = 0; 2885 kuid_t uid; 2886 u32 iif; 2887 int err; 2888 int mark; 2889 2890 err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack); 2891 if (err < 0) 2892 return err; 2893 2894 rtm = nlmsg_data(nlh); 2895 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0; 2896 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0; 2897 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; 2898 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; 2899 if (tb[RTA_UID]) 2900 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID])); 2901 else 2902 uid = (iif ? INVALID_UID : current_uid()); 2903 2904 if (tb[RTA_IP_PROTO]) { 2905 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], 2906 &ip_proto, AF_INET, extack); 2907 if (err) 2908 return err; 2909 } 2910 2911 if (tb[RTA_SPORT]) 2912 sport = nla_get_be16(tb[RTA_SPORT]); 2913 2914 if (tb[RTA_DPORT]) 2915 dport = nla_get_be16(tb[RTA_DPORT]); 2916 2917 skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport); 2918 if (!skb) 2919 return -ENOBUFS; 2920 2921 fl4.daddr = dst; 2922 fl4.saddr = src; 2923 fl4.flowi4_tos = rtm->rtm_tos; 2924 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; 2925 fl4.flowi4_mark = mark; 2926 fl4.flowi4_uid = uid; 2927 if (sport) 2928 fl4.fl4_sport = sport; 2929 if (dport) 2930 fl4.fl4_dport = dport; 2931 fl4.flowi4_proto = ip_proto; 2932 2933 rcu_read_lock(); 2934 2935 if (iif) { 2936 struct net_device *dev; 2937 2938 dev = dev_get_by_index_rcu(net, iif); 2939 if (!dev) { 2940 err = -ENODEV; 2941 goto errout_rcu; 2942 } 2943 2944 fl4.flowi4_iif = iif; /* for rt_fill_info */ 2945 skb->dev = dev; 2946 skb->mark = mark; 2947 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos, 2948 dev, &res); 2949 2950 rt = skb_rtable(skb); 2951 if (err == 0 && rt->dst.error) 2952 err = -rt->dst.error; 2953 } else { 2954 fl4.flowi4_iif = LOOPBACK_IFINDEX; 2955 skb->dev = net->loopback_dev; 2956 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb); 2957 err = 0; 2958 if (IS_ERR(rt)) 2959 err = PTR_ERR(rt); 2960 else 2961 skb_dst_set(skb, &rt->dst); 2962 } 2963 2964 if (err) 2965 goto errout_rcu; 2966 2967 if (rtm->rtm_flags & RTM_F_NOTIFY) 2968 rt->rt_flags |= RTCF_NOTIFY; 2969 2970 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE) 2971 table_id = res.table ? 
res.table->tb_id : 0; 2972 2973 /* reset skb for netlink reply msg */ 2974 skb_trim(skb, 0); 2975 skb_reset_network_header(skb); 2976 skb_reset_transport_header(skb); 2977 skb_reset_mac_header(skb); 2978 2979 if (rtm->rtm_flags & RTM_F_FIB_MATCH) { 2980 if (!res.fi) { 2981 err = fib_props[res.type].error; 2982 if (!err) 2983 err = -EHOSTUNREACH; 2984 goto errout_rcu; 2985 } 2986 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid, 2987 nlh->nlmsg_seq, RTM_NEWROUTE, table_id, 2988 rt->rt_type, res.prefix, res.prefixlen, 2989 fl4.flowi4_tos, res.fi, 0); 2990 } else { 2991 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb, 2992 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq); 2993 } 2994 if (err < 0) 2995 goto errout_rcu; 2996 2997 rcu_read_unlock(); 2998 2999 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 3000 3001 errout_free: 3002 return err; 3003 errout_rcu: 3004 rcu_read_unlock(); 3005 kfree_skb(skb); 3006 goto errout_free; 3007 } 3008 3009 void ip_rt_multicast_event(struct in_device *in_dev) 3010 { 3011 rt_cache_flush(dev_net(in_dev->dev)); 3012 } 3013 3014 #ifdef CONFIG_SYSCTL 3015 static int ip_rt_gc_interval __read_mostly = 60 * HZ; 3016 static int ip_rt_gc_min_interval __read_mostly = HZ / 2; 3017 static int ip_rt_gc_elasticity __read_mostly = 8; 3018 static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU; 3019 3020 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write, 3021 void __user *buffer, 3022 size_t *lenp, loff_t *ppos) 3023 { 3024 struct net *net = (struct net *)__ctl->extra1; 3025 3026 if (write) { 3027 rt_cache_flush(net); 3028 fnhe_genid_bump(net); 3029 return 0; 3030 } 3031 3032 return -EINVAL; 3033 } 3034 3035 static struct ctl_table ipv4_route_table[] = { 3036 { 3037 .procname = "gc_thresh", 3038 .data = &ipv4_dst_ops.gc_thresh, 3039 .maxlen = sizeof(int), 3040 .mode = 0644, 3041 .proc_handler = proc_dointvec, 3042 }, 3043 { 3044 .procname = "max_size", 3045 .data = &ip_rt_max_size, 3046 .maxlen = sizeof(int), 3047 .mode = 0644, 3048 .proc_handler = proc_dointvec, 3049 }, 3050 { 3051 /* Deprecated. 
Use gc_min_interval_ms */ 3052 3053 .procname = "gc_min_interval", 3054 .data = &ip_rt_gc_min_interval, 3055 .maxlen = sizeof(int), 3056 .mode = 0644, 3057 .proc_handler = proc_dointvec_jiffies, 3058 }, 3059 { 3060 .procname = "gc_min_interval_ms", 3061 .data = &ip_rt_gc_min_interval, 3062 .maxlen = sizeof(int), 3063 .mode = 0644, 3064 .proc_handler = proc_dointvec_ms_jiffies, 3065 }, 3066 { 3067 .procname = "gc_timeout", 3068 .data = &ip_rt_gc_timeout, 3069 .maxlen = sizeof(int), 3070 .mode = 0644, 3071 .proc_handler = proc_dointvec_jiffies, 3072 }, 3073 { 3074 .procname = "gc_interval", 3075 .data = &ip_rt_gc_interval, 3076 .maxlen = sizeof(int), 3077 .mode = 0644, 3078 .proc_handler = proc_dointvec_jiffies, 3079 }, 3080 { 3081 .procname = "redirect_load", 3082 .data = &ip_rt_redirect_load, 3083 .maxlen = sizeof(int), 3084 .mode = 0644, 3085 .proc_handler = proc_dointvec, 3086 }, 3087 { 3088 .procname = "redirect_number", 3089 .data = &ip_rt_redirect_number, 3090 .maxlen = sizeof(int), 3091 .mode = 0644, 3092 .proc_handler = proc_dointvec, 3093 }, 3094 { 3095 .procname = "redirect_silence", 3096 .data = &ip_rt_redirect_silence, 3097 .maxlen = sizeof(int), 3098 .mode = 0644, 3099 .proc_handler = proc_dointvec, 3100 }, 3101 { 3102 .procname = "error_cost", 3103 .data = &ip_rt_error_cost, 3104 .maxlen = sizeof(int), 3105 .mode = 0644, 3106 .proc_handler = proc_dointvec, 3107 }, 3108 { 3109 .procname = "error_burst", 3110 .data = &ip_rt_error_burst, 3111 .maxlen = sizeof(int), 3112 .mode = 0644, 3113 .proc_handler = proc_dointvec, 3114 }, 3115 { 3116 .procname = "gc_elasticity", 3117 .data = &ip_rt_gc_elasticity, 3118 .maxlen = sizeof(int), 3119 .mode = 0644, 3120 .proc_handler = proc_dointvec, 3121 }, 3122 { 3123 .procname = "mtu_expires", 3124 .data = &ip_rt_mtu_expires, 3125 .maxlen = sizeof(int), 3126 .mode = 0644, 3127 .proc_handler = proc_dointvec_jiffies, 3128 }, 3129 { 3130 .procname = "min_pmtu", 3131 .data = &ip_rt_min_pmtu, 3132 .maxlen = sizeof(int), 3133 .mode = 0644, 3134 .proc_handler = proc_dointvec_minmax, 3135 .extra1 = &ip_min_valid_pmtu, 3136 }, 3137 { 3138 .procname = "min_adv_mss", 3139 .data = &ip_rt_min_advmss, 3140 .maxlen = sizeof(int), 3141 .mode = 0644, 3142 .proc_handler = proc_dointvec, 3143 }, 3144 { } 3145 }; 3146 3147 static struct ctl_table ipv4_route_flush_table[] = { 3148 { 3149 .procname = "flush", 3150 .maxlen = sizeof(int), 3151 .mode = 0200, 3152 .proc_handler = ipv4_sysctl_rtcache_flush, 3153 }, 3154 { }, 3155 }; 3156 3157 static __net_init int sysctl_route_net_init(struct net *net) 3158 { 3159 struct ctl_table *tbl; 3160 3161 tbl = ipv4_route_flush_table; 3162 if (!net_eq(net, &init_net)) { 3163 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL); 3164 if (!tbl) 3165 goto err_dup; 3166 3167 /* Don't export sysctls to unprivileged users */ 3168 if (net->user_ns != &init_user_ns) 3169 tbl[0].procname = NULL; 3170 } 3171 tbl[0].extra1 = net; 3172 3173 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl); 3174 if (!net->ipv4.route_hdr) 3175 goto err_reg; 3176 return 0; 3177 3178 err_reg: 3179 if (tbl != ipv4_route_flush_table) 3180 kfree(tbl); 3181 err_dup: 3182 return -ENOMEM; 3183 } 3184 3185 static __net_exit void sysctl_route_net_exit(struct net *net) 3186 { 3187 struct ctl_table *tbl; 3188 3189 tbl = net->ipv4.route_hdr->ctl_table_arg; 3190 unregister_net_sysctl_table(net->ipv4.route_hdr); 3191 BUG_ON(tbl == ipv4_route_flush_table); 3192 kfree(tbl); 3193 } 3194 3195 static __net_initdata struct pernet_operations 
sysctl_route_ops = { 3196 .init = sysctl_route_net_init, 3197 .exit = sysctl_route_net_exit, 3198 }; 3199 #endif 3200 3201 static __net_init int rt_genid_init(struct net *net) 3202 { 3203 atomic_set(&net->ipv4.rt_genid, 0); 3204 atomic_set(&net->fnhe_genid, 0); 3205 atomic_set(&net->ipv4.dev_addr_genid, get_random_int()); 3206 return 0; 3207 } 3208 3209 static __net_initdata struct pernet_operations rt_genid_ops = { 3210 .init = rt_genid_init, 3211 }; 3212 3213 static int __net_init ipv4_inetpeer_init(struct net *net) 3214 { 3215 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); 3216 3217 if (!bp) 3218 return -ENOMEM; 3219 inet_peer_base_init(bp); 3220 net->ipv4.peers = bp; 3221 return 0; 3222 } 3223 3224 static void __net_exit ipv4_inetpeer_exit(struct net *net) 3225 { 3226 struct inet_peer_base *bp = net->ipv4.peers; 3227 3228 net->ipv4.peers = NULL; 3229 inetpeer_invalidate_tree(bp); 3230 kfree(bp); 3231 } 3232 3233 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = { 3234 .init = ipv4_inetpeer_init, 3235 .exit = ipv4_inetpeer_exit, 3236 }; 3237 3238 #ifdef CONFIG_IP_ROUTE_CLASSID 3239 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; 3240 #endif /* CONFIG_IP_ROUTE_CLASSID */ 3241 3242 int __init ip_rt_init(void) 3243 { 3244 int cpu; 3245 3246 ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents), 3247 GFP_KERNEL); 3248 if (!ip_idents) 3249 panic("IP: failed to allocate ip_idents\n"); 3250 3251 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); 3252 3253 ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL); 3254 if (!ip_tstamps) 3255 panic("IP: failed to allocate ip_tstamps\n"); 3256 3257 for_each_possible_cpu(cpu) { 3258 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); 3259 3260 INIT_LIST_HEAD(&ul->head); 3261 spin_lock_init(&ul->lock); 3262 } 3263 #ifdef CONFIG_IP_ROUTE_CLASSID 3264 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); 3265 if (!ip_rt_acct) 3266 panic("IP: failed to allocate ip_rt_acct\n"); 3267 #endif 3268 3269 ipv4_dst_ops.kmem_cachep = 3270 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0, 3271 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 3272 3273 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; 3274 3275 if (dst_entries_init(&ipv4_dst_ops) < 0) 3276 panic("IP: failed to allocate ipv4_dst_ops counter\n"); 3277 3278 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) 3279 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); 3280 3281 ipv4_dst_ops.gc_thresh = ~0; 3282 ip_rt_max_size = INT_MAX; 3283 3284 devinet_init(); 3285 ip_fib_init(); 3286 3287 if (ip_rt_proc_init()) 3288 pr_err("Unable to create route proc files\n"); 3289 #ifdef CONFIG_XFRM 3290 xfrm_init(); 3291 xfrm4_init(); 3292 #endif 3293 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, 3294 RTNL_FLAG_DOIT_UNLOCKED); 3295 3296 #ifdef CONFIG_SYSCTL 3297 register_pernet_subsys(&sysctl_route_ops); 3298 #endif 3299 register_pernet_subsys(&rt_genid_ops); 3300 register_pernet_subsys(&ipv4_inetpeer_ops); 3301 return 0; 3302 } 3303 3304 #ifdef CONFIG_SYSCTL 3305 /* 3306 * We really need to sanitize the damn ipv4 init order, then all 3307 * this nonsense will go away. 3308 */ 3309 void __init ip_static_sysctl_init(void) 3310 { 3311 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table); 3312 } 3313 #endif 3314