// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly = 9;
static int ip_rt_redirect_load __read_mostly = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly = HZ;
static int ip_rt_error_burst __read_mostly = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly = 256;

static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;

/*
 *	Interface to generic destination cache.
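 *	(the callbacks declared below are collected in ipv4_dst_ops)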
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu,
					   bool confirm_neigh);
static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		 ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.redirect =		ip_do_redirect,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
	.confirm_neigh =	ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos)
		return NULL;
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start	= rt_cache_seq_start,
	.next	= rt_cache_seq_next,
	.stop	= rt_cache_seq_stop,
	.show	= rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cache_seq_ops);
}

static const struct proc_ops rt_cache_proc_ops = {
	.proc_open	= rt_cache_seq_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
	.proc_release	= seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	(*pos)++;
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   0, /* st->in_hit */
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   0, /* st->out_hit */
		   st->out_slow_tot,
		   st->out_slow_mc,

		   0, /* st->gc_total */
		   0, /* st->gc_ignored */
		   0, /* st->gc_goal_miss */
		   0, /* st->gc_dst_overflow */
		   0, /* st->in_hlist_search */
		   0  /* st->out_hlist_search */
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start	= rt_cpu_seq_start,
	.next	= rt_cpu_seq_next,
	.stop	= rt_cpu_seq_stop,
	.show	= rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct proc_ops rt_cpu_proc_ops = {
	.proc_open	= rt_cpu_seq_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
	.proc_release	= seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_create("rt_cache", 0444, net->proc_net,
			  &rt_cache_proc_ops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", 0444,
			  net->proc_net_stat, &rt_cpu_proc_ops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create_single("rt_acct", 0, net->proc_net,
				 rt_acct_proc_show);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	const struct rtable *rt = container_of(dst, struct rtable, dst);
	struct net_device *dev = dst->dev;
	struct neighbour *n;

	rcu_read_lock_bh();

	if (likely(rt->rt_gw_family == AF_INET)) {
		n = ip_neigh_gw4(dev, rt->rt_gw4);
	} else if (rt->rt_gw_family == AF_INET6) {
		n = ip_neigh_gw6(dev, &rt->rt_gw6);
	} else {
		__be32 pkey;

		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
		n = ip_neigh_gw4(dev, pkey);
	}

	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
		n = NULL;

	rcu_read_unlock_bh();

	return n;
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	const struct rtable *rt = container_of(dst, struct rtable, dst);
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;

	if (rt->rt_gw_family == AF_INET) {
		pkey = (const __be32 *)&rt->rt_gw4;
	} else if (rt->rt_gw_family == AF_INET6) {
		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
	} else if (!daddr ||
		   (rt->rt_flags &
		    (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
		return;
	}
	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
	u32 old = READ_ONCE(*p_tstamp);
	u32 now = (u32)jiffies;
	u32 delta = 0;

	if (old != now && cmpxchg(p_tstamp, old, now) == old)
		delta = prandom_u32_max(now - old);

	/* If UBSAN reports an error here, please make sure your compiler
	 * supports -fno-strict-overflow before reporting it: that was a bug
	 * in UBSAN, and it has been fixed in GCC-8.
	 */
	return atomic_add_return(segs + delta, p_id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
	u32 hash, id;

	/* Note the following code is not safe, but this is okay. */
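	/* (several CPUs may race to initialize the per-netns siphash key;
	 * the identifiers generated remain valid either way)
	 */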
	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
		get_random_bytes(&net->ipv4.ip_id_key,
				 sizeof(net->ipv4.ip_id_key));

	hash = siphash_3u32((__force u32)iph->daddr,
			    (__force u32)iph->saddr,
			    iph->protocol,
			    &net->ipv4.ip_id_key);
	id = ip_idents_reserve(hash, segs);
	iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
			     const struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		mark = sk->sk_mark;
		tos = RT_CONN_FLAGS(sk);
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	}
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0,
			   sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
			       const struct sock *sk)
{
	const struct net *net = dev_net(skb->dev);
	const struct iphdr *iph = ip_hdr(skb);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
	rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (skb)
		build_skb_flow_key(fl4, skb, sk);
	else
		build_sk_flow_key(fl4, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
	struct rtable *rt;

	rt = rcu_dereference(fnhe->fnhe_rth_input);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
	rt = rcu_dereference(fnhe->fnhe_rth_output);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
	struct fib_nh_exception *fnhe, *oldest;

	oldest = rcu_dereference(hash->chain);
	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
			oldest = fnhe;
	}
	fnhe_flush_routes(oldest);
	return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
	static u32 fnhe_hashrnd __read_mostly;
	u32 hval;

	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
	hval = jhash_1word((__force u32)daddr, fnhe_hashrnd);
	return hash_32(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
	rt->rt_pmtu = fnhe->fnhe_pmtu;
	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
	rt->dst.expires = fnhe->fnhe_expires;

	if (fnhe->fnhe_gw) {
		rt->rt_flags |= RTCF_REDIRECTED;
		rt->rt_uses_gateway = 1;
		rt->rt_gw_family = AF_INET;
		rt->rt_gw4 = fnhe->fnhe_gw;
	}
}

static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
				  __be32 gw, u32 pmtu, bool lock,
				  unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	struct rtable *rt;
	u32 genid, hval;
	unsigned int i;
	int depth;

	genid = fnhe_genid(dev_net(nhc->nhc_dev));
	hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference(nhc->nhc_exceptions);
	if (!hash) {
		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		rcu_assign_pointer(nhc->nhc_exceptions, hash);
	}

	hash += hval;

	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		if (fnhe->fnhe_genid != genid)
			fnhe->fnhe_genid = genid;
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_mtu_locked = lock;
		}
		fnhe->fnhe_expires = max(1UL, expires);
		/* Update all cached dsts too */
		rt = rcu_dereference(fnhe->fnhe_rth_input);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
		rt = rcu_dereference(fnhe->fnhe_rth_output);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
	} else {
		if (depth > FNHE_RECLAIM_DEPTH)
			fnhe = fnhe_oldest(hash);
		else {
			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
			if (!fnhe)
				goto out_unlock;

			fnhe->fnhe_next = hash->chain;
			rcu_assign_pointer(hash->chain, fnhe);
		}
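		/* Fill in the new (or recycled) exception entry. */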
		fnhe->fnhe_genid = genid;
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_mtu_locked = lock;
		fnhe->fnhe_expires = max(1UL, expires);

		/* Exception created; mark the cached routes for the nexthop
		 * stale, so anyone caching it rechecks if this exception
		 * applies to them.
		 */
		rt = rcu_dereference(nhc->nhc_rth_input);
		if (rt)
			rt->dst.obsolete = DST_OBSOLETE_KILL;

		for_each_possible_cpu(i) {
			struct rtable __rcu **prt;
			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
			rt = rcu_dereference(*prt);
			if (rt)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
		}
	}

	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
	if (!n)
		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
	if (!IS_ERR(n)) {
		if (!(n->nud_state & NUD_VALID)) {
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res, 0) == 0) {
				struct fib_nh_common *nhc = FIB_RES_NHC(res);

				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
						      0, false,
						      jiffies + ip_rt_gc_timeout);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     " Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct rtable *rt;
	struct flowi4 fl4;
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct net *net = dev_net(skb->dev);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	rt = (struct rtable *) dst;

	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
	__ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->dst.expires) {
			ip_rt_put(rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot about the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;
	int vif;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
		peer->rate_tokens = 0;
		peer->n_redirects = 0;
	}

	/* Too many ignored redirects; do not send anything and
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->n_redirects >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->n_redirects == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->n_redirects)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
		++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->n_redirects == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	if (netif_is_l3_master(skb->dev)) {
		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
		if (!dev)
			goto out;
	}

	in_dev = __in_dev_get_rcu(dev);

	/* IP on this device is disabled. */
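	/* If so, drop the packet without generating an ICMP error. */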
	if (!in_dev)
		goto out;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
			       l3mdev_master_ifindex(skb->dev), 1);

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
		inet_putpeer(peer);
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	struct fib_result res;
	bool lock = false;
	u32 old_mtu;

	if (ip_mtu_locked(dst))
		return;

	old_mtu = ipv4_mtu(dst);
	if (old_mtu < mtu)
		return;

	if (mtu < ip_rt_min_pmtu) {
		lock = true;
		mtu = min(old_mtu, ip_rt_min_pmtu);
	}

	if (rt->rt_pmtu == mtu && !lock &&
	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
		return;

	rcu_read_lock();
	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
		struct fib_nh_common *nhc = FIB_RES_NHC(res);

		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu,
			      bool confirm_neigh)
{
	struct rtable *rt = (struct rtable *) dst;
	struct flowi4 fl4;

	ip_rt_build_flow_key(&fl4, sk, skb);

	/* Don't make lookup fail for bridged encapsulations */
	if (skb && netif_is_any_bridge_port(skb->dev))
		fl4.flowi4_oif = 0;

	__ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u8 protocol)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	u32 mark = IP4_REPLY_MARK(net, skb->mark);

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

	if (!fl4.flowi4_mark)
		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct dst_entry *odst = NULL;
	bool new = false;
	struct net *net = sock_net(sk);

	bh_lock_sock(sk);

	if (!ip_sk_accept_pmtu(sk))
		goto out;

	odst = sk_dst_get(sk);

	if (sock_owned_by_user(sk) || !odst) {
		__ipv4_sk_update_pmtu(skb, sk, mtu);
		goto out;
	}

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

	rt = (struct rtable *)odst;
	if (odst->obsolete && !odst->ops->check(odst, 0)) {
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	__ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);

	if (!dst_check(&rt->dst, 0)) {
		if (new)
			dst_release(&rt->dst);

		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	if (new)
		sk_dst_set(sk, &rt->dst);

out:
	bh_unlock_sock(sk);
	dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
		   int oif, u8 protocol)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	const struct iphdr *iph = (const struct iphdr *)skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct net *net = sock_net(sk);

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	/* All IPV4 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 *
	 * When a PMTU/redirect information update invalidates a route,
	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
	 * DST_OBSOLETE_DEAD.
	 */
	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
		return NULL;
	return dst;
}

static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
	struct ip_options opt;
	int res;

	/* Recompile ip options since IPCB may not be valid anymore.
	 * Also check we have a reasonable ipv4 header.
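	 * (version must be 4 and ihl at least 5)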
	 */
	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
		return;

	memset(&opt, 0, sizeof(opt));
	if (ip_hdr(skb)->ihl > 5) {
		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
			return;
		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);

		rcu_read_lock();
		res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
		rcu_read_unlock();

		if (res)
			return;
	}
	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	ipv4_send_dest_unreach(skb);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so that it is out of the fast path.

   BTW remember: "addr" is allowed to be not aligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct iphdr *iph = ip_hdr(skb);
		struct flowi4 fl4 = {
			.daddr = iph->daddr,
			.saddr = iph->saddr,
			.flowi4_tos = RT_TOS(iph->tos),
			.flowi4_oif = rt->dst.dev->ifindex,
			.flowi4_iif = skb->dev->ifindex,
			.flowi4_mark = skb->mark,
		};

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
		else
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
				    ip_rt_min_advmss);

	return min(advmss, IPV4_MAX_PMTU - header_size);
}

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *)dst;
	unsigned int mtu = rt->rt_pmtu;

	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu)
		return mtu;

	mtu = READ_ONCE(dst->dev->mtu);

	if (unlikely(ip_mtu_locked(dst))) {
		if (rt->rt_uses_gateway && mtu > 576)
			mtu = 576;
	}

	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference_protected(nhc->nhc_exceptions,
					 lockdep_is_held(&fnhe_lock));
	hash += hval;

	fnhe_p = &hash->chain;
	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
	while (fnhe) {
		if (fnhe->fnhe_daddr == daddr) {
			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
			/* set fnhe_daddr to 0 to ensure it won't bind with
			 * new dsts in rt_bind_exception().
			 */
			fnhe->fnhe_daddr = 0;
			fnhe_flush_routes(fnhe);
			kfree_rcu(fnhe, rcu);
			break;
		}
		fnhe_p = &fnhe->fnhe_next;
		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
						 lockdep_is_held(&fnhe_lock));
	}

	spin_unlock_bh(&fnhe_lock);
}

static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
					       __be32 daddr)
{
	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
	struct fib_nh_exception *fnhe;
	u32 hval;

	if (!hash)
		return NULL;

	hval = fnhe_hashfun(daddr);

	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr) {
			if (fnhe->fnhe_expires &&
			    time_after(jiffies, fnhe->fnhe_expires)) {
				ip_del_fnhe(nhc, daddr);
				break;
			}
			return fnhe;
		}
	}
	return NULL;
}

/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 */

u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
	struct fib_nh_common *nhc = res->nhc;
	struct net_device *dev = nhc->nhc_dev;
	struct fib_info *fi = res->fi;
	u32 mtu = 0;

	if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
		mtu = fi->fib_mtu;

	if (likely(!mtu)) {
		struct fib_nh_exception *fnhe;

		fnhe = find_exception(nhc, daddr);
		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
			mtu = fnhe->fnhe_pmtu;
	}

	if (likely(!mtu))
		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);

	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr, const bool do_cache)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	if (daddr == fnhe->fnhe_daddr) {
		struct rtable __rcu **porig;
		struct rtable *orig;
		int genid = fnhe_genid(dev_net(rt->dst.dev));

		if (rt_is_input_route(rt))
			porig = &fnhe->fnhe_rth_input;
		else
			porig = &fnhe->fnhe_rth_output;
		orig = rcu_dereference(*porig);

		if (fnhe->fnhe_genid != genid) {
			fnhe->fnhe_genid = genid;
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
			fnhe->fnhe_mtu_locked = false;
			fnhe_flush_routes(fnhe);
			orig = NULL;
		}
		fill_route_from_fnhe(rt, fnhe);
		if (!rt->rt_gw4) {
			rt->rt_gw4 = daddr;
			rt->rt_gw_family = AF_INET;
		}

		if (do_cache) {
			dst_hold(&rt->dst);
			rcu_assign_pointer(*porig, rt);
			if (orig) {
				dst_dev_put(&orig->dst);
				dst_release(&orig->dst);
			}
			ret = true;
		}

		fnhe->fnhe_stamp = jiffies;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}

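/* Cache @rt as the nexthop's dst: in the single input slot for input routes,
 * or in this CPU's output slot otherwise.  cmpxchg() resolves races with
 * concurrent writers; the loser simply drops the extra reference it took.
 */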
static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nhc->nhc_rth_input;
	} else {
		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
	}
	orig = *p;

	/* hold dst before doing cmpxchg() to avoid race condition
	 * on this dst
	 */
	dst_hold(&rt->dst);
	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		if (orig) {
			rt_add_uncached_list(orig);
			dst_release(&orig->dst);
		}
	} else {
		dst_release(&rt->dst);
		ret = false;
	}

	return ret;
}

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

void rt_add_uncached_list(struct rtable *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

	rt->rt_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt_del_uncached_list(struct rtable *rt)
{
	if (!list_empty(&rt->rt_uncached)) {
		struct uncached_list *ul = rt->rt_uncached_list;

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt_uncached);
		spin_unlock_bh(&ul->lock);
	}
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;

	ip_dst_metrics_put(dst);
	rt_del_uncached_list(rt);
}

void rt_flush_dev(struct net_device *dev)
{
	struct rtable *rt;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt_uncached) {
			if (rt->dst.dev != dev)
				continue;
			rt->dst.dev = blackhole_netdev;
			dev_hold(rt->dst.dev);
			dev_put(dev);
		}
		spin_unlock_bh(&ul->lock);
	}
}

static bool rt_cache_valid(const struct rtable *rt)
{
	return	rt &&
		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
		!rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag,
			   const bool do_cache)
{
	bool cached = false;

	if (fi) {
		struct fib_nh_common *nhc = FIB_RES_NHC(*res);

		if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
			rt->rt_uses_gateway = 1;
			rt->rt_gw_family = nhc->nhc_gw_family;
			/* only INET and INET6 are supported */
			if (likely(nhc->nhc_gw_family == AF_INET))
				rt->rt_gw4 = nhc->nhc_gw.ipv4;
			else
				rt->rt_gw6 = nhc->nhc_gw.ipv6;
		}

		ip_dst_init_metrics(&rt->dst, fi->fib_metrics);

#ifdef CONFIG_IP_ROUTE_CLASSID
		if (nhc->nhc_family == AF_INET) {
			struct fib_nh *nh;

			nh = container_of(nhc, struct fib_nh, nh_common);
			rt->dst.tclassid = nh->nh_tclassid;
		}
#endif
		rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
		else if (do_cache)
			cached = rt_cache_route(nhc, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in nexthop exception or
			 * FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache we really need to set it.
			 */
			if (!rt->rt_gw4) {
				rt->rt_gw_family = AF_INET;
				rt->rt_gw4 = daddr;
			}
			rt_add_uncached_list(rt);
		}
	} else
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}

struct rtable *rt_dst_alloc(struct net_device *dev,
			    unsigned int flags, u16 type,
			    bool nopolicy, bool noxfrm)
{
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
		       (nopolicy ? DST_NOPOLICY : 0) |
		       (noxfrm ? DST_NOXFRM : 0));

	if (rt) {
		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
		rt->rt_flags = flags;
		rt->rt_type = type;
		rt->rt_is_input = 0;
		rt->rt_iif = 0;
		rt->rt_pmtu = 0;
		rt->rt_mtu_locked = 0;
		rt->rt_uses_gateway = 0;
		rt->rt_gw_family = 0;
		rt->rt_gw4 = 0;
		INIT_LIST_HEAD(&rt->rt_uncached);

		rt->dst.output = ip_output;
		if (flags & RTCF_LOCAL)
			rt->dst.input = ip_local_deliver;
	}

	return rt;
}
EXPORT_SYMBOL(rt_dst_alloc);

struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
{
	struct rtable *new_rt;

	new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
			   rt->dst.flags);

	if (new_rt) {
		new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
		new_rt->rt_flags = rt->rt_flags;
		new_rt->rt_type = rt->rt_type;
		new_rt->rt_is_input = rt->rt_is_input;
		new_rt->rt_iif = rt->rt_iif;
		new_rt->rt_pmtu = rt->rt_pmtu;
		new_rt->rt_mtu_locked = rt->rt_mtu_locked;
		new_rt->rt_gw_family = rt->rt_gw_family;
		if (rt->rt_gw_family == AF_INET)
			new_rt->rt_gw4 = rt->rt_gw4;
		else if (rt->rt_gw_family == AF_INET6)
			new_rt->rt_gw6 = rt->rt_gw6;
		INIT_LIST_HEAD(&new_rt->rt_uncached);

		new_rt->dst.input = rt->dst.input;
		new_rt->dst.output = rt->dst.output;
		new_rt->dst.error = rt->dst.error;
		new_rt->dst.lastuse = jiffies;
		new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
	}
	return new_rt;
}
EXPORT_SYMBOL(rt_dst_clone);

/* called in rcu_read_lock() section */
int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev,
			  struct in_device *in_dev, u32 *itag)
{
	int err;

	/* Primary sanity checks. */
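	/* The checks below reject martian sources and non-IPv4 frames. */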
	if (!in_dev)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		return -EINVAL;

	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
		return -EINVAL;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr) &&
		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
			return -EINVAL;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, itag);
		if (err < 0)
			return err;
	}
	return 0;
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	unsigned int flags = RTCF_MULTICAST;
	struct rtable *rth;
	u32 itag = 0;
	int err;

	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
	if (err)
		return err;

	if (our)
		flags |= RTCF_LOCAL;

	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		return -ENOBUFS;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;
	rth->rt_is_input = 1;

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);
	return 0;
}


static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	RFC1812 recommendation, if source is martian,
		 *	the only hint is MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, false);
		}
	}
#endif
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
	struct net_device *dev = nhc->nhc_dev;
	struct fib_nh_exception *fnhe;
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	bool do_cache;
	u32 itag = 0;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(dev);
	if (!out_dev) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	do_cache = res->fi && !itag;
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    skb->protocol == htons(ETH_P_IP)) {
		__be32 gw;

		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
		if (IN_DEV_SHARED_MEDIA(out_dev) ||
		    inet_addr_onlink(out_dev, saddr, gw))
			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
	}

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	fnhe = find_exception(nhc, daddr);
	if (do_cache) {
		if (fnhe)
			rth = rcu_dereference(fnhe->fnhe_rth_input);
		else
			rth = rcu_dereference(nhc->nhc_rth_input);
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM));
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_is_input = 1;
	RT_CACHE_STAT_INC(in_slow_tot);

	rth->dst.input = ip_forward;

	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
		       do_cache);
	lwtunnel_set_redirect(&rth->dst);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
cleanup:
	return err;
}

#ifdef CONFIG_IP_ROUTE_MULTIPATH
/* To make ICMP packets follow the right flow, the multipath hash is
 * calculated from the inner IP addresses.
 */
static void ip_multipath_l3_keys(const struct sk_buff *skb,
				 struct flow_keys *hash_keys)
{
	const struct iphdr *outer_iph = ip_hdr(skb);
	const struct iphdr *key_iph = outer_iph;
	const struct iphdr *inner_iph;
	const struct icmphdr *icmph;
	struct iphdr _inner_iph;
	struct icmphdr _icmph;

	if (likely(outer_iph->protocol != IPPROTO_ICMP))
		goto out;

	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
		goto out;

	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
				   &_icmph);
	if (!icmph)
		goto out;

	if (!icmp_is_err(icmph->type))
		goto out;

	inner_iph = skb_header_pointer(skb,
				       outer_iph->ihl * 4 + sizeof(_icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
out:
	hash_keys->addrs.v4addrs.src = key_iph->saddr;
	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
}

/* if skb is set it will be used and fl4 can be NULL */
int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
	struct flow_keys hash_keys;
	u32 mhash;

	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
		if (skb) {
			ip_multipath_l3_keys(skb, &hash_keys);
		} else {
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
		}
		break;
	case 1:
		/* skb is currently provided only when forwarding */
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}

			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
			hash_keys.ports.src = fl4->fl4_sport;
			hash_keys.ports.dst = fl4->fl4_dport;
			hash_keys.basic.ip_proto = fl4->flowi4_proto;
		}
		break;
	case 2:
		memset(&hash_keys, 0, sizeof(hash_keys));
		/* skb is currently provided only when forwarding */
		if (skb) {
			struct flow_keys keys;

			skb_flow_dissect_flow_keys(skb, &keys, 0);
			/* Inner can be v4 or v6 */
			if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
				hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
				hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
			} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
				hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
				hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
				hash_keys.tags.flow_label = keys.tags.flow_label;
				hash_keys.basic.ip_proto = keys.basic.ip_proto;
			} else {
				/* Same as case 0 */
				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
				ip_multipath_l3_keys(skb, &hash_keys);
			}
		} else {
			/* Same as case 0 */
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	if (multipath_hash)
		mhash = jhash_2words(mhash, multipath_hash, 0);

	return mhash >> 1;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH */

static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos,
			    struct flow_keys *hkeys)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && fib_info_num_path(res->fi) > 1) {
		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);

		fib_select_multipath(res, h);
	}
#endif

	/* create a routing cache entry */
*/ 2024 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos); 2025 } 2026 2027 /* Implements the same saddr-related checks as ip_route_input_slow(), 2028 * assuming daddr is valid and the destination is not a local broadcast address. 2029 * Uses the provided hint instead of performing a route lookup. 2030 */ 2031 int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2032 u8 tos, struct net_device *dev, 2033 const struct sk_buff *hint) 2034 { 2035 struct in_device *in_dev = __in_dev_get_rcu(dev); 2036 struct rtable *rt = skb_rtable(hint); 2037 struct net *net = dev_net(dev); 2038 int err = -EINVAL; 2039 u32 tag = 0; 2040 2041 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) 2042 goto martian_source; 2043 2044 if (ipv4_is_zeronet(saddr)) 2045 goto martian_source; 2046 2047 if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) 2048 goto martian_source; 2049 2050 if (rt->rt_type != RTN_LOCAL) 2051 goto skip_validate_source; 2052 2053 tos &= IPTOS_RT_MASK; 2054 err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag); 2055 if (err < 0) 2056 goto martian_source; 2057 2058 skip_validate_source: 2059 skb_dst_copy(skb, hint); 2060 return 0; 2061 2062 martian_source: 2063 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); 2064 return err; 2065 } 2066 2067 /* 2068 * NOTE. We drop all packets that have local source 2069 * addresses, because every properly looped back packet 2070 * must already have the correct destination attached by the output routine. 2071 * Changes to the enforced policies must also be applied to 2072 * ip_route_use_hint(). 2073 * 2074 * This approach solves two big problems: 2075 * 1. Non-simplex devices are handled properly. 2076 * 2. IP spoofing attempts are filtered with a 100% guarantee. 2077 * Called with rcu_read_lock(). 2078 */ 2079 2080 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2081 u8 tos, struct net_device *dev, 2082 struct fib_result *res) 2083 { 2084 struct in_device *in_dev = __in_dev_get_rcu(dev); 2085 struct flow_keys *flkeys = NULL, _flkeys; 2086 struct net *net = dev_net(dev); 2087 struct ip_tunnel_info *tun_info; 2088 int err = -EINVAL; 2089 unsigned int flags = 0; 2090 u32 itag = 0; 2091 struct rtable *rth; 2092 struct flowi4 fl4; 2093 bool do_cache = true; 2094 2095 /* IP on this device is disabled. */ 2096 2097 if (!in_dev) 2098 goto out; 2099 2100 /* Check for the weirdest martians, which cannot be detected 2101 by fib_lookup. 2102 */ 2103 2104 tun_info = skb_tunnel_info(skb); 2105 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX)) 2106 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id; 2107 else 2108 fl4.flowi4_tun_key.tun_id = 0; 2109 skb_dst_drop(skb); 2110 2111 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) 2112 goto martian_source; 2113 2114 res->fi = NULL; 2115 res->table = NULL; 2116 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) 2117 goto brd_input; 2118 2119 /* Accept zero addresses only for the limited broadcast destination; 2120 * I do not even know whether to fix this or not.
Waiting for complaints :-) 2121 */ 2122 if (ipv4_is_zeronet(saddr)) 2123 goto martian_source; 2124 2125 if (ipv4_is_zeronet(daddr)) 2126 goto martian_destination; 2127 2128 /* The following code avoids calling IN_DEV_NET_ROUTE_LOCALNET() unnecessarily, 2129 * calling it at most once and only when daddr and/or saddr is a loopback address 2130 */ 2131 if (ipv4_is_loopback(daddr)) { 2132 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) 2133 goto martian_destination; 2134 } else if (ipv4_is_loopback(saddr)) { 2135 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) 2136 goto martian_source; 2137 } 2138 2139 /* 2140 * Now we are ready to route the packet. 2141 */ 2142 fl4.flowi4_oif = 0; 2143 fl4.flowi4_iif = dev->ifindex; 2144 fl4.flowi4_mark = skb->mark; 2145 fl4.flowi4_tos = tos; 2146 fl4.flowi4_scope = RT_SCOPE_UNIVERSE; 2147 fl4.flowi4_flags = 0; 2148 fl4.daddr = daddr; 2149 fl4.saddr = saddr; 2150 fl4.flowi4_uid = sock_net_uid(net, NULL); 2151 2152 if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) { 2153 flkeys = &_flkeys; 2154 } else { 2155 fl4.flowi4_proto = 0; 2156 fl4.fl4_sport = 0; 2157 fl4.fl4_dport = 0; 2158 } 2159 2160 err = fib_lookup(net, &fl4, res, 0); 2161 if (err != 0) { 2162 if (!IN_DEV_FORWARD(in_dev)) 2163 err = -EHOSTUNREACH; 2164 goto no_route; 2165 } 2166 2167 if (res->type == RTN_BROADCAST) { 2168 if (IN_DEV_BFORWARD(in_dev)) 2169 goto make_route; 2170 /* do not cache if bc_forwarding is enabled */ 2171 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING)) 2172 do_cache = false; 2173 goto brd_input; 2174 } 2175 2176 if (res->type == RTN_LOCAL) { 2177 err = fib_validate_source(skb, saddr, daddr, tos, 2178 0, dev, in_dev, &itag); 2179 if (err < 0) 2180 goto martian_source; 2181 goto local_input; 2182 } 2183 2184 if (!IN_DEV_FORWARD(in_dev)) { 2185 err = -EHOSTUNREACH; 2186 goto no_route; 2187 } 2188 if (res->type != RTN_UNICAST) 2189 goto martian_destination; 2190 2191 make_route: 2192 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys); 2193 out: return err; 2194 2195 brd_input: 2196 if (skb->protocol != htons(ETH_P_IP)) 2197 goto e_inval; 2198 2199 if (!ipv4_is_zeronet(saddr)) { 2200 err = fib_validate_source(skb, saddr, 0, tos, 0, dev, 2201 in_dev, &itag); 2202 if (err < 0) 2203 goto martian_source; 2204 } 2205 flags |= RTCF_BROADCAST; 2206 res->type = RTN_BROADCAST; 2207 RT_CACHE_STAT_INC(in_brd); 2208 2209 local_input: 2210 do_cache &= res->fi && !itag; 2211 if (do_cache) { 2212 struct fib_nh_common *nhc = FIB_RES_NHC(*res); 2213 2214 rth = rcu_dereference(nhc->nhc_rth_input); 2215 if (rt_cache_valid(rth)) { 2216 skb_dst_set_noref(skb, &rth->dst); 2217 err = 0; 2218 goto out; 2219 } 2220 } 2221 2222 rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ?
: net->loopback_dev, 2223 flags | RTCF_LOCAL, res->type, 2224 IN_DEV_CONF_GET(in_dev, NOPOLICY), false); 2225 if (!rth) 2226 goto e_nobufs; 2227 2228 rth->dst.output = ip_rt_bug; 2229 #ifdef CONFIG_IP_ROUTE_CLASSID 2230 rth->dst.tclassid = itag; 2231 #endif 2232 rth->rt_is_input = 1; 2233 2234 RT_CACHE_STAT_INC(in_slow_tot); 2235 if (res->type == RTN_UNREACHABLE) { 2236 rth->dst.input = ip_error; 2237 rth->dst.error = -err; 2238 rth->rt_flags &= ~RTCF_LOCAL; 2239 } 2240 2241 if (do_cache) { 2242 struct fib_nh_common *nhc = FIB_RES_NHC(*res); 2243 2244 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate); 2245 if (lwtunnel_input_redirect(rth->dst.lwtstate)) { 2246 WARN_ON(rth->dst.input == lwtunnel_input); 2247 rth->dst.lwtstate->orig_input = rth->dst.input; 2248 rth->dst.input = lwtunnel_input; 2249 } 2250 2251 if (unlikely(!rt_cache_route(nhc, rth))) 2252 rt_add_uncached_list(rth); 2253 } 2254 skb_dst_set(skb, &rth->dst); 2255 err = 0; 2256 goto out; 2257 2258 no_route: 2259 RT_CACHE_STAT_INC(in_no_route); 2260 res->type = RTN_UNREACHABLE; 2261 res->fi = NULL; 2262 res->table = NULL; 2263 goto local_input; 2264 2265 /* 2266 * Do not cache martian addresses: they should be logged (RFC1812) 2267 */ 2268 martian_destination: 2269 RT_CACHE_STAT_INC(in_martian_dst); 2270 #ifdef CONFIG_IP_ROUTE_VERBOSE 2271 if (IN_DEV_LOG_MARTIANS(in_dev)) 2272 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n", 2273 &daddr, &saddr, dev->name); 2274 #endif 2275 2276 e_inval: 2277 err = -EINVAL; 2278 goto out; 2279 2280 e_nobufs: 2281 err = -ENOBUFS; 2282 goto out; 2283 2284 martian_source: 2285 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); 2286 goto out; 2287 } 2288 2289 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2290 u8 tos, struct net_device *dev) 2291 { 2292 struct fib_result res; 2293 int err; 2294 2295 tos &= IPTOS_RT_MASK; 2296 rcu_read_lock(); 2297 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res); 2298 rcu_read_unlock(); 2299 2300 return err; 2301 } 2302 EXPORT_SYMBOL(ip_route_input_noref); 2303 2304 /* called with rcu_read_lock held */ 2305 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr, 2306 u8 tos, struct net_device *dev, struct fib_result *res) 2307 { 2308 /* Multicast recognition logic was moved from the route cache to here. 2309 The problem was that too many Ethernet cards have broken/missing 2310 hardware multicast filters :-( As a result, a host on a multicast 2311 network acquires a lot of useless route cache entries, such as 2312 SDR messages from all over the world. Now we try to get rid of them. 2313 Really, provided the software IP multicast filter is organized 2314 reasonably (at least, hashed), it does not result in a slowdown 2315 compared with route cache reject entries. 2316 Note that multicast routers are not affected, because 2317 a route cache entry is created for them eventually.
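In short: for a multicast daddr we first check local group membership on the receiving in_device (and, when the device is an L3 slave, also on its master via skb->dev), and only build a multicast route through ip_route_input_mc() if the host is a member or, for non link-local groups, if multicast forwarding (CONFIG_IP_MROUTE) is enabled on the device. Everything else falls through to ip_route_input_slow().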
2318 */ 2319 if (ipv4_is_multicast(daddr)) { 2320 struct in_device *in_dev = __in_dev_get_rcu(dev); 2321 int our = 0; 2322 int err = -EINVAL; 2323 2324 if (!in_dev) 2325 return err; 2326 our = ip_check_mc_rcu(in_dev, daddr, saddr, 2327 ip_hdr(skb)->protocol); 2328 2329 /* check l3 master if no match yet */ 2330 if (!our && netif_is_l3_slave(dev)) { 2331 struct in_device *l3_in_dev; 2332 2333 l3_in_dev = __in_dev_get_rcu(skb->dev); 2334 if (l3_in_dev) 2335 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr, 2336 ip_hdr(skb)->protocol); 2337 } 2338 2339 if (our 2340 #ifdef CONFIG_IP_MROUTE 2341 || 2342 (!ipv4_is_local_multicast(daddr) && 2343 IN_DEV_MFORWARD(in_dev)) 2344 #endif 2345 ) { 2346 err = ip_route_input_mc(skb, daddr, saddr, 2347 tos, dev, our); 2348 } 2349 return err; 2350 } 2351 2352 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res); 2353 } 2354 2355 /* called with rcu_read_lock() */ 2356 static struct rtable *__mkroute_output(const struct fib_result *res, 2357 const struct flowi4 *fl4, int orig_oif, 2358 struct net_device *dev_out, 2359 unsigned int flags) 2360 { 2361 struct fib_info *fi = res->fi; 2362 struct fib_nh_exception *fnhe; 2363 struct in_device *in_dev; 2364 u16 type = res->type; 2365 struct rtable *rth; 2366 bool do_cache; 2367 2368 in_dev = __in_dev_get_rcu(dev_out); 2369 if (!in_dev) 2370 return ERR_PTR(-EINVAL); 2371 2372 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev))) 2373 if (ipv4_is_loopback(fl4->saddr) && 2374 !(dev_out->flags & IFF_LOOPBACK) && 2375 !netif_is_l3_master(dev_out)) 2376 return ERR_PTR(-EINVAL); 2377 2378 if (ipv4_is_lbcast(fl4->daddr)) 2379 type = RTN_BROADCAST; 2380 else if (ipv4_is_multicast(fl4->daddr)) 2381 type = RTN_MULTICAST; 2382 else if (ipv4_is_zeronet(fl4->daddr)) 2383 return ERR_PTR(-EINVAL); 2384 2385 if (dev_out->flags & IFF_LOOPBACK) 2386 flags |= RTCF_LOCAL; 2387 2388 do_cache = true; 2389 if (type == RTN_BROADCAST) { 2390 flags |= RTCF_BROADCAST | RTCF_LOCAL; 2391 fi = NULL; 2392 } else if (type == RTN_MULTICAST) { 2393 flags |= RTCF_MULTICAST | RTCF_LOCAL; 2394 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr, 2395 fl4->flowi4_proto)) 2396 flags &= ~RTCF_LOCAL; 2397 else 2398 do_cache = false; 2399 /* If multicast route do not exist use 2400 * default one, but do not gateway in this case. 2401 * Yes, it is hack. 2402 */ 2403 if (fi && res->prefixlen < 4) 2404 fi = NULL; 2405 } else if ((type == RTN_LOCAL) && (orig_oif != 0) && 2406 (orig_oif != dev_out->ifindex)) { 2407 /* For local routes that require a particular output interface 2408 * we do not want to cache the result. Caching the result 2409 * causes incorrect behaviour when there are multiple source 2410 * addresses on the interface, the end result being that if the 2411 * intended recipient is waiting on that interface for the 2412 * packet he won't receive it because it will be delivered on 2413 * the loopback interface and the IP_PKTINFO ipi_ifindex will 2414 * be set to the loopback interface as well. 
2415 */ 2416 do_cache = false; 2417 } 2418 2419 fnhe = NULL; 2420 do_cache &= fi != NULL; 2421 if (fi) { 2422 struct fib_nh_common *nhc = FIB_RES_NHC(*res); 2423 struct rtable __rcu **prth; 2424 2425 fnhe = find_exception(nhc, fl4->daddr); 2426 if (!do_cache) 2427 goto add; 2428 if (fnhe) { 2429 prth = &fnhe->fnhe_rth_output; 2430 } else { 2431 if (unlikely(fl4->flowi4_flags & 2432 FLOWI_FLAG_KNOWN_NH && 2433 !(nhc->nhc_gw_family && 2434 nhc->nhc_scope == RT_SCOPE_LINK))) { 2435 do_cache = false; 2436 goto add; 2437 } 2438 prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output); 2439 } 2440 rth = rcu_dereference(*prth); 2441 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst)) 2442 return rth; 2443 } 2444 2445 add: 2446 rth = rt_dst_alloc(dev_out, flags, type, 2447 IN_DEV_CONF_GET(in_dev, NOPOLICY), 2448 IN_DEV_CONF_GET(in_dev, NOXFRM)); 2449 if (!rth) 2450 return ERR_PTR(-ENOBUFS); 2451 2452 rth->rt_iif = orig_oif; 2453 2454 RT_CACHE_STAT_INC(out_slow_tot); 2455 2456 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) { 2457 if (flags & RTCF_LOCAL && 2458 !(dev_out->flags & IFF_LOOPBACK)) { 2459 rth->dst.output = ip_mc_output; 2460 RT_CACHE_STAT_INC(out_slow_mc); 2461 } 2462 #ifdef CONFIG_IP_MROUTE 2463 if (type == RTN_MULTICAST) { 2464 if (IN_DEV_MFORWARD(in_dev) && 2465 !ipv4_is_local_multicast(fl4->daddr)) { 2466 rth->dst.input = ip_mr_input; 2467 rth->dst.output = ip_mc_output; 2468 } 2469 } 2470 #endif 2471 } 2472 2473 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache); 2474 lwtunnel_set_redirect(&rth->dst); 2475 2476 return rth; 2477 } 2478 2479 /* 2480 * Major route resolver routine. 2481 */ 2482 2483 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4, 2484 const struct sk_buff *skb) 2485 { 2486 __u8 tos = RT_FL_TOS(fl4); 2487 struct fib_result res = { 2488 .type = RTN_UNSPEC, 2489 .fi = NULL, 2490 .table = NULL, 2491 .tclassid = 0, 2492 }; 2493 struct rtable *rth; 2494 2495 fl4->flowi4_iif = LOOPBACK_IFINDEX; 2496 fl4->flowi4_tos = tos & IPTOS_RT_MASK; 2497 fl4->flowi4_scope = ((tos & RTO_ONLINK) ? 2498 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE); 2499 2500 rcu_read_lock(); 2501 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb); 2502 rcu_read_unlock(); 2503 2504 return rth; 2505 } 2506 EXPORT_SYMBOL_GPL(ip_route_output_key_hash); 2507 2508 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4, 2509 struct fib_result *res, 2510 const struct sk_buff *skb) 2511 { 2512 struct net_device *dev_out = NULL; 2513 int orig_oif = fl4->flowi4_oif; 2514 unsigned int flags = 0; 2515 struct rtable *rth; 2516 int err; 2517 2518 if (fl4->saddr) { 2519 if (ipv4_is_multicast(fl4->saddr) || 2520 ipv4_is_lbcast(fl4->saddr) || 2521 ipv4_is_zeronet(fl4->saddr)) { 2522 rth = ERR_PTR(-EINVAL); 2523 goto out; 2524 } 2525 2526 rth = ERR_PTR(-ENETUNREACH); 2527 2528 /* I removed check for oif == dev_out->oif here. 2529 It was wrong for two reasons: 2530 1. ip_dev_find(net, saddr) can return wrong iface, if saddr 2531 is assigned to multiple interfaces. 2532 2. Moreover, we are allowed to send packets with saddr 2533 of another iface. 
--ANK 2534 */ 2535 2536 if (fl4->flowi4_oif == 0 && 2537 (ipv4_is_multicast(fl4->daddr) || 2538 ipv4_is_lbcast(fl4->daddr))) { 2539 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2540 dev_out = __ip_dev_find(net, fl4->saddr, false); 2541 if (!dev_out) 2542 goto out; 2543 2544 /* Special hack: user can direct multicasts 2545 and limited broadcast via necessary interface 2546 without fiddling with IP_MULTICAST_IF or IP_PKTINFO. 2547 This hack is not just for fun, it allows 2548 vic,vat and friends to work. 2549 They bind socket to loopback, set ttl to zero 2550 and expect that it will work. 2551 From the viewpoint of routing cache they are broken, 2552 because we are not allowed to build multicast path 2553 with loopback source addr (look, routing cache 2554 cannot know, that ttl is zero, so that packet 2555 will not leave this host and route is valid). 2556 Luckily, this hack is good workaround. 2557 */ 2558 2559 fl4->flowi4_oif = dev_out->ifindex; 2560 goto make_route; 2561 } 2562 2563 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) { 2564 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ 2565 if (!__ip_dev_find(net, fl4->saddr, false)) 2566 goto out; 2567 } 2568 } 2569 2570 2571 if (fl4->flowi4_oif) { 2572 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif); 2573 rth = ERR_PTR(-ENODEV); 2574 if (!dev_out) 2575 goto out; 2576 2577 /* RACE: Check return value of inet_select_addr instead. */ 2578 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) { 2579 rth = ERR_PTR(-ENETUNREACH); 2580 goto out; 2581 } 2582 if (ipv4_is_local_multicast(fl4->daddr) || 2583 ipv4_is_lbcast(fl4->daddr) || 2584 fl4->flowi4_proto == IPPROTO_IGMP) { 2585 if (!fl4->saddr) 2586 fl4->saddr = inet_select_addr(dev_out, 0, 2587 RT_SCOPE_LINK); 2588 goto make_route; 2589 } 2590 if (!fl4->saddr) { 2591 if (ipv4_is_multicast(fl4->daddr)) 2592 fl4->saddr = inet_select_addr(dev_out, 0, 2593 fl4->flowi4_scope); 2594 else if (!fl4->daddr) 2595 fl4->saddr = inet_select_addr(dev_out, 0, 2596 RT_SCOPE_HOST); 2597 } 2598 } 2599 2600 if (!fl4->daddr) { 2601 fl4->daddr = fl4->saddr; 2602 if (!fl4->daddr) 2603 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK); 2604 dev_out = net->loopback_dev; 2605 fl4->flowi4_oif = LOOPBACK_IFINDEX; 2606 res->type = RTN_LOCAL; 2607 flags |= RTCF_LOCAL; 2608 goto make_route; 2609 } 2610 2611 err = fib_lookup(net, fl4, res, 0); 2612 if (err) { 2613 res->fi = NULL; 2614 res->table = NULL; 2615 if (fl4->flowi4_oif && 2616 (ipv4_is_multicast(fl4->daddr) || 2617 !netif_index_is_l3_master(net, fl4->flowi4_oif))) { 2618 /* Apparently, routing tables are wrong. Assume, 2619 that the destination is on link. 2620 2621 WHY? DW. 2622 Because we are allowed to send to iface 2623 even if it has NO routes and NO assigned 2624 addresses. When oif is specified, routing 2625 tables are looked up with only one purpose: 2626 to catch if destination is gatewayed, rather than 2627 direct. Moreover, if MSG_DONTROUTE is set, 2628 we send packet, ignoring both routing tables 2629 and ifaddr state. --ANK 2630 2631 2632 We could make it even if oif is unknown, 2633 likely IPv6, but we do not. 
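In other words: when the FIB lookup fails but an output interface was specified (and it is not an L3 master device, unless the destination is multicast), the destination is assumed to be on-link, a link-scoped source address is chosen if none was given, and an RTN_UNICAST route towards that interface is built.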
2634 */ 2635 2636 if (fl4->saddr == 0) 2637 fl4->saddr = inet_select_addr(dev_out, 0, 2638 RT_SCOPE_LINK); 2639 res->type = RTN_UNICAST; 2640 goto make_route; 2641 } 2642 rth = ERR_PTR(err); 2643 goto out; 2644 } 2645 2646 if (res->type == RTN_LOCAL) { 2647 if (!fl4->saddr) { 2648 if (res->fi->fib_prefsrc) 2649 fl4->saddr = res->fi->fib_prefsrc; 2650 else 2651 fl4->saddr = fl4->daddr; 2652 } 2653 2654 /* L3 master device is the loopback for that domain */ 2655 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? : 2656 net->loopback_dev; 2657 2658 /* make sure orig_oif points to fib result device even 2659 * though packet rx/tx happens over loopback or l3mdev 2660 */ 2661 orig_oif = FIB_RES_OIF(*res); 2662 2663 fl4->flowi4_oif = dev_out->ifindex; 2664 flags |= RTCF_LOCAL; 2665 goto make_route; 2666 } 2667 2668 fib_select_path(net, res, fl4, skb); 2669 2670 dev_out = FIB_RES_DEV(*res); 2671 fl4->flowi4_oif = dev_out->ifindex; 2672 2673 2674 make_route: 2675 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags); 2676 2677 out: 2678 return rth; 2679 } 2680 2681 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie) 2682 { 2683 return NULL; 2684 } 2685 2686 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst) 2687 { 2688 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); 2689 2690 return mtu ? : dst->dev->mtu; 2691 } 2692 2693 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk, 2694 struct sk_buff *skb, u32 mtu, 2695 bool confirm_neigh) 2696 { 2697 } 2698 2699 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk, 2700 struct sk_buff *skb) 2701 { 2702 } 2703 2704 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst, 2705 unsigned long old) 2706 { 2707 return NULL; 2708 } 2709 2710 static struct dst_ops ipv4_dst_blackhole_ops = { 2711 .family = AF_INET, 2712 .check = ipv4_blackhole_dst_check, 2713 .mtu = ipv4_blackhole_mtu, 2714 .default_advmss = ipv4_default_advmss, 2715 .update_pmtu = ipv4_rt_blackhole_update_pmtu, 2716 .redirect = ipv4_rt_blackhole_redirect, 2717 .cow_metrics = ipv4_rt_blackhole_cow_metrics, 2718 .neigh_lookup = ipv4_neigh_lookup, 2719 }; 2720 2721 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig) 2722 { 2723 struct rtable *ort = (struct rtable *) dst_orig; 2724 struct rtable *rt; 2725 2726 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0); 2727 if (rt) { 2728 struct dst_entry *new = &rt->dst; 2729 2730 new->__use = 1; 2731 new->input = dst_discard; 2732 new->output = dst_discard_out; 2733 2734 new->dev = net->loopback_dev; 2735 if (new->dev) 2736 dev_hold(new->dev); 2737 2738 rt->rt_is_input = ort->rt_is_input; 2739 rt->rt_iif = ort->rt_iif; 2740 rt->rt_pmtu = ort->rt_pmtu; 2741 rt->rt_mtu_locked = ort->rt_mtu_locked; 2742 2743 rt->rt_genid = rt_genid_ipv4(net); 2744 rt->rt_flags = ort->rt_flags; 2745 rt->rt_type = ort->rt_type; 2746 rt->rt_uses_gateway = ort->rt_uses_gateway; 2747 rt->rt_gw_family = ort->rt_gw_family; 2748 if (rt->rt_gw_family == AF_INET) 2749 rt->rt_gw4 = ort->rt_gw4; 2750 else if (rt->rt_gw_family == AF_INET6) 2751 rt->rt_gw6 = ort->rt_gw6; 2752 2753 INIT_LIST_HEAD(&rt->rt_uncached); 2754 } 2755 2756 dst_release(dst_orig); 2757 2758 return rt ? 
&rt->dst : ERR_PTR(-ENOMEM); 2759 } 2760 2761 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4, 2762 const struct sock *sk) 2763 { 2764 struct rtable *rt = __ip_route_output_key(net, flp4); 2765 2766 if (IS_ERR(rt)) 2767 return rt; 2768 2769 if (flp4->flowi4_proto) 2770 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst, 2771 flowi4_to_flowi(flp4), 2772 sk, 0); 2773 2774 return rt; 2775 } 2776 EXPORT_SYMBOL_GPL(ip_route_output_flow); 2777 2778 struct rtable *ip_route_output_tunnel(struct sk_buff *skb, 2779 struct net_device *dev, 2780 struct net *net, __be32 *saddr, 2781 const struct ip_tunnel_info *info, 2782 u8 protocol, bool use_cache) 2783 { 2784 #ifdef CONFIG_DST_CACHE 2785 struct dst_cache *dst_cache; 2786 #endif 2787 struct rtable *rt = NULL; 2788 struct flowi4 fl4; 2789 __u8 tos; 2790 2791 #ifdef CONFIG_DST_CACHE 2792 dst_cache = (struct dst_cache *)&info->dst_cache; 2793 if (use_cache) { 2794 rt = dst_cache_get_ip4(dst_cache, saddr); 2795 if (rt) 2796 return rt; 2797 } 2798 #endif 2799 memset(&fl4, 0, sizeof(fl4)); 2800 fl4.flowi4_mark = skb->mark; 2801 fl4.flowi4_proto = protocol; 2802 fl4.daddr = info->key.u.ipv4.dst; 2803 fl4.saddr = info->key.u.ipv4.src; 2804 tos = info->key.tos; 2805 fl4.flowi4_tos = RT_TOS(tos); 2806 2807 rt = ip_route_output_key(net, &fl4); 2808 if (IS_ERR(rt)) { 2809 netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr); 2810 return ERR_PTR(-ENETUNREACH); 2811 } 2812 if (rt->dst.dev == dev) { /* is this necessary? */ 2813 netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr); 2814 ip_rt_put(rt); 2815 return ERR_PTR(-ELOOP); 2816 } 2817 #ifdef CONFIG_DST_CACHE 2818 if (use_cache) 2819 dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr); 2820 #endif 2821 *saddr = fl4.saddr; 2822 return rt; 2823 } 2824 EXPORT_SYMBOL_GPL(ip_route_output_tunnel); 2825 2826 /* called with rcu_read_lock held */ 2827 static int rt_fill_info(struct net *net, __be32 dst, __be32 src, 2828 struct rtable *rt, u32 table_id, struct flowi4 *fl4, 2829 struct sk_buff *skb, u32 portid, u32 seq, 2830 unsigned int flags) 2831 { 2832 struct rtmsg *r; 2833 struct nlmsghdr *nlh; 2834 unsigned long expires = 0; 2835 u32 error; 2836 u32 metrics[RTAX_MAX]; 2837 2838 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags); 2839 if (!nlh) 2840 return -EMSGSIZE; 2841 2842 r = nlmsg_data(nlh); 2843 r->rtm_family = AF_INET; 2844 r->rtm_dst_len = 32; 2845 r->rtm_src_len = 0; 2846 r->rtm_tos = fl4 ? fl4->flowi4_tos : 0; 2847 r->rtm_table = table_id < 256 ? 
table_id : RT_TABLE_COMPAT; 2848 if (nla_put_u32(skb, RTA_TABLE, table_id)) 2849 goto nla_put_failure; 2850 r->rtm_type = rt->rt_type; 2851 r->rtm_scope = RT_SCOPE_UNIVERSE; 2852 r->rtm_protocol = RTPROT_UNSPEC; 2853 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED; 2854 if (rt->rt_flags & RTCF_NOTIFY) 2855 r->rtm_flags |= RTM_F_NOTIFY; 2856 if (IPCB(skb)->flags & IPSKB_DOREDIRECT) 2857 r->rtm_flags |= RTCF_DOREDIRECT; 2858 2859 if (nla_put_in_addr(skb, RTA_DST, dst)) 2860 goto nla_put_failure; 2861 if (src) { 2862 r->rtm_src_len = 32; 2863 if (nla_put_in_addr(skb, RTA_SRC, src)) 2864 goto nla_put_failure; 2865 } 2866 if (rt->dst.dev && 2867 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex)) 2868 goto nla_put_failure; 2869 #ifdef CONFIG_IP_ROUTE_CLASSID 2870 if (rt->dst.tclassid && 2871 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid)) 2872 goto nla_put_failure; 2873 #endif 2874 if (fl4 && !rt_is_input_route(rt) && 2875 fl4->saddr != src) { 2876 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr)) 2877 goto nla_put_failure; 2878 } 2879 if (rt->rt_uses_gateway) { 2880 if (rt->rt_gw_family == AF_INET && 2881 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) { 2882 goto nla_put_failure; 2883 } else if (rt->rt_gw_family == AF_INET6) { 2884 int alen = sizeof(struct in6_addr); 2885 struct nlattr *nla; 2886 struct rtvia *via; 2887 2888 nla = nla_reserve(skb, RTA_VIA, alen + 2); 2889 if (!nla) 2890 goto nla_put_failure; 2891 2892 via = nla_data(nla); 2893 via->rtvia_family = AF_INET6; 2894 memcpy(via->rtvia_addr, &rt->rt_gw6, alen); 2895 } 2896 } 2897 2898 expires = rt->dst.expires; 2899 if (expires) { 2900 unsigned long now = jiffies; 2901 2902 if (time_before(now, expires)) 2903 expires -= now; 2904 else 2905 expires = 0; 2906 } 2907 2908 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics)); 2909 if (rt->rt_pmtu && expires) 2910 metrics[RTAX_MTU - 1] = rt->rt_pmtu; 2911 if (rt->rt_mtu_locked && expires) 2912 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU); 2913 if (rtnetlink_put_metrics(skb, metrics) < 0) 2914 goto nla_put_failure; 2915 2916 if (fl4) { 2917 if (fl4->flowi4_mark && 2918 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark)) 2919 goto nla_put_failure; 2920 2921 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) && 2922 nla_put_u32(skb, RTA_UID, 2923 from_kuid_munged(current_user_ns(), 2924 fl4->flowi4_uid))) 2925 goto nla_put_failure; 2926 2927 if (rt_is_input_route(rt)) { 2928 #ifdef CONFIG_IP_MROUTE 2929 if (ipv4_is_multicast(dst) && 2930 !ipv4_is_local_multicast(dst) && 2931 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { 2932 int err = ipmr_get_route(net, skb, 2933 fl4->saddr, fl4->daddr, 2934 r, portid); 2935 2936 if (err <= 0) { 2937 if (err == 0) 2938 return 0; 2939 goto nla_put_failure; 2940 } 2941 } else 2942 #endif 2943 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif)) 2944 goto nla_put_failure; 2945 } 2946 } 2947 2948 error = rt->dst.error; 2949 2950 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0) 2951 goto nla_put_failure; 2952 2953 nlmsg_end(skb, nlh); 2954 return 0; 2955 2956 nla_put_failure: 2957 nlmsg_cancel(skb, nlh); 2958 return -EMSGSIZE; 2959 } 2960 2961 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb, 2962 struct netlink_callback *cb, u32 table_id, 2963 struct fnhe_hash_bucket *bucket, int genid, 2964 int *fa_index, int fa_start, unsigned int flags) 2965 { 2966 int i; 2967 2968 for (i = 0; i < FNHE_HASH_SIZE; i++) { 2969 struct fib_nh_exception *fnhe; 2970 2971 for (fnhe = rcu_dereference(bucket[i].chain); fnhe; 2972 fnhe = rcu_dereference(fnhe->fnhe_next)) { 
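/* Dump each live exception on this chain: skip entries below the requested start index, entries from a stale genid and entries that have already expired, then report whichever of the cached input/output routes exists via rt_fill_info(). */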
2973 struct rtable *rt; 2974 int err; 2975 2976 if (*fa_index < fa_start) 2977 goto next; 2978 2979 if (fnhe->fnhe_genid != genid) 2980 goto next; 2981 2982 if (fnhe->fnhe_expires && 2983 time_after(jiffies, fnhe->fnhe_expires)) 2984 goto next; 2985 2986 rt = rcu_dereference(fnhe->fnhe_rth_input); 2987 if (!rt) 2988 rt = rcu_dereference(fnhe->fnhe_rth_output); 2989 if (!rt) 2990 goto next; 2991 2992 err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt, 2993 table_id, NULL, skb, 2994 NETLINK_CB(cb->skb).portid, 2995 cb->nlh->nlmsg_seq, flags); 2996 if (err) 2997 return err; 2998 next: 2999 (*fa_index)++; 3000 } 3001 } 3002 3003 return 0; 3004 } 3005 3006 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb, 3007 u32 table_id, struct fib_info *fi, 3008 int *fa_index, int fa_start, unsigned int flags) 3009 { 3010 struct net *net = sock_net(cb->skb->sk); 3011 int nhsel, genid = fnhe_genid(net); 3012 3013 for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) { 3014 struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel); 3015 struct fnhe_hash_bucket *bucket; 3016 int err; 3017 3018 if (nhc->nhc_flags & RTNH_F_DEAD) 3019 continue; 3020 3021 rcu_read_lock(); 3022 bucket = rcu_dereference(nhc->nhc_exceptions); 3023 err = 0; 3024 if (bucket) 3025 err = fnhe_dump_bucket(net, skb, cb, table_id, bucket, 3026 genid, fa_index, fa_start, 3027 flags); 3028 rcu_read_unlock(); 3029 if (err) 3030 return err; 3031 } 3032 3033 return 0; 3034 } 3035 3036 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst, 3037 u8 ip_proto, __be16 sport, 3038 __be16 dport) 3039 { 3040 struct sk_buff *skb; 3041 struct iphdr *iph; 3042 3043 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL); 3044 if (!skb) 3045 return NULL; 3046 3047 /* Reserve room for dummy headers, this skb can pass 3048 * through good chunk of routing engine. 
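* The synthetic IPv4 header and the minimal UDP/TCP/ICMP stub built below let the route lookup see the protocol and port fields a real packet would carry, which matters e.g. for the L4 multipath hash and for FIB rules that dissect the flow.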
3049 */ 3050 skb_reset_mac_header(skb); 3051 skb_reset_network_header(skb); 3052 skb->protocol = htons(ETH_P_IP); 3053 iph = skb_put(skb, sizeof(struct iphdr)); 3054 iph->protocol = ip_proto; 3055 iph->saddr = src; 3056 iph->daddr = dst; 3057 iph->version = 0x4; 3058 iph->frag_off = 0; 3059 iph->ihl = 0x5; 3060 skb_set_transport_header(skb, skb->len); 3061 3062 switch (iph->protocol) { 3063 case IPPROTO_UDP: { 3064 struct udphdr *udph; 3065 3066 udph = skb_put_zero(skb, sizeof(struct udphdr)); 3067 udph->source = sport; 3068 udph->dest = dport; 3069 udph->len = sizeof(struct udphdr); 3070 udph->check = 0; 3071 break; 3072 } 3073 case IPPROTO_TCP: { 3074 struct tcphdr *tcph; 3075 3076 tcph = skb_put_zero(skb, sizeof(struct tcphdr)); 3077 tcph->source = sport; 3078 tcph->dest = dport; 3079 tcph->doff = sizeof(struct tcphdr) / 4; 3080 tcph->rst = 1; 3081 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr), 3082 src, dst, 0); 3083 break; 3084 } 3085 case IPPROTO_ICMP: { 3086 struct icmphdr *icmph; 3087 3088 icmph = skb_put_zero(skb, sizeof(struct icmphdr)); 3089 icmph->type = ICMP_ECHO; 3090 icmph->code = 0; 3091 } 3092 } 3093 3094 return skb; 3095 } 3096 3097 static int inet_rtm_valid_getroute_req(struct sk_buff *skb, 3098 const struct nlmsghdr *nlh, 3099 struct nlattr **tb, 3100 struct netlink_ext_ack *extack) 3101 { 3102 struct rtmsg *rtm; 3103 int i, err; 3104 3105 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) { 3106 NL_SET_ERR_MSG(extack, 3107 "ipv4: Invalid header for route get request"); 3108 return -EINVAL; 3109 } 3110 3111 if (!netlink_strict_get_check(skb)) 3112 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX, 3113 rtm_ipv4_policy, extack); 3114 3115 rtm = nlmsg_data(nlh); 3116 if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) || 3117 (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) || 3118 rtm->rtm_table || rtm->rtm_protocol || 3119 rtm->rtm_scope || rtm->rtm_type) { 3120 NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request"); 3121 return -EINVAL; 3122 } 3123 3124 if (rtm->rtm_flags & ~(RTM_F_NOTIFY | 3125 RTM_F_LOOKUP_TABLE | 3126 RTM_F_FIB_MATCH)) { 3127 NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request"); 3128 return -EINVAL; 3129 } 3130 3131 err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX, 3132 rtm_ipv4_policy, extack); 3133 if (err) 3134 return err; 3135 3136 if ((tb[RTA_SRC] && !rtm->rtm_src_len) || 3137 (tb[RTA_DST] && !rtm->rtm_dst_len)) { 3138 NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4"); 3139 return -EINVAL; 3140 } 3141 3142 for (i = 0; i <= RTA_MAX; i++) { 3143 if (!tb[i]) 3144 continue; 3145 3146 switch (i) { 3147 case RTA_IIF: 3148 case RTA_OIF: 3149 case RTA_SRC: 3150 case RTA_DST: 3151 case RTA_IP_PROTO: 3152 case RTA_SPORT: 3153 case RTA_DPORT: 3154 case RTA_MARK: 3155 case RTA_UID: 3156 break; 3157 default: 3158 NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request"); 3159 return -EINVAL; 3160 } 3161 } 3162 3163 return 0; 3164 } 3165 3166 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh, 3167 struct netlink_ext_ack *extack) 3168 { 3169 struct net *net = sock_net(in_skb->sk); 3170 struct nlattr *tb[RTA_MAX+1]; 3171 u32 table_id = RT_TABLE_MAIN; 3172 __be16 sport = 0, dport = 0; 3173 struct fib_result res = {}; 3174 u8 ip_proto = IPPROTO_UDP; 3175 struct rtable *rt = NULL; 3176 struct sk_buff *skb; 3177 struct rtmsg *rtm; 3178 struct flowi4 fl4 = {}; 3179 __be32 dst = 0; 3180 __be32 src = 0; 3181 kuid_t uid; 3182 
u32 iif; 3183 int err; 3184 int mark; 3185 3186 err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack); 3187 if (err < 0) 3188 return err; 3189 3190 rtm = nlmsg_data(nlh); 3191 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0; 3192 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0; 3193 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0; 3194 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0; 3195 if (tb[RTA_UID]) 3196 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID])); 3197 else 3198 uid = (iif ? INVALID_UID : current_uid()); 3199 3200 if (tb[RTA_IP_PROTO]) { 3201 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO], 3202 &ip_proto, AF_INET, extack); 3203 if (err) 3204 return err; 3205 } 3206 3207 if (tb[RTA_SPORT]) 3208 sport = nla_get_be16(tb[RTA_SPORT]); 3209 3210 if (tb[RTA_DPORT]) 3211 dport = nla_get_be16(tb[RTA_DPORT]); 3212 3213 skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport); 3214 if (!skb) 3215 return -ENOBUFS; 3216 3217 fl4.daddr = dst; 3218 fl4.saddr = src; 3219 fl4.flowi4_tos = rtm->rtm_tos; 3220 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0; 3221 fl4.flowi4_mark = mark; 3222 fl4.flowi4_uid = uid; 3223 if (sport) 3224 fl4.fl4_sport = sport; 3225 if (dport) 3226 fl4.fl4_dport = dport; 3227 fl4.flowi4_proto = ip_proto; 3228 3229 rcu_read_lock(); 3230 3231 if (iif) { 3232 struct net_device *dev; 3233 3234 dev = dev_get_by_index_rcu(net, iif); 3235 if (!dev) { 3236 err = -ENODEV; 3237 goto errout_rcu; 3238 } 3239 3240 fl4.flowi4_iif = iif; /* for rt_fill_info */ 3241 skb->dev = dev; 3242 skb->mark = mark; 3243 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos, 3244 dev, &res); 3245 3246 rt = skb_rtable(skb); 3247 if (err == 0 && rt->dst.error) 3248 err = -rt->dst.error; 3249 } else { 3250 fl4.flowi4_iif = LOOPBACK_IFINDEX; 3251 skb->dev = net->loopback_dev; 3252 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb); 3253 err = 0; 3254 if (IS_ERR(rt)) 3255 err = PTR_ERR(rt); 3256 else 3257 skb_dst_set(skb, &rt->dst); 3258 } 3259 3260 if (err) 3261 goto errout_rcu; 3262 3263 if (rtm->rtm_flags & RTM_F_NOTIFY) 3264 rt->rt_flags |= RTCF_NOTIFY; 3265 3266 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE) 3267 table_id = res.table ? 
res.table->tb_id : 0; 3268 3269 /* reset skb for netlink reply msg */ 3270 skb_trim(skb, 0); 3271 skb_reset_network_header(skb); 3272 skb_reset_transport_header(skb); 3273 skb_reset_mac_header(skb); 3274 3275 if (rtm->rtm_flags & RTM_F_FIB_MATCH) { 3276 struct fib_rt_info fri; 3277 3278 if (!res.fi) { 3279 err = fib_props[res.type].error; 3280 if (!err) 3281 err = -EHOSTUNREACH; 3282 goto errout_rcu; 3283 } 3284 fri.fi = res.fi; 3285 fri.tb_id = table_id; 3286 fri.dst = res.prefix; 3287 fri.dst_len = res.prefixlen; 3288 fri.tos = fl4.flowi4_tos; 3289 fri.type = rt->rt_type; 3290 fri.offload = 0; 3291 fri.trap = 0; 3292 if (res.fa_head) { 3293 struct fib_alias *fa; 3294 3295 hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) { 3296 u8 slen = 32 - fri.dst_len; 3297 3298 if (fa->fa_slen == slen && 3299 fa->tb_id == fri.tb_id && 3300 fa->fa_tos == fri.tos && 3301 fa->fa_info == res.fi && 3302 fa->fa_type == fri.type) { 3303 fri.offload = fa->offload; 3304 fri.trap = fa->trap; 3305 break; 3306 } 3307 } 3308 } 3309 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid, 3310 nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0); 3311 } else { 3312 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb, 3313 NETLINK_CB(in_skb).portid, 3314 nlh->nlmsg_seq, 0); 3315 } 3316 if (err < 0) 3317 goto errout_rcu; 3318 3319 rcu_read_unlock(); 3320 3321 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid); 3322 3323 errout_free: 3324 return err; 3325 errout_rcu: 3326 rcu_read_unlock(); 3327 kfree_skb(skb); 3328 goto errout_free; 3329 } 3330 3331 void ip_rt_multicast_event(struct in_device *in_dev) 3332 { 3333 rt_cache_flush(dev_net(in_dev->dev)); 3334 } 3335 3336 #ifdef CONFIG_SYSCTL 3337 static int ip_rt_gc_interval __read_mostly = 60 * HZ; 3338 static int ip_rt_gc_min_interval __read_mostly = HZ / 2; 3339 static int ip_rt_gc_elasticity __read_mostly = 8; 3340 static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU; 3341 3342 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write, 3343 void *buffer, size_t *lenp, loff_t *ppos) 3344 { 3345 struct net *net = (struct net *)__ctl->extra1; 3346 3347 if (write) { 3348 rt_cache_flush(net); 3349 fnhe_genid_bump(net); 3350 return 0; 3351 } 3352 3353 return -EINVAL; 3354 } 3355 3356 static struct ctl_table ipv4_route_table[] = { 3357 { 3358 .procname = "gc_thresh", 3359 .data = &ipv4_dst_ops.gc_thresh, 3360 .maxlen = sizeof(int), 3361 .mode = 0644, 3362 .proc_handler = proc_dointvec, 3363 }, 3364 { 3365 .procname = "max_size", 3366 .data = &ip_rt_max_size, 3367 .maxlen = sizeof(int), 3368 .mode = 0644, 3369 .proc_handler = proc_dointvec, 3370 }, 3371 { 3372 /* Deprecated. 
Use gc_min_interval_ms */ 3373 3374 .procname = "gc_min_interval", 3375 .data = &ip_rt_gc_min_interval, 3376 .maxlen = sizeof(int), 3377 .mode = 0644, 3378 .proc_handler = proc_dointvec_jiffies, 3379 }, 3380 { 3381 .procname = "gc_min_interval_ms", 3382 .data = &ip_rt_gc_min_interval, 3383 .maxlen = sizeof(int), 3384 .mode = 0644, 3385 .proc_handler = proc_dointvec_ms_jiffies, 3386 }, 3387 { 3388 .procname = "gc_timeout", 3389 .data = &ip_rt_gc_timeout, 3390 .maxlen = sizeof(int), 3391 .mode = 0644, 3392 .proc_handler = proc_dointvec_jiffies, 3393 }, 3394 { 3395 .procname = "gc_interval", 3396 .data = &ip_rt_gc_interval, 3397 .maxlen = sizeof(int), 3398 .mode = 0644, 3399 .proc_handler = proc_dointvec_jiffies, 3400 }, 3401 { 3402 .procname = "redirect_load", 3403 .data = &ip_rt_redirect_load, 3404 .maxlen = sizeof(int), 3405 .mode = 0644, 3406 .proc_handler = proc_dointvec, 3407 }, 3408 { 3409 .procname = "redirect_number", 3410 .data = &ip_rt_redirect_number, 3411 .maxlen = sizeof(int), 3412 .mode = 0644, 3413 .proc_handler = proc_dointvec, 3414 }, 3415 { 3416 .procname = "redirect_silence", 3417 .data = &ip_rt_redirect_silence, 3418 .maxlen = sizeof(int), 3419 .mode = 0644, 3420 .proc_handler = proc_dointvec, 3421 }, 3422 { 3423 .procname = "error_cost", 3424 .data = &ip_rt_error_cost, 3425 .maxlen = sizeof(int), 3426 .mode = 0644, 3427 .proc_handler = proc_dointvec, 3428 }, 3429 { 3430 .procname = "error_burst", 3431 .data = &ip_rt_error_burst, 3432 .maxlen = sizeof(int), 3433 .mode = 0644, 3434 .proc_handler = proc_dointvec, 3435 }, 3436 { 3437 .procname = "gc_elasticity", 3438 .data = &ip_rt_gc_elasticity, 3439 .maxlen = sizeof(int), 3440 .mode = 0644, 3441 .proc_handler = proc_dointvec, 3442 }, 3443 { 3444 .procname = "mtu_expires", 3445 .data = &ip_rt_mtu_expires, 3446 .maxlen = sizeof(int), 3447 .mode = 0644, 3448 .proc_handler = proc_dointvec_jiffies, 3449 }, 3450 { 3451 .procname = "min_pmtu", 3452 .data = &ip_rt_min_pmtu, 3453 .maxlen = sizeof(int), 3454 .mode = 0644, 3455 .proc_handler = proc_dointvec_minmax, 3456 .extra1 = &ip_min_valid_pmtu, 3457 }, 3458 { 3459 .procname = "min_adv_mss", 3460 .data = &ip_rt_min_advmss, 3461 .maxlen = sizeof(int), 3462 .mode = 0644, 3463 .proc_handler = proc_dointvec, 3464 }, 3465 { } 3466 }; 3467 3468 static const char ipv4_route_flush_procname[] = "flush"; 3469 3470 static struct ctl_table ipv4_route_flush_table[] = { 3471 { 3472 .procname = ipv4_route_flush_procname, 3473 .maxlen = sizeof(int), 3474 .mode = 0200, 3475 .proc_handler = ipv4_sysctl_rtcache_flush, 3476 }, 3477 { }, 3478 }; 3479 3480 static __net_init int sysctl_route_net_init(struct net *net) 3481 { 3482 struct ctl_table *tbl; 3483 3484 tbl = ipv4_route_flush_table; 3485 if (!net_eq(net, &init_net)) { 3486 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL); 3487 if (!tbl) 3488 goto err_dup; 3489 3490 /* Don't export non-whitelisted sysctls to unprivileged users */ 3491 if (net->user_ns != &init_user_ns) { 3492 if (tbl[0].procname != ipv4_route_flush_procname) 3493 tbl[0].procname = NULL; 3494 } 3495 } 3496 tbl[0].extra1 = net; 3497 3498 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl); 3499 if (!net->ipv4.route_hdr) 3500 goto err_reg; 3501 return 0; 3502 3503 err_reg: 3504 if (tbl != ipv4_route_flush_table) 3505 kfree(tbl); 3506 err_dup: 3507 return -ENOMEM; 3508 } 3509 3510 static __net_exit void sysctl_route_net_exit(struct net *net) 3511 { 3512 struct ctl_table *tbl; 3513 3514 tbl = net->ipv4.route_hdr->ctl_table_arg; 3515 
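/* Unregister and free only the per-netns copy made by sysctl_route_net_init(); the static template table must never be freed (hence the BUG_ON below). */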
unregister_net_sysctl_table(net->ipv4.route_hdr); 3516 BUG_ON(tbl == ipv4_route_flush_table); 3517 kfree(tbl); 3518 } 3519 3520 static __net_initdata struct pernet_operations sysctl_route_ops = { 3521 .init = sysctl_route_net_init, 3522 .exit = sysctl_route_net_exit, 3523 }; 3524 #endif 3525 3526 static __net_init int rt_genid_init(struct net *net) 3527 { 3528 atomic_set(&net->ipv4.rt_genid, 0); 3529 atomic_set(&net->fnhe_genid, 0); 3530 atomic_set(&net->ipv4.dev_addr_genid, get_random_int()); 3531 return 0; 3532 } 3533 3534 static __net_initdata struct pernet_operations rt_genid_ops = { 3535 .init = rt_genid_init, 3536 }; 3537 3538 static int __net_init ipv4_inetpeer_init(struct net *net) 3539 { 3540 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL); 3541 3542 if (!bp) 3543 return -ENOMEM; 3544 inet_peer_base_init(bp); 3545 net->ipv4.peers = bp; 3546 return 0; 3547 } 3548 3549 static void __net_exit ipv4_inetpeer_exit(struct net *net) 3550 { 3551 struct inet_peer_base *bp = net->ipv4.peers; 3552 3553 net->ipv4.peers = NULL; 3554 inetpeer_invalidate_tree(bp); 3555 kfree(bp); 3556 } 3557 3558 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = { 3559 .init = ipv4_inetpeer_init, 3560 .exit = ipv4_inetpeer_exit, 3561 }; 3562 3563 #ifdef CONFIG_IP_ROUTE_CLASSID 3564 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly; 3565 #endif /* CONFIG_IP_ROUTE_CLASSID */ 3566 3567 int __init ip_rt_init(void) 3568 { 3569 int cpu; 3570 3571 ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents), 3572 GFP_KERNEL); 3573 if (!ip_idents) 3574 panic("IP: failed to allocate ip_idents\n"); 3575 3576 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents)); 3577 3578 ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL); 3579 if (!ip_tstamps) 3580 panic("IP: failed to allocate ip_tstamps\n"); 3581 3582 for_each_possible_cpu(cpu) { 3583 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu); 3584 3585 INIT_LIST_HEAD(&ul->head); 3586 spin_lock_init(&ul->lock); 3587 } 3588 #ifdef CONFIG_IP_ROUTE_CLASSID 3589 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); 3590 if (!ip_rt_acct) 3591 panic("IP: failed to allocate ip_rt_acct\n"); 3592 #endif 3593 3594 ipv4_dst_ops.kmem_cachep = 3595 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0, 3596 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); 3597 3598 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; 3599 3600 if (dst_entries_init(&ipv4_dst_ops) < 0) 3601 panic("IP: failed to allocate ipv4_dst_ops counter\n"); 3602 3603 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) 3604 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); 3605 3606 ipv4_dst_ops.gc_thresh = ~0; 3607 ip_rt_max_size = INT_MAX; 3608 3609 devinet_init(); 3610 ip_fib_init(); 3611 3612 if (ip_rt_proc_init()) 3613 pr_err("Unable to create route proc files\n"); 3614 #ifdef CONFIG_XFRM 3615 xfrm_init(); 3616 xfrm4_init(); 3617 #endif 3618 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, 3619 RTNL_FLAG_DOIT_UNLOCKED); 3620 3621 #ifdef CONFIG_SYSCTL 3622 register_pernet_subsys(&sysctl_route_ops); 3623 #endif 3624 register_pernet_subsys(&rt_genid_ops); 3625 register_pernet_subsys(&ipv4_inetpeer_ops); 3626 return 0; 3627 } 3628 3629 #ifdef CONFIG_SYSCTL 3630 /* 3631 * We really need to sanitize the damn ipv4 init order, then all 3632 * this nonsense will go away. 
3633 */ 3634 void __init ip_static_sysctl_init(void) 3635 { 3636 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table); 3637 } 3638 #endif 3639
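/*
 * Example: the write-only "flush" entry registered above is normally poked
 * from userspace; writing any value invokes ipv4_sysctl_rtcache_flush(),
 * which flushes the routing cache and bumps the fnhe genid for the writer's
 * network namespace. A minimal userspace sketch (not part of this file,
 * assuming the conventional /proc/sys mount point) is kept under "#if 0"
 * below so it is never built with the kernel.
 */
#if 0
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* Equivalent to: echo 1 > /proc/sys/net/ipv4/route/flush */
	int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (write(fd, "1", 1) < 0) {
		perror("write");
		close(fd);
		return 1;
	}
	close(fd);
	return 0;
}
#endif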