1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * IPv6 output functions 4 * Linux INET6 implementation 5 * 6 * Authors: 7 * Pedro Roque <roque@di.fc.ul.pt> 8 * 9 * Based on linux/net/ipv4/ip_output.c 10 * 11 * Changes: 12 * A.N.Kuznetsov : airthmetics in fragmentation. 13 * extension headers are implemented. 14 * route changes now work. 15 * ip6_forward does not confuse sniffers. 16 * etc. 17 * 18 * H. von Brand : Added missing #include <linux/string.h> 19 * Imran Patel : frag id should be in NBO 20 * Kazunori MIYAZAWA @USAGI 21 * : add ip6_append_data and related functions 22 * for datagram xmit 23 */ 24 25 #include <linux/errno.h> 26 #include <linux/kernel.h> 27 #include <linux/string.h> 28 #include <linux/socket.h> 29 #include <linux/net.h> 30 #include <linux/netdevice.h> 31 #include <linux/if_arp.h> 32 #include <linux/in6.h> 33 #include <linux/tcp.h> 34 #include <linux/route.h> 35 #include <linux/module.h> 36 #include <linux/slab.h> 37 38 #include <linux/bpf-cgroup.h> 39 #include <linux/netfilter.h> 40 #include <linux/netfilter_ipv6.h> 41 42 #include <net/sock.h> 43 #include <net/snmp.h> 44 45 #include <net/gso.h> 46 #include <net/ipv6.h> 47 #include <net/ndisc.h> 48 #include <net/protocol.h> 49 #include <net/ip6_route.h> 50 #include <net/addrconf.h> 51 #include <net/rawv6.h> 52 #include <net/icmp.h> 53 #include <net/xfrm.h> 54 #include <net/checksum.h> 55 #include <linux/mroute6.h> 56 #include <net/l3mdev.h> 57 #include <net/lwtunnel.h> 58 #include <net/ip_tunnels.h> 59 60 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) 61 { 62 struct dst_entry *dst = skb_dst(skb); 63 struct net_device *dev = dst_dev_rcu(dst); 64 struct inet6_dev *idev = ip6_dst_idev(dst); 65 unsigned int hh_len = LL_RESERVED_SPACE(dev); 66 const struct in6_addr *daddr, *nexthop; 67 struct ipv6hdr *hdr; 68 struct neighbour *neigh; 69 int ret; 70 71 /* Be paranoid, rather than too clever. */ 72 if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) { 73 /* idev stays alive because we hold rcu_read_lock(). */ 74 skb = skb_expand_head(skb, hh_len); 75 if (!skb) { 76 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); 77 return -ENOMEM; 78 } 79 } 80 81 hdr = ipv6_hdr(skb); 82 daddr = &hdr->daddr; 83 if (unlikely(ipv6_addr_is_multicast(daddr))) { 84 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) && 85 ((mroute6_is_socket(net, skb) && 86 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) || 87 ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) { 88 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); 89 90 /* Do not check for IFF_ALLMULTI; multicast routing 91 is not supported in any case. 92 */ 93 if (newskb) 94 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, 95 net, sk, newskb, NULL, newskb->dev, 96 dev_loopback_xmit); 97 98 if (hdr->hop_limit == 0) { 99 IP6_INC_STATS(net, idev, 100 IPSTATS_MIB_OUTDISCARDS); 101 kfree_skb(skb); 102 return 0; 103 } 104 } 105 106 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len); 107 if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL && 108 !(dev->flags & IFF_LOOPBACK)) { 109 kfree_skb(skb); 110 return 0; 111 } 112 } 113 114 if (lwtunnel_xmit_redirect(dst->lwtstate)) { 115 int res = lwtunnel_xmit(skb); 116 117 if (res != LWTUNNEL_XMIT_CONTINUE) 118 return res; 119 } 120 121 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); 122 123 nexthop = rt6_nexthop(dst_rt6_info(dst), daddr); 124 neigh = __ipv6_neigh_lookup_noref(dev, nexthop); 125 126 if (IS_ERR_OR_NULL(neigh)) { 127 if (unlikely(!neigh)) 128 neigh = __neigh_create(&nd_tbl, nexthop, dev, false); 129 if (IS_ERR(neigh)) { 130 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES); 131 kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL); 132 return -EINVAL; 133 } 134 } 135 sock_confirm_neigh(skb, neigh); 136 ret = neigh_output(neigh, skb, false); 137 return ret; 138 } 139 140 static int 141 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk, 142 struct sk_buff *skb, unsigned int mtu) 143 { 144 struct sk_buff *segs, *nskb; 145 netdev_features_t features; 146 int ret = 0; 147 148 /* Please see corresponding comment in ip_finish_output_gso 149 * describing the cases where GSO segment length exceeds the 150 * egress MTU. 151 */ 152 features = netif_skb_features(skb); 153 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); 154 if (IS_ERR_OR_NULL(segs)) { 155 kfree_skb(skb); 156 return -ENOMEM; 157 } 158 159 consume_skb(skb); 160 161 skb_list_walk_safe(segs, segs, nskb) { 162 int err; 163 164 skb_mark_not_on_list(segs); 165 /* Last GSO segment can be smaller than gso_size (and MTU). 166 * Adding a fragment header would produce an "atomic fragment", 167 * which is considered harmful (RFC-8021). Avoid that. 168 */ 169 err = segs->len > mtu ? 170 ip6_fragment(net, sk, segs, ip6_finish_output2) : 171 ip6_finish_output2(net, sk, segs); 172 if (err && ret == 0) 173 ret = err; 174 } 175 176 return ret; 177 } 178 179 static int ip6_finish_output_gso(struct net *net, struct sock *sk, 180 struct sk_buff *skb, unsigned int mtu) 181 { 182 if (unlikely(!skb_gso_validate_network_len(skb, mtu))) 183 return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu); 184 185 return ip6_finish_output2(net, sk, skb); 186 } 187 188 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) 189 { 190 unsigned int mtu; 191 192 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) 193 /* Policy lookup after SNAT yielded a new policy */ 194 if (skb_dst(skb)->xfrm) { 195 IP6CB(skb)->flags |= IP6SKB_REROUTED; 196 return dst_output(net, sk, skb); 197 } 198 #endif 199 200 mtu = ip6_skb_dst_mtu(skb); 201 if (skb_is_gso(skb)) 202 return ip6_finish_output_gso(net, sk, skb, mtu); 203 204 if (unlikely(skb->len > mtu || 205 (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))) 206 return ip6_fragment(net, sk, skb, ip6_finish_output2); 207 208 return ip6_finish_output2(net, sk, skb); 209 } 210 211 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) 212 { 213 int ret; 214 215 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); 216 switch (ret) { 217 case NET_XMIT_SUCCESS: 218 case NET_XMIT_CN: 219 return __ip6_finish_output(net, sk, skb) ? : ret; 220 default: 221 kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS); 222 return ret; 223 } 224 } 225 226 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) 227 { 228 struct dst_entry *dst = skb_dst(skb); 229 struct net_device *dev, *indev = skb->dev; 230 struct inet6_dev *idev; 231 int ret; 232 233 skb->protocol = htons(ETH_P_IPV6); 234 rcu_read_lock(); 235 dev = dst_dev_rcu(dst); 236 idev = ip6_dst_idev(dst); 237 skb->dev = dev; 238 239 if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) { 240 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); 241 rcu_read_unlock(); 242 kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED); 243 return 0; 244 } 245 246 ret = NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, 247 net, sk, skb, indev, dev, 248 ip6_finish_output, 249 !(IP6CB(skb)->flags & IP6SKB_REROUTED)); 250 rcu_read_unlock(); 251 return ret; 252 } 253 EXPORT_SYMBOL(ip6_output); 254 255 bool ip6_autoflowlabel(struct net *net, const struct sock *sk) 256 { 257 if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk)) 258 return ip6_default_np_autolabel(net); 259 return inet6_test_bit(AUTOFLOWLABEL, sk); 260 } 261 262 int ip6_dst_hoplimit(struct dst_entry *dst) 263 { 264 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); 265 266 rcu_read_lock(); 267 if (hoplimit == 0) { 268 struct net_device *dev = dst_dev_rcu(dst); 269 struct inet6_dev *idev; 270 271 idev = __in6_dev_get(dev); 272 if (idev) 273 hoplimit = READ_ONCE(idev->cnf.hop_limit); 274 else 275 hoplimit = READ_ONCE(dev_net(dev)->ipv6.devconf_all->hop_limit); 276 } 277 rcu_read_unlock(); 278 279 return hoplimit; 280 } 281 EXPORT_SYMBOL(ip6_dst_hoplimit); 282 283 /* 284 * xmit an sk_buff (used by TCP and SCTP) 285 * Note : socket lock is not held for SYNACK packets, but might be modified 286 * by calls to skb_set_owner_w() and ipv6_local_error(), 287 * which are using proper atomic operations or spinlocks. 288 */ 289 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, 290 __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority) 291 { 292 const struct ipv6_pinfo *np = inet6_sk(sk); 293 struct in6_addr *first_hop = &fl6->daddr; 294 struct dst_entry *dst = skb_dst(skb); 295 struct inet6_dev *idev = ip6_dst_idev(dst); 296 struct net *net = sock_net(sk); 297 unsigned int head_room; 298 struct net_device *dev; 299 struct ipv6hdr *hdr; 300 u8 proto = fl6->flowi6_proto; 301 int seg_len = skb->len; 302 int ret, hlimit = -1; 303 u32 mtu; 304 305 rcu_read_lock(); 306 307 dev = dst_dev_rcu(dst); 308 head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev); 309 if (opt) 310 head_room += opt->opt_nflen + opt->opt_flen; 311 312 if (unlikely(head_room > skb_headroom(skb))) { 313 /* idev stays alive while we hold rcu_read_lock(). */ 314 skb = skb_expand_head(skb, head_room); 315 if (!skb) { 316 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); 317 ret = -ENOBUFS; 318 goto unlock; 319 } 320 } 321 322 if (unlikely(opt)) { 323 seg_len += opt->opt_nflen + opt->opt_flen; 324 325 if (opt->opt_flen) 326 proto = ipv6_push_frag_opts(skb, opt, proto); 327 328 if (opt->opt_nflen) 329 proto = ipv6_push_nfrag_opts(skb, opt, proto, 330 &first_hop, 331 &fl6->saddr); 332 } 333 334 if (unlikely(seg_len > IPV6_MAXPLEN)) 335 seg_len = 0; 336 337 __skb_push(skb, sizeof(struct ipv6hdr)); 338 skb_reset_network_header(skb); 339 hdr = ipv6_hdr(skb); 340 341 /* 342 * Fill in the IPv6 header 343 */ 344 if (np) 345 hlimit = READ_ONCE(np->hop_limit); 346 if (hlimit < 0) 347 hlimit = ip6_dst_hoplimit(dst); 348 349 ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, 350 ip6_autoflowlabel(net, sk), fl6)); 351 352 hdr->payload_len = htons(seg_len); 353 hdr->nexthdr = proto; 354 hdr->hop_limit = hlimit; 355 356 hdr->saddr = fl6->saddr; 357 hdr->daddr = *first_hop; 358 359 skb->protocol = htons(ETH_P_IPV6); 360 skb->priority = priority; 361 skb->mark = mark; 362 363 mtu = dst6_mtu(dst); 364 if (likely((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb))) { 365 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); 366 367 /* if egress device is enslaved to an L3 master device pass the 368 * skb to its handler for processing 369 */ 370 skb = l3mdev_ip6_out((struct sock *)sk, skb); 371 if (unlikely(!skb)) { 372 ret = 0; 373 goto unlock; 374 } 375 376 /* hooks should never assume socket lock is held. 377 * we promote our socket to non const 378 */ 379 ret = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, 380 net, (struct sock *)sk, skb, NULL, dev, 381 dst_output); 382 goto unlock; 383 } 384 385 ret = -EMSGSIZE; 386 skb->dev = dev; 387 /* ipv6_local_error() does not require socket lock, 388 * we promote our socket to non const 389 */ 390 ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu); 391 392 IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS); 393 kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); 394 unlock: 395 rcu_read_unlock(); 396 return ret; 397 } 398 EXPORT_SYMBOL(ip6_xmit); 399 400 static int ip6_call_ra_chain(struct sk_buff *skb, int sel) 401 { 402 struct ip6_ra_chain *ra; 403 struct sock *last = NULL; 404 405 read_lock(&ip6_ra_lock); 406 for (ra = ip6_ra_chain; ra; ra = ra->next) { 407 struct sock *sk = ra->sk; 408 if (sk && ra->sel == sel && 409 (!sk->sk_bound_dev_if || 410 sk->sk_bound_dev_if == skb->dev->ifindex)) { 411 412 if (inet6_test_bit(RTALERT_ISOLATE, sk) && 413 !net_eq(sock_net(sk), dev_net(skb->dev))) { 414 continue; 415 } 416 if (last) { 417 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 418 if (skb2) 419 rawv6_rcv(last, skb2); 420 } 421 last = sk; 422 } 423 } 424 425 if (last) { 426 rawv6_rcv(last, skb); 427 read_unlock(&ip6_ra_lock); 428 return 1; 429 } 430 read_unlock(&ip6_ra_lock); 431 return 0; 432 } 433 434 static int ip6_forward_proxy_check(struct sk_buff *skb) 435 { 436 struct ipv6hdr *hdr = ipv6_hdr(skb); 437 u8 nexthdr = hdr->nexthdr; 438 __be16 frag_off; 439 int offset; 440 441 if (ipv6_ext_hdr(nexthdr)) { 442 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off); 443 if (offset < 0) 444 return 0; 445 } else 446 offset = sizeof(struct ipv6hdr); 447 448 if (nexthdr == IPPROTO_ICMPV6) { 449 struct icmp6hdr *icmp6; 450 451 if (!pskb_may_pull(skb, (skb_network_header(skb) + 452 offset + 1 - skb->data))) 453 return 0; 454 455 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset); 456 457 switch (icmp6->icmp6_type) { 458 case NDISC_ROUTER_SOLICITATION: 459 case NDISC_ROUTER_ADVERTISEMENT: 460 case NDISC_NEIGHBOUR_SOLICITATION: 461 case NDISC_NEIGHBOUR_ADVERTISEMENT: 462 case NDISC_REDIRECT: 463 /* For reaction involving unicast neighbor discovery 464 * message destined to the proxied address, pass it to 465 * input function. 466 */ 467 return 1; 468 default: 469 break; 470 } 471 hdr = ipv6_hdr(skb); 472 } 473 474 /* 475 * The proxying router can't forward traffic sent to a link-local 476 * address, so signal the sender and discard the packet. This 477 * behavior is clarified by the MIPv6 specification. 478 */ 479 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) { 480 dst_link_failure(skb); 481 return -1; 482 } 483 484 return 0; 485 } 486 487 static inline int ip6_forward_finish(struct net *net, struct sock *sk, 488 struct sk_buff *skb) 489 { 490 #ifdef CONFIG_NET_SWITCHDEV 491 if (skb->offload_l3_fwd_mark) { 492 consume_skb(skb); 493 return 0; 494 } 495 #endif 496 497 skb_clear_tstamp(skb); 498 return dst_output(net, sk, skb); 499 } 500 501 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) 502 { 503 if (skb->len <= mtu) 504 return false; 505 506 /* ipv6 conntrack defrag sets max_frag_size + ignore_df */ 507 if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu) 508 return true; 509 510 if (skb->ignore_df) 511 return false; 512 513 if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) 514 return false; 515 516 return true; 517 } 518 519 int ip6_forward(struct sk_buff *skb) 520 { 521 struct dst_entry *dst = skb_dst(skb); 522 struct ipv6hdr *hdr = ipv6_hdr(skb); 523 struct inet6_skb_parm *opt = IP6CB(skb); 524 struct net *net = dev_net(dst_dev(dst)); 525 struct net_device *dev; 526 struct inet6_dev *idev; 527 SKB_DR(reason); 528 u32 mtu; 529 530 idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); 531 if (!READ_ONCE(net->ipv6.devconf_all->forwarding) && 532 (!idev || !READ_ONCE(idev->cnf.force_forwarding))) 533 goto error; 534 535 if (skb->pkt_type != PACKET_HOST) 536 goto drop; 537 538 if (unlikely(skb->sk)) 539 goto drop; 540 541 if (skb_warn_if_lro(skb)) 542 goto drop; 543 544 if (!READ_ONCE(net->ipv6.devconf_all->disable_policy) && 545 (!idev || !READ_ONCE(idev->cnf.disable_policy)) && 546 !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { 547 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); 548 goto drop; 549 } 550 551 skb_forward_csum(skb); 552 553 /* 554 * We DO NOT make any processing on 555 * RA packets, pushing them to user level AS IS 556 * without ane WARRANTY that application will be able 557 * to interpret them. The reason is that we 558 * cannot make anything clever here. 559 * 560 * We are not end-node, so that if packet contains 561 * AH/ESP, we cannot make anything. 562 * Defragmentation also would be mistake, RA packets 563 * cannot be fragmented, because there is no warranty 564 * that different fragments will go along one path. --ANK 565 */ 566 if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) { 567 if (ip6_call_ra_chain(skb, ntohs(opt->ra))) 568 return 0; 569 } 570 571 /* 572 * check and decrement ttl 573 */ 574 if (hdr->hop_limit <= 1) { 575 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); 576 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); 577 578 kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR); 579 return -ETIMEDOUT; 580 } 581 582 /* XXX: idev->cnf.proxy_ndp? */ 583 if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) && 584 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) { 585 int proxied = ip6_forward_proxy_check(skb); 586 587 hdr = ipv6_hdr(skb); 588 if (proxied > 0) { 589 /* It's tempting to decrease the hop limit 590 * here by 1, as we do at the end of the 591 * function too. 592 * 593 * But that would be incorrect, as proxying is 594 * not forwarding. The ip6_input function 595 * will handle this packet locally, and it 596 * depends on the hop limit being unchanged. 597 * 598 * One example is the NDP hop limit, that 599 * always has to stay 255, but other would be 600 * similar checks around RA packets, where the 601 * user can even change the desired limit. 602 */ 603 return ip6_input(skb); 604 } else if (proxied < 0) { 605 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); 606 goto drop; 607 } 608 } 609 610 if (!xfrm6_route_forward(skb)) { 611 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); 612 SKB_DR_SET(reason, XFRM_POLICY); 613 goto drop; 614 } 615 dst = skb_dst(skb); 616 dev = dst_dev(dst); 617 /* IPv6 specs say nothing about it, but it is clear that we cannot 618 send redirects to source routed frames. 619 We don't send redirects to frames decapsulated from IPsec. 620 */ 621 if (IP6CB(skb)->iif == dev->ifindex && 622 opt->srcrt == 0 && !skb_sec_path(skb)) { 623 struct in6_addr *target = NULL; 624 struct inet_peer *peer; 625 struct rt6_info *rt; 626 627 /* 628 * incoming and outgoing devices are the same 629 * send a redirect. 630 */ 631 632 rt = dst_rt6_info(dst); 633 if (rt->rt6i_flags & RTF_GATEWAY) 634 target = &rt->rt6i_gateway; 635 else 636 target = &hdr->daddr; 637 638 rcu_read_lock(); 639 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr); 640 641 /* Limit redirects both by destination (here) 642 and by source (inside ndisc_send_redirect) 643 */ 644 if (inet_peer_xrlim_allow(peer, 1*HZ)) 645 ndisc_send_redirect(skb, target); 646 rcu_read_unlock(); 647 } else { 648 int addrtype = ipv6_addr_type(&hdr->saddr); 649 650 /* This check is security critical. */ 651 if (addrtype == IPV6_ADDR_ANY || 652 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK)) 653 goto error; 654 if (addrtype & IPV6_ADDR_LINKLOCAL) { 655 icmpv6_send(skb, ICMPV6_DEST_UNREACH, 656 ICMPV6_NOT_NEIGHBOUR, 0); 657 goto error; 658 } 659 } 660 661 __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); 662 663 mtu = ip6_dst_mtu_maybe_forward(dst, true); 664 if (mtu < IPV6_MIN_MTU) 665 mtu = IPV6_MIN_MTU; 666 667 if (unlikely(ip6_pkt_too_big(skb, mtu))) { 668 /* Again, force OUTPUT device used as source address */ 669 skb->dev = dev; 670 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 671 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS); 672 __IP6_INC_STATS(net, ip6_dst_idev(dst), 673 IPSTATS_MIB_FRAGFAILS); 674 kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); 675 return -EMSGSIZE; 676 } 677 678 if (skb_cow(skb, dev->hard_header_len)) { 679 __IP6_INC_STATS(net, ip6_dst_idev(dst), 680 IPSTATS_MIB_OUTDISCARDS); 681 goto drop; 682 } 683 684 hdr = ipv6_hdr(skb); 685 686 /* Mangling hops number delayed to point after skb COW */ 687 688 hdr->hop_limit--; 689 690 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, 691 net, NULL, skb, skb->dev, dev, 692 ip6_forward_finish); 693 694 error: 695 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); 696 SKB_DR_SET(reason, IP_INADDRERRORS); 697 drop: 698 kfree_skb_reason(skb, reason); 699 return -EINVAL; 700 } 701 702 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) 703 { 704 to->pkt_type = from->pkt_type; 705 to->priority = from->priority; 706 to->protocol = from->protocol; 707 skb_dst_drop(to); 708 skb_dst_set(to, dst_clone(skb_dst(from))); 709 to->dev = from->dev; 710 to->mark = from->mark; 711 712 skb_copy_hash(to, from); 713 714 #ifdef CONFIG_NET_SCHED 715 to->tc_index = from->tc_index; 716 #endif 717 nf_copy(to, from); 718 skb_ext_copy(to, from); 719 skb_copy_secmark(to, from); 720 } 721 722 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr, 723 u8 nexthdr, __be32 frag_id, 724 struct ip6_fraglist_iter *iter) 725 { 726 unsigned int first_len; 727 struct frag_hdr *fh; 728 729 /* BUILD HEADER */ 730 *prevhdr = NEXTHDR_FRAGMENT; 731 iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); 732 if (!iter->tmp_hdr) 733 return -ENOMEM; 734 735 iter->frag = skb_shinfo(skb)->frag_list; 736 skb_frag_list_init(skb); 737 738 iter->offset = 0; 739 iter->hlen = hlen; 740 iter->frag_id = frag_id; 741 iter->nexthdr = nexthdr; 742 743 __skb_pull(skb, hlen); 744 fh = __skb_push(skb, sizeof(struct frag_hdr)); 745 __skb_push(skb, hlen); 746 skb_reset_network_header(skb); 747 memcpy(skb_network_header(skb), iter->tmp_hdr, hlen); 748 749 fh->nexthdr = nexthdr; 750 fh->reserved = 0; 751 fh->frag_off = htons(IP6_MF); 752 fh->identification = frag_id; 753 754 first_len = skb_pagelen(skb); 755 skb->data_len = first_len - skb_headlen(skb); 756 skb->len = first_len; 757 ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr)); 758 759 return 0; 760 } 761 EXPORT_SYMBOL(ip6_fraglist_init); 762 763 void ip6_fraglist_prepare(struct sk_buff *skb, 764 struct ip6_fraglist_iter *iter) 765 { 766 struct sk_buff *frag = iter->frag; 767 unsigned int hlen = iter->hlen; 768 struct frag_hdr *fh; 769 770 frag->ip_summed = CHECKSUM_NONE; 771 skb_reset_transport_header(frag); 772 fh = __skb_push(frag, sizeof(struct frag_hdr)); 773 __skb_push(frag, hlen); 774 skb_reset_network_header(frag); 775 memcpy(skb_network_header(frag), iter->tmp_hdr, hlen); 776 iter->offset += skb->len - hlen - sizeof(struct frag_hdr); 777 fh->nexthdr = iter->nexthdr; 778 fh->reserved = 0; 779 fh->frag_off = htons(iter->offset); 780 if (frag->next) 781 fh->frag_off |= htons(IP6_MF); 782 fh->identification = iter->frag_id; 783 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); 784 ip6_copy_metadata(frag, skb); 785 } 786 EXPORT_SYMBOL(ip6_fraglist_prepare); 787 788 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu, 789 unsigned short needed_tailroom, int hdr_room, u8 *prevhdr, 790 u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state) 791 { 792 state->prevhdr = prevhdr; 793 state->nexthdr = nexthdr; 794 state->frag_id = frag_id; 795 796 state->hlen = hlen; 797 state->mtu = mtu; 798 799 state->left = skb->len - hlen; /* Space per frame */ 800 state->ptr = hlen; /* Where to start from */ 801 802 state->hroom = hdr_room; 803 state->troom = needed_tailroom; 804 805 state->offset = 0; 806 } 807 EXPORT_SYMBOL(ip6_frag_init); 808 809 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state) 810 { 811 u8 *prevhdr = state->prevhdr, *fragnexthdr_offset; 812 struct sk_buff *frag; 813 struct frag_hdr *fh; 814 unsigned int len; 815 816 len = state->left; 817 /* IF: it doesn't fit, use 'mtu' - the data space left */ 818 if (len > state->mtu) 819 len = state->mtu; 820 /* IF: we are not sending up to and including the packet end 821 then align the next start on an eight byte boundary */ 822 if (len < state->left) 823 len &= ~7; 824 825 /* Allocate buffer */ 826 frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) + 827 state->hroom + state->troom, GFP_ATOMIC); 828 if (!frag) 829 return ERR_PTR(-ENOMEM); 830 831 /* 832 * Set up data on packet 833 */ 834 835 ip6_copy_metadata(frag, skb); 836 skb_reserve(frag, state->hroom); 837 skb_put(frag, len + state->hlen + sizeof(struct frag_hdr)); 838 skb_reset_network_header(frag); 839 fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen); 840 frag->transport_header = (frag->network_header + state->hlen + 841 sizeof(struct frag_hdr)); 842 843 /* 844 * Charge the memory for the fragment to any owner 845 * it might possess 846 */ 847 if (skb->sk) 848 skb_set_owner_w(frag, skb->sk); 849 850 /* 851 * Copy the packet header into the new buffer. 852 */ 853 skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen); 854 855 fragnexthdr_offset = skb_network_header(frag); 856 fragnexthdr_offset += prevhdr - skb_network_header(skb); 857 *fragnexthdr_offset = NEXTHDR_FRAGMENT; 858 859 /* 860 * Build fragment header. 861 */ 862 fh->nexthdr = state->nexthdr; 863 fh->reserved = 0; 864 fh->identification = state->frag_id; 865 866 /* 867 * Copy a block of the IP datagram. 868 */ 869 BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag), 870 len)); 871 state->left -= len; 872 873 fh->frag_off = htons(state->offset); 874 if (state->left > 0) 875 fh->frag_off |= htons(IP6_MF); 876 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); 877 878 state->ptr += len; 879 state->offset += len; 880 881 return frag; 882 } 883 EXPORT_SYMBOL(ip6_frag_next); 884 885 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, 886 int (*output)(struct net *, struct sock *, struct sk_buff *)) 887 { 888 struct sk_buff *frag; 889 struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); 890 struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? 891 inet6_sk(skb->sk) : NULL; 892 u8 tstamp_type = skb->tstamp_type; 893 struct ip6_frag_state state; 894 unsigned int mtu, hlen, nexthdr_offset; 895 ktime_t tstamp = skb->tstamp; 896 int hroom, err = 0; 897 __be32 frag_id; 898 u8 *prevhdr, nexthdr = 0; 899 900 if (!ipv6_mod_enabled()) { 901 kfree_skb(skb); 902 return -EAFNOSUPPORT; 903 } 904 905 err = ip6_find_1stfragopt(skb, &prevhdr); 906 if (err < 0) 907 goto fail; 908 hlen = err; 909 nexthdr = *prevhdr; 910 nexthdr_offset = prevhdr - skb_network_header(skb); 911 912 mtu = ip6_skb_dst_mtu(skb); 913 914 /* We must not fragment if the socket is set to force MTU discovery 915 * or if the skb it not generated by a local socket. 916 */ 917 if (unlikely(!skb->ignore_df && skb->len > mtu)) 918 goto fail_toobig; 919 920 if (IP6CB(skb)->frag_max_size) { 921 if (IP6CB(skb)->frag_max_size > mtu) 922 goto fail_toobig; 923 924 /* don't send fragments larger than what we received */ 925 mtu = IP6CB(skb)->frag_max_size; 926 if (mtu < IPV6_MIN_MTU) 927 mtu = IPV6_MIN_MTU; 928 } 929 930 if (np) { 931 u32 frag_size = READ_ONCE(np->frag_size); 932 933 if (frag_size && frag_size < mtu) 934 mtu = frag_size; 935 } 936 if (mtu < hlen + sizeof(struct frag_hdr) + 8) 937 goto fail_toobig; 938 mtu -= hlen + sizeof(struct frag_hdr); 939 940 frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr, 941 &ipv6_hdr(skb)->saddr); 942 943 if (skb->ip_summed == CHECKSUM_PARTIAL && 944 (err = skb_checksum_help(skb))) 945 goto fail; 946 947 prevhdr = skb_network_header(skb) + nexthdr_offset; 948 hroom = LL_RESERVED_SPACE(rt->dst.dev); 949 if (skb_has_frag_list(skb)) { 950 unsigned int first_len = skb_pagelen(skb); 951 struct ip6_fraglist_iter iter; 952 struct sk_buff *frag2; 953 954 if (first_len - hlen > mtu || 955 ((first_len - hlen) & 7) || 956 skb_cloned(skb) || 957 skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) 958 goto slow_path; 959 960 skb_walk_frags(skb, frag) { 961 /* Correct geometry. */ 962 if (frag->len > mtu || 963 ((frag->len & 7) && frag->next) || 964 skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr))) 965 goto slow_path_clean; 966 967 /* Partially cloned skb? */ 968 if (skb_shared(frag)) 969 goto slow_path_clean; 970 971 BUG_ON(frag->sk); 972 if (skb->sk) { 973 frag->sk = skb->sk; 974 frag->destructor = sock_wfree; 975 } 976 skb->truesize -= frag->truesize; 977 } 978 979 err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id, 980 &iter); 981 if (err < 0) 982 goto fail; 983 984 /* We prevent @rt from being freed. */ 985 rcu_read_lock(); 986 987 for (;;) { 988 /* Prepare header of the next frame, 989 * before previous one went down. */ 990 if (iter.frag) 991 ip6_fraglist_prepare(skb, &iter); 992 993 skb_set_delivery_time(skb, tstamp, tstamp_type); 994 err = output(net, sk, skb); 995 if (!err) 996 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), 997 IPSTATS_MIB_FRAGCREATES); 998 999 if (err || !iter.frag) 1000 break; 1001 1002 skb = ip6_fraglist_next(&iter); 1003 } 1004 1005 kfree(iter.tmp_hdr); 1006 1007 if (err == 0) { 1008 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), 1009 IPSTATS_MIB_FRAGOKS); 1010 rcu_read_unlock(); 1011 return 0; 1012 } 1013 1014 kfree_skb_list(iter.frag); 1015 1016 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), 1017 IPSTATS_MIB_FRAGFAILS); 1018 rcu_read_unlock(); 1019 return err; 1020 1021 slow_path_clean: 1022 skb_walk_frags(skb, frag2) { 1023 if (frag2 == frag) 1024 break; 1025 frag2->sk = NULL; 1026 frag2->destructor = NULL; 1027 skb->truesize += frag2->truesize; 1028 } 1029 } 1030 1031 slow_path: 1032 /* 1033 * Fragment the datagram. 1034 */ 1035 1036 ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom, 1037 LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id, 1038 &state); 1039 1040 /* 1041 * Keep copying data until we run out. 1042 */ 1043 1044 while (state.left > 0) { 1045 frag = ip6_frag_next(skb, &state); 1046 if (IS_ERR(frag)) { 1047 err = PTR_ERR(frag); 1048 goto fail; 1049 } 1050 1051 /* 1052 * Put this fragment into the sending queue. 1053 */ 1054 skb_set_delivery_time(frag, tstamp, tstamp_type); 1055 err = output(net, sk, frag); 1056 if (err) 1057 goto fail; 1058 1059 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 1060 IPSTATS_MIB_FRAGCREATES); 1061 } 1062 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 1063 IPSTATS_MIB_FRAGOKS); 1064 consume_skb(skb); 1065 return err; 1066 1067 fail_toobig: 1068 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 1069 err = -EMSGSIZE; 1070 1071 fail: 1072 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 1073 IPSTATS_MIB_FRAGFAILS); 1074 kfree_skb(skb); 1075 return err; 1076 } 1077 EXPORT_SYMBOL_GPL(ip6_fragment); 1078 1079 static inline int ip6_rt_check(const struct rt6key *rt_key, 1080 const struct in6_addr *fl_addr, 1081 const struct in6_addr *addr_cache) 1082 { 1083 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) && 1084 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache)); 1085 } 1086 1087 static struct dst_entry *ip6_sk_dst_check(struct sock *sk, 1088 struct dst_entry *dst, 1089 const struct flowi6 *fl6) 1090 { 1091 struct ipv6_pinfo *np = inet6_sk(sk); 1092 struct rt6_info *rt; 1093 1094 if (!dst) 1095 goto out; 1096 1097 if (dst->ops->family != AF_INET6) { 1098 dst_release(dst); 1099 return NULL; 1100 } 1101 1102 rt = dst_rt6_info(dst); 1103 /* Yes, checking route validity in not connected 1104 * case is not very simple. Take into account, 1105 * that we do not support routing by source, TOS, 1106 * and MSG_DONTROUTE --ANK (980726) 1107 * 1108 * 1. ip6_rt_check(): If route was host route, 1109 * check that cached destination is current. 1110 * If it is network route, we still may 1111 * check its validity using saved pointer 1112 * to the last used address: daddr_cache. 1113 * We do not want to save whole address now, 1114 * (because main consumer of this service 1115 * is tcp, which has not this problem), 1116 * so that the last trick works only on connected 1117 * sockets. 1118 * 2. oif also should be the same. 1119 */ 1120 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, 1121 np->daddr_cache ? &sk->sk_v6_daddr : NULL) || 1122 #ifdef CONFIG_IPV6_SUBTREES 1123 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, 1124 np->saddr_cache ? &np->saddr : NULL) || 1125 #endif 1126 (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) { 1127 dst_release(dst); 1128 dst = NULL; 1129 } 1130 1131 out: 1132 return dst; 1133 } 1134 1135 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, 1136 struct dst_entry **dst, struct flowi6 *fl6) 1137 { 1138 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD 1139 struct neighbour *n; 1140 struct rt6_info *rt; 1141 #endif 1142 int err; 1143 int flags = 0; 1144 1145 /* The correct way to handle this would be to do 1146 * ip6_route_get_saddr, and then ip6_route_output; however, 1147 * the route-specific preferred source forces the 1148 * ip6_route_output call _before_ ip6_route_get_saddr. 1149 * 1150 * In source specific routing (no src=any default route), 1151 * ip6_route_output will fail given src=any saddr, though, so 1152 * that's why we try it again later. 1153 */ 1154 if (ipv6_addr_any(&fl6->saddr)) { 1155 struct fib6_info *from; 1156 struct rt6_info *rt; 1157 1158 *dst = ip6_route_output(net, sk, fl6); 1159 rt = (*dst)->error ? NULL : dst_rt6_info(*dst); 1160 1161 rcu_read_lock(); 1162 from = rt ? rcu_dereference(rt->from) : NULL; 1163 err = ip6_route_get_saddr(net, from, &fl6->daddr, 1164 sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0, 1165 fl6->flowi6_l3mdev, 1166 &fl6->saddr); 1167 rcu_read_unlock(); 1168 1169 if (err) 1170 goto out_err_release; 1171 1172 /* If we had an erroneous initial result, pretend it 1173 * never existed and let the SA-enabled version take 1174 * over. 1175 */ 1176 if ((*dst)->error) { 1177 dst_release(*dst); 1178 *dst = NULL; 1179 } 1180 1181 if (fl6->flowi6_oif) 1182 flags |= RT6_LOOKUP_F_IFACE; 1183 } 1184 1185 if (!*dst) 1186 *dst = ip6_route_output_flags(net, sk, fl6, flags); 1187 1188 err = (*dst)->error; 1189 if (err) 1190 goto out_err_release; 1191 1192 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD 1193 /* 1194 * Here if the dst entry we've looked up 1195 * has a neighbour entry that is in the INCOMPLETE 1196 * state and the src address from the flow is 1197 * marked as OPTIMISTIC, we release the found 1198 * dst entry and replace it instead with the 1199 * dst entry of the nexthop router 1200 */ 1201 rt = dst_rt6_info(*dst); 1202 rcu_read_lock(); 1203 n = __ipv6_neigh_lookup_noref(rt->dst.dev, 1204 rt6_nexthop(rt, &fl6->daddr)); 1205 err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0; 1206 rcu_read_unlock(); 1207 1208 if (err) { 1209 struct inet6_ifaddr *ifp; 1210 struct flowi6 fl_gw6; 1211 int redirect; 1212 1213 ifp = ipv6_get_ifaddr(net, &fl6->saddr, 1214 (*dst)->dev, 1); 1215 1216 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC); 1217 if (ifp) 1218 in6_ifa_put(ifp); 1219 1220 if (redirect) { 1221 /* 1222 * We need to get the dst entry for the 1223 * default router instead 1224 */ 1225 dst_release(*dst); 1226 memcpy(&fl_gw6, fl6, sizeof(struct flowi6)); 1227 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr)); 1228 *dst = ip6_route_output(net, sk, &fl_gw6); 1229 err = (*dst)->error; 1230 if (err) 1231 goto out_err_release; 1232 } 1233 } 1234 #endif 1235 if (ipv6_addr_v4mapped(&fl6->saddr) && 1236 !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) { 1237 err = -EAFNOSUPPORT; 1238 goto out_err_release; 1239 } 1240 1241 return 0; 1242 1243 out_err_release: 1244 dst_release(*dst); 1245 *dst = NULL; 1246 1247 if (err == -ENETUNREACH) 1248 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES); 1249 return err; 1250 } 1251 1252 /** 1253 * ip6_dst_lookup - perform route lookup on flow 1254 * @net: Network namespace to perform lookup in 1255 * @sk: socket which provides route info 1256 * @dst: pointer to dst_entry * for result 1257 * @fl6: flow to lookup 1258 * 1259 * This function performs a route lookup on the given flow. 1260 * 1261 * It returns zero on success, or a standard errno code on error. 1262 */ 1263 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst, 1264 struct flowi6 *fl6) 1265 { 1266 *dst = NULL; 1267 return ip6_dst_lookup_tail(net, sk, dst, fl6); 1268 } 1269 EXPORT_SYMBOL_GPL(ip6_dst_lookup); 1270 1271 /** 1272 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec 1273 * @net: Network namespace to perform lookup in 1274 * @sk: socket which provides route info 1275 * @fl6: flow to lookup 1276 * @final_dst: final destination address for ipsec lookup 1277 * 1278 * This function performs a route lookup on the given flow. 1279 * 1280 * It returns a valid dst pointer on success, or a pointer encoded 1281 * error code. 1282 */ 1283 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6, 1284 const struct in6_addr *final_dst) 1285 { 1286 struct dst_entry *dst = NULL; 1287 int err; 1288 1289 if (!ipv6_mod_enabled()) 1290 return ERR_PTR(-EAFNOSUPPORT); 1291 err = ip6_dst_lookup_tail(net, sk, &dst, fl6); 1292 if (err) 1293 return ERR_PTR(err); 1294 if (final_dst) 1295 fl6->daddr = *final_dst; 1296 1297 return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0); 1298 } 1299 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow); 1300 1301 /** 1302 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow 1303 * @sk: socket which provides the dst cache and route info 1304 * @fl6: flow to lookup 1305 * @final_dst: final destination address for ipsec lookup 1306 * @connected: whether @sk is connected or not 1307 * 1308 * This function performs a route lookup on the given flow with the 1309 * possibility of using the cached route in the socket if it is valid. 1310 * It will take the socket dst lock when operating on the dst cache. 1311 * As a result, this function can only be used in process context. 1312 * 1313 * In addition, for a connected socket, cache the dst in the socket 1314 * if the current cache is not valid. 1315 * 1316 * It returns a valid dst pointer on success, or a pointer encoded 1317 * error code. 1318 */ 1319 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, 1320 const struct in6_addr *final_dst, 1321 bool connected) 1322 { 1323 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie); 1324 1325 dst = ip6_sk_dst_check(sk, dst, fl6); 1326 if (dst) 1327 return dst; 1328 1329 dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst); 1330 if (connected && !IS_ERR(dst)) 1331 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6); 1332 1333 return dst; 1334 } 1335 1336 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src, 1337 gfp_t gfp) 1338 { 1339 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL; 1340 } 1341 1342 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src, 1343 gfp_t gfp) 1344 { 1345 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL; 1346 } 1347 1348 static void ip6_append_data_mtu(unsigned int *mtu, 1349 int *maxfraglen, 1350 unsigned int fragheaderlen, 1351 struct sk_buff *skb, 1352 struct rt6_info *rt, 1353 unsigned int orig_mtu) 1354 { 1355 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) { 1356 if (!skb) { 1357 /* first fragment, reserve header_len */ 1358 *mtu = orig_mtu - rt->dst.header_len; 1359 1360 } else { 1361 /* 1362 * this fragment is not first, the headers 1363 * space is regarded as data space. 1364 */ 1365 *mtu = orig_mtu; 1366 } 1367 *maxfraglen = ((*mtu - fragheaderlen) & ~7) 1368 + fragheaderlen - sizeof(struct frag_hdr); 1369 } 1370 } 1371 1372 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, 1373 struct ipcm6_cookie *ipc6, 1374 struct rt6_info *rt) 1375 { 1376 struct ipv6_txoptions *nopt, *opt = ipc6->opt; 1377 struct inet6_cork *v6_cork = &cork->base6; 1378 struct ipv6_pinfo *np = inet6_sk(sk); 1379 unsigned int mtu, frag_size; 1380 1381 /* callers pass dst together with a reference, set it first so 1382 * ip6_cork_release() can put it down even in case of an error. 1383 */ 1384 cork->base.dst = &rt->dst; 1385 1386 /* 1387 * setup for corking 1388 */ 1389 if (unlikely(opt)) { 1390 if (WARN_ON(v6_cork->opt)) 1391 return -EINVAL; 1392 1393 nopt = v6_cork->opt = kzalloc_obj(*opt, sk->sk_allocation); 1394 if (unlikely(!nopt)) 1395 return -ENOBUFS; 1396 1397 nopt->tot_len = sizeof(*opt); 1398 nopt->opt_flen = opt->opt_flen; 1399 nopt->opt_nflen = opt->opt_nflen; 1400 1401 nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation); 1402 if (opt->dst0opt && !nopt->dst0opt) 1403 return -ENOBUFS; 1404 1405 nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation); 1406 if (opt->dst1opt && !nopt->dst1opt) 1407 return -ENOBUFS; 1408 1409 nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation); 1410 if (opt->hopopt && !nopt->hopopt) 1411 return -ENOBUFS; 1412 1413 nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation); 1414 if (opt->srcrt && !nopt->srcrt) 1415 return -ENOBUFS; 1416 1417 /* need source address above miyazawa*/ 1418 } 1419 v6_cork->hop_limit = ipc6->hlimit; 1420 v6_cork->tclass = ipc6->tclass; 1421 v6_cork->dontfrag = ipc6->dontfrag; 1422 if (rt->dst.flags & DST_XFRM_TUNNEL) 1423 mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ? 1424 READ_ONCE(rt->dst.dev->mtu) : dst6_mtu(&rt->dst); 1425 else 1426 mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ? 1427 READ_ONCE(rt->dst.dev->mtu) : dst6_mtu(xfrm_dst_path(&rt->dst)); 1428 1429 frag_size = READ_ONCE(np->frag_size); 1430 if (frag_size && frag_size < mtu) 1431 mtu = frag_size; 1432 1433 cork->base.fragsize = mtu; 1434 cork->base.gso_size = ipc6->gso_size; 1435 cork->base.tx_flags = 0; 1436 cork->base.mark = ipc6->sockc.mark; 1437 cork->base.priority = ipc6->sockc.priority; 1438 sock_tx_timestamp(sk, &ipc6->sockc, &cork->base.tx_flags); 1439 if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) { 1440 cork->base.flags |= IPCORK_TS_OPT_ID; 1441 cork->base.ts_opt_id = ipc6->sockc.ts_opt_id; 1442 } 1443 cork->base.length = 0; 1444 cork->base.transmit_time = ipc6->sockc.transmit_time; 1445 1446 return 0; 1447 } 1448 1449 static int __ip6_append_data(struct sock *sk, 1450 struct sk_buff_head *queue, 1451 struct inet_cork_full *cork_full, 1452 struct page_frag *pfrag, 1453 int getfrag(void *from, char *to, int offset, 1454 int len, int odd, struct sk_buff *skb), 1455 void *from, size_t length, int transhdrlen, 1456 unsigned int flags) 1457 { 1458 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu; 1459 struct inet6_cork *v6_cork = &cork_full->base6; 1460 struct inet_cork *cork = &cork_full->base; 1461 struct flowi6 *fl6 = &cork_full->fl.u.ip6; 1462 struct sk_buff *skb, *skb_prev = NULL; 1463 struct ubuf_info *uarg = NULL; 1464 int exthdrlen = 0; 1465 int dst_exthdrlen = 0; 1466 int hh_len; 1467 int copy; 1468 int err; 1469 int offset = 0; 1470 bool zc = false; 1471 u32 tskey = 0; 1472 struct rt6_info *rt = dst_rt6_info(cork->dst); 1473 bool paged, hold_tskey = false, extra_uref = false; 1474 struct ipv6_txoptions *opt = v6_cork->opt; 1475 int csummode = CHECKSUM_NONE; 1476 unsigned int maxnonfragsize, headersize; 1477 unsigned int wmem_alloc_delta = 0; 1478 1479 skb = skb_peek_tail(queue); 1480 if (!skb) { 1481 exthdrlen = opt ? opt->opt_flen : 0; 1482 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len; 1483 } 1484 1485 paged = !!cork->gso_size; 1486 mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize; 1487 orig_mtu = mtu; 1488 1489 hh_len = LL_RESERVED_SPACE(rt->dst.dev); 1490 1491 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + 1492 (opt ? opt->opt_nflen : 0); 1493 1494 headersize = sizeof(struct ipv6hdr) + 1495 (opt ? opt->opt_flen + opt->opt_nflen : 0) + 1496 rt->rt6i_nfheader_len; 1497 1498 if (mtu <= fragheaderlen || 1499 ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr)) 1500 goto emsgsize; 1501 1502 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - 1503 sizeof(struct frag_hdr); 1504 1505 /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit 1506 * the first fragment 1507 */ 1508 if (headersize + transhdrlen > mtu) 1509 goto emsgsize; 1510 1511 if (cork->length + length > mtu - headersize && v6_cork->dontfrag && 1512 (sk->sk_protocol == IPPROTO_UDP || 1513 sk->sk_protocol == IPPROTO_ICMPV6 || 1514 sk->sk_protocol == IPPROTO_RAW)) { 1515 ipv6_local_rxpmtu(sk, fl6, mtu - headersize + 1516 sizeof(struct ipv6hdr)); 1517 goto emsgsize; 1518 } 1519 1520 if (ip6_sk_ignore_df(sk)) 1521 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN; 1522 else 1523 maxnonfragsize = mtu; 1524 1525 if (cork->length + length > maxnonfragsize - headersize) { 1526 emsgsize: 1527 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0); 1528 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu); 1529 return -EMSGSIZE; 1530 } 1531 1532 /* CHECKSUM_PARTIAL only with no extension headers and when 1533 * we are not going to fragment 1534 */ 1535 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP && 1536 headersize == sizeof(struct ipv6hdr) && 1537 length <= mtu - headersize && 1538 (!(flags & MSG_MORE) || cork->gso_size) && 1539 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) 1540 csummode = CHECKSUM_PARTIAL; 1541 1542 if ((flags & MSG_ZEROCOPY) && length) { 1543 struct msghdr *msg = from; 1544 1545 if (getfrag == ip_generic_getfrag && msg->msg_ubuf) { 1546 if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb)) 1547 return -EINVAL; 1548 1549 /* Leave uarg NULL if can't zerocopy, callers should 1550 * be able to handle it. 1551 */ 1552 if ((rt->dst.dev->features & NETIF_F_SG) && 1553 csummode == CHECKSUM_PARTIAL) { 1554 paged = true; 1555 zc = true; 1556 uarg = msg->msg_ubuf; 1557 } 1558 } else if (sock_flag(sk, SOCK_ZEROCOPY)) { 1559 uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb), 1560 false); 1561 if (!uarg) 1562 return -ENOBUFS; 1563 extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ 1564 if (rt->dst.dev->features & NETIF_F_SG && 1565 csummode == CHECKSUM_PARTIAL) { 1566 paged = true; 1567 zc = true; 1568 } else { 1569 uarg_to_msgzc(uarg)->zerocopy = 0; 1570 skb_zcopy_set(skb, uarg, &extra_uref); 1571 } 1572 } 1573 } else if ((flags & MSG_SPLICE_PAGES) && length) { 1574 if (inet_test_bit(HDRINCL, sk)) 1575 return -EPERM; 1576 if (rt->dst.dev->features & NETIF_F_SG && 1577 getfrag == ip_generic_getfrag) 1578 /* We need an empty buffer to attach stuff to */ 1579 paged = true; 1580 else 1581 flags &= ~MSG_SPLICE_PAGES; 1582 } 1583 1584 if (cork->tx_flags & SKBTX_ANY_TSTAMP && 1585 READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) { 1586 if (cork->flags & IPCORK_TS_OPT_ID) { 1587 tskey = cork->ts_opt_id; 1588 } else { 1589 tskey = atomic_inc_return(&sk->sk_tskey) - 1; 1590 hold_tskey = true; 1591 } 1592 } 1593 1594 /* 1595 * Let's try using as much space as possible. 1596 * Use MTU if total length of the message fits into the MTU. 1597 * Otherwise, we need to reserve fragment header and 1598 * fragment alignment (= 8-15 octects, in total). 1599 * 1600 * Note that we may need to "move" the data from the tail 1601 * of the buffer to the new fragment when we split 1602 * the message. 1603 * 1604 * FIXME: It may be fragmented into multiple chunks 1605 * at once if non-fragmentable extension headers 1606 * are too large. 1607 * --yoshfuji 1608 */ 1609 1610 cork->length += length; 1611 if (!skb) 1612 goto alloc_new_skb; 1613 1614 while (length > 0) { 1615 /* Check if the remaining data fits into current packet. */ 1616 copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len; 1617 if (copy < length) 1618 copy = maxfraglen - skb->len; 1619 1620 if (copy <= 0) { 1621 char *data; 1622 unsigned int datalen; 1623 unsigned int fraglen; 1624 unsigned int fraggap; 1625 unsigned int alloclen, alloc_extra; 1626 unsigned int pagedlen; 1627 alloc_new_skb: 1628 /* There's no room in the current skb */ 1629 if (skb) 1630 fraggap = skb->len - maxfraglen; 1631 else 1632 fraggap = 0; 1633 /* update mtu and maxfraglen if necessary */ 1634 if (!skb || !skb_prev) 1635 ip6_append_data_mtu(&mtu, &maxfraglen, 1636 fragheaderlen, skb, rt, 1637 orig_mtu); 1638 1639 skb_prev = skb; 1640 1641 /* 1642 * If remaining data exceeds the mtu, 1643 * we know we need more fragment(s). 1644 */ 1645 datalen = length + fraggap; 1646 1647 if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen) 1648 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len; 1649 fraglen = datalen + fragheaderlen; 1650 pagedlen = 0; 1651 1652 alloc_extra = hh_len; 1653 alloc_extra += dst_exthdrlen; 1654 alloc_extra += rt->dst.trailer_len; 1655 1656 /* We just reserve space for fragment header. 1657 * Note: this may be overallocation if the message 1658 * (without MSG_MORE) fits into the MTU. 1659 */ 1660 alloc_extra += sizeof(struct frag_hdr); 1661 1662 if ((flags & MSG_MORE) && 1663 !(rt->dst.dev->features&NETIF_F_SG)) 1664 alloclen = mtu; 1665 else if (!paged && 1666 (fraglen + alloc_extra < SKB_MAX_ALLOC || 1667 !(rt->dst.dev->features & NETIF_F_SG))) 1668 alloclen = fraglen; 1669 else { 1670 alloclen = fragheaderlen + transhdrlen; 1671 pagedlen = datalen - transhdrlen; 1672 } 1673 alloclen += alloc_extra; 1674 1675 if (datalen != length + fraggap) { 1676 /* 1677 * this is not the last fragment, the trailer 1678 * space is regarded as data space. 1679 */ 1680 datalen += rt->dst.trailer_len; 1681 } 1682 1683 fraglen = datalen + fragheaderlen; 1684 1685 copy = datalen - transhdrlen - fraggap - pagedlen; 1686 /* [!] NOTE: copy may be negative if pagedlen>0 1687 * because then the equation may reduces to -fraggap. 1688 */ 1689 if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) { 1690 err = -EINVAL; 1691 goto error; 1692 } 1693 if (transhdrlen) { 1694 skb = sock_alloc_send_skb(sk, alloclen, 1695 (flags & MSG_DONTWAIT), &err); 1696 } else { 1697 skb = NULL; 1698 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <= 1699 2 * sk->sk_sndbuf) 1700 skb = alloc_skb(alloclen, 1701 sk->sk_allocation); 1702 if (unlikely(!skb)) 1703 err = -ENOBUFS; 1704 } 1705 if (!skb) 1706 goto error; 1707 /* 1708 * Fill in the control structures 1709 */ 1710 skb->protocol = htons(ETH_P_IPV6); 1711 skb->ip_summed = csummode; 1712 skb->csum = 0; 1713 /* reserve for fragmentation and ipsec header */ 1714 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) + 1715 dst_exthdrlen); 1716 1717 /* 1718 * Find where to start putting bytes 1719 */ 1720 data = skb_put(skb, fraglen - pagedlen); 1721 skb_set_network_header(skb, exthdrlen); 1722 data += fragheaderlen; 1723 skb->transport_header = (skb->network_header + 1724 fragheaderlen); 1725 if (fraggap) { 1726 skb->csum = skb_copy_and_csum_bits( 1727 skb_prev, maxfraglen, 1728 data + transhdrlen, fraggap); 1729 skb_prev->csum = csum_sub(skb_prev->csum, 1730 skb->csum); 1731 data += fraggap; 1732 pskb_trim_unique(skb_prev, maxfraglen); 1733 } 1734 if (copy > 0 && 1735 INDIRECT_CALL_1(getfrag, ip_generic_getfrag, 1736 from, data + transhdrlen, offset, 1737 copy, fraggap, skb) < 0) { 1738 err = -EFAULT; 1739 kfree_skb(skb); 1740 goto error; 1741 } else if (flags & MSG_SPLICE_PAGES) { 1742 copy = 0; 1743 } 1744 1745 offset += copy; 1746 length -= copy + transhdrlen; 1747 transhdrlen = 0; 1748 exthdrlen = 0; 1749 dst_exthdrlen = 0; 1750 1751 /* Only the initial fragment is time stamped */ 1752 skb_shinfo(skb)->tx_flags = cork->tx_flags; 1753 cork->tx_flags = 0; 1754 skb_shinfo(skb)->tskey = tskey; 1755 tskey = 0; 1756 skb_zcopy_set(skb, uarg, &extra_uref); 1757 1758 if ((flags & MSG_CONFIRM) && !skb_prev) 1759 skb_set_dst_pending_confirm(skb, 1); 1760 1761 /* 1762 * Put the packet on the pending queue 1763 */ 1764 if (!skb->destructor) { 1765 skb->destructor = sock_wfree; 1766 skb->sk = sk; 1767 wmem_alloc_delta += skb->truesize; 1768 } 1769 __skb_queue_tail(queue, skb); 1770 continue; 1771 } 1772 1773 if (copy > length) 1774 copy = length; 1775 1776 if (!(rt->dst.dev->features&NETIF_F_SG) && 1777 skb_tailroom(skb) >= copy) { 1778 unsigned int off; 1779 1780 off = skb->len; 1781 if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, 1782 from, skb_put(skb, copy), 1783 offset, copy, off, skb) < 0) { 1784 __skb_trim(skb, off); 1785 err = -EFAULT; 1786 goto error; 1787 } 1788 } else if (flags & MSG_SPLICE_PAGES) { 1789 struct msghdr *msg = from; 1790 1791 err = -EIO; 1792 if (WARN_ON_ONCE(copy > msg->msg_iter.count)) 1793 goto error; 1794 1795 err = skb_splice_from_iter(skb, &msg->msg_iter, copy); 1796 if (err < 0) 1797 goto error; 1798 copy = err; 1799 if (!(flags & MSG_NO_SHARED_FRAGS)) 1800 skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; 1801 wmem_alloc_delta += copy; 1802 } else if (!zc) { 1803 int i = skb_shinfo(skb)->nr_frags; 1804 1805 err = -ENOMEM; 1806 if (!sk_page_frag_refill(sk, pfrag)) 1807 goto error; 1808 1809 skb_zcopy_downgrade_managed(skb); 1810 if (!skb_can_coalesce(skb, i, pfrag->page, 1811 pfrag->offset)) { 1812 err = -EMSGSIZE; 1813 if (i == MAX_SKB_FRAGS) 1814 goto error; 1815 1816 __skb_fill_page_desc(skb, i, pfrag->page, 1817 pfrag->offset, 0); 1818 skb_shinfo(skb)->nr_frags = ++i; 1819 get_page(pfrag->page); 1820 } 1821 copy = min_t(int, copy, pfrag->size - pfrag->offset); 1822 if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, 1823 from, 1824 page_address(pfrag->page) + pfrag->offset, 1825 offset, copy, skb->len, skb) < 0) 1826 goto error_efault; 1827 1828 pfrag->offset += copy; 1829 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); 1830 skb->len += copy; 1831 skb->data_len += copy; 1832 skb->truesize += copy; 1833 wmem_alloc_delta += copy; 1834 } else { 1835 err = skb_zerocopy_iter_dgram(skb, from, copy); 1836 if (err < 0) 1837 goto error; 1838 } 1839 offset += copy; 1840 length -= copy; 1841 } 1842 1843 if (wmem_alloc_delta) 1844 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); 1845 return 0; 1846 1847 error_efault: 1848 err = -EFAULT; 1849 error: 1850 net_zcopy_put_abort(uarg, extra_uref); 1851 cork->length -= length; 1852 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); 1853 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); 1854 if (hold_tskey) 1855 atomic_dec(&sk->sk_tskey); 1856 return err; 1857 } 1858 1859 int ip6_append_data(struct sock *sk, 1860 int getfrag(void *from, char *to, int offset, int len, 1861 int odd, struct sk_buff *skb), 1862 void *from, size_t length, int transhdrlen, 1863 struct ipcm6_cookie *ipc6, struct flowi6 *fl6, 1864 struct rt6_info *rt, unsigned int flags) 1865 { 1866 struct inet_sock *inet = inet_sk(sk); 1867 int exthdrlen; 1868 int err; 1869 1870 if (flags&MSG_PROBE) 1871 return 0; 1872 if (skb_queue_empty(&sk->sk_write_queue)) { 1873 /* 1874 * setup for corking 1875 */ 1876 dst_hold(&rt->dst); 1877 err = ip6_setup_cork(sk, &inet->cork, 1878 ipc6, rt); 1879 if (err) 1880 return err; 1881 1882 inet->cork.fl.u.ip6 = *fl6; 1883 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0); 1884 length += exthdrlen; 1885 transhdrlen += exthdrlen; 1886 } else { 1887 transhdrlen = 0; 1888 } 1889 1890 return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork, 1891 sk_page_frag(sk), getfrag, 1892 from, length, transhdrlen, flags); 1893 } 1894 EXPORT_SYMBOL_GPL(ip6_append_data); 1895 1896 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork) 1897 { 1898 struct dst_entry *dst = cork->base.dst; 1899 1900 cork->base.dst = NULL; 1901 skb_dst_set(skb, dst); 1902 } 1903 1904 static void ip6_cork_release(struct inet_cork_full *cork) 1905 { 1906 struct inet6_cork *v6_cork = &cork->base6; 1907 1908 if (unlikely(v6_cork->opt)) { 1909 struct ipv6_txoptions *opt = v6_cork->opt; 1910 1911 kfree(opt->dst0opt); 1912 kfree(opt->dst1opt); 1913 kfree(opt->hopopt); 1914 kfree(opt->srcrt); 1915 kfree(opt); 1916 v6_cork->opt = NULL; 1917 } 1918 1919 if (cork->base.dst) { 1920 dst_release(cork->base.dst); 1921 cork->base.dst = NULL; 1922 } 1923 } 1924 1925 struct sk_buff *__ip6_make_skb(struct sock *sk, 1926 struct sk_buff_head *queue, 1927 struct inet_cork_full *cork) 1928 { 1929 struct sk_buff *skb, *tmp_skb; 1930 struct sk_buff **tail_skb; 1931 struct in6_addr *final_dst; 1932 struct net *net = sock_net(sk); 1933 struct ipv6hdr *hdr; 1934 struct ipv6_txoptions *opt; 1935 struct rt6_info *rt = dst_rt6_info(cork->base.dst); 1936 struct flowi6 *fl6 = &cork->fl.u.ip6; 1937 unsigned char proto = fl6->flowi6_proto; 1938 1939 skb = __skb_dequeue(queue); 1940 if (!skb) 1941 goto out; 1942 tail_skb = &(skb_shinfo(skb)->frag_list); 1943 1944 /* move skb->data to ip header from ext header */ 1945 if (skb->data < skb_network_header(skb)) 1946 __skb_pull(skb, skb_network_offset(skb)); 1947 while ((tmp_skb = __skb_dequeue(queue)) != NULL) { 1948 __skb_pull(tmp_skb, skb_network_header_len(skb)); 1949 *tail_skb = tmp_skb; 1950 tail_skb = &(tmp_skb->next); 1951 skb->len += tmp_skb->len; 1952 skb->data_len += tmp_skb->len; 1953 skb->truesize += tmp_skb->truesize; 1954 tmp_skb->destructor = NULL; 1955 tmp_skb->sk = NULL; 1956 } 1957 1958 /* Allow local fragmentation. */ 1959 skb->ignore_df = ip6_sk_ignore_df(sk); 1960 __skb_pull(skb, skb_network_header_len(skb)); 1961 1962 final_dst = &fl6->daddr; 1963 opt = cork->base6.opt; 1964 if (unlikely(opt)) { 1965 if (opt->opt_flen) 1966 proto = ipv6_push_frag_opts(skb, opt, proto); 1967 if (opt->opt_nflen) 1968 proto = ipv6_push_nfrag_opts(skb, opt, proto, 1969 &final_dst, &fl6->saddr); 1970 } 1971 skb_push(skb, sizeof(struct ipv6hdr)); 1972 skb_reset_network_header(skb); 1973 hdr = ipv6_hdr(skb); 1974 1975 ip6_flow_hdr(hdr, cork->base6.tclass, 1976 ip6_make_flowlabel(net, skb, fl6->flowlabel, 1977 ip6_autoflowlabel(net, sk), fl6)); 1978 hdr->hop_limit = cork->base6.hop_limit; 1979 hdr->nexthdr = proto; 1980 hdr->saddr = fl6->saddr; 1981 hdr->daddr = *final_dst; 1982 1983 skb->priority = cork->base.priority; 1984 skb->mark = cork->base.mark; 1985 if (sk_is_tcp(sk)) 1986 skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC); 1987 else 1988 skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid); 1989 1990 ip6_cork_steal_dst(skb, cork); 1991 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS); 1992 if (unlikely(proto == IPPROTO_ICMPV6)) { 1993 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); 1994 u8 icmp6_type; 1995 1996 if (sk->sk_socket->type == SOCK_RAW && 1997 !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH)) 1998 icmp6_type = fl6->fl6_icmp_type; 1999 else 2000 icmp6_type = icmp6_hdr(skb)->icmp6_type; 2001 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type); 2002 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); 2003 } 2004 2005 ip6_cork_release(cork); 2006 out: 2007 return skb; 2008 } 2009 2010 int ip6_send_skb(struct sk_buff *skb) 2011 { 2012 struct net *net = sock_net(skb->sk); 2013 struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); 2014 int err; 2015 2016 rcu_read_lock(); 2017 err = ip6_local_out(net, skb->sk, skb); 2018 if (err) { 2019 if (err > 0) 2020 err = net_xmit_errno(err); 2021 if (err) 2022 IP6_INC_STATS(net, rt->rt6i_idev, 2023 IPSTATS_MIB_OUTDISCARDS); 2024 } 2025 2026 rcu_read_unlock(); 2027 return err; 2028 } 2029 2030 int ip6_push_pending_frames(struct sock *sk) 2031 { 2032 struct sk_buff *skb; 2033 2034 skb = ip6_finish_skb(sk); 2035 if (!skb) 2036 return 0; 2037 2038 return ip6_send_skb(skb); 2039 } 2040 EXPORT_SYMBOL_GPL(ip6_push_pending_frames); 2041 2042 static void __ip6_flush_pending_frames(struct sock *sk, 2043 struct sk_buff_head *queue, 2044 struct inet_cork_full *cork) 2045 { 2046 struct sk_buff *skb; 2047 2048 while ((skb = __skb_dequeue_tail(queue)) != NULL) { 2049 if (skb_dst(skb)) 2050 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)), 2051 IPSTATS_MIB_OUTDISCARDS); 2052 kfree_skb(skb); 2053 } 2054 2055 ip6_cork_release(cork); 2056 } 2057 2058 void ip6_flush_pending_frames(struct sock *sk) 2059 { 2060 __ip6_flush_pending_frames(sk, &sk->sk_write_queue, 2061 &inet_sk(sk)->cork); 2062 } 2063 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames); 2064 2065 struct sk_buff *ip6_make_skb(struct sock *sk, 2066 int getfrag(void *from, char *to, int offset, 2067 int len, int odd, struct sk_buff *skb), 2068 void *from, size_t length, int transhdrlen, 2069 struct ipcm6_cookie *ipc6, struct rt6_info *rt, 2070 unsigned int flags, struct inet_cork_full *cork) 2071 { 2072 int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0); 2073 struct sk_buff_head queue; 2074 int err; 2075 2076 if (flags & MSG_PROBE) { 2077 dst_release(&rt->dst); 2078 return NULL; 2079 } 2080 2081 __skb_queue_head_init(&queue); 2082 2083 cork->base.flags = 0; 2084 cork->base.addr = 0; 2085 cork->base.opt = NULL; 2086 cork->base6.opt = NULL; 2087 err = ip6_setup_cork(sk, cork, ipc6, rt); 2088 if (err) { 2089 ip6_cork_release(cork); 2090 return ERR_PTR(err); 2091 } 2092 2093 err = __ip6_append_data(sk, &queue, cork, 2094 ¤t->task_frag, getfrag, from, 2095 length + exthdrlen, transhdrlen + exthdrlen, 2096 flags); 2097 if (err) { 2098 __ip6_flush_pending_frames(sk, &queue, cork); 2099 return ERR_PTR(err); 2100 } 2101 2102 return __ip6_make_skb(sk, &queue, cork); 2103 } 2104