// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/gso.h>
#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>

/* Final transmit step for one IPv6 packet: ensure link-layer headroom,
 * loop back multicast copies where required, honour lightweight-tunnel
 * redirects, then resolve the neighbour for the route's nexthop and hand
 * the skb to neigh_output().  Runs under RCU (dst_dev_rcu/idev rely on it).
 */
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst_dev_rcu(dst);
	struct inet6_dev *idev = ip6_dst_idev(dst);
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	const struct in6_addr *daddr, *nexthop;
	struct ipv6hdr *hdr;
	struct neighbour *neigh;
	int ret;

	/* Be paranoid, rather than too clever. */
	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
		/* idev stays alive because we hold rcu_read_lock(). */
		skb = skb_expand_head(skb, hh_len);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			return -ENOMEM;
		}
	}

	hdr = ipv6_hdr(skb);
	daddr = &hdr->daddr;
	if (unlikely(ipv6_addr_is_multicast(daddr))) {
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			/* hop_limit 0 means the packet must not leave the
			 * node; the loopback copy above (if any) is enough.
			 */
			if (hdr->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
		/* Node-local scope multicast never goes on the wire. */
		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res != LWTUNNEL_XMIT_CONTINUE)
			return res;
	}

	IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);

	nexthop = rt6_nexthop(dst_rt6_info(dst), daddr);
	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);

	if (IS_ERR_OR_NULL(neigh)) {
		if (unlikely(!neigh))
			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
		if (IS_ERR(neigh)) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
			return -EINVAL;
		}
	}
	sock_confirm_neigh(skb, neigh);
	ret = neigh_output(neigh, skb, false);
	return ret;
}

/* Software-segment a GSO skb whose segments would exceed the egress MTU,
 * then send each segment, fragmenting the ones that are still too big.
 * Returns the first error seen (0 if all segments were sent).
 */
static int
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
				    struct sk_buff *skb, unsigned int mtu)
{
	struct sk_buff *segs, *nskb;
	netdev_features_t features;
	int ret = 0;

	/* Please see corresponding comment in ip_finish_output_gso
	 * describing the cases where GSO segment length exceeds the
	 * egress MTU.
	 */
	features = netif_skb_features(skb);
	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
	if (IS_ERR_OR_NULL(segs)) {
		kfree_skb(skb);
		return -ENOMEM;
	}

	consume_skb(skb);

	skb_list_walk_safe(segs, segs, nskb) {
		int err;

		skb_mark_not_on_list(segs);
		/* Last GSO segment can be smaller than gso_size (and MTU).
		 * Adding a fragment header would produce an "atomic fragment",
		 * which is considered harmful (RFC-8021). Avoid that.
		 */
		err = segs->len > mtu ?
			ip6_fragment(net, sk, segs, ip6_finish_output2) :
			ip6_finish_output2(net, sk, segs);
		if (err && ret == 0)
			ret = err;
	}

	return ret;
}

/* Send a GSO skb: take the segmentation slow path only when the segment
 * length does not fit the egress MTU.
 */
static int ip6_finish_output_gso(struct net *net, struct sock *sk,
				 struct sk_buff *skb, unsigned int mtu)
{
	if (unlikely(!skb_gso_validate_network_len(skb, mtu)))
		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);

	return ip6_finish_output2(net, sk, skb);
}

/* Dispatch one outgoing skb: re-route if SNAT attached an xfrm policy,
 * otherwise send directly, via the GSO path, or via fragmentation
 * depending on size vs. the dst MTU (and conntrack's frag_max_size cap).
 */
static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	unsigned int mtu;

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IP6CB(skb)->flags |= IP6SKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	mtu = ip6_skb_dst_mtu(skb);
	if (skb_is_gso(skb))
		return ip6_finish_output_gso(net, sk, skb, mtu);

	if (unlikely(skb->len > mtu ||
		     (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);

	return ip6_finish_output2(net, sk, skb);
}
210 211 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) 212 { 213 int ret; 214 215 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); 216 switch (ret) { 217 case NET_XMIT_SUCCESS: 218 case NET_XMIT_CN: 219 return __ip6_finish_output(net, sk, skb) ? : ret; 220 default: 221 kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS); 222 return ret; 223 } 224 } 225 226 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) 227 { 228 struct dst_entry *dst = skb_dst(skb); 229 struct net_device *dev, *indev = skb->dev; 230 struct inet6_dev *idev; 231 int ret; 232 233 skb->protocol = htons(ETH_P_IPV6); 234 rcu_read_lock(); 235 dev = dst_dev_rcu(dst); 236 idev = ip6_dst_idev(dst); 237 skb->dev = dev; 238 239 if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) { 240 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); 241 rcu_read_unlock(); 242 kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED); 243 return 0; 244 } 245 246 ret = NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, 247 net, sk, skb, indev, dev, 248 ip6_finish_output, 249 !(IP6CB(skb)->flags & IP6SKB_REROUTED)); 250 rcu_read_unlock(); 251 return ret; 252 } 253 EXPORT_SYMBOL(ip6_output); 254 255 bool ip6_autoflowlabel(struct net *net, const struct sock *sk) 256 { 257 if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk)) 258 return ip6_default_np_autolabel(net); 259 return inet6_test_bit(AUTOFLOWLABEL, sk); 260 } 261 262 int ip6_dst_hoplimit(struct dst_entry *dst) 263 { 264 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); 265 266 rcu_read_lock(); 267 if (hoplimit == 0) { 268 struct net_device *dev = dst_dev_rcu(dst); 269 struct inet6_dev *idev; 270 271 idev = __in6_dev_get(dev); 272 if (idev) 273 hoplimit = READ_ONCE(idev->cnf.hop_limit); 274 else 275 hoplimit = READ_ONCE(dev_net(dev)->ipv6.devconf_all->hop_limit); 276 } 277 rcu_read_unlock(); 278 279 return hoplimit; 280 } 281 EXPORT_SYMBOL(ip6_dst_hoplimit); 282 283 /* 284 * xmit an sk_buff (used by TCP and SCTP) 285 * 
Note : socket lock is not held for SYNACK packets, but might be modified 286 * by calls to skb_set_owner_w() and ipv6_local_error(), 287 * which are using proper atomic operations or spinlocks. 288 */ 289 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, 290 __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority) 291 { 292 const struct ipv6_pinfo *np = inet6_sk(sk); 293 struct in6_addr *first_hop = &fl6->daddr; 294 struct dst_entry *dst = skb_dst(skb); 295 struct inet6_dev *idev = ip6_dst_idev(dst); 296 struct net *net = sock_net(sk); 297 unsigned int head_room; 298 struct net_device *dev; 299 struct ipv6hdr *hdr; 300 u8 proto = fl6->flowi6_proto; 301 int seg_len = skb->len; 302 int ret, hlimit = -1; 303 u32 mtu; 304 305 rcu_read_lock(); 306 307 dev = dst_dev_rcu(dst); 308 head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev); 309 if (opt) 310 head_room += opt->opt_nflen + opt->opt_flen; 311 312 if (unlikely(head_room > skb_headroom(skb))) { 313 /* idev stays alive while we hold rcu_read_lock(). 
*/ 314 skb = skb_expand_head(skb, head_room); 315 if (!skb) { 316 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); 317 ret = -ENOBUFS; 318 goto unlock; 319 } 320 } 321 322 if (unlikely(opt)) { 323 seg_len += opt->opt_nflen + opt->opt_flen; 324 325 if (opt->opt_flen) 326 proto = ipv6_push_frag_opts(skb, opt, proto); 327 328 if (opt->opt_nflen) 329 proto = ipv6_push_nfrag_opts(skb, opt, proto, 330 &first_hop, 331 &fl6->saddr); 332 } 333 334 if (unlikely(seg_len > IPV6_MAXPLEN)) 335 seg_len = 0; 336 337 __skb_push(skb, sizeof(struct ipv6hdr)); 338 skb_reset_network_header(skb); 339 hdr = ipv6_hdr(skb); 340 341 /* 342 * Fill in the IPv6 header 343 */ 344 if (np) 345 hlimit = READ_ONCE(np->hop_limit); 346 if (hlimit < 0) 347 hlimit = ip6_dst_hoplimit(dst); 348 349 ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, 350 ip6_autoflowlabel(net, sk), fl6)); 351 352 hdr->payload_len = htons(seg_len); 353 hdr->nexthdr = proto; 354 hdr->hop_limit = hlimit; 355 356 hdr->saddr = fl6->saddr; 357 hdr->daddr = *first_hop; 358 359 skb->protocol = htons(ETH_P_IPV6); 360 skb->priority = priority; 361 skb->mark = mark; 362 363 mtu = dst6_mtu(dst); 364 if (likely((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb))) { 365 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); 366 367 /* if egress device is enslaved to an L3 master device pass the 368 * skb to its handler for processing 369 */ 370 skb = l3mdev_ip6_out((struct sock *)sk, skb); 371 if (unlikely(!skb)) { 372 ret = 0; 373 goto unlock; 374 } 375 376 /* hooks should never assume socket lock is held. 
377 * we promote our socket to non const 378 */ 379 ret = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, 380 net, (struct sock *)sk, skb, NULL, dev, 381 dst_output); 382 goto unlock; 383 } 384 385 ret = -EMSGSIZE; 386 skb->dev = dev; 387 /* ipv6_local_error() does not require socket lock, 388 * we promote our socket to non const 389 */ 390 ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu); 391 392 IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS); 393 kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); 394 unlock: 395 rcu_read_unlock(); 396 return ret; 397 } 398 EXPORT_SYMBOL(ip6_xmit); 399 400 static int ip6_call_ra_chain(struct sk_buff *skb, int sel) 401 { 402 struct ip6_ra_chain *ra; 403 struct sock *last = NULL; 404 405 read_lock(&ip6_ra_lock); 406 for (ra = ip6_ra_chain; ra; ra = ra->next) { 407 struct sock *sk = ra->sk; 408 if (sk && ra->sel == sel && 409 (!sk->sk_bound_dev_if || 410 sk->sk_bound_dev_if == skb->dev->ifindex)) { 411 412 if (inet6_test_bit(RTALERT_ISOLATE, sk) && 413 !net_eq(sock_net(sk), dev_net(skb->dev))) { 414 continue; 415 } 416 if (last) { 417 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 418 if (skb2) 419 rawv6_rcv(last, skb2); 420 } 421 last = sk; 422 } 423 } 424 425 if (last) { 426 rawv6_rcv(last, skb); 427 read_unlock(&ip6_ra_lock); 428 return 1; 429 } 430 read_unlock(&ip6_ra_lock); 431 return 0; 432 } 433 434 static int ip6_forward_proxy_check(struct sk_buff *skb) 435 { 436 struct ipv6hdr *hdr = ipv6_hdr(skb); 437 u8 nexthdr = hdr->nexthdr; 438 __be16 frag_off; 439 int offset; 440 441 if (ipv6_ext_hdr(nexthdr)) { 442 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off); 443 if (offset < 0) 444 return 0; 445 } else 446 offset = sizeof(struct ipv6hdr); 447 448 if (nexthdr == IPPROTO_ICMPV6) { 449 struct icmp6hdr *icmp6; 450 451 if (!pskb_may_pull(skb, (skb_network_header(skb) + 452 offset + 1 - skb->data))) 453 return 0; 454 455 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset); 456 457 switch 
(icmp6->icmp6_type) { 458 case NDISC_ROUTER_SOLICITATION: 459 case NDISC_ROUTER_ADVERTISEMENT: 460 case NDISC_NEIGHBOUR_SOLICITATION: 461 case NDISC_NEIGHBOUR_ADVERTISEMENT: 462 case NDISC_REDIRECT: 463 /* For reaction involving unicast neighbor discovery 464 * message destined to the proxied address, pass it to 465 * input function. 466 */ 467 return 1; 468 default: 469 break; 470 } 471 } 472 473 /* 474 * The proxying router can't forward traffic sent to a link-local 475 * address, so signal the sender and discard the packet. This 476 * behavior is clarified by the MIPv6 specification. 477 */ 478 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) { 479 dst_link_failure(skb); 480 return -1; 481 } 482 483 return 0; 484 } 485 486 static inline int ip6_forward_finish(struct net *net, struct sock *sk, 487 struct sk_buff *skb) 488 { 489 #ifdef CONFIG_NET_SWITCHDEV 490 if (skb->offload_l3_fwd_mark) { 491 consume_skb(skb); 492 return 0; 493 } 494 #endif 495 496 skb_clear_tstamp(skb); 497 return dst_output(net, sk, skb); 498 } 499 500 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) 501 { 502 if (skb->len <= mtu) 503 return false; 504 505 /* ipv6 conntrack defrag sets max_frag_size + ignore_df */ 506 if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu) 507 return true; 508 509 if (skb->ignore_df) 510 return false; 511 512 if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) 513 return false; 514 515 return true; 516 } 517 518 int ip6_forward(struct sk_buff *skb) 519 { 520 struct dst_entry *dst = skb_dst(skb); 521 struct ipv6hdr *hdr = ipv6_hdr(skb); 522 struct inet6_skb_parm *opt = IP6CB(skb); 523 struct net *net = dev_net(dst_dev(dst)); 524 struct net_device *dev; 525 struct inet6_dev *idev; 526 SKB_DR(reason); 527 u32 mtu; 528 529 idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif)); 530 if (!READ_ONCE(net->ipv6.devconf_all->forwarding) && 531 (!idev || !READ_ONCE(idev->cnf.force_forwarding))) 532 goto 
error; 533 534 if (skb->pkt_type != PACKET_HOST) 535 goto drop; 536 537 if (unlikely(skb->sk)) 538 goto drop; 539 540 if (skb_warn_if_lro(skb)) 541 goto drop; 542 543 if (!READ_ONCE(net->ipv6.devconf_all->disable_policy) && 544 (!idev || !READ_ONCE(idev->cnf.disable_policy)) && 545 !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { 546 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); 547 goto drop; 548 } 549 550 skb_forward_csum(skb); 551 552 /* 553 * We DO NOT make any processing on 554 * RA packets, pushing them to user level AS IS 555 * without ane WARRANTY that application will be able 556 * to interpret them. The reason is that we 557 * cannot make anything clever here. 558 * 559 * We are not end-node, so that if packet contains 560 * AH/ESP, we cannot make anything. 561 * Defragmentation also would be mistake, RA packets 562 * cannot be fragmented, because there is no warranty 563 * that different fragments will go along one path. --ANK 564 */ 565 if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) { 566 if (ip6_call_ra_chain(skb, ntohs(opt->ra))) 567 return 0; 568 } 569 570 /* 571 * check and decrement ttl 572 */ 573 if (hdr->hop_limit <= 1) { 574 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); 575 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); 576 577 kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR); 578 return -ETIMEDOUT; 579 } 580 581 /* XXX: idev->cnf.proxy_ndp? */ 582 if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) && 583 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) { 584 int proxied = ip6_forward_proxy_check(skb); 585 if (proxied > 0) { 586 /* It's tempting to decrease the hop limit 587 * here by 1, as we do at the end of the 588 * function too. 589 * 590 * But that would be incorrect, as proxying is 591 * not forwarding. The ip6_input function 592 * will handle this packet locally, and it 593 * depends on the hop limit being unchanged. 
594 * 595 * One example is the NDP hop limit, that 596 * always has to stay 255, but other would be 597 * similar checks around RA packets, where the 598 * user can even change the desired limit. 599 */ 600 return ip6_input(skb); 601 } else if (proxied < 0) { 602 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); 603 goto drop; 604 } 605 } 606 607 if (!xfrm6_route_forward(skb)) { 608 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); 609 SKB_DR_SET(reason, XFRM_POLICY); 610 goto drop; 611 } 612 dst = skb_dst(skb); 613 dev = dst_dev(dst); 614 /* IPv6 specs say nothing about it, but it is clear that we cannot 615 send redirects to source routed frames. 616 We don't send redirects to frames decapsulated from IPsec. 617 */ 618 if (IP6CB(skb)->iif == dev->ifindex && 619 opt->srcrt == 0 && !skb_sec_path(skb)) { 620 struct in6_addr *target = NULL; 621 struct inet_peer *peer; 622 struct rt6_info *rt; 623 624 /* 625 * incoming and outgoing devices are the same 626 * send a redirect. 627 */ 628 629 rt = dst_rt6_info(dst); 630 if (rt->rt6i_flags & RTF_GATEWAY) 631 target = &rt->rt6i_gateway; 632 else 633 target = &hdr->daddr; 634 635 rcu_read_lock(); 636 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr); 637 638 /* Limit redirects both by destination (here) 639 and by source (inside ndisc_send_redirect) 640 */ 641 if (inet_peer_xrlim_allow(peer, 1*HZ)) 642 ndisc_send_redirect(skb, target); 643 rcu_read_unlock(); 644 } else { 645 int addrtype = ipv6_addr_type(&hdr->saddr); 646 647 /* This check is security critical. 
*/ 648 if (addrtype == IPV6_ADDR_ANY || 649 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK)) 650 goto error; 651 if (addrtype & IPV6_ADDR_LINKLOCAL) { 652 icmpv6_send(skb, ICMPV6_DEST_UNREACH, 653 ICMPV6_NOT_NEIGHBOUR, 0); 654 goto error; 655 } 656 } 657 658 __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); 659 660 mtu = ip6_dst_mtu_maybe_forward(dst, true); 661 if (mtu < IPV6_MIN_MTU) 662 mtu = IPV6_MIN_MTU; 663 664 if (unlikely(ip6_pkt_too_big(skb, mtu))) { 665 /* Again, force OUTPUT device used as source address */ 666 skb->dev = dev; 667 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 668 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS); 669 __IP6_INC_STATS(net, ip6_dst_idev(dst), 670 IPSTATS_MIB_FRAGFAILS); 671 kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); 672 return -EMSGSIZE; 673 } 674 675 if (skb_cow(skb, dev->hard_header_len)) { 676 __IP6_INC_STATS(net, ip6_dst_idev(dst), 677 IPSTATS_MIB_OUTDISCARDS); 678 goto drop; 679 } 680 681 hdr = ipv6_hdr(skb); 682 683 /* Mangling hops number delayed to point after skb COW */ 684 685 hdr->hop_limit--; 686 687 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, 688 net, NULL, skb, skb->dev, dev, 689 ip6_forward_finish); 690 691 error: 692 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); 693 SKB_DR_SET(reason, IP_INADDRERRORS); 694 drop: 695 kfree_skb_reason(skb, reason); 696 return -EINVAL; 697 } 698 699 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) 700 { 701 to->pkt_type = from->pkt_type; 702 to->priority = from->priority; 703 to->protocol = from->protocol; 704 skb_dst_drop(to); 705 skb_dst_set(to, dst_clone(skb_dst(from))); 706 to->dev = from->dev; 707 to->mark = from->mark; 708 709 skb_copy_hash(to, from); 710 711 #ifdef CONFIG_NET_SCHED 712 to->tc_index = from->tc_index; 713 #endif 714 nf_copy(to, from); 715 skb_ext_copy(to, from); 716 skb_copy_secmark(to, from); 717 } 718 719 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr, 720 
u8 nexthdr, __be32 frag_id, 721 struct ip6_fraglist_iter *iter) 722 { 723 unsigned int first_len; 724 struct frag_hdr *fh; 725 726 /* BUILD HEADER */ 727 *prevhdr = NEXTHDR_FRAGMENT; 728 iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); 729 if (!iter->tmp_hdr) 730 return -ENOMEM; 731 732 iter->frag = skb_shinfo(skb)->frag_list; 733 skb_frag_list_init(skb); 734 735 iter->offset = 0; 736 iter->hlen = hlen; 737 iter->frag_id = frag_id; 738 iter->nexthdr = nexthdr; 739 740 __skb_pull(skb, hlen); 741 fh = __skb_push(skb, sizeof(struct frag_hdr)); 742 __skb_push(skb, hlen); 743 skb_reset_network_header(skb); 744 memcpy(skb_network_header(skb), iter->tmp_hdr, hlen); 745 746 fh->nexthdr = nexthdr; 747 fh->reserved = 0; 748 fh->frag_off = htons(IP6_MF); 749 fh->identification = frag_id; 750 751 first_len = skb_pagelen(skb); 752 skb->data_len = first_len - skb_headlen(skb); 753 skb->len = first_len; 754 ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr)); 755 756 return 0; 757 } 758 EXPORT_SYMBOL(ip6_fraglist_init); 759 760 void ip6_fraglist_prepare(struct sk_buff *skb, 761 struct ip6_fraglist_iter *iter) 762 { 763 struct sk_buff *frag = iter->frag; 764 unsigned int hlen = iter->hlen; 765 struct frag_hdr *fh; 766 767 frag->ip_summed = CHECKSUM_NONE; 768 skb_reset_transport_header(frag); 769 fh = __skb_push(frag, sizeof(struct frag_hdr)); 770 __skb_push(frag, hlen); 771 skb_reset_network_header(frag); 772 memcpy(skb_network_header(frag), iter->tmp_hdr, hlen); 773 iter->offset += skb->len - hlen - sizeof(struct frag_hdr); 774 fh->nexthdr = iter->nexthdr; 775 fh->reserved = 0; 776 fh->frag_off = htons(iter->offset); 777 if (frag->next) 778 fh->frag_off |= htons(IP6_MF); 779 fh->identification = iter->frag_id; 780 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); 781 ip6_copy_metadata(frag, skb); 782 } 783 EXPORT_SYMBOL(ip6_fraglist_prepare); 784 785 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned 
int mtu, 786 unsigned short needed_tailroom, int hdr_room, u8 *prevhdr, 787 u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state) 788 { 789 state->prevhdr = prevhdr; 790 state->nexthdr = nexthdr; 791 state->frag_id = frag_id; 792 793 state->hlen = hlen; 794 state->mtu = mtu; 795 796 state->left = skb->len - hlen; /* Space per frame */ 797 state->ptr = hlen; /* Where to start from */ 798 799 state->hroom = hdr_room; 800 state->troom = needed_tailroom; 801 802 state->offset = 0; 803 } 804 EXPORT_SYMBOL(ip6_frag_init); 805 806 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state) 807 { 808 u8 *prevhdr = state->prevhdr, *fragnexthdr_offset; 809 struct sk_buff *frag; 810 struct frag_hdr *fh; 811 unsigned int len; 812 813 len = state->left; 814 /* IF: it doesn't fit, use 'mtu' - the data space left */ 815 if (len > state->mtu) 816 len = state->mtu; 817 /* IF: we are not sending up to and including the packet end 818 then align the next start on an eight byte boundary */ 819 if (len < state->left) 820 len &= ~7; 821 822 /* Allocate buffer */ 823 frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) + 824 state->hroom + state->troom, GFP_ATOMIC); 825 if (!frag) 826 return ERR_PTR(-ENOMEM); 827 828 /* 829 * Set up data on packet 830 */ 831 832 ip6_copy_metadata(frag, skb); 833 skb_reserve(frag, state->hroom); 834 skb_put(frag, len + state->hlen + sizeof(struct frag_hdr)); 835 skb_reset_network_header(frag); 836 fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen); 837 frag->transport_header = (frag->network_header + state->hlen + 838 sizeof(struct frag_hdr)); 839 840 /* 841 * Charge the memory for the fragment to any owner 842 * it might possess 843 */ 844 if (skb->sk) 845 skb_set_owner_w(frag, skb->sk); 846 847 /* 848 * Copy the packet header into the new buffer. 
849 */ 850 skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen); 851 852 fragnexthdr_offset = skb_network_header(frag); 853 fragnexthdr_offset += prevhdr - skb_network_header(skb); 854 *fragnexthdr_offset = NEXTHDR_FRAGMENT; 855 856 /* 857 * Build fragment header. 858 */ 859 fh->nexthdr = state->nexthdr; 860 fh->reserved = 0; 861 fh->identification = state->frag_id; 862 863 /* 864 * Copy a block of the IP datagram. 865 */ 866 BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag), 867 len)); 868 state->left -= len; 869 870 fh->frag_off = htons(state->offset); 871 if (state->left > 0) 872 fh->frag_off |= htons(IP6_MF); 873 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); 874 875 state->ptr += len; 876 state->offset += len; 877 878 return frag; 879 } 880 EXPORT_SYMBOL(ip6_frag_next); 881 882 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, 883 int (*output)(struct net *, struct sock *, struct sk_buff *)) 884 { 885 struct sk_buff *frag; 886 struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); 887 struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? 888 inet6_sk(skb->sk) : NULL; 889 u8 tstamp_type = skb->tstamp_type; 890 struct ip6_frag_state state; 891 unsigned int mtu, hlen, nexthdr_offset; 892 ktime_t tstamp = skb->tstamp; 893 int hroom, err = 0; 894 __be32 frag_id; 895 u8 *prevhdr, nexthdr = 0; 896 897 if (!ipv6_mod_enabled()) { 898 kfree_skb(skb); 899 return -EAFNOSUPPORT; 900 } 901 902 err = ip6_find_1stfragopt(skb, &prevhdr); 903 if (err < 0) 904 goto fail; 905 hlen = err; 906 nexthdr = *prevhdr; 907 nexthdr_offset = prevhdr - skb_network_header(skb); 908 909 mtu = ip6_skb_dst_mtu(skb); 910 911 /* We must not fragment if the socket is set to force MTU discovery 912 * or if the skb it not generated by a local socket. 
913 */ 914 if (unlikely(!skb->ignore_df && skb->len > mtu)) 915 goto fail_toobig; 916 917 if (IP6CB(skb)->frag_max_size) { 918 if (IP6CB(skb)->frag_max_size > mtu) 919 goto fail_toobig; 920 921 /* don't send fragments larger than what we received */ 922 mtu = IP6CB(skb)->frag_max_size; 923 if (mtu < IPV6_MIN_MTU) 924 mtu = IPV6_MIN_MTU; 925 } 926 927 if (np) { 928 u32 frag_size = READ_ONCE(np->frag_size); 929 930 if (frag_size && frag_size < mtu) 931 mtu = frag_size; 932 } 933 if (mtu < hlen + sizeof(struct frag_hdr) + 8) 934 goto fail_toobig; 935 mtu -= hlen + sizeof(struct frag_hdr); 936 937 frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr, 938 &ipv6_hdr(skb)->saddr); 939 940 if (skb->ip_summed == CHECKSUM_PARTIAL && 941 (err = skb_checksum_help(skb))) 942 goto fail; 943 944 prevhdr = skb_network_header(skb) + nexthdr_offset; 945 hroom = LL_RESERVED_SPACE(rt->dst.dev); 946 if (skb_has_frag_list(skb)) { 947 unsigned int first_len = skb_pagelen(skb); 948 struct ip6_fraglist_iter iter; 949 struct sk_buff *frag2; 950 951 if (first_len - hlen > mtu || 952 ((first_len - hlen) & 7) || 953 skb_cloned(skb) || 954 skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) 955 goto slow_path; 956 957 skb_walk_frags(skb, frag) { 958 /* Correct geometry. */ 959 if (frag->len > mtu || 960 ((frag->len & 7) && frag->next) || 961 skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr))) 962 goto slow_path_clean; 963 964 /* Partially cloned skb? */ 965 if (skb_shared(frag)) 966 goto slow_path_clean; 967 968 BUG_ON(frag->sk); 969 if (skb->sk) { 970 frag->sk = skb->sk; 971 frag->destructor = sock_wfree; 972 } 973 skb->truesize -= frag->truesize; 974 } 975 976 err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id, 977 &iter); 978 if (err < 0) 979 goto fail; 980 981 /* We prevent @rt from being freed. */ 982 rcu_read_lock(); 983 984 for (;;) { 985 /* Prepare header of the next frame, 986 * before previous one went down. 
*/ 987 if (iter.frag) 988 ip6_fraglist_prepare(skb, &iter); 989 990 skb_set_delivery_time(skb, tstamp, tstamp_type); 991 err = output(net, sk, skb); 992 if (!err) 993 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), 994 IPSTATS_MIB_FRAGCREATES); 995 996 if (err || !iter.frag) 997 break; 998 999 skb = ip6_fraglist_next(&iter); 1000 } 1001 1002 kfree(iter.tmp_hdr); 1003 1004 if (err == 0) { 1005 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), 1006 IPSTATS_MIB_FRAGOKS); 1007 rcu_read_unlock(); 1008 return 0; 1009 } 1010 1011 kfree_skb_list(iter.frag); 1012 1013 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), 1014 IPSTATS_MIB_FRAGFAILS); 1015 rcu_read_unlock(); 1016 return err; 1017 1018 slow_path_clean: 1019 skb_walk_frags(skb, frag2) { 1020 if (frag2 == frag) 1021 break; 1022 frag2->sk = NULL; 1023 frag2->destructor = NULL; 1024 skb->truesize += frag2->truesize; 1025 } 1026 } 1027 1028 slow_path: 1029 /* 1030 * Fragment the datagram. 1031 */ 1032 1033 ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom, 1034 LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id, 1035 &state); 1036 1037 /* 1038 * Keep copying data until we run out. 1039 */ 1040 1041 while (state.left > 0) { 1042 frag = ip6_frag_next(skb, &state); 1043 if (IS_ERR(frag)) { 1044 err = PTR_ERR(frag); 1045 goto fail; 1046 } 1047 1048 /* 1049 * Put this fragment into the sending queue. 
1050 */ 1051 skb_set_delivery_time(frag, tstamp, tstamp_type); 1052 err = output(net, sk, frag); 1053 if (err) 1054 goto fail; 1055 1056 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 1057 IPSTATS_MIB_FRAGCREATES); 1058 } 1059 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 1060 IPSTATS_MIB_FRAGOKS); 1061 consume_skb(skb); 1062 return err; 1063 1064 fail_toobig: 1065 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 1066 err = -EMSGSIZE; 1067 1068 fail: 1069 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 1070 IPSTATS_MIB_FRAGFAILS); 1071 kfree_skb(skb); 1072 return err; 1073 } 1074 EXPORT_SYMBOL_GPL(ip6_fragment); 1075 1076 static inline int ip6_rt_check(const struct rt6key *rt_key, 1077 const struct in6_addr *fl_addr, 1078 const struct in6_addr *addr_cache) 1079 { 1080 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) && 1081 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache)); 1082 } 1083 1084 static struct dst_entry *ip6_sk_dst_check(struct sock *sk, 1085 struct dst_entry *dst, 1086 const struct flowi6 *fl6) 1087 { 1088 struct ipv6_pinfo *np = inet6_sk(sk); 1089 struct rt6_info *rt; 1090 1091 if (!dst) 1092 goto out; 1093 1094 if (dst->ops->family != AF_INET6) { 1095 dst_release(dst); 1096 return NULL; 1097 } 1098 1099 rt = dst_rt6_info(dst); 1100 /* Yes, checking route validity in not connected 1101 * case is not very simple. Take into account, 1102 * that we do not support routing by source, TOS, 1103 * and MSG_DONTROUTE --ANK (980726) 1104 * 1105 * 1. ip6_rt_check(): If route was host route, 1106 * check that cached destination is current. 1107 * If it is network route, we still may 1108 * check its validity using saved pointer 1109 * to the last used address: daddr_cache. 1110 * We do not want to save whole address now, 1111 * (because main consumer of this service 1112 * is tcp, which has not this problem), 1113 * so that the last trick works only on connected 1114 * sockets. 1115 * 2. oif also should be the same. 
1116 */ 1117 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, 1118 np->daddr_cache ? &sk->sk_v6_daddr : NULL) || 1119 #ifdef CONFIG_IPV6_SUBTREES 1120 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, 1121 np->saddr_cache ? &np->saddr : NULL) || 1122 #endif 1123 (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) { 1124 dst_release(dst); 1125 dst = NULL; 1126 } 1127 1128 out: 1129 return dst; 1130 } 1131 1132 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, 1133 struct dst_entry **dst, struct flowi6 *fl6) 1134 { 1135 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD 1136 struct neighbour *n; 1137 struct rt6_info *rt; 1138 #endif 1139 int err; 1140 int flags = 0; 1141 1142 /* The correct way to handle this would be to do 1143 * ip6_route_get_saddr, and then ip6_route_output; however, 1144 * the route-specific preferred source forces the 1145 * ip6_route_output call _before_ ip6_route_get_saddr. 1146 * 1147 * In source specific routing (no src=any default route), 1148 * ip6_route_output will fail given src=any saddr, though, so 1149 * that's why we try it again later. 1150 */ 1151 if (ipv6_addr_any(&fl6->saddr)) { 1152 struct fib6_info *from; 1153 struct rt6_info *rt; 1154 1155 *dst = ip6_route_output(net, sk, fl6); 1156 rt = (*dst)->error ? NULL : dst_rt6_info(*dst); 1157 1158 rcu_read_lock(); 1159 from = rt ? rcu_dereference(rt->from) : NULL; 1160 err = ip6_route_get_saddr(net, from, &fl6->daddr, 1161 sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0, 1162 fl6->flowi6_l3mdev, 1163 &fl6->saddr); 1164 rcu_read_unlock(); 1165 1166 if (err) 1167 goto out_err_release; 1168 1169 /* If we had an erroneous initial result, pretend it 1170 * never existed and let the SA-enabled version take 1171 * over. 
		 */
		if ((*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = dst_rt6_info(*dst);
	rcu_read_lock();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	/* A v4-mapped source may only be used towards a v4-mapped (or
	 * unspecified) destination.
	 */
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 * ip6_dst_lookup - perform route lookup on flow
 * @net:
Network namespace to perform lookup in
 * @sk: socket which provides route info
 * @dst: pointer to dst_entry * for result
 * @fl6: flow to lookup
 *
 * This function performs a route lookup on the given flow.
 *
 * It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 * @net: Network namespace to perform lookup in
 * @sk: socket which provides route info
 * @fl6: flow to lookup
 * @final_dst: final destination address for ipsec lookup
 *
 * This function performs a route lookup on the given flow.
 *
 * It returns a valid dst pointer on success, or a pointer encoded
 * error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	if (!ipv6_mod_enabled())
		return ERR_PTR(-EAFNOSUPPORT);
	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 * @sk: socket which provides the dst cache and route info
 * @fl6: flow to lookup
 * @final_dst: final destination address for ipsec lookup
 * @connected: whether @sk is connected or not
 *
 * This function performs a route lookup on the given flow with the
 * possibility of using the cached route in the socket if it is valid.
 * It will take the socket dst lock when operating on the dst cache.
 * As a result, this function can only be used in process context.
 *
 * In addition, for a connected socket, cache the dst in the socket
 * if the current cache is not valid.
 *
 * It returns a valid dst pointer on success, or a pointer encoded
 * error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

/* Duplicate an extension-options header; its length field counts
 * 8-octet units beyond the first 8 octets.  NULL in, NULL out.
 */
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

/* Same as ip6_opt_dup(), but for a routing header. */
static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

/* Recompute *mtu and *maxfraglen for append_data: the first fragment
 * must reserve dst->header_len, later fragments may use the full
 * original MTU (unless the dst is an XFRM tunnel, which is left alone).
 */
static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		/* Largest 8-byte-aligned payload, minus the fragment header. */
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

/* Initialize the cork state for a corked send: deep-copy the tx options,
 * record hop limit / traffic class / dontfrag, and compute the effective
 * MTU.  Takes over the dst reference passed in via @rt.  Returns 0 or a
 * negative errno (ip6_cork_release() cleans up partial copies).
 */
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt)
{
	struct ipv6_txoptions *nopt, *opt = ipc6->opt;
	struct inet6_cork *v6_cork = &cork->base6;
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu, frag_size;

	/* callers pass dst together with a reference, set it first so
	 * ip6_cork_release() can put it down even in case of an error.
	 */
	cork->base.dst = &rt->dst;

	/*
	 * setup for corking
	 */
	if (unlikely(opt)) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		nopt = v6_cork->opt = kzalloc_obj(*opt, sk->sk_allocation);
		if (unlikely(!nopt))
			return -ENOBUFS;

		nopt->tot_len = sizeof(*opt);
		nopt->opt_flen = opt->opt_flen;
		nopt->opt_nflen = opt->opt_nflen;

		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
		if (opt->dst0opt && !nopt->dst0opt)
			return -ENOBUFS;

		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
		if (opt->dst1opt && !nopt->dst1opt)
			return -ENOBUFS;

		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
		if (opt->hopopt && !nopt->hopopt)
			return -ENOBUFS;

		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
		if (opt->srcrt && !nopt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa*/
	}
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	v6_cork->dontfrag = ipc6->dontfrag;
	/* PMTUDISC_PROBE uses the raw device MTU; otherwise take the dst
	 * MTU (for XFRM tunnels, the inner dst; else the path dst).
	 */
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst6_mtu(&rt->dst);
	else
		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst6_mtu(xfrm_dst_path(&rt->dst));

	/* Per-socket IPV6_MTU setting may lower the effective MTU further. */
	frag_size = READ_ONCE(np->frag_size);
	if (frag_size && frag_size < mtu)
		mtu = frag_size;

	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	cork->base.mark = ipc6->sockc.mark;
	cork->base.priority = ipc6->sockc.priority;
	sock_tx_timestamp(sk, &ipc6->sockc, &cork->base.tx_flags);
	if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) {
		cork->base.flags |= IPCORK_TS_OPT_ID;
		cork->base.ts_opt_id = ipc6->sockc.ts_opt_id;
	}
	cork->base.length = 0;
	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}

/* Append @length bytes (fetched via @getfrag from @from) to @queue,
 * building MTU-sized (or GSO-sized) skbs as needed, with optional
 * zerocopy / spliced pages.  Returns 0 or a negative errno; on error the
 * already-queued data is left for the caller to flush.
 */
static int __ip6_append_data(struct sock *sk,
			     struct sk_buff_head *queue,
			     struct inet_cork_full *cork_full,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     unsigned int flags)
{
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct inet6_cork *v6_cork = &cork_full->base6;
	struct inet_cork *cork = &cork_full->base;
	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
	struct sk_buff *skb, *skb_prev = NULL;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	bool zc = false;
	u32 tskey = 0;
	struct rt6_info *rt = dst_rt6_info(cork->dst);
	bool paged, hold_tskey = false, extra_uref = false;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;

	skb = skb_peek_tail(queue);
	if (!skb) {
		/* First skb of this cork: account for extension headers. */
		exthdrlen = opt ?
			    opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     rt->rt6i_nfheader_len;

	/* The MTU must leave room for at least one 8-octet-aligned chunk
	 * of payload beyond the headers and fragment header.
	 */
	if (mtu <= fragheaderlen ||
	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
		goto emsgsize;

	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	/* IPV6_DONTFRAG datagram sockets get a local RXPMTU notification
	 * instead of fragmentation.
	 */
	if (cork->length + length > mtu - headersize && v6_cork->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_ICMPV6 ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				  sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if ((flags &
	     MSG_ZEROCOPY) && length) {
		struct msghdr *msg = from;

		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
			/* A caller-provided ubuf must match any ubuf already
			 * attached to the tail skb.
			 */
			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
				return -EINVAL;

			/* Leave uarg NULL if can't zerocopy, callers should
			 * be able to handle it.
			 */
			if ((rt->dst.dev->features & NETIF_F_SG) &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
				uarg = msg->msg_ubuf;
			}
		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb),
						    false);
			if (!uarg)
				return -ENOBUFS;
			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
			if (rt->dst.dev->features & NETIF_F_SG &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
			} else {
				/* Device can't do zerocopy: copy the data but
				 * keep the uarg for completion notification.
				 */
				uarg_to_msgzc(uarg)->zerocopy = 0;
				skb_zcopy_set(skb, uarg, &extra_uref);
			}
		}
	} else if ((flags & MSG_SPLICE_PAGES) && length) {
		if (inet_test_bit(HDRINCL, sk))
			return -EPERM;
		if (rt->dst.dev->features & NETIF_F_SG &&
		    getfrag == ip_generic_getfrag)
			/* We need an empty buffer to attach stuff to */
			paged = true;
		else
			flags &= ~MSG_SPLICE_PAGES;
	}

	/* Reserve a timestamp key for this datagram, either the explicit
	 * per-call id or the next value of the socket counter.
	 */
	if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
	    READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
		if (cork->flags & IPCORK_TS_OPT_ID) {
			tskey = cork->ts_opt_id;
		} else {
			tskey = atomic_inc_return(&sk->sk_tskey) - 1;
			hold_tskey = true;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octects, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen, alloc_extra;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			alloc_extra = hh_len;
			alloc_extra += dst_exthdrlen;
			alloc_extra += rt->dst.trailer_len;

			/* We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloc_extra += sizeof(struct frag_hdr);

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else if (!paged &&
				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
				  !(rt->dst.dev->features & NETIF_F_SG)))
				alloclen = fraglen;
			else {
				/* Paged path: allocate the headers only; data
				 * goes into page fragments.
				 */
				alloclen = fragheaderlen + transhdrlen;
				pagedlen = datalen - transhdrlen;
			}
			alloclen += alloc_extra;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			fraglen = datalen + fragheaderlen;

			copy = datalen - transhdrlen - fraggap - pagedlen;
			/* [!] NOTE: copy may be negative if pagedlen>0
			 * because then the equation may reduces to -fraggap.
			 */
			if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk, alloclen,
							  (flags & MSG_DONTWAIT), &err);
			} else {
				/* Later fragments are bounded by send buffer
				 * space tracked via wmem_alloc_delta.
				 */
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 * Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/* Move the tail of the previous skb that
				 * overflowed maxfraglen into this fragment,
				 * fixing up both checksums.
				 */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap);
				skb_prev->csum =
					csum_sub(skb_prev->csum,
						 skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
					    from, data + transhdrlen, offset,
					    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			} else if (flags & MSG_SPLICE_PAGES) {
				copy = 0;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features & NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			/* Linear tail room available: copy straight in. */
			unsigned int off;

			off = skb->len;
			if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
					    from, skb_put(skb, copy),
					    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (flags & MSG_SPLICE_PAGES) {
			struct msghdr *msg = from;

			err = -EIO;
			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
				goto error;

			err = skb_splice_from_iter(skb, &msg->msg_iter, copy);
			if (err < 0)
				goto error;
			copy = err;
			wmem_alloc_delta += copy;
		} else if (!zc) {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			skb_zcopy_downgrade_managed(skb);
			if (!skb_can_coalesce(skb, i,
					      pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				/* Start a fresh (empty) frag on this page. */
				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
					    from,
					    page_address(pfrag->page) + pfrag->offset,
					    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	net_zcopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	if (hold_tskey)
		atomic_dec(&sk->sk_tskey);
	return err;
}

/* Queue data on the socket write queue for later transmission as one
 * datagram; sets up the cork state on the first call of a sequence.
 */
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, size_t length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	int exthdrlen;
	int err;

	if (flags & MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		dst_hold(&rt->dst);
		err = ip6_setup_cork(sk, &inet->cork,
				     ipc6, rt);
		if (err)
			return err;

		inet->cork.fl.u.ip6 = *fl6;
		exthdrlen = (ipc6->opt ?
			     ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		/* Cork already set up: options were counted on first call. */
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
				 sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

/* Move the cork's dst reference onto @skb without taking an extra hold. */
static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
{
	struct dst_entry *dst = cork->base.dst;

	cork->base.dst = NULL;
	skb_dst_set(skb, dst);
}

/* Free the deep-copied tx options and drop the dst held by the cork. */
static void ip6_cork_release(struct inet_cork_full *cork)
{
	struct inet6_cork *v6_cork = &cork->base6;

	if (unlikely(v6_cork->opt)) {
		struct ipv6_txoptions *opt = v6_cork->opt;

		kfree(opt->dst0opt);
		kfree(opt->dst1opt);
		kfree(opt->hopopt);
		kfree(opt->srcrt);
		kfree(opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
	}
}

/* Collapse the queued skbs into one datagram (frag_list), push extension
 * headers and the IPv6 header, set priority/mark/delivery time and stats,
 * then release the cork.  Returns the finished skb, or NULL if the queue
 * was empty.
 */
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr *final_dst;
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt;
	struct rt6_info *rt = dst_rt6_info(cork->base.dst);
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);
	__skb_pull(skb, skb_network_header_len(skb));

	final_dst = &fl6->daddr;
	opt = cork->base6.opt;
	if (unlikely(opt)) {
		/* Push fragmentable then non-fragmentable extension headers;
		 * a routing header may rewrite final_dst.
		 */
		if (opt->opt_flen)
			proto = ipv6_push_frag_opts(skb, opt, proto);
		if (opt->opt_nflen)
			proto = ipv6_push_nfrag_opts(skb, opt, proto,
						     &final_dst, &fl6->saddr);
	}
	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, cork->base6.tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, sk), fl6));
	hdr->hop_limit = cork->base6.hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = cork->base.priority;
	skb->mark = cork->base.mark;
	if (sk_is_tcp(sk))
		skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC);
	else
		skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid);

	ip6_cork_steal_dst(skb, cork);
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
	if (unlikely(proto == IPPROTO_ICMPV6)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
		u8 icmp6_type;

		if (sk->sk_socket->type == SOCK_RAW &&
		    !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH))
			icmp6_type = fl6->fl6_icmp_type;
		else
			icmp6_type = icmp6_hdr(skb)->icmp6_type;
		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork);
out:
	return skb;
}

/* Transmit a fully built skb via ip6_local_out(), converting qdisc
 * return codes and bumping drop stats on error.  Consumes the skb.
 */
int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
	int err;

	rcu_read_lock();
	err = ip6_local_out(net,
			    skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	rcu_read_unlock();
	return err;
}

/* Finish the pending corked data into one skb and send it. */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

/* Drop everything on @queue (counting discards) and release the cork. */
static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork);
}

/* Discard any corked-but-unsent data on the socket write queue. */
void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

/* One-shot append+make: build a complete datagram skb from @from on a
 * private queue using the caller-supplied @cork, without touching the
 * socket write queue.  Returns the skb, NULL for MSG_PROBE, or an
 * ERR_PTR on failure.
 */
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
			     unsigned int flags, struct inet_cork_full *cork)
{
	int exthdrlen = (ipc6->opt ?
			 ipc6->opt->opt_flen : 0);
	struct sk_buff_head queue;
	int err;

	if (flags & MSG_PROBE) {
		/* Probe only: the caller's dst reference is still dropped. */
		dst_release(&rt->dst);
		return NULL;
	}

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	cork->base6.opt = NULL;
	err = ip6_setup_cork(sk, cork, ipc6, rt);
	if (err) {
		ip6_cork_release(cork);
		return ERR_PTR(err);
	}

	err = __ip6_append_data(sk, &queue, cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork);
}