// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/gso.h>
#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>

/*
 * Last step of the IPv6 output path for one packet.
 *
 * Ensures there is room for the link-layer header, handles the multicast
 * special cases (loopback copy, hop limit 0, node-local scope), lets an
 * lwtunnel xmit redirect take the packet, and finally looks up (or creates)
 * the next-hop neighbour and passes the skb to neigh_output().
 *
 * Return: the neigh_output() result on the normal path; the lwtunnel_xmit()
 * result when it is not LWTUNNEL_XMIT_CONTINUE; 0 when a multicast packet is
 * dropped here; -ENOMEM when the head cannot be expanded; -EINVAL when the
 * neighbour entry cannot be created.
 */
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst_dev_rcu(dst);
	struct inet6_dev *idev = ip6_dst_idev(dst);
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	const struct in6_addr *daddr, *nexthop;
	struct ipv6hdr *hdr;
	struct neighbour *neigh;
	int ret;

	/* Be paranoid, rather than too clever. */
	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
		/* idev stays alive because we hold rcu_read_lock(). */
		skb = skb_expand_head(skb, hh_len);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			return -ENOMEM;
		}
	}

	/* Read the header only now: skb may have been replaced above. */
	hdr = ipv6_hdr(skb);
	daddr = &hdr->daddr;
	if (unlikely(ipv6_addr_is_multicast(daddr))) {
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			/* The clone was looped back; a hop limit of 0 means
			 * the original is not sent out.
			 */
			if (hdr->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
		/* Node-local (or narrower) scope is only sent on loopback. */
		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res != LWTUNNEL_XMIT_CONTINUE)
			return res;
	}

	IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);

	nexthop = rt6_nexthop(dst_rt6_info(dst), daddr);
	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);

	if (IS_ERR_OR_NULL(neigh)) {
		/* No entry yet: try to create one for this next hop. */
		if (unlikely(!neigh))
			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
		if (IS_ERR(neigh)) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
			return -EINVAL;
		}
	}
	sock_confirm_neigh(skb, neigh);
	ret = neigh_output(neigh, skb, false);
	return ret;
}

/*
 * Slow path for a GSO skb that failed skb_gso_validate_network_len().
 *
 * The skb is segmented with all GSO feature bits masked out, the original
 * is consumed, and every resulting segment is sent on its own: segments
 * longer than @mtu go through ip6_fragment(), the others straight to
 * ip6_finish_output2().
 *
 * Return: -ENOMEM when skb_gso_segment() yields an error or NULL (the skb
 * is freed); otherwise the first non-zero error returned while sending a
 * segment, or 0 when every segment was sent without error.
 */
static int
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
				    struct sk_buff *skb, unsigned int mtu)
{
	struct sk_buff *segs, *nskb;
	netdev_features_t features;
	int ret = 0;

	/* Please see corresponding comment in ip_finish_output_gso
	 * describing the cases where GSO segment length exceeds the
	 * egress MTU.
	 */
	features = netif_skb_features(skb);
	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
	if (IS_ERR_OR_NULL(segs)) {
		kfree_skb(skb);
		return -ENOMEM;
	}

	consume_skb(skb);

	skb_list_walk_safe(segs, segs, nskb) {
		int err;

		skb_mark_not_on_list(segs);
		/* Last GSO segment can be smaller than gso_size (and MTU).
		 * Adding a fragment header would produce an "atomic fragment",
		 * which is considered harmful (RFC-8021). Avoid that.
		 */
		err = segs->len > mtu ?
			ip6_fragment(net, sk, segs, ip6_finish_output2) :
			ip6_finish_output2(net, sk, segs);
		/* Keep sending the remaining segments; report the first error. */
		if (err && ret == 0)
			ret = err;
	}

	return ret;
}

/*
 * Output a GSO skb: if skb_gso_validate_network_len() rejects it for @mtu,
 * take the software-segmentation slow path, otherwise send it as is.
 */
static int ip6_finish_output_gso(struct net *net, struct sock *sk,
				 struct sk_buff *skb, unsigned int mtu)
{
	if (unlikely(!skb_gso_validate_network_len(skb, mtu)))
		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);

	return ip6_finish_output2(net, sk, skb);
}

/*
 * Pick the transmit variant for @skb: re-run dst_output() when the dst now
 * carries an xfrm state, the GSO path for GSO skbs, ip6_fragment() when the
 * packet is longer than the dst MTU or than a recorded frag_max_size, and
 * plain ip6_finish_output2() otherwise.
 */
static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	unsigned int mtu;

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IP6CB(skb)->flags |= IP6SKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	mtu = ip6_skb_dst_mtu(skb);
	if (skb_is_gso(skb))
		return ip6_finish_output_gso(net, sk, skb, mtu);

	if (unlikely(skb->len > mtu ||
		     (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);

	return ip6_finish_output2(net, sk, skb);
}
210 211 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) 212 { 213 int ret; 214 215 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); 216 switch (ret) { 217 case NET_XMIT_SUCCESS: 218 case NET_XMIT_CN: 219 return __ip6_finish_output(net, sk, skb) ? : ret; 220 default: 221 kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS); 222 return ret; 223 } 224 } 225 226 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) 227 { 228 struct dst_entry *dst = skb_dst(skb); 229 struct net_device *dev, *indev = skb->dev; 230 struct inet6_dev *idev; 231 int ret; 232 233 skb->protocol = htons(ETH_P_IPV6); 234 rcu_read_lock(); 235 dev = dst_dev_rcu(dst); 236 idev = ip6_dst_idev(dst); 237 skb->dev = dev; 238 239 if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) { 240 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); 241 rcu_read_unlock(); 242 kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED); 243 return 0; 244 } 245 246 ret = NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, 247 net, sk, skb, indev, dev, 248 ip6_finish_output, 249 !(IP6CB(skb)->flags & IP6SKB_REROUTED)); 250 rcu_read_unlock(); 251 return ret; 252 } 253 EXPORT_SYMBOL(ip6_output); 254 255 bool ip6_autoflowlabel(struct net *net, const struct sock *sk) 256 { 257 if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk)) 258 return ip6_default_np_autolabel(net); 259 return inet6_test_bit(AUTOFLOWLABEL, sk); 260 } 261 262 /* 263 * xmit an sk_buff (used by TCP and SCTP) 264 * Note : socket lock is not held for SYNACK packets, but might be modified 265 * by calls to skb_set_owner_w() and ipv6_local_error(), 266 * which are using proper atomic operations or spinlocks. 
 *
 * Builds the IPv6 header (and any extension headers from @opt) in front of
 * the payload already in @skb and sends the packet through the LOCAL_OUT
 * hook to dst_output().
 *
 * Return: the NF_HOOK() result on the send path; 0 when l3mdev_ip6_out()
 * returns NULL; -ENOBUFS when the headroom cannot be expanded; -EMSGSIZE
 * (after ipv6_local_error() and freeing the skb) when the packet is longer
 * than the dst MTU and neither skb->ignore_df nor GSO applies.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct inet6_dev *idev = ip6_dst_idev(dst);
	struct net *net = sock_net(sk);
	unsigned int head_room;
	struct net_device *dev;
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int ret, hlimit = -1;
	u32 mtu;

	rcu_read_lock();

	dev = dst_dev_rcu(dst);
	/* Room needed in front of the payload: L2 + IPv6 header + options. */
	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(head_room > skb_headroom(skb))) {
		/* idev stays alive while we hold rcu_read_lock(). */
		skb = skb_expand_head(skb, head_room);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			ret = -ENOBUFS;
			goto unlock;
		}
	}

	if (unlikely(opt)) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			proto = ipv6_push_frag_opts(skb, opt, proto);

		/* May replace first_hop (passed by reference). */
		if (opt->opt_nflen)
			proto = ipv6_push_nfrag_opts(skb, opt, proto,
						     &first_hop,
						     &fl6->saddr);
	}

	/* Lengths above IPV6_MAXPLEN are written as payload_len 0 below. */
	if (unlikely(seg_len > IPV6_MAXPLEN))
		seg_len = 0;

	__skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	/* A negative socket hop limit means: use the one from the route. */
	if (np)
		hlimit = READ_ONCE(np->hop_limit);
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, sk), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = priority;
	skb->mark = mark;

	mtu = dst6_mtu(dst);
	if (likely((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb))) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb)) {
			ret = 0;
			goto unlock;
		}

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		ret = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			      net, (struct sock *)sk, skb, NULL, dev,
			      dst_output);
		goto unlock;
	}

	/* Too big and not allowed to fragment: report and drop. */
	ret = -EMSGSIZE;
	skb->dev = dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
	kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
unlock:
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(ip6_xmit);

/*
 * Deliver a Router Alert packet to the raw sockets registered on
 * ip6_ra_chain for selector @sel.
 *
 * The chain is walked under read_lock(&ip6_ra_lock). A socket matches when
 * its selector equals @sel and it is either unbound or bound to the ifindex
 * of skb->dev; sockets with RTALERT_ISOLATE set are skipped when they live
 * in another netns than skb->dev. Every matching socket but the last gets a
 * clone via rawv6_rcv(); the last one gets @skb itself.
 *
 * Return: 1 when @skb was handed to a socket, 0 when nothing matched (the
 * caller still owns @skb).
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {

			if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
			/* Deliver to the previous match with a clone, so the
			 * original skb is kept for the final match.
			 */
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

/*
 * Classify a packet whose destination is an NDP-proxied address.
 *
 * Return: 1 when the packet is one of the listed ICMPv6 neighbour discovery
 * messages and should go to the input path; -1 when the destination is
 * link-local (dst_link_failure() has been called, the caller drops it);
 * 0 in all other cases, including when the headers cannot be parsed or
 * pulled.
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	/* Find the offset of the first non-extension header. */
	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		/* Only the first ICMPv6 byte (the type) is needed. */
		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	/* NOTE(review): hdr was read before pskb_may_pull() above; confirm it
	 * cannot be left stale if that call reallocates the skb head.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

/*
 * okfn of the FORWARD hook: consume packets already marked
 * offload_l3_fwd_mark (CONFIG_NET_SWITCHDEV only), otherwise clear the
 * timestamp and pass the skb to dst_output().
 */
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
#ifdef CONFIG_NET_SWITCHDEV
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	skb_clear_tstamp(skb);
	return dst_output(net, sk, skb);
}

/*
 * Decide whether a forwarded @skb must be rejected as too big for @mtu.
 *
 * Checks are applied in this order: fits the MTU -> false; recorded
 * frag_max_size above the MTU -> true; ignore_df set -> false; GSO skb whose
 * segments validate against the MTU -> false; anything else -> true.
 */
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}

/*
 * Forward an IPv6 packet that is not addressed to this host.
 *
 * Return: the NF_HOOK() result when the packet reaches the FORWARD hook;
 * 0 when it was taken by a Router Alert socket; the ip6_input() result for
 * proxied NDP messages; -ETIMEDOUT when the hop limit is exhausted;
 * -EMSGSIZE when the packet is too big for the outgoing MTU; -EINVAL for
 * every other drop. On all error returns the skb has been freed.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst_dev(dst));
	struct net_device *dev;
	struct inet6_dev *idev;
	SKB_DR(reason);
	u32 mtu;

	/* idev belongs to the device the packet arrived on (IP6CB iif). */
	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	if (!READ_ONCE(net->ipv6.devconf_all->forwarding) &&
	    (!idev || !READ_ONCE(idev->cnf.force_forwarding)))
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!READ_ONCE(net->ipv6.devconf_all->disable_policy) &&
	    (!idev || !READ_ONCE(idev->cnf.disable_policy)) &&
	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0) {
			/* It's tempting to decrease the hop limit
			 * here by 1, as we do at the end of the
			 * function too.
			 *
			 * But that would be incorrect, as proxying is
			 * not forwarding.  The ip6_input function
			 * will handle this packet locally, and it
			 * depends on the hop limit being unchanged.
			 *
			 * One example is the NDP hop limit, that
			 * always has to stay 255, but other would be
			 * similar checks around RA packets, where the
			 * user can even change the desired limit.
			 */
			return ip6_input(skb);
		} else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		SKB_DR_SET(reason, XFRM_POLICY);
		goto drop;
	}
	/* Re-read the dst: it is fetched again after xfrm6_route_forward(). */
	dst = skb_dst(skb);
	dev = dst_dev(dst);
	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = dst_rt6_info(dst);
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		rcu_read_lock();
		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		rcu_read_unlock();
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);

	/* Never use an MTU below the IPv6 minimum. */
	mtu = ip6_dst_mtu_maybe_forward(dst, true);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (unlikely(ip6_pkt_too_big(skb, mtu))) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	/* Header pointer is re-read after skb_cow(). */
	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
	SKB_DR_SET(reason, IP_INADDRERRORS);
drop:
	kfree_skb_reason(skb, reason);
	return -EINVAL;
}

/*
 * Copy per-packet metadata from @from to a fragment @to: packet type,
 * priority, protocol, a new reference on the dst, device, mark, hash,
 * tc_index (CONFIG_NET_SCHED), netfilter state, skb extensions and secmark.
 */
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	/* Release whatever dst @to held before taking a reference on @from's. */
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_ext_copy(to, from);
	skb_copy_secmark(to, from);
}

/*
 * Start fast-path fragmentation of an skb that carries a frag_list.
 *
 * Saves a copy of the first @hlen header bytes in @iter, detaches the
 * frag_list into @iter, and turns @skb itself into the first fragment by
 * inserting a fragment header (offset 0, IP6_MF set) after those @hlen
 * bytes and fixing up its length fields.
 *
 * Return: 0 on success, -ENOMEM when the header copy cannot be allocated.
 * On success the caller owns iter->tmp_hdr.
 */
int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
		      u8 nexthdr, __be32 frag_id,
		      struct ip6_fraglist_iter *iter)
{
	unsigned int first_len;
	struct frag_hdr *fh;

	/* BUILD HEADER */
	*prevhdr = NEXTHDR_FRAGMENT;
	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
	if (!iter->tmp_hdr)
		return -ENOMEM;

	iter->frag = skb_shinfo(skb)->frag_list;
	skb_frag_list_init(skb);

	iter->offset = 0;
	iter->hlen = hlen;
	iter->frag_id = frag_id;
	iter->nexthdr = nexthdr;

	/* Open a gap for the fragment header right after the hlen bytes. */
	__skb_pull(skb, hlen);
	fh = __skb_push(skb, sizeof(struct frag_hdr));
	__skb_push(skb, hlen);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

	fh->nexthdr = nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(IP6_MF);
	fh->identification = frag_id;

	/* The head skb no longer accounts for the detached frag_list. */
	first_len = skb_pagelen(skb);
	skb->data_len = first_len - skb_headlen(skb);
	skb->len = first_len;
	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

	return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);

/*
 * Turn iter->frag into the next fragment: prepend the saved headers and a
 * fragment header, advance iter->offset by the payload of the fragment
 * just built (@skb), and copy @skb's metadata. IP6_MF is set when another
 * fragment follows in the list.
 */
void ip6_fraglist_prepare(struct sk_buff *skb,
			  struct ip6_fraglist_iter *iter)
{
	struct sk_buff *frag = iter->frag;
	unsigned int hlen = iter->hlen;
	struct frag_hdr *fh;

	frag->ip_summed = CHECKSUM_NONE;
	skb_reset_transport_header(frag);
	fh = __skb_push(frag, sizeof(struct frag_hdr));
	__skb_push(frag, hlen);
	skb_reset_network_header(frag);
	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
	fh->nexthdr = iter->nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(iter->offset);
	if (frag->next)
		fh->frag_off |= htons(IP6_MF);
	fh->identification = iter->frag_id;
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
	ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);

/*
 * Initialise @state for slow-path fragmentation of @skb with
 * ip6_frag_next(). @mtu is stored unchanged and later used as the
 * per-fragment payload limit.
 */
void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
{
	state->prevhdr = prevhdr;
	state->nexthdr = nexthdr;
	state->frag_id = frag_id;

	state->hlen = hlen;
	state->mtu = mtu;

	state->left = skb->len - hlen;	/* Payload bytes still to be sent */
	state->ptr = hlen;		/* Where to start from */

	state->hroom = hdr_room;
	state->troom = needed_tailroom;

	state->offset = 0;
}
EXPORT_SYMBOL(ip6_frag_init);

/*
 * Allocate and fill the next slow-path fragment of @skb.
 *
 * The fragment carries a copy of the first state->hlen header bytes, a
 * fragment header, and up to state->mtu payload bytes copied from @skb at
 * state->ptr. @state is advanced past the copied data.
 *
 * Return: the new fragment, or ERR_PTR(-ENOMEM) when allocation fails.
 */
struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
	struct sk_buff *frag;
	struct frag_hdr *fh;
	unsigned int len;

	len = state->left;
	/* IF: it doesn't fit, use 'mtu' - the data space left */
	if (len > state->mtu)
		len = state->mtu;
	/* IF: we are not sending up to and including the packet end
	   then align the next start on an eight byte boundary */
	if (len < state->left)
		len &= ~7;

	/* Allocate buffer */
	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
			 state->hroom + state->troom, GFP_ATOMIC);
	if (!frag)
		return ERR_PTR(-ENOMEM);

	/*
	 *	Set up data on packet
	 */

	ip6_copy_metadata(frag, skb);
	skb_reserve(frag, state->hroom);
	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
	skb_reset_network_header(frag);
	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
	frag->transport_header = (frag->network_header + state->hlen +
				  sizeof(struct frag_hdr));

	/*
	 *	Charge the memory for the fragment to any owner
	 *	it might possess
	 */
	if (skb->sk)
		skb_set_owner_w(frag, skb->sk);

	/*
	 *	Copy the packet header into the new buffer.
	 */
	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

	/* Patch the copied header's "next header" byte, found at the same
	 * offset prevhdr has in the original packet.
	 */
	fragnexthdr_offset = skb_network_header(frag);
	fragnexthdr_offset += prevhdr - skb_network_header(skb);
	*fragnexthdr_offset = NEXTHDR_FRAGMENT;

	/*
	 *	Build fragment header.
	 */
	fh->nexthdr = state->nexthdr;
	fh->reserved = 0;
	fh->identification = state->frag_id;

	/*
	 *	Copy a block of the IP datagram.
	 */
	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
			     len));
	state->left -= len;

	fh->frag_off = htons(state->offset);
	if (state->left > 0)
		fh->frag_off |= htons(IP6_MF);
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

	state->ptr += len;
	state->offset += len;

	return frag;
}
EXPORT_SYMBOL(ip6_frag_next);

/*
 * Fragment @skb and pass every fragment to @output.
 *
 * When @skb has a frag_list whose geometry already matches the fragment
 * layout, the list members are sent as fragments in place (fast path);
 * otherwise each fragment is allocated and copied (slow path).
 *
 * Return: 0 on success; -EMSGSIZE (after sending ICMPV6_PKT_TOOBIG) when
 * the packet may not or cannot be fragmented for the MTU; otherwise the
 * error from header parsing, checksumming, allocation or @output.
 */
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	u8 tstamp_type = skb->tstamp_type;
	struct ip6_frag_state state;
	unsigned int mtu, hlen, nexthdr_offset;
	ktime_t tstamp = skb->tstamp;
	int hroom, err = 0;
	__be32 frag_id;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	/* Keep an offset: prevhdr is recomputed after skb_checksum_help(). */
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np) {
		u32 frag_size = READ_ONCE(np->frag_size);

		if (frag_size && frag_size < mtu)
			mtu = frag_size;
	}
	/* Need room for the headers plus at least 8 bytes of payload. */
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	/* From here on mtu is the payload space per fragment. */
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct ip6_fraglist_iter iter;
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
					&iter);
		if (err < 0)
			goto fail;

		/* We prevent @rt from being freed. */
		rcu_read_lock();

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (iter.frag)
				ip6_fraglist_prepare(skb, &iter);

			skb_set_delivery_time(skb, tstamp, tstamp_type);
			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !iter.frag)
				break;

			skb = ip6_fraglist_next(&iter);
		}

		kfree(iter.tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			rcu_read_unlock();
			return 0;
		}

		/* Free the fragments that were not handed to @output. */
		kfree_skb_list(iter.frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		rcu_read_unlock();
		return err;

slow_path_clean:
		/* Undo the socket/truesize accounting applied to the list
		 * members visited before the one that failed the checks.
		 */
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	/*
	 *	Fragment the datagram.
	 */

	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
		      &state);

	/*
	 *	Keep copying data until we run out.
	 */

	while (state.left > 0) {
		frag = ip6_frag_next(skb, &state);
		if (IS_ERR(frag)) {
			err = PTR_ERR(frag);
			goto fail;
		}

		/*
		 *	Put this fragment into the sending queue.
		 */
		skb_set_delivery_time(frag, tstamp, tstamp_type);
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

/*
 * Check one address of a flow against a cached route key.
 *
 * Return: non-zero when the cached route must not be reused, i.e. the key
 * is not a /128 entry equal to @fl_addr and, in addition, there is no
 * @addr_cache or it differs from @fl_addr; 0 otherwise.
 */
static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

/*
 * Validate a socket's cached @dst against the flow @fl6.
 *
 * Return: @dst when it is still usable (or NULL when @dst was NULL);
 * otherwise the reference on @dst is released and NULL is returned.
 */
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = dst_rt6_info(dst);
	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr,
			 np->daddr_cache ? &sk->sk_v6_daddr : NULL) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr,
			 np->saddr_cache ? &np->saddr : NULL) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

/*
 * Common tail of the dst lookup helpers: resolve a route for @fl6 into
 * *@dst, choosing a source address when the flow has none.
 *
 * A non-NULL *@dst passed in is kept and only validated, unless the flow
 * has no source address, in which case it is overwritten by a new lookup.
 *
 * Return: 0 with *@dst set on success; otherwise a negative error with
 * the dst released and *@dst set to NULL. -EAFNOSUPPORT is returned when
 * the source is v4-mapped but the destination is neither v4-mapped nor the
 * unspecified address.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr)) {
		struct fib6_info *from;
		struct rt6_info *rt;

		*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : dst_rt6_info(*dst);

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
					  fl6->flowi6_l3mdev,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if ((*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = dst_rt6_info(*dst);
	rcu_read_lock();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	/* err is only a local flag here; it is reset or overwritten below. */
	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net:
Network namespace to perform lookup in 1225 * @sk: socket which provides route info 1226 * @dst: pointer to dst_entry * for result 1227 * @fl6: flow to lookup 1228 * 1229 * This function performs a route lookup on the given flow. 1230 * 1231 * It returns zero on success, or a standard errno code on error. 1232 */ 1233 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst, 1234 struct flowi6 *fl6) 1235 { 1236 *dst = NULL; 1237 return ip6_dst_lookup_tail(net, sk, dst, fl6); 1238 } 1239 EXPORT_SYMBOL_GPL(ip6_dst_lookup); 1240 1241 /** 1242 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec 1243 * @net: Network namespace to perform lookup in 1244 * @sk: socket which provides route info 1245 * @fl6: flow to lookup 1246 * @final_dst: final destination address for ipsec lookup 1247 * 1248 * This function performs a route lookup on the given flow. 1249 * 1250 * It returns a valid dst pointer on success, or a pointer encoded 1251 * error code. 1252 */ 1253 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6, 1254 const struct in6_addr *final_dst) 1255 { 1256 struct dst_entry *dst = NULL; 1257 int err; 1258 1259 err = ip6_dst_lookup_tail(net, sk, &dst, fl6); 1260 if (err) 1261 return ERR_PTR(err); 1262 if (final_dst) 1263 fl6->daddr = *final_dst; 1264 1265 return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0); 1266 } 1267 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow); 1268 1269 /** 1270 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow 1271 * @sk: socket which provides the dst cache and route info 1272 * @fl6: flow to lookup 1273 * @final_dst: final destination address for ipsec lookup 1274 * @connected: whether @sk is connected or not 1275 * 1276 * This function performs a route lookup on the given flow with the 1277 * possibility of using the cached route in the socket if it is valid. 1278 * It will take the socket dst lock when operating on the dst cache. 
1279 * As a result, this function can only be used in process context. 1280 * 1281 * In addition, for a connected socket, cache the dst in the socket 1282 * if the current cache is not valid. 1283 * 1284 * It returns a valid dst pointer on success, or a pointer encoded 1285 * error code. 1286 */ 1287 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, 1288 const struct in6_addr *final_dst, 1289 bool connected) 1290 { 1291 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie); 1292 1293 dst = ip6_sk_dst_check(sk, dst, fl6); 1294 if (dst) 1295 return dst; 1296 1297 dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst); 1298 if (connected && !IS_ERR(dst)) 1299 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6); 1300 1301 return dst; 1302 } 1303 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow); 1304 1305 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src, 1306 gfp_t gfp) 1307 { 1308 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL; 1309 } 1310 1311 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src, 1312 gfp_t gfp) 1313 { 1314 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL; 1315 } 1316 1317 static void ip6_append_data_mtu(unsigned int *mtu, 1318 int *maxfraglen, 1319 unsigned int fragheaderlen, 1320 struct sk_buff *skb, 1321 struct rt6_info *rt, 1322 unsigned int orig_mtu) 1323 { 1324 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) { 1325 if (!skb) { 1326 /* first fragment, reserve header_len */ 1327 *mtu = orig_mtu - rt->dst.header_len; 1328 1329 } else { 1330 /* 1331 * this fragment is not first, the headers 1332 * space is regarded as data space. 
1333 */ 1334 *mtu = orig_mtu; 1335 } 1336 *maxfraglen = ((*mtu - fragheaderlen) & ~7) 1337 + fragheaderlen - sizeof(struct frag_hdr); 1338 } 1339 } 1340 1341 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, 1342 struct ipcm6_cookie *ipc6, 1343 struct rt6_info *rt) 1344 { 1345 struct ipv6_txoptions *nopt, *opt = ipc6->opt; 1346 struct inet6_cork *v6_cork = &cork->base6; 1347 struct ipv6_pinfo *np = inet6_sk(sk); 1348 unsigned int mtu, frag_size; 1349 1350 /* callers pass dst together with a reference, set it first so 1351 * ip6_cork_release() can put it down even in case of an error. 1352 */ 1353 cork->base.dst = &rt->dst; 1354 1355 /* 1356 * setup for corking 1357 */ 1358 if (unlikely(opt)) { 1359 if (WARN_ON(v6_cork->opt)) 1360 return -EINVAL; 1361 1362 nopt = v6_cork->opt = kzalloc_obj(*opt, sk->sk_allocation); 1363 if (unlikely(!nopt)) 1364 return -ENOBUFS; 1365 1366 nopt->tot_len = sizeof(*opt); 1367 nopt->opt_flen = opt->opt_flen; 1368 nopt->opt_nflen = opt->opt_nflen; 1369 1370 nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation); 1371 if (opt->dst0opt && !nopt->dst0opt) 1372 return -ENOBUFS; 1373 1374 nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation); 1375 if (opt->dst1opt && !nopt->dst1opt) 1376 return -ENOBUFS; 1377 1378 nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation); 1379 if (opt->hopopt && !nopt->hopopt) 1380 return -ENOBUFS; 1381 1382 nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation); 1383 if (opt->srcrt && !nopt->srcrt) 1384 return -ENOBUFS; 1385 1386 /* need source address above miyazawa*/ 1387 } 1388 v6_cork->hop_limit = ipc6->hlimit; 1389 v6_cork->tclass = ipc6->tclass; 1390 v6_cork->dontfrag = ipc6->dontfrag; 1391 if (rt->dst.flags & DST_XFRM_TUNNEL) 1392 mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ? 1393 READ_ONCE(rt->dst.dev->mtu) : dst6_mtu(&rt->dst); 1394 else 1395 mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ? 
1396 READ_ONCE(rt->dst.dev->mtu) : dst6_mtu(xfrm_dst_path(&rt->dst)); 1397 1398 frag_size = READ_ONCE(np->frag_size); 1399 if (frag_size && frag_size < mtu) 1400 mtu = frag_size; 1401 1402 cork->base.fragsize = mtu; 1403 cork->base.gso_size = ipc6->gso_size; 1404 cork->base.tx_flags = 0; 1405 cork->base.mark = ipc6->sockc.mark; 1406 cork->base.priority = ipc6->sockc.priority; 1407 sock_tx_timestamp(sk, &ipc6->sockc, &cork->base.tx_flags); 1408 if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) { 1409 cork->base.flags |= IPCORK_TS_OPT_ID; 1410 cork->base.ts_opt_id = ipc6->sockc.ts_opt_id; 1411 } 1412 cork->base.length = 0; 1413 cork->base.transmit_time = ipc6->sockc.transmit_time; 1414 1415 return 0; 1416 } 1417 1418 static int __ip6_append_data(struct sock *sk, 1419 struct sk_buff_head *queue, 1420 struct inet_cork_full *cork_full, 1421 struct page_frag *pfrag, 1422 int getfrag(void *from, char *to, int offset, 1423 int len, int odd, struct sk_buff *skb), 1424 void *from, size_t length, int transhdrlen, 1425 unsigned int flags) 1426 { 1427 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu; 1428 struct inet6_cork *v6_cork = &cork_full->base6; 1429 struct inet_cork *cork = &cork_full->base; 1430 struct flowi6 *fl6 = &cork_full->fl.u.ip6; 1431 struct sk_buff *skb, *skb_prev = NULL; 1432 struct ubuf_info *uarg = NULL; 1433 int exthdrlen = 0; 1434 int dst_exthdrlen = 0; 1435 int hh_len; 1436 int copy; 1437 int err; 1438 int offset = 0; 1439 bool zc = false; 1440 u32 tskey = 0; 1441 struct rt6_info *rt = dst_rt6_info(cork->dst); 1442 bool paged, hold_tskey = false, extra_uref = false; 1443 struct ipv6_txoptions *opt = v6_cork->opt; 1444 int csummode = CHECKSUM_NONE; 1445 unsigned int maxnonfragsize, headersize; 1446 unsigned int wmem_alloc_delta = 0; 1447 1448 skb = skb_peek_tail(queue); 1449 if (!skb) { 1450 exthdrlen = opt ? 
opt->opt_flen : 0; 1451 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len; 1452 } 1453 1454 paged = !!cork->gso_size; 1455 mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize; 1456 orig_mtu = mtu; 1457 1458 hh_len = LL_RESERVED_SPACE(rt->dst.dev); 1459 1460 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + 1461 (opt ? opt->opt_nflen : 0); 1462 1463 headersize = sizeof(struct ipv6hdr) + 1464 (opt ? opt->opt_flen + opt->opt_nflen : 0) + 1465 rt->rt6i_nfheader_len; 1466 1467 if (mtu <= fragheaderlen || 1468 ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr)) 1469 goto emsgsize; 1470 1471 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - 1472 sizeof(struct frag_hdr); 1473 1474 /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit 1475 * the first fragment 1476 */ 1477 if (headersize + transhdrlen > mtu) 1478 goto emsgsize; 1479 1480 if (cork->length + length > mtu - headersize && v6_cork->dontfrag && 1481 (sk->sk_protocol == IPPROTO_UDP || 1482 sk->sk_protocol == IPPROTO_ICMPV6 || 1483 sk->sk_protocol == IPPROTO_RAW)) { 1484 ipv6_local_rxpmtu(sk, fl6, mtu - headersize + 1485 sizeof(struct ipv6hdr)); 1486 goto emsgsize; 1487 } 1488 1489 if (ip6_sk_ignore_df(sk)) 1490 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN; 1491 else 1492 maxnonfragsize = mtu; 1493 1494 if (cork->length + length > maxnonfragsize - headersize) { 1495 emsgsize: 1496 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0); 1497 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu); 1498 return -EMSGSIZE; 1499 } 1500 1501 /* CHECKSUM_PARTIAL only with no extension headers and when 1502 * we are not going to fragment 1503 */ 1504 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP && 1505 headersize == sizeof(struct ipv6hdr) && 1506 length <= mtu - headersize && 1507 (!(flags & MSG_MORE) || cork->gso_size) && 1508 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) 1509 csummode = CHECKSUM_PARTIAL; 1510 1511 if ((flags & 
MSG_ZEROCOPY) && length) { 1512 struct msghdr *msg = from; 1513 1514 if (getfrag == ip_generic_getfrag && msg->msg_ubuf) { 1515 if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb)) 1516 return -EINVAL; 1517 1518 /* Leave uarg NULL if can't zerocopy, callers should 1519 * be able to handle it. 1520 */ 1521 if ((rt->dst.dev->features & NETIF_F_SG) && 1522 csummode == CHECKSUM_PARTIAL) { 1523 paged = true; 1524 zc = true; 1525 uarg = msg->msg_ubuf; 1526 } 1527 } else if (sock_flag(sk, SOCK_ZEROCOPY)) { 1528 uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb), 1529 false); 1530 if (!uarg) 1531 return -ENOBUFS; 1532 extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ 1533 if (rt->dst.dev->features & NETIF_F_SG && 1534 csummode == CHECKSUM_PARTIAL) { 1535 paged = true; 1536 zc = true; 1537 } else { 1538 uarg_to_msgzc(uarg)->zerocopy = 0; 1539 skb_zcopy_set(skb, uarg, &extra_uref); 1540 } 1541 } 1542 } else if ((flags & MSG_SPLICE_PAGES) && length) { 1543 if (inet_test_bit(HDRINCL, sk)) 1544 return -EPERM; 1545 if (rt->dst.dev->features & NETIF_F_SG && 1546 getfrag == ip_generic_getfrag) 1547 /* We need an empty buffer to attach stuff to */ 1548 paged = true; 1549 else 1550 flags &= ~MSG_SPLICE_PAGES; 1551 } 1552 1553 if (cork->tx_flags & SKBTX_ANY_TSTAMP && 1554 READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) { 1555 if (cork->flags & IPCORK_TS_OPT_ID) { 1556 tskey = cork->ts_opt_id; 1557 } else { 1558 tskey = atomic_inc_return(&sk->sk_tskey) - 1; 1559 hold_tskey = true; 1560 } 1561 } 1562 1563 /* 1564 * Let's try using as much space as possible. 1565 * Use MTU if total length of the message fits into the MTU. 1566 * Otherwise, we need to reserve fragment header and 1567 * fragment alignment (= 8-15 octects, in total). 1568 * 1569 * Note that we may need to "move" the data from the tail 1570 * of the buffer to the new fragment when we split 1571 * the message. 
1572 * 1573 * FIXME: It may be fragmented into multiple chunks 1574 * at once if non-fragmentable extension headers 1575 * are too large. 1576 * --yoshfuji 1577 */ 1578 1579 cork->length += length; 1580 if (!skb) 1581 goto alloc_new_skb; 1582 1583 while (length > 0) { 1584 /* Check if the remaining data fits into current packet. */ 1585 copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len; 1586 if (copy < length) 1587 copy = maxfraglen - skb->len; 1588 1589 if (copy <= 0) { 1590 char *data; 1591 unsigned int datalen; 1592 unsigned int fraglen; 1593 unsigned int fraggap; 1594 unsigned int alloclen, alloc_extra; 1595 unsigned int pagedlen; 1596 alloc_new_skb: 1597 /* There's no room in the current skb */ 1598 if (skb) 1599 fraggap = skb->len - maxfraglen; 1600 else 1601 fraggap = 0; 1602 /* update mtu and maxfraglen if necessary */ 1603 if (!skb || !skb_prev) 1604 ip6_append_data_mtu(&mtu, &maxfraglen, 1605 fragheaderlen, skb, rt, 1606 orig_mtu); 1607 1608 skb_prev = skb; 1609 1610 /* 1611 * If remaining data exceeds the mtu, 1612 * we know we need more fragment(s). 1613 */ 1614 datalen = length + fraggap; 1615 1616 if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen) 1617 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len; 1618 fraglen = datalen + fragheaderlen; 1619 pagedlen = 0; 1620 1621 alloc_extra = hh_len; 1622 alloc_extra += dst_exthdrlen; 1623 alloc_extra += rt->dst.trailer_len; 1624 1625 /* We just reserve space for fragment header. 1626 * Note: this may be overallocation if the message 1627 * (without MSG_MORE) fits into the MTU. 
1628 */ 1629 alloc_extra += sizeof(struct frag_hdr); 1630 1631 if ((flags & MSG_MORE) && 1632 !(rt->dst.dev->features&NETIF_F_SG)) 1633 alloclen = mtu; 1634 else if (!paged && 1635 (fraglen + alloc_extra < SKB_MAX_ALLOC || 1636 !(rt->dst.dev->features & NETIF_F_SG))) 1637 alloclen = fraglen; 1638 else { 1639 alloclen = fragheaderlen + transhdrlen; 1640 pagedlen = datalen - transhdrlen; 1641 } 1642 alloclen += alloc_extra; 1643 1644 if (datalen != length + fraggap) { 1645 /* 1646 * this is not the last fragment, the trailer 1647 * space is regarded as data space. 1648 */ 1649 datalen += rt->dst.trailer_len; 1650 } 1651 1652 fraglen = datalen + fragheaderlen; 1653 1654 copy = datalen - transhdrlen - fraggap - pagedlen; 1655 /* [!] NOTE: copy may be negative if pagedlen>0 1656 * because then the equation may reduces to -fraggap. 1657 */ 1658 if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) { 1659 err = -EINVAL; 1660 goto error; 1661 } 1662 if (transhdrlen) { 1663 skb = sock_alloc_send_skb(sk, alloclen, 1664 (flags & MSG_DONTWAIT), &err); 1665 } else { 1666 skb = NULL; 1667 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <= 1668 2 * sk->sk_sndbuf) 1669 skb = alloc_skb(alloclen, 1670 sk->sk_allocation); 1671 if (unlikely(!skb)) 1672 err = -ENOBUFS; 1673 } 1674 if (!skb) 1675 goto error; 1676 /* 1677 * Fill in the control structures 1678 */ 1679 skb->protocol = htons(ETH_P_IPV6); 1680 skb->ip_summed = csummode; 1681 skb->csum = 0; 1682 /* reserve for fragmentation and ipsec header */ 1683 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) + 1684 dst_exthdrlen); 1685 1686 /* 1687 * Find where to start putting bytes 1688 */ 1689 data = skb_put(skb, fraglen - pagedlen); 1690 skb_set_network_header(skb, exthdrlen); 1691 data += fragheaderlen; 1692 skb->transport_header = (skb->network_header + 1693 fragheaderlen); 1694 if (fraggap) { 1695 skb->csum = skb_copy_and_csum_bits( 1696 skb_prev, maxfraglen, 1697 data + transhdrlen, fraggap); 1698 skb_prev->csum = 
csum_sub(skb_prev->csum, 1699 skb->csum); 1700 data += fraggap; 1701 pskb_trim_unique(skb_prev, maxfraglen); 1702 } 1703 if (copy > 0 && 1704 INDIRECT_CALL_1(getfrag, ip_generic_getfrag, 1705 from, data + transhdrlen, offset, 1706 copy, fraggap, skb) < 0) { 1707 err = -EFAULT; 1708 kfree_skb(skb); 1709 goto error; 1710 } else if (flags & MSG_SPLICE_PAGES) { 1711 copy = 0; 1712 } 1713 1714 offset += copy; 1715 length -= copy + transhdrlen; 1716 transhdrlen = 0; 1717 exthdrlen = 0; 1718 dst_exthdrlen = 0; 1719 1720 /* Only the initial fragment is time stamped */ 1721 skb_shinfo(skb)->tx_flags = cork->tx_flags; 1722 cork->tx_flags = 0; 1723 skb_shinfo(skb)->tskey = tskey; 1724 tskey = 0; 1725 skb_zcopy_set(skb, uarg, &extra_uref); 1726 1727 if ((flags & MSG_CONFIRM) && !skb_prev) 1728 skb_set_dst_pending_confirm(skb, 1); 1729 1730 /* 1731 * Put the packet on the pending queue 1732 */ 1733 if (!skb->destructor) { 1734 skb->destructor = sock_wfree; 1735 skb->sk = sk; 1736 wmem_alloc_delta += skb->truesize; 1737 } 1738 __skb_queue_tail(queue, skb); 1739 continue; 1740 } 1741 1742 if (copy > length) 1743 copy = length; 1744 1745 if (!(rt->dst.dev->features&NETIF_F_SG) && 1746 skb_tailroom(skb) >= copy) { 1747 unsigned int off; 1748 1749 off = skb->len; 1750 if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, 1751 from, skb_put(skb, copy), 1752 offset, copy, off, skb) < 0) { 1753 __skb_trim(skb, off); 1754 err = -EFAULT; 1755 goto error; 1756 } 1757 } else if (flags & MSG_SPLICE_PAGES) { 1758 struct msghdr *msg = from; 1759 1760 err = -EIO; 1761 if (WARN_ON_ONCE(copy > msg->msg_iter.count)) 1762 goto error; 1763 1764 err = skb_splice_from_iter(skb, &msg->msg_iter, copy); 1765 if (err < 0) 1766 goto error; 1767 copy = err; 1768 wmem_alloc_delta += copy; 1769 } else if (!zc) { 1770 int i = skb_shinfo(skb)->nr_frags; 1771 1772 err = -ENOMEM; 1773 if (!sk_page_frag_refill(sk, pfrag)) 1774 goto error; 1775 1776 skb_zcopy_downgrade_managed(skb); 1777 if (!skb_can_coalesce(skb, i, 
pfrag->page, 1778 pfrag->offset)) { 1779 err = -EMSGSIZE; 1780 if (i == MAX_SKB_FRAGS) 1781 goto error; 1782 1783 __skb_fill_page_desc(skb, i, pfrag->page, 1784 pfrag->offset, 0); 1785 skb_shinfo(skb)->nr_frags = ++i; 1786 get_page(pfrag->page); 1787 } 1788 copy = min_t(int, copy, pfrag->size - pfrag->offset); 1789 if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, 1790 from, 1791 page_address(pfrag->page) + pfrag->offset, 1792 offset, copy, skb->len, skb) < 0) 1793 goto error_efault; 1794 1795 pfrag->offset += copy; 1796 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); 1797 skb->len += copy; 1798 skb->data_len += copy; 1799 skb->truesize += copy; 1800 wmem_alloc_delta += copy; 1801 } else { 1802 err = skb_zerocopy_iter_dgram(skb, from, copy); 1803 if (err < 0) 1804 goto error; 1805 } 1806 offset += copy; 1807 length -= copy; 1808 } 1809 1810 if (wmem_alloc_delta) 1811 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); 1812 return 0; 1813 1814 error_efault: 1815 err = -EFAULT; 1816 error: 1817 net_zcopy_put_abort(uarg, extra_uref); 1818 cork->length -= length; 1819 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); 1820 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); 1821 if (hold_tskey) 1822 atomic_dec(&sk->sk_tskey); 1823 return err; 1824 } 1825 1826 int ip6_append_data(struct sock *sk, 1827 int getfrag(void *from, char *to, int offset, int len, 1828 int odd, struct sk_buff *skb), 1829 void *from, size_t length, int transhdrlen, 1830 struct ipcm6_cookie *ipc6, struct flowi6 *fl6, 1831 struct rt6_info *rt, unsigned int flags) 1832 { 1833 struct inet_sock *inet = inet_sk(sk); 1834 int exthdrlen; 1835 int err; 1836 1837 if (flags&MSG_PROBE) 1838 return 0; 1839 if (skb_queue_empty(&sk->sk_write_queue)) { 1840 /* 1841 * setup for corking 1842 */ 1843 dst_hold(&rt->dst); 1844 err = ip6_setup_cork(sk, &inet->cork, 1845 ipc6, rt); 1846 if (err) 1847 return err; 1848 1849 inet->cork.fl.u.ip6 = *fl6; 1850 exthdrlen = (ipc6->opt ? 
ipc6->opt->opt_flen : 0); 1851 length += exthdrlen; 1852 transhdrlen += exthdrlen; 1853 } else { 1854 transhdrlen = 0; 1855 } 1856 1857 return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork, 1858 sk_page_frag(sk), getfrag, 1859 from, length, transhdrlen, flags); 1860 } 1861 EXPORT_SYMBOL_GPL(ip6_append_data); 1862 1863 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork) 1864 { 1865 struct dst_entry *dst = cork->base.dst; 1866 1867 cork->base.dst = NULL; 1868 skb_dst_set(skb, dst); 1869 } 1870 1871 static void ip6_cork_release(struct inet_cork_full *cork) 1872 { 1873 struct inet6_cork *v6_cork = &cork->base6; 1874 1875 if (unlikely(v6_cork->opt)) { 1876 struct ipv6_txoptions *opt = v6_cork->opt; 1877 1878 kfree(opt->dst0opt); 1879 kfree(opt->dst1opt); 1880 kfree(opt->hopopt); 1881 kfree(opt->srcrt); 1882 kfree(opt); 1883 v6_cork->opt = NULL; 1884 } 1885 1886 if (cork->base.dst) { 1887 dst_release(cork->base.dst); 1888 cork->base.dst = NULL; 1889 } 1890 } 1891 1892 struct sk_buff *__ip6_make_skb(struct sock *sk, 1893 struct sk_buff_head *queue, 1894 struct inet_cork_full *cork) 1895 { 1896 struct sk_buff *skb, *tmp_skb; 1897 struct sk_buff **tail_skb; 1898 struct in6_addr *final_dst; 1899 struct net *net = sock_net(sk); 1900 struct ipv6hdr *hdr; 1901 struct ipv6_txoptions *opt; 1902 struct rt6_info *rt = dst_rt6_info(cork->base.dst); 1903 struct flowi6 *fl6 = &cork->fl.u.ip6; 1904 unsigned char proto = fl6->flowi6_proto; 1905 1906 skb = __skb_dequeue(queue); 1907 if (!skb) 1908 goto out; 1909 tail_skb = &(skb_shinfo(skb)->frag_list); 1910 1911 /* move skb->data to ip header from ext header */ 1912 if (skb->data < skb_network_header(skb)) 1913 __skb_pull(skb, skb_network_offset(skb)); 1914 while ((tmp_skb = __skb_dequeue(queue)) != NULL) { 1915 __skb_pull(tmp_skb, skb_network_header_len(skb)); 1916 *tail_skb = tmp_skb; 1917 tail_skb = &(tmp_skb->next); 1918 skb->len += tmp_skb->len; 1919 skb->data_len += tmp_skb->len; 1920 
skb->truesize += tmp_skb->truesize; 1921 tmp_skb->destructor = NULL; 1922 tmp_skb->sk = NULL; 1923 } 1924 1925 /* Allow local fragmentation. */ 1926 skb->ignore_df = ip6_sk_ignore_df(sk); 1927 __skb_pull(skb, skb_network_header_len(skb)); 1928 1929 final_dst = &fl6->daddr; 1930 opt = cork->base6.opt; 1931 if (unlikely(opt)) { 1932 if (opt->opt_flen) 1933 proto = ipv6_push_frag_opts(skb, opt, proto); 1934 if (opt->opt_nflen) 1935 proto = ipv6_push_nfrag_opts(skb, opt, proto, 1936 &final_dst, &fl6->saddr); 1937 } 1938 skb_push(skb, sizeof(struct ipv6hdr)); 1939 skb_reset_network_header(skb); 1940 hdr = ipv6_hdr(skb); 1941 1942 ip6_flow_hdr(hdr, cork->base6.tclass, 1943 ip6_make_flowlabel(net, skb, fl6->flowlabel, 1944 ip6_autoflowlabel(net, sk), fl6)); 1945 hdr->hop_limit = cork->base6.hop_limit; 1946 hdr->nexthdr = proto; 1947 hdr->saddr = fl6->saddr; 1948 hdr->daddr = *final_dst; 1949 1950 skb->priority = cork->base.priority; 1951 skb->mark = cork->base.mark; 1952 if (sk_is_tcp(sk)) 1953 skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC); 1954 else 1955 skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid); 1956 1957 ip6_cork_steal_dst(skb, cork); 1958 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS); 1959 if (unlikely(proto == IPPROTO_ICMPV6)) { 1960 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); 1961 u8 icmp6_type; 1962 1963 if (sk->sk_socket->type == SOCK_RAW && 1964 !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH)) 1965 icmp6_type = fl6->fl6_icmp_type; 1966 else 1967 icmp6_type = icmp6_hdr(skb)->icmp6_type; 1968 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type); 1969 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); 1970 } 1971 1972 ip6_cork_release(cork); 1973 out: 1974 return skb; 1975 } 1976 1977 int ip6_send_skb(struct sk_buff *skb) 1978 { 1979 struct net *net = sock_net(skb->sk); 1980 struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); 1981 int err; 1982 1983 rcu_read_lock(); 1984 err = ip6_local_out(net, 
skb->sk, skb); 1985 if (err) { 1986 if (err > 0) 1987 err = net_xmit_errno(err); 1988 if (err) 1989 IP6_INC_STATS(net, rt->rt6i_idev, 1990 IPSTATS_MIB_OUTDISCARDS); 1991 } 1992 1993 rcu_read_unlock(); 1994 return err; 1995 } 1996 1997 int ip6_push_pending_frames(struct sock *sk) 1998 { 1999 struct sk_buff *skb; 2000 2001 skb = ip6_finish_skb(sk); 2002 if (!skb) 2003 return 0; 2004 2005 return ip6_send_skb(skb); 2006 } 2007 EXPORT_SYMBOL_GPL(ip6_push_pending_frames); 2008 2009 static void __ip6_flush_pending_frames(struct sock *sk, 2010 struct sk_buff_head *queue, 2011 struct inet_cork_full *cork) 2012 { 2013 struct sk_buff *skb; 2014 2015 while ((skb = __skb_dequeue_tail(queue)) != NULL) { 2016 if (skb_dst(skb)) 2017 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)), 2018 IPSTATS_MIB_OUTDISCARDS); 2019 kfree_skb(skb); 2020 } 2021 2022 ip6_cork_release(cork); 2023 } 2024 2025 void ip6_flush_pending_frames(struct sock *sk) 2026 { 2027 __ip6_flush_pending_frames(sk, &sk->sk_write_queue, 2028 &inet_sk(sk)->cork); 2029 } 2030 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames); 2031 2032 struct sk_buff *ip6_make_skb(struct sock *sk, 2033 int getfrag(void *from, char *to, int offset, 2034 int len, int odd, struct sk_buff *skb), 2035 void *from, size_t length, int transhdrlen, 2036 struct ipcm6_cookie *ipc6, struct rt6_info *rt, 2037 unsigned int flags, struct inet_cork_full *cork) 2038 { 2039 int exthdrlen = (ipc6->opt ? 
ipc6->opt->opt_flen : 0); 2040 struct sk_buff_head queue; 2041 int err; 2042 2043 if (flags & MSG_PROBE) { 2044 dst_release(&rt->dst); 2045 return NULL; 2046 } 2047 2048 __skb_queue_head_init(&queue); 2049 2050 cork->base.flags = 0; 2051 cork->base.addr = 0; 2052 cork->base.opt = NULL; 2053 cork->base6.opt = NULL; 2054 err = ip6_setup_cork(sk, cork, ipc6, rt); 2055 if (err) { 2056 ip6_cork_release(cork); 2057 return ERR_PTR(err); 2058 } 2059 2060 err = __ip6_append_data(sk, &queue, cork, 2061 ¤t->task_frag, getfrag, from, 2062 length + exthdrlen, transhdrlen + exthdrlen, 2063 flags); 2064 if (err) { 2065 __ip6_flush_pending_frames(sk, &queue, cork); 2066 return ERR_PTR(err); 2067 } 2068 2069 return __ip6_make_skb(sk, &queue, cork); 2070 } 2071