// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *			:       add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/gso.h>
#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>

/*
 * Last IPv6 step before the neighbour layer.
 *
 * Makes sure the skb has headroom for the link-layer header, applies the
 * multicast loopback and scope rules, lets a lightweight tunnel that asks
 * for xmit redirect take the packet, and finally looks up (or creates) the
 * next-hop neighbour and passes the skb to neigh_output().
 *
 * Returns the neigh_output() or lwtunnel_xmit() result, 0 when a multicast
 * packet is dropped here, -ENOMEM when the headroom cannot be expanded and
 * -EINVAL when no neighbour entry can be created.
 */
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst_dev_rcu(dst);
	struct inet6_dev *idev = ip6_dst_idev(dst);
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	const struct in6_addr *daddr, *nexthop;
	struct ipv6hdr *hdr;
	struct neighbour *neigh;
	int ret;

	/* Be paranoid, rather than too clever. */
	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
		/* idev stays alive because we hold rcu_read_lock(). */
		skb = skb_expand_head(skb, hh_len);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			return -ENOMEM;
		}
	}

	/* Read the header only now: skb may have been replaced above. */
	hdr = ipv6_hdr(skb);
	daddr = &hdr->daddr;
	if (unlikely(ipv6_addr_is_multicast(daddr))) {
		/*
		 * Loop a clone back to the local stack when the device is not
		 * a loopback device, the socket allows multicast loopback, and
		 * either a multicast routing socket wants it (and the packet
		 * was not forwarded) or the address is joined on this device.
		 */
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			/* Hop limit 0: never put the original on the wire. */
			if (hdr->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
		/*
		 * Multicast of node-local scope (or lower) is not sent out
		 * through a non-loopback device.
		 */
		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	/* Anything but LWTUNNEL_XMIT_CONTINUE ends processing here. */
	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res != LWTUNNEL_XMIT_CONTINUE)
			return res;
	}

	IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);

	nexthop = rt6_nexthop(dst_rt6_info(dst), daddr);
	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);

	if (IS_ERR_OR_NULL(neigh)) {
		/* Nothing cached for this next hop yet: create an entry. */
		if (unlikely(!neigh))
			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
		if (IS_ERR(neigh)) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
			return -EINVAL;
		}
	}
	sock_confirm_neigh(skb, neigh);
	ret = neigh_output(neigh, skb, false);
	return ret;
}

/*
 * Slow path for a GSO skb that failed the network-length check against
 * @mtu: segment it here, then send every segment on its own, fragmenting
 * only the segments that are longer than @mtu.
 *
 * Returns the first non-zero error seen while sending the segments, 0 if
 * all went out, or -ENOMEM when segmentation fails or yields nothing.
 */
static int
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
				    struct sk_buff *skb, unsigned int mtu)
{
	struct sk_buff *segs, *nskb;
	netdev_features_t features;
	int ret = 0;

	/* Please see corresponding comment in ip_finish_output_gso
	 * describing the cases where GSO segment length exceeds the
	 * egress MTU.
	 */
	features = netif_skb_features(skb);
	/* All GSO feature bits are masked out for this segmentation. */
	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
	if (IS_ERR_OR_NULL(segs)) {
		kfree_skb(skb);
		return -ENOMEM;
	}

	/* From here on only the segment list is used. */
	consume_skb(skb);

	skb_list_walk_safe(segs, segs, nskb) {
		int err;

		skb_mark_not_on_list(segs);
		/* Last GSO segment can be smaller than gso_size (and MTU).
		 * Adding a fragment header would produce an "atomic fragment",
		 * which is considered harmful (RFC-8021). Avoid that.
		 */
		err = segs->len > mtu ?
			ip6_fragment(net, sk, segs, ip6_finish_output2) :
			ip6_finish_output2(net, sk, segs);
		/* Keep going after an error, but remember the first one. */
		if (err && ret == 0)
			ret = err;
	}

	return ret;
}

/*
 * Output a GSO skb: take the slow path when
 * skb_gso_validate_network_len() rejects it for @mtu, otherwise send it
 * unchanged through ip6_finish_output2().
 */
static int ip6_finish_output_gso(struct net *net, struct sock *sk,
				 struct sk_buff *skb, unsigned int mtu)
{
	if (unlikely(!skb_gso_validate_network_len(skb, mtu)))
		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);

	return ip6_finish_output2(net, sk, skb);
}

/*
 * Choose how the packet leaves: re-run dst_output() when the dst carries an
 * xfrm state (only with CONFIG_NETFILTER and CONFIG_XFRM), use the GSO
 * helper for GSO packets, fragment when the packet is longer than the MTU
 * or than a recorded frag_max_size, and otherwise send it directly.
 */
static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	unsigned int mtu;

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		/* ip6_output() skips POST_ROUTING when this flag is set. */
		IP6CB(skb)->flags |= IP6SKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	mtu = ip6_skb_dst_mtu(skb);
	if (skb_is_gso(skb))
		return ip6_finish_output_gso(net, sk, skb, mtu);

	if (unlikely(skb->len > mtu ||
		     (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);

	return ip6_finish_output2(net, sk, skb);
}
210 211 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) 212 { 213 int ret; 214 215 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); 216 switch (ret) { 217 case NET_XMIT_SUCCESS: 218 case NET_XMIT_CN: 219 return __ip6_finish_output(net, sk, skb) ? : ret; 220 default: 221 kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS); 222 return ret; 223 } 224 } 225 226 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) 227 { 228 struct dst_entry *dst = skb_dst(skb); 229 struct net_device *dev, *indev = skb->dev; 230 struct inet6_dev *idev; 231 int ret; 232 233 skb->protocol = htons(ETH_P_IPV6); 234 rcu_read_lock(); 235 dev = dst_dev_rcu(dst); 236 idev = ip6_dst_idev(dst); 237 skb->dev = dev; 238 239 if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) { 240 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); 241 rcu_read_unlock(); 242 kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED); 243 return 0; 244 } 245 246 ret = NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, 247 net, sk, skb, indev, dev, 248 ip6_finish_output, 249 !(IP6CB(skb)->flags & IP6SKB_REROUTED)); 250 rcu_read_unlock(); 251 return ret; 252 } 253 EXPORT_SYMBOL(ip6_output); 254 255 bool ip6_autoflowlabel(struct net *net, const struct sock *sk) 256 { 257 if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk)) 258 return ip6_default_np_autolabel(net); 259 return inet6_test_bit(AUTOFLOWLABEL, sk); 260 } 261 262 /* 263 * xmit an sk_buff (used by TCP and SCTP) 264 * Note : socket lock is not held for SYNACK packets, but might be modified 265 * by calls to skb_set_owner_w() and ipv6_local_error(), 266 * which are using proper atomic operations or spinlocks. 
*/
/*
 * Prepends the extension headers from @opt (if any) and the IPv6 header
 * built from @fl6, @tclass and the hop limit, sets skb priority and mark,
 * then passes the packet to the LOCAL_OUT hook with dst_output() as
 * continuation.
 *
 * Returns the hook result, 0 when l3mdev_ip6_out() returns no skb,
 * -ENOBUFS when headroom cannot be expanded, and -EMSGSIZE when the packet
 * exceeds the dst MTU and is neither GSO nor marked ignore_df.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct inet6_dev *idev = ip6_dst_idev(dst);
	struct net *net = sock_net(sk);
	unsigned int head_room;
	struct net_device *dev;
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int ret, hlimit = -1;
	u32 mtu;

	rcu_read_lock();

	dev = dst_dev_rcu(dst);
	/* Room needed in front: link layer + IPv6 header + all options. */
	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(head_room > skb_headroom(skb))) {
		/* idev stays alive while we hold rcu_read_lock(). */
		skb = skb_expand_head(skb, head_room);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			ret = -ENOBUFS;
			goto unlock;
		}
	}

	if (unlikely(opt)) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		/* Each push returns the protocol value to use next. */
		if (opt->opt_flen)
			proto = ipv6_push_frag_opts(skb, opt, proto);

		/* first_hop is passed by address and may be redirected. */
		if (opt->opt_nflen)
			proto = ipv6_push_nfrag_opts(skb, opt, proto,
						     &first_hop,
						     &fl6->saddr);
	}

	/* A length above IPV6_MAXPLEN is written as payload_len 0. */
	if (unlikely(seg_len > IPV6_MAXPLEN))
		seg_len = 0;

	__skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	/* Socket hop limit first; a negative value falls back to the dst. */
	if (np)
		hlimit = READ_ONCE(np->hop_limit);
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, sk), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = priority;
	skb->mark = mark;

	mtu = dst6_mtu(dst);
	if (likely((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb))) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb)) {
			ret = 0;
			goto unlock;
		}

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		ret = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			      net, (struct sock *)sk, skb, NULL, dev,
			      dst_output);
		goto unlock;
	}

	/* Too big and may not be fragmented: report to the socket, drop. */
	ret = -EMSGSIZE;
	skb->dev = dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
	kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
unlock:
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(ip6_xmit);

/*
 * Deliver a Router Alert packet to the raw sockets registered on
 * ip6_ra_chain for selector @sel.
 *
 * A socket matches when its sel equals @sel and it is either unbound or
 * bound to the receiving device; sockets with RTALERT_ISOLATE set are
 * skipped when they live in another netns than skb->dev.  Every matching
 * socket but the last gets a clone, the last one gets @skb itself.
 *
 * Returns 1 when @skb was handed to a socket, 0 when nobody matched.
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {

			if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
			/*
			 * Delivery lags one match behind, so the original skb
			 * is kept for the final matching socket.
			 */
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

/*
 * Classify a packet whose destination is a proxied neighbour address.
 *
 * Returns 1 when the packet is one of the listed NDISC ICMPv6 messages and
 * must go to local input, -1 when the destination is link-local (the
 * sender is signalled through dst_link_failure() and the caller drops the
 * packet), and 0 when the packet is to be forwarded normally.  A packet
 * whose extension headers cannot be skipped, or whose ICMPv6 type byte
 * cannot be pulled, also yields 0.
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		/* Need one byte at @offset: the ICMPv6 type. */
		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

/*
 * Continuation of the FORWARD hook: clear the skb timestamp and send the
 * packet through dst_output().  With CONFIG_NET_SWITCHDEV, an skb carrying
 * offload_l3_fwd_mark is consumed here and not sent.
 */
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
#ifdef CONFIG_NET_SWITCHDEV
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	skb_clear_tstamp(skb);
	return dst_output(net, sk, skb);
}

/*
 * Decide whether a forwarded packet must be rejected as too big for @mtu.
 * The checks are ordered: a recorded frag_max_size above @mtu rejects the
 * packet even when ignore_df is set; after that ignore_df, or a GSO packet
 * whose segments fit, lets it pass.
 */
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}

/*
 * Forward a received IPv6 packet along its dst.
 *
 * Performs, in order: the forwarding-enabled check, packet type / owner /
 * LRO sanity checks, the xfrm forward policy check, Router Alert delivery,
 * the hop limit check, NDP proxy handling, xfrm route forwarding, redirect
 * generation or source address validation, the MTU check, and finally the
 * hop limit decrement before the FORWARD hook.
 *
 * Returns the hook (or ip6_input()) result, 0 when a Router Alert socket
 * took the packet, -ETIMEDOUT on hop limit expiry, -EMSGSIZE when the
 * packet is too big, and -EINVAL for every other drop.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst_dev(dst));
	struct net_device *dev;
	struct inet6_dev *idev;
	/* Declares the local 'reason' used at the error/drop labels. */
	SKB_DR(reason);
	u32 mtu;

	/* idev belongs to the interface the packet arrived on (iif). */
	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	if (!READ_ONCE(net->ipv6.devconf_all->forwarding) &&
	    (!idev || !READ_ONCE(idev->cnf.force_forwarding)))
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!READ_ONCE(net->ipv6.devconf_all->disable_policy) &&
	    (!idev || !READ_ONCE(idev->cnf.disable_policy)) &&
	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0) {
			/* It's tempting to decrease the hop limit
			 * here by 1, as we do at the end of the
			 * function too.
			 *
			 * But that would be incorrect, as proxying is
			 * not forwarding. The ip6_input function
			 * will handle this packet locally, and it
			 * depends on the hop limit being unchanged.
			 *
			 * One example is the NDP hop limit, that
			 * always has to stay 255, but other would be
			 * similar checks around RA packets, where the
			 * user can even change the desired limit.
			 */
			return ip6_input(skb);
		} else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		SKB_DR_SET(reason, XFRM_POLICY);
		goto drop;
	}
	/* Re-read the dst after xfrm6_route_forward(). */
	dst = skb_dst(skb);
	dev = dst_dev(dst);
	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = dst_rt6_info(dst);
		/* Redirect target: the gateway if any, else the destination. */
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		rcu_read_lock();
		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		rcu_read_unlock();
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);

	/* Never use a forwarding MTU below IPV6_MIN_MTU. */
	mtu = ip6_dst_mtu_maybe_forward(dst, true);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (unlikely(ip6_pkt_too_big(skb, mtu))) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	/* Reload the header pointer after skb_cow(). */
	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dev,
		       ip6_forward_finish);

error:
	/* 'error' counts an address error, then shares the drop exit. */
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
	SKB_DR_SET(reason, IP_INADDRERRORS);
drop:
	kfree_skb_reason(skb, reason);
	return -EINVAL;
}

/*
 * Copy per-packet metadata from @from to a fragment @to: packet type,
 * priority, protocol, a new reference on the dst, device, mark, hash,
 * tc_index (CONFIG_NET_SCHED), netfilter state, skb extensions and secmark.
 */
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	/* Release whatever dst @to had before taking @from's. */
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_ext_copy(to, from);
	skb_copy_secmark(to, from);
}

/*
 * Set up fast-path fragmentation of an skb that carries a frag_list.
 *
 * Saves a copy of the first @hlen bytes of the network header in
 * iter->tmp_hdr, moves the frag_list into @iter, and turns @skb into the
 * first fragment: a fragment header is inserted after those @hlen bytes,
 * the More Fragments bit is set, and the lengths are cut down to the
 * skb's own data.
 *
 * Returns 0, or -ENOMEM when the header copy cannot be allocated.  On
 * success iter->tmp_hdr is owned by the caller (ip6_fragment() kfree()s it).
 */
int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
		      u8 nexthdr, __be32 frag_id,
		      struct ip6_fraglist_iter *iter)
{
	unsigned int first_len;
	struct frag_hdr *fh;

	/* BUILD HEADER */
	/* Written before the copy, so tmp_hdr already carries it. */
	*prevhdr = NEXTHDR_FRAGMENT;
	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
	if (!iter->tmp_hdr)
		return -ENOMEM;

	iter->frag = skb_shinfo(skb)->frag_list;
	skb_frag_list_init(skb);

	iter->offset = 0;
	iter->hlen = hlen;
	iter->frag_id = frag_id;
	iter->nexthdr = nexthdr;

	/*
	 * Pull the old headers, push room for the fragment header and the
	 * headers again, then restore the headers from the saved copy.
	 */
	__skb_pull(skb, hlen);
	fh = __skb_push(skb, sizeof(struct frag_hdr));
	__skb_push(skb, hlen);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

	fh->nexthdr = nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(IP6_MF);
	fh->identification = frag_id;

	first_len = skb_pagelen(skb);
	skb->data_len = first_len - skb_headlen(skb);
	skb->len = first_len;
	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

	return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);

/*
 * Turn the next frag_list member (iter->frag) into a complete fragment:
 * prepend the saved headers and a fragment header, advance the running
 * offset by the payload of the previous fragment @skb, and copy the
 * metadata of @skb.
 */
void ip6_fraglist_prepare(struct sk_buff *skb,
			  struct ip6_fraglist_iter *iter)
{
	struct sk_buff *frag = iter->frag;
	unsigned int hlen = iter->hlen;
	struct frag_hdr *fh;

	frag->ip_summed = CHECKSUM_NONE;
	skb_reset_transport_header(frag);
	fh = __skb_push(frag, sizeof(struct frag_hdr));
	__skb_push(frag, hlen);
	skb_reset_network_header(frag);
	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
	/* @skb is the previous fragment; skip its payload bytes. */
	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
	fh->nexthdr = iter->nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(iter->offset);
	/* More Fragments is set on all but the last list member. */
	if (frag->next)
		fh->frag_off |= htons(IP6_MF);
	fh->identification = iter->frag_id;
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
	ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);

/*
 * Initialise @state for slow-path (copying) fragmentation of @skb.
 * @hlen is the length of the headers repeated in every fragment, @mtu the
 * payload limit per fragment, @hdr_room / @needed_tailroom the extra head
 * and tail room allocated for each fragment.
 */
void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
{
	state->prevhdr = prevhdr;
	state->nexthdr = nexthdr;
	state->frag_id = frag_id;

	state->hlen = hlen;
	state->mtu = mtu;

	state->left = skb->len - hlen;	/* Space per frame */
	state->ptr = hlen;		/* Where to start from */

	state->hroom = hdr_room;
	state->troom = needed_tailroom;

	state->offset = 0;
}
EXPORT_SYMBOL(ip6_frag_init);

/*
 * Allocate and fill the next slow-path fragment of @skb.
 *
 * The fragment gets a copy of the first state->hlen header bytes, a
 * fragment header, and up to state->mtu bytes of payload (rounded down to
 * a multiple of 8 unless it is the last piece).  @state is advanced past
 * the copied data.
 *
 * Returns the new skb, or ERR_PTR(-ENOMEM) when allocation fails.
 */
struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
	struct sk_buff *frag;
	struct frag_hdr *fh;
	unsigned int len;

	len = state->left;
	/* IF: it doesn't fit, use 'mtu' - the data space left */
	if (len > state->mtu)
		len = state->mtu;
	/* IF: we are not sending up to and including the packet end
	   then align the next start on an eight byte boundary */
	if (len < state->left)
		len &= ~7;

	/* Allocate buffer */
	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
			 state->hroom + state->troom, GFP_ATOMIC);
	if (!frag)
		return ERR_PTR(-ENOMEM);

	/*
	 *	Set up data on packet
	 */

	ip6_copy_metadata(frag, skb);
	skb_reserve(frag, state->hroom);
	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
	skb_reset_network_header(frag);
	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
	frag->transport_header = (frag->network_header + state->hlen +
				  sizeof(struct frag_hdr));

	/*
	 *	Charge the memory for the fragment to any owner
	 *	it might possess
	 */
	if (skb->sk)
		skb_set_owner_w(frag, skb->sk);

	/*
	 *	Copy the packet header into the new buffer.
	 */
	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

	/*
	 * Patch the copied header at the same offset prevhdr has in the
	 * original, so that it announces the fragment header.
	 */
	fragnexthdr_offset = skb_network_header(frag);
	fragnexthdr_offset += prevhdr - skb_network_header(skb);
	*fragnexthdr_offset = NEXTHDR_FRAGMENT;

	/*
	 *	Build fragment header.
	 */
	fh->nexthdr = state->nexthdr;
	fh->reserved = 0;
	fh->identification = state->frag_id;

	/*
	 *	Copy a block of the IP datagram.
	 */
	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
			     len));
	state->left -= len;

	fh->frag_off = htons(state->offset);
	if (state->left > 0)
		fh->frag_off |= htons(IP6_MF);
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

	state->ptr += len;
	state->offset += len;

	return frag;
}
EXPORT_SYMBOL(ip6_frag_next);

/*
 * Fragment @skb and send each fragment through @output.
 *
 * Uses the fast path when the skb already carries a frag_list whose
 * geometry fits (every piece within the MTU, 8-byte aligned except the
 * last, enough headroom, nothing cloned or shared); otherwise each
 * fragment is allocated and copied by ip6_frag_next().
 *
 * Returns 0 on success or a negative error: -EAFNOSUPPORT when IPv6 is
 * not enabled, -EMSGSIZE when fragmentation is not allowed or the MTU is
 * too small (an ICMPv6 Packet Too Big is sent), or the error from header
 * parsing, checksumming, allocation or @output.
 */
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	u8 tstamp_type = skb->tstamp_type;
	struct ip6_frag_state state;
	unsigned int mtu, hlen, nexthdr_offset;
	ktime_t tstamp = skb->tstamp;
	int hroom, err = 0;
	__be32 frag_id;
	u8 *prevhdr, nexthdr = 0;

	if (!ipv6_mod_enabled()) {
		kfree_skb(skb);
		return -EAFNOSUPPORT;
	}

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	/* Keep an offset: prevhdr is recomputed after the checksum step. */
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np) {
		u32 frag_size = READ_ONCE(np->frag_size);

		/* A non-zero socket frag_size may only lower the limit. */
		if (frag_size && frag_size < mtu)
			mtu = frag_size;
	}
	/* Headers, fragment header and at least 8 payload bytes must fit. */
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	/* From here on mtu is the payload room per fragment. */
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct ip6_fraglist_iter iter;
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			/*
			 * Give the member the head's socket and move its
			 * truesize off the head; slow_path_clean reverts this.
			 */
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
					&iter);
		if (err < 0)
			goto fail;

		/* We prevent @rt from being freed. */
		rcu_read_lock();

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (iter.frag)
				ip6_fraglist_prepare(skb, &iter);

			skb_set_delivery_time(skb, tstamp, tstamp_type);
			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !iter.frag)
				break;

			skb = ip6_fraglist_next(&iter);
		}

		kfree(iter.tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			rcu_read_unlock();
			return 0;
		}

		/* Free the fragments that were not handed to @output. */
		kfree_skb_list(iter.frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		rcu_read_unlock();
		return err;

slow_path_clean:
		/* Undo the ownership changes up to the rejected member. */
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	/*
	 *	Fragment the datagram.
	 */

	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
		      &state);

	/*
	 *	Keep copying data until we run out.
	 */

	while (state.left > 0) {
		frag = ip6_frag_next(skb, &state);
		if (IS_ERR(frag)) {
			err = PTR_ERR(frag);
			goto fail;
		}

		/*
		 *	Put this fragment into the sending queue.
		 */
		skb_set_delivery_time(frag, tstamp, tstamp_type);
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	/* Every byte was copied into fragments; the original is done. */
	consume_skb(skb);
	return err;

fail_toobig:
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
EXPORT_SYMBOL_GPL(ip6_fragment);

/*
 * Check one route key against a flow address.  Returns non-zero when the
 * cached route cannot be confirmed for @fl_addr: the key is not a /128
 * equal to @fl_addr, and there is no @addr_cache or it differs as well.
 */
static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

/*
 * Validate a socket's cached dst against flow @fl6.
 *
 * Returns @dst when it is still usable, or NULL after releasing it when it
 * is not an IPv6 dst, fails ip6_rt_check(), or does not match the
 * requested output interface.  A NULL @dst is returned unchanged.
 */
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = dst_rt6_info(dst);
	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr,
			 np->daddr_cache ? &sk->sk_v6_daddr : NULL) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr,
			 np->saddr_cache ? &np->saddr : NULL) ||
#endif
	   (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

/*
 * Common tail of the dst lookup helpers.
 *
 * Chooses a source address when @fl6 has none, performs the route lookup
 * when *@dst is not already set, optionally replaces the route by the
 * default router's when the source address is optimistic and the next hop
 * has no valid neighbour state (CONFIG_IPV6_OPTIMISTIC_DAD), and rejects a
 * v4-mapped source with a destination that is neither v4-mapped nor any.
 *
 * Returns 0 with *@dst set, or a negative error with *@dst released and
 * set to NULL.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr)) {
		struct fib6_info *from;
		struct rt6_info *rt;

		*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : dst_rt6_info(*dst);

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
					  fl6->flowi6_l3mdev,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if ((*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = dst_rt6_info(*dst);
	rcu_read_lock();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	/* err only marks "neighbour exists but is not NUD_VALID" here. */
	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			/* Same flow, but with an all-zero destination. */
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net:
Network namespace to perform lookup in 1231 * @sk: socket which provides route info 1232 * @dst: pointer to dst_entry * for result 1233 * @fl6: flow to lookup 1234 * 1235 * This function performs a route lookup on the given flow. 1236 * 1237 * It returns zero on success, or a standard errno code on error. 1238 */ 1239 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst, 1240 struct flowi6 *fl6) 1241 { 1242 *dst = NULL; 1243 return ip6_dst_lookup_tail(net, sk, dst, fl6); 1244 } 1245 EXPORT_SYMBOL_GPL(ip6_dst_lookup); 1246 1247 /** 1248 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec 1249 * @net: Network namespace to perform lookup in 1250 * @sk: socket which provides route info 1251 * @fl6: flow to lookup 1252 * @final_dst: final destination address for ipsec lookup 1253 * 1254 * This function performs a route lookup on the given flow. 1255 * 1256 * It returns a valid dst pointer on success, or a pointer encoded 1257 * error code. 1258 */ 1259 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6, 1260 const struct in6_addr *final_dst) 1261 { 1262 struct dst_entry *dst = NULL; 1263 int err; 1264 1265 if (!ipv6_mod_enabled()) 1266 return ERR_PTR(-EAFNOSUPPORT); 1267 err = ip6_dst_lookup_tail(net, sk, &dst, fl6); 1268 if (err) 1269 return ERR_PTR(err); 1270 if (final_dst) 1271 fl6->daddr = *final_dst; 1272 1273 return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0); 1274 } 1275 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow); 1276 1277 /** 1278 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow 1279 * @sk: socket which provides the dst cache and route info 1280 * @fl6: flow to lookup 1281 * @final_dst: final destination address for ipsec lookup 1282 * @connected: whether @sk is connected or not 1283 * 1284 * This function performs a route lookup on the given flow with the 1285 * possibility of using the cached route in the socket if it is valid. 
1286 * It will take the socket dst lock when operating on the dst cache. 1287 * As a result, this function can only be used in process context. 1288 * 1289 * In addition, for a connected socket, cache the dst in the socket 1290 * if the current cache is not valid. 1291 * 1292 * It returns a valid dst pointer on success, or a pointer encoded 1293 * error code. 1294 */ 1295 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, 1296 const struct in6_addr *final_dst, 1297 bool connected) 1298 { 1299 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie); 1300 1301 dst = ip6_sk_dst_check(sk, dst, fl6); 1302 if (dst) 1303 return dst; 1304 1305 dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst); 1306 if (connected && !IS_ERR(dst)) 1307 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6); 1308 1309 return dst; 1310 } 1311 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow); 1312 1313 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src, 1314 gfp_t gfp) 1315 { 1316 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL; 1317 } 1318 1319 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src, 1320 gfp_t gfp) 1321 { 1322 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL; 1323 } 1324 1325 static void ip6_append_data_mtu(unsigned int *mtu, 1326 int *maxfraglen, 1327 unsigned int fragheaderlen, 1328 struct sk_buff *skb, 1329 struct rt6_info *rt, 1330 unsigned int orig_mtu) 1331 { 1332 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) { 1333 if (!skb) { 1334 /* first fragment, reserve header_len */ 1335 *mtu = orig_mtu - rt->dst.header_len; 1336 1337 } else { 1338 /* 1339 * this fragment is not first, the headers 1340 * space is regarded as data space. 
1341 */ 1342 *mtu = orig_mtu; 1343 } 1344 *maxfraglen = ((*mtu - fragheaderlen) & ~7) 1345 + fragheaderlen - sizeof(struct frag_hdr); 1346 } 1347 } 1348 1349 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, 1350 struct ipcm6_cookie *ipc6, 1351 struct rt6_info *rt) 1352 { 1353 struct ipv6_txoptions *nopt, *opt = ipc6->opt; 1354 struct inet6_cork *v6_cork = &cork->base6; 1355 struct ipv6_pinfo *np = inet6_sk(sk); 1356 unsigned int mtu, frag_size; 1357 1358 /* callers pass dst together with a reference, set it first so 1359 * ip6_cork_release() can put it down even in case of an error. 1360 */ 1361 cork->base.dst = &rt->dst; 1362 1363 /* 1364 * setup for corking 1365 */ 1366 if (unlikely(opt)) { 1367 if (WARN_ON(v6_cork->opt)) 1368 return -EINVAL; 1369 1370 nopt = v6_cork->opt = kzalloc_obj(*opt, sk->sk_allocation); 1371 if (unlikely(!nopt)) 1372 return -ENOBUFS; 1373 1374 nopt->tot_len = sizeof(*opt); 1375 nopt->opt_flen = opt->opt_flen; 1376 nopt->opt_nflen = opt->opt_nflen; 1377 1378 nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation); 1379 if (opt->dst0opt && !nopt->dst0opt) 1380 return -ENOBUFS; 1381 1382 nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation); 1383 if (opt->dst1opt && !nopt->dst1opt) 1384 return -ENOBUFS; 1385 1386 nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation); 1387 if (opt->hopopt && !nopt->hopopt) 1388 return -ENOBUFS; 1389 1390 nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation); 1391 if (opt->srcrt && !nopt->srcrt) 1392 return -ENOBUFS; 1393 1394 /* need source address above miyazawa*/ 1395 } 1396 v6_cork->hop_limit = ipc6->hlimit; 1397 v6_cork->tclass = ipc6->tclass; 1398 v6_cork->dontfrag = ipc6->dontfrag; 1399 if (rt->dst.flags & DST_XFRM_TUNNEL) 1400 mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ? 1401 READ_ONCE(rt->dst.dev->mtu) : dst6_mtu(&rt->dst); 1402 else 1403 mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ? 
1404 READ_ONCE(rt->dst.dev->mtu) : dst6_mtu(xfrm_dst_path(&rt->dst)); 1405 1406 frag_size = READ_ONCE(np->frag_size); 1407 if (frag_size && frag_size < mtu) 1408 mtu = frag_size; 1409 1410 cork->base.fragsize = mtu; 1411 cork->base.gso_size = ipc6->gso_size; 1412 cork->base.tx_flags = 0; 1413 cork->base.mark = ipc6->sockc.mark; 1414 cork->base.priority = ipc6->sockc.priority; 1415 sock_tx_timestamp(sk, &ipc6->sockc, &cork->base.tx_flags); 1416 if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) { 1417 cork->base.flags |= IPCORK_TS_OPT_ID; 1418 cork->base.ts_opt_id = ipc6->sockc.ts_opt_id; 1419 } 1420 cork->base.length = 0; 1421 cork->base.transmit_time = ipc6->sockc.transmit_time; 1422 1423 return 0; 1424 } 1425 1426 static int __ip6_append_data(struct sock *sk, 1427 struct sk_buff_head *queue, 1428 struct inet_cork_full *cork_full, 1429 struct page_frag *pfrag, 1430 int getfrag(void *from, char *to, int offset, 1431 int len, int odd, struct sk_buff *skb), 1432 void *from, size_t length, int transhdrlen, 1433 unsigned int flags) 1434 { 1435 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu; 1436 struct inet6_cork *v6_cork = &cork_full->base6; 1437 struct inet_cork *cork = &cork_full->base; 1438 struct flowi6 *fl6 = &cork_full->fl.u.ip6; 1439 struct sk_buff *skb, *skb_prev = NULL; 1440 struct ubuf_info *uarg = NULL; 1441 int exthdrlen = 0; 1442 int dst_exthdrlen = 0; 1443 int hh_len; 1444 int copy; 1445 int err; 1446 int offset = 0; 1447 bool zc = false; 1448 u32 tskey = 0; 1449 struct rt6_info *rt = dst_rt6_info(cork->dst); 1450 bool paged, hold_tskey = false, extra_uref = false; 1451 struct ipv6_txoptions *opt = v6_cork->opt; 1452 int csummode = CHECKSUM_NONE; 1453 unsigned int maxnonfragsize, headersize; 1454 unsigned int wmem_alloc_delta = 0; 1455 1456 skb = skb_peek_tail(queue); 1457 if (!skb) { 1458 exthdrlen = opt ? 
opt->opt_flen : 0; 1459 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len; 1460 } 1461 1462 paged = !!cork->gso_size; 1463 mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize; 1464 orig_mtu = mtu; 1465 1466 hh_len = LL_RESERVED_SPACE(rt->dst.dev); 1467 1468 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + 1469 (opt ? opt->opt_nflen : 0); 1470 1471 headersize = sizeof(struct ipv6hdr) + 1472 (opt ? opt->opt_flen + opt->opt_nflen : 0) + 1473 rt->rt6i_nfheader_len; 1474 1475 if (mtu <= fragheaderlen || 1476 ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr)) 1477 goto emsgsize; 1478 1479 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - 1480 sizeof(struct frag_hdr); 1481 1482 /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit 1483 * the first fragment 1484 */ 1485 if (headersize + transhdrlen > mtu) 1486 goto emsgsize; 1487 1488 if (cork->length + length > mtu - headersize && v6_cork->dontfrag && 1489 (sk->sk_protocol == IPPROTO_UDP || 1490 sk->sk_protocol == IPPROTO_ICMPV6 || 1491 sk->sk_protocol == IPPROTO_RAW)) { 1492 ipv6_local_rxpmtu(sk, fl6, mtu - headersize + 1493 sizeof(struct ipv6hdr)); 1494 goto emsgsize; 1495 } 1496 1497 if (ip6_sk_ignore_df(sk)) 1498 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN; 1499 else 1500 maxnonfragsize = mtu; 1501 1502 if (cork->length + length > maxnonfragsize - headersize) { 1503 emsgsize: 1504 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0); 1505 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu); 1506 return -EMSGSIZE; 1507 } 1508 1509 /* CHECKSUM_PARTIAL only with no extension headers and when 1510 * we are not going to fragment 1511 */ 1512 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP && 1513 headersize == sizeof(struct ipv6hdr) && 1514 length <= mtu - headersize && 1515 (!(flags & MSG_MORE) || cork->gso_size) && 1516 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) 1517 csummode = CHECKSUM_PARTIAL; 1518 1519 if ((flags & 
MSG_ZEROCOPY) && length) { 1520 struct msghdr *msg = from; 1521 1522 if (getfrag == ip_generic_getfrag && msg->msg_ubuf) { 1523 if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb)) 1524 return -EINVAL; 1525 1526 /* Leave uarg NULL if can't zerocopy, callers should 1527 * be able to handle it. 1528 */ 1529 if ((rt->dst.dev->features & NETIF_F_SG) && 1530 csummode == CHECKSUM_PARTIAL) { 1531 paged = true; 1532 zc = true; 1533 uarg = msg->msg_ubuf; 1534 } 1535 } else if (sock_flag(sk, SOCK_ZEROCOPY)) { 1536 uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb), 1537 false); 1538 if (!uarg) 1539 return -ENOBUFS; 1540 extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ 1541 if (rt->dst.dev->features & NETIF_F_SG && 1542 csummode == CHECKSUM_PARTIAL) { 1543 paged = true; 1544 zc = true; 1545 } else { 1546 uarg_to_msgzc(uarg)->zerocopy = 0; 1547 skb_zcopy_set(skb, uarg, &extra_uref); 1548 } 1549 } 1550 } else if ((flags & MSG_SPLICE_PAGES) && length) { 1551 if (inet_test_bit(HDRINCL, sk)) 1552 return -EPERM; 1553 if (rt->dst.dev->features & NETIF_F_SG && 1554 getfrag == ip_generic_getfrag) 1555 /* We need an empty buffer to attach stuff to */ 1556 paged = true; 1557 else 1558 flags &= ~MSG_SPLICE_PAGES; 1559 } 1560 1561 if (cork->tx_flags & SKBTX_ANY_TSTAMP && 1562 READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) { 1563 if (cork->flags & IPCORK_TS_OPT_ID) { 1564 tskey = cork->ts_opt_id; 1565 } else { 1566 tskey = atomic_inc_return(&sk->sk_tskey) - 1; 1567 hold_tskey = true; 1568 } 1569 } 1570 1571 /* 1572 * Let's try using as much space as possible. 1573 * Use MTU if total length of the message fits into the MTU. 1574 * Otherwise, we need to reserve fragment header and 1575 * fragment alignment (= 8-15 octects, in total). 1576 * 1577 * Note that we may need to "move" the data from the tail 1578 * of the buffer to the new fragment when we split 1579 * the message. 
1580 * 1581 * FIXME: It may be fragmented into multiple chunks 1582 * at once if non-fragmentable extension headers 1583 * are too large. 1584 * --yoshfuji 1585 */ 1586 1587 cork->length += length; 1588 if (!skb) 1589 goto alloc_new_skb; 1590 1591 while (length > 0) { 1592 /* Check if the remaining data fits into current packet. */ 1593 copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len; 1594 if (copy < length) 1595 copy = maxfraglen - skb->len; 1596 1597 if (copy <= 0) { 1598 char *data; 1599 unsigned int datalen; 1600 unsigned int fraglen; 1601 unsigned int fraggap; 1602 unsigned int alloclen, alloc_extra; 1603 unsigned int pagedlen; 1604 alloc_new_skb: 1605 /* There's no room in the current skb */ 1606 if (skb) 1607 fraggap = skb->len - maxfraglen; 1608 else 1609 fraggap = 0; 1610 /* update mtu and maxfraglen if necessary */ 1611 if (!skb || !skb_prev) 1612 ip6_append_data_mtu(&mtu, &maxfraglen, 1613 fragheaderlen, skb, rt, 1614 orig_mtu); 1615 1616 skb_prev = skb; 1617 1618 /* 1619 * If remaining data exceeds the mtu, 1620 * we know we need more fragment(s). 1621 */ 1622 datalen = length + fraggap; 1623 1624 if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen) 1625 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len; 1626 fraglen = datalen + fragheaderlen; 1627 pagedlen = 0; 1628 1629 alloc_extra = hh_len; 1630 alloc_extra += dst_exthdrlen; 1631 alloc_extra += rt->dst.trailer_len; 1632 1633 /* We just reserve space for fragment header. 1634 * Note: this may be overallocation if the message 1635 * (without MSG_MORE) fits into the MTU. 
1636 */ 1637 alloc_extra += sizeof(struct frag_hdr); 1638 1639 if ((flags & MSG_MORE) && 1640 !(rt->dst.dev->features&NETIF_F_SG)) 1641 alloclen = mtu; 1642 else if (!paged && 1643 (fraglen + alloc_extra < SKB_MAX_ALLOC || 1644 !(rt->dst.dev->features & NETIF_F_SG))) 1645 alloclen = fraglen; 1646 else { 1647 alloclen = fragheaderlen + transhdrlen; 1648 pagedlen = datalen - transhdrlen; 1649 } 1650 alloclen += alloc_extra; 1651 1652 if (datalen != length + fraggap) { 1653 /* 1654 * this is not the last fragment, the trailer 1655 * space is regarded as data space. 1656 */ 1657 datalen += rt->dst.trailer_len; 1658 } 1659 1660 fraglen = datalen + fragheaderlen; 1661 1662 copy = datalen - transhdrlen - fraggap - pagedlen; 1663 /* [!] NOTE: copy may be negative if pagedlen>0 1664 * because then the equation may reduces to -fraggap. 1665 */ 1666 if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) { 1667 err = -EINVAL; 1668 goto error; 1669 } 1670 if (transhdrlen) { 1671 skb = sock_alloc_send_skb(sk, alloclen, 1672 (flags & MSG_DONTWAIT), &err); 1673 } else { 1674 skb = NULL; 1675 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <= 1676 2 * sk->sk_sndbuf) 1677 skb = alloc_skb(alloclen, 1678 sk->sk_allocation); 1679 if (unlikely(!skb)) 1680 err = -ENOBUFS; 1681 } 1682 if (!skb) 1683 goto error; 1684 /* 1685 * Fill in the control structures 1686 */ 1687 skb->protocol = htons(ETH_P_IPV6); 1688 skb->ip_summed = csummode; 1689 skb->csum = 0; 1690 /* reserve for fragmentation and ipsec header */ 1691 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) + 1692 dst_exthdrlen); 1693 1694 /* 1695 * Find where to start putting bytes 1696 */ 1697 data = skb_put(skb, fraglen - pagedlen); 1698 skb_set_network_header(skb, exthdrlen); 1699 data += fragheaderlen; 1700 skb->transport_header = (skb->network_header + 1701 fragheaderlen); 1702 if (fraggap) { 1703 skb->csum = skb_copy_and_csum_bits( 1704 skb_prev, maxfraglen, 1705 data + transhdrlen, fraggap); 1706 skb_prev->csum = 
csum_sub(skb_prev->csum, 1707 skb->csum); 1708 data += fraggap; 1709 pskb_trim_unique(skb_prev, maxfraglen); 1710 } 1711 if (copy > 0 && 1712 INDIRECT_CALL_1(getfrag, ip_generic_getfrag, 1713 from, data + transhdrlen, offset, 1714 copy, fraggap, skb) < 0) { 1715 err = -EFAULT; 1716 kfree_skb(skb); 1717 goto error; 1718 } else if (flags & MSG_SPLICE_PAGES) { 1719 copy = 0; 1720 } 1721 1722 offset += copy; 1723 length -= copy + transhdrlen; 1724 transhdrlen = 0; 1725 exthdrlen = 0; 1726 dst_exthdrlen = 0; 1727 1728 /* Only the initial fragment is time stamped */ 1729 skb_shinfo(skb)->tx_flags = cork->tx_flags; 1730 cork->tx_flags = 0; 1731 skb_shinfo(skb)->tskey = tskey; 1732 tskey = 0; 1733 skb_zcopy_set(skb, uarg, &extra_uref); 1734 1735 if ((flags & MSG_CONFIRM) && !skb_prev) 1736 skb_set_dst_pending_confirm(skb, 1); 1737 1738 /* 1739 * Put the packet on the pending queue 1740 */ 1741 if (!skb->destructor) { 1742 skb->destructor = sock_wfree; 1743 skb->sk = sk; 1744 wmem_alloc_delta += skb->truesize; 1745 } 1746 __skb_queue_tail(queue, skb); 1747 continue; 1748 } 1749 1750 if (copy > length) 1751 copy = length; 1752 1753 if (!(rt->dst.dev->features&NETIF_F_SG) && 1754 skb_tailroom(skb) >= copy) { 1755 unsigned int off; 1756 1757 off = skb->len; 1758 if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, 1759 from, skb_put(skb, copy), 1760 offset, copy, off, skb) < 0) { 1761 __skb_trim(skb, off); 1762 err = -EFAULT; 1763 goto error; 1764 } 1765 } else if (flags & MSG_SPLICE_PAGES) { 1766 struct msghdr *msg = from; 1767 1768 err = -EIO; 1769 if (WARN_ON_ONCE(copy > msg->msg_iter.count)) 1770 goto error; 1771 1772 err = skb_splice_from_iter(skb, &msg->msg_iter, copy); 1773 if (err < 0) 1774 goto error; 1775 copy = err; 1776 wmem_alloc_delta += copy; 1777 } else if (!zc) { 1778 int i = skb_shinfo(skb)->nr_frags; 1779 1780 err = -ENOMEM; 1781 if (!sk_page_frag_refill(sk, pfrag)) 1782 goto error; 1783 1784 skb_zcopy_downgrade_managed(skb); 1785 if (!skb_can_coalesce(skb, i, 
pfrag->page, 1786 pfrag->offset)) { 1787 err = -EMSGSIZE; 1788 if (i == MAX_SKB_FRAGS) 1789 goto error; 1790 1791 __skb_fill_page_desc(skb, i, pfrag->page, 1792 pfrag->offset, 0); 1793 skb_shinfo(skb)->nr_frags = ++i; 1794 get_page(pfrag->page); 1795 } 1796 copy = min_t(int, copy, pfrag->size - pfrag->offset); 1797 if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, 1798 from, 1799 page_address(pfrag->page) + pfrag->offset, 1800 offset, copy, skb->len, skb) < 0) 1801 goto error_efault; 1802 1803 pfrag->offset += copy; 1804 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); 1805 skb->len += copy; 1806 skb->data_len += copy; 1807 skb->truesize += copy; 1808 wmem_alloc_delta += copy; 1809 } else { 1810 err = skb_zerocopy_iter_dgram(skb, from, copy); 1811 if (err < 0) 1812 goto error; 1813 } 1814 offset += copy; 1815 length -= copy; 1816 } 1817 1818 if (wmem_alloc_delta) 1819 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); 1820 return 0; 1821 1822 error_efault: 1823 err = -EFAULT; 1824 error: 1825 net_zcopy_put_abort(uarg, extra_uref); 1826 cork->length -= length; 1827 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); 1828 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); 1829 if (hold_tskey) 1830 atomic_dec(&sk->sk_tskey); 1831 return err; 1832 } 1833 1834 int ip6_append_data(struct sock *sk, 1835 int getfrag(void *from, char *to, int offset, int len, 1836 int odd, struct sk_buff *skb), 1837 void *from, size_t length, int transhdrlen, 1838 struct ipcm6_cookie *ipc6, struct flowi6 *fl6, 1839 struct rt6_info *rt, unsigned int flags) 1840 { 1841 struct inet_sock *inet = inet_sk(sk); 1842 int exthdrlen; 1843 int err; 1844 1845 if (flags&MSG_PROBE) 1846 return 0; 1847 if (skb_queue_empty(&sk->sk_write_queue)) { 1848 /* 1849 * setup for corking 1850 */ 1851 dst_hold(&rt->dst); 1852 err = ip6_setup_cork(sk, &inet->cork, 1853 ipc6, rt); 1854 if (err) 1855 return err; 1856 1857 inet->cork.fl.u.ip6 = *fl6; 1858 exthdrlen = (ipc6->opt ? 
ipc6->opt->opt_flen : 0); 1859 length += exthdrlen; 1860 transhdrlen += exthdrlen; 1861 } else { 1862 transhdrlen = 0; 1863 } 1864 1865 return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork, 1866 sk_page_frag(sk), getfrag, 1867 from, length, transhdrlen, flags); 1868 } 1869 EXPORT_SYMBOL_GPL(ip6_append_data); 1870 1871 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork) 1872 { 1873 struct dst_entry *dst = cork->base.dst; 1874 1875 cork->base.dst = NULL; 1876 skb_dst_set(skb, dst); 1877 } 1878 1879 static void ip6_cork_release(struct inet_cork_full *cork) 1880 { 1881 struct inet6_cork *v6_cork = &cork->base6; 1882 1883 if (unlikely(v6_cork->opt)) { 1884 struct ipv6_txoptions *opt = v6_cork->opt; 1885 1886 kfree(opt->dst0opt); 1887 kfree(opt->dst1opt); 1888 kfree(opt->hopopt); 1889 kfree(opt->srcrt); 1890 kfree(opt); 1891 v6_cork->opt = NULL; 1892 } 1893 1894 if (cork->base.dst) { 1895 dst_release(cork->base.dst); 1896 cork->base.dst = NULL; 1897 } 1898 } 1899 1900 struct sk_buff *__ip6_make_skb(struct sock *sk, 1901 struct sk_buff_head *queue, 1902 struct inet_cork_full *cork) 1903 { 1904 struct sk_buff *skb, *tmp_skb; 1905 struct sk_buff **tail_skb; 1906 struct in6_addr *final_dst; 1907 struct net *net = sock_net(sk); 1908 struct ipv6hdr *hdr; 1909 struct ipv6_txoptions *opt; 1910 struct rt6_info *rt = dst_rt6_info(cork->base.dst); 1911 struct flowi6 *fl6 = &cork->fl.u.ip6; 1912 unsigned char proto = fl6->flowi6_proto; 1913 1914 skb = __skb_dequeue(queue); 1915 if (!skb) 1916 goto out; 1917 tail_skb = &(skb_shinfo(skb)->frag_list); 1918 1919 /* move skb->data to ip header from ext header */ 1920 if (skb->data < skb_network_header(skb)) 1921 __skb_pull(skb, skb_network_offset(skb)); 1922 while ((tmp_skb = __skb_dequeue(queue)) != NULL) { 1923 __skb_pull(tmp_skb, skb_network_header_len(skb)); 1924 *tail_skb = tmp_skb; 1925 tail_skb = &(tmp_skb->next); 1926 skb->len += tmp_skb->len; 1927 skb->data_len += tmp_skb->len; 1928 
skb->truesize += tmp_skb->truesize; 1929 tmp_skb->destructor = NULL; 1930 tmp_skb->sk = NULL; 1931 } 1932 1933 /* Allow local fragmentation. */ 1934 skb->ignore_df = ip6_sk_ignore_df(sk); 1935 __skb_pull(skb, skb_network_header_len(skb)); 1936 1937 final_dst = &fl6->daddr; 1938 opt = cork->base6.opt; 1939 if (unlikely(opt)) { 1940 if (opt->opt_flen) 1941 proto = ipv6_push_frag_opts(skb, opt, proto); 1942 if (opt->opt_nflen) 1943 proto = ipv6_push_nfrag_opts(skb, opt, proto, 1944 &final_dst, &fl6->saddr); 1945 } 1946 skb_push(skb, sizeof(struct ipv6hdr)); 1947 skb_reset_network_header(skb); 1948 hdr = ipv6_hdr(skb); 1949 1950 ip6_flow_hdr(hdr, cork->base6.tclass, 1951 ip6_make_flowlabel(net, skb, fl6->flowlabel, 1952 ip6_autoflowlabel(net, sk), fl6)); 1953 hdr->hop_limit = cork->base6.hop_limit; 1954 hdr->nexthdr = proto; 1955 hdr->saddr = fl6->saddr; 1956 hdr->daddr = *final_dst; 1957 1958 skb->priority = cork->base.priority; 1959 skb->mark = cork->base.mark; 1960 if (sk_is_tcp(sk)) 1961 skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC); 1962 else 1963 skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid); 1964 1965 ip6_cork_steal_dst(skb, cork); 1966 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS); 1967 if (unlikely(proto == IPPROTO_ICMPV6)) { 1968 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); 1969 u8 icmp6_type; 1970 1971 if (sk->sk_socket->type == SOCK_RAW && 1972 !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH)) 1973 icmp6_type = fl6->fl6_icmp_type; 1974 else 1975 icmp6_type = icmp6_hdr(skb)->icmp6_type; 1976 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type); 1977 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); 1978 } 1979 1980 ip6_cork_release(cork); 1981 out: 1982 return skb; 1983 } 1984 1985 int ip6_send_skb(struct sk_buff *skb) 1986 { 1987 struct net *net = sock_net(skb->sk); 1988 struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); 1989 int err; 1990 1991 rcu_read_lock(); 1992 err = ip6_local_out(net, 
skb->sk, skb); 1993 if (err) { 1994 if (err > 0) 1995 err = net_xmit_errno(err); 1996 if (err) 1997 IP6_INC_STATS(net, rt->rt6i_idev, 1998 IPSTATS_MIB_OUTDISCARDS); 1999 } 2000 2001 rcu_read_unlock(); 2002 return err; 2003 } 2004 2005 int ip6_push_pending_frames(struct sock *sk) 2006 { 2007 struct sk_buff *skb; 2008 2009 skb = ip6_finish_skb(sk); 2010 if (!skb) 2011 return 0; 2012 2013 return ip6_send_skb(skb); 2014 } 2015 EXPORT_SYMBOL_GPL(ip6_push_pending_frames); 2016 2017 static void __ip6_flush_pending_frames(struct sock *sk, 2018 struct sk_buff_head *queue, 2019 struct inet_cork_full *cork) 2020 { 2021 struct sk_buff *skb; 2022 2023 while ((skb = __skb_dequeue_tail(queue)) != NULL) { 2024 if (skb_dst(skb)) 2025 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)), 2026 IPSTATS_MIB_OUTDISCARDS); 2027 kfree_skb(skb); 2028 } 2029 2030 ip6_cork_release(cork); 2031 } 2032 2033 void ip6_flush_pending_frames(struct sock *sk) 2034 { 2035 __ip6_flush_pending_frames(sk, &sk->sk_write_queue, 2036 &inet_sk(sk)->cork); 2037 } 2038 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames); 2039 2040 struct sk_buff *ip6_make_skb(struct sock *sk, 2041 int getfrag(void *from, char *to, int offset, 2042 int len, int odd, struct sk_buff *skb), 2043 void *from, size_t length, int transhdrlen, 2044 struct ipcm6_cookie *ipc6, struct rt6_info *rt, 2045 unsigned int flags, struct inet_cork_full *cork) 2046 { 2047 int exthdrlen = (ipc6->opt ? 
ipc6->opt->opt_flen : 0); 2048 struct sk_buff_head queue; 2049 int err; 2050 2051 if (flags & MSG_PROBE) { 2052 dst_release(&rt->dst); 2053 return NULL; 2054 } 2055 2056 __skb_queue_head_init(&queue); 2057 2058 cork->base.flags = 0; 2059 cork->base.addr = 0; 2060 cork->base.opt = NULL; 2061 cork->base6.opt = NULL; 2062 err = ip6_setup_cork(sk, cork, ipc6, rt); 2063 if (err) { 2064 ip6_cork_release(cork); 2065 return ERR_PTR(err); 2066 } 2067 2068 err = __ip6_append_data(sk, &queue, cork, 2069 ¤t->task_frag, getfrag, from, 2070 length + exthdrlen, transhdrlen + exthdrlen, 2071 flags); 2072 if (err) { 2073 __ip6_flush_pending_frames(sk, &queue, cork); 2074 return ERR_PTR(err); 2075 } 2076 2077 return __ip6_make_skb(sk, &queue, cork); 2078 } 2079