// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/gso.h>
#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>

static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst_dev_rcu(dst);
	struct inet6_dev *idev = ip6_dst_idev(dst);
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	const struct in6_addr *daddr, *nexthop;
	struct ipv6hdr *hdr;
	struct neighbour *neigh;
	int ret;

	/* Be paranoid, rather than too clever. */
	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
		/* idev stays alive because we hold rcu_read_lock(). */
		skb = skb_expand_head(skb, hh_len);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			return -ENOMEM;
		}
	}

	hdr = ipv6_hdr(skb);
	daddr = &hdr->daddr;
	if (ipv6_addr_is_multicast(daddr)) {
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			 * is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (hdr->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res != LWTUNNEL_XMIT_CONTINUE)
			return res;
	}

	IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);

	nexthop = rt6_nexthop(dst_rt6_info(dst), daddr);
	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);

	if (IS_ERR_OR_NULL(neigh)) {
		if (unlikely(!neigh))
			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
		if (IS_ERR(neigh)) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
			return -EINVAL;
		}
	}
	sock_confirm_neigh(skb, neigh);
	ret = neigh_output(neigh, skb, false);
	return ret;
}

static int
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
				    struct sk_buff *skb, unsigned int mtu)
{
	struct sk_buff *segs, *nskb;
	netdev_features_t features;
	int ret = 0;

	/* Please see corresponding comment in ip_finish_output_gso
	 * describing the cases where GSO segment length exceeds the
	 * egress MTU.
	 */
	features = netif_skb_features(skb);
	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
	if (IS_ERR_OR_NULL(segs)) {
		kfree_skb(skb);
		return -ENOMEM;
	}

	consume_skb(skb);

	skb_list_walk_safe(segs, segs, nskb) {
		int err;

		skb_mark_not_on_list(segs);
		/* Last GSO segment can be smaller than gso_size (and MTU).
		 * Adding a fragment header would produce an "atomic fragment",
		 * which is considered harmful (RFC-8021). Avoid that.
		 */
		err = segs->len > mtu ?
			ip6_fragment(net, sk, segs, ip6_finish_output2) :
			ip6_finish_output2(net, sk, segs);
		if (err && ret == 0)
			ret = err;
	}

	return ret;
}

static int ip6_finish_output_gso(struct net *net, struct sock *sk,
				 struct sk_buff *skb, unsigned int mtu)
{
	if (!(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
	    !skb_gso_validate_network_len(skb, mtu))
		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);

	return ip6_finish_output2(net, sk, skb);
}

static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	unsigned int mtu;

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IP6CB(skb)->flags |= IP6SKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	mtu = ip6_skb_dst_mtu(skb);
	if (skb_is_gso(skb))
		return ip6_finish_output_gso(net, sk, skb, mtu);

	if (skb->len > mtu ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);

	return ip6_finish_output2(net, sk, skb);
}

static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	switch (ret) {
	case NET_XMIT_SUCCESS:
	case NET_XMIT_CN:
		return __ip6_finish_output(net, sk, skb) ? : ret;
	default:
		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
		return ret;
	}
}

int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev, *indev = skb->dev;
	struct inet6_dev *idev;
	int ret;

	skb->protocol = htons(ETH_P_IPV6);
	rcu_read_lock();
	dev = dst_dev_rcu(dst);
	idev = ip6_dst_idev(dst);
	skb->dev = dev;

	if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		rcu_read_unlock();
		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
		return 0;
	}

	ret = NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			   net, sk, skb, indev, dev,
			   ip6_finish_output,
			   !(IP6CB(skb)->flags & IP6SKB_REROUTED));
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(ip6_output);

bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
{
	if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk))
		return ip6_default_np_autolabel(net);
	return inet6_test_bit(AUTOFLOWLABEL, sk);
}

/*
 * xmit an sk_buff (used by TCP and SCTP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct inet6_dev *idev = ip6_dst_idev(dst);
	struct hop_jumbo_hdr *hop_jumbo;
	int hoplen = sizeof(*hop_jumbo);
	struct net *net = sock_net(sk);
	unsigned int head_room;
	struct net_device *dev;
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int ret, hlimit = -1;
	u32 mtu;

	rcu_read_lock();

	dev = dst_dev_rcu(dst);
	head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(head_room > skb_headroom(skb))) {
		/* idev stays alive while we hold rcu_read_lock(). */
		skb = skb_expand_head(skb, head_room);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			ret = -ENOBUFS;
			goto unlock;
		}
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	if (unlikely(seg_len > IPV6_MAXPLEN)) {
		hop_jumbo = skb_push(skb, hoplen);

		hop_jumbo->nexthdr = proto;
		hop_jumbo->hdrlen = 0;
		hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
		hop_jumbo->tlv_len = 4;
		hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);

		proto = IPPROTO_HOPOPTS;
		seg_len = 0;
		IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = READ_ONCE(np->hop_limit);
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
						     ip6_autoflowlabel(net, sk), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb)) {
			ret = 0;
			goto unlock;
		}

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		ret = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			      net, (struct sock *)sk, skb, NULL, dev,
			      dst_output);
		goto unlock;
	}

	ret = -EMSGSIZE;
	skb->dev = dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
unlock:
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(ip6_xmit);

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;

		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);

				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
#ifdef CONFIG_NET_SWITCHDEV
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	skb_clear_tstamp(skb);
	return dst_output(net, sk, skb);
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}

int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst_dev(dst));
	struct net_device *dev;
	struct inet6_dev *idev;
	SKB_DR(reason);
	u32 mtu;

	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	if (!READ_ONCE(net->ipv6.devconf_all->forwarding) &&
	    (!idev || !READ_ONCE(idev->cnf.force_forwarding)))
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!READ_ONCE(net->ipv6.devconf_all->disable_policy) &&
	    (!idev || !READ_ONCE(idev->cnf.disable_policy)) &&
	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be a mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) {
		int proxied = ip6_forward_proxy_check(skb);

		if (proxied > 0) {
			/* It's tempting to decrease the hop limit
			 * here by 1, as we do at the end of the
			 * function too.
			 *
			 * But that would be incorrect, as proxying is
			 * not forwarding. The ip6_input function
			 * will handle this packet locally, and it
			 * depends on the hop limit being unchanged.
			 *
			 * One example is the NDP hop limit, that
			 * always has to stay 255, but others would be
			 * similar checks around RA packets, where the
			 * user can even change the desired limit.
			 */
			return ip6_input(skb);
		} else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		SKB_DR_SET(reason, XFRM_POLICY);
		goto drop;
	}
	dst = skb_dst(skb);
	dev = dst_dev(dst);
	/* IPv6 specs say nothing about it, but it is clear that we cannot
	 * send redirects to source routed frames.
	 * We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = dst_rt6_info(dst);
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		rcu_read_lock();
		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr);

		/* Limit redirects both by destination (here)
		 * and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		rcu_read_unlock();
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);

	mtu = ip6_dst_mtu_maybe_forward(dst, true);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
	SKB_DR_SET(reason, IP_INADDRERRORS);
drop:
	kfree_skb_reason(skb, reason);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_ext_copy(to, from);
	skb_copy_secmark(to, from);
}

int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
		      u8 nexthdr, __be32 frag_id,
		      struct ip6_fraglist_iter *iter)
{
	unsigned int first_len;
	struct frag_hdr *fh;

	/* BUILD HEADER */
	*prevhdr = NEXTHDR_FRAGMENT;
	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
	if (!iter->tmp_hdr)
		return -ENOMEM;

	iter->frag = skb_shinfo(skb)->frag_list;
	skb_frag_list_init(skb);

	iter->offset = 0;
	iter->hlen = hlen;
	iter->frag_id = frag_id;
	iter->nexthdr = nexthdr;

	__skb_pull(skb, hlen);
	fh = __skb_push(skb, sizeof(struct frag_hdr));
	__skb_push(skb, hlen);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

	fh->nexthdr = nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(IP6_MF);
	fh->identification = frag_id;

	first_len = skb_pagelen(skb);
	skb->data_len = first_len - skb_headlen(skb);
	skb->len = first_len;
	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

	return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);

void ip6_fraglist_prepare(struct sk_buff *skb,
			  struct ip6_fraglist_iter *iter)
{
	struct sk_buff *frag = iter->frag;
	unsigned int hlen = iter->hlen;
	struct frag_hdr *fh;

	frag->ip_summed = CHECKSUM_NONE;
	skb_reset_transport_header(frag);
	fh = __skb_push(frag, sizeof(struct frag_hdr));
	__skb_push(frag, hlen);
	skb_reset_network_header(frag);
	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
	fh->nexthdr = iter->nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(iter->offset);
	if (frag->next)
		fh->frag_off |= htons(IP6_MF);
	fh->identification = iter->frag_id;
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
	ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);

void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
{
	state->prevhdr = prevhdr;
	state->nexthdr = nexthdr;
	state->frag_id = frag_id;

	state->hlen = hlen;
	state->mtu = mtu;

	state->left = skb->len - hlen;	/* Space per frame */
	state->ptr = hlen;		/* Where to start from */

	state->hroom = hdr_room;
	state->troom = needed_tailroom;

	state->offset = 0;
}
EXPORT_SYMBOL(ip6_frag_init);

struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
	struct sk_buff *frag;
	struct frag_hdr *fh;
	unsigned int len;

	len = state->left;
	/* IF: it doesn't fit, use 'mtu' - the data space left */
	if (len > state->mtu)
		len = state->mtu;
	/* IF: we are not sending up to and including the packet end
	 * then align the next start on an eight byte boundary
	 */
	if (len < state->left)
		len &= ~7;

	/* Allocate buffer */
	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
			 state->hroom + state->troom, GFP_ATOMIC);
	if (!frag)
		return ERR_PTR(-ENOMEM);

	/*
	 *	Set up data on packet
	 */

	ip6_copy_metadata(frag, skb);
	skb_reserve(frag, state->hroom);
	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
	skb_reset_network_header(frag);
	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
	frag->transport_header = (frag->network_header + state->hlen +
				  sizeof(struct frag_hdr));

	/*
	 *	Charge the memory for the fragment to any owner
	 *	it might possess
	 */
	if (skb->sk)
		skb_set_owner_w(frag, skb->sk);

	/*
	 *	Copy the packet header into the new buffer.
	 */
	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

	fragnexthdr_offset = skb_network_header(frag);
	fragnexthdr_offset += prevhdr - skb_network_header(skb);
	*fragnexthdr_offset = NEXTHDR_FRAGMENT;

	/*
	 *	Build fragment header.
	 */
	fh->nexthdr = state->nexthdr;
	fh->reserved = 0;
	fh->identification = state->frag_id;

	/*
	 *	Copy a block of the IP datagram.
	 */
	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
			     len));
	state->left -= len;

	fh->frag_off = htons(state->offset);
	if (state->left > 0)
		fh->frag_off |= htons(IP6_MF);
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

	state->ptr += len;
	state->offset += len;

	return frag;
}
EXPORT_SYMBOL(ip6_frag_next);

int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	u8 tstamp_type = skb->tstamp_type;
	struct ip6_frag_state state;
	unsigned int mtu, hlen, nexthdr_offset;
	ktime_t tstamp = skb->tstamp;
	int hroom, err = 0;
	__be32 frag_id;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np) {
		u32 frag_size = READ_ONCE(np->frag_size);

		if (frag_size && frag_size < mtu)
			mtu = frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct ip6_fraglist_iter iter;
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
					&iter);
		if (err < 0)
			goto fail;

		/* We prevent @rt from being freed. */
		rcu_read_lock();

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (iter.frag)
				ip6_fraglist_prepare(skb, &iter);

			skb_set_delivery_time(skb, tstamp, tstamp_type);
			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !iter.frag)
				break;

			skb = ip6_fraglist_next(&iter);
		}

		kfree(iter.tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			rcu_read_unlock();
			return 0;
		}

		kfree_skb_list(iter.frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		rcu_read_unlock();
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	/*
	 *	Fragment the datagram.
	 */

	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
		      &state);

	/*
	 *	Keep copying data until we run out.
	 */

	while (state.left > 0) {
		frag = ip6_frag_next(skb, &state);
		if (IS_ERR(frag)) {
			err = PTR_ERR(frag);
			goto fail;
		}

		/*
		 *	Put this fragment into the sending queue.
		 */
		skb_set_delivery_time(frag, tstamp, tstamp_type);
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = dst_rt6_info(dst);
	/* Yes, checking route validity in the non-connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr)) {
		struct fib6_info *from;
		struct rt6_info *rt;

		*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : dst_rt6_info(*dst);

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
					  fl6->flowi6_l3mdev,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if ((*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = dst_rt6_info(*dst);
	rcu_read_lock();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: Network namespace to perform lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@net: Network namespace to perform lookup in
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@connected: whether @sk is connected or not
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	In addition, for a connected socket, cache the dst in the socket
 *	if the current cache is not valid.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu, frag_size;
	struct ipv6_txoptions *nopt, *opt = ipc6->opt;

	/* callers pass dst together with a reference, set it first so
	 * ip6_cork_release() can put it down even in case of an error.
	 */
	cork->base.dst = &rt->dst;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!nopt))
			return -ENOBUFS;

		nopt->tot_len = sizeof(*opt);
		nopt->opt_flen = opt->opt_flen;
		nopt->opt_nflen = opt->opt_nflen;

		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
		if (opt->dst0opt && !nopt->dst0opt)
			return -ENOBUFS;

		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
		if (opt->dst1opt && !nopt->dst1opt)
			return -ENOBUFS;

		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
		if (opt->hopopt && !nopt->hopopt)
			return -ENOBUFS;

		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
		if (opt->srcrt && !nopt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa */
	}
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	v6_cork->dontfrag = ipc6->dontfrag;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));

	frag_size = READ_ONCE(np->frag_size);
	if (frag_size && frag_size < mtu)
		mtu = frag_size;

	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	cork->base.mark = ipc6->sockc.mark;
	cork->base.priority = ipc6->sockc.priority;
	sock_tx_timestamp(sk, &ipc6->sockc, &cork->base.tx_flags);
	if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) {
		cork->base.flags |= IPCORK_TS_OPT_ID;
		cork->base.ts_opt_id = ipc6->sockc.ts_opt_id;
	}
	cork->base.length = 0;
	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}

static int __ip6_append_data(struct sock *sk,
			     struct sk_buff_head *queue,
			     struct inet_cork_full *cork_full,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     unsigned int flags)
{
	struct sk_buff *skb, *skb_prev = NULL;
	struct inet_cork *cork = &cork_full->base;
	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	bool zc = false;
	u32 tskey = 0;
	struct rt6_info *rt = dst_rt6_info(cork->dst);
	bool paged, hold_tskey = false, extra_uref = false;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     rt->rt6i_nfheader_len;

	if (mtu <= fragheaderlen ||
	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
		goto emsgsize;

	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && v6_cork->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_ICMPV6 ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				  sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if ((flags & MSG_ZEROCOPY) && length) {
		struct msghdr *msg = from;

		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
				return -EINVAL;

			/* Leave uarg NULL if can't zerocopy, callers should
			 * be able to handle it.
			 */
			if ((rt->dst.dev->features & NETIF_F_SG) &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
				uarg = msg->msg_ubuf;
			}
		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb),
						    false);
			if (!uarg)
				return -ENOBUFS;
			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
			if (rt->dst.dev->features & NETIF_F_SG &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
			} else {
				uarg_to_msgzc(uarg)->zerocopy = 0;
				skb_zcopy_set(skb, uarg, &extra_uref);
			}
		}
	} else if ((flags & MSG_SPLICE_PAGES) && length) {
		if (inet_test_bit(HDRINCL, sk))
			return -EPERM;
		if (rt->dst.dev->features & NETIF_F_SG &&
		    getfrag == ip_generic_getfrag)
			/* We need an empty buffer to attach stuff to */
			paged = true;
		else
			flags &= ~MSG_SPLICE_PAGES;
	}

	if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
	    READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
		if (cork->flags & IPCORK_TS_OPT_ID) {
			tskey = cork->ts_opt_id;
		} else {
			tskey = atomic_inc_return(&sk->sk_tskey) - 1;
			hold_tskey = true;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen, alloc_extra;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			alloc_extra = hh_len;
			alloc_extra += dst_exthdrlen;
			alloc_extra += rt->dst.trailer_len;

			/* We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloc_extra += sizeof(struct frag_hdr);

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else if (!paged &&
				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
				  !(rt->dst.dev->features & NETIF_F_SG)))
				alloclen = fraglen;
			else {
				alloclen = fragheaderlen + transhdrlen;
				pagedlen = datalen - transhdrlen;
			}
			alloclen += alloc_extra;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			fraglen = datalen + fragheaderlen;

			copy = datalen - transhdrlen - fraggap - pagedlen;
			/* [!] NOTE: copy may be negative if pagedlen>0
			 * because then the equation may reduce to -fraggap.
			 */
			if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk, alloclen,
							  (flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
					 dst_exthdrlen);

			/*
			 * Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
					    from, data + transhdrlen, offset,
					    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			} else if (flags & MSG_SPLICE_PAGES) {
				copy = 0;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features & NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
					    from, skb_put(skb, copy),
					    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (flags & MSG_SPLICE_PAGES) {
			struct msghdr *msg = from;

			err = -EIO;
			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
				goto error;

			err = skb_splice_from_iter(skb, &msg->msg_iter, copy);
			if (err < 0)
				goto error;
			copy = err;
			wmem_alloc_delta += copy;
		} else if (!zc) {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			skb_zcopy_downgrade_managed(skb);
			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
					    from,
					    page_address(pfrag->page) + pfrag->offset,
					    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	net_zcopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	if (hold_tskey)
		atomic_dec(&sk->sk_tskey);
	return err;
}

int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, size_t length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags & MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		dst_hold(&rt->dst);
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt);
		if (err)
			return err;

		inet->cork.fl.u.ip6 = *fl6;
		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
{
	struct dst_entry *dst = cork->base.dst;

	cork->base.dst = NULL;
	skb_dst_set(skb, dst);
}

static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		struct ipv6_txoptions *opt = v6_cork->opt;

		kfree(opt->dst0opt);
		kfree(opt->dst1opt);
		kfree(opt->hopopt);
		kfree(opt->srcrt);
		kfree(opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
	}
}

struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr *final_dst;
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = dst_rt6_info(cork->base.dst);
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);
	__skb_pull(skb, skb_network_header_len(skb));

	final_dst = &fl6->daddr;
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, sk), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = cork->base.priority;
	skb->mark = cork->base.mark;
	if (sk_is_tcp(sk))
		skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC);
	else
		skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid);

	ip6_cork_steal_dst(skb, cork);
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
		u8 icmp6_type;

		if (sk->sk_socket->type == SOCK_RAW &&
		    !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH))
			icmp6_type = fl6->fl6_icmp_type;
		else
			icmp6_type = icmp6_hdr(skb)->icmp6_type;
		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
	int err;

	rcu_read_lock();
	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	rcu_read_unlock();
	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
			     unsigned int flags, struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE) {
		dst_release(&rt->dst);
		return NULL;
	}

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}

	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}