// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/gso.h>
#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>

static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst_dev_rcu(dst);
	struct inet6_dev *idev = ip6_dst_idev(dst);
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	const struct in6_addr *daddr, *nexthop;
	struct ipv6hdr *hdr;
	struct neighbour *neigh;
	int ret;

	/* Be paranoid, rather than too clever. */
	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
		/* idev stays alive because we hold rcu_read_lock(). */
		skb = skb_expand_head(skb, hh_len);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			return -ENOMEM;
		}
	}

	hdr = ipv6_hdr(skb);
	daddr = &hdr->daddr;
	if (ipv6_addr_is_multicast(daddr)) {
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
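			   The cloned copy is looped back to the local
			   stack through the POST_ROUTING hook and
			   dev_loopback_xmit() below.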
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (hdr->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res != LWTUNNEL_XMIT_CONTINUE)
			return res;
	}

	IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);

	nexthop = rt6_nexthop(dst_rt6_info(dst), daddr);
	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);

	if (IS_ERR_OR_NULL(neigh)) {
		if (unlikely(!neigh))
			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
		if (IS_ERR(neigh)) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
			return -EINVAL;
		}
	}
	sock_confirm_neigh(skb, neigh);
	ret = neigh_output(neigh, skb, false);
	return ret;
}

static int
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
				    struct sk_buff *skb, unsigned int mtu)
{
	struct sk_buff *segs, *nskb;
	netdev_features_t features;
	int ret = 0;

	/* Please see corresponding comment in ip_finish_output_gso
	 * describing the cases where GSO segment length exceeds the
	 * egress MTU.
	 */
	features = netif_skb_features(skb);
	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
	if (IS_ERR_OR_NULL(segs)) {
		kfree_skb(skb);
		return -ENOMEM;
	}

	consume_skb(skb);

	skb_list_walk_safe(segs, segs, nskb) {
		int err;

		skb_mark_not_on_list(segs);
		/* Last GSO segment can be smaller than gso_size (and MTU).
		 * Adding a fragment header would produce an "atomic fragment",
		 * which is considered harmful (RFC-8021). Avoid that.
		 */
		err = segs->len > mtu ?
			ip6_fragment(net, sk, segs, ip6_finish_output2) :
			ip6_finish_output2(net, sk, segs);
		if (err && ret == 0)
			ret = err;
	}

	return ret;
}

static int ip6_finish_output_gso(struct net *net, struct sock *sk,
				 struct sk_buff *skb, unsigned int mtu)
{
	if (!(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
	    !skb_gso_validate_network_len(skb, mtu))
		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);

	return ip6_finish_output2(net, sk, skb);
}

static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	unsigned int mtu;

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IP6CB(skb)->flags |= IP6SKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	mtu = ip6_skb_dst_mtu(skb);
	if (skb_is_gso(skb))
		return ip6_finish_output_gso(net, sk, skb, mtu);

	if (skb->len > mtu ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);

	return ip6_finish_output2(net, sk, skb);
}

static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	switch (ret) {
	case NET_XMIT_SUCCESS:
	case NET_XMIT_CN:
		return __ip6_finish_output(net, sk, skb) ? : ret;
	default:
		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
		return ret;
	}
}

int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev, *indev = skb->dev;
	struct inet6_dev *idev;
	int ret;

	skb->protocol = htons(ETH_P_IPV6);
	rcu_read_lock();
	dev = dst_dev_rcu(dst);
	idev = ip6_dst_idev(dst);
	skb->dev = dev;

	if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		rcu_read_unlock();
		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
		return 0;
	}

	ret = NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			   net, sk, skb, indev, dev,
			   ip6_finish_output,
			   !(IP6CB(skb)->flags & IP6SKB_REROUTED));
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(ip6_output);

bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
{
	if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk))
		return ip6_default_np_autolabel(net);
	return inet6_test_bit(AUTOFLOWLABEL, sk);
}

/*
 * xmit an sk_buff (used by TCP and SCTP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
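 * The packet is handed to the NF_INET_LOCAL_OUT hook and then dst_output();
 * packets larger than the route MTU are rejected with -EMSGSIZE unless
 * ignore_df or GSO applies.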
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct inet6_dev *idev = ip6_dst_idev(dst);
	struct hop_jumbo_hdr *hop_jumbo;
	int hoplen = sizeof(*hop_jumbo);
	struct net *net = sock_net(sk);
	unsigned int head_room;
	struct net_device *dev;
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int ret, hlimit = -1;
	u32 mtu;

	rcu_read_lock();

	dev = dst_dev_rcu(dst);
	head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(head_room > skb_headroom(skb))) {
		/* idev stays alive while we hold rcu_read_lock(). */
		skb = skb_expand_head(skb, head_room);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			ret = -ENOBUFS;
			goto unlock;
		}
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	if (unlikely(seg_len > IPV6_MAXPLEN)) {
		hop_jumbo = skb_push(skb, hoplen);

		hop_jumbo->nexthdr = proto;
		hop_jumbo->hdrlen = 0;
		hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
		hop_jumbo->tlv_len = 4;
		hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);

		proto = IPPROTO_HOPOPTS;
		seg_len = 0;
		IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = READ_ONCE(np->hop_limit);
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, sk), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb)) {
			ret = 0;
			goto unlock;
		}

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		ret = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			      net, (struct sock *)sk, skb, NULL, dev,
			      dst_output);
		goto unlock;
	}

	ret = -EMSGSIZE;
	skb->dev = dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
unlock:
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(ip6_xmit);

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {

			if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
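	 * dst_link_failure() below generates the error notification
	 * towards the source.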
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
#ifdef CONFIG_NET_SWITCHDEV
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	skb_clear_tstamp(skb);
	return dst_output(net, sk, skb);
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}

int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst_dev(dst));
	struct net_device *dev;
	struct inet6_dev *idev;
	SKB_DR(reason);
	u32 mtu;

	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	if (!READ_ONCE(net->ipv6.devconf_all->forwarding) &&
	    (!idev || !READ_ONCE(idev->cnf.force_forwarding)))
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!READ_ONCE(net->ipv6.devconf_all->disable_policy) &&
	    (!idev || !READ_ONCE(idev->cnf.disable_policy)) &&
	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0) {
			/* It's tempting to decrease the hop limit
			 * here by 1, as we do at the end of the
			 * function too.
			 *
			 * But that would be incorrect, as proxying is
			 * not forwarding.  The ip6_input function
			 * will handle this packet locally, and it
			 * depends on the hop limit being unchanged.
			 *
			 * One example is the NDP hop limit, that
			 * always has to stay 255, but other would be
			 * similar checks around RA packets, where the
			 * user can even change the desired limit.
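			 * Hence the packet is handed back to ip6_input()
			 * below with its hop limit untouched.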
			 */
			return ip6_input(skb);
		} else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		SKB_DR_SET(reason, XFRM_POLICY);
		goto drop;
	}
	dst = skb_dst(skb);
	dev = dst_dev(dst);
	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = dst_rt6_info(dst);
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		rcu_read_lock();
		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		rcu_read_unlock();
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);

	mtu = ip6_dst_mtu_maybe_forward(dst, true);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
	SKB_DR_SET(reason, IP_INADDRERRORS);
drop:
	kfree_skb_reason(skb, reason);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_ext_copy(to, from);
	skb_copy_secmark(to, from);
}

int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
		      u8 nexthdr, __be32 frag_id,
		      struct ip6_fraglist_iter *iter)
{
	unsigned int first_len;
	struct frag_hdr *fh;

	/* BUILD HEADER */
	*prevhdr = NEXTHDR_FRAGMENT;
	iter->tmp_hdr =
		kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
	if (!iter->tmp_hdr)
		return -ENOMEM;

	iter->frag = skb_shinfo(skb)->frag_list;
	skb_frag_list_init(skb);

	iter->offset = 0;
	iter->hlen = hlen;
	iter->frag_id = frag_id;
	iter->nexthdr = nexthdr;

	__skb_pull(skb, hlen);
	fh = __skb_push(skb, sizeof(struct frag_hdr));
	__skb_push(skb, hlen);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

	fh->nexthdr = nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(IP6_MF);
	fh->identification = frag_id;

	first_len = skb_pagelen(skb);
	skb->data_len = first_len - skb_headlen(skb);
	skb->len = first_len;
	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

	return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);

void ip6_fraglist_prepare(struct sk_buff *skb,
			  struct ip6_fraglist_iter *iter)
{
	struct sk_buff *frag = iter->frag;
	unsigned int hlen = iter->hlen;
	struct frag_hdr *fh;

	frag->ip_summed = CHECKSUM_NONE;
	skb_reset_transport_header(frag);
	fh = __skb_push(frag, sizeof(struct frag_hdr));
	__skb_push(frag, hlen);
	skb_reset_network_header(frag);
	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
	fh->nexthdr = iter->nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(iter->offset);
	if (frag->next)
		fh->frag_off |= htons(IP6_MF);
	fh->identification = iter->frag_id;
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
	ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);

void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
{
	state->prevhdr = prevhdr;
	state->nexthdr = nexthdr;
	state->frag_id = frag_id;

	state->hlen = hlen;
	state->mtu = mtu;

	state->left = skb->len - hlen;	/* Space per frame */
	state->ptr = hlen;		/* Where to start from */

	state->hroom = hdr_room;
	state->troom = needed_tailroom;

	state->offset = 0;
}
EXPORT_SYMBOL(ip6_frag_init);

struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
	struct sk_buff *frag;
	struct frag_hdr *fh;
	unsigned int len;

	len = state->left;
	/* IF: it doesn't fit, use 'mtu' - the data space left */
	if (len > state->mtu)
		len = state->mtu;
	/* IF: we are not sending up to and including the packet end
	   then align the next start on an eight byte boundary */
	if (len < state->left)
		len &= ~7;

	/* Allocate buffer */
	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
			 state->hroom + state->troom, GFP_ATOMIC);
	if (!frag)
		return ERR_PTR(-ENOMEM);

	/*
	 *	Set up data on packet
	 */

	ip6_copy_metadata(frag, skb);
	skb_reserve(frag, state->hroom);
	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
	skb_reset_network_header(frag);
	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
	frag->transport_header = (frag->network_header + state->hlen +
				  sizeof(struct frag_hdr));

	/*
	 *	Charge the memory for the fragment to any owner
	 *	it might possess
	 */
	if (skb->sk)
		skb_set_owner_w(frag, skb->sk);

	/*
	 *	Copy the packet header into the new buffer.
	 */
	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

	fragnexthdr_offset = skb_network_header(frag);
	fragnexthdr_offset += prevhdr - skb_network_header(skb);
	*fragnexthdr_offset = NEXTHDR_FRAGMENT;

	/*
	 *	Build fragment header.
	 */
	fh->nexthdr = state->nexthdr;
	fh->reserved = 0;
	fh->identification = state->frag_id;

	/*
	 *	Copy a block of the IP datagram.
	 */
	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
			     len));
	state->left -= len;

	fh->frag_off = htons(state->offset);
	if (state->left > 0)
		fh->frag_off |= htons(IP6_MF);
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

	state->ptr += len;
	state->offset += len;

	return frag;
}
EXPORT_SYMBOL(ip6_frag_next);

int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	u8 tstamp_type = skb->tstamp_type;
	struct ip6_frag_state state;
	unsigned int mtu, hlen, nexthdr_offset;
	ktime_t tstamp = skb->tstamp;
	int hroom, err = 0;
	__be32 frag_id;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np) {
		u32 frag_size = READ_ONCE(np->frag_size);

		if (frag_size && frag_size < mtu)
			mtu = frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct ip6_fraglist_iter iter;
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb?
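			 * A shared fragment cannot be modified in place,
			 * so take the slow path instead.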
			 */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
					&iter);
		if (err < 0)
			goto fail;

		/* We prevent @rt from being freed. */
		rcu_read_lock();

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (iter.frag)
				ip6_fraglist_prepare(skb, &iter);

			skb_set_delivery_time(skb, tstamp, tstamp_type);
			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !iter.frag)
				break;

			skb = ip6_fraglist_next(&iter);
		}

		kfree(iter.tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			rcu_read_unlock();
			return 0;
		}

		kfree_skb_list(iter.frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		rcu_read_unlock();
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	/*
	 *	Fragment the datagram.
	 */

	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
		      &state);

	/*
	 *	Keep copying data until we run out.
	 */

	while (state.left > 0) {
		frag = ip6_frag_next(skb, &state);
		if (IS_ERR(frag)) {
			err = PTR_ERR(frag);
			goto fail;
		}

		/*
		 *	Put this fragment into the sending queue.
		 */
		skb_set_delivery_time(frag, tstamp, tstamp_type);
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = dst_rt6_info(dst);
	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr,
			 np->daddr_cache ? &sk->sk_v6_daddr : NULL) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr,
			 np->saddr_cache ? &np->saddr : NULL) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr)) {
		struct fib6_info *from;
		struct rt6_info *rt;

		*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : dst_rt6_info(*dst);

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
					  fl6->flowi6_l3mdev,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if ((*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = dst_rt6_info(*dst);
	rcu_read_lock();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ?
	      -EINVAL : 0;
	rcu_read_unlock();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: Network namespace to perform lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@net: Network namespace to perform lookup in
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@connected: whether @sk is connected or not
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	In addition, for a connected socket, cache the dst in the socket
 *	if the current cache is not valid.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
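 *	Callers should check the result with IS_ERR() before dereferencing it.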
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu, frag_size;
	struct ipv6_txoptions *nopt, *opt = ipc6->opt;

	/* callers pass dst together with a reference, set it first so
	 * ip6_cork_release() can put it down even in case of an error.
	 */
	cork->base.dst = &rt->dst;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!nopt))
			return -ENOBUFS;

		nopt->tot_len = sizeof(*opt);
		nopt->opt_flen = opt->opt_flen;
		nopt->opt_nflen = opt->opt_nflen;

		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
		if (opt->dst0opt && !nopt->dst0opt)
			return -ENOBUFS;

		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
		if (opt->dst1opt && !nopt->dst1opt)
			return -ENOBUFS;

		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
		if (opt->hopopt && !nopt->hopopt)
			return -ENOBUFS;

		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
		if (opt->srcrt && !nopt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa*/
	}
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	v6_cork->dontfrag = ipc6->dontfrag;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));

	frag_size = READ_ONCE(np->frag_size);
	if (frag_size && frag_size < mtu)
		mtu = frag_size;

	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	cork->base.mark = ipc6->sockc.mark;
	cork->base.priority = ipc6->sockc.priority;
	sock_tx_timestamp(sk, &ipc6->sockc, &cork->base.tx_flags);
	if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) {
		cork->base.flags |= IPCORK_TS_OPT_ID;
		cork->base.ts_opt_id = ipc6->sockc.ts_opt_id;
	}
	cork->base.length = 0;
	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}

static int __ip6_append_data(struct sock *sk,
			     struct sk_buff_head *queue,
			     struct inet_cork_full *cork_full,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     unsigned int flags)
{
	struct sk_buff *skb, *skb_prev = NULL;
	struct inet_cork *cork = &cork_full->base;
	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	bool zc = false;
	u32 tskey = 0;
	struct rt6_info *rt = dst_rt6_info(cork->dst);
	bool paged, hold_tskey = false, extra_uref = false;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ?
		      opt->opt_flen + opt->opt_nflen : 0) +
		     rt->rt6i_nfheader_len;

	if (mtu <= fragheaderlen ||
	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
		goto emsgsize;

	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && v6_cork->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_ICMPV6 ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				  sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if ((flags & MSG_ZEROCOPY) && length) {
		struct msghdr *msg = from;

		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
				return -EINVAL;

			/* Leave uarg NULL if can't zerocopy, callers should
			 * be able to handle it.
			 */
			if ((rt->dst.dev->features & NETIF_F_SG) &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
				uarg = msg->msg_ubuf;
			}
		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb),
						    false);
			if (!uarg)
				return -ENOBUFS;
			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
			if (rt->dst.dev->features & NETIF_F_SG &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
			} else {
				uarg_to_msgzc(uarg)->zerocopy = 0;
				skb_zcopy_set(skb, uarg, &extra_uref);
			}
		}
	} else if ((flags & MSG_SPLICE_PAGES) && length) {
		if (inet_test_bit(HDRINCL, sk))
			return -EPERM;
		if (rt->dst.dev->features & NETIF_F_SG &&
		    getfrag == ip_generic_getfrag)
			/* We need an empty buffer to attach stuff to */
			paged = true;
		else
			flags &= ~MSG_SPLICE_PAGES;
	}

	if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
	    READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
		if (cork->flags & IPCORK_TS_OPT_ID) {
			tskey = cork->ts_opt_id;
		} else {
			tskey = atomic_inc_return(&sk->sk_tskey) - 1;
			hold_tskey = true;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen, alloc_extra;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			alloc_extra = hh_len;
			alloc_extra += dst_exthdrlen;
			alloc_extra += rt->dst.trailer_len;

			/* We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloc_extra += sizeof(struct frag_hdr);

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged &&
				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
				  !(rt->dst.dev->features & NETIF_F_SG)))
				alloclen = fraglen;
			else {
				alloclen = fragheaderlen + transhdrlen;
				pagedlen = datalen - transhdrlen;
			}
			alloclen += alloc_extra;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			fraglen = datalen + fragheaderlen;

			copy = datalen - transhdrlen - fraggap - pagedlen;
			/* [!] NOTE: copy may be negative if pagedlen>0
			 * because then the equation may reduce to -fraggap.
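			 * With MSG_SPLICE_PAGES a negative copy is tolerated
			 * here and reset to zero further down.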
			 */
			if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk, alloclen,
							  (flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
					    from, data + transhdrlen, offset,
					    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			} else if (flags & MSG_SPLICE_PAGES) {
				copy = 0;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
					    from, skb_put(skb, copy),
					    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (flags & MSG_SPLICE_PAGES) {
			struct msghdr *msg = from;

			err = -EIO;
			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
				goto error;

			err = skb_splice_from_iter(skb, &msg->msg_iter, copy);
			if (err < 0)
				goto error;
			copy = err;
			wmem_alloc_delta += copy;
		} else if (!zc) {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			skb_zcopy_downgrade_managed(skb);
			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
					    from,
					    page_address(pfrag->page) +
					    pfrag->offset,
					    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	net_zcopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	if (hold_tskey)
		atomic_dec(&sk->sk_tskey);
	return err;
}

int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, size_t length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		dst_hold(&rt->dst);
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt);
		if (err)
			return err;

		inet->cork.fl.u.ip6 = *fl6;
		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
{
	struct dst_entry *dst = cork->base.dst;

	cork->base.dst = NULL;
	skb_dst_set(skb, dst);
}

static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		struct ipv6_txoptions *opt = v6_cork->opt;

		kfree(opt->dst0opt);
		kfree(opt->dst1opt);
		kfree(opt->hopopt);
		kfree(opt->srcrt);
		kfree(opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
	}
}

struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr *final_dst;
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = dst_rt6_info(cork->base.dst);
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);
	__skb_pull(skb, skb_network_header_len(skb));

	final_dst = &fl6->daddr;
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, sk), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = cork->base.priority;
	skb->mark = cork->base.mark;
	if (sk_is_tcp(sk))
		skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC);
	else
		skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid);

	ip6_cork_steal_dst(skb, cork);
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
		u8 icmp6_type;

		if (sk->sk_socket->type == SOCK_RAW &&
		    !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH))
			icmp6_type = fl6->fl6_icmp_type;
		else
			icmp6_type = icmp6_hdr(skb)->icmp6_type;
		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
	int err;

	rcu_read_lock();
	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	rcu_read_unlock();
	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
			     unsigned int flags, struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head
			    queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE) {
		dst_release(&rt->dst);
		return NULL;
	}

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}

	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}