// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/gso.h>
#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>

/* Final transmit step: resolve (or create) the neighbour entry for the
 * route's nexthop and hand the skb to neigh_output().  Multicast packets
 * may additionally be looped back to the local host, and lwtunnel
 * redirection is honoured before neighbour resolution.
 */
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst_dev_rcu(dst);
	struct inet6_dev *idev = ip6_dst_idev(dst);
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	const struct in6_addr *daddr, *nexthop;
	struct ipv6hdr *hdr;
	struct neighbour *neigh;
	int ret;

	/* Be paranoid, rather than too clever. */
	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
		/* idev stays alive because we hold rcu_read_lock(). */
		skb = skb_expand_head(skb, hh_len);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			return -ENOMEM;
		}
	}

	hdr = ipv6_hdr(skb);
	daddr = &hdr->daddr;
	if (unlikely(ipv6_addr_is_multicast(daddr))) {
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			 * is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (hdr->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
		/* Node-local scope multicast must never leave the host. */
		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res != LWTUNNEL_XMIT_CONTINUE)
			return res;
	}

	IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);

	nexthop = rt6_nexthop(dst_rt6_info(dst), daddr);
	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);

	if (IS_ERR_OR_NULL(neigh)) {
		if (unlikely(!neigh))
			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
		if (IS_ERR(neigh)) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
			return -EINVAL;
		}
	}
	sock_confirm_neigh(skb, neigh);
	ret = neigh_output(neigh, skb, false);
	return ret;
}

/* Software-segment an oversized GSO skb and transmit each segment,
 * fragmenting any segment that still exceeds @mtu.  Returns the first
 * error encountered (0 on full success).
 */
static int
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
				    struct sk_buff *skb, unsigned int mtu)
{
	struct sk_buff *segs, *nskb;
	netdev_features_t features;
	int ret = 0;

	/* Please see corresponding comment in ip_finish_output_gso
	 * describing the cases where GSO segment length exceeds the
	 * egress MTU.
	 */
	features = netif_skb_features(skb);
	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
	if (IS_ERR_OR_NULL(segs)) {
		kfree_skb(skb);
		return -ENOMEM;
	}

	consume_skb(skb);

	skb_list_walk_safe(segs, segs, nskb) {
		int err;

		skb_mark_not_on_list(segs);
		/* Last GSO segment can be smaller than gso_size (and MTU).
		 * Adding a fragment header would produce an "atomic fragment",
		 * which is considered harmful (RFC-8021). Avoid that.
		 */
		err = segs->len > mtu ?
			ip6_fragment(net, sk, segs, ip6_finish_output2) :
			ip6_finish_output2(net, sk, segs);
		if (err && ret == 0)
			ret = err;
	}

	return ret;
}

/* Transmit a GSO skb: fall back to software segmentation when the
 * segments would not fit the egress MTU (fake jumbograms are exempt).
 */
static int ip6_finish_output_gso(struct net *net, struct sock *sk,
				 struct sk_buff *skb, unsigned int mtu)
{
	if (unlikely(!(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
		     !skb_gso_validate_network_len(skb, mtu)))
		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);

	return ip6_finish_output2(net, sk, skb);
}

/* Post-routing output: re-run dst_output() for XFRM-transformed skbs,
 * otherwise transmit, fragmenting when the packet exceeds the path MTU.
 */
static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	unsigned int mtu;

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IP6CB(skb)->flags |= IP6SKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	mtu = ip6_skb_dst_mtu(skb);
	if (skb_is_gso(skb))
		return ip6_finish_output_gso(net, sk, skb, mtu);

	if (unlikely(skb->len > mtu ||
		     (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);

	return ip6_finish_output2(net, sk, skb);
}

/* Run the cgroup BPF egress program, then the real output path. */
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	switch (ret) {
	case NET_XMIT_SUCCESS:
	case NET_XMIT_CN:
		return __ip6_finish_output(net, sk, skb) ? : ret;
	default:
		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
		return ret;
	}
}

/* Generic IPv6 output entry point: drops the packet if IPv6 is disabled
 * on the egress device, otherwise passes it through the POST_ROUTING
 * netfilter hook (skipped for IP6SKB_REROUTED skbs).
 */
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev, *indev = skb->dev;
	struct inet6_dev *idev;
	int ret;

	skb->protocol = htons(ETH_P_IPV6);
	rcu_read_lock();
	dev = dst_dev_rcu(dst);
	idev = ip6_dst_idev(dst);
	skb->dev = dev;

	if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		rcu_read_unlock();
		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
		return 0;
	}

	ret = NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			   net, sk, skb, indev, dev,
			   ip6_finish_output,
			   !(IP6CB(skb)->flags & IP6SKB_REROUTED));
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(ip6_output);

/* Whether flow labels should be auto-generated for this socket; falls
 * back to the per-netns default when the socket never set the option.
 */
bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
{
	if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk))
		return ip6_default_np_autolabel(net);
	return inet6_test_bit(AUTOFLOWLABEL, sk);
}

/*
 * xmit an sk_buff (used by TCP and SCTP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct inet6_dev *idev = ip6_dst_idev(dst);
	struct hop_jumbo_hdr *hop_jumbo;
	int hoplen = sizeof(*hop_jumbo);
	struct net *net = sock_net(sk);
	unsigned int head_room;
	struct net_device *dev;
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int ret, hlimit = -1;
	u32 mtu;

	rcu_read_lock();

	dev = dst_dev_rcu(dst);
	/* Reserve room for the IPv6 header, a possible jumbogram
	 * hop-by-hop header, extension headers and link-layer header.
	 */
	head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(head_room > skb_headroom(skb))) {
		/* idev stays alive while we hold rcu_read_lock(). */
		skb = skb_expand_head(skb, head_room);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			ret = -ENOBUFS;
			goto unlock;
		}
	}

	if (unlikely(opt)) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			proto = ipv6_push_frag_opts(skb, opt, proto);

		if (opt->opt_nflen)
			proto = ipv6_push_nfrag_opts(skb, opt, proto,
						     &first_hop,
						     &fl6->saddr);
	}

	/* Payload too large for the 16-bit length field: emit a
	 * hop-by-hop jumbo TLV and mark the skb as a fake jumbogram.
	 */
	if (unlikely(seg_len > IPV6_MAXPLEN)) {
		hop_jumbo = __skb_push(skb, hoplen);

		hop_jumbo->nexthdr = proto;
		hop_jumbo->hdrlen = 0;
		hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
		hop_jumbo->tlv_len = 4;
		hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);

		proto = IPPROTO_HOPOPTS;
		seg_len = 0;
		IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
	}

	__skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = READ_ONCE(np->hop_limit);
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
						     ip6_autoflowlabel(net, sk), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = priority;
	skb->mark = mark;

	mtu = dst6_mtu(dst);
	if (likely((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb))) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb)) {
			ret = 0;
			goto unlock;
		}

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		ret = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			      net, (struct sock *)sk, skb, NULL, dev,
			      dst_output);
		goto unlock;
	}

	ret = -EMSGSIZE;
	skb->dev = dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
	kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
unlock:
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(ip6_xmit);

/* Deliver a Router Alert packet to every raw socket registered for
 * alert value @sel.  Returns 1 if the skb was consumed by at least one
 * socket, 0 otherwise (caller keeps ownership).
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {

			if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
			/* Clone for every matching socket except the last,
			 * which receives the original skb below.
			 */
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

/* Decide what to do with a packet destined to a proxied address:
 * returns 1 to deliver locally (unicast NDP), 0 to forward,
 * -1 to drop (link-local destination).
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

/* Last step of forwarding: clear the timestamp and hand the skb to the
 * output path (consumed early if already L3-forwarded in hardware).
 */
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
#ifdef CONFIG_NET_SWITCHDEV
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	skb_clear_tstamp(skb);
	return dst_output(net, sk, skb);
}

/* True when the skb cannot be forwarded as-is at @mtu and must trigger
 * a Packet Too Big error.
 */
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}

/* Forward a received IPv6 packet: policy checks, Router Alert handling,
 * hop-limit decrement, NDP proxying, redirect generation and MTU check,
 * ending in the NF_INET_FORWARD netfilter hook.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst_dev(dst));
	struct net_device *dev;
	struct inet6_dev *idev;
	SKB_DR(reason);
	u32 mtu;

	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	if (!READ_ONCE(net->ipv6.devconf_all->forwarding) &&
	    (!idev || !READ_ONCE(idev->cnf.force_forwarding)))
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!READ_ONCE(net->ipv6.devconf_all->disable_policy) &&
	    (!idev || !READ_ONCE(idev->cnf.disable_policy)) &&
	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0) {
			/* It's tempting to decrease the hop limit
			 * here by 1, as we do at the end of the
			 * function too.
			 *
			 * But that would be incorrect, as proxying is
			 * not forwarding.  The ip6_input function
			 * will handle this packet locally, and it
			 * depends on the hop limit being unchanged.
			 *
			 * One example is the NDP hop limit, that
			 * always has to stay 255, but other would be
			 * similar checks around RA packets, where the
			 * user can even change the desired limit.
			 */
			return ip6_input(skb);
		} else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		SKB_DR_SET(reason, XFRM_POLICY);
		goto drop;
	}
	dst = skb_dst(skb);
	dev = dst_dev(dst);
	/* IPv6 specs say nothing about it, but it is clear that we cannot
	 * send redirects to source routed frames.
	 * We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = dst_rt6_info(dst);
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		rcu_read_lock();
		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr);

		/* Limit redirects both by destination (here)
		 * and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		rcu_read_unlock();
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);

	mtu = ip6_dst_mtu_maybe_forward(dst, true);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (unlikely(ip6_pkt_too_big(skb, mtu))) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
	SKB_DR_SET(reason, IP_INADDRERRORS);
drop:
	kfree_skb_reason(skb, reason);
	return -EINVAL;
}

/* Copy the per-packet metadata a fragment must inherit from the
 * original skb (dst, device, marks, conntrack/security state).
 */
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_ext_copy(to, from);
	skb_copy_secmark(to, from);
}

/* Set up the frag-list fast path: detach the frag list, insert the
 * fragment header into the first skb and initialise @iter for the
 * per-fragment walk done by ip6_fraglist_prepare().
 */
int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
		      u8 nexthdr, __be32 frag_id,
		      struct ip6_fraglist_iter *iter)
{
	unsigned int first_len;
	struct frag_hdr *fh;

	/* BUILD HEADER */
	*prevhdr = NEXTHDR_FRAGMENT;
	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
	if (!iter->tmp_hdr)
		return -ENOMEM;

	iter->frag = skb_shinfo(skb)->frag_list;
	skb_frag_list_init(skb);

	iter->offset = 0;
	iter->hlen = hlen;
	iter->frag_id = frag_id;
	iter->nexthdr = nexthdr;

	__skb_pull(skb, hlen);
	fh = __skb_push(skb, sizeof(struct frag_hdr));
	__skb_push(skb, hlen);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

	fh->nexthdr = nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(IP6_MF);
	fh->identification = frag_id;

	first_len = skb_pagelen(skb);
	skb->data_len = first_len - skb_headlen(skb);
	skb->len = first_len;
	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

	return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);

/* Turn the next frag-list member into a standalone fragment: prepend
 * the saved headers plus a fragment header and advance iter->offset.
 */
void ip6_fraglist_prepare(struct sk_buff *skb,
			  struct ip6_fraglist_iter *iter)
{
	struct sk_buff *frag = iter->frag;
	unsigned int hlen = iter->hlen;
	struct frag_hdr *fh;

	frag->ip_summed = CHECKSUM_NONE;
	skb_reset_transport_header(frag);
	fh = __skb_push(frag, sizeof(struct frag_hdr));
	__skb_push(frag, hlen);
	skb_reset_network_header(frag);
	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
	fh->nexthdr = iter->nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(iter->offset);
	if (frag->next)
		fh->frag_off |= htons(IP6_MF);
	fh->identification = iter->frag_id;
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
	ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);

/* Initialise the slow-path fragmentation state for @skb. */
void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
{
	state->prevhdr = prevhdr;
	state->nexthdr = nexthdr;
	state->frag_id = frag_id;

	state->hlen = hlen;
	state->mtu = mtu;

	state->left = skb->len - hlen;	/* Space per frame */
	state->ptr = hlen;		/* Where to start from */

	state->hroom = hdr_room;
	state->troom = needed_tailroom;

	state->offset = 0;
}
EXPORT_SYMBOL(ip6_frag_init);

/* Allocate and build the next slow-path fragment of @skb, copying the
 * headers and the next chunk of payload.  Returns the new skb or an
 * ERR_PTR on allocation failure.
 */
struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
	struct sk_buff *frag;
	struct frag_hdr *fh;
	unsigned int len;

	len = state->left;
	/* IF: it doesn't fit, use 'mtu' - the data space left */
	if (len > state->mtu)
		len = state->mtu;
	/* IF: we are not sending up to and including the packet end
	   then align the next start on an eight byte boundary */
	if (len < state->left)
		len &= ~7;

	/* Allocate buffer */
	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
			 state->hroom + state->troom, GFP_ATOMIC);
	if (!frag)
		return ERR_PTR(-ENOMEM);

	/*
	 *	Set up data on packet
	 */

	ip6_copy_metadata(frag, skb);
	skb_reserve(frag, state->hroom);
	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
	skb_reset_network_header(frag);
	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
	frag->transport_header = (frag->network_header + state->hlen +
				  sizeof(struct frag_hdr));

	/*
	 *	Charge the memory for the fragment to any owner
	 *	it might possess
	 */
	if (skb->sk)
		skb_set_owner_w(frag, skb->sk);

	/*
	 *	Copy the packet header into the new buffer.
	 */
	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

	fragnexthdr_offset = skb_network_header(frag);
	fragnexthdr_offset += prevhdr - skb_network_header(skb);
	*fragnexthdr_offset = NEXTHDR_FRAGMENT;

	/*
	 *	Build fragment header.
	 */
	fh->nexthdr = state->nexthdr;
	fh->reserved = 0;
	fh->identification = state->frag_id;

	/*
	 *	Copy a block of the IP datagram.
	 */
	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
			     len));
	state->left -= len;

	fh->frag_off = htons(state->offset);
	if (state->left > 0)
		fh->frag_off |= htons(IP6_MF);
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

	state->ptr += len;
	state->offset += len;

	return frag;
}
EXPORT_SYMBOL(ip6_frag_next);

/* Fragment @skb to fit the path MTU and emit each fragment via
 * @output.  Uses the frag-list fast path when the skb geometry allows,
 * falling back to the allocate-and-copy slow path otherwise.
 */
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	u8 tstamp_type = skb->tstamp_type;
	struct ip6_frag_state state;
	unsigned int mtu, hlen, nexthdr_offset;
	ktime_t tstamp = skb->tstamp;
	int hroom, err = 0;
	__be32 frag_id;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb it not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np) {
		u32 frag_size = READ_ONCE(np->frag_size);

		if (frag_size && frag_size < mtu)
			mtu = frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	/* skb_checksum_help() may have reallocated; recompute prevhdr. */
	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct ip6_fraglist_iter iter;
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
					&iter);
		if (err < 0)
			goto fail;

		/* We prevent @rt from being freed. */
		rcu_read_lock();

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (iter.frag)
				ip6_fraglist_prepare(skb, &iter);

			skb_set_delivery_time(skb, tstamp, tstamp_type);
			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !iter.frag)
				break;

			skb = ip6_fraglist_next(&iter);
		}

		kfree(iter.tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			rcu_read_unlock();
			return 0;
		}

		kfree_skb_list(iter.frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		rcu_read_unlock();
		return err;

slow_path_clean:
		/* Undo the ownership transfer done above for the frags
		 * already walked before falling back to the slow path.
		 */
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	/*
	 *	Fragment the datagram.
	 */

	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
		      &state);

	/*
	 *	Keep copying data until we run out.
	 */

	while (state.left > 0) {
		frag = ip6_frag_next(skb, &state);
		if (IS_ERR(frag)) {
			err = PTR_ERR(frag);
			goto fail;
		}

		/*
		 *	Put this fragment into the sending queue.
		 */
		skb_set_delivery_time(frag, tstamp, tstamp_type);
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

/* Returns non-zero when the cached route key does NOT match @fl_addr,
 * i.e. the cached dst cannot be reused for this flow address.
 */
static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

/* Validate a socket's cached dst against @fl6; releases and returns
 * NULL when the cached route can no longer serve the flow.
 */
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = dst_rt6_info(dst);
	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr,
			 np->daddr_cache ? &sk->sk_v6_daddr : NULL) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr,
			 np->saddr_cache ? &np->saddr : NULL) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

/* Core of the flow route lookup: resolves the output route and, when
 * the flow has no source address yet, picks one via
 * ip6_route_get_saddr().  On error *dst is released and set to NULL.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr)) {
		struct fib6_info *from;
		struct rt6_info *rt;

		*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : dst_rt6_info(*dst);

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
					  fl6->flowi6_l3mdev,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if ((*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = dst_rt6_info(*dst);
	rcu_read_lock();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	/* A v4-mapped source is only valid towards a v4-mapped (or
	 * unspecified) destination.
	 */
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net:
Network namespace to perform lookup in 1239 * @sk: socket which provides route info 1240 * @dst: pointer to dst_entry * for result 1241 * @fl6: flow to lookup 1242 * 1243 * This function performs a route lookup on the given flow. 1244 * 1245 * It returns zero on success, or a standard errno code on error. 1246 */ 1247 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst, 1248 struct flowi6 *fl6) 1249 { 1250 *dst = NULL; 1251 return ip6_dst_lookup_tail(net, sk, dst, fl6); 1252 } 1253 EXPORT_SYMBOL_GPL(ip6_dst_lookup); 1254 1255 /** 1256 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec 1257 * @net: Network namespace to perform lookup in 1258 * @sk: socket which provides route info 1259 * @fl6: flow to lookup 1260 * @final_dst: final destination address for ipsec lookup 1261 * 1262 * This function performs a route lookup on the given flow. 1263 * 1264 * It returns a valid dst pointer on success, or a pointer encoded 1265 * error code. 1266 */ 1267 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6, 1268 const struct in6_addr *final_dst) 1269 { 1270 struct dst_entry *dst = NULL; 1271 int err; 1272 1273 err = ip6_dst_lookup_tail(net, sk, &dst, fl6); 1274 if (err) 1275 return ERR_PTR(err); 1276 if (final_dst) 1277 fl6->daddr = *final_dst; 1278 1279 return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0); 1280 } 1281 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow); 1282 1283 /** 1284 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow 1285 * @sk: socket which provides the dst cache and route info 1286 * @fl6: flow to lookup 1287 * @final_dst: final destination address for ipsec lookup 1288 * @connected: whether @sk is connected or not 1289 * 1290 * This function performs a route lookup on the given flow with the 1291 * possibility of using the cached route in the socket if it is valid. 1292 * It will take the socket dst lock when operating on the dst cache. 
 * As a result, this function can only be used in process context.
 *
 * In addition, for a connected socket, cache the dst in the socket
 * if the current cache is not valid.
 *
 * It returns a valid dst pointer on success, or a pointer encoded
 * error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

/* Duplicate an extension-header option blob; its on-wire size is
 * (hdrlen + 1) * 8 bytes.  Returns NULL when @src is NULL or on
 * allocation failure (callers distinguish via @src).
 */
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

/* Same as ip6_opt_dup() but for a routing header. */
static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

/* Recompute *@mtu and *@maxfraglen for the next fragment: the first
 * fragment must additionally leave dst->header_len headroom (e.g. for
 * IPsec), later fragments use the full original MTU.  No-op for
 * DST_XFRM_TUNNEL dsts.
 */
static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

/* Initialize @cork for a new corked send: duplicate the caller's
 * tx options into the cork (so they outlive the sendmsg call),
 * snapshot MTU/gso/timestamp/mark state from @ipc6 and the route.
 * Consumes the dst reference passed in via @rt even on error.
 */
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt)
{
	struct ipv6_txoptions *nopt, *opt = ipc6->opt;
	struct inet6_cork *v6_cork = &cork->base6;
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu, frag_size;

	/* callers pass dst together with a reference, set it first so
	 * ip6_cork_release() can put it down even in case of an error.
	 */
	cork->base.dst = &rt->dst;

	/*
	 * setup for corking
	 */
	if (unlikely(opt)) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!nopt))
			return -ENOBUFS;

		nopt->tot_len = sizeof(*opt);
		nopt->opt_flen = opt->opt_flen;
		nopt->opt_nflen = opt->opt_nflen;

		/* Partially-duplicated options are freed by
		 * ip6_cork_release() on the error paths below.
		 */
		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
		if (opt->dst0opt && !nopt->dst0opt)
			return -ENOBUFS;

		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
		if (opt->dst1opt && !nopt->dst1opt)
			return -ENOBUFS;

		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
		if (opt->hopopt && !nopt->hopopt)
			return -ENOBUFS;

		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
		if (opt->srcrt && !nopt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa*/
	}
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	v6_cork->dontfrag = ipc6->dontfrag;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst6_mtu(&rt->dst);
	else
		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst6_mtu(xfrm_dst_path(&rt->dst));

	frag_size = READ_ONCE(np->frag_size);
	if (frag_size && frag_size < mtu)
		mtu = frag_size;

	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	cork->base.mark = ipc6->sockc.mark;
	cork->base.priority = ipc6->sockc.priority;
	sock_tx_timestamp(sk, &ipc6->sockc, &cork->base.tx_flags);
	if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) {
		cork->base.flags |= IPCORK_TS_OPT_ID;
		cork->base.ts_opt_id = ipc6->sockc.ts_opt_id;
	}
	cork->base.length = 0;
	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}

/* Append @length bytes obtained through @getfrag/@from onto @queue,
 * growing the tail skb or allocating new MTU-sized buffers as needed,
 * while reserving room for the link-layer header, extension headers
 * and a Fragment header.  Supports MSG_ZEROCOPY and MSG_SPLICE_PAGES.
 * Write-memory accounting is batched in wmem_alloc_delta and charged
 * to sk_wmem_alloc once, on both the success and the error path.
 */
static int __ip6_append_data(struct sock *sk,
			     struct sk_buff_head *queue,
			     struct inet_cork_full *cork_full,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     unsigned int flags)
{
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct inet6_cork *v6_cork = &cork_full->base6;
	struct inet_cork *cork = &cork_full->base;
	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
	struct sk_buff *skb, *skb_prev = NULL;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	bool zc = false;
	u32 tskey = 0;
	struct rt6_info *rt = dst_rt6_info(cork->dst);
	bool paged, hold_tskey = false, extra_uref = false;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;

	skb = skb_peek_tail(queue);
	if (!skb) {
		/* First buffer of this cork: account for the fragmentable
		 * extension headers and any dst-required (e.g. IPsec)
		 * headroom.
		 */
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     rt->rt6i_nfheader_len;

	if (mtu <= fragheaderlen ||
	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
		goto emsgsize;

	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && v6_cork->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_ICMPV6 ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				  sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if ((flags & MSG_ZEROCOPY) && length) {
		struct msghdr *msg = from;

		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
				return -EINVAL;

			/* Leave uarg NULL if can't zerocopy, callers should
			 * be able to handle it.
			 */
			if ((rt->dst.dev->features & NETIF_F_SG) &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
				uarg = msg->msg_ubuf;
			}
		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb),
						    false);
			if (!uarg)
				return -ENOBUFS;
			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
			if (rt->dst.dev->features & NETIF_F_SG &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
			} else {
				uarg_to_msgzc(uarg)->zerocopy = 0;
				skb_zcopy_set(skb, uarg, &extra_uref);
			}
		}
	} else if ((flags & MSG_SPLICE_PAGES) && length) {
		if (inet_test_bit(HDRINCL, sk))
			return -EPERM;
		if (rt->dst.dev->features & NETIF_F_SG &&
		    getfrag == ip_generic_getfrag)
			/* We need an empty buffer to attach stuff to */
			paged = true;
		else
			flags &= ~MSG_SPLICE_PAGES;
	}

	if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
	    READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
		if (cork->flags & IPCORK_TS_OPT_ID) {
			tskey = cork->ts_opt_id;
		} else {
			tskey = atomic_inc_return(&sk->sk_tskey) - 1;
			hold_tskey = true;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octects, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen, alloc_extra;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			alloc_extra = hh_len;
			alloc_extra += dst_exthdrlen;
			alloc_extra += rt->dst.trailer_len;

			/* We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloc_extra += sizeof(struct frag_hdr);

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged &&
				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
				  !(rt->dst.dev->features & NETIF_F_SG)))
				alloclen = fraglen;
			else {
				/* Paged case: only the headers go in the
				 * linear area, the rest lands in page frags.
				 */
				alloclen = fragheaderlen + transhdrlen;
				pagedlen = datalen - transhdrlen;
			}
			alloclen += alloc_extra;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			fraglen = datalen + fragheaderlen;

			copy = datalen - transhdrlen - fraggap - pagedlen;
			/* [!] NOTE: copy may be negative if pagedlen>0
			 * because then the equation may reduces to -fraggap.
			 */
			if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk, alloclen,
							  (flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 * Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/* Move the 8-byte-misaligned tail of the
				 * previous skb into this one, keeping the
				 * running checksums consistent.
				 */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
					    from, data + transhdrlen, offset,
					    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			} else if (flags & MSG_SPLICE_PAGES) {
				copy = 0;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			/* Device can't do SG: append into the linear area. */
			unsigned int off;

			off = skb->len;
			if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
					    from, skb_put(skb, copy),
					    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (flags & MSG_SPLICE_PAGES) {
			struct msghdr *msg = from;

			err = -EIO;
			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
				goto error;

			err = skb_splice_from_iter(skb, &msg->msg_iter, copy);
			if (err < 0)
				goto error;
			copy = err;
			wmem_alloc_delta += copy;
		} else if (!zc) {
			/* Copy into the socket's page frag and attach it. */
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			skb_zcopy_downgrade_managed(skb);
			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
					    from,
					    page_address(pfrag->page) + pfrag->offset,
					    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	net_zcopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	if (hold_tskey)
		atomic_dec(&sk->sk_tskey);
	return err;
}

/* Public corked-append entry point: sets up the cork on the first call
 * (empty write queue), then delegates to __ip6_append_data() on the
 * socket's write queue.  MSG_PROBE is a no-op returning 0.
 */
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, size_t length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		dst_hold(&rt->dst);
		err = ip6_setup_cork(sk, &inet->cork,
				     ipc6, rt);
		if (err)
			return err;

		inet->cork.fl.u.ip6 = *fl6;
		exthdrlen = (ipc6->opt ?
			     ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
				 sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

/* Transfer the cork's dst reference onto @skb (no refcount change). */
static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
{
	struct dst_entry *dst = cork->base.dst;

	cork->base.dst = NULL;
	skb_dst_set(skb, dst);
}

/* Free the duplicated tx options and drop the dst reference held by the
 * cork.  Safe to call after a partially failed ip6_setup_cork().
 */
static void ip6_cork_release(struct inet_cork_full *cork)
{
	struct inet6_cork *v6_cork = &cork->base6;

	if (unlikely(v6_cork->opt)) {
		struct ipv6_txoptions *opt = v6_cork->opt;

		kfree(opt->dst0opt);
		kfree(opt->dst1opt);
		kfree(opt->hopopt);
		kfree(opt->srcrt);
		kfree(opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
	}
}

/* Collapse all buffers queued under @cork into one skb (frag_list),
 * push extension headers and the IPv6 header, stamp priority/mark/
 * delivery time, move the cork's dst onto the skb and release the cork.
 * Returns the ready-to-send skb, or NULL if the queue was empty.
 */
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr *final_dst;
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt;
	struct rt6_info *rt = dst_rt6_info(cork->base.dst);
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);
	__skb_pull(skb, skb_network_header_len(skb));

	final_dst = &fl6->daddr;
	opt = cork->base6.opt;
	if (unlikely(opt)) {
		if (opt->opt_flen)
			proto = ipv6_push_frag_opts(skb, opt, proto);
		if (opt->opt_nflen)
			/* May rewrite final_dst when a routing header is
			 * present.
			 */
			proto = ipv6_push_nfrag_opts(skb, opt, proto,
						     &final_dst, &fl6->saddr);
	}
	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, cork->base6.tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, sk), fl6));
	hdr->hop_limit = cork->base6.hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = cork->base.priority;
	skb->mark = cork->base.mark;
	if (sk_is_tcp(sk))
		skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC);
	else
		skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid);

	ip6_cork_steal_dst(skb, cork);
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
	if (unlikely(proto == IPPROTO_ICMPV6)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
		u8 icmp6_type;

		if (sk->sk_socket->type == SOCK_RAW &&
		    !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH))
			icmp6_type = fl6->fl6_icmp_type;
		else
			icmp6_type = icmp6_hdr(skb)->icmp6_type;
		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork);
out:
	return skb;
}

/* Hand a fully built skb to ip6_local_out(), normalizing qdisc return
 * codes via net_xmit_errno() and counting discards on failure.
 */
int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
	int err;

	rcu_read_lock();
	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	rcu_read_unlock();
	return err;
}

/* Finalize the socket's corked queue into one skb and send it. */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

/* Drop every queued buffer (counting discards) and release the cork. */
static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

/* Uncorked fast path: build a complete datagram skb in one call using a
 * caller-provided cork and a private queue, without touching the
 * socket's write queue.  Returns the skb, NULL for MSG_PROBE, or an
 * ERR_PTR on failure.
 */
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
			     unsigned int flags, struct inet_cork_full *cork)
{
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	struct sk_buff_head queue;
	int err;

	if (flags & MSG_PROBE) {
		dst_release(&rt->dst);
		return NULL;
	}

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	cork->base6.opt = NULL;
	err = ip6_setup_cork(sk, cork, ipc6, rt);
	if (err) {
		ip6_cork_release(cork);
		return ERR_PTR(err);
	}

	err = __ip6_append_data(sk, &queue, cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork);
}