1 // SPDX-License-Identifier: GPL-2.0-or-later 2 /* 3 * IPv6 output functions 4 * Linux INET6 implementation 5 * 6 * Authors: 7 * Pedro Roque <roque@di.fc.ul.pt> 8 * 9 * Based on linux/net/ipv4/ip_output.c 10 * 11 * Changes: 12 * A.N.Kuznetsov : airthmetics in fragmentation. 13 * extension headers are implemented. 14 * route changes now work. 15 * ip6_forward does not confuse sniffers. 16 * etc. 17 * 18 * H. von Brand : Added missing #include <linux/string.h> 19 * Imran Patel : frag id should be in NBO 20 * Kazunori MIYAZAWA @USAGI 21 * : add ip6_append_data and related functions 22 * for datagram xmit 23 */ 24 25 #include <linux/errno.h> 26 #include <linux/kernel.h> 27 #include <linux/string.h> 28 #include <linux/socket.h> 29 #include <linux/net.h> 30 #include <linux/netdevice.h> 31 #include <linux/if_arp.h> 32 #include <linux/in6.h> 33 #include <linux/tcp.h> 34 #include <linux/route.h> 35 #include <linux/module.h> 36 #include <linux/slab.h> 37 38 #include <linux/bpf-cgroup.h> 39 #include <linux/netfilter.h> 40 #include <linux/netfilter_ipv6.h> 41 42 #include <net/sock.h> 43 #include <net/snmp.h> 44 45 #include <net/gso.h> 46 #include <net/ipv6.h> 47 #include <net/ndisc.h> 48 #include <net/protocol.h> 49 #include <net/ip6_route.h> 50 #include <net/addrconf.h> 51 #include <net/rawv6.h> 52 #include <net/icmp.h> 53 #include <net/xfrm.h> 54 #include <net/checksum.h> 55 #include <linux/mroute6.h> 56 #include <net/l3mdev.h> 57 #include <net/lwtunnel.h> 58 #include <net/ip_tunnels.h> 59 60 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) 61 { 62 struct dst_entry *dst = skb_dst(skb); 63 struct net_device *dev = dst_dev_rcu(dst); 64 struct inet6_dev *idev = ip6_dst_idev(dst); 65 unsigned int hh_len = LL_RESERVED_SPACE(dev); 66 const struct in6_addr *daddr, *nexthop; 67 struct ipv6hdr *hdr; 68 struct neighbour *neigh; 69 int ret; 70 71 /* Be paranoid, rather than too clever. 
*/ 72 if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) { 73 /* idev stays alive because we hold rcu_read_lock(). */ 74 skb = skb_expand_head(skb, hh_len); 75 if (!skb) { 76 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); 77 return -ENOMEM; 78 } 79 } 80 81 hdr = ipv6_hdr(skb); 82 daddr = &hdr->daddr; 83 if (unlikely(ipv6_addr_is_multicast(daddr))) { 84 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) && 85 ((mroute6_is_socket(net, skb) && 86 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) || 87 ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) { 88 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); 89 90 /* Do not check for IFF_ALLMULTI; multicast routing 91 is not supported in any case. 92 */ 93 if (newskb) 94 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, 95 net, sk, newskb, NULL, newskb->dev, 96 dev_loopback_xmit); 97 98 if (hdr->hop_limit == 0) { 99 IP6_INC_STATS(net, idev, 100 IPSTATS_MIB_OUTDISCARDS); 101 kfree_skb(skb); 102 return 0; 103 } 104 } 105 106 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len); 107 if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL && 108 !(dev->flags & IFF_LOOPBACK)) { 109 kfree_skb(skb); 110 return 0; 111 } 112 } 113 114 if (lwtunnel_xmit_redirect(dst->lwtstate)) { 115 int res = lwtunnel_xmit(skb); 116 117 if (res != LWTUNNEL_XMIT_CONTINUE) 118 return res; 119 } 120 121 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); 122 123 nexthop = rt6_nexthop(dst_rt6_info(dst), daddr); 124 neigh = __ipv6_neigh_lookup_noref(dev, nexthop); 125 126 if (IS_ERR_OR_NULL(neigh)) { 127 if (unlikely(!neigh)) 128 neigh = __neigh_create(&nd_tbl, nexthop, dev, false); 129 if (IS_ERR(neigh)) { 130 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES); 131 kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL); 132 return -EINVAL; 133 } 134 } 135 sock_confirm_neigh(skb, neigh); 136 ret = neigh_output(neigh, skb, false); 137 return ret; 138 } 139 140 static int 141 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock 
*sk, 142 struct sk_buff *skb, unsigned int mtu) 143 { 144 struct sk_buff *segs, *nskb; 145 netdev_features_t features; 146 int ret = 0; 147 148 /* Please see corresponding comment in ip_finish_output_gso 149 * describing the cases where GSO segment length exceeds the 150 * egress MTU. 151 */ 152 features = netif_skb_features(skb); 153 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); 154 if (IS_ERR_OR_NULL(segs)) { 155 kfree_skb(skb); 156 return -ENOMEM; 157 } 158 159 consume_skb(skb); 160 161 skb_list_walk_safe(segs, segs, nskb) { 162 int err; 163 164 skb_mark_not_on_list(segs); 165 /* Last GSO segment can be smaller than gso_size (and MTU). 166 * Adding a fragment header would produce an "atomic fragment", 167 * which is considered harmful (RFC-8021). Avoid that. 168 */ 169 err = segs->len > mtu ? 170 ip6_fragment(net, sk, segs, ip6_finish_output2) : 171 ip6_finish_output2(net, sk, segs); 172 if (err && ret == 0) 173 ret = err; 174 } 175 176 return ret; 177 } 178 179 static int ip6_finish_output_gso(struct net *net, struct sock *sk, 180 struct sk_buff *skb, unsigned int mtu) 181 { 182 if (unlikely(!skb_gso_validate_network_len(skb, mtu))) 183 return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu); 184 185 return ip6_finish_output2(net, sk, skb); 186 } 187 188 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) 189 { 190 unsigned int mtu; 191 192 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) 193 /* Policy lookup after SNAT yielded a new policy */ 194 if (skb_dst(skb)->xfrm) { 195 IP6CB(skb)->flags |= IP6SKB_REROUTED; 196 return dst_output(net, sk, skb); 197 } 198 #endif 199 200 mtu = ip6_skb_dst_mtu(skb); 201 if (skb_is_gso(skb)) 202 return ip6_finish_output_gso(net, sk, skb, mtu); 203 204 if (unlikely(skb->len > mtu || 205 (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))) 206 return ip6_fragment(net, sk, skb, ip6_finish_output2); 207 208 return ip6_finish_output2(net, sk, skb); 209 } 
210 211 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) 212 { 213 int ret; 214 215 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); 216 switch (ret) { 217 case NET_XMIT_SUCCESS: 218 case NET_XMIT_CN: 219 return __ip6_finish_output(net, sk, skb) ? : ret; 220 default: 221 kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS); 222 return ret; 223 } 224 } 225 226 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) 227 { 228 struct dst_entry *dst = skb_dst(skb); 229 struct net_device *dev, *indev = skb->dev; 230 struct inet6_dev *idev; 231 int ret; 232 233 skb->protocol = htons(ETH_P_IPV6); 234 rcu_read_lock(); 235 dev = dst_dev_rcu(dst); 236 idev = ip6_dst_idev(dst); 237 skb->dev = dev; 238 239 if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) { 240 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); 241 rcu_read_unlock(); 242 kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED); 243 return 0; 244 } 245 246 ret = NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, 247 net, sk, skb, indev, dev, 248 ip6_finish_output, 249 !(IP6CB(skb)->flags & IP6SKB_REROUTED)); 250 rcu_read_unlock(); 251 return ret; 252 } 253 EXPORT_SYMBOL(ip6_output); 254 255 bool ip6_autoflowlabel(struct net *net, const struct sock *sk) 256 { 257 if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk)) 258 return ip6_default_np_autolabel(net); 259 return inet6_test_bit(AUTOFLOWLABEL, sk); 260 } 261 262 int ip6_dst_hoplimit(struct dst_entry *dst) 263 { 264 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT); 265 266 rcu_read_lock(); 267 if (hoplimit == 0) { 268 struct net_device *dev = dst_dev_rcu(dst); 269 struct inet6_dev *idev; 270 271 idev = __in6_dev_get(dev); 272 if (idev) 273 hoplimit = READ_ONCE(idev->cnf.hop_limit); 274 else 275 hoplimit = READ_ONCE(dev_net(dev)->ipv6.devconf_all->hop_limit); 276 } 277 rcu_read_unlock(); 278 279 return hoplimit; 280 } 281 EXPORT_SYMBOL(ip6_dst_hoplimit); 282 283 /* 284 * xmit an sk_buff (used by TCP and SCTP) 285 * 
Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 *
 * Pushes extension headers from @opt (if any) and the IPv6 header in front
 * of the payload, then sends the packet through the LOCAL_OUT hook.
 *
 * Returns the NF_HOOK() result, 0 if l3mdev_ip6_out() consumed the skb,
 * -ENOBUFS if headroom could not be expanded, or -EMSGSIZE (after
 * ipv6_local_error()) if the packet exceeds the MTU and is neither GSO nor
 * marked ignore_df.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct inet6_dev *idev = ip6_dst_idev(dst);
	struct net *net = sock_net(sk);
	unsigned int head_room;
	struct net_device *dev;
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int ret, hlimit = -1;
	u32 mtu;

	rcu_read_lock();

	dev = dst_dev_rcu(dst);
	/* Room for the IPv6 header, link-layer header and all options. */
	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(head_room > skb_headroom(skb))) {
		/* idev stays alive while we hold rcu_read_lock(). */
		skb = skb_expand_head(skb, head_room);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			ret = -ENOBUFS;
			goto unlock;
		}
	}

	if (unlikely(opt)) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			proto = ipv6_push_frag_opts(skb, opt, proto);

		/* May update first_hop (passed by reference). */
		if (opt->opt_nflen)
			proto = ipv6_push_nfrag_opts(skb, opt, proto,
						     &first_hop,
						     &fl6->saddr);
	}

	/* Payload too long for the 16-bit length field: write 0 instead. */
	if (unlikely(seg_len > IPV6_MAXPLEN))
		seg_len = 0;

	__skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	/* Socket hop limit if set (>= 0), otherwise the route's. */
	if (np)
		hlimit = READ_ONCE(np->hop_limit);
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, sk), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = priority;
	skb->mark = mark;

	mtu = dst6_mtu(dst);
	if (likely((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb))) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb)) {
			ret = 0;
			goto unlock;
		}

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		ret = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			      net, (struct sock *)sk, skb, NULL, dev,
			      dst_output);
		goto unlock;
	}

	/* Too big and not allowed to fragment: report and drop. */
	ret = -EMSGSIZE;
	skb->dev = dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
	kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
unlock:
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(ip6_xmit);

/*
 * Hand a Router Alert packet to every socket on ip6_ra_chain whose selector
 * equals @sel and which is unbound or bound to the receiving device.
 * Sockets with RTALERT_ISOLATE set are skipped when they live in a different
 * network namespace than skb->dev.
 *
 * Every matching socket but the last receives a clone; the last one receives
 * @skb itself.  Returns 1 if @skb was passed to rawv6_rcv(), 0 if no socket
 * matched (the caller keeps @skb).
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {

			if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
			/* Deliver a clone to the previous match; the
			 * original skb is kept for the final one.
			 */
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

/*
 * Decide what to do with a packet addressed to a proxied neighbour.
 *
 * Returns 1 when the packet is one of the listed neighbour discovery ICMPv6
 * messages and must go to the local input path, -1 when the destination is
 * link-local (dst_link_failure() has been called and the caller must drop
 * it), and 0 when it should be forwarded normally.
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		/* Only the first ICMPv6 byte (the type) is needed. */
		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
		/* Re-read the header pointer after pskb_may_pull(). */
		hdr = ipv6_hdr(skb);
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

/*
 * okfn of the FORWARD hook: send the packet out through its dst, unless
 * (with CONFIG_NET_SWITCHDEV) it carries offload_l3_fwd_mark, in which case
 * it is consumed here and 0 is returned.
 */
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
#ifdef CONFIG_NET_SWITCHDEV
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	skb_clear_tstamp(skb);
	return dst_output(net, sk, skb);
}

/*
 * Return true if @skb must be rejected as too big for @mtu when forwarding.
 * The checks are ordered: a recorded frag_max_size above the MTU always
 * rejects; otherwise ignore_df, or a GSO skb whose segments fit, accepts.
 */
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}

/*
 * Forward an IPv6 packet that is not addressed to this host.
 *
 * Returns the FORWARD hook result on success, 0 when the packet was consumed
 * by a Router Alert socket, the ip6_input() result for proxied neighbour
 * discovery, -ETIMEDOUT when the hop limit is exhausted, -EMSGSIZE when the
 * packet exceeds the path MTU, and -EINVAL for every other drop.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst_dev(dst));
	struct net_device *dev;
	struct inet6_dev *idev;
	SKB_DR(reason);
	u32 mtu;

	/* idev is the ingress interface's IPv6 state; it may be NULL. */
	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	if (!READ_ONCE(net->ipv6.devconf_all->forwarding) &&
	    (!idev || !READ_ONCE(idev->cnf.force_forwarding)))
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!READ_ONCE(net->ipv6.devconf_all->disable_policy) &&
	    (!idev || !READ_ONCE(idev->cnf.disable_policy)) &&
	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) {
		int proxied = ip6_forward_proxy_check(skb);

		/* The check may have pulled data; reload the header. */
		hdr = ipv6_hdr(skb);
		if (proxied > 0) {
			/* It's tempting to decrease the hop limit
			 * here by 1, as we do at the end of the
			 * function too.
			 *
			 * But that would be incorrect, as proxying is
			 * not forwarding. The ip6_input function
			 * will handle this packet locally, and it
			 * depends on the hop limit being unchanged.
			 *
			 * One example is the NDP hop limit, that
			 * always has to stay 255, but other would be
			 * similar checks around RA packets, where the
			 * user can even change the desired limit.
			 */
			return ip6_input(skb);
		} else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		SKB_DR_SET(reason, XFRM_POLICY);
		goto drop;
	}
	/* Re-read the dst after xfrm6_route_forward(). */
	dst = skb_dst(skb);
	dev = dst_dev(dst);
	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = dst_rt6_info(dst);
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		rcu_read_lock();
		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		rcu_read_unlock();
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);

	mtu = ip6_dst_mtu_maybe_forward(dst, true);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (unlikely(ip6_pkt_too_big(skb, mtu))) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
	SKB_DR_SET(reason, IP_INADDRERRORS);
drop:
	kfree_skb_reason(skb, reason);
	return -EINVAL;
}

/*
 * Copy per-packet metadata (type, priority, protocol, dst reference, device,
 * mark, hash, tc index, netfilter state, extensions, secmark) from @from to
 * the fragment @to.  Any dst already attached to @to is dropped first.
 */
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_ext_copy(to, from);
	skb_copy_secmark(to, from);
}

/*
 * Prepare fast-path fragmentation of an skb that already carries a
 * frag_list: turn @skb itself into the first fragment and record in @iter
 * what is needed to build the following ones.
 *
 * @hlen:    length of the headers preceding the fragment header
 * @prevhdr: location of the next-header byte to overwrite with
 *           NEXTHDR_FRAGMENT
 * @nexthdr: protocol value stored in each fragment header
 * @frag_id: identification shared by all fragments
 *
 * Returns 0, or -ENOMEM if the header copy cannot be allocated.  On success
 * the caller owns iter->tmp_hdr and must kfree() it.
 */
int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
		      u8 nexthdr, __be32 frag_id,
		      struct ip6_fraglist_iter *iter)
{
	unsigned int first_len;
	struct frag_hdr *fh;

	/* BUILD HEADER */
	*prevhdr = NEXTHDR_FRAGMENT;
	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
	if (!iter->tmp_hdr)
		return -ENOMEM;

	/* Detach the fragment list from the head skb. */
	iter->frag = skb_shinfo(skb)->frag_list;
	skb_frag_list_init(skb);

	iter->offset = 0;
	iter->hlen = hlen;
	iter->frag_id = frag_id;
	iter->nexthdr = nexthdr;

	/* Open a gap for the fragment header right after the first hlen
	 * bytes, then restore those bytes from the saved copy.
	 */
	__skb_pull(skb, hlen);
	fh = __skb_push(skb, sizeof(struct frag_hdr));
	__skb_push(skb, hlen);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

	fh->nexthdr = nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(IP6_MF);
	fh->identification = frag_id;

	/* The head skb now only accounts for its own (paged) data. */
	first_len = skb_pagelen(skb);
	skb->data_len = first_len - skb_headlen(skb);
	skb->len = first_len;
	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

	return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);

/*
 * Build the headers of the next fragment, iter->frag, while @skb (the
 * fragment about to be sent) is still available: copy the saved headers,
 * fill in the fragment header and copy the metadata from @skb.
 */
void ip6_fraglist_prepare(struct sk_buff *skb,
			  struct ip6_fraglist_iter *iter)
{
	struct sk_buff *frag = iter->frag;
	unsigned int hlen = iter->hlen;
	struct frag_hdr *fh;

	frag->ip_summed = CHECKSUM_NONE;
	skb_reset_transport_header(frag);
	fh = __skb_push(frag, sizeof(struct frag_hdr));
	__skb_push(frag, hlen);
	skb_reset_network_header(frag);
	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
	/* Advance by the payload carried in the previous fragment. */
	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
	fh->nexthdr = iter->nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(iter->offset);
	/* More fragments follow unless this is the last on the list. */
	if (frag->next)
		fh->frag_off |= htons(IP6_MF);
	fh->identification = iter->frag_id;
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
	ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);

/*
 * Initialise @state for slow-path (copying) fragmentation of @skb.
 *
 * @hlen:            length of the headers copied into every fragment
 * @mtu:             maximum payload bytes per fragment
 * @needed_tailroom: extra tailroom to allocate in each fragment
 * @hdr_room:        headroom to reserve in each fragment
 */
void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
{
	state->prevhdr = prevhdr;
	state->nexthdr = nexthdr;
	state->frag_id = frag_id;

	state->hlen = hlen;
	state->mtu = mtu;

	state->left = skb->len - hlen;	/* Space per frame */
	state->ptr = hlen;		/* Where to start from */

	state->hroom = hdr_room;
	state->troom = needed_tailroom;

	state->offset = 0;
}
EXPORT_SYMBOL(ip6_frag_init);

/*
 * Allocate and fill the next fragment of @skb according to @state, then
 * advance @state.  Returns the new fragment or ERR_PTR(-ENOMEM).
 */
struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
	struct sk_buff *frag;
	struct frag_hdr *fh;
	unsigned int len;

	len = state->left;
	/* IF: it doesn't fit, use 'mtu' - the data space left */
	if (len > state->mtu)
		len = state->mtu;
	/* IF: we are not sending up to and including the packet end
	   then align the next start on an eight byte boundary */
	if (len < state->left)
		len &= ~7;

	/* Allocate buffer */
	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
			 state->hroom + state->troom, GFP_ATOMIC);
	if (!frag)
		return ERR_PTR(-ENOMEM);

	/*
	 *	Set up data on packet
	 */

	ip6_copy_metadata(frag, skb);
	skb_reserve(frag, state->hroom);
	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
	skb_reset_network_header(frag);
	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
	frag->transport_header = (frag->network_header + state->hlen +
				  sizeof(struct frag_hdr));

	/*
	 *	Charge the memory for the fragment to any owner
	 *	it might possess
	 */
	if (skb->sk)
		skb_set_owner_w(frag, skb->sk);

	/*
	 *	Copy the packet header into the new buffer.
	 */
	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

	/* Patch the copied header's next-header byte, located at the same
	 * offset as prevhdr in the original packet.
	 */
	fragnexthdr_offset = skb_network_header(frag);
	fragnexthdr_offset += prevhdr - skb_network_header(skb);
	*fragnexthdr_offset = NEXTHDR_FRAGMENT;

	/*
	 *	Build fragment header.
	 */
	fh->nexthdr = state->nexthdr;
	fh->reserved = 0;
	fh->identification = state->frag_id;

	/*
	 *	Copy a block of the IP datagram.
	 */
	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
			     len));
	state->left -= len;

	fh->frag_off = htons(state->offset);
	if (state->left > 0)
		fh->frag_off |= htons(IP6_MF);
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

	state->ptr += len;
	state->offset += len;

	return frag;
}
EXPORT_SYMBOL(ip6_frag_next);

/*
 * Fragment @skb and pass every fragment to @output.
 *
 * A fast path reuses an existing frag_list when its geometry already fits;
 * otherwise each fragment is allocated and copied (slow path).  @skb is
 * always consumed.
 *
 * Returns 0 on success, -EAFNOSUPPORT if IPv6 is not enabled, -EMSGSIZE
 * (after sending ICMPV6_PKT_TOOBIG) if the packet may not be fragmented to
 * fit, or the error from header parsing, checksumming, allocation or
 * @output.
 */
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	u8 tstamp_type = skb->tstamp_type;
	struct ip6_frag_state state;
	unsigned int mtu, hlen, nexthdr_offset;
	ktime_t tstamp = skb->tstamp;
	int hroom, err = 0;
	__be32 frag_id;
	u8 *prevhdr, nexthdr = 0;

	if (!ipv6_mod_enabled()) {
		kfree_skb(skb);
		return -EAFNOSUPPORT;
	}

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	/* Keep an offset: prevhdr is recomputed from it further down. */
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np) {
		u32 frag_size = READ_ONCE(np->frag_size);

		if (frag_size && frag_size < mtu)
			mtu = frag_size;
	}
	/* Need room for the headers plus at least 8 bytes of payload. */
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	/* From here on mtu is the payload budget per fragment. */
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct ip6_fraglist_iter iter;
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			/* Charge the fragment to the socket directly. */
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
					&iter);
		if (err < 0)
			goto fail;

		/* We prevent @rt from being freed. */
		rcu_read_lock();

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (iter.frag)
				ip6_fraglist_prepare(skb, &iter);

			skb_set_delivery_time(skb, tstamp, tstamp_type);
			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !iter.frag)
				break;

			skb = ip6_fraglist_next(&iter);
		}

		kfree(iter.tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			rcu_read_unlock();
			return 0;
		}

		/* Free the fragments that were not handed to output(). */
		kfree_skb_list(iter.frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		rcu_read_unlock();
		return err;

slow_path_clean:
		/* Undo the socket charging done for fragments visited
		 * before the one that failed the geometry check.
		 */
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	/*
	 *	Fragment the datagram.
	 */

	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
		      &state);

	/*
	 *	Keep copying data until we run out.
	 */

	while (state.left > 0) {
		frag = ip6_frag_next(skb, &state);
		if (IS_ERR(frag)) {
			err = PTR_ERR(frag);
			goto fail;
		}

		/*
		 *	Put this fragment into the sending queue.
		 */
		skb_set_delivery_time(frag, tstamp, tstamp_type);
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
EXPORT_SYMBOL_GPL(ip6_fragment);

/*
 * Return non-zero when a cached route must be treated as stale for
 * @fl_addr: the route key is not a /128 equal to @fl_addr, and there is
 * either no cached address or it differs from @fl_addr.
 */
static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

/*
 * Validate a socket's cached @dst against the flow @fl6.
 *
 * Returns @dst if it is still usable, or NULL (after releasing the
 * reference) if it is not an IPv6 dst, fails ip6_rt_check(), or does not
 * match the requested output interface.  A NULL @dst is returned as is.
 */
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = dst_rt6_info(dst);
	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr,
			 np->daddr_cache ? &sk->sk_v6_daddr : NULL) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr,
			 np->saddr_cache ? &np->saddr : NULL) ||
#endif
	   (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

/*
 * Common tail of the dst lookup helpers: resolve a route for @fl6 into
 * *@dst, selecting a source address first when the flow has none.
 *
 * Returns 0 with a usable *@dst, or a negative error with *@dst released
 * and set to NULL.  -ENETUNREACH additionally bumps OUTNOROUTES.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr)) {
		struct fib6_info *from;
		struct rt6_info *rt;

		*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : dst_rt6_info(*dst);

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
					  fl6->flowi6_l3mdev,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if ((*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = dst_rt6_info(*dst);
	rcu_read_lock();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	/* err only flags "neighbour exists but is not NUD_VALID" here. */
	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	/* A v4-mapped source needs a v4-mapped or unspecified destination. */
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net:
Network namespace to perform lookup in 1255 * @sk: socket which provides route info 1256 * @dst: pointer to dst_entry * for result 1257 * @fl6: flow to lookup 1258 * 1259 * This function performs a route lookup on the given flow. 1260 * 1261 * It returns zero on success, or a standard errno code on error. 1262 */ 1263 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst, 1264 struct flowi6 *fl6) 1265 { 1266 *dst = NULL; 1267 return ip6_dst_lookup_tail(net, sk, dst, fl6); 1268 } 1269 EXPORT_SYMBOL_GPL(ip6_dst_lookup); 1270 1271 /** 1272 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec 1273 * @net: Network namespace to perform lookup in 1274 * @sk: socket which provides route info 1275 * @fl6: flow to lookup 1276 * @final_dst: final destination address for ipsec lookup 1277 * 1278 * This function performs a route lookup on the given flow. 1279 * 1280 * It returns a valid dst pointer on success, or a pointer encoded 1281 * error code. 1282 */ 1283 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6, 1284 const struct in6_addr *final_dst) 1285 { 1286 struct dst_entry *dst = NULL; 1287 int err; 1288 1289 if (!ipv6_mod_enabled()) 1290 return ERR_PTR(-EAFNOSUPPORT); 1291 err = ip6_dst_lookup_tail(net, sk, &dst, fl6); 1292 if (err) 1293 return ERR_PTR(err); 1294 if (final_dst) 1295 fl6->daddr = *final_dst; 1296 1297 return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0); 1298 } 1299 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow); 1300 1301 /** 1302 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow 1303 * @sk: socket which provides the dst cache and route info 1304 * @fl6: flow to lookup 1305 * @final_dst: final destination address for ipsec lookup 1306 * @connected: whether @sk is connected or not 1307 * 1308 * This function performs a route lookup on the given flow with the 1309 * possibility of using the cached route in the socket if it is valid. 
1310 * It will take the socket dst lock when operating on the dst cache. 1311 * As a result, this function can only be used in process context. 1312 * 1313 * In addition, for a connected socket, cache the dst in the socket 1314 * if the current cache is not valid. 1315 * 1316 * It returns a valid dst pointer on success, or a pointer encoded 1317 * error code. 1318 */ 1319 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, 1320 const struct in6_addr *final_dst, 1321 bool connected) 1322 { 1323 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie); 1324 1325 dst = ip6_sk_dst_check(sk, dst, fl6); 1326 if (dst) 1327 return dst; 1328 1329 dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst); 1330 if (connected && !IS_ERR(dst)) 1331 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6); 1332 1333 return dst; 1334 } 1335 1336 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src, 1337 gfp_t gfp) 1338 { 1339 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL; 1340 } 1341 1342 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src, 1343 gfp_t gfp) 1344 { 1345 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL; 1346 } 1347 1348 static void ip6_append_data_mtu(unsigned int *mtu, 1349 int *maxfraglen, 1350 unsigned int fragheaderlen, 1351 struct sk_buff *skb, 1352 struct rt6_info *rt, 1353 unsigned int orig_mtu) 1354 { 1355 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) { 1356 if (!skb) { 1357 /* first fragment, reserve header_len */ 1358 *mtu = orig_mtu - rt->dst.header_len; 1359 1360 } else { 1361 /* 1362 * this fragment is not first, the headers 1363 * space is regarded as data space. 
1364 */ 1365 *mtu = orig_mtu; 1366 } 1367 *maxfraglen = ((*mtu - fragheaderlen) & ~7) 1368 + fragheaderlen - sizeof(struct frag_hdr); 1369 } 1370 } 1371 1372 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, 1373 struct ipcm6_cookie *ipc6, 1374 struct rt6_info *rt) 1375 { 1376 struct ipv6_txoptions *nopt, *opt = ipc6->opt; 1377 struct inet6_cork *v6_cork = &cork->base6; 1378 struct ipv6_pinfo *np = inet6_sk(sk); 1379 unsigned int mtu, frag_size; 1380 1381 /* callers pass dst together with a reference, set it first so 1382 * ip6_cork_release() can put it down even in case of an error. 1383 */ 1384 cork->base.dst = &rt->dst; 1385 1386 /* 1387 * setup for corking 1388 */ 1389 if (unlikely(opt)) { 1390 if (WARN_ON(v6_cork->opt)) 1391 return -EINVAL; 1392 1393 nopt = v6_cork->opt = kzalloc_obj(*opt, sk->sk_allocation); 1394 if (unlikely(!nopt)) 1395 return -ENOBUFS; 1396 1397 nopt->tot_len = sizeof(*opt); 1398 nopt->opt_flen = opt->opt_flen; 1399 nopt->opt_nflen = opt->opt_nflen; 1400 1401 nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation); 1402 if (opt->dst0opt && !nopt->dst0opt) 1403 return -ENOBUFS; 1404 1405 nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation); 1406 if (opt->dst1opt && !nopt->dst1opt) 1407 return -ENOBUFS; 1408 1409 nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation); 1410 if (opt->hopopt && !nopt->hopopt) 1411 return -ENOBUFS; 1412 1413 nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation); 1414 if (opt->srcrt && !nopt->srcrt) 1415 return -ENOBUFS; 1416 1417 /* need source address above miyazawa*/ 1418 } 1419 v6_cork->hop_limit = ipc6->hlimit; 1420 v6_cork->tclass = ipc6->tclass; 1421 v6_cork->dontfrag = ipc6->dontfrag; 1422 if (rt->dst.flags & DST_XFRM_TUNNEL) 1423 mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ? 1424 READ_ONCE(rt->dst.dev->mtu) : dst6_mtu(&rt->dst); 1425 else 1426 mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ? 
1427 READ_ONCE(rt->dst.dev->mtu) : dst6_mtu(xfrm_dst_path(&rt->dst)); 1428 1429 frag_size = READ_ONCE(np->frag_size); 1430 if (frag_size && frag_size < mtu) 1431 mtu = frag_size; 1432 1433 cork->base.fragsize = mtu; 1434 cork->base.gso_size = ipc6->gso_size; 1435 cork->base.tx_flags = 0; 1436 cork->base.mark = ipc6->sockc.mark; 1437 cork->base.priority = ipc6->sockc.priority; 1438 sock_tx_timestamp(sk, &ipc6->sockc, &cork->base.tx_flags); 1439 if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) { 1440 cork->base.flags |= IPCORK_TS_OPT_ID; 1441 cork->base.ts_opt_id = ipc6->sockc.ts_opt_id; 1442 } 1443 cork->base.length = 0; 1444 cork->base.transmit_time = ipc6->sockc.transmit_time; 1445 1446 return 0; 1447 } 1448 1449 static int __ip6_append_data(struct sock *sk, 1450 struct sk_buff_head *queue, 1451 struct inet_cork_full *cork_full, 1452 struct page_frag *pfrag, 1453 int getfrag(void *from, char *to, int offset, 1454 int len, int odd, struct sk_buff *skb), 1455 void *from, size_t length, int transhdrlen, 1456 unsigned int flags) 1457 { 1458 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu; 1459 struct inet6_cork *v6_cork = &cork_full->base6; 1460 struct inet_cork *cork = &cork_full->base; 1461 struct flowi6 *fl6 = &cork_full->fl.u.ip6; 1462 struct sk_buff *skb, *skb_prev = NULL; 1463 struct ubuf_info *uarg = NULL; 1464 int exthdrlen = 0; 1465 int dst_exthdrlen = 0; 1466 int hh_len; 1467 int copy; 1468 int err; 1469 int offset = 0; 1470 bool zc = false; 1471 u32 tskey = 0; 1472 struct rt6_info *rt = dst_rt6_info(cork->dst); 1473 bool paged, hold_tskey = false, extra_uref = false; 1474 struct ipv6_txoptions *opt = v6_cork->opt; 1475 int csummode = CHECKSUM_NONE; 1476 unsigned int maxnonfragsize, headersize; 1477 unsigned int wmem_alloc_delta = 0; 1478 1479 skb = skb_peek_tail(queue); 1480 if (!skb) { 1481 exthdrlen = opt ? 
opt->opt_flen : 0; 1482 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len; 1483 } 1484 1485 paged = !!cork->gso_size; 1486 mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize; 1487 orig_mtu = mtu; 1488 1489 hh_len = LL_RESERVED_SPACE(rt->dst.dev); 1490 1491 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + 1492 (opt ? opt->opt_nflen : 0); 1493 1494 headersize = sizeof(struct ipv6hdr) + 1495 (opt ? opt->opt_flen + opt->opt_nflen : 0) + 1496 rt->rt6i_nfheader_len; 1497 1498 if (mtu <= fragheaderlen || 1499 ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr)) 1500 goto emsgsize; 1501 1502 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - 1503 sizeof(struct frag_hdr); 1504 1505 /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit 1506 * the first fragment 1507 */ 1508 if (headersize + transhdrlen > mtu) 1509 goto emsgsize; 1510 1511 if (cork->length + length > mtu - headersize && v6_cork->dontfrag && 1512 (sk->sk_protocol == IPPROTO_UDP || 1513 sk->sk_protocol == IPPROTO_ICMPV6 || 1514 sk->sk_protocol == IPPROTO_RAW)) { 1515 ipv6_local_rxpmtu(sk, fl6, mtu - headersize + 1516 sizeof(struct ipv6hdr)); 1517 goto emsgsize; 1518 } 1519 1520 if (ip6_sk_ignore_df(sk)) 1521 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN; 1522 else 1523 maxnonfragsize = mtu; 1524 1525 if (cork->length + length > maxnonfragsize - headersize) { 1526 emsgsize: 1527 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0); 1528 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu); 1529 return -EMSGSIZE; 1530 } 1531 1532 /* CHECKSUM_PARTIAL only with no extension headers and when 1533 * we are not going to fragment 1534 */ 1535 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP && 1536 headersize == sizeof(struct ipv6hdr) && 1537 length <= mtu - headersize && 1538 (!(flags & MSG_MORE) || cork->gso_size) && 1539 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) 1540 csummode = CHECKSUM_PARTIAL; 1541 1542 if ((flags & 
MSG_ZEROCOPY) && length) { 1543 struct msghdr *msg = from; 1544 1545 if (getfrag == ip_generic_getfrag && msg->msg_ubuf) { 1546 if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb)) 1547 return -EINVAL; 1548 1549 /* Leave uarg NULL if can't zerocopy, callers should 1550 * be able to handle it. 1551 */ 1552 if ((rt->dst.dev->features & NETIF_F_SG) && 1553 csummode == CHECKSUM_PARTIAL) { 1554 paged = true; 1555 zc = true; 1556 uarg = msg->msg_ubuf; 1557 } 1558 } else if (sock_flag(sk, SOCK_ZEROCOPY)) { 1559 uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb), 1560 false); 1561 if (!uarg) 1562 return -ENOBUFS; 1563 extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ 1564 if (rt->dst.dev->features & NETIF_F_SG && 1565 csummode == CHECKSUM_PARTIAL) { 1566 paged = true; 1567 zc = true; 1568 } else { 1569 uarg_to_msgzc(uarg)->zerocopy = 0; 1570 skb_zcopy_set(skb, uarg, &extra_uref); 1571 } 1572 } 1573 } else if ((flags & MSG_SPLICE_PAGES) && length) { 1574 if (inet_test_bit(HDRINCL, sk)) 1575 return -EPERM; 1576 if (rt->dst.dev->features & NETIF_F_SG && 1577 getfrag == ip_generic_getfrag) 1578 /* We need an empty buffer to attach stuff to */ 1579 paged = true; 1580 else 1581 flags &= ~MSG_SPLICE_PAGES; 1582 } 1583 1584 if (cork->tx_flags & SKBTX_ANY_TSTAMP && 1585 READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) { 1586 if (cork->flags & IPCORK_TS_OPT_ID) { 1587 tskey = cork->ts_opt_id; 1588 } else { 1589 tskey = atomic_inc_return(&sk->sk_tskey) - 1; 1590 hold_tskey = true; 1591 } 1592 } 1593 1594 /* 1595 * Let's try using as much space as possible. 1596 * Use MTU if total length of the message fits into the MTU. 1597 * Otherwise, we need to reserve fragment header and 1598 * fragment alignment (= 8-15 octects, in total). 1599 * 1600 * Note that we may need to "move" the data from the tail 1601 * of the buffer to the new fragment when we split 1602 * the message. 
1603 * 1604 * FIXME: It may be fragmented into multiple chunks 1605 * at once if non-fragmentable extension headers 1606 * are too large. 1607 * --yoshfuji 1608 */ 1609 1610 cork->length += length; 1611 if (!skb) 1612 goto alloc_new_skb; 1613 1614 while (length > 0) { 1615 /* Check if the remaining data fits into current packet. */ 1616 copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len; 1617 if (copy < length) 1618 copy = maxfraglen - skb->len; 1619 1620 if (copy <= 0) { 1621 char *data; 1622 unsigned int datalen; 1623 unsigned int fraglen; 1624 unsigned int fraggap; 1625 unsigned int alloclen, alloc_extra; 1626 unsigned int pagedlen; 1627 alloc_new_skb: 1628 /* There's no room in the current skb */ 1629 if (skb) 1630 fraggap = skb->len - maxfraglen; 1631 else 1632 fraggap = 0; 1633 /* update mtu and maxfraglen if necessary */ 1634 if (!skb || !skb_prev) 1635 ip6_append_data_mtu(&mtu, &maxfraglen, 1636 fragheaderlen, skb, rt, 1637 orig_mtu); 1638 1639 skb_prev = skb; 1640 1641 /* 1642 * If remaining data exceeds the mtu, 1643 * we know we need more fragment(s). 1644 */ 1645 datalen = length + fraggap; 1646 1647 if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen) 1648 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len; 1649 fraglen = datalen + fragheaderlen; 1650 pagedlen = 0; 1651 1652 alloc_extra = hh_len; 1653 alloc_extra += dst_exthdrlen; 1654 alloc_extra += rt->dst.trailer_len; 1655 1656 /* We just reserve space for fragment header. 1657 * Note: this may be overallocation if the message 1658 * (without MSG_MORE) fits into the MTU. 
1659 */ 1660 alloc_extra += sizeof(struct frag_hdr); 1661 1662 if ((flags & MSG_MORE) && 1663 !(rt->dst.dev->features&NETIF_F_SG)) 1664 alloclen = mtu; 1665 else if (!paged && 1666 (fraglen + alloc_extra < SKB_MAX_ALLOC || 1667 !(rt->dst.dev->features & NETIF_F_SG))) 1668 alloclen = fraglen; 1669 else { 1670 alloclen = fragheaderlen + transhdrlen + fraggap; 1671 pagedlen = datalen - transhdrlen - fraggap; 1672 } 1673 alloclen += alloc_extra; 1674 1675 if (datalen != length + fraggap) { 1676 /* 1677 * this is not the last fragment, the trailer 1678 * space is regarded as data space. 1679 */ 1680 datalen += rt->dst.trailer_len; 1681 } 1682 1683 fraglen = datalen + fragheaderlen; 1684 1685 copy = datalen - transhdrlen - fraggap - pagedlen; 1686 if (copy < 0) { 1687 err = -EINVAL; 1688 goto error; 1689 } 1690 if (transhdrlen) { 1691 skb = sock_alloc_send_skb(sk, alloclen, 1692 (flags & MSG_DONTWAIT), &err); 1693 } else { 1694 skb = NULL; 1695 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <= 1696 2 * sk->sk_sndbuf) 1697 skb = alloc_skb(alloclen, 1698 sk->sk_allocation); 1699 if (unlikely(!skb)) 1700 err = -ENOBUFS; 1701 } 1702 if (!skb) 1703 goto error; 1704 /* 1705 * Fill in the control structures 1706 */ 1707 skb->protocol = htons(ETH_P_IPV6); 1708 skb->ip_summed = csummode; 1709 skb->csum = 0; 1710 /* reserve for fragmentation and ipsec header */ 1711 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) + 1712 dst_exthdrlen); 1713 1714 /* 1715 * Find where to start putting bytes 1716 */ 1717 data = skb_put(skb, fraglen - pagedlen); 1718 skb_set_network_header(skb, exthdrlen); 1719 data += fragheaderlen; 1720 skb->transport_header = (skb->network_header + 1721 fragheaderlen); 1722 if (fraggap) { 1723 skb->csum = skb_copy_and_csum_bits( 1724 skb_prev, maxfraglen, 1725 data + transhdrlen, fraggap); 1726 skb_prev->csum = csum_sub(skb_prev->csum, 1727 skb->csum); 1728 data += fraggap; 1729 pskb_trim_unique(skb_prev, maxfraglen); 1730 } 1731 if (copy > 0 && 1732 
INDIRECT_CALL_1(getfrag, ip_generic_getfrag, 1733 from, data + transhdrlen, offset, 1734 copy, fraggap, skb) < 0) { 1735 err = -EFAULT; 1736 kfree_skb(skb); 1737 goto error; 1738 } else if (flags & MSG_SPLICE_PAGES) { 1739 copy = 0; 1740 } 1741 1742 offset += copy; 1743 length -= copy + transhdrlen; 1744 transhdrlen = 0; 1745 exthdrlen = 0; 1746 dst_exthdrlen = 0; 1747 1748 /* Only the initial fragment is time stamped */ 1749 skb_shinfo(skb)->tx_flags = cork->tx_flags; 1750 cork->tx_flags = 0; 1751 skb_shinfo(skb)->tskey = tskey; 1752 tskey = 0; 1753 skb_zcopy_set(skb, uarg, &extra_uref); 1754 1755 if ((flags & MSG_CONFIRM) && !skb_prev) 1756 skb_set_dst_pending_confirm(skb, 1); 1757 1758 /* 1759 * Put the packet on the pending queue 1760 */ 1761 if (!skb->destructor) { 1762 skb->destructor = sock_wfree; 1763 skb->sk = sk; 1764 wmem_alloc_delta += skb->truesize; 1765 } 1766 __skb_queue_tail(queue, skb); 1767 continue; 1768 } 1769 1770 if (copy > length) 1771 copy = length; 1772 1773 if (!(rt->dst.dev->features&NETIF_F_SG) && 1774 skb_tailroom(skb) >= copy) { 1775 unsigned int off; 1776 1777 off = skb->len; 1778 if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, 1779 from, skb_put(skb, copy), 1780 offset, copy, off, skb) < 0) { 1781 __skb_trim(skb, off); 1782 err = -EFAULT; 1783 goto error; 1784 } 1785 } else if (flags & MSG_SPLICE_PAGES) { 1786 struct msghdr *msg = from; 1787 1788 err = -EIO; 1789 if (WARN_ON_ONCE(copy > msg->msg_iter.count)) 1790 goto error; 1791 1792 err = skb_splice_from_iter(skb, &msg->msg_iter, copy); 1793 if (err < 0) 1794 goto error; 1795 copy = err; 1796 if (!(flags & MSG_NO_SHARED_FRAGS)) 1797 skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; 1798 wmem_alloc_delta += copy; 1799 } else if (!zc) { 1800 int i = skb_shinfo(skb)->nr_frags; 1801 1802 err = -ENOMEM; 1803 if (!sk_page_frag_refill(sk, pfrag)) 1804 goto error; 1805 1806 skb_zcopy_downgrade_managed(skb); 1807 if (!skb_can_coalesce(skb, i, pfrag->page, 1808 pfrag->offset)) { 1809 err = 
-EMSGSIZE; 1810 if (i == MAX_SKB_FRAGS) 1811 goto error; 1812 1813 __skb_fill_page_desc(skb, i, pfrag->page, 1814 pfrag->offset, 0); 1815 skb_shinfo(skb)->nr_frags = ++i; 1816 get_page(pfrag->page); 1817 } 1818 copy = min_t(int, copy, pfrag->size - pfrag->offset); 1819 if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag, 1820 from, 1821 page_address(pfrag->page) + pfrag->offset, 1822 offset, copy, skb->len, skb) < 0) 1823 goto error_efault; 1824 1825 pfrag->offset += copy; 1826 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); 1827 skb->len += copy; 1828 skb->data_len += copy; 1829 skb->truesize += copy; 1830 wmem_alloc_delta += copy; 1831 } else { 1832 err = skb_zerocopy_iter_dgram(skb, from, copy); 1833 if (err < 0) 1834 goto error; 1835 } 1836 offset += copy; 1837 length -= copy; 1838 } 1839 1840 if (wmem_alloc_delta) 1841 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); 1842 return 0; 1843 1844 error_efault: 1845 err = -EFAULT; 1846 error: 1847 net_zcopy_put_abort(uarg, extra_uref); 1848 cork->length -= length; 1849 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); 1850 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); 1851 if (hold_tskey) 1852 atomic_dec(&sk->sk_tskey); 1853 return err; 1854 } 1855 1856 int ip6_append_data(struct sock *sk, 1857 int getfrag(void *from, char *to, int offset, int len, 1858 int odd, struct sk_buff *skb), 1859 void *from, size_t length, int transhdrlen, 1860 struct ipcm6_cookie *ipc6, struct flowi6 *fl6, 1861 struct rt6_info *rt, unsigned int flags) 1862 { 1863 struct inet_sock *inet = inet_sk(sk); 1864 int exthdrlen; 1865 int err; 1866 1867 if (flags&MSG_PROBE) 1868 return 0; 1869 if (skb_queue_empty(&sk->sk_write_queue)) { 1870 /* 1871 * setup for corking 1872 */ 1873 dst_hold(&rt->dst); 1874 err = ip6_setup_cork(sk, &inet->cork, 1875 ipc6, rt); 1876 if (err) 1877 return err; 1878 1879 inet->cork.fl.u.ip6 = *fl6; 1880 exthdrlen = (ipc6->opt ? 
ipc6->opt->opt_flen : 0); 1881 length += exthdrlen; 1882 transhdrlen += exthdrlen; 1883 } else { 1884 transhdrlen = 0; 1885 } 1886 1887 return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork, 1888 sk_page_frag(sk), getfrag, 1889 from, length, transhdrlen, flags); 1890 } 1891 EXPORT_SYMBOL_GPL(ip6_append_data); 1892 1893 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork) 1894 { 1895 struct dst_entry *dst = cork->base.dst; 1896 1897 cork->base.dst = NULL; 1898 skb_dst_set(skb, dst); 1899 } 1900 1901 static void ip6_cork_release(struct inet_cork_full *cork) 1902 { 1903 struct inet6_cork *v6_cork = &cork->base6; 1904 1905 if (unlikely(v6_cork->opt)) { 1906 struct ipv6_txoptions *opt = v6_cork->opt; 1907 1908 kfree(opt->dst0opt); 1909 kfree(opt->dst1opt); 1910 kfree(opt->hopopt); 1911 kfree(opt->srcrt); 1912 kfree(opt); 1913 v6_cork->opt = NULL; 1914 } 1915 1916 if (cork->base.dst) { 1917 dst_release(cork->base.dst); 1918 cork->base.dst = NULL; 1919 } 1920 } 1921 1922 struct sk_buff *__ip6_make_skb(struct sock *sk, 1923 struct sk_buff_head *queue, 1924 struct inet_cork_full *cork) 1925 { 1926 struct sk_buff *skb, *tmp_skb; 1927 struct sk_buff **tail_skb; 1928 struct in6_addr *final_dst; 1929 struct net *net = sock_net(sk); 1930 struct ipv6hdr *hdr; 1931 struct ipv6_txoptions *opt; 1932 struct rt6_info *rt = dst_rt6_info(cork->base.dst); 1933 struct flowi6 *fl6 = &cork->fl.u.ip6; 1934 unsigned char proto = fl6->flowi6_proto; 1935 1936 skb = __skb_dequeue(queue); 1937 if (!skb) 1938 goto out; 1939 tail_skb = &(skb_shinfo(skb)->frag_list); 1940 1941 /* move skb->data to ip header from ext header */ 1942 if (skb->data < skb_network_header(skb)) 1943 __skb_pull(skb, skb_network_offset(skb)); 1944 while ((tmp_skb = __skb_dequeue(queue)) != NULL) { 1945 __skb_pull(tmp_skb, skb_network_header_len(skb)); 1946 *tail_skb = tmp_skb; 1947 tail_skb = &(tmp_skb->next); 1948 skb->len += tmp_skb->len; 1949 skb->data_len += tmp_skb->len; 1950 
skb->truesize += tmp_skb->truesize; 1951 tmp_skb->destructor = NULL; 1952 tmp_skb->sk = NULL; 1953 } 1954 1955 /* Allow local fragmentation. */ 1956 skb->ignore_df = ip6_sk_ignore_df(sk); 1957 __skb_pull(skb, skb_network_header_len(skb)); 1958 1959 final_dst = &fl6->daddr; 1960 opt = cork->base6.opt; 1961 if (unlikely(opt)) { 1962 if (opt->opt_flen) 1963 proto = ipv6_push_frag_opts(skb, opt, proto); 1964 if (opt->opt_nflen) 1965 proto = ipv6_push_nfrag_opts(skb, opt, proto, 1966 &final_dst, &fl6->saddr); 1967 } 1968 skb_push(skb, sizeof(struct ipv6hdr)); 1969 skb_reset_network_header(skb); 1970 hdr = ipv6_hdr(skb); 1971 1972 ip6_flow_hdr(hdr, cork->base6.tclass, 1973 ip6_make_flowlabel(net, skb, fl6->flowlabel, 1974 ip6_autoflowlabel(net, sk), fl6)); 1975 hdr->hop_limit = cork->base6.hop_limit; 1976 hdr->nexthdr = proto; 1977 hdr->saddr = fl6->saddr; 1978 hdr->daddr = *final_dst; 1979 1980 skb->priority = cork->base.priority; 1981 skb->mark = cork->base.mark; 1982 if (sk_is_tcp(sk)) 1983 skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC); 1984 else 1985 skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid); 1986 1987 ip6_cork_steal_dst(skb, cork); 1988 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS); 1989 if (unlikely(proto == IPPROTO_ICMPV6)) { 1990 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); 1991 u8 icmp6_type; 1992 1993 if (sk->sk_socket->type == SOCK_RAW && 1994 !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH)) 1995 icmp6_type = fl6->fl6_icmp_type; 1996 else 1997 icmp6_type = icmp6_hdr(skb)->icmp6_type; 1998 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type); 1999 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); 2000 } 2001 2002 ip6_cork_release(cork); 2003 out: 2004 return skb; 2005 } 2006 2007 int ip6_send_skb(struct sk_buff *skb) 2008 { 2009 struct net *net = sock_net(skb->sk); 2010 struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); 2011 int err; 2012 2013 rcu_read_lock(); 2014 err = ip6_local_out(net, 
skb->sk, skb); 2015 if (err) { 2016 if (err > 0) 2017 err = net_xmit_errno(err); 2018 if (err) 2019 IP6_INC_STATS(net, rt->rt6i_idev, 2020 IPSTATS_MIB_OUTDISCARDS); 2021 } 2022 2023 rcu_read_unlock(); 2024 return err; 2025 } 2026 2027 int ip6_push_pending_frames(struct sock *sk) 2028 { 2029 struct sk_buff *skb; 2030 2031 skb = ip6_finish_skb(sk); 2032 if (!skb) 2033 return 0; 2034 2035 return ip6_send_skb(skb); 2036 } 2037 EXPORT_SYMBOL_GPL(ip6_push_pending_frames); 2038 2039 static void __ip6_flush_pending_frames(struct sock *sk, 2040 struct sk_buff_head *queue, 2041 struct inet_cork_full *cork) 2042 { 2043 struct sk_buff *skb; 2044 2045 while ((skb = __skb_dequeue_tail(queue)) != NULL) { 2046 if (skb_dst(skb)) 2047 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)), 2048 IPSTATS_MIB_OUTDISCARDS); 2049 kfree_skb(skb); 2050 } 2051 2052 ip6_cork_release(cork); 2053 } 2054 2055 void ip6_flush_pending_frames(struct sock *sk) 2056 { 2057 __ip6_flush_pending_frames(sk, &sk->sk_write_queue, 2058 &inet_sk(sk)->cork); 2059 } 2060 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames); 2061 2062 struct sk_buff *ip6_make_skb(struct sock *sk, 2063 int getfrag(void *from, char *to, int offset, 2064 int len, int odd, struct sk_buff *skb), 2065 void *from, size_t length, int transhdrlen, 2066 struct ipcm6_cookie *ipc6, struct rt6_info *rt, 2067 unsigned int flags, struct inet_cork_full *cork) 2068 { 2069 int exthdrlen = (ipc6->opt ? 
ipc6->opt->opt_flen : 0); 2070 struct sk_buff_head queue; 2071 int err; 2072 2073 if (flags & MSG_PROBE) { 2074 dst_release(&rt->dst); 2075 return NULL; 2076 } 2077 2078 __skb_queue_head_init(&queue); 2079 2080 cork->base.flags = 0; 2081 cork->base.addr = 0; 2082 cork->base.opt = NULL; 2083 cork->base6.opt = NULL; 2084 err = ip6_setup_cork(sk, cork, ipc6, rt); 2085 if (err) { 2086 ip6_cork_release(cork); 2087 return ERR_PTR(err); 2088 } 2089 2090 err = __ip6_append_data(sk, &queue, cork, 2091 ¤t->task_frag, getfrag, from, 2092 length + exthdrlen, transhdrlen + exthdrlen, 2093 flags); 2094 if (err) { 2095 __ip6_flush_pending_frames(sk, &queue, cork); 2096 return ERR_PTR(err); 2097 } 2098 2099 return __ip6_make_skb(sk, &queue, cork); 2100 } 2101