// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/gso.h>
#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>

static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct inet6_dev *idev = ip6_dst_idev(dst);
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	const struct in6_addr *daddr, *nexthop;
	struct ipv6hdr *hdr;
	struct neighbour *neigh;
	int ret;

	/* Be paranoid, rather than too clever. */
	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
		skb = skb_expand_head(skb, hh_len);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			return -ENOMEM;
		}
	}

	hdr = ipv6_hdr(skb);
	daddr = &hdr->daddr;
	if (ipv6_addr_is_multicast(daddr)) {
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
91 */ 92 if (newskb) 93 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, 94 net, sk, newskb, NULL, newskb->dev, 95 dev_loopback_xmit); 96 97 if (hdr->hop_limit == 0) { 98 IP6_INC_STATS(net, idev, 99 IPSTATS_MIB_OUTDISCARDS); 100 kfree_skb(skb); 101 return 0; 102 } 103 } 104 105 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len); 106 if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL && 107 !(dev->flags & IFF_LOOPBACK)) { 108 kfree_skb(skb); 109 return 0; 110 } 111 } 112 113 if (lwtunnel_xmit_redirect(dst->lwtstate)) { 114 int res = lwtunnel_xmit(skb); 115 116 if (res != LWTUNNEL_XMIT_CONTINUE) 117 return res; 118 } 119 120 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len); 121 122 rcu_read_lock(); 123 nexthop = rt6_nexthop((struct rt6_info *)dst, daddr); 124 neigh = __ipv6_neigh_lookup_noref(dev, nexthop); 125 126 if (unlikely(IS_ERR_OR_NULL(neigh))) { 127 if (unlikely(!neigh)) 128 neigh = __neigh_create(&nd_tbl, nexthop, dev, false); 129 if (IS_ERR(neigh)) { 130 rcu_read_unlock(); 131 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES); 132 kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL); 133 return -EINVAL; 134 } 135 } 136 sock_confirm_neigh(skb, neigh); 137 ret = neigh_output(neigh, skb, false); 138 rcu_read_unlock(); 139 return ret; 140 } 141 142 static int 143 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk, 144 struct sk_buff *skb, unsigned int mtu) 145 { 146 struct sk_buff *segs, *nskb; 147 netdev_features_t features; 148 int ret = 0; 149 150 /* Please see corresponding comment in ip_finish_output_gso 151 * describing the cases where GSO segment length exceeds the 152 * egress MTU. 153 */ 154 features = netif_skb_features(skb); 155 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK); 156 if (IS_ERR_OR_NULL(segs)) { 157 kfree_skb(skb); 158 return -ENOMEM; 159 } 160 161 consume_skb(skb); 162 163 skb_list_walk_safe(segs, segs, nskb) { 164 int err; 165 166 skb_mark_not_on_list(segs); 167 err = ip6_fragment(net, sk, segs, ip6_finish_output2); 168 if (err && ret == 0) 169 ret = err; 170 } 171 172 return ret; 173 } 174 175 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) 176 { 177 unsigned int mtu; 178 179 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) 180 /* Policy lookup after SNAT yielded a new policy */ 181 if (skb_dst(skb)->xfrm) { 182 IP6CB(skb)->flags |= IP6SKB_REROUTED; 183 return dst_output(net, sk, skb); 184 } 185 #endif 186 187 mtu = ip6_skb_dst_mtu(skb); 188 if (skb_is_gso(skb) && 189 !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) && 190 !skb_gso_validate_network_len(skb, mtu)) 191 return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu); 192 193 if ((skb->len > mtu && !skb_is_gso(skb)) || 194 dst_allfrag(skb_dst(skb)) || 195 (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)) 196 return ip6_fragment(net, sk, skb, ip6_finish_output2); 197 else 198 return ip6_finish_output2(net, sk, skb); 199 } 200 201 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) 202 { 203 int ret; 204 205 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); 206 switch (ret) { 207 case NET_XMIT_SUCCESS: 208 case NET_XMIT_CN: 209 return __ip6_finish_output(net, sk, skb) ? 
: ret; 210 default: 211 kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS); 212 return ret; 213 } 214 } 215 216 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) 217 { 218 struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev; 219 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); 220 221 skb->protocol = htons(ETH_P_IPV6); 222 skb->dev = dev; 223 224 if (unlikely(idev->cnf.disable_ipv6)) { 225 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); 226 kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED); 227 return 0; 228 } 229 230 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, 231 net, sk, skb, indev, dev, 232 ip6_finish_output, 233 !(IP6CB(skb)->flags & IP6SKB_REROUTED)); 234 } 235 EXPORT_SYMBOL(ip6_output); 236 237 bool ip6_autoflowlabel(struct net *net, const struct sock *sk) 238 { 239 if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk)) 240 return ip6_default_np_autolabel(net); 241 return inet6_test_bit(AUTOFLOWLABEL, sk); 242 } 243 244 /* 245 * xmit an sk_buff (used by TCP, SCTP and DCCP) 246 * Note : socket lock is not held for SYNACK packets, but might be modified 247 * by calls to skb_set_owner_w() and ipv6_local_error(), 248 * which are using proper atomic operations or spinlocks. 249 */ 250 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, 251 __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority) 252 { 253 struct net *net = sock_net(sk); 254 const struct ipv6_pinfo *np = inet6_sk(sk); 255 struct in6_addr *first_hop = &fl6->daddr; 256 struct dst_entry *dst = skb_dst(skb); 257 struct net_device *dev = dst->dev; 258 struct inet6_dev *idev = ip6_dst_idev(dst); 259 struct hop_jumbo_hdr *hop_jumbo; 260 int hoplen = sizeof(*hop_jumbo); 261 unsigned int head_room; 262 struct ipv6hdr *hdr; 263 u8 proto = fl6->flowi6_proto; 264 int seg_len = skb->len; 265 int hlimit = -1; 266 u32 mtu; 267 268 head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev); 269 if (opt) 270 head_room += opt->opt_nflen + opt->opt_flen; 271 272 if (unlikely(head_room > skb_headroom(skb))) { 273 skb = skb_expand_head(skb, head_room); 274 if (!skb) { 275 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); 276 return -ENOBUFS; 277 } 278 } 279 280 if (opt) { 281 seg_len += opt->opt_nflen + opt->opt_flen; 282 283 if (opt->opt_flen) 284 ipv6_push_frag_opts(skb, opt, &proto); 285 286 if (opt->opt_nflen) 287 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop, 288 &fl6->saddr); 289 } 290 291 if (unlikely(seg_len > IPV6_MAXPLEN)) { 292 hop_jumbo = skb_push(skb, hoplen); 293 294 hop_jumbo->nexthdr = proto; 295 hop_jumbo->hdrlen = 0; 296 hop_jumbo->tlv_type = IPV6_TLV_JUMBO; 297 hop_jumbo->tlv_len = 4; 298 hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen); 299 300 proto = IPPROTO_HOPOPTS; 301 seg_len = 0; 302 IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO; 303 } 304 305 skb_push(skb, sizeof(struct ipv6hdr)); 306 skb_reset_network_header(skb); 307 hdr = ipv6_hdr(skb); 308 309 /* 310 * Fill in the IPv6 header 311 */ 312 if (np) 313 hlimit = READ_ONCE(np->hop_limit); 314 if (hlimit < 0) 315 hlimit = ip6_dst_hoplimit(dst); 316 317 ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, 318 ip6_autoflowlabel(net, sk), fl6)); 319 320 hdr->payload_len = htons(seg_len); 321 hdr->nexthdr = proto; 322 hdr->hop_limit = hlimit; 323 324 hdr->saddr = fl6->saddr; 325 hdr->daddr = *first_hop; 326 327 skb->protocol = htons(ETH_P_IPV6); 328 skb->priority = priority; 329 skb->mark = mark; 330 331 mtu = dst_mtu(dst); 332 if ((skb->len <= mtu) || 
skb->ignore_df || skb_is_gso(skb)) { 333 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS); 334 335 /* if egress device is enslaved to an L3 master device pass the 336 * skb to its handler for processing 337 */ 338 skb = l3mdev_ip6_out((struct sock *)sk, skb); 339 if (unlikely(!skb)) 340 return 0; 341 342 /* hooks should never assume socket lock is held. 343 * we promote our socket to non const 344 */ 345 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, 346 net, (struct sock *)sk, skb, NULL, dev, 347 dst_output); 348 } 349 350 skb->dev = dev; 351 /* ipv6_local_error() does not require socket lock, 352 * we promote our socket to non const 353 */ 354 ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu); 355 356 IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS); 357 kfree_skb(skb); 358 return -EMSGSIZE; 359 } 360 EXPORT_SYMBOL(ip6_xmit); 361 362 static int ip6_call_ra_chain(struct sk_buff *skb, int sel) 363 { 364 struct ip6_ra_chain *ra; 365 struct sock *last = NULL; 366 367 read_lock(&ip6_ra_lock); 368 for (ra = ip6_ra_chain; ra; ra = ra->next) { 369 struct sock *sk = ra->sk; 370 if (sk && ra->sel == sel && 371 (!sk->sk_bound_dev_if || 372 sk->sk_bound_dev_if == skb->dev->ifindex)) { 373 374 if (inet6_test_bit(RTALERT_ISOLATE, sk) && 375 !net_eq(sock_net(sk), dev_net(skb->dev))) { 376 continue; 377 } 378 if (last) { 379 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 380 if (skb2) 381 rawv6_rcv(last, skb2); 382 } 383 last = sk; 384 } 385 } 386 387 if (last) { 388 rawv6_rcv(last, skb); 389 read_unlock(&ip6_ra_lock); 390 return 1; 391 } 392 read_unlock(&ip6_ra_lock); 393 return 0; 394 } 395 396 static int ip6_forward_proxy_check(struct sk_buff *skb) 397 { 398 struct ipv6hdr *hdr = ipv6_hdr(skb); 399 u8 nexthdr = hdr->nexthdr; 400 __be16 frag_off; 401 int offset; 402 403 if (ipv6_ext_hdr(nexthdr)) { 404 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off); 405 if (offset < 0) 406 return 0; 407 } else 408 offset = sizeof(struct ipv6hdr); 409 410 if (nexthdr == IPPROTO_ICMPV6) { 411 struct icmp6hdr *icmp6; 412 413 if (!pskb_may_pull(skb, (skb_network_header(skb) + 414 offset + 1 - skb->data))) 415 return 0; 416 417 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset); 418 419 switch (icmp6->icmp6_type) { 420 case NDISC_ROUTER_SOLICITATION: 421 case NDISC_ROUTER_ADVERTISEMENT: 422 case NDISC_NEIGHBOUR_SOLICITATION: 423 case NDISC_NEIGHBOUR_ADVERTISEMENT: 424 case NDISC_REDIRECT: 425 /* For reaction involving unicast neighbor discovery 426 * message destined to the proxied address, pass it to 427 * input function. 428 */ 429 return 1; 430 default: 431 break; 432 } 433 } 434 435 /* 436 * The proxying router can't forward traffic sent to a link-local 437 * address, so signal the sender and discard the packet. This 438 * behavior is clarified by the MIPv6 specification. 
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
#ifdef CONFIG_NET_SWITCHDEV
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	skb_clear_tstamp(skb);
	return dst_output(net, sk, skb);
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}

int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	struct inet6_dev *idev;
	SKB_DR(reason);
	u32 mtu;

	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!net->ipv6.devconf_all->disable_policy &&
	    (!idev || !idev->cnf.disable_policy) &&
	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We do not process RA (router alert) packets; we push them
	 *	to user level AS IS, without any guarantee that the
	 *	application will be able to interpret them. The reason is
	 *	that we cannot make anything clever here.
	 *
	 *	We are not an end node, so if the packet contains
	 *	AH/ESP we cannot do anything with it.
	 *	Defragmentation would also be a mistake: RA packets
	 *	cannot be fragmented, because there is no guarantee
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement hop limit
	 */
	if (hdr->hop_limit <= 1) {
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0) {
			/* It's tempting to decrease the hop limit
			 * here by 1, as we do at the end of the
			 * function too.
			 *
			 * But that would be incorrect, as proxying is
			 * not forwarding.  The ip6_input function
			 * will handle this packet locally, and it
			 * depends on the hop limit being unchanged.
			 *
			 * One example is the NDP hop limit, which
			 * always has to stay 255; similar checks apply
			 * to RA packets, where the user can even
			 * change the desired limit.
559 */ 560 return ip6_input(skb); 561 } else if (proxied < 0) { 562 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); 563 goto drop; 564 } 565 } 566 567 if (!xfrm6_route_forward(skb)) { 568 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); 569 SKB_DR_SET(reason, XFRM_POLICY); 570 goto drop; 571 } 572 dst = skb_dst(skb); 573 574 /* IPv6 specs say nothing about it, but it is clear that we cannot 575 send redirects to source routed frames. 576 We don't send redirects to frames decapsulated from IPsec. 577 */ 578 if (IP6CB(skb)->iif == dst->dev->ifindex && 579 opt->srcrt == 0 && !skb_sec_path(skb)) { 580 struct in6_addr *target = NULL; 581 struct inet_peer *peer; 582 struct rt6_info *rt; 583 584 /* 585 * incoming and outgoing devices are the same 586 * send a redirect. 587 */ 588 589 rt = (struct rt6_info *) dst; 590 if (rt->rt6i_flags & RTF_GATEWAY) 591 target = &rt->rt6i_gateway; 592 else 593 target = &hdr->daddr; 594 595 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1); 596 597 /* Limit redirects both by destination (here) 598 and by source (inside ndisc_send_redirect) 599 */ 600 if (inet_peer_xrlim_allow(peer, 1*HZ)) 601 ndisc_send_redirect(skb, target); 602 if (peer) 603 inet_putpeer(peer); 604 } else { 605 int addrtype = ipv6_addr_type(&hdr->saddr); 606 607 /* This check is security critical. */ 608 if (addrtype == IPV6_ADDR_ANY || 609 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK)) 610 goto error; 611 if (addrtype & IPV6_ADDR_LINKLOCAL) { 612 icmpv6_send(skb, ICMPV6_DEST_UNREACH, 613 ICMPV6_NOT_NEIGHBOUR, 0); 614 goto error; 615 } 616 } 617 618 __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); 619 620 mtu = ip6_dst_mtu_maybe_forward(dst, true); 621 if (mtu < IPV6_MIN_MTU) 622 mtu = IPV6_MIN_MTU; 623 624 if (ip6_pkt_too_big(skb, mtu)) { 625 /* Again, force OUTPUT device used as source address */ 626 skb->dev = dst->dev; 627 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 628 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS); 629 __IP6_INC_STATS(net, ip6_dst_idev(dst), 630 IPSTATS_MIB_FRAGFAILS); 631 kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG); 632 return -EMSGSIZE; 633 } 634 635 if (skb_cow(skb, dst->dev->hard_header_len)) { 636 __IP6_INC_STATS(net, ip6_dst_idev(dst), 637 IPSTATS_MIB_OUTDISCARDS); 638 goto drop; 639 } 640 641 hdr = ipv6_hdr(skb); 642 643 /* Mangling hops number delayed to point after skb COW */ 644 645 hdr->hop_limit--; 646 647 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, 648 net, NULL, skb, skb->dev, dst->dev, 649 ip6_forward_finish); 650 651 error: 652 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); 653 SKB_DR_SET(reason, IP_INADDRERRORS); 654 drop: 655 kfree_skb_reason(skb, reason); 656 return -EINVAL; 657 } 658 659 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) 660 { 661 to->pkt_type = from->pkt_type; 662 to->priority = from->priority; 663 to->protocol = from->protocol; 664 skb_dst_drop(to); 665 skb_dst_set(to, dst_clone(skb_dst(from))); 666 to->dev = from->dev; 667 to->mark = from->mark; 668 669 skb_copy_hash(to, from); 670 671 #ifdef CONFIG_NET_SCHED 672 to->tc_index = from->tc_index; 673 #endif 674 nf_copy(to, from); 675 skb_ext_copy(to, from); 676 skb_copy_secmark(to, from); 677 } 678 679 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr, 680 u8 nexthdr, __be32 frag_id, 681 struct ip6_fraglist_iter *iter) 682 { 683 unsigned int first_len; 684 struct frag_hdr *fh; 685 686 /* BUILD HEADER */ 687 *prevhdr = NEXTHDR_FRAGMENT; 688 iter->tmp_hdr = 
kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); 689 if (!iter->tmp_hdr) 690 return -ENOMEM; 691 692 iter->frag = skb_shinfo(skb)->frag_list; 693 skb_frag_list_init(skb); 694 695 iter->offset = 0; 696 iter->hlen = hlen; 697 iter->frag_id = frag_id; 698 iter->nexthdr = nexthdr; 699 700 __skb_pull(skb, hlen); 701 fh = __skb_push(skb, sizeof(struct frag_hdr)); 702 __skb_push(skb, hlen); 703 skb_reset_network_header(skb); 704 memcpy(skb_network_header(skb), iter->tmp_hdr, hlen); 705 706 fh->nexthdr = nexthdr; 707 fh->reserved = 0; 708 fh->frag_off = htons(IP6_MF); 709 fh->identification = frag_id; 710 711 first_len = skb_pagelen(skb); 712 skb->data_len = first_len - skb_headlen(skb); 713 skb->len = first_len; 714 ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr)); 715 716 return 0; 717 } 718 EXPORT_SYMBOL(ip6_fraglist_init); 719 720 void ip6_fraglist_prepare(struct sk_buff *skb, 721 struct ip6_fraglist_iter *iter) 722 { 723 struct sk_buff *frag = iter->frag; 724 unsigned int hlen = iter->hlen; 725 struct frag_hdr *fh; 726 727 frag->ip_summed = CHECKSUM_NONE; 728 skb_reset_transport_header(frag); 729 fh = __skb_push(frag, sizeof(struct frag_hdr)); 730 __skb_push(frag, hlen); 731 skb_reset_network_header(frag); 732 memcpy(skb_network_header(frag), iter->tmp_hdr, hlen); 733 iter->offset += skb->len - hlen - sizeof(struct frag_hdr); 734 fh->nexthdr = iter->nexthdr; 735 fh->reserved = 0; 736 fh->frag_off = htons(iter->offset); 737 if (frag->next) 738 fh->frag_off |= htons(IP6_MF); 739 fh->identification = iter->frag_id; 740 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); 741 ip6_copy_metadata(frag, skb); 742 } 743 EXPORT_SYMBOL(ip6_fraglist_prepare); 744 745 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu, 746 unsigned short needed_tailroom, int hdr_room, u8 *prevhdr, 747 u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state) 748 { 749 state->prevhdr = prevhdr; 750 state->nexthdr = nexthdr; 751 state->frag_id = frag_id; 752 753 state->hlen = hlen; 754 state->mtu = mtu; 755 756 state->left = skb->len - hlen; /* Space per frame */ 757 state->ptr = hlen; /* Where to start from */ 758 759 state->hroom = hdr_room; 760 state->troom = needed_tailroom; 761 762 state->offset = 0; 763 } 764 EXPORT_SYMBOL(ip6_frag_init); 765 766 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state) 767 { 768 u8 *prevhdr = state->prevhdr, *fragnexthdr_offset; 769 struct sk_buff *frag; 770 struct frag_hdr *fh; 771 unsigned int len; 772 773 len = state->left; 774 /* IF: it doesn't fit, use 'mtu' - the data space left */ 775 if (len > state->mtu) 776 len = state->mtu; 777 /* IF: we are not sending up to and including the packet end 778 then align the next start on an eight byte boundary */ 779 if (len < state->left) 780 len &= ~7; 781 782 /* Allocate buffer */ 783 frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) + 784 state->hroom + state->troom, GFP_ATOMIC); 785 if (!frag) 786 return ERR_PTR(-ENOMEM); 787 788 /* 789 * Set up data on packet 790 */ 791 792 ip6_copy_metadata(frag, skb); 793 skb_reserve(frag, state->hroom); 794 skb_put(frag, len + state->hlen + sizeof(struct frag_hdr)); 795 skb_reset_network_header(frag); 796 fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen); 797 frag->transport_header = (frag->network_header + state->hlen + 798 sizeof(struct frag_hdr)); 799 800 /* 801 * Charge the memory for the fragment to any owner 802 * it might possess 803 */ 804 if (skb->sk) 805 
skb_set_owner_w(frag, skb->sk); 806 807 /* 808 * Copy the packet header into the new buffer. 809 */ 810 skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen); 811 812 fragnexthdr_offset = skb_network_header(frag); 813 fragnexthdr_offset += prevhdr - skb_network_header(skb); 814 *fragnexthdr_offset = NEXTHDR_FRAGMENT; 815 816 /* 817 * Build fragment header. 818 */ 819 fh->nexthdr = state->nexthdr; 820 fh->reserved = 0; 821 fh->identification = state->frag_id; 822 823 /* 824 * Copy a block of the IP datagram. 825 */ 826 BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag), 827 len)); 828 state->left -= len; 829 830 fh->frag_off = htons(state->offset); 831 if (state->left > 0) 832 fh->frag_off |= htons(IP6_MF); 833 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); 834 835 state->ptr += len; 836 state->offset += len; 837 838 return frag; 839 } 840 EXPORT_SYMBOL(ip6_frag_next); 841 842 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, 843 int (*output)(struct net *, struct sock *, struct sk_buff *)) 844 { 845 struct sk_buff *frag; 846 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); 847 struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? 848 inet6_sk(skb->sk) : NULL; 849 bool mono_delivery_time = skb->mono_delivery_time; 850 struct ip6_frag_state state; 851 unsigned int mtu, hlen, nexthdr_offset; 852 ktime_t tstamp = skb->tstamp; 853 int hroom, err = 0; 854 __be32 frag_id; 855 u8 *prevhdr, nexthdr = 0; 856 857 err = ip6_find_1stfragopt(skb, &prevhdr); 858 if (err < 0) 859 goto fail; 860 hlen = err; 861 nexthdr = *prevhdr; 862 nexthdr_offset = prevhdr - skb_network_header(skb); 863 864 mtu = ip6_skb_dst_mtu(skb); 865 866 /* We must not fragment if the socket is set to force MTU discovery 867 * or if the skb it not generated by a local socket. 868 */ 869 if (unlikely(!skb->ignore_df && skb->len > mtu)) 870 goto fail_toobig; 871 872 if (IP6CB(skb)->frag_max_size) { 873 if (IP6CB(skb)->frag_max_size > mtu) 874 goto fail_toobig; 875 876 /* don't send fragments larger than what we received */ 877 mtu = IP6CB(skb)->frag_max_size; 878 if (mtu < IPV6_MIN_MTU) 879 mtu = IPV6_MIN_MTU; 880 } 881 882 if (np) { 883 u32 frag_size = READ_ONCE(np->frag_size); 884 885 if (frag_size && frag_size < mtu) 886 mtu = frag_size; 887 } 888 if (mtu < hlen + sizeof(struct frag_hdr) + 8) 889 goto fail_toobig; 890 mtu -= hlen + sizeof(struct frag_hdr); 891 892 frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr, 893 &ipv6_hdr(skb)->saddr); 894 895 if (skb->ip_summed == CHECKSUM_PARTIAL && 896 (err = skb_checksum_help(skb))) 897 goto fail; 898 899 prevhdr = skb_network_header(skb) + nexthdr_offset; 900 hroom = LL_RESERVED_SPACE(rt->dst.dev); 901 if (skb_has_frag_list(skb)) { 902 unsigned int first_len = skb_pagelen(skb); 903 struct ip6_fraglist_iter iter; 904 struct sk_buff *frag2; 905 906 if (first_len - hlen > mtu || 907 ((first_len - hlen) & 7) || 908 skb_cloned(skb) || 909 skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) 910 goto slow_path; 911 912 skb_walk_frags(skb, frag) { 913 /* Correct geometry. */ 914 if (frag->len > mtu || 915 ((frag->len & 7) && frag->next) || 916 skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr))) 917 goto slow_path_clean; 918 919 /* Partially cloned skb? 
*/ 920 if (skb_shared(frag)) 921 goto slow_path_clean; 922 923 BUG_ON(frag->sk); 924 if (skb->sk) { 925 frag->sk = skb->sk; 926 frag->destructor = sock_wfree; 927 } 928 skb->truesize -= frag->truesize; 929 } 930 931 err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id, 932 &iter); 933 if (err < 0) 934 goto fail; 935 936 /* We prevent @rt from being freed. */ 937 rcu_read_lock(); 938 939 for (;;) { 940 /* Prepare header of the next frame, 941 * before previous one went down. */ 942 if (iter.frag) 943 ip6_fraglist_prepare(skb, &iter); 944 945 skb_set_delivery_time(skb, tstamp, mono_delivery_time); 946 err = output(net, sk, skb); 947 if (!err) 948 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), 949 IPSTATS_MIB_FRAGCREATES); 950 951 if (err || !iter.frag) 952 break; 953 954 skb = ip6_fraglist_next(&iter); 955 } 956 957 kfree(iter.tmp_hdr); 958 959 if (err == 0) { 960 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), 961 IPSTATS_MIB_FRAGOKS); 962 rcu_read_unlock(); 963 return 0; 964 } 965 966 kfree_skb_list(iter.frag); 967 968 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), 969 IPSTATS_MIB_FRAGFAILS); 970 rcu_read_unlock(); 971 return err; 972 973 slow_path_clean: 974 skb_walk_frags(skb, frag2) { 975 if (frag2 == frag) 976 break; 977 frag2->sk = NULL; 978 frag2->destructor = NULL; 979 skb->truesize += frag2->truesize; 980 } 981 } 982 983 slow_path: 984 /* 985 * Fragment the datagram. 986 */ 987 988 ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom, 989 LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id, 990 &state); 991 992 /* 993 * Keep copying data until we run out. 994 */ 995 996 while (state.left > 0) { 997 frag = ip6_frag_next(skb, &state); 998 if (IS_ERR(frag)) { 999 err = PTR_ERR(frag); 1000 goto fail; 1001 } 1002 1003 /* 1004 * Put this fragment into the sending queue. 1005 */ 1006 skb_set_delivery_time(frag, tstamp, mono_delivery_time); 1007 err = output(net, sk, frag); 1008 if (err) 1009 goto fail; 1010 1011 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 1012 IPSTATS_MIB_FRAGCREATES); 1013 } 1014 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 1015 IPSTATS_MIB_FRAGOKS); 1016 consume_skb(skb); 1017 return err; 1018 1019 fail_toobig: 1020 if (skb->sk && dst_allfrag(skb_dst(skb))) 1021 sk_gso_disable(skb->sk); 1022 1023 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 1024 err = -EMSGSIZE; 1025 1026 fail: 1027 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 1028 IPSTATS_MIB_FRAGFAILS); 1029 kfree_skb(skb); 1030 return err; 1031 } 1032 1033 static inline int ip6_rt_check(const struct rt6key *rt_key, 1034 const struct in6_addr *fl_addr, 1035 const struct in6_addr *addr_cache) 1036 { 1037 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) && 1038 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache)); 1039 } 1040 1041 static struct dst_entry *ip6_sk_dst_check(struct sock *sk, 1042 struct dst_entry *dst, 1043 const struct flowi6 *fl6) 1044 { 1045 struct ipv6_pinfo *np = inet6_sk(sk); 1046 struct rt6_info *rt; 1047 1048 if (!dst) 1049 goto out; 1050 1051 if (dst->ops->family != AF_INET6) { 1052 dst_release(dst); 1053 return NULL; 1054 } 1055 1056 rt = (struct rt6_info *)dst; 1057 /* Yes, checking route validity in not connected 1058 * case is not very simple. Take into account, 1059 * that we do not support routing by source, TOS, 1060 * and MSG_DONTROUTE --ANK (980726) 1061 * 1062 * 1. ip6_rt_check(): If route was host route, 1063 * check that cached destination is current. 
1064 * If it is network route, we still may 1065 * check its validity using saved pointer 1066 * to the last used address: daddr_cache. 1067 * We do not want to save whole address now, 1068 * (because main consumer of this service 1069 * is tcp, which has not this problem), 1070 * so that the last trick works only on connected 1071 * sockets. 1072 * 2. oif also should be the same. 1073 */ 1074 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) || 1075 #ifdef CONFIG_IPV6_SUBTREES 1076 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) || 1077 #endif 1078 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) { 1079 dst_release(dst); 1080 dst = NULL; 1081 } 1082 1083 out: 1084 return dst; 1085 } 1086 1087 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, 1088 struct dst_entry **dst, struct flowi6 *fl6) 1089 { 1090 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD 1091 struct neighbour *n; 1092 struct rt6_info *rt; 1093 #endif 1094 int err; 1095 int flags = 0; 1096 1097 /* The correct way to handle this would be to do 1098 * ip6_route_get_saddr, and then ip6_route_output; however, 1099 * the route-specific preferred source forces the 1100 * ip6_route_output call _before_ ip6_route_get_saddr. 1101 * 1102 * In source specific routing (no src=any default route), 1103 * ip6_route_output will fail given src=any saddr, though, so 1104 * that's why we try it again later. 1105 */ 1106 if (ipv6_addr_any(&fl6->saddr)) { 1107 struct fib6_info *from; 1108 struct rt6_info *rt; 1109 1110 *dst = ip6_route_output(net, sk, fl6); 1111 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst; 1112 1113 rcu_read_lock(); 1114 from = rt ? rcu_dereference(rt->from) : NULL; 1115 err = ip6_route_get_saddr(net, from, &fl6->daddr, 1116 sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0, 1117 &fl6->saddr); 1118 rcu_read_unlock(); 1119 1120 if (err) 1121 goto out_err_release; 1122 1123 /* If we had an erroneous initial result, pretend it 1124 * never existed and let the SA-enabled version take 1125 * over. 1126 */ 1127 if ((*dst)->error) { 1128 dst_release(*dst); 1129 *dst = NULL; 1130 } 1131 1132 if (fl6->flowi6_oif) 1133 flags |= RT6_LOOKUP_F_IFACE; 1134 } 1135 1136 if (!*dst) 1137 *dst = ip6_route_output_flags(net, sk, fl6, flags); 1138 1139 err = (*dst)->error; 1140 if (err) 1141 goto out_err_release; 1142 1143 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD 1144 /* 1145 * Here if the dst entry we've looked up 1146 * has a neighbour entry that is in the INCOMPLETE 1147 * state and the src address from the flow is 1148 * marked as OPTIMISTIC, we release the found 1149 * dst entry and replace it instead with the 1150 * dst entry of the nexthop router 1151 */ 1152 rt = (struct rt6_info *) *dst; 1153 rcu_read_lock(); 1154 n = __ipv6_neigh_lookup_noref(rt->dst.dev, 1155 rt6_nexthop(rt, &fl6->daddr)); 1156 err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? 
-EINVAL : 0; 1157 rcu_read_unlock(); 1158 1159 if (err) { 1160 struct inet6_ifaddr *ifp; 1161 struct flowi6 fl_gw6; 1162 int redirect; 1163 1164 ifp = ipv6_get_ifaddr(net, &fl6->saddr, 1165 (*dst)->dev, 1); 1166 1167 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC); 1168 if (ifp) 1169 in6_ifa_put(ifp); 1170 1171 if (redirect) { 1172 /* 1173 * We need to get the dst entry for the 1174 * default router instead 1175 */ 1176 dst_release(*dst); 1177 memcpy(&fl_gw6, fl6, sizeof(struct flowi6)); 1178 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr)); 1179 *dst = ip6_route_output(net, sk, &fl_gw6); 1180 err = (*dst)->error; 1181 if (err) 1182 goto out_err_release; 1183 } 1184 } 1185 #endif 1186 if (ipv6_addr_v4mapped(&fl6->saddr) && 1187 !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) { 1188 err = -EAFNOSUPPORT; 1189 goto out_err_release; 1190 } 1191 1192 return 0; 1193 1194 out_err_release: 1195 dst_release(*dst); 1196 *dst = NULL; 1197 1198 if (err == -ENETUNREACH) 1199 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES); 1200 return err; 1201 } 1202 1203 /** 1204 * ip6_dst_lookup - perform route lookup on flow 1205 * @net: Network namespace to perform lookup in 1206 * @sk: socket which provides route info 1207 * @dst: pointer to dst_entry * for result 1208 * @fl6: flow to lookup 1209 * 1210 * This function performs a route lookup on the given flow. 1211 * 1212 * It returns zero on success, or a standard errno code on error. 1213 */ 1214 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst, 1215 struct flowi6 *fl6) 1216 { 1217 *dst = NULL; 1218 return ip6_dst_lookup_tail(net, sk, dst, fl6); 1219 } 1220 EXPORT_SYMBOL_GPL(ip6_dst_lookup); 1221 1222 /** 1223 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec 1224 * @net: Network namespace to perform lookup in 1225 * @sk: socket which provides route info 1226 * @fl6: flow to lookup 1227 * @final_dst: final destination address for ipsec lookup 1228 * 1229 * This function performs a route lookup on the given flow. 1230 * 1231 * It returns a valid dst pointer on success, or a pointer encoded 1232 * error code. 1233 */ 1234 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6, 1235 const struct in6_addr *final_dst) 1236 { 1237 struct dst_entry *dst = NULL; 1238 int err; 1239 1240 err = ip6_dst_lookup_tail(net, sk, &dst, fl6); 1241 if (err) 1242 return ERR_PTR(err); 1243 if (final_dst) 1244 fl6->daddr = *final_dst; 1245 1246 return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0); 1247 } 1248 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow); 1249 1250 /** 1251 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow 1252 * @sk: socket which provides the dst cache and route info 1253 * @fl6: flow to lookup 1254 * @final_dst: final destination address for ipsec lookup 1255 * @connected: whether @sk is connected or not 1256 * 1257 * This function performs a route lookup on the given flow with the 1258 * possibility of using the cached route in the socket if it is valid. 1259 * It will take the socket dst lock when operating on the dst cache. 1260 * As a result, this function can only be used in process context. 1261 * 1262 * In addition, for a connected socket, cache the dst in the socket 1263 * if the current cache is not valid. 1264 * 1265 * It returns a valid dst pointer on success, or a pointer encoded 1266 * error code. 
1267 */ 1268 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, 1269 const struct in6_addr *final_dst, 1270 bool connected) 1271 { 1272 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie); 1273 1274 dst = ip6_sk_dst_check(sk, dst, fl6); 1275 if (dst) 1276 return dst; 1277 1278 dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst); 1279 if (connected && !IS_ERR(dst)) 1280 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6); 1281 1282 return dst; 1283 } 1284 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow); 1285 1286 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src, 1287 gfp_t gfp) 1288 { 1289 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL; 1290 } 1291 1292 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src, 1293 gfp_t gfp) 1294 { 1295 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL; 1296 } 1297 1298 static void ip6_append_data_mtu(unsigned int *mtu, 1299 int *maxfraglen, 1300 unsigned int fragheaderlen, 1301 struct sk_buff *skb, 1302 struct rt6_info *rt, 1303 unsigned int orig_mtu) 1304 { 1305 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) { 1306 if (!skb) { 1307 /* first fragment, reserve header_len */ 1308 *mtu = orig_mtu - rt->dst.header_len; 1309 1310 } else { 1311 /* 1312 * this fragment is not first, the headers 1313 * space is regarded as data space. 1314 */ 1315 *mtu = orig_mtu; 1316 } 1317 *maxfraglen = ((*mtu - fragheaderlen) & ~7) 1318 + fragheaderlen - sizeof(struct frag_hdr); 1319 } 1320 } 1321 1322 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, 1323 struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6, 1324 struct rt6_info *rt) 1325 { 1326 struct ipv6_pinfo *np = inet6_sk(sk); 1327 unsigned int mtu, frag_size; 1328 struct ipv6_txoptions *nopt, *opt = ipc6->opt; 1329 1330 /* callers pass dst together with a reference, set it first so 1331 * ip6_cork_release() can put it down even in case of an error. 1332 */ 1333 cork->base.dst = &rt->dst; 1334 1335 /* 1336 * setup for corking 1337 */ 1338 if (opt) { 1339 if (WARN_ON(v6_cork->opt)) 1340 return -EINVAL; 1341 1342 nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation); 1343 if (unlikely(!nopt)) 1344 return -ENOBUFS; 1345 1346 nopt->tot_len = sizeof(*opt); 1347 nopt->opt_flen = opt->opt_flen; 1348 nopt->opt_nflen = opt->opt_nflen; 1349 1350 nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation); 1351 if (opt->dst0opt && !nopt->dst0opt) 1352 return -ENOBUFS; 1353 1354 nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation); 1355 if (opt->dst1opt && !nopt->dst1opt) 1356 return -ENOBUFS; 1357 1358 nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation); 1359 if (opt->hopopt && !nopt->hopopt) 1360 return -ENOBUFS; 1361 1362 nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation); 1363 if (opt->srcrt && !nopt->srcrt) 1364 return -ENOBUFS; 1365 1366 /* need source address above miyazawa*/ 1367 } 1368 v6_cork->hop_limit = ipc6->hlimit; 1369 v6_cork->tclass = ipc6->tclass; 1370 if (rt->dst.flags & DST_XFRM_TUNNEL) 1371 mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ? 1372 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst); 1373 else 1374 mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ? 
1375 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst)); 1376 1377 frag_size = READ_ONCE(np->frag_size); 1378 if (frag_size && frag_size < mtu) 1379 mtu = frag_size; 1380 1381 cork->base.fragsize = mtu; 1382 cork->base.gso_size = ipc6->gso_size; 1383 cork->base.tx_flags = 0; 1384 cork->base.mark = ipc6->sockc.mark; 1385 sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags); 1386 1387 if (dst_allfrag(xfrm_dst_path(&rt->dst))) 1388 cork->base.flags |= IPCORK_ALLFRAG; 1389 cork->base.length = 0; 1390 1391 cork->base.transmit_time = ipc6->sockc.transmit_time; 1392 1393 return 0; 1394 } 1395 1396 static int __ip6_append_data(struct sock *sk, 1397 struct sk_buff_head *queue, 1398 struct inet_cork_full *cork_full, 1399 struct inet6_cork *v6_cork, 1400 struct page_frag *pfrag, 1401 int getfrag(void *from, char *to, int offset, 1402 int len, int odd, struct sk_buff *skb), 1403 void *from, size_t length, int transhdrlen, 1404 unsigned int flags, struct ipcm6_cookie *ipc6) 1405 { 1406 struct sk_buff *skb, *skb_prev = NULL; 1407 struct inet_cork *cork = &cork_full->base; 1408 struct flowi6 *fl6 = &cork_full->fl.u.ip6; 1409 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu; 1410 struct ubuf_info *uarg = NULL; 1411 int exthdrlen = 0; 1412 int dst_exthdrlen = 0; 1413 int hh_len; 1414 int copy; 1415 int err; 1416 int offset = 0; 1417 bool zc = false; 1418 u32 tskey = 0; 1419 struct rt6_info *rt = (struct rt6_info *)cork->dst; 1420 struct ipv6_txoptions *opt = v6_cork->opt; 1421 int csummode = CHECKSUM_NONE; 1422 unsigned int maxnonfragsize, headersize; 1423 unsigned int wmem_alloc_delta = 0; 1424 bool paged, extra_uref = false; 1425 1426 skb = skb_peek_tail(queue); 1427 if (!skb) { 1428 exthdrlen = opt ? opt->opt_flen : 0; 1429 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len; 1430 } 1431 1432 paged = !!cork->gso_size; 1433 mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize; 1434 orig_mtu = mtu; 1435 1436 if (cork->tx_flags & SKBTX_ANY_TSTAMP && 1437 READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) 1438 tskey = atomic_inc_return(&sk->sk_tskey) - 1; 1439 1440 hh_len = LL_RESERVED_SPACE(rt->dst.dev); 1441 1442 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + 1443 (opt ? opt->opt_nflen : 0); 1444 1445 headersize = sizeof(struct ipv6hdr) + 1446 (opt ? opt->opt_flen + opt->opt_nflen : 0) + 1447 (dst_allfrag(&rt->dst) ? 
1448 sizeof(struct frag_hdr) : 0) + 1449 rt->rt6i_nfheader_len; 1450 1451 if (mtu <= fragheaderlen || 1452 ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr)) 1453 goto emsgsize; 1454 1455 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - 1456 sizeof(struct frag_hdr); 1457 1458 /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit 1459 * the first fragment 1460 */ 1461 if (headersize + transhdrlen > mtu) 1462 goto emsgsize; 1463 1464 if (cork->length + length > mtu - headersize && ipc6->dontfrag && 1465 (sk->sk_protocol == IPPROTO_UDP || 1466 sk->sk_protocol == IPPROTO_ICMPV6 || 1467 sk->sk_protocol == IPPROTO_RAW)) { 1468 ipv6_local_rxpmtu(sk, fl6, mtu - headersize + 1469 sizeof(struct ipv6hdr)); 1470 goto emsgsize; 1471 } 1472 1473 if (ip6_sk_ignore_df(sk)) 1474 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN; 1475 else 1476 maxnonfragsize = mtu; 1477 1478 if (cork->length + length > maxnonfragsize - headersize) { 1479 emsgsize: 1480 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0); 1481 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu); 1482 return -EMSGSIZE; 1483 } 1484 1485 /* CHECKSUM_PARTIAL only with no extension headers and when 1486 * we are not going to fragment 1487 */ 1488 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP && 1489 headersize == sizeof(struct ipv6hdr) && 1490 length <= mtu - headersize && 1491 (!(flags & MSG_MORE) || cork->gso_size) && 1492 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) 1493 csummode = CHECKSUM_PARTIAL; 1494 1495 if ((flags & MSG_ZEROCOPY) && length) { 1496 struct msghdr *msg = from; 1497 1498 if (getfrag == ip_generic_getfrag && msg->msg_ubuf) { 1499 if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb)) 1500 return -EINVAL; 1501 1502 /* Leave uarg NULL if can't zerocopy, callers should 1503 * be able to handle it. 1504 */ 1505 if ((rt->dst.dev->features & NETIF_F_SG) && 1506 csummode == CHECKSUM_PARTIAL) { 1507 paged = true; 1508 zc = true; 1509 uarg = msg->msg_ubuf; 1510 } 1511 } else if (sock_flag(sk, SOCK_ZEROCOPY)) { 1512 uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb)); 1513 if (!uarg) 1514 return -ENOBUFS; 1515 extra_uref = !skb_zcopy(skb); /* only ref on new uarg */ 1516 if (rt->dst.dev->features & NETIF_F_SG && 1517 csummode == CHECKSUM_PARTIAL) { 1518 paged = true; 1519 zc = true; 1520 } else { 1521 uarg_to_msgzc(uarg)->zerocopy = 0; 1522 skb_zcopy_set(skb, uarg, &extra_uref); 1523 } 1524 } 1525 } else if ((flags & MSG_SPLICE_PAGES) && length) { 1526 if (inet_test_bit(HDRINCL, sk)) 1527 return -EPERM; 1528 if (rt->dst.dev->features & NETIF_F_SG && 1529 getfrag == ip_generic_getfrag) 1530 /* We need an empty buffer to attach stuff to */ 1531 paged = true; 1532 else 1533 flags &= ~MSG_SPLICE_PAGES; 1534 } 1535 1536 /* 1537 * Let's try using as much space as possible. 1538 * Use MTU if total length of the message fits into the MTU. 1539 * Otherwise, we need to reserve fragment header and 1540 * fragment alignment (= 8-15 octects, in total). 1541 * 1542 * Note that we may need to "move" the data from the tail 1543 * of the buffer to the new fragment when we split 1544 * the message. 1545 * 1546 * FIXME: It may be fragmented into multiple chunks 1547 * at once if non-fragmentable extension headers 1548 * are too large. 1549 * --yoshfuji 1550 */ 1551 1552 cork->length += length; 1553 if (!skb) 1554 goto alloc_new_skb; 1555 1556 while (length > 0) { 1557 /* Check if the remaining data fits into current packet. 
		 */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen, alloc_extra;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			alloc_extra = hh_len;
			alloc_extra += dst_exthdrlen;
			alloc_extra += rt->dst.trailer_len;

			/* We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloc_extra += sizeof(struct frag_hdr);

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else if (!paged &&
				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
				  !(rt->dst.dev->features & NETIF_F_SG)))
				alloclen = fraglen;
			else {
				alloclen = fragheaderlen + transhdrlen;
				pagedlen = datalen - transhdrlen;
			}
			alloclen += alloc_extra;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			fraglen = datalen + fragheaderlen;

			copy = datalen - transhdrlen - fraggap - pagedlen;
			/* [!] NOTE: copy may be negative if pagedlen > 0,
			 * because then the equation reduces to -fraggap.
1630 */ 1631 if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) { 1632 err = -EINVAL; 1633 goto error; 1634 } 1635 if (transhdrlen) { 1636 skb = sock_alloc_send_skb(sk, alloclen, 1637 (flags & MSG_DONTWAIT), &err); 1638 } else { 1639 skb = NULL; 1640 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <= 1641 2 * sk->sk_sndbuf) 1642 skb = alloc_skb(alloclen, 1643 sk->sk_allocation); 1644 if (unlikely(!skb)) 1645 err = -ENOBUFS; 1646 } 1647 if (!skb) 1648 goto error; 1649 /* 1650 * Fill in the control structures 1651 */ 1652 skb->protocol = htons(ETH_P_IPV6); 1653 skb->ip_summed = csummode; 1654 skb->csum = 0; 1655 /* reserve for fragmentation and ipsec header */ 1656 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) + 1657 dst_exthdrlen); 1658 1659 /* 1660 * Find where to start putting bytes 1661 */ 1662 data = skb_put(skb, fraglen - pagedlen); 1663 skb_set_network_header(skb, exthdrlen); 1664 data += fragheaderlen; 1665 skb->transport_header = (skb->network_header + 1666 fragheaderlen); 1667 if (fraggap) { 1668 skb->csum = skb_copy_and_csum_bits( 1669 skb_prev, maxfraglen, 1670 data + transhdrlen, fraggap); 1671 skb_prev->csum = csum_sub(skb_prev->csum, 1672 skb->csum); 1673 data += fraggap; 1674 pskb_trim_unique(skb_prev, maxfraglen); 1675 } 1676 if (copy > 0 && 1677 getfrag(from, data + transhdrlen, offset, 1678 copy, fraggap, skb) < 0) { 1679 err = -EFAULT; 1680 kfree_skb(skb); 1681 goto error; 1682 } else if (flags & MSG_SPLICE_PAGES) { 1683 copy = 0; 1684 } 1685 1686 offset += copy; 1687 length -= copy + transhdrlen; 1688 transhdrlen = 0; 1689 exthdrlen = 0; 1690 dst_exthdrlen = 0; 1691 1692 /* Only the initial fragment is time stamped */ 1693 skb_shinfo(skb)->tx_flags = cork->tx_flags; 1694 cork->tx_flags = 0; 1695 skb_shinfo(skb)->tskey = tskey; 1696 tskey = 0; 1697 skb_zcopy_set(skb, uarg, &extra_uref); 1698 1699 if ((flags & MSG_CONFIRM) && !skb_prev) 1700 skb_set_dst_pending_confirm(skb, 1); 1701 1702 /* 1703 * Put the packet on the pending queue 1704 */ 1705 if (!skb->destructor) { 1706 skb->destructor = sock_wfree; 1707 skb->sk = sk; 1708 wmem_alloc_delta += skb->truesize; 1709 } 1710 __skb_queue_tail(queue, skb); 1711 continue; 1712 } 1713 1714 if (copy > length) 1715 copy = length; 1716 1717 if (!(rt->dst.dev->features&NETIF_F_SG) && 1718 skb_tailroom(skb) >= copy) { 1719 unsigned int off; 1720 1721 off = skb->len; 1722 if (getfrag(from, skb_put(skb, copy), 1723 offset, copy, off, skb) < 0) { 1724 __skb_trim(skb, off); 1725 err = -EFAULT; 1726 goto error; 1727 } 1728 } else if (flags & MSG_SPLICE_PAGES) { 1729 struct msghdr *msg = from; 1730 1731 err = -EIO; 1732 if (WARN_ON_ONCE(copy > msg->msg_iter.count)) 1733 goto error; 1734 1735 err = skb_splice_from_iter(skb, &msg->msg_iter, copy, 1736 sk->sk_allocation); 1737 if (err < 0) 1738 goto error; 1739 copy = err; 1740 wmem_alloc_delta += copy; 1741 } else if (!zc) { 1742 int i = skb_shinfo(skb)->nr_frags; 1743 1744 err = -ENOMEM; 1745 if (!sk_page_frag_refill(sk, pfrag)) 1746 goto error; 1747 1748 skb_zcopy_downgrade_managed(skb); 1749 if (!skb_can_coalesce(skb, i, pfrag->page, 1750 pfrag->offset)) { 1751 err = -EMSGSIZE; 1752 if (i == MAX_SKB_FRAGS) 1753 goto error; 1754 1755 __skb_fill_page_desc(skb, i, pfrag->page, 1756 pfrag->offset, 0); 1757 skb_shinfo(skb)->nr_frags = ++i; 1758 get_page(pfrag->page); 1759 } 1760 copy = min_t(int, copy, pfrag->size - pfrag->offset); 1761 if (getfrag(from, 1762 page_address(pfrag->page) + pfrag->offset, 1763 offset, copy, skb->len, skb) < 0) 1764 goto error_efault; 1765 1766 pfrag->offset 
+= copy; 1767 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); 1768 skb->len += copy; 1769 skb->data_len += copy; 1770 skb->truesize += copy; 1771 wmem_alloc_delta += copy; 1772 } else { 1773 err = skb_zerocopy_iter_dgram(skb, from, copy); 1774 if (err < 0) 1775 goto error; 1776 } 1777 offset += copy; 1778 length -= copy; 1779 } 1780 1781 if (wmem_alloc_delta) 1782 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); 1783 return 0; 1784 1785 error_efault: 1786 err = -EFAULT; 1787 error: 1788 net_zcopy_put_abort(uarg, extra_uref); 1789 cork->length -= length; 1790 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); 1791 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); 1792 return err; 1793 } 1794 1795 int ip6_append_data(struct sock *sk, 1796 int getfrag(void *from, char *to, int offset, int len, 1797 int odd, struct sk_buff *skb), 1798 void *from, size_t length, int transhdrlen, 1799 struct ipcm6_cookie *ipc6, struct flowi6 *fl6, 1800 struct rt6_info *rt, unsigned int flags) 1801 { 1802 struct inet_sock *inet = inet_sk(sk); 1803 struct ipv6_pinfo *np = inet6_sk(sk); 1804 int exthdrlen; 1805 int err; 1806 1807 if (flags&MSG_PROBE) 1808 return 0; 1809 if (skb_queue_empty(&sk->sk_write_queue)) { 1810 /* 1811 * setup for corking 1812 */ 1813 dst_hold(&rt->dst); 1814 err = ip6_setup_cork(sk, &inet->cork, &np->cork, 1815 ipc6, rt); 1816 if (err) 1817 return err; 1818 1819 inet->cork.fl.u.ip6 = *fl6; 1820 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0); 1821 length += exthdrlen; 1822 transhdrlen += exthdrlen; 1823 } else { 1824 transhdrlen = 0; 1825 } 1826 1827 return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork, 1828 &np->cork, sk_page_frag(sk), getfrag, 1829 from, length, transhdrlen, flags, ipc6); 1830 } 1831 EXPORT_SYMBOL_GPL(ip6_append_data); 1832 1833 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork) 1834 { 1835 struct dst_entry *dst = cork->base.dst; 1836 1837 cork->base.dst = NULL; 1838 cork->base.flags &= ~IPCORK_ALLFRAG; 1839 skb_dst_set(skb, dst); 1840 } 1841 1842 static void ip6_cork_release(struct inet_cork_full *cork, 1843 struct inet6_cork *v6_cork) 1844 { 1845 if (v6_cork->opt) { 1846 struct ipv6_txoptions *opt = v6_cork->opt; 1847 1848 kfree(opt->dst0opt); 1849 kfree(opt->dst1opt); 1850 kfree(opt->hopopt); 1851 kfree(opt->srcrt); 1852 kfree(opt); 1853 v6_cork->opt = NULL; 1854 } 1855 1856 if (cork->base.dst) { 1857 dst_release(cork->base.dst); 1858 cork->base.dst = NULL; 1859 cork->base.flags &= ~IPCORK_ALLFRAG; 1860 } 1861 } 1862 1863 struct sk_buff *__ip6_make_skb(struct sock *sk, 1864 struct sk_buff_head *queue, 1865 struct inet_cork_full *cork, 1866 struct inet6_cork *v6_cork) 1867 { 1868 struct sk_buff *skb, *tmp_skb; 1869 struct sk_buff **tail_skb; 1870 struct in6_addr *final_dst; 1871 struct net *net = sock_net(sk); 1872 struct ipv6hdr *hdr; 1873 struct ipv6_txoptions *opt = v6_cork->opt; 1874 struct rt6_info *rt = (struct rt6_info *)cork->base.dst; 1875 struct flowi6 *fl6 = &cork->fl.u.ip6; 1876 unsigned char proto = fl6->flowi6_proto; 1877 1878 skb = __skb_dequeue(queue); 1879 if (!skb) 1880 goto out; 1881 tail_skb = &(skb_shinfo(skb)->frag_list); 1882 1883 /* move skb->data to ip header from ext header */ 1884 if (skb->data < skb_network_header(skb)) 1885 __skb_pull(skb, skb_network_offset(skb)); 1886 while ((tmp_skb = __skb_dequeue(queue)) != NULL) { 1887 __skb_pull(tmp_skb, skb_network_header_len(skb)); 1888 *tail_skb = tmp_skb; 1889 tail_skb = &(tmp_skb->next); 1890 skb->len += tmp_skb->len; 1891 
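		/* Each skb dequeued from the cork queue is chained onto the
		 * head skb's frag_list above; data_len and truesize below
		 * must also be folded into the head so that length and
		 * memory accounting cover all of the corked data.
		 */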
skb->data_len += tmp_skb->len; 1892 skb->truesize += tmp_skb->truesize; 1893 tmp_skb->destructor = NULL; 1894 tmp_skb->sk = NULL; 1895 } 1896 1897 /* Allow local fragmentation. */ 1898 skb->ignore_df = ip6_sk_ignore_df(sk); 1899 __skb_pull(skb, skb_network_header_len(skb)); 1900 1901 final_dst = &fl6->daddr; 1902 if (opt && opt->opt_flen) 1903 ipv6_push_frag_opts(skb, opt, &proto); 1904 if (opt && opt->opt_nflen) 1905 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr); 1906 1907 skb_push(skb, sizeof(struct ipv6hdr)); 1908 skb_reset_network_header(skb); 1909 hdr = ipv6_hdr(skb); 1910 1911 ip6_flow_hdr(hdr, v6_cork->tclass, 1912 ip6_make_flowlabel(net, skb, fl6->flowlabel, 1913 ip6_autoflowlabel(net, sk), fl6)); 1914 hdr->hop_limit = v6_cork->hop_limit; 1915 hdr->nexthdr = proto; 1916 hdr->saddr = fl6->saddr; 1917 hdr->daddr = *final_dst; 1918 1919 skb->priority = READ_ONCE(sk->sk_priority); 1920 skb->mark = cork->base.mark; 1921 skb->tstamp = cork->base.transmit_time; 1922 1923 ip6_cork_steal_dst(skb, cork); 1924 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS); 1925 if (proto == IPPROTO_ICMPV6) { 1926 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); 1927 u8 icmp6_type; 1928 1929 if (sk->sk_socket->type == SOCK_RAW && 1930 !inet_test_bit(HDRINCL, sk)) 1931 icmp6_type = fl6->fl6_icmp_type; 1932 else 1933 icmp6_type = icmp6_hdr(skb)->icmp6_type; 1934 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type); 1935 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); 1936 } 1937 1938 ip6_cork_release(cork, v6_cork); 1939 out: 1940 return skb; 1941 } 1942 1943 int ip6_send_skb(struct sk_buff *skb) 1944 { 1945 struct net *net = sock_net(skb->sk); 1946 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); 1947 int err; 1948 1949 err = ip6_local_out(net, skb->sk, skb); 1950 if (err) { 1951 if (err > 0) 1952 err = net_xmit_errno(err); 1953 if (err) 1954 IP6_INC_STATS(net, rt->rt6i_idev, 1955 IPSTATS_MIB_OUTDISCARDS); 1956 } 1957 1958 return err; 1959 } 1960 1961 int ip6_push_pending_frames(struct sock *sk) 1962 { 1963 struct sk_buff *skb; 1964 1965 skb = ip6_finish_skb(sk); 1966 if (!skb) 1967 return 0; 1968 1969 return ip6_send_skb(skb); 1970 } 1971 EXPORT_SYMBOL_GPL(ip6_push_pending_frames); 1972 1973 static void __ip6_flush_pending_frames(struct sock *sk, 1974 struct sk_buff_head *queue, 1975 struct inet_cork_full *cork, 1976 struct inet6_cork *v6_cork) 1977 { 1978 struct sk_buff *skb; 1979 1980 while ((skb = __skb_dequeue_tail(queue)) != NULL) { 1981 if (skb_dst(skb)) 1982 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)), 1983 IPSTATS_MIB_OUTDISCARDS); 1984 kfree_skb(skb); 1985 } 1986 1987 ip6_cork_release(cork, v6_cork); 1988 } 1989 1990 void ip6_flush_pending_frames(struct sock *sk) 1991 { 1992 __ip6_flush_pending_frames(sk, &sk->sk_write_queue, 1993 &inet_sk(sk)->cork, &inet6_sk(sk)->cork); 1994 } 1995 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames); 1996 1997 struct sk_buff *ip6_make_skb(struct sock *sk, 1998 int getfrag(void *from, char *to, int offset, 1999 int len, int odd, struct sk_buff *skb), 2000 void *from, size_t length, int transhdrlen, 2001 struct ipcm6_cookie *ipc6, struct rt6_info *rt, 2002 unsigned int flags, struct inet_cork_full *cork) 2003 { 2004 struct inet6_cork v6_cork; 2005 struct sk_buff_head queue; 2006 int exthdrlen = (ipc6->opt ? 
				  ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE) {
		dst_release(&rt->dst);
		return NULL;
	}

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_test_bit(DONTFRAG, sk);

	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}
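/*
 * Usage sketch (illustrative only, not part of this file): a datagram
 * protocol such as UDPv6 typically drives the corking API above roughly
 * as follows; the local variable names here are hypothetical:
 *
 *	err = ip6_append_data(sk, getfrag, msg, len, transhdrlen,
 *			      &ipc6, &fl6, rt, msg->msg_flags);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!corked)
 *		err = ip6_push_pending_frames(sk);
 *
 * ip6_make_skb() is the corresponding single-shot path: it appends the
 * data onto a private queue and hands the finished skb back to the
 * caller, which then transmits it with ip6_send_skb().
 */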