// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/gso.h>
#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>

static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct inet6_dev *idev = ip6_dst_idev(dst);
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	const struct in6_addr *daddr, *nexthop;
	struct ipv6hdr *hdr;
	struct neighbour *neigh;
	int ret;

	/* Be paranoid, rather than too clever. */
	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
		skb = skb_expand_head(skb, hh_len);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			return -ENOMEM;
		}
	}

	hdr = ipv6_hdr(skb);
	daddr = &hdr->daddr;
	if (ipv6_addr_is_multicast(daddr)) {
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
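			   The clone is passed back through the POST_ROUTING
			   hook and dev_loopback_xmit() so that local sockets
			   which joined the group still receive a copy.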
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (hdr->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res != LWTUNNEL_XMIT_CONTINUE)
			return res;
	}

	IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);

	rcu_read_lock();
	nexthop = rt6_nexthop(dst_rt6_info(dst), daddr);
	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);

	if (unlikely(IS_ERR_OR_NULL(neigh))) {
		if (unlikely(!neigh))
			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
		if (IS_ERR(neigh)) {
			rcu_read_unlock();
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
			return -EINVAL;
		}
	}
	sock_confirm_neigh(skb, neigh);
	ret = neigh_output(neigh, skb, false);
	rcu_read_unlock();
	return ret;
}

static int
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
				    struct sk_buff *skb, unsigned int mtu)
{
	struct sk_buff *segs, *nskb;
	netdev_features_t features;
	int ret = 0;

	/* Please see corresponding comment in ip_finish_output_gso
	 * describing the cases where GSO segment length exceeds the
	 * egress MTU.
	 */
	features = netif_skb_features(skb);
	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
	if (IS_ERR_OR_NULL(segs)) {
		kfree_skb(skb);
		return -ENOMEM;
	}

	consume_skb(skb);

	skb_list_walk_safe(segs, segs, nskb) {
		int err;

		skb_mark_not_on_list(segs);
		/* Last GSO segment can be smaller than gso_size (and MTU).
		 * Adding a fragment header would produce an "atomic fragment",
		 * which is considered harmful (RFC-8021). Avoid that.
		 */
		err = segs->len > mtu ?
			ip6_fragment(net, sk, segs, ip6_finish_output2) :
			ip6_finish_output2(net, sk, segs);
		if (err && ret == 0)
			ret = err;
	}

	return ret;
}

static int ip6_finish_output_gso(struct net *net, struct sock *sk,
				 struct sk_buff *skb, unsigned int mtu)
{
	if (!(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
	    !skb_gso_validate_network_len(skb, mtu))
		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);

	return ip6_finish_output2(net, sk, skb);
}

static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	unsigned int mtu;

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IP6CB(skb)->flags |= IP6SKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	mtu = ip6_skb_dst_mtu(skb);
	if (skb_is_gso(skb))
		return ip6_finish_output_gso(net, sk, skb, mtu);

	if (skb->len > mtu ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);

	return ip6_finish_output2(net, sk, skb);
}

static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	switch (ret) {
	case NET_XMIT_SUCCESS:
	case NET_XMIT_CN:
		return __ip6_finish_output(net, sk, skb) ? : ret;
	default:
		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
		return ret;
	}
}

int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, indev, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
EXPORT_SYMBOL(ip6_output);

bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
{
	if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk))
		return ip6_default_np_autolabel(net);
	return inet6_test_bit(AUTOFLOWLABEL, sk);
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
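 *
 * If the packet (after header insertion) exceeds the path MTU and neither
 * ignore_df nor GSO applies, it is dropped, the error is reported through
 * ipv6_local_error() and -EMSGSIZE is returned.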
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct inet6_dev *idev = ip6_dst_idev(dst);
	struct hop_jumbo_hdr *hop_jumbo;
	int hoplen = sizeof(*hop_jumbo);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(head_room > skb_headroom(skb))) {
		skb = skb_expand_head(skb, head_room);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			return -ENOBUFS;
		}
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	if (unlikely(seg_len > IPV6_MAXPLEN)) {
		hop_jumbo = skb_push(skb, hoplen);

		hop_jumbo->nexthdr = proto;
		hop_jumbo->hdrlen = 0;
		hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
		hop_jumbo->tlv_len = 4;
		hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);

		proto = IPPROTO_HOPOPTS;
		seg_len = 0;
		IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 * Fill in the IPv6 header
	 */
	if (np)
		hlimit = READ_ONCE(np->hop_limit);
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
						     ip6_autoflowlabel(net, sk), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dev,
			       dst_output);
	}

	skb->dev = dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;

		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {

			if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);

				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For a unicast neighbour discovery message destined
			 * to the proxied address, pass it to the input
			 * function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
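	 * Returning -1 here makes ip6_forward() count the packet as an
	 * input discard and drop it.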
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
#ifdef CONFIG_NET_SWITCHDEV
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	skb_clear_tstamp(skb);
	return dst_output(net, sk, skb);
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}

int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	struct inet6_dev *idev;
	SKB_DR(reason);
	u32 mtu;

	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	if (READ_ONCE(net->ipv6.devconf_all->forwarding) == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!READ_ONCE(net->ipv6.devconf_all->disable_policy) &&
	    (!idev || !READ_ONCE(idev->cnf.disable_policy)) &&
	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that the application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not an end-node, so if the packet contains
	 *	AH/ESP, we cannot do anything with it.
	 *	Defragmentation also would be a mistake; RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);

		if (proxied > 0) {
			/* It's tempting to decrease the hop limit
			 * here by 1, as we do at the end of the
			 * function too.
			 *
			 * But that would be incorrect, as proxying is
			 * not forwarding.  The ip6_input function
			 * will handle this packet locally, and it
			 * depends on the hop limit being unchanged.
			 *
			 * One example is the NDP hop limit, that
			 * always has to stay 255, but other would be
			 * similar checks around RA packets, where the
			 * user can even change the desired limit.
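			 * Hence the packet is handed to ip6_input() with the
			 * hop limit left untouched.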
			 */
			return ip6_input(skb);
		} else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		SKB_DR_SET(reason, XFRM_POLICY);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = dst_rt6_info(dst);
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);

	mtu = ip6_dst_mtu_maybe_forward(dst, true);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
	SKB_DR_SET(reason, IP_INADDRERRORS);
drop:
	kfree_skb_reason(skb, reason);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_ext_copy(to, from);
	skb_copy_secmark(to, from);
}

int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
		      u8 nexthdr, __be32 frag_id,
		      struct ip6_fraglist_iter *iter)
{
	unsigned int first_len;
	struct frag_hdr *fh;

	/* BUILD HEADER */
	*prevhdr = NEXTHDR_FRAGMENT;
	iter->tmp_hdr =
		kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
	if (!iter->tmp_hdr)
		return -ENOMEM;

	iter->frag = skb_shinfo(skb)->frag_list;
	skb_frag_list_init(skb);

	iter->offset = 0;
	iter->hlen = hlen;
	iter->frag_id = frag_id;
	iter->nexthdr = nexthdr;

	__skb_pull(skb, hlen);
	fh = __skb_push(skb, sizeof(struct frag_hdr));
	__skb_push(skb, hlen);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

	fh->nexthdr = nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(IP6_MF);
	fh->identification = frag_id;

	first_len = skb_pagelen(skb);
	skb->data_len = first_len - skb_headlen(skb);
	skb->len = first_len;
	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

	return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);

void ip6_fraglist_prepare(struct sk_buff *skb,
			  struct ip6_fraglist_iter *iter)
{
	struct sk_buff *frag = iter->frag;
	unsigned int hlen = iter->hlen;
	struct frag_hdr *fh;

	frag->ip_summed = CHECKSUM_NONE;
	skb_reset_transport_header(frag);
	fh = __skb_push(frag, sizeof(struct frag_hdr));
	__skb_push(frag, hlen);
	skb_reset_network_header(frag);
	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
	fh->nexthdr = iter->nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(iter->offset);
	if (frag->next)
		fh->frag_off |= htons(IP6_MF);
	fh->identification = iter->frag_id;
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
	ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);

void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
{
	state->prevhdr = prevhdr;
	state->nexthdr = nexthdr;
	state->frag_id = frag_id;

	state->hlen = hlen;
	state->mtu = mtu;

	state->left = skb->len - hlen;	/* Space per frame */
	state->ptr = hlen;		/* Where to start from */

	state->hroom = hdr_room;
	state->troom = needed_tailroom;

	state->offset = 0;
}
EXPORT_SYMBOL(ip6_frag_init);

struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
	struct sk_buff *frag;
	struct frag_hdr *fh;
	unsigned int len;

	len = state->left;
	/* IF: it doesn't fit, use 'mtu' - the data space left */
	if (len > state->mtu)
		len = state->mtu;
	/* IF: we are not sending up to and including the packet end
	   then align the next start on an eight byte boundary */
	if (len < state->left)
		len &= ~7;

	/* Allocate buffer */
	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
			 state->hroom + state->troom, GFP_ATOMIC);
	if (!frag)
		return ERR_PTR(-ENOMEM);

	/*
	 * Set up data on packet
	 */

	ip6_copy_metadata(frag, skb);
	skb_reserve(frag, state->hroom);
	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
	skb_reset_network_header(frag);
	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
	frag->transport_header = (frag->network_header + state->hlen +
				  sizeof(struct frag_hdr));

	/*
	 * Charge the memory for the fragment to any owner
	 * it might possess
	 */
	if (skb->sk)
		skb_set_owner_w(frag, skb->sk);

	/*
	 * Copy the packet header into the new buffer.
	 */
	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

	fragnexthdr_offset = skb_network_header(frag);
	fragnexthdr_offset += prevhdr - skb_network_header(skb);
	*fragnexthdr_offset = NEXTHDR_FRAGMENT;

	/*
	 * Build fragment header.
	 */
	fh->nexthdr = state->nexthdr;
	fh->reserved = 0;
	fh->identification = state->frag_id;

	/*
	 * Copy a block of the IP datagram.
	 */
	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
			     len));
	state->left -= len;

	fh->frag_off = htons(state->offset);
	if (state->left > 0)
		fh->frag_off |= htons(IP6_MF);
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

	state->ptr += len;
	state->offset += len;

	return frag;
}
EXPORT_SYMBOL(ip6_frag_next);

int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	u8 tstamp_type = skb->tstamp_type;
	struct ip6_frag_state state;
	unsigned int mtu, hlen, nexthdr_offset;
	ktime_t tstamp = skb->tstamp;
	int hroom, err = 0;
	__be32 frag_id;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np) {
		u32 frag_size = READ_ONCE(np->frag_size);

		if (frag_size && frag_size < mtu)
			mtu = frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct ip6_fraglist_iter iter;
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb?
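			 * A shared fragment cannot be modified in place, so
			 * fall back to the copying slow path.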
			 */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
					&iter);
		if (err < 0)
			goto fail;

		/* We prevent @rt from being freed. */
		rcu_read_lock();

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (iter.frag)
				ip6_fraglist_prepare(skb, &iter);

			skb_set_delivery_time(skb, tstamp, tstamp_type);
			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !iter.frag)
				break;

			skb = ip6_fraglist_next(&iter);
		}

		kfree(iter.tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			rcu_read_unlock();
			return 0;
		}

		kfree_skb_list(iter.frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		rcu_read_unlock();
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	/*
	 *	Fragment the datagram.
	 */

	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
		      &state);

	/*
	 *	Keep copying data until we run out.
	 */

	while (state.left > 0) {
		frag = ip6_frag_next(skb, &state);
		if (IS_ERR(frag)) {
			err = PTR_ERR(frag);
			goto fail;
		}

		/*
		 *	Put this fragment into the sending queue.
		 */
		skb_set_delivery_time(frag, tstamp, tstamp_type);
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = dst_rt6_info(dst);
	/* Yes, checking route validity in the not-connected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save the whole address now,
	 *    (because the main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr)) {
		struct fib6_info *from;
		struct rt6_info *rt;

		*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : dst_rt6_info(*dst);

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
					  fl6->flowi6_l3mdev,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if ((*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = dst_rt6_info(*dst);
	rcu_read_lock();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ?
	      -EINVAL : 0;
	rcu_read_unlock();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 * ip6_dst_lookup - perform route lookup on flow
 * @net: Network namespace to perform lookup in
 * @sk: socket which provides route info
 * @dst: pointer to dst_entry * for result
 * @fl6: flow to lookup
 *
 * This function performs a route lookup on the given flow.
 *
 * It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 * @net: Network namespace to perform lookup in
 * @sk: socket which provides route info
 * @fl6: flow to lookup
 * @final_dst: final destination address for ipsec lookup
 *
 * This function performs a route lookup on the given flow.
 *
 * It returns a valid dst pointer on success, or a pointer encoded
 * error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 * @sk: socket which provides the dst cache and route info
 * @fl6: flow to lookup
 * @final_dst: final destination address for ipsec lookup
 * @connected: whether @sk is connected or not
 *
 * This function performs a route lookup on the given flow with the
 * possibility of using the cached route in the socket if it is valid.
 * It will take the socket dst lock when operating on the dst cache.
 * As a result, this function can only be used in process context.
 *
 * In addition, for a connected socket, cache the dst in the socket
 * if the current cache is not valid.
 *
 * It returns a valid dst pointer on success, or a pointer encoded
 * error code.
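 *
 * The caller owns the returned dst reference and must drop it with
 * dst_release() when it is done with it.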
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu, frag_size;
	struct ipv6_txoptions *nopt, *opt = ipc6->opt;

	/* callers pass dst together with a reference, set it first so
	 * ip6_cork_release() can put it down even in case of an error.
	 */
	cork->base.dst = &rt->dst;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!nopt))
			return -ENOBUFS;

		nopt->tot_len = sizeof(*opt);
		nopt->opt_flen = opt->opt_flen;
		nopt->opt_nflen = opt->opt_nflen;

		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
		if (opt->dst0opt && !nopt->dst0opt)
			return -ENOBUFS;

		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
		if (opt->dst1opt && !nopt->dst1opt)
			return -ENOBUFS;

		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
		if (opt->hopopt && !nopt->hopopt)
			return -ENOBUFS;

		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
		if (opt->srcrt && !nopt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa*/
	}
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));

	frag_size = READ_ONCE(np->frag_size);
	if (frag_size && frag_size < mtu)
		mtu = frag_size;

	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	cork->base.mark = ipc6->sockc.mark;
	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);

	cork->base.length = 0;
	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}

static int __ip6_append_data(struct sock *sk,
			     struct sk_buff_head *queue,
			     struct inet_cork_full *cork_full,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6)
{
	struct sk_buff *skb, *skb_prev = NULL;
	struct inet_cork *cork = &cork_full->base;
	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	bool zc = false;
	u32 tskey = 0;
	struct rt6_info *rt = dst_rt6_info(cork->dst);
	bool paged, hold_tskey, extra_uref = false;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ?
		      opt->opt_flen + opt->opt_nflen : 0) +
		     rt->rt6i_nfheader_len;

	if (mtu <= fragheaderlen ||
	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
		goto emsgsize;

	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_ICMPV6 ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				  sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if ((flags & MSG_ZEROCOPY) && length) {
		struct msghdr *msg = from;

		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
				return -EINVAL;

			/* Leave uarg NULL if can't zerocopy, callers should
			 * be able to handle it.
			 */
			if ((rt->dst.dev->features & NETIF_F_SG) &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
				uarg = msg->msg_ubuf;
			}
		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
			if (!uarg)
				return -ENOBUFS;
			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
			if (rt->dst.dev->features & NETIF_F_SG &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
			} else {
				uarg_to_msgzc(uarg)->zerocopy = 0;
				skb_zcopy_set(skb, uarg, &extra_uref);
			}
		}
	} else if ((flags & MSG_SPLICE_PAGES) && length) {
		if (inet_test_bit(HDRINCL, sk))
			return -EPERM;
		if (rt->dst.dev->features & NETIF_F_SG &&
		    getfrag == ip_generic_getfrag)
			/* We need an empty buffer to attach stuff to */
			paged = true;
		else
			flags &= ~MSG_SPLICE_PAGES;
	}

	hold_tskey = cork->tx_flags & SKBTX_ANY_TSTAMP &&
		     READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID;
	if (hold_tskey)
		tskey = atomic_inc_return(&sk->sk_tskey) - 1;

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen, alloc_extra;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			alloc_extra = hh_len;
			alloc_extra += dst_exthdrlen;
			alloc_extra += rt->dst.trailer_len;

			/* We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloc_extra += sizeof(struct frag_hdr);

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features & NETIF_F_SG))
				alloclen = mtu;
			else if (!paged &&
				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
				  !(rt->dst.dev->features & NETIF_F_SG)))
				alloclen = fraglen;
			else {
				alloclen = fragheaderlen + transhdrlen;
				pagedlen = datalen - transhdrlen;
			}
			alloclen += alloc_extra;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			fraglen = datalen + fragheaderlen;

			copy = datalen - transhdrlen - fraggap - pagedlen;
			/* [!] NOTE: copy may be negative if pagedlen>0
			 * because then the equation may reduce to -fraggap.
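			 * A negative copy is only tolerated for
			 * MSG_SPLICE_PAGES, where the payload is later
			 * attached as page fragments instead of being copied.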
			 */
			if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk, alloclen,
							  (flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 * Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			} else if (flags & MSG_SPLICE_PAGES) {
				copy = 0;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features & NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (flags & MSG_SPLICE_PAGES) {
			struct msghdr *msg = from;

			err = -EIO;
			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
				goto error;

			err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
						   sk->sk_allocation);
			if (err < 0)
				goto error;
			copy = err;
			wmem_alloc_delta += copy;
		} else if (!zc) {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			skb_zcopy_downgrade_managed(skb);
			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset
				+= copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	net_zcopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	if (hold_tskey)
		atomic_dec(&sk->sk_tskey);
	return err;
}

int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, size_t length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags & MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		dst_hold(&rt->dst);
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt);
		if (err)
			return err;

		inet->cork.fl.u.ip6 = *fl6;
		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
{
	struct dst_entry *dst = cork->base.dst;

	cork->base.dst = NULL;
	skb_dst_set(skb, dst);
}

static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		struct ipv6_txoptions *opt = v6_cork->opt;

		kfree(opt->dst0opt);
		kfree(opt->dst1opt);
		kfree(opt->hopopt);
		kfree(opt->srcrt);
		kfree(opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
	}
}

struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr *final_dst;
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = dst_rt6_info(cork->base.dst);
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
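		/* The queued skb's memory charge is absorbed into the head
		 * skb's truesize; its own destructor and socket reference
		 * are dropped below.
		 */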
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);
	__skb_pull(skb, skb_network_header_len(skb));

	final_dst = &fl6->daddr;
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, sk), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = READ_ONCE(sk->sk_priority);
	skb->mark = cork->base.mark;
	if (sk_is_tcp(sk))
		skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC);
	else
		skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid);

	ip6_cork_steal_dst(skb, cork);
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
		u8 icmp6_type;

		if (sk->sk_socket->type == SOCK_RAW &&
		    !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH))
			icmp6_type = fl6->fl6_icmp_type;
		else
			icmp6_type = icmp6_hdr(skb)->icmp6_type;
		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
			     unsigned int flags, struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ?
			 ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE) {
		dst_release(&rt->dst);
		return NULL;
	}

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_test_bit(DONTFRAG, sk);

	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}
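/*
 * Illustrative note, not an API guarantee: a datagram protocol such as
 * UDPv6 typically either corks data with ip6_append_data() and flushes it
 * with ip6_push_pending_frames(), or builds a single skb with
 * ip6_make_skb() and hands it to ip6_send_skb().
 */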