// SPDX-License-Identifier: GPL-2.0-or-later
/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/gso.h>
#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>

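/* Final step of the IPv6 transmit path: loop back multicast copies when
 * required, optionally hand the packet to a lightweight tunnel, then
 * resolve (or create) the neighbour entry for the next hop and queue the
 * packet to the device via neigh_output().
 */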
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst_dev(dst);
	struct inet6_dev *idev = ip6_dst_idev(dst);
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	const struct in6_addr *daddr, *nexthop;
	struct ipv6hdr *hdr;
	struct neighbour *neigh;
	int ret;

	/* Be paranoid, rather than too clever. */
	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
		/* Make sure idev stays alive */
		rcu_read_lock();
		skb = skb_expand_head(skb, hh_len);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			rcu_read_unlock();
			return -ENOMEM;
		}
		rcu_read_unlock();
	}

	hdr = ipv6_hdr(skb);
	daddr = &hdr->daddr;
	if (ipv6_addr_is_multicast(daddr)) {
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (hdr->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res != LWTUNNEL_XMIT_CONTINUE)
			return res;
	}

	IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);

	rcu_read_lock();
	nexthop = rt6_nexthop(dst_rt6_info(dst), daddr);
	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);

	if (IS_ERR_OR_NULL(neigh)) {
		if (unlikely(!neigh))
			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
		if (IS_ERR(neigh)) {
			rcu_read_unlock();
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
			return -EINVAL;
		}
	}
	sock_confirm_neigh(skb, neigh);
	ret = neigh_output(neigh, skb, false);
	rcu_read_unlock();
	return ret;
}

static int
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
				    struct sk_buff *skb, unsigned int mtu)
{
	struct sk_buff *segs, *nskb;
	netdev_features_t features;
	int ret = 0;

	/* Please see corresponding comment in ip_finish_output_gso
	 * describing the cases where GSO segment length exceeds the
	 * egress MTU.
	 */
	features = netif_skb_features(skb);
	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
	if (IS_ERR_OR_NULL(segs)) {
		kfree_skb(skb);
		return -ENOMEM;
	}

	consume_skb(skb);

	skb_list_walk_safe(segs, segs, nskb) {
		int err;

		skb_mark_not_on_list(segs);
		/* Last GSO segment can be smaller than gso_size (and MTU).
		 * Adding a fragment header would produce an "atomic fragment",
		 * which is considered harmful (RFC-8021). Avoid that.
		 */
		err = segs->len > mtu ?
			ip6_fragment(net, sk, segs, ip6_finish_output2) :
			ip6_finish_output2(net, sk, segs);
		if (err && ret == 0)
			ret = err;
	}

	return ret;
}

static int ip6_finish_output_gso(struct net *net, struct sock *sk,
				 struct sk_buff *skb, unsigned int mtu)
{
	if (!(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
	    !skb_gso_validate_network_len(skb, mtu))
		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);

	return ip6_finish_output2(net, sk, skb);
}

static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	unsigned int mtu;

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IP6CB(skb)->flags |= IP6SKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	mtu = ip6_skb_dst_mtu(skb);
	if (skb_is_gso(skb))
		return ip6_finish_output_gso(net, sk, skb, mtu);

	if (skb->len > mtu ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);

	return ip6_finish_output2(net, sk, skb);
}

static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	switch (ret) {
	case NET_XMIT_SUCCESS:
	case NET_XMIT_CN:
		return __ip6_finish_output(net, sk, skb) ? : ret;
	default:
		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
		return ret;
	}
}

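/* dst_output() entry point for IPv6: runs the NF_INET_POST_ROUTING hook
 * (skipped when the packet was already rerouted, IP6SKB_REROUTED) and then
 * completes transmission through ip6_finish_output().
 */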
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst_dev(dst), *indev = skb->dev;
	struct inet6_dev *idev = ip6_dst_idev(dst);

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, indev, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}
EXPORT_SYMBOL(ip6_output);

bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
{
	if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk))
		return ip6_default_np_autolabel(net);
	return inet6_test_bit(AUTOFLOWLABEL, sk);
}

/*
 * xmit an sk_buff (used by TCP and SCTP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst_dev(dst);
	struct inet6_dev *idev = ip6_dst_idev(dst);
	struct hop_jumbo_hdr *hop_jumbo;
	int hoplen = sizeof(*hop_jumbo);
	unsigned int head_room;
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(head_room > skb_headroom(skb))) {
		/* Make sure idev stays alive */
		rcu_read_lock();
		skb = skb_expand_head(skb, head_room);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			rcu_read_unlock();
			return -ENOBUFS;
		}
		rcu_read_unlock();
	}

	if (opt) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);

		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	if (unlikely(seg_len > IPV6_MAXPLEN)) {
		hop_jumbo = skb_push(skb, hoplen);

		hop_jumbo->nexthdr = proto;
		hop_jumbo->hdrlen = 0;
		hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
		hop_jumbo->tlv_len = 4;
		hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);

		proto = IPPROTO_HOPOPTS;
		seg_len = 0;
		IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = READ_ONCE(np->hop_limit);
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, sk), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dev,
			       dst_output);
	}

	skb->dev = dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

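/* Deliver a packet carrying a Router Alert option to every raw socket that
 * registered the matching alert value via IPV6_ROUTER_ALERT; the last
 * matching socket consumes the original skb, earlier ones receive clones.
 */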
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {

			if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
#ifdef CONFIG_NET_SWITCHDEV
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	skb_clear_tstamp(skb);
	return dst_output(net, sk, skb);
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}

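/* Forwarding path: validate the packet (hop limit, scope, policy), handle
 * router-alert and NDP-proxy special cases, send redirects or "packet too
 * big" errors where required, decrement the hop limit and pass the packet
 * through the NF_INET_FORWARD hook to ip6_forward_finish().
 */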
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst_dev(dst));
	struct net_device *dev;
	struct inet6_dev *idev;
	SKB_DR(reason);
	u32 mtu;

	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	if (READ_ONCE(net->ipv6.devconf_all->forwarding) == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!READ_ONCE(net->ipv6.devconf_all->disable_policy) &&
	    (!idev || !READ_ONCE(idev->cnf.disable_policy)) &&
	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any warranty that the application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not an end node, so that if the packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be a mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0) {
			/* It's tempting to decrease the hop limit
			 * here by 1, as we do at the end of the
			 * function too.
			 *
			 * But that would be incorrect, as proxying is
			 * not forwarding.  The ip6_input function
			 * will handle this packet locally, and it
			 * depends on the hop limit being unchanged.
			 *
			 * One example is the NDP hop limit, that
			 * always has to stay 255, but others would be
			 * similar checks around RA packets, where the
			 * user can even change the desired limit.
			 */
			return ip6_input(skb);
		} else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		SKB_DR_SET(reason, XFRM_POLICY);
		goto drop;
	}
	dst = skb_dst(skb);
	dev = dst_dev(dst);
	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = dst_rt6_info(dst);
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		rcu_read_lock();
		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		rcu_read_unlock();
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);

	mtu = ip6_dst_mtu_maybe_forward(dst, true);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
	SKB_DR_SET(reason, IP_INADDRERRORS);
drop:
	kfree_skb_reason(skb, reason);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_ext_copy(to, from);
	skb_copy_secmark(to, from);
}

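/* Fast-path fragmentation helpers: when the packet already carries a
 * suitably sized frag_list, ip6_fraglist_init()/ip6_fraglist_prepare()
 * reuse those buffers instead of copying the payload into new skbs.
 */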
int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
		      u8 nexthdr, __be32 frag_id,
		      struct ip6_fraglist_iter *iter)
{
	unsigned int first_len;
	struct frag_hdr *fh;

	/* BUILD HEADER */
	*prevhdr = NEXTHDR_FRAGMENT;
	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
	if (!iter->tmp_hdr)
		return -ENOMEM;

	iter->frag = skb_shinfo(skb)->frag_list;
	skb_frag_list_init(skb);

	iter->offset = 0;
	iter->hlen = hlen;
	iter->frag_id = frag_id;
	iter->nexthdr = nexthdr;

	__skb_pull(skb, hlen);
	fh = __skb_push(skb, sizeof(struct frag_hdr));
	__skb_push(skb, hlen);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

	fh->nexthdr = nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(IP6_MF);
	fh->identification = frag_id;

	first_len = skb_pagelen(skb);
	skb->data_len = first_len - skb_headlen(skb);
	skb->len = first_len;
	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

	return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);

void ip6_fraglist_prepare(struct sk_buff *skb,
			  struct ip6_fraglist_iter *iter)
{
	struct sk_buff *frag = iter->frag;
	unsigned int hlen = iter->hlen;
	struct frag_hdr *fh;

	frag->ip_summed = CHECKSUM_NONE;
	skb_reset_transport_header(frag);
	fh = __skb_push(frag, sizeof(struct frag_hdr));
	__skb_push(frag, hlen);
	skb_reset_network_header(frag);
	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
	fh->nexthdr = iter->nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(iter->offset);
	if (frag->next)
		fh->frag_off |= htons(IP6_MF);
	fh->identification = iter->frag_id;
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
	ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);

void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
{
	state->prevhdr = prevhdr;
	state->nexthdr = nexthdr;
	state->frag_id = frag_id;

	state->hlen = hlen;
	state->mtu = mtu;

	state->left = skb->len - hlen;	/* Space per frame */
	state->ptr = hlen;		/* Where to start from */

	state->hroom = hdr_room;
	state->troom = needed_tailroom;

	state->offset = 0;
}
EXPORT_SYMBOL(ip6_frag_init);

struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
	struct sk_buff *frag;
	struct frag_hdr *fh;
	unsigned int len;

	len = state->left;
	/* IF: it doesn't fit, use 'mtu' - the data space left */
	if (len > state->mtu)
		len = state->mtu;
	/* IF: we are not sending up to and including the packet end
	   then align the next start on an eight byte boundary */
	if (len < state->left)
		len &= ~7;

	/* Allocate buffer */
	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
			 state->hroom + state->troom, GFP_ATOMIC);
	if (!frag)
		return ERR_PTR(-ENOMEM);

	/*
	 *	Set up data on packet
	 */

	ip6_copy_metadata(frag, skb);
	skb_reserve(frag, state->hroom);
	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
	skb_reset_network_header(frag);
	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
	frag->transport_header = (frag->network_header + state->hlen +
				  sizeof(struct frag_hdr));

	/*
	 *	Charge the memory for the fragment to any owner
	 *	it might possess
	 */
	if (skb->sk)
		skb_set_owner_w(frag, skb->sk);

	/*
	 *	Copy the packet header into the new buffer.
	 */
	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

	fragnexthdr_offset = skb_network_header(frag);
	fragnexthdr_offset += prevhdr - skb_network_header(skb);
	*fragnexthdr_offset = NEXTHDR_FRAGMENT;

	/*
	 *	Build fragment header.
	 */
	fh->nexthdr = state->nexthdr;
	fh->reserved = 0;
	fh->identification = state->frag_id;

	/*
	 *	Copy a block of the IP datagram.
	 */
	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
			     len));
	state->left -= len;

	fh->frag_off = htons(state->offset);
	if (state->left > 0)
		fh->frag_off |= htons(IP6_MF);
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

	state->ptr += len;
	state->offset += len;

	return frag;
}
EXPORT_SYMBOL(ip6_frag_next);

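/* Fragment a packet to the path MTU and feed each fragment to @output.
 * The frag_list fast path is used when possible; otherwise the payload is
 * copied into freshly allocated fragments via ip6_frag_next().
 */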
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	u8 tstamp_type = skb->tstamp_type;
	struct ip6_frag_state state;
	unsigned int mtu, hlen, nexthdr_offset;
	ktime_t tstamp = skb->tstamp;
	int hroom, err = 0;
	__be32 frag_id;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np) {
		u32 frag_size = READ_ONCE(np->frag_size);

		if (frag_size && frag_size < mtu)
			mtu = frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct ip6_fraglist_iter iter;
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
					&iter);
		if (err < 0)
			goto fail;

		/* We prevent @rt from being freed. */
		rcu_read_lock();

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (iter.frag)
				ip6_fraglist_prepare(skb, &iter);

			skb_set_delivery_time(skb, tstamp, tstamp_type);
			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !iter.frag)
				break;

			skb = ip6_fraglist_next(&iter);
		}

		kfree(iter.tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			rcu_read_unlock();
			return 0;
		}

		kfree_skb_list(iter.frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		rcu_read_unlock();
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	/*
	 *	Fragment the datagram.
	 */

	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
		      &state);

	/*
	 *	Keep copying data until we run out.
	 */

	while (state.left > 0) {
		frag = ip6_frag_next(skb, &state);
		if (IS_ERR(frag)) {
			err = PTR_ERR(frag);
			goto fail;
		}

		/*
		 *	Put this fragment into the sending queue.
		 */
		skb_set_delivery_time(frag, tstamp, tstamp_type);
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = dst_rt6_info(dst);
	/* Yes, checking route validity in the not connected
	 * case is not very simple. Take into account
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

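/* Common route lookup helper: resolves the destination, selects a source
 * address when the flow still carries the unspecified address, and
 * optionally (CONFIG_IPV6_OPTIMISTIC_DAD) re-routes through the default
 * router while the chosen source address is still optimistic.
 */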
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr)) {
		struct fib6_info *from;
		struct rt6_info *rt;

		*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : dst_rt6_info(*dst);

		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
					  fl6->flowi6_l3mdev,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if ((*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = dst_rt6_info(*dst);
	rcu_read_lock();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 * ip6_dst_lookup - perform route lookup on flow
 * @net: Network namespace to perform lookup in
 * @sk: socket which provides route info
 * @dst: pointer to dst_entry * for result
 * @fl6: flow to lookup
 *
 * This function performs a route lookup on the given flow.
 *
 * It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 * @net: Network namespace to perform lookup in
 * @sk: socket which provides route info
 * @fl6: flow to lookup
 * @final_dst: final destination address for ipsec lookup
 *
 * This function performs a route lookup on the given flow.
 *
 * It returns a valid dst pointer on success, or a pointer encoded
 * error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 * @sk: socket which provides the dst cache and route info
 * @fl6: flow to lookup
 * @final_dst: final destination address for ipsec lookup
 * @connected: whether @sk is connected or not
 *
 * This function performs a route lookup on the given flow with the
 * possibility of using the cached route in the socket if it is valid.
 * It will take the socket dst lock when operating on the dst cache.
 * As a result, this function can only be used in process context.
 *
 * In addition, for a connected socket, cache the dst in the socket
 * if the current cache is not valid.
 *
 * It returns a valid dst pointer on success, or a pointer encoded
 * error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu, frag_size;
	struct ipv6_txoptions *nopt, *opt = ipc6->opt;

	/* callers pass dst together with a reference, set it first so
	 * ip6_cork_release() can put it down even in case of an error.
	 */
	cork->base.dst = &rt->dst;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!nopt))
			return -ENOBUFS;

		nopt->tot_len = sizeof(*opt);
		nopt->opt_flen = opt->opt_flen;
		nopt->opt_nflen = opt->opt_nflen;

		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
		if (opt->dst0opt && !nopt->dst0opt)
			return -ENOBUFS;

		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
		if (opt->dst1opt && !nopt->dst1opt)
			return -ENOBUFS;

		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
		if (opt->hopopt && !nopt->hopopt)
			return -ENOBUFS;

		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
		if (opt->srcrt && !nopt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa*/
	}
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	v6_cork->dontfrag = ipc6->dontfrag;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));

	frag_size = READ_ONCE(np->frag_size);
	if (frag_size && frag_size < mtu)
		mtu = frag_size;

	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	cork->base.mark = ipc6->sockc.mark;
	cork->base.priority = ipc6->sockc.priority;
	sock_tx_timestamp(sk, &ipc6->sockc, &cork->base.tx_flags);
	if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) {
		cork->base.flags |= IPCORK_TS_OPT_ID;
		cork->base.ts_opt_id = ipc6->sockc.ts_opt_id;
	}
	cork->base.length = 0;
	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}

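/* Core of ip6_append_data()/ip6_make_skb(): append @length bytes obtained
 * through @getfrag to the cork queue, growing the last skb or allocating
 * new ones so that each queued skb fits the (fragment-aligned) MTU, with
 * optional zerocopy and MSG_SPLICE_PAGES handling.
 */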
static int __ip6_append_data(struct sock *sk,
			     struct sk_buff_head *queue,
			     struct inet_cork_full *cork_full,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     unsigned int flags)
{
	struct sk_buff *skb, *skb_prev = NULL;
	struct inet_cork *cork = &cork_full->base;
	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	bool zc = false;
	u32 tskey = 0;
	struct rt6_info *rt = dst_rt6_info(cork->dst);
	bool paged, hold_tskey = false, extra_uref = false;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     rt->rt6i_nfheader_len;

	if (mtu <= fragheaderlen ||
	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
		goto emsgsize;

	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && v6_cork->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_ICMPV6 ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				  sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if ((flags & MSG_ZEROCOPY) && length) {
		struct msghdr *msg = from;

		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
				return -EINVAL;

			/* Leave uarg NULL if can't zerocopy, callers should
			 * be able to handle it.
			 */
			if ((rt->dst.dev->features & NETIF_F_SG) &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
				uarg = msg->msg_ubuf;
			}
		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb),
						    false);
			if (!uarg)
				return -ENOBUFS;
			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
			if (rt->dst.dev->features & NETIF_F_SG &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
			} else {
				uarg_to_msgzc(uarg)->zerocopy = 0;
				skb_zcopy_set(skb, uarg, &extra_uref);
			}
		}
	} else if ((flags & MSG_SPLICE_PAGES) && length) {
		if (inet_test_bit(HDRINCL, sk))
			return -EPERM;
		if (rt->dst.dev->features & NETIF_F_SG &&
		    getfrag == ip_generic_getfrag)
			/* We need an empty buffer to attach stuff to */
			paged = true;
		else
			flags &= ~MSG_SPLICE_PAGES;
	}

	if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
	    READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
		if (cork->flags & IPCORK_TS_OPT_ID) {
			tskey = cork->ts_opt_id;
		} else {
			tskey = atomic_inc_return(&sk->sk_tskey) - 1;
			hold_tskey = true;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
1573 * 1574 * FIXME: It may be fragmented into multiple chunks 1575 * at once if non-fragmentable extension headers 1576 * are too large. 1577 * --yoshfuji 1578 */ 1579 1580 cork->length += length; 1581 if (!skb) 1582 goto alloc_new_skb; 1583 1584 while (length > 0) { 1585 /* Check if the remaining data fits into current packet. */ 1586 copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len; 1587 if (copy < length) 1588 copy = maxfraglen - skb->len; 1589 1590 if (copy <= 0) { 1591 char *data; 1592 unsigned int datalen; 1593 unsigned int fraglen; 1594 unsigned int fraggap; 1595 unsigned int alloclen, alloc_extra; 1596 unsigned int pagedlen; 1597 alloc_new_skb: 1598 /* There's no room in the current skb */ 1599 if (skb) 1600 fraggap = skb->len - maxfraglen; 1601 else 1602 fraggap = 0; 1603 /* update mtu and maxfraglen if necessary */ 1604 if (!skb || !skb_prev) 1605 ip6_append_data_mtu(&mtu, &maxfraglen, 1606 fragheaderlen, skb, rt, 1607 orig_mtu); 1608 1609 skb_prev = skb; 1610 1611 /* 1612 * If remaining data exceeds the mtu, 1613 * we know we need more fragment(s). 1614 */ 1615 datalen = length + fraggap; 1616 1617 if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen) 1618 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len; 1619 fraglen = datalen + fragheaderlen; 1620 pagedlen = 0; 1621 1622 alloc_extra = hh_len; 1623 alloc_extra += dst_exthdrlen; 1624 alloc_extra += rt->dst.trailer_len; 1625 1626 /* We just reserve space for fragment header. 1627 * Note: this may be overallocation if the message 1628 * (without MSG_MORE) fits into the MTU. 1629 */ 1630 alloc_extra += sizeof(struct frag_hdr); 1631 1632 if ((flags & MSG_MORE) && 1633 !(rt->dst.dev->features&NETIF_F_SG)) 1634 alloclen = mtu; 1635 else if (!paged && 1636 (fraglen + alloc_extra < SKB_MAX_ALLOC || 1637 !(rt->dst.dev->features & NETIF_F_SG))) 1638 alloclen = fraglen; 1639 else { 1640 alloclen = fragheaderlen + transhdrlen; 1641 pagedlen = datalen - transhdrlen; 1642 } 1643 alloclen += alloc_extra; 1644 1645 if (datalen != length + fraggap) { 1646 /* 1647 * this is not the last fragment, the trailer 1648 * space is regarded as data space. 1649 */ 1650 datalen += rt->dst.trailer_len; 1651 } 1652 1653 fraglen = datalen + fragheaderlen; 1654 1655 copy = datalen - transhdrlen - fraggap - pagedlen; 1656 /* [!] NOTE: copy may be negative if pagedlen>0 1657 * because then the equation may reduces to -fraggap. 
			 */
			if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk, alloclen,
							  (flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
					    from, data + transhdrlen, offset,
					    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			} else if (flags & MSG_SPLICE_PAGES) {
				copy = 0;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features & NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			unsigned int off;

			off = skb->len;
			if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
					    from, skb_put(skb, copy),
					    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (flags & MSG_SPLICE_PAGES) {
			struct msghdr *msg = from;

			err = -EIO;
			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
				goto error;

			err = skb_splice_from_iter(skb, &msg->msg_iter, copy);
			if (err < 0)
				goto error;
			copy = err;
			wmem_alloc_delta += copy;
		} else if (!zc) {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			skb_zcopy_downgrade_managed(skb);
			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
					    from,
					    page_address(pfrag->page) + pfrag->offset,
					    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	net_zcopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	if (hold_tskey)
		atomic_dec(&sk->sk_tskey);
	return err;
}

int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, size_t length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags & MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		dst_hold(&rt->dst);
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt);
		if (err)
			return err;

		inet->cork.fl.u.ip6 = *fl6;
		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
{
	struct dst_entry *dst = cork->base.dst;

	cork->base.dst = NULL;
	skb_dst_set(skb, dst);
}

static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		struct ipv6_txoptions *opt = v6_cork->opt;

		kfree(opt->dst0opt);
		kfree(opt->dst1opt);
		kfree(opt->hopopt);
		kfree(opt->srcrt);
		kfree(opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
	}
}

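/* Collapse the cork queue into a single skb (the tail skbs become the
 * frag_list), push extension headers and the IPv6 header, and transfer the
 * corked dst and metadata to the resulting packet.
 */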
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr *final_dst;
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = dst_rt6_info(cork->base.dst);
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);
	__skb_pull(skb, skb_network_header_len(skb));

	final_dst = &fl6->daddr;
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, sk), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = cork->base.priority;
	skb->mark = cork->base.mark;
	if (sk_is_tcp(sk))
		skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC);
	else
		skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid);

	ip6_cork_steal_dst(skb, cork);
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
		u8 icmp6_type;

		if (sk->sk_socket->type == SOCK_RAW &&
		    !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH))
			icmp6_type = fl6->fl6_icmp_type;
		else
			icmp6_type = icmp6_hdr(skb)->icmp6_type;
		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
	int err;

	rcu_read_lock();
	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	rcu_read_unlock();
	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);

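/* Non-corked variant of ip6_append_data() + ip6_finish_skb(): build the
 * complete packet on a private queue in one call and return it to the
 * caller for transmission.
 */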
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
			     unsigned int flags, struct inet_cork_full *cork)
{
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE) {
		dst_release(&rt->dst);
		return NULL;
	}

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
	if (err) {
		ip6_cork_release(cork, &v6_cork);
		return ERR_PTR(err);
	}

	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
}