/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>
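/* Final output step: loop multicast packets back to local listeners
 * where required, honour lightweight-tunnel xmit redirects, then
 * resolve the nexthop neighbour and hand the skb to neigh_output().
 */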
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			 * is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	if (ret) {
		kfree_skb(skb);
		return ret;
	}

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}

int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);
	else
		return np->autoflowlabel;
}
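/* Sketch of how a transport protocol typically drives ip6_xmit() for a
 * connected socket (illustrative only; flow and option setup are
 * simplified, and the surrounding locking and error handling are
 * omitted):
 *
 *	struct flowi6 fl6;
 *
 *	memset(&fl6, 0, sizeof(fl6));
 *	fl6.flowi6_proto = IPPROTO_TCP;
 *	fl6.daddr = sk->sk_v6_daddr;
 *	fl6.saddr = np->saddr;
 *	fl6.flowi6_oif = sk->sk_bound_dev_if;
 *	skb_dst_set(skb, dst);
 *	ip6_xmit(sk, skb, &fl6, sk->sk_mark, rcu_dereference(np->opt),
 *		 np->tclass);
 */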
/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note: the socket lock is not held for SYNACK packets, but the socket
 * might be modified by calls to skb_set_owner_w() and ipv6_local_error(),
 * which use proper atomic operations or spinlocks.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now)
		 * MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (!skb2) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			consume_skb(skb);
			skb = skb2;
			/* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
			 * it is safe to call in our context (socket lock not held)
			 */
			skb_set_owner_w(skb, (struct sock *)sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
				 IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);
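/* Deliver a copy of a Router Alert packet (RFC 2711 hop-by-hop option)
 * to every raw socket that registered interest in this alert value via
 * the IPV6_ROUTER_ALERT socket option.  Returns 1 if a socket consumed
 * the skb, 0 if the caller still owns it.
 */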
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reactions involving unicast neighbour discovery
			 * messages destined to the proxied address, pass them
			 * to the input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	return dst_output(net, sk, skb);
}

unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
{
	unsigned int mtu;
	struct inet6_dev *idev;

	if (dst_metric_locked(dst, RTAX_MTU)) {
		mtu = dst_metric_raw(dst, RTAX_MTU);
		if (mtu)
			return mtu;
	}

	mtu = IPV6_MIN_MTU;
	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

	return mtu;
}
EXPORT_SYMBOL_GPL(ip6_dst_mtu_forward);

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}
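/* Forwarding path.  A packet is forwarded only after it clears, in
 * order: the per-namespace forwarding sysctl, pkt_type/LRO sanity
 * checks, the XFRM FWD policy, the Router Alert chain, the hop limit
 * (ICMPv6 Time Exceeded on expiry), proxy NDP, the XFRM route hook,
 * redirect/source-address validation, and the path MTU check (ICMPv6
 * Packet Too Big on failure).
 */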
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We do not perform any processing on RA packets; they are
	 *	pushed to user level AS IS, without any guarantee that an
	 *	application will be able to interpret them. The reason is
	 *	that we cannot do anything clever here.
	 *
	 *	We are not the end node, so if the packet contains AH/ESP
	 *	we cannot do anything with it. Defragmentation would also
	 *	be a mistake: RA packets must not be reassembled, because
	 *	there is no guarantee that different fragments will travel
	 *	along the same path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement hop limit
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, ip6_dst_idev(dst),
					IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* The IPv6 specs say nothing about it, but it is clear that we
	 * cannot send redirects to source routed frames.
	 * We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		 * and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Decrementing the hop limit is delayed until after the skb COW */

	hdr->hop_limit--;

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_copy_secmark(to, from);
}
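/* Fragmentation arithmetic, worked through for a common case (the
 * numbers are an illustration, not taken from the code below): with a
 * path MTU of 1500 and a 40-byte unfragmentable part (hlen, here just
 * the IPv6 header), the per-fragment data budget is
 *
 *	mtu = 1500 - (40 + sizeof(struct frag_hdr)) = 1452
 *
 * and every non-final fragment is trimmed to a multiple of 8 octets,
 * i.e. 1448 bytes of payload.  A 3000-byte payload therefore goes out
 * as fragments carrying 1448, 1448 and 104 bytes.
 */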
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb was not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			err = -ENOMEM;
			goto fail;
		}
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		__skb_pull(skb, hlen);
		fh = __skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		fh->identification = frag_id;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down.
			 */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = __skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}
slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		u8 *fragnexthdr_offset;

		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		 * then align the next start on an eight byte boundary
		 */
		if (len < left) {
			len &= ~7;
		}

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		fragnexthdr_offset = skb_network_header(frag);
		fragnexthdr_offset += prevhdr - skb_network_header(skb);
		*fragnexthdr_offset = NEXTHDR_FRAGMENT;

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the not connected case is not
	 * very simple. Take into account that we do not support routing
	 * by source, TOS, or MSG_DONTROUTE. --ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because the main consumer of this service
	 *    is TCP, which does not have this problem),
	 *    so the last trick works only for connected
	 *    sockets.
	 * 2. The oif must also match.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	     (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * With source-specific routing (no src=any default route),
	 * ip6_route_output will fail given a src=any saddr, though,
	 * which is why we retry the lookup later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * If the dst entry we looked up has a neighbour entry in the
	 * INCOMPLETE state and the source address from the flow is
	 * marked OPTIMISTIC, we release the dst entry found and
	 * replace it with the dst entry of the nexthop router.
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}
/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: network namespace to perform the lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
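/* Typical use of the _flow variants by a connection-oriented caller
 * (illustrative only; real callers also fill in ports, uid, etc.):
 *
 *	dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);
 *	ip6_dst_store(sk, dst, NULL, NULL);
 *
 * The returned dst has already been through xfrm_lookup_route(), so an
 * IPsec bundle, if any, is resolved at this point.
 */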
/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@connected: whether @sk is connected or not
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	In addition, for a connected socket, cache the dst in the socket
 *	if the current cache is not valid.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not the first; the header
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}
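/* Prepare the per-socket cork state for a sequence of ip6_append_data()
 * calls: deep-copy the tx options (extension headers), pin the route,
 * and compute the fragment size from the path MTU, honouring
 * IPV6_MTU_DISCOVER and the IPV6_MTU (np->frag_size) override.
 */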
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa */
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	return 0;
}
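/* Core of the datagram append path.  Data is accumulated on @queue in
 * packets of at most maxfraglen bytes, where
 *
 *	fragheaderlen = ipv6hdr + nfrag options (per-fragment headers)
 *	maxfraglen    = ((mtu - fragheaderlen) & ~7)
 *			+ fragheaderlen - sizeof(struct frag_hdr)
 *
 * so that, should ip6_fragment() later split the queue, every fragment
 * payload is a multiple of 8 octets and still has room for the fragment
 * header.  The first packet may use the full MTU while the total length
 * still fits in it.
 */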
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6,
			     const struct sockcm_cookie *sockc)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	mtu = cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * in the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				  sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    !(flags & MSG_MORE) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
		sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
			tskey = sk->sk_tskey++;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen + hh_len,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = tx_flags;
			tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;

			/*
			 * Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}

int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags,
		    const struct sockcm_cookie *sockc)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt, fl6);
		if (err)
			return err;

		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6, sockc);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}
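/* Collapse everything queued by __ip6_append_data() into one skb: the
 * first queued packet becomes the head and every later packet is
 * chained onto its frag_list, then the extension headers and the IPv6
 * header are pushed in front.  ip6_fragment() understands this
 * frag_list layout and can emit the chain via its fast path.
 */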
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
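/* Sketch of the two ways datagram protocols drive this file
 * (illustrative only; route lookup, cmsg parsing and locking are
 * omitted):
 *
 * Corked / multi-call path, as used by udpv6_sendmsg():
 *
 *	err = ip6_append_data(sk, getfrag, msg, len, sizeof(struct udphdr),
 *			      &ipc6, &fl6, rt, MSG_MORE, &sockc);
 *	...
 *	err = ip6_push_pending_frames(sk); // or ip6_flush_pending_frames()
 *
 * Single-shot path, which never touches sk->sk_write_queue:
 *
 *	skb = ip6_make_skb(sk, getfrag, msg, len, sizeof(struct udphdr),
 *			   &ipc6, &fl6, rt, 0, &sockc);
 *	if (!IS_ERR_OR_NULL(skb))
 *		err = ip6_send_skb(skb);
 */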
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     const struct sockcm_cookie *sockc)
{
	struct inet_cork_full cork;
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork.base.flags = 0;
	cork.base.addr = 0;
	cork.base.opt = NULL;
	cork.base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
	if (err) {
		ip6_cork_release(&cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6, sockc);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
}