/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	airthmetics in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *      H. von Brand    :       Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *      Kazunori MIYAZAWA @USAGI
 *			:       add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>

/* ip6_finish_output2 - last step of the IPv6 output path.
 *
 * Hands a fully-built IPv6 packet to the neighbour layer for actual
 * transmission on skb's dst device.  Handles three special cases first:
 *  - multicast destinations: optionally loop a clone of the packet back
 *    to local listeners via the POST_ROUTING hook, count OUTMCAST bytes,
 *    and drop packets whose multicast scope is node-local when the device
 *    is not loopback;
 *  - light-weight tunnel state that redirects transmission (lwtunnel_xmit);
 *  - neighbour resolution: look up (or create) the neighbour entry for the
 *    route's nexthop under rcu_read_lock_bh() and queue the skb through it.
 *
 * Returns 0 on success/consumed, a negative errno on failure; on the
 * neighbour-failure path the skb is freed and OUTNOROUTES is bumped.
 */
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
    struct dst_entry *dst = skb_dst(skb);
    struct net_device *dev = dst->dev;
    struct neighbour *neigh;
    struct in6_addr *nexthop;
    int ret;

    skb->protocol = htons(ETH_P_IPV6);
    skb->dev = dev;

    if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
        struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

        /* Loop a copy back to local members of this multicast group
         * unless the socket disabled multicast loopback.  Forwarded
         * mroute packets (IP6SKB_FORWARDED) are not looped again.
         */
        if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
            ((mroute6_socket(net, skb) &&
             !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
             ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
                     &ipv6_hdr(skb)->saddr))) {
            struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

            /* Do not check for IFF_ALLMULTI; multicast routing
               is not supported in any case.
             */
            if (newskb)
                NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                    net, sk, newskb, NULL, newskb->dev,
                    dev_loopback_xmit);

            /* hop_limit 0 means the sender only wanted the local
             * loopback delivery above; do not put it on the wire.
             */
            if (ipv6_hdr(skb)->hop_limit == 0) {
                IP6_INC_STATS(net, idev,
                          IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
                return 0;
            }
        }

        IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

        /* Node-local scope multicast must never leave the host. */
        if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
            IPV6_ADDR_SCOPE_NODELOCAL &&
            !(dev->flags & IFF_LOOPBACK)) {
            kfree_skb(skb);
            return 0;
        }
    }

    /* Light-weight tunnel may take over transmission entirely. */
    if (lwtunnel_xmit_redirect(dst->lwtstate)) {
        int res = lwtunnel_xmit(skb);

        if (res < 0 || res == LWTUNNEL_XMIT_DONE)
            return res;
    }

    rcu_read_lock_bh();
    nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
    neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
    if (unlikely(!neigh))
        neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
    if (!IS_ERR(neigh)) {
        ret = dst_neigh_output(dst, neigh, skb);
        rcu_read_unlock_bh();
        return ret;
    }
    rcu_read_unlock_bh();

    IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
    kfree_skb(skb);
    return -EINVAL;
}

/* ip6_finish_output - decide between direct transmit and fragmentation.
 *
 * Fragment when the packet exceeds the path MTU and is not GSO, when the
 * route demands fragmentation on every packet (dst_allfrag), or when
 * conntrack recorded a smaller maximum fragment size on input
 * (frag_max_size); otherwise go straight to ip6_finish_output2().
 */
static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
    if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
        dst_allfrag(skb_dst(skb)) ||
        (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
        return ip6_fragment(net, sk, skb, ip6_finish_output2);
    else
        return ip6_finish_output2(net, sk, skb);
}

/* ip6_output - dst_entry output hook for locally generated/forwarded IPv6.
 *
 * Drops the packet when IPv6 is administratively disabled on the egress
 * device, otherwise runs the POST_ROUTING netfilter hook (skipped when the
 * packet was already rerouted, IP6SKB_REROUTED) before ip6_finish_output().
 */
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
    struct net_device *dev = skb_dst(skb)->dev;
    struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

    if (unlikely(idev->cnf.disable_ipv6)) {
        IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
        kfree_skb(skb);
        return 0;
    }

    return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
                net, sk, skb, NULL, dev,
                ip6_finish_output,
                !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 *
 * Prepends extension headers (from @opt) and the IPv6 header to @skb,
 * reallocating headroom if the exthdrs do not fit, then sends the packet
 * through the LOCAL_OUT netfilter hook.  Oversized non-GSO packets that
 * may not be fragmented (ignore_df clear) get an EMSGSIZE local error.
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
         struct ipv6_txoptions *opt, int tclass)
{
    struct net *net = sock_net(sk);
    const struct ipv6_pinfo *np = inet6_sk(sk);
    struct in6_addr *first_hop = &fl6->daddr;
    struct dst_entry *dst = skb_dst(skb);
    struct ipv6hdr *hdr;
    u8 proto = fl6->flowi6_proto;
    int seg_len = skb->len;
    int hlimit = -1;
    u32 mtu;

    if (opt) {
        unsigned int head_room;

        /* First: exthdrs may take lots of space (~8K for now)
           MAX_HEADER is not enough.
         */
        head_room = opt->opt_nflen + opt->opt_flen;
        seg_len += head_room;
        head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

        if (skb_headroom(skb) < head_room) {
            struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
            if (!skb2) {
                IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                          IPSTATS_MIB_OUTDISCARDS);
                kfree_skb(skb);
                return -ENOBUFS;
            }
            consume_skb(skb);
            skb = skb2;
            /* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
             * it is safe to call in our context (socket lock not held)
             */
            skb_set_owner_w(skb, (struct sock *)sk);
        }
        /* Destination options go innermost; hop-by-hop/routing headers
         * may rewrite first_hop to the routing header's first segment.
         */
        if (opt->opt_flen)
            ipv6_push_frag_opts(skb, opt, &proto);
        if (opt->opt_nflen)
            ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
    }

    skb_push(skb, sizeof(struct ipv6hdr));
    skb_reset_network_header(skb);
    hdr = ipv6_hdr(skb);

    /*
     *	Fill in the IPv6 header
     */
    if (np)
        hlimit = np->hop_limit;
    if (hlimit < 0)
        hlimit = ip6_dst_hoplimit(dst);

    /* NOTE(review): np is NULL-checked just above, but dereferenced
     * unconditionally here for autoflowlabel — confirm np cannot be
     * NULL on this path (upstream later guarded this).
     */
    ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
                             np->autoflowlabel, fl6));

    hdr->payload_len = htons(seg_len);
    hdr->nexthdr = proto;
    hdr->hop_limit = hlimit;

    hdr->saddr = fl6->saddr;
    hdr->daddr = *first_hop;

    skb->protocol = htons(ETH_P_IPV6);
    skb->priority = sk->sk_priority;
    skb->mark = sk->sk_mark;

    mtu = dst_mtu(dst);
    if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
        IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
                  IPSTATS_MIB_OUT, skb->len);

        /* if egress device is enslaved to an L3 master device pass the
         * skb to its handler for processing
         */
        skb = l3mdev_ip6_out((struct sock *)sk, skb);
        if (unlikely(!skb))
            return 0;

        /* hooks should never assume socket lock is held.
         * we promote our socket to non const
         */
        return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
                   net, (struct sock *)sk, skb, NULL, dst->dev,
                   dst_output);
    }

    skb->dev = dst->dev;
    /* ipv6_local_error() does not require socket lock,
     * we promote our socket to non const
     */
    ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

    IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
    kfree_skb(skb);
    return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

/* ip6_call_ra_chain - deliver a Router Alert packet to interested raw sockets.
 *
 * Walks the global ip6_ra_chain under ip6_ra_lock and hands a clone of @skb
 * to every matching socket (same RA selector, compatible bound device),
 * giving the original skb to the last match.  Returns 1 if the packet was
 * consumed by at least one socket, 0 otherwise (caller keeps ownership).
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
    struct ip6_ra_chain *ra;
    struct sock *last = NULL;

    read_lock(&ip6_ra_lock);
    for (ra = ip6_ra_chain; ra; ra = ra->next) {
        struct sock *sk = ra->sk;
        if (sk && ra->sel == sel &&
            (!sk->sk_bound_dev_if ||
             sk->sk_bound_dev_if == skb->dev->ifindex)) {
            if (last) {
                /* Not the last listener: give it a clone so the
                 * original can still go to the final match.
                 */
                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                if (skb2)
                    rawv6_rcv(last, skb2);
            }
            last = sk;
        }
    }

    if (last) {
        rawv6_rcv(last, skb);
        read_unlock(&ip6_ra_lock);
        return 1;
    }
    read_unlock(&ip6_ra_lock);
    return 0;
}

/* ip6_forward_proxy_check - decide how to handle a packet for a proxied
 * neighbor address.
 *
 * Returns 1 when the packet is a unicast neighbor-discovery ICMPv6 message
 * that must be passed to local input instead of being forwarded, -1 when it
 * targets a link-local address (signal failure and drop), 0 to forward
 * normally.
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
    struct ipv6hdr *hdr = ipv6_hdr(skb);
    u8 nexthdr = hdr->nexthdr;
    __be16 frag_off;
    int offset;

    /* Skip any extension headers to find the transport protocol. */
    if (ipv6_ext_hdr(nexthdr)) {
        offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
        if (offset < 0)
            return 0;
    } else
        offset = sizeof(struct ipv6hdr);

    if (nexthdr == IPPROTO_ICMPV6) {
        struct icmp6hdr *icmp6;

        /* Need at least the ICMPv6 type byte in linear data. */
        if (!pskb_may_pull(skb, (skb_network_header(skb) +
                     offset + 1 - skb->data)))
            return 0;

        icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

        switch (icmp6->icmp6_type) {
        case NDISC_ROUTER_SOLICITATION:
        case NDISC_ROUTER_ADVERTISEMENT:
        case NDISC_NEIGHBOUR_SOLICITATION:
        case NDISC_NEIGHBOUR_ADVERTISEMENT:
        case NDISC_REDIRECT:
            /* For reaction involving unicast neighbor discovery
             * message destined to the proxied address, pass it to
             * input function.
             */
            return 1;
        default:
            break;
        }
    }

    /*
     * The proxying router can't forward traffic sent to a link-local
     * address, so signal the sender and discard the packet. This
     * behavior is clarified by the MIPv6 specification.
     */
    if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
        dst_link_failure(skb);
        return -1;
    }

    return 0;
}

/* Final step of forwarding: hand the packet to the dst's output function. */
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
                     struct sk_buff *skb)
{
    return dst_output(net, sk, skb);
}

/* ip6_dst_mtu_forward - MTU to honour when forwarding via @dst.
 *
 * A locked route metric wins; otherwise fall back to the egress device's
 * IPv6 MTU (read under RCU), defaulting to IPV6_MIN_MTU.
 */
static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
{
    unsigned int mtu;
    struct inet6_dev *idev;

    if (dst_metric_locked(dst, RTAX_MTU)) {
        mtu = dst_metric_raw(dst, RTAX_MTU);
        if (mtu)
            return mtu;
    }

    mtu = IPV6_MIN_MTU;
    rcu_read_lock();
    idev = __in6_dev_get(dst->dev);
    if (idev)
        mtu = idev->cnf.mtu6;
    rcu_read_unlock();

    return mtu;
}

/* ip6_pkt_too_big - should this forwarded packet trigger PKT_TOOBIG?
 *
 * True when skb exceeds @mtu and neither the conntrack defrag exception
 * (ignore_df with an acceptable frag_max_size) nor GSO segmentation to
 * MTU-sized frames applies.
 */
static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
    if (skb->len <= mtu)
        return false;

    /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
    if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
        return true;

    if (skb->ignore_df)
        return false;

    if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
        return false;

    return true;
}

/* ip6_forward - forward an IPv6 packet received on another interface.
 *
 * Performs the full RFC-mandated forwarding checks: forwarding enabled,
 * host-addressed packet, XFRM policy, Router Alert delivery, hop limit
 * decrement (Time Exceeded when it would reach 0), NDP proxying, redirect
 * generation when packet re-exits its ingress device, source-address
 * sanity checks, and MTU enforcement (Packet Too Big).  Consumes @skb in
 * all cases; returns 0/-errno, or the NF_HOOK verdict chain on success.
 */
int ip6_forward(struct sk_buff *skb)
{
    struct dst_entry *dst = skb_dst(skb);
    struct ipv6hdr *hdr = ipv6_hdr(skb);
    struct inet6_skb_parm *opt = IP6CB(skb);
    struct net *net = dev_net(dst->dev);
    u32 mtu;

    if (net->ipv6.devconf_all->forwarding == 0)
        goto error;

    if (skb->pkt_type != PACKET_HOST)
        goto drop;

    if (unlikely(skb->sk))
        goto drop;

    if (skb_warn_if_lro(skb))
        goto drop;

    if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
        __IP6_INC_STATS(net, ip6_dst_idev(dst),
                IPSTATS_MIB_INDISCARDS);
        goto drop;
    }

    skb_forward_csum(skb);

    /*
     *	We DO NOT make any processing on
     *	RA packets, pushing them to user level AS IS
     *	without ane WARRANTY that application will be able
     *	to interpret them. The reason is that we
     *	cannot make anything clever here.
     *
     *	We are not end-node, so that if packet contains
     *	AH/ESP, we cannot make anything.
     *	Defragmentation also would be mistake, RA packets
     *	cannot be fragmented, because there is no warranty
     *	that different fragments will go along one path. --ANK
     */
    if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
        if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
            return 0;
    }

    /*
     *	check and decrement ttl
     */
    if (hdr->hop_limit <= 1) {
        /* Force OUTPUT device used as source address */
        skb->dev = dst->dev;
        icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
        __IP6_INC_STATS(net, ip6_dst_idev(dst),
                IPSTATS_MIB_INHDRERRORS);

        kfree_skb(skb);
        return -ETIMEDOUT;
    }

    /* XXX: idev->cnf.proxy_ndp? */
    if (net->ipv6.devconf_all->proxy_ndp &&
        pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
        int proxied = ip6_forward_proxy_check(skb);
        if (proxied > 0)
            return ip6_input(skb);
        else if (proxied < 0) {
            __IP6_INC_STATS(net, ip6_dst_idev(dst),
                    IPSTATS_MIB_INDISCARDS);
            goto drop;
        }
    }

    if (!xfrm6_route_forward(skb)) {
        __IP6_INC_STATS(net, ip6_dst_idev(dst),
                IPSTATS_MIB_INDISCARDS);
        goto drop;
    }
    /* xfrm6_route_forward() may have swapped the dst. */
    dst = skb_dst(skb);

    /* IPv6 specs say nothing about it, but it is clear that we cannot
       send redirects to source routed frames.
       We don't send redirects to frames decapsulated from IPsec.
     */
    if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
        struct in6_addr *target = NULL;
        struct inet_peer *peer;
        struct rt6_info *rt;

        /*
         *	incoming and outgoing devices are the same
         *	send a redirect.
         */

        rt = (struct rt6_info *) dst;
        if (rt->rt6i_flags & RTF_GATEWAY)
            target = &rt->rt6i_gateway;
        else
            target = &hdr->daddr;

        peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

        /* Limit redirects both by destination (here)
           and by source (inside ndisc_send_redirect)
         */
        if (inet_peer_xrlim_allow(peer, 1*HZ))
            ndisc_send_redirect(skb, target);
        if (peer)
            inet_putpeer(peer);
    } else {
        int addrtype = ipv6_addr_type(&hdr->saddr);

        /* This check is security critical. */
        if (addrtype == IPV6_ADDR_ANY ||
            addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
            goto error;
        if (addrtype & IPV6_ADDR_LINKLOCAL) {
            icmpv6_send(skb, ICMPV6_DEST_UNREACH,
                    ICMPV6_NOT_NEIGHBOUR, 0);
            goto error;
        }
    }

    mtu = ip6_dst_mtu_forward(dst);
    if (mtu < IPV6_MIN_MTU)
        mtu = IPV6_MIN_MTU;

    if (ip6_pkt_too_big(skb, mtu)) {
        /* Again, force OUTPUT device used as source address */
        skb->dev = dst->dev;
        icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
        __IP6_INC_STATS(net, ip6_dst_idev(dst),
                IPSTATS_MIB_INTOOBIGERRORS);
        __IP6_INC_STATS(net, ip6_dst_idev(dst),
                IPSTATS_MIB_FRAGFAILS);
        kfree_skb(skb);
        return -EMSGSIZE;
    }

    if (skb_cow(skb, dst->dev->hard_header_len)) {
        __IP6_INC_STATS(net, ip6_dst_idev(dst),
                IPSTATS_MIB_OUTDISCARDS);
        goto drop;
    }

    hdr = ipv6_hdr(skb);

    /* Mangling hops number delayed to point after skb COW */

    hdr->hop_limit--;

    __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
    __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
    return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
               net, NULL, skb, skb->dev, dst->dev,
               ip6_forward_finish);

error:
    __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
    kfree_skb(skb);
    return -EINVAL;
}

/* Copy per-packet metadata (type, priority, protocol, dst, device, mark,
 * tc index, netfilter and security state) from @from to a freshly built
 * fragment @to.
 */
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
    to->pkt_type = from->pkt_type;
    to->priority = from->priority;
    to->protocol = from->protocol;
    skb_dst_drop(to);
    skb_dst_set(to, dst_clone(skb_dst(from)));
    to->dev = from->dev;
    to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
    to->tc_index = from->tc_index;
#endif
    nf_copy(to, from);
    skb_copy_secmark(to, from);
}

/* ip6_fragment - split @skb into MTU-sized fragments and send each via
 * @output.
 *
 * Uses the fast path (re-using an existing frag_list in place) when the
 * skb geometry permits, otherwise falls back to a slow path that copies
 * data into newly allocated per-fragment skbs.  Consumes @skb; returns 0
 * on success or a negative errno (bumping the FRAG* SNMP counters).
 *
 * NOTE(review): ip6_find_1stfragopt()'s result is used unchecked here;
 * later upstream kernels handle a negative return — confirm for this tree.
 */
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
         int (*output)(struct net *, struct sock *, struct sk_buff *))
{
    struct sk_buff *frag;
    struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
    struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
                inet6_sk(skb->sk) : NULL;
    struct ipv6hdr *tmp_hdr;
    struct frag_hdr *fh;
    unsigned int mtu, hlen, left, len;
    int hroom, troom;
    __be32 frag_id;
    int ptr, offset = 0, err = 0;
    u8 *prevhdr, nexthdr = 0;

    hlen = ip6_find_1stfragopt(skb, &prevhdr);
    nexthdr = *prevhdr;

    mtu = ip6_skb_dst_mtu(skb);

    /* We must not fragment if the socket is set to force MTU discovery
     * or if the skb it not generated by a local socket.
     */
    if (unlikely(!skb->ignore_df && skb->len > mtu))
        goto fail_toobig;

    if (IP6CB(skb)->frag_max_size) {
        if (IP6CB(skb)->frag_max_size > mtu)
            goto fail_toobig;

        /* don't send fragments larger than what we received */
        mtu = IP6CB(skb)->frag_max_size;
        if (mtu < IPV6_MIN_MTU)
            mtu = IPV6_MIN_MTU;
    }

    /* Honour a smaller per-socket IPV6_MTU (frag_size) if set. */
    if (np && np->frag_size < mtu) {
        if (np->frag_size)
            mtu = np->frag_size;
    }
    if (mtu < hlen + sizeof(struct frag_hdr) + 8)
        goto fail_toobig;
    /* From here on, mtu is the per-fragment payload budget past the
     * unfragmentable part + fragment header.
     */
    mtu -= hlen + sizeof(struct frag_hdr);

    frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
                    &ipv6_hdr(skb)->saddr);

    /* Checksum must be finalized before the payload is split up. */
    if (skb->ip_summed == CHECKSUM_PARTIAL &&
        (err = skb_checksum_help(skb)))
        goto fail;

    hroom = LL_RESERVED_SPACE(rt->dst.dev);
    if (skb_has_frag_list(skb)) {
        int first_len = skb_pagelen(skb);
        struct sk_buff *frag2;

        /* Fast path is only safe if every piece already has the
         * right size/alignment and enough headroom for the
         * fragment header, and nothing is cloned/shared.
         */
        if (first_len - hlen > mtu ||
            ((first_len - hlen) & 7) ||
            skb_cloned(skb) ||
            skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
            goto slow_path;

        skb_walk_frags(skb, frag) {
            /* Correct geometry. */
            if (frag->len > mtu ||
                ((frag->len & 7) && frag->next) ||
                skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
                goto slow_path_clean;

            /* Partially cloned skb? */
            if (skb_shared(frag))
                goto slow_path_clean;

            BUG_ON(frag->sk);
            if (skb->sk) {
                /* Transfer wmem accounting to each fragment. */
                frag->sk = skb->sk;
                frag->destructor = sock_wfree;
            }
            skb->truesize -= frag->truesize;
        }

        err = 0;
        offset = 0;
        /* BUILD HEADER */

        *prevhdr = NEXTHDR_FRAGMENT;
        tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
        if (!tmp_hdr) {
            IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGFAILS);
            err = -ENOMEM;
            goto fail;
        }
        /* Detach the frag list; each member becomes its own packet. */
        frag = skb_shinfo(skb)->frag_list;
        skb_frag_list_init(skb);

        /* Insert the fragment header between the unfragmentable part
         * and the payload of the first fragment.
         */
        __skb_pull(skb, hlen);
        fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
        __skb_push(skb, hlen);
        skb_reset_network_header(skb);
        memcpy(skb_network_header(skb), tmp_hdr, hlen);

        fh->nexthdr = nexthdr;
        fh->reserved = 0;
        fh->frag_off = htons(IP6_MF);
        fh->identification = frag_id;

        first_len = skb_pagelen(skb);
        skb->data_len = first_len - skb_headlen(skb);
        skb->len = first_len;
        ipv6_hdr(skb)->payload_len = htons(first_len -
                           sizeof(struct ipv6hdr));

        dst_hold(&rt->dst);

        for (;;) {
            /* Prepare header of the next frame,
             * before previous one went down. */
            if (frag) {
                frag->ip_summed = CHECKSUM_NONE;
                skb_reset_transport_header(frag);
                fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
                __skb_push(frag, hlen);
                skb_reset_network_header(frag);
                memcpy(skb_network_header(frag), tmp_hdr,
                       hlen);
                offset += skb->len - hlen - sizeof(struct frag_hdr);
                fh->nexthdr = nexthdr;
                fh->reserved = 0;
                fh->frag_off = htons(offset);
                if (frag->next)
                    fh->frag_off |= htons(IP6_MF);
                fh->identification = frag_id;
                ipv6_hdr(frag)->payload_len =
                        htons(frag->len -
                              sizeof(struct ipv6hdr));
                ip6_copy_metadata(frag, skb);
            }

            err = output(net, sk, skb);
            if (!err)
                IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                          IPSTATS_MIB_FRAGCREATES);

            if (err || !frag)
                break;

            skb = frag;
            frag = skb->next;
            skb->next = NULL;
        }

        kfree(tmp_hdr);

        if (err == 0) {
            IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                      IPSTATS_MIB_FRAGOKS);
            ip6_rt_put(rt);
            return 0;
        }

        /* Output failed mid-stream: discard the unsent fragments. */
        kfree_skb_list(frag);

        IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
                  IPSTATS_MIB_FRAGFAILS);
        ip6_rt_put(rt);
        return err;

slow_path_clean:
        /* Undo the ownership transfer done above for the frags we
         * already walked before bailing to the slow path.
         */
        skb_walk_frags(skb, frag2) {
            if (frag2 == frag)
                break;
            frag2->sk = NULL;
            frag2->destructor = NULL;
            skb->truesize += frag2->truesize;
        }
    }

slow_path:
    left = skb->len - hlen;		/* Space per frame */
    ptr = hlen;			/* Where to start from */

    /*
     *	Fragment the datagram.
     */

    *prevhdr = NEXTHDR_FRAGMENT;
    troom = rt->dst.dev->needed_tailroom;

    /*
     *	Keep copying data until we run out.
     */
    while (left > 0)	{
        len = left;
        /* IF: it doesn't fit, use 'mtu' - the data space left */
        if (len > mtu)
            len = mtu;
        /* IF: we are not sending up to and including the packet end
           then align the next start on an eight byte boundary */
        if (len < left)	{
            len &= ~7;
        }

        /* Allocate buffer */
        frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
                 hroom + troom, GFP_ATOMIC);
        if (!frag) {
            IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                      IPSTATS_MIB_FRAGFAILS);
            err = -ENOMEM;
            goto fail;
        }

        /*
         *	Set up data on packet
         */

        ip6_copy_metadata(frag, skb);
        skb_reserve(frag, hroom);
        skb_put(frag, len + hlen + sizeof(struct frag_hdr));
        skb_reset_network_header(frag);
        fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
        frag->transport_header = (frag->network_header + hlen +
                      sizeof(struct frag_hdr));

        /*
         *	Charge the memory for the fragment to any owner
         *	it might possess
         */
        if (skb->sk)
            skb_set_owner_w(frag, skb->sk);

        /*
         *	Copy the packet header into the new buffer.
         */
        skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

        /*
         *	Build fragment header.
         */
        fh->nexthdr = nexthdr;
        fh->reserved = 0;
        fh->identification = frag_id;

        /*
         *	Copy a block of the IP datagram.
         */
        BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
                     len));
        left -= len;

        fh->frag_off = htons(offset);
        if (left > 0)
            fh->frag_off |= htons(IP6_MF);
        ipv6_hdr(frag)->payload_len = htons(frag->len -
                            sizeof(struct ipv6hdr));

        ptr += len;
        offset += len;

        /*
         *	Put this fragment into the sending queue.
         */
        err = output(net, sk, frag);
        if (err)
            goto fail;

        IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
                  IPSTATS_MIB_FRAGCREATES);
    }
    IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
              IPSTATS_MIB_FRAGOKS);
    consume_skb(skb);
    return err;

fail_toobig:
    if (skb->sk && dst_allfrag(skb_dst(skb)))
        sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

    skb->dev = skb_dst(skb)->dev;
    icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
    err = -EMSGSIZE;

fail:
    IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
              IPSTATS_MIB_FRAGFAILS);
    kfree_skb(skb);
    return err;
}

/* ip6_rt_check - true when @fl_addr is NOT covered by this route key:
 * neither an exact /128 match on the key nor equal to the cached address.
 */
static inline int ip6_rt_check(const struct rt6key *rt_key,
                   const struct in6_addr *fl_addr,
                   const struct in6_addr *addr_cache)
{
    return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
        (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

/* ip6_sk_dst_check - validate a socket-cached dst against flow @fl6.
 *
 * Returns @dst when it is still usable for this flow, or NULL after
 * releasing it (also releases and returns NULL for a non-IPv6 dst).
 */
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
                      struct dst_entry *dst,
                      const struct flowi6 *fl6)
{
    struct ipv6_pinfo *np = inet6_sk(sk);
    struct rt6_info *rt;

    if (!dst)
        goto out;

    if (dst->ops->family != AF_INET6) {
        dst_release(dst);
        return NULL;
    }

    rt = (struct rt6_info *)dst;
    /* Yes, checking route validity in not connected
     * case is not very simple. Take into account,
     * that we do not support routing by source, TOS,
     * and MSG_DONTROUTE		--ANK (980726)
     *
     * 1. ip6_rt_check(): If route was host route,
     *    check that cached destination is current.
     *    If it is network route, we still may
     *    check its validity using saved pointer
     *    to the last used address: daddr_cache.
     *	  We do not want to save whole address now,
     *    (because main consumer of this service
     *    is tcp, which has not this problem),
     *    so that the last trick works only on connected
     *    sockets.
     * 2. oif also should be the same.
     */
    if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
        ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
        (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
         (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
        dst_release(dst);
        dst = NULL;
    }

out:
    return dst;
}

/* ip6_dst_lookup_tail - core routing-table lookup shared by the public
 * lookup helpers.
 *
 * Resolves a source address for the flow when it is unspecified (via
 * l3mdev or ip6_route_get_saddr), then performs the route lookup and,
 * under CONFIG_IPV6_OPTIMISTIC_DAD, may re-route to the default router
 * when the chosen source is optimistic and the nexthop neighbour is not
 * yet valid.  On success *dst holds a referenced dst and 0 is returned;
 * on failure *dst is NULL and a negative errno is returned.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
                   struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
    struct neighbour *n;
    struct rt6_info *rt;
#endif
    int err;
    int flags = 0;

    /* L3 master device (VRF) may supply the source address. */
    if (ipv6_addr_any(&fl6->saddr) && fl6->flowi6_oif &&
        (!*dst || !(*dst)->error)) {
        err = l3mdev_get_saddr6(net, sk, fl6);
        if (err)
            goto out_err;
    }

    /* The correct way to handle this would be to do
     * ip6_route_get_saddr, and then ip6_route_output; however,
     * the route-specific preferred source forces the
     * ip6_route_output call _before_ ip6_route_get_saddr.
     *
     * In source specific routing (no src=any default route),
     * ip6_route_output will fail given src=any saddr, though, so
     * that's why we try it again later.
     */
    if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
        struct rt6_info *rt;
        bool had_dst = *dst != NULL;

        if (!had_dst)
            *dst = ip6_route_output(net, sk, fl6);
        rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
        err = ip6_route_get_saddr(net, rt, &fl6->daddr,
                      sk ? inet6_sk(sk)->srcprefs : 0,
                      &fl6->saddr);
        if (err)
            goto out_err_release;

        /* If we had an erroneous initial result, pretend it
         * never existed and let the SA-enabled version take
         * over.
         */
        if (!had_dst && (*dst)->error) {
            dst_release(*dst);
            *dst = NULL;
        }

        if (fl6->flowi6_oif)
            flags |= RT6_LOOKUP_F_IFACE;
    }

    if (!*dst)
        *dst = ip6_route_output_flags(net, sk, fl6, flags);

    err = (*dst)->error;
    if (err)
        goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
    /*
     * Here if the dst entry we've looked up
     * has a neighbour entry that is in the INCOMPLETE
     * state and the src address from the flow is
     * marked as OPTIMISTIC, we release the found
     * dst entry and replace it instead with the
     * dst entry of the nexthop router
     */
    rt = (struct rt6_info *) *dst;
    rcu_read_lock_bh();
    n = __ipv6_neigh_lookup_noref(rt->dst.dev,
                      rt6_nexthop(rt, &fl6->daddr));
    err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
    rcu_read_unlock_bh();

    if (err) {
        struct inet6_ifaddr *ifp;
        struct flowi6 fl_gw6;
        int redirect;

        ifp = ipv6_get_ifaddr(net, &fl6->saddr,
                      (*dst)->dev, 1);

        redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
        if (ifp)
            in6_ifa_put(ifp);

        if (redirect) {
            /*
             * We need to get the dst entry for the
             * default router instead
             */
            dst_release(*dst);
            memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
            memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
            *dst = ip6_route_output(net, sk, &fl_gw6);
            err = (*dst)->error;
            if (err)
                goto out_err_release;
        }
    }
#endif

    return 0;

out_err_release:
    dst_release(*dst);
    *dst = NULL;
out_err:
    if (err == -ENETUNREACH)
        IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
    return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
           struct flowi6 *fl6)
{
    /* Start from no cached dst; the tail helper fills *dst. */
    *dst = NULL;
    return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
                      const struct in6_addr *final_dst)
{
    struct dst_entry *dst = NULL;
    int err;

    err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
    if (err)
        return ERR_PTR(err);
    /* Route to the ultimate destination (past any IPsec tunnel). */
    if (final_dst)
        fl6->daddr = *final_dst;

    return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
                     const struct in6_addr *final_dst)
{
    struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

    /* Fall back to a full lookup when the cached dst is unusable. */
    dst = ip6_sk_dst_check(sk, dst, fl6);
    if (!dst)
        dst = ip6_dst_lookup_flow(sk, fl6, final_dst);

    return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

/* ip6_ufo_append_data - append datagram data for UDP fragmentation offload.
 *
 * Builds (or extends) a single large GSO skb on @queue instead of software
 * fragmenting, filling in gso_size/gso_type and a fragment ID so the device
 * can segment.  Returns 0 on success or a negative errno.
 */
static inline int ip6_ufo_append_data(struct sock *sk,
            struct sk_buff_head *queue,
            int getfrag(void *from, char *to, int offset, int len,
            int odd, struct sk_buff *skb),
            void *from, int length, int hh_len, int fragheaderlen,
            int exthdrlen, int transhdrlen, int mtu,
            unsigned int flags, const struct flowi6 *fl6)

{
    struct sk_buff *skb;
    int err;

    /* There is support for UDP large send offload by network
     * device, so create one single skb packet containing complete
     * udp datagram
     */
    skb = skb_peek_tail(queue);
    if (!skb) {
        skb = sock_alloc_send_skb(sk,
            hh_len + fragheaderlen + transhdrlen + 20,
            (flags & MSG_DONTWAIT), &err);
        if (!skb)
            return err;

        /* reserve space for Hardware header */
        skb_reserve(skb, hh_len);

        /* create space for UDP/IP header */
        skb_put(skb, fragheaderlen + transhdrlen);

        /* initialize network header pointer */
        skb_set_network_header(skb, exthdrlen);

        /* initialize protocol header pointer */
        skb->transport_header = skb->network_header + fragheaderlen;

        skb->protocol = htons(ETH_P_IPV6);
        skb->csum = 0;

        __skb_queue_tail(queue, skb);
    } else if (skb_is_gso(skb)) {
        goto append;
    }

    skb->ip_summed = CHECKSUM_PARTIAL;
    /* Specify the length of each IPv6 datagram fragment.
     * It has to be a multiple of 8.
     */
    skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
                     sizeof(struct frag_hdr)) & ~7;
    skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
    skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
                             &fl6->daddr,
                             &fl6->saddr);

append:
    return skb_append_datato_frags(sk, skb, getfrag, from,
                       (length - transhdrlen));
}

/* Duplicate an IPv6 option header; length is (hdrlen + 1) * 8 bytes per
 * the extension-header encoding.  Returns NULL when @src is NULL or on
 * allocation failure.
 */
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
                           gfp_t gfp)
{
    return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

/* Same as ip6_opt_dup() but for a routing header. */
static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
                        gfp_t gfp)
{
    return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

/* ip6_append_data_mtu - recompute *mtu and *maxfraglen for append_data
 * once the first fragment has been created (non-tunnel dst only).
 */
static void ip6_append_data_mtu(unsigned int *mtu,
                int *maxfraglen,
                unsigned int fragheaderlen,
                struct sk_buff *skb,
                struct rt6_info *rt,
                unsigned int orig_mtu)
{
    if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
        if (!skb) {
            /* first fragment, reserve header_len */
            *mtu = orig_mtu - rt->dst.header_len;

        } else {
            /*
             * this fragment is not first, the headers
             * space is regarded as data space.
1193 */ 1194 *mtu = orig_mtu; 1195 } 1196 *maxfraglen = ((*mtu - fragheaderlen) & ~7) 1197 + fragheaderlen - sizeof(struct frag_hdr); 1198 } 1199 } 1200 1201 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, 1202 struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6, 1203 struct rt6_info *rt, struct flowi6 *fl6) 1204 { 1205 struct ipv6_pinfo *np = inet6_sk(sk); 1206 unsigned int mtu; 1207 struct ipv6_txoptions *opt = ipc6->opt; 1208 1209 /* 1210 * setup for corking 1211 */ 1212 if (opt) { 1213 if (WARN_ON(v6_cork->opt)) 1214 return -EINVAL; 1215 1216 v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation); 1217 if (unlikely(!v6_cork->opt)) 1218 return -ENOBUFS; 1219 1220 v6_cork->opt->tot_len = opt->tot_len; 1221 v6_cork->opt->opt_flen = opt->opt_flen; 1222 v6_cork->opt->opt_nflen = opt->opt_nflen; 1223 1224 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt, 1225 sk->sk_allocation); 1226 if (opt->dst0opt && !v6_cork->opt->dst0opt) 1227 return -ENOBUFS; 1228 1229 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt, 1230 sk->sk_allocation); 1231 if (opt->dst1opt && !v6_cork->opt->dst1opt) 1232 return -ENOBUFS; 1233 1234 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt, 1235 sk->sk_allocation); 1236 if (opt->hopopt && !v6_cork->opt->hopopt) 1237 return -ENOBUFS; 1238 1239 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt, 1240 sk->sk_allocation); 1241 if (opt->srcrt && !v6_cork->opt->srcrt) 1242 return -ENOBUFS; 1243 1244 /* need source address above miyazawa*/ 1245 } 1246 dst_hold(&rt->dst); 1247 cork->base.dst = &rt->dst; 1248 cork->fl.u.ip6 = *fl6; 1249 v6_cork->hop_limit = ipc6->hlimit; 1250 v6_cork->tclass = ipc6->tclass; 1251 if (rt->dst.flags & DST_XFRM_TUNNEL) 1252 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? 1253 rt->dst.dev->mtu : dst_mtu(&rt->dst); 1254 else 1255 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? 
1256 rt->dst.dev->mtu : dst_mtu(rt->dst.path); 1257 if (np->frag_size < mtu) { 1258 if (np->frag_size) 1259 mtu = np->frag_size; 1260 } 1261 cork->base.fragsize = mtu; 1262 if (dst_allfrag(rt->dst.path)) 1263 cork->base.flags |= IPCORK_ALLFRAG; 1264 cork->base.length = 0; 1265 1266 return 0; 1267 } 1268 1269 static int __ip6_append_data(struct sock *sk, 1270 struct flowi6 *fl6, 1271 struct sk_buff_head *queue, 1272 struct inet_cork *cork, 1273 struct inet6_cork *v6_cork, 1274 struct page_frag *pfrag, 1275 int getfrag(void *from, char *to, int offset, 1276 int len, int odd, struct sk_buff *skb), 1277 void *from, int length, int transhdrlen, 1278 unsigned int flags, struct ipcm6_cookie *ipc6, 1279 const struct sockcm_cookie *sockc) 1280 { 1281 struct sk_buff *skb, *skb_prev = NULL; 1282 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu; 1283 int exthdrlen = 0; 1284 int dst_exthdrlen = 0; 1285 int hh_len; 1286 int copy; 1287 int err; 1288 int offset = 0; 1289 __u8 tx_flags = 0; 1290 u32 tskey = 0; 1291 struct rt6_info *rt = (struct rt6_info *)cork->dst; 1292 struct ipv6_txoptions *opt = v6_cork->opt; 1293 int csummode = CHECKSUM_NONE; 1294 unsigned int maxnonfragsize, headersize; 1295 1296 skb = skb_peek_tail(queue); 1297 if (!skb) { 1298 exthdrlen = opt ? opt->opt_flen : 0; 1299 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len; 1300 } 1301 1302 mtu = cork->fragsize; 1303 orig_mtu = mtu; 1304 1305 hh_len = LL_RESERVED_SPACE(rt->dst.dev); 1306 1307 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + 1308 (opt ? opt->opt_nflen : 0); 1309 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - 1310 sizeof(struct frag_hdr); 1311 1312 headersize = sizeof(struct ipv6hdr) + 1313 (opt ? opt->opt_flen + opt->opt_nflen : 0) + 1314 (dst_allfrag(&rt->dst) ? 
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* With IPV6_DONTFRAG set on a datagram socket, report the path
	 * MTU to userspace (rxpmtu queue) and fail with EMSGSIZE rather
	 * than fragmenting.
	 */
	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	/* when DF is ignored we may build up to the maximum payload */
	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		ipv6_local_error(sk, EMSGSIZE, fl6,
				 mtu - headersize +
				 sizeof(struct ipv6hdr));
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length < mtu - headersize &&
	    !(flags & MSG_MORE) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	/* hardware/software tx timestamping; OPT_ID tags the datagram */
	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
		sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
			tskey = sk->sk_tskey++;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	/* UFO fast path: hand the whole chunk to the GSO machinery when
	 * the device supports it (checksum required for SKB_GSO_UDP).
	 */
	if (((length > mtu) ||
	     (skb && skb_is_gso(skb))) &&
	    (sk->sk_protocol == IPPROTO_UDP) &&
	    (rt->dst.dev->features & NETIF_F_UFO) &&
	    (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) {
		err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
					  hh_len, fragheaderlen, exthdrlen,
					  transhdrlen, mtu, flags, fl6);
		if (err)
			goto error;
		return 0;
	}

	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				/* first skb of the message: normal charge */
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				/* later fragments: soft-limit wmem to 2x
				 * sndbuf instead of blocking
				 */
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = tx_flags;
			tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;

			/*
			 * Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/* move the 8-byte-alignment overhang from the
				 * previous skb into this one, fixing up its
				 * checksum
				 */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;

			if (copy < 0) {
				/* guards against out-of-bounds writes when
				 * the header sizes exceed datalen
				 */
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			/* no scatter/gather: copy into the linear area */
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			/* scatter/gather: copy into (or coalesce with) the
			 * socket's page fragment and attach it as a frag
			 */
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}

	return 0;

error_efault:
	err = -EFAULT;
error:
	/* undo the length accounting for the part we failed to queue */
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

/*
 * Queue data on sk->sk_write_queue for later transmission by
 * ip6_push_pending_frames().  The first call of a cork sequence sets up
 * the cork state (route, options, MTU); subsequent calls only append.
 */
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags,
		    const struct sockcm_cookie *sockc)
{
	struct inet_sock *inet
	= inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	/* MSG_PROBE only probes the path (e.g. for MTU), sends nothing */
	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt, fl6);
		if (err)
			return err;

		/* fragmentable extension headers count as payload here */
		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		/* already corked: reuse the flow recorded at cork time */
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6, sockc);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

/* Free the duplicated options and drop the route reference taken at
 * cork setup time.
 */
static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}

/*
 * Collapse the queued fragments into one skb (tail skbs become
 * frag_list members), push the extension headers, and fill in the IPv6
 * header.  Releases the cork state; returns the finished skb, or NULL
 * when the queue was empty.
 */
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		/* memory accounting now handled through the head skb */
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	/* a routing header may rewrite the on-wire destination address */
	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					np->autoflowlabel, fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

/* Transmit a fully built skb; qdisc return codes are mapped to errnos */
int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
IP6_INC_STATS(net, rt->rt6i_idev, 1725 IPSTATS_MIB_OUTDISCARDS); 1726 } 1727 1728 return err; 1729 } 1730 1731 int ip6_push_pending_frames(struct sock *sk) 1732 { 1733 struct sk_buff *skb; 1734 1735 skb = ip6_finish_skb(sk); 1736 if (!skb) 1737 return 0; 1738 1739 return ip6_send_skb(skb); 1740 } 1741 EXPORT_SYMBOL_GPL(ip6_push_pending_frames); 1742 1743 static void __ip6_flush_pending_frames(struct sock *sk, 1744 struct sk_buff_head *queue, 1745 struct inet_cork_full *cork, 1746 struct inet6_cork *v6_cork) 1747 { 1748 struct sk_buff *skb; 1749 1750 while ((skb = __skb_dequeue_tail(queue)) != NULL) { 1751 if (skb_dst(skb)) 1752 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)), 1753 IPSTATS_MIB_OUTDISCARDS); 1754 kfree_skb(skb); 1755 } 1756 1757 ip6_cork_release(cork, v6_cork); 1758 } 1759 1760 void ip6_flush_pending_frames(struct sock *sk) 1761 { 1762 __ip6_flush_pending_frames(sk, &sk->sk_write_queue, 1763 &inet_sk(sk)->cork, &inet6_sk(sk)->cork); 1764 } 1765 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames); 1766 1767 struct sk_buff *ip6_make_skb(struct sock *sk, 1768 int getfrag(void *from, char *to, int offset, 1769 int len, int odd, struct sk_buff *skb), 1770 void *from, int length, int transhdrlen, 1771 struct ipcm6_cookie *ipc6, struct flowi6 *fl6, 1772 struct rt6_info *rt, unsigned int flags, 1773 const struct sockcm_cookie *sockc) 1774 { 1775 struct inet_cork_full cork; 1776 struct inet6_cork v6_cork; 1777 struct sk_buff_head queue; 1778 int exthdrlen = (ipc6->opt ? 
ipc6->opt->opt_flen : 0); 1779 int err; 1780 1781 if (flags & MSG_PROBE) 1782 return NULL; 1783 1784 __skb_queue_head_init(&queue); 1785 1786 cork.base.flags = 0; 1787 cork.base.addr = 0; 1788 cork.base.opt = NULL; 1789 v6_cork.opt = NULL; 1790 err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6); 1791 if (err) 1792 return ERR_PTR(err); 1793 1794 if (ipc6->dontfrag < 0) 1795 ipc6->dontfrag = inet6_sk(sk)->dontfrag; 1796 1797 err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork, 1798 ¤t->task_frag, getfrag, from, 1799 length + exthdrlen, transhdrlen + exthdrlen, 1800 flags, ipc6, sockc); 1801 if (err) { 1802 __ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork); 1803 return ERR_PTR(err); 1804 } 1805 1806 return __ip6_make_skb(sk, &queue, &cork, &v6_cork); 1807 } 1808