/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>

int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));

int __ip6_local_out(struct sk_buff *skb)
{
	int len;

	len = skb->len - sizeof(struct ipv6hdr);
	if (len > IPV6_MAXPLEN)
		len = 0;
	ipv6_hdr(skb)->payload_len = htons(len);

	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
		       skb_dst(skb)->dev, dst_output);
}

int ip6_local_out(struct sk_buff *skb)
{
	int err;

	err = __ip6_local_out(skb);
	if (likely(err == 1))
		err = dst_output(skb);

	return err;
}
EXPORT_SYMBOL_GPL(ip6_local_out);

/* dev_loopback_xmit for use with netfilter. */
static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
{
	skb_reset_mac_header(newskb);
	__skb_pull(newskb, skb_network_offset(newskb));
	newskb->pkt_type = PACKET_LOOPBACK;
	newskb->ip_summed = CHECKSUM_UNNECESSARY;
	WARN_ON(!skb_dst(newskb));

	netif_rx_ni(newskb);
	return 0;
}

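/*
 * Final transmit step: when the destination is multicast, loop a copy
 * of the packet back to local listeners where required, then hand the
 * skb to the neighbour layer for the link-level transmit.
 */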
static int ip6_finish_output2(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
		    ((mroute6_socket(dev_net(dev), skb) &&
		      !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			 * is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					newskb, NULL, newskb->dev,
					ip6_dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(dev_net(dev), idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
				 skb->len);
	}

	rcu_read_lock();
	neigh = dst_get_neighbour_noref(dst);
	if (neigh) {
		int res = neigh_output(neigh, skb);

		rcu_read_unlock();
		return res;
	}
	rcu_read_unlock();
	IP6_INC_STATS_BH(dev_net(dst->dev),
			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

static int ip6_finish_output(struct sk_buff *skb)
{
	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)))
		return ip6_fragment(skb, ip6_finish_output2);
	else
		return ip6_finish_output2(skb);
}

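/*
 * dst output method for IPv6: drop the packet early if IPv6 is
 * administratively disabled on the output device, otherwise run the
 * netfilter POST_ROUTING hook (skipped for rerouted packets) on the
 * way to ip6_finish_output(), which fragments when needed.
 */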
int ip6_output(struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(dev_net(dev), idev,
			      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

/*
 *	xmit an sk_buff (used by TCP, SCTP and DCCP)
 */

int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now).
		 * MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (skb2 == NULL) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			kfree_skb(skb);
			skb = skb2;
			skb_set_owner_w(skb, sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
				 IPSTATS_MIB_OUT, skb->len);
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
			       dst->dev, dst_output);
	}

	if (net_ratelimit())
		printk(KERN_DEBUG "IPv6: sending pkt_too_big to self\n");
	skb->dev = dst->dev;
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

/*
 *	To avoid extra problems ND packets are sent through this routine.
 *	It is code duplication, but we really want to avoid extra checks,
 *	since ipv6_build_header is used by TCP (which is performance
 *	critical for us).
 */

int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
	       const struct in6_addr *saddr, const struct in6_addr *daddr,
	       int proto, int len)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct ipv6hdr *hdr;

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	skb_reset_network_header(skb);
	skb_put(skb, sizeof(struct ipv6hdr));
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = htonl(0x60000000);

	hdr->payload_len = htons(len);
	hdr->nexthdr = proto;
	hdr->hop_limit = np->hop_limit;

	hdr->saddr = *saddr;
	hdr->daddr = *daddr;

	return 0;
}

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* Pass unicast neighbour discovery messages destined
			 * to the proxied address to the input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct sk_buff *skb)
{
	return dst_output(skb);
}

int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	struct neighbour *n;
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (opt->ra) {
		u8 *ptr = skb_network_header(skb) + opt->ra;
		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
			return 0;
	}

	/*
	 *	check and decrement hop limit
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			IP6_INC_STATS(net, ip6_dst_idev(dst),
				      IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	 * send redirects to source routed frames.
	 * We don't send redirects to frames decapsulated from IPsec.
	 */
	n = dst_get_neighbour_noref(dst);
	if (skb->dev == dst->dev && n && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = (struct rt6_info *) dst;
		if ((rt->rt6i_flags & RTF_GATEWAY))
			target = (struct in6_addr *)&n->primary_key;
		else
			target = &hdr->daddr;

		if (!rt->rt6i_peer)
			rt6_bind_peer(rt, 1);

		/* Limit redirects both by destination (here)
		 * and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
			ndisc_send_redirect(skb, n, target);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = dst_mtu(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (skb->len > mtu && !skb_is_gso(skb)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
		IP6_INC_STATS_BH(net,
				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
    defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
	to->nf_trace = from->nf_trace;
#endif
	skb_copy_secmark(to, from);
}

int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
{
	u16 offset = sizeof(struct ipv6hdr);
	struct ipv6_opt_hdr *exthdr =
				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
	unsigned int packet_len = skb->tail - skb->network_header;
	int found_rhdr = 0;
	*nexthdr = &ipv6_hdr(skb)->nexthdr;

	while (offset + 1 <= packet_len) {

		switch (**nexthdr) {

		case NEXTHDR_HOP:
			break;
		case NEXTHDR_ROUTING:
			found_rhdr = 1;
			break;
		case NEXTHDR_DEST:
#if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
				break;
#endif
			if (found_rhdr)
				return offset;
			break;
		default:
			return offset;
		}

		offset += ipv6_optlen(exthdr);
		*nexthdr = &exthdr->nexthdr;
		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
						 offset);
	}

	return offset;
}

void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
{
	static atomic_t ipv6_fragmentation_id;
	int old, new;

	if (rt && !(rt->dst.flags & DST_NOPEER)) {
		struct inet_peer *peer;

		if (!rt->rt6i_peer)
			rt6_bind_peer(rt, 1);
		peer = rt->rt6i_peer;
		if (peer) {
			fhdr->identification = htonl(inet_getid(peer, 0));
			return;
		}
	}
	do {
		old = atomic_read(&ipv6_fragmentation_id);
		new = old + 1;
		if (!new)
			new = 1;
	} while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
	fhdr->identification = htonl(new);
}

int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id = 0;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;
	struct net *net = dev_net(skb_dst(skb)->dev);

	hlen = ip6_find_1stfragopt(skb, &prevhdr);
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (!skb->local_df && skb->len > mtu) {
		skb->dev = skb_dst(skb)->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	mtu -= hlen + sizeof(struct frag_hdr);

	if (skb_has_frag_list(skb)) {
		int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < hlen)
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			return -ENOMEM;
		}

		__skb_pull(skb, hlen);
		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		ipv6_select_ident(fh, rt);
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		frag_id = fh->identification;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		dst_hold(&rt->dst);

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down.
			 */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next != NULL)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			dst_release(&rt->dst);
			return 0;
		}

		while (frag) {
			skb = frag->next;
			kfree_skb(frag);
			frag = skb;
		}

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		dst_release(&rt->dst);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	*prevhdr = NEXTHDR_FRAGMENT;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}
		/*
		 *	Allocate buffer.
		 */

		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				      hroom + troom, GFP_ATOMIC)) == NULL) {
			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_FRAGFAILS);
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		if (!frag_id) {
			ipv6_select_ident(fh, rt);
			frag_id = fh->identification;
		} else
			fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
			BUG();
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
		 */
		err = output(frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	kfree_skb(skb);
	return err;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (!dst)
		goto out;

	/* Yes, checking route validity in the unconnected case is not
	 * very simple. Take into account that we do not support routing
	 * by source, TOS, and MSG_DONTROUTE.		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
	struct net *net = sock_net(sk);
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
#endif
	int err;

	if (*dst == NULL)
		*dst = ip6_route_output(net, sk, fl6);

	if ((err = (*dst)->error))
		goto out_err_release;

	if (ipv6_addr_any(&fl6->saddr)) {
		struct rt6_info *rt = (struct rt6_info *) *dst;
		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;
	}

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rcu_read_lock();
	n = dst_get_neighbour_noref(*dst);
	if (n && !(n->nud_state & NUD_VALID)) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		rcu_read_unlock();
		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			if ((err = (*dst)->error))
				goto out_err_release;
		}
	} else {
		rcu_read_unlock();
	}
#endif

	return 0;

out_err_release:
	if (err == -ENETUNREACH)
		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	dst_release(*dst);
	*dst = NULL;
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer-encoded
 *	error code.
 */
struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst,
				      bool can_sleep)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@can_sleep: we are in a sleepable context
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	It returns a valid dst pointer on success, or a pointer-encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool can_sleep)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
	int err;

	dst = ip6_sk_dst_check(sk, dst, fl6);

	err = ip6_dst_lookup_tail(sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;
	if (can_sleep)
		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;

	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline int ip6_ufo_append_data(struct sock *sk,
			int getfrag(void *from, char *to, int offset, int len,
			int odd, struct sk_buff *skb),
			void *from, int length, int hh_len, int fragheaderlen,
			int transhdrlen, int mtu, unsigned int flags,
			struct rt6_info *rt)

{
	struct sk_buff *skb;
	int err;

	/* There is support for UDP large send offload by network
	 * devices, so create one single skb packet containing the
	 * complete udp datagram
	 */
	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
		skb = sock_alloc_send_skb(sk,
			hh_len + fragheaderlen + transhdrlen + 20,
			(flags & MSG_DONTWAIT), &err);
		if (skb == NULL)
			return err;

		/* reserve space for Hardware header */
		skb_reserve(skb, hh_len);

		/* create space for UDP/IP header */
		skb_put(skb, fragheaderlen + transhdrlen);

		/* initialize network header pointer */
		skb_reset_network_header(skb);

		/* initialize protocol header pointer */
		skb->transport_header = skb->network_header + fragheaderlen;

		skb->ip_summed = CHECKSUM_PARTIAL;
		skb->csum = 0;
	}

	err = skb_append_datato_frags(sk, skb, getfrag, from,
				      (length - transhdrlen));
	if (!err) {
		struct frag_hdr fhdr;

		/* Specify the length of each IPv6 datagram fragment.
		 * It has to be a multiple of 8.
		 */
		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
					     sizeof(struct frag_hdr)) & ~7;
		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
		ipv6_select_ident(&fhdr, rt);
		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
		__skb_queue_tail(&sk->sk_write_queue, skb);

		return 0;
	}
	/* There is not enough support to do UDP LSO,
	 * so follow the normal path
	 */
	kfree_skb(skb);

	return err;
}

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
	int offset, int len, int odd, struct sk_buff *skb),
	void *from, int length, int transhdrlen,
	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
	struct rt6_info *rt, unsigned int flags, int dontfrag)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct inet_cork *cork;
	struct sk_buff *skb;
	unsigned int maxfraglen, fragheaderlen;
	int exthdrlen;
	int dst_exthdrlen;
	int hh_len;
	int mtu;
	int copy;
	int err;
	int offset = 0;
	int csummode = CHECKSUM_NONE;
	__u8 tx_flags = 0;

	if (flags&MSG_PROBE)
		return 0;
	cork = &inet->cork.base;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		if (opt) {
			if (WARN_ON(np->cork.opt))
				return -EINVAL;

			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
			if (unlikely(np->cork.opt == NULL))
				return -ENOBUFS;

			np->cork.opt->tot_len = opt->tot_len;
			np->cork.opt->opt_flen = opt->opt_flen;
			np->cork.opt->opt_nflen = opt->opt_nflen;

			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
							    sk->sk_allocation);
			if (opt->dst0opt && !np->cork.opt->dst0opt)
				return -ENOBUFS;

			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
							    sk->sk_allocation);
			if (opt->dst1opt && !np->cork.opt->dst1opt)
				return -ENOBUFS;

			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
							   sk->sk_allocation);
			if (opt->hopopt && !np->cork.opt->hopopt)
				return -ENOBUFS;

			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
							    sk->sk_allocation);
			if (opt->srcrt && !np->cork.opt->srcrt)
				return -ENOBUFS;

			/* need source address above miyazawa */
		}
		dst_hold(&rt->dst);
		cork->dst = &rt->dst;
		inet->cork.fl.u.ip6 = *fl6;
		np->cork.hop_limit = hlimit;
		np->cork.tclass = tclass;
		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
		if (np->frag_size < mtu) {
			if (np->frag_size)
				mtu = np->frag_size;
		}
		cork->fragsize = mtu;
		if (dst_allfrag(rt->dst.path))
			cork->flags |= IPCORK_ALLFRAG;
		cork->length = 0;
		sk->sk_sndmsg_page = NULL;
		sk->sk_sndmsg_off = 0;
		exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
		length += exthdrlen;
		transhdrlen += exthdrlen;
		dst_exthdrlen = rt->dst.header_len;
	} else {
		rt = (struct rt6_info *)cork->dst;
		fl6 = &inet->cork.fl.u.ip6;
		opt = np->cork.opt;
		transhdrlen = 0;
		exthdrlen = 0;
		dst_exthdrlen = 0;
		mtu = cork->fragsize;
	}

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);

	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
			ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
			return -EMSGSIZE;
		}
	}

	/* For UDP, check if TX timestamp is enabled */
	if (sk->sk_type == SOCK_DGRAM) {
		err = sock_tx_timestamp(sk, &tx_flags);
		if (err)
			goto error;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (length > mtu) {
		int proto = sk->sk_protocol;
		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)) {
			ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
			return -EMSGSIZE;
		}

		if (proto == IPPROTO_UDP &&
		    (rt->dst.dev->features & NETIF_F_UFO)) {

			err = ip6_ufo_append_data(sk, getfrag, from, length,
						  hh_len, fragheaderlen,
						  transhdrlen, mtu, flags, rt);
			if (err)
				goto error;
			return 0;
		}
	}

	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
			struct sk_buff *skb_prev;
alloc_new_skb:
			skb_prev = skb;

			/* There's no room in the current skb */
			if (skb_prev)
				fraggap = skb_prev->len - maxfraglen;
			else
				fraggap = 0;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;
			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen;

			fraglen = datalen + fragheaderlen;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			/*
			 * The last fragment gets additional space at tail.
			 * Note: we overallocate on fragments with MSG_MORE
			 * because we have no idea if we're the last one.
			 */
			if (datalen == length + fraggap)
				alloclen += rt->dst.trailer_len;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (atomic_read(&sk->sk_wmem_alloc) <=
				    2 * sk->sk_sndbuf)
					skb = sock_wmalloc(sk,
							   alloclen + hh_len, 1,
							   sk->sk_allocation);
				if (unlikely(skb == NULL))
					err = -ENOBUFS;
				else {
					/* Only the initial fragment
					 * is time stamped.
					 */
					tx_flags = 0;
				}
			}
			if (skb == NULL)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr));

			if (sk->sk_type == SOCK_DGRAM)
				skb_shinfo(skb)->tx_flags = tx_flags;

			/*
			 * Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen + dst_exthdrlen);
			skb_set_network_header(skb, exthdrlen + dst_exthdrlen);
			data += fragheaderlen + dst_exthdrlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			copy = datalen - transhdrlen - fraggap;

			if (copy < 0) {
				err = -EINVAL;
				kfree_skb(skb);
				goto error;
			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;
			csummode = CHECKSUM_NONE;

			/*
			 * Put the packet on the pending queue
			 */
			__skb_queue_tail(&sk->sk_write_queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
				    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
			struct page *page = sk->sk_sndmsg_page;
			int off = sk->sk_sndmsg_off;
			unsigned int left;

			if (page && (left = PAGE_SIZE - off) > 0) {
				if (copy >= left)
					copy = left;
				if (page != skb_frag_page(frag)) {
					if (i == MAX_SKB_FRAGS) {
						err = -EMSGSIZE;
						goto error;
					}
					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
					skb_frag_ref(skb, i);
					frag = &skb_shinfo(skb)->frags[i];
				}
			} else if (i < MAX_SKB_FRAGS) {
				if (copy > PAGE_SIZE)
					copy = PAGE_SIZE;
				page = alloc_pages(sk->sk_allocation, 0);
				if (page == NULL) {
					err = -ENOMEM;
					goto error;
				}
				sk->sk_sndmsg_page = page;
				sk->sk_sndmsg_off = 0;

				skb_fill_page_desc(skb, i, page, 0, 0);
				frag = &skb_shinfo(skb)->frags[i];
			} else {
				err = -EMSGSIZE;
				goto error;
			}
			if (getfrag(from,
				    skb_frag_address(frag) + skb_frag_size(frag),
				    offset, copy, skb->len, skb) < 0) {
				err = -EFAULT;
				goto error;
			}
			sk->sk_sndmsg_off += copy;
			skb_frag_size_add(frag, copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			atomic_add(copy, &sk->sk_wmem_alloc);
		}
		offset += copy;
		length -= copy;
	}
	return 0;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	return err;
}

static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
{
	if (np->cork.opt) {
		kfree(np->cork.opt->dst0opt);
		kfree(np->cork.opt->dst1opt);
		kfree(np->cork.opt->hopopt);
		kfree(np->cork.opt->srcrt);
		kfree(np->cork.opt);
		np->cork.opt = NULL;
	}

	if (inet->cork.base.dst) {
		dst_release(inet->cork.base.dst);
		inet->cork.base.dst = NULL;
		inet->cork.base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = np->cork.opt;
	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;
	int err = 0;

	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	if (np->pmtudisc < IPV6_PMTUDISC_DO)
		skb->local_df = 1;

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	*(__be32 *)hdr = fl6->flowlabel |
			 htonl(0x60000000 | ((int)np->cork.tclass << 20));

	hdr->hop_limit = np->cork.hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
	}

	err = ip6_local_out(skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			goto error;
	}

out:
	ip6_cork_release(inet, np);
	return err;
error:
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	goto out;
}

void ip6_flush_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
}