/*
 *	IPv6 output functions
 *	Linux INET6 implementation
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	Based on linux/net/ipv4/ip_output.c
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Changes:
 *	A.N.Kuznetsov	:	arithmetic in fragmentation.
 *				extension headers are implemented.
 *				route changes now work.
 *				ip6_forward does not confuse sniffers.
 *				etc.
 *
 *	H. von Brand	:	Added missing #include <linux/string.h>
 *	Imran Patel	:	frag id should be in NBO
 *	Kazunori MIYAZAWA @USAGI
 *			:	add ip6_append_data and related functions
 *				for datagram xmit
 */

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/if_arp.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/route.h>
#include <linux/module.h>
#include <linux/slab.h>

#include <linux/bpf-cgroup.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv6.h>

#include <net/sock.h>
#include <net/snmp.h>

#include <net/ipv6.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/ip6_route.h>
#include <net/addrconf.h>
#include <net/rawv6.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/checksum.h>
#include <linux/mroute6.h>
#include <net/l3mdev.h>
#include <net/lwtunnel.h>

static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst->dev;
	struct neighbour *neigh;
	struct in6_addr *nexthop;
	int ret;

	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
					 &ipv6_hdr(skb)->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			 * is not supported in any case.
			 */
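			/* Loop a copy of the packet back to local listeners;
			 * the original skb continues toward the wire below.
			 */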
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			if (ipv6_hdr(skb)->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);

		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
		    IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
			return res;
	}

	rcu_read_lock_bh();
	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
	if (unlikely(!neigh))
		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
	if (!IS_ERR(neigh)) {
		sock_confirm_neigh(skb, neigh);
		ret = neigh_output(neigh, skb);
		rcu_read_unlock_bh();
		return ret;
	}
	rcu_read_unlock_bh();

	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
	kfree_skb(skb);
	return -EINVAL;
}

static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	int ret;

	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
	if (ret) {
		kfree_skb(skb);
		return ret;
	}

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		IPCB(skb)->flags |= IPSKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
	    dst_allfrag(skb_dst(skb)) ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);
	else
		return ip6_finish_output2(net, sk, skb);
}

int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct net_device *dev = skb_dst(skb)->dev;
	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

	skb->protocol = htons(ETH_P_IPV6);
	skb->dev = dev;

	if (unlikely(idev->cnf.disable_ipv6)) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
		return 0;
	}

	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			    net, sk, skb, NULL, dev,
			    ip6_finish_output,
			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
}

bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
{
	if (!np->autoflowlabel_set)
		return ip6_default_np_autolabel(net);
	else
		return np->autoflowlabel;
}

/*
 * xmit an sk_buff (used by TCP, SCTP and DCCP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
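 *
 * Returns: the netfilter/dst_output() verdict on success; on local
 * failure the skb is consumed and -ENOBUFS or -EMSGSIZE is returned.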
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass)
{
	struct net *net = sock_net(sk);
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int hlimit = -1;
	u32 mtu;

	if (opt) {
		unsigned int head_room;

		/* First: exthdrs may take lots of space (~8K for now);
		 * MAX_HEADER is not enough.
		 */
		head_room = opt->opt_nflen + opt->opt_flen;
		seg_len += head_room;
		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);

		if (skb_headroom(skb) < head_room) {
			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
			if (!skb2) {
				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return -ENOBUFS;
			}
			consume_skb(skb);
			skb = skb2;
			/* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
			 * it is safe to call in our context (socket lock not held)
			 */
			skb_set_owner_w(skb, (struct sock *)sk);
		}
		if (opt->opt_flen)
			ipv6_push_frag_opts(skb, opt, &proto);
		if (opt->opt_nflen)
			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
					     &fl6->saddr);
	}

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = np->hop_limit;
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, np), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = sk->sk_priority;
	skb->mark = mark;

	mtu = dst_mtu(dst);
	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
				 IPSTATS_MIB_OUT, skb->len);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb))
			return 0;

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			       net, (struct sock *)sk, skb, NULL, dst->dev,
			       dst_output);
	}

	skb->dev = dst->dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return -EMSGSIZE;
}
EXPORT_SYMBOL(ip6_xmit);

static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}

static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reactions involving a unicast neighbour
			 * discovery message destined to the proxied address,
			 * pass it to the input function.
			 */
			return 1;
		default:
			break;
		}
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
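	 * (RFC 3775, Mobility Support in IPv6).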
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}

static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);

	return dst_output(net, sk, skb);
}

static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
{
	if (skb->len <= mtu)
		return false;

	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
		return true;

	if (skb->ignore_df)
		return false;

	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
		return false;

	return true;
}

int ip6_forward(struct sk_buff *skb)
{
	struct inet6_dev *idev = __in6_dev_get_safely(skb->dev);
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	u32 mtu;

	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be a mistake; RA packets
	 *	cannot be fragmented, because there is no guarantee
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		/* Force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb(skb);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0)
			return ip6_input(skb);
		else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	 * send redirects to source routed frames.
	 * We don't send redirects to frames decapsulated from IPsec.
	 */
	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same,
		 *	send a redirect.
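		 *	The redirect target is the next-hop gateway for
		 *	gatewayed routes, otherwise the destination itself,
		 *	which must then be on-link.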
		 */

		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		 * and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	mtu = ip6_dst_mtu_forward(dst);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
drop:
	kfree_skb(skb);
	return -EINVAL;
}

static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_copy_secmark(to, from);
}

int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	struct ipv6hdr *tmp_hdr;
	struct frag_hdr *fh;
	unsigned int mtu, hlen, left, len;
	int hroom, troom;
	__be32 frag_id;
	int ptr, offset = 0, err = 0;
	u8 *prevhdr, nexthdr = 0;

	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
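	 * Such oversized packets are bounced with an ICMPv6 Packet Too Big
	 * instead (see fail_toobig below).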
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct sk_buff *frag2;

		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = 0;
		offset = 0;
		/* BUILD HEADER */

		*prevhdr = NEXTHDR_FRAGMENT;
		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
		if (!tmp_hdr) {
			err = -ENOMEM;
			goto fail;
		}
		frag = skb_shinfo(skb)->frag_list;
		skb_frag_list_init(skb);

		__skb_pull(skb, hlen);
		fh = __skb_push(skb, sizeof(struct frag_hdr));
		__skb_push(skb, hlen);
		skb_reset_network_header(skb);
		memcpy(skb_network_header(skb), tmp_hdr, hlen);

		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->frag_off = htons(IP6_MF);
		fh->identification = frag_id;

		first_len = skb_pagelen(skb);
		skb->data_len = first_len - skb_headlen(skb);
		skb->len = first_len;
		ipv6_hdr(skb)->payload_len = htons(first_len -
						   sizeof(struct ipv6hdr));

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down.
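			 * This is the frag-list fast path: each queued
			 * fragment already has the right geometry, so only
			 * its headers need to be built here.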
			 */
			if (frag) {
				frag->ip_summed = CHECKSUM_NONE;
				skb_reset_transport_header(frag);
				fh = __skb_push(frag, sizeof(struct frag_hdr));
				__skb_push(frag, hlen);
				skb_reset_network_header(frag);
				memcpy(skb_network_header(frag), tmp_hdr,
				       hlen);
				offset += skb->len - hlen - sizeof(struct frag_hdr);
				fh->nexthdr = nexthdr;
				fh->reserved = 0;
				fh->frag_off = htons(offset);
				if (frag->next)
					fh->frag_off |= htons(IP6_MF);
				fh->identification = frag_id;
				ipv6_hdr(frag)->payload_len =
						htons(frag->len -
						      sizeof(struct ipv6hdr));
				ip6_copy_metadata(frag, skb);
			}

			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !frag)
				break;

			skb = frag;
			frag = skb->next;
			skb->next = NULL;
		}

		kfree(tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			return 0;
		}

		kfree_skb_list(frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		return err;

slow_path_clean:
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	left = skb->len - hlen;		/* Space per frame */
	ptr = hlen;			/* Where to start from */

	/*
	 *	Fragment the datagram.
	 */

	troom = rt->dst.dev->needed_tailroom;

	/*
	 *	Keep copying data until we run out.
	 */
	while (left > 0) {
		u8 *fragnexthdr_offset;

		len = left;
		/* IF: it doesn't fit, use 'mtu' - the data space left */
		if (len > mtu)
			len = mtu;
		/* IF: we are not sending up to and including the packet end
		   then align the next start on an eight byte boundary */
		if (len < left) {
			len &= ~7;
		}

		/* Allocate buffer */
		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
				 hroom + troom, GFP_ATOMIC);
		if (!frag) {
			err = -ENOMEM;
			goto fail;
		}

		/*
		 *	Set up data on packet
		 */

		ip6_copy_metadata(frag, skb);
		skb_reserve(frag, hroom);
		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
		skb_reset_network_header(frag);
		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
		frag->transport_header = (frag->network_header + hlen +
					  sizeof(struct frag_hdr));

		/*
		 *	Charge the memory for the fragment to any owner
		 *	it might possess
		 */
		if (skb->sk)
			skb_set_owner_w(frag, skb->sk);

		/*
		 *	Copy the packet header into the new buffer.
		 */
		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);

		fragnexthdr_offset = skb_network_header(frag);
		fragnexthdr_offset += prevhdr - skb_network_header(skb);
		*fragnexthdr_offset = NEXTHDR_FRAGMENT;

		/*
		 *	Build fragment header.
		 */
		fh->nexthdr = nexthdr;
		fh->reserved = 0;
		fh->identification = frag_id;

		/*
		 *	Copy a block of the IP datagram.
		 */
		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
				     len));
		left -= len;

		fh->frag_off = htons(offset);
		if (left > 0)
			fh->frag_off |= htons(IP6_MF);
		ipv6_hdr(frag)->payload_len = htons(frag->len -
						    sizeof(struct ipv6hdr));

		ptr += len;
		offset += len;

		/*
		 *	Put this fragment into the sending queue.
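		 *	Each fragment is handed to the caller-supplied
		 *	output function as soon as it is built.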
		 */
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}

static inline int ip6_rt_check(const struct rt6key *rt_key,
			       const struct in6_addr *fl_addr,
			       const struct in6_addr *addr_cache)
{
	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
}

static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = (struct rt6_info *)dst;
	/* Yes, checking route validity in the not-connected
	 * case is not very simple. Take into account that
	 * we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which does not have this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
#endif
	    (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
	     (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}

static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
		struct rt6_info *rt;
		bool had_dst = *dst != NULL;

		if (!had_dst)
			*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
		err = ip6_route_get_saddr(net, rt ? rt->from : NULL,
					  &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if (!had_dst && (*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock_bh();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock_bh();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}

/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: network namespace the lookup is performed in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns zero on success, or a standard errno code on error.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);

/**
 *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
 *	@sk: socket which provides route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *
 *	This function performs a route lookup on the given flow.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
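 *
 *	The caller is responsible for releasing the returned dst.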
 */
struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	if (final_dst)
		fl6->daddr = *final_dst;

	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);

/**
 *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
 *	@sk: socket which provides the dst cache and route info
 *	@fl6: flow to lookup
 *	@final_dst: final destination address for ipsec lookup
 *	@connected: whether @sk is connected or not
 *
 *	This function performs a route lookup on the given flow with the
 *	possibility of using the cached route in the socket if it is valid.
 *	It will take the socket dst lock when operating on the dst cache.
 *	As a result, this function can only be used in process context.
 *
 *	In addition, for a connected socket, cache the dst in the socket
 *	if the current cache is not valid.
 *
 *	It returns a valid dst pointer on success, or a pointer encoded
 *	error code.
 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);

static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
}

static void ip6_append_data_mtu(unsigned int *mtu,
				int *maxfraglen,
				unsigned int fragheaderlen,
				struct sk_buff *skb,
				struct rt6_info *rt,
				unsigned int orig_mtu)
{
	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
		if (!skb) {
			/* first fragment, reserve header_len */
			*mtu = orig_mtu - rt->dst.header_len;

		} else {
			/*
			 * this fragment is not first, the headers
			 * space is regarded as data space.
			 */
			*mtu = orig_mtu;
		}
		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
			      + fragheaderlen - sizeof(struct frag_hdr);
	}
}

static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt, struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu;
	struct ipv6_txoptions *opt = ipc6->opt;

	/*
	 * setup for corking
	 */
	if (opt) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
		if (unlikely(!v6_cork->opt))
			return -ENOBUFS;

		v6_cork->opt->tot_len = sizeof(*opt);
		v6_cork->opt->opt_flen = opt->opt_flen;
		v6_cork->opt->opt_nflen = opt->opt_nflen;

		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
						    sk->sk_allocation);
		if (opt->dst0opt && !v6_cork->opt->dst0opt)
			return -ENOBUFS;

		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
						    sk->sk_allocation);
		if (opt->dst1opt && !v6_cork->opt->dst1opt)
			return -ENOBUFS;

		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
						   sk->sk_allocation);
		if (opt->hopopt && !v6_cork->opt->hopopt)
			return -ENOBUFS;

		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
						    sk->sk_allocation);
		if (opt->srcrt && !v6_cork->opt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa */
	}
	dst_hold(&rt->dst);
	cork->base.dst = &rt->dst;
	cork->fl.u.ip6 = *fl6;
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
	else
		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
	if (np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	if (mtu < IPV6_MIN_MTU)
		return -EINVAL;
	cork->base.fragsize = mtu;
	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
		cork->base.flags |= IPCORK_ALLFRAG;
	cork->base.length = 0;

	return 0;
}

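/* Append data to the queue of pending fragments, extending the tail skb
 * when possible and allocating new, MTU-sized skbs otherwise.  The
 * payload is copied in through the caller-supplied getfrag().
 */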
static int __ip6_append_data(struct sock *sk,
			     struct flowi6 *fl6,
			     struct sk_buff_head *queue,
			     struct inet_cork *cork,
			     struct inet6_cork *v6_cork,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     unsigned int flags, struct ipcm6_cookie *ipc6,
			     const struct sockcm_cookie *sockc)
{
	struct sk_buff *skb, *skb_prev = NULL;
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	__u8 tx_flags = 0;
	u32 tskey = 0;
	struct rt6_info *rt = (struct rt6_info *)cork->dst;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;

	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	mtu = cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);
	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     (dst_allfrag(&rt->dst) ?
		      sizeof(struct frag_hdr) : 0) +
		     rt->rt6i_nfheader_len;

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    !(flags & MSG_MORE) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
		sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
			tskey = sk->sk_tskey++;
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
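			 * (datalen is clamped just below so this fragment
			 * fits into a single MTU-sized skb.)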
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else
				alloclen = datalen + fragheaderlen;

			alloclen += dst_exthdrlen;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			alloclen += rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;

			/*
			 * We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloclen += sizeof(struct frag_hdr);

			copy = datalen - transhdrlen - fraggap;
			if (copy < 0) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk,
						alloclen + hh_len,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen + hh_len,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = tx_flags;
			tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap, 0);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    getfrag(from, data + transhdrlen, offset,
				    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			}

			offset += copy;
			length -= datalen - fraggap;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG)) {
			unsigned int off;

			off = skb->len;
			if (getfrag(from, skb_put(skb, copy),
						offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else {
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (getfrag(from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return err;
}

int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, int length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags,
		    const struct sockcm_cookie *sockc)
{
	struct inet_sock *inet = inet_sk(sk);
	struct ipv6_pinfo *np = inet6_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
				     ipc6, rt, fl6);
		if (err)
			return err;

		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		fl6 = &inet->cork.fl.u.ip6;
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
				 &np->cork, sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags, ipc6, sockc);
}
EXPORT_SYMBOL_GPL(ip6_append_data);

static void ip6_cork_release(struct inet_cork_full *cork,
			     struct inet6_cork *v6_cork)
{
	if (v6_cork->opt) {
		kfree(v6_cork->opt->dst0opt);
		kfree(v6_cork->opt->dst1opt);
		kfree(v6_cork->opt->hopopt);
		kfree(v6_cork->opt->srcrt);
		kfree(v6_cork->opt);
		v6_cork->opt = NULL;
	}

	if (cork->base.dst) {
		dst_release(cork->base.dst);
		cork->base.dst = NULL;
		cork->base.flags &= ~IPCORK_ALLFRAG;
	}
	memset(&cork->fl, 0, sizeof(cork->fl));
}

struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
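	/* Chain the remaining queued skbs onto the head skb's frag_list,
	 * transferring their length and truesize to the head.
	 */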
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);

	*final_dst = fl6->daddr;
	__skb_pull(skb, skb_network_header_len(skb));
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	skb_dst_set(skb, dst_clone(&rt->dst));
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));

		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}

int ip6_send_skb(struct sk_buff *skb)
{
	struct net *net = sock_net(skb->sk);
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	int err;

	err = ip6_local_out(net, skb->sk, skb);
	if (err) {
		if (err > 0)
			err = net_xmit_errno(err);
		if (err)
			IP6_INC_STATS(net, rt->rt6i_idev,
				      IPSTATS_MIB_OUTDISCARDS);
	}

	return err;
}

int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb;

	skb = ip6_finish_skb(sk);
	if (!skb)
		return 0;

	return ip6_send_skb(skb);
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);

static void __ip6_flush_pending_frames(struct sock *sk,
				       struct sk_buff_head *queue,
				       struct inet_cork_full *cork,
				       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb;

	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
		if (skb_dst(skb))
			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
				      IPSTATS_MIB_OUTDISCARDS);
		kfree_skb(skb);
	}

	ip6_cork_release(cork, v6_cork);
}

void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
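
/* Non-corked fast path: append all the data onto a private queue and
 * return one fully built packet, without touching sk->sk_write_queue.
 */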
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, int length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
			     struct rt6_info *rt, unsigned int flags,
			     const struct sockcm_cookie *sockc)
{
	struct inet_cork_full cork;
	struct inet6_cork v6_cork;
	struct sk_buff_head queue;
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	int err;

	if (flags & MSG_PROBE)
		return NULL;

	__skb_queue_head_init(&queue);

	cork.base.flags = 0;
	cork.base.addr = 0;
	cork.base.opt = NULL;
	cork.base.dst = NULL;
	v6_cork.opt = NULL;
	err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
	if (err) {
		ip6_cork_release(&cork, &v6_cork);
		return ERR_PTR(err);
	}
	if (ipc6->dontfrag < 0)
		ipc6->dontfrag = inet6_sk(sk)->dontfrag;

	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags, ipc6, sockc);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
}