1 /* 2 * IPv6 output functions 3 * Linux INET6 implementation 4 * 5 * Authors: 6 * Pedro Roque <roque@di.fc.ul.pt> 7 * 8 * Based on linux/net/ipv4/ip_output.c 9 * 10 * This program is free software; you can redistribute it and/or 11 * modify it under the terms of the GNU General Public License 12 * as published by the Free Software Foundation; either version 13 * 2 of the License, or (at your option) any later version. 14 * 15 * Changes: 16 * A.N.Kuznetsov : airthmetics in fragmentation. 17 * extension headers are implemented. 18 * route changes now work. 19 * ip6_forward does not confuse sniffers. 20 * etc. 21 * 22 * H. von Brand : Added missing #include <linux/string.h> 23 * Imran Patel : frag id should be in NBO 24 * Kazunori MIYAZAWA @USAGI 25 * : add ip6_append_data and related functions 26 * for datagram xmit 27 */ 28 29 #include <linux/errno.h> 30 #include <linux/kernel.h> 31 #include <linux/string.h> 32 #include <linux/socket.h> 33 #include <linux/net.h> 34 #include <linux/netdevice.h> 35 #include <linux/if_arp.h> 36 #include <linux/in6.h> 37 #include <linux/tcp.h> 38 #include <linux/route.h> 39 #include <linux/module.h> 40 #include <linux/slab.h> 41 42 #include <linux/bpf-cgroup.h> 43 #include <linux/netfilter.h> 44 #include <linux/netfilter_ipv6.h> 45 46 #include <net/sock.h> 47 #include <net/snmp.h> 48 49 #include <net/ipv6.h> 50 #include <net/ndisc.h> 51 #include <net/protocol.h> 52 #include <net/ip6_route.h> 53 #include <net/addrconf.h> 54 #include <net/rawv6.h> 55 #include <net/icmp.h> 56 #include <net/xfrm.h> 57 #include <net/checksum.h> 58 #include <linux/mroute6.h> 59 #include <net/l3mdev.h> 60 #include <net/lwtunnel.h> 61 62 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb) 63 { 64 struct dst_entry *dst = skb_dst(skb); 65 struct net_device *dev = dst->dev; 66 struct neighbour *neigh; 67 struct in6_addr *nexthop; 68 int ret; 69 70 if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) { 71 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); 72 73 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) && 74 ((mroute6_is_socket(net, skb) && 75 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) || 76 ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr, 77 &ipv6_hdr(skb)->saddr))) { 78 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC); 79 80 /* Do not check for IFF_ALLMULTI; multicast routing 81 is not supported in any case. 82 */ 83 if (newskb) 84 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING, 85 net, sk, newskb, NULL, newskb->dev, 86 dev_loopback_xmit); 87 88 if (ipv6_hdr(skb)->hop_limit == 0) { 89 IP6_INC_STATS(net, idev, 90 IPSTATS_MIB_OUTDISCARDS); 91 kfree_skb(skb); 92 return 0; 93 } 94 } 95 96 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len); 97 98 if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <= 99 IPV6_ADDR_SCOPE_NODELOCAL && 100 !(dev->flags & IFF_LOOPBACK)) { 101 kfree_skb(skb); 102 return 0; 103 } 104 } 105 106 if (lwtunnel_xmit_redirect(dst->lwtstate)) { 107 int res = lwtunnel_xmit(skb); 108 109 if (res < 0 || res == LWTUNNEL_XMIT_DONE) 110 return res; 111 } 112 113 rcu_read_lock_bh(); 114 nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr); 115 neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop); 116 if (unlikely(!neigh)) 117 neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false); 118 if (!IS_ERR(neigh)) { 119 sock_confirm_neigh(skb, neigh); 120 ret = neigh_output(neigh, skb, false); 121 rcu_read_unlock_bh(); 122 return ret; 123 } 124 rcu_read_unlock_bh(); 125 126 IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); 127 kfree_skb(skb); 128 return -EINVAL; 129 } 130 131 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) 132 { 133 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM) 134 /* Policy lookup after SNAT yielded a new policy */ 135 if (skb_dst(skb)->xfrm) { 136 IPCB(skb)->flags |= IPSKB_REROUTED; 137 return dst_output(net, sk, skb); 138 } 139 #endif 140 141 if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || 142 dst_allfrag(skb_dst(skb)) || 143 (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)) 144 return ip6_fragment(net, sk, skb, ip6_finish_output2); 145 else 146 return ip6_finish_output2(net, sk, skb); 147 } 148 149 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb) 150 { 151 int ret; 152 153 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb); 154 switch (ret) { 155 case NET_XMIT_SUCCESS: 156 return __ip6_finish_output(net, sk, skb); 157 case NET_XMIT_CN: 158 return __ip6_finish_output(net, sk, skb) ? : ret; 159 default: 160 kfree_skb(skb); 161 return ret; 162 } 163 } 164 165 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb) 166 { 167 struct net_device *dev = skb_dst(skb)->dev; 168 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); 169 170 skb->protocol = htons(ETH_P_IPV6); 171 skb->dev = dev; 172 173 if (unlikely(idev->cnf.disable_ipv6)) { 174 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS); 175 kfree_skb(skb); 176 return 0; 177 } 178 179 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, 180 net, sk, skb, NULL, dev, 181 ip6_finish_output, 182 !(IP6CB(skb)->flags & IP6SKB_REROUTED)); 183 } 184 185 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np) 186 { 187 if (!np->autoflowlabel_set) 188 return ip6_default_np_autolabel(net); 189 else 190 return np->autoflowlabel; 191 } 192 193 /* 194 * xmit an sk_buff (used by TCP, SCTP and DCCP) 195 * Note : socket lock is not held for SYNACK packets, but might be modified 196 * by calls to skb_set_owner_w() and ipv6_local_error(), 197 * which are using proper atomic operations or spinlocks. 198 */ 199 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, 200 __u32 mark, struct ipv6_txoptions *opt, int tclass) 201 { 202 struct net *net = sock_net(sk); 203 const struct ipv6_pinfo *np = inet6_sk(sk); 204 struct in6_addr *first_hop = &fl6->daddr; 205 struct dst_entry *dst = skb_dst(skb); 206 unsigned int head_room; 207 struct ipv6hdr *hdr; 208 u8 proto = fl6->flowi6_proto; 209 int seg_len = skb->len; 210 int hlimit = -1; 211 u32 mtu; 212 213 head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev); 214 if (opt) 215 head_room += opt->opt_nflen + opt->opt_flen; 216 217 if (unlikely(skb_headroom(skb) < head_room)) { 218 struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room); 219 if (!skb2) { 220 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 221 IPSTATS_MIB_OUTDISCARDS); 222 kfree_skb(skb); 223 return -ENOBUFS; 224 } 225 if (skb->sk) 226 skb_set_owner_w(skb2, skb->sk); 227 consume_skb(skb); 228 skb = skb2; 229 } 230 231 if (opt) { 232 seg_len += opt->opt_nflen + opt->opt_flen; 233 234 if (opt->opt_flen) 235 ipv6_push_frag_opts(skb, opt, &proto); 236 237 if (opt->opt_nflen) 238 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop, 239 &fl6->saddr); 240 } 241 242 skb_push(skb, sizeof(struct ipv6hdr)); 243 skb_reset_network_header(skb); 244 hdr = ipv6_hdr(skb); 245 246 /* 247 * Fill in the IPv6 header 248 */ 249 if (np) 250 hlimit = np->hop_limit; 251 if (hlimit < 0) 252 hlimit = ip6_dst_hoplimit(dst); 253 254 ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel, 255 ip6_autoflowlabel(net, np), fl6)); 256 257 hdr->payload_len = htons(seg_len); 258 hdr->nexthdr = proto; 259 hdr->hop_limit = hlimit; 260 261 hdr->saddr = fl6->saddr; 262 hdr->daddr = *first_hop; 263 264 skb->protocol = htons(ETH_P_IPV6); 265 skb->priority = sk->sk_priority; 266 skb->mark = mark; 267 268 mtu = dst_mtu(dst); 269 if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) { 270 IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)), 271 IPSTATS_MIB_OUT, skb->len); 272 273 /* if egress device is enslaved to an L3 master device pass the 274 * skb to its handler for processing 275 */ 276 skb = l3mdev_ip6_out((struct sock *)sk, skb); 277 if (unlikely(!skb)) 278 return 0; 279 280 /* hooks should never assume socket lock is held. 281 * we promote our socket to non const 282 */ 283 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, 284 net, (struct sock *)sk, skb, NULL, dst->dev, 285 dst_output); 286 } 287 288 skb->dev = dst->dev; 289 /* ipv6_local_error() does not require socket lock, 290 * we promote our socket to non const 291 */ 292 ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu); 293 294 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS); 295 kfree_skb(skb); 296 return -EMSGSIZE; 297 } 298 EXPORT_SYMBOL(ip6_xmit); 299 300 static int ip6_call_ra_chain(struct sk_buff *skb, int sel) 301 { 302 struct ip6_ra_chain *ra; 303 struct sock *last = NULL; 304 305 read_lock(&ip6_ra_lock); 306 for (ra = ip6_ra_chain; ra; ra = ra->next) { 307 struct sock *sk = ra->sk; 308 if (sk && ra->sel == sel && 309 (!sk->sk_bound_dev_if || 310 sk->sk_bound_dev_if == skb->dev->ifindex)) { 311 struct ipv6_pinfo *np = inet6_sk(sk); 312 313 if (np && np->rtalert_isolate && 314 !net_eq(sock_net(sk), dev_net(skb->dev))) { 315 continue; 316 } 317 if (last) { 318 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); 319 if (skb2) 320 rawv6_rcv(last, skb2); 321 } 322 last = sk; 323 } 324 } 325 326 if (last) { 327 rawv6_rcv(last, skb); 328 read_unlock(&ip6_ra_lock); 329 return 1; 330 } 331 read_unlock(&ip6_ra_lock); 332 return 0; 333 } 334 335 static int ip6_forward_proxy_check(struct sk_buff *skb) 336 { 337 struct ipv6hdr *hdr = ipv6_hdr(skb); 338 u8 nexthdr = hdr->nexthdr; 339 __be16 frag_off; 340 int offset; 341 342 if (ipv6_ext_hdr(nexthdr)) { 343 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off); 344 if (offset < 0) 345 return 0; 346 } else 347 offset = sizeof(struct ipv6hdr); 348 349 if (nexthdr == IPPROTO_ICMPV6) { 350 struct icmp6hdr *icmp6; 351 352 if (!pskb_may_pull(skb, (skb_network_header(skb) + 353 offset + 1 - skb->data))) 354 return 0; 355 356 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset); 357 358 switch (icmp6->icmp6_type) { 359 case NDISC_ROUTER_SOLICITATION: 360 case NDISC_ROUTER_ADVERTISEMENT: 361 case NDISC_NEIGHBOUR_SOLICITATION: 362 case NDISC_NEIGHBOUR_ADVERTISEMENT: 363 case NDISC_REDIRECT: 364 /* For reaction involving unicast neighbor discovery 365 * message destined to the proxied address, pass it to 366 * input function. 367 */ 368 return 1; 369 default: 370 break; 371 } 372 } 373 374 /* 375 * The proxying router can't forward traffic sent to a link-local 376 * address, so signal the sender and discard the packet. This 377 * behavior is clarified by the MIPv6 specification. 378 */ 379 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) { 380 dst_link_failure(skb); 381 return -1; 382 } 383 384 return 0; 385 } 386 387 static inline int ip6_forward_finish(struct net *net, struct sock *sk, 388 struct sk_buff *skb) 389 { 390 struct dst_entry *dst = skb_dst(skb); 391 392 __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS); 393 __IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len); 394 395 #ifdef CONFIG_NET_SWITCHDEV 396 if (skb->offload_l3_fwd_mark) { 397 consume_skb(skb); 398 return 0; 399 } 400 #endif 401 402 skb->tstamp = 0; 403 return dst_output(net, sk, skb); 404 } 405 406 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu) 407 { 408 if (skb->len <= mtu) 409 return false; 410 411 /* ipv6 conntrack defrag sets max_frag_size + ignore_df */ 412 if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu) 413 return true; 414 415 if (skb->ignore_df) 416 return false; 417 418 if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) 419 return false; 420 421 return true; 422 } 423 424 int ip6_forward(struct sk_buff *skb) 425 { 426 struct inet6_dev *idev = __in6_dev_get_safely(skb->dev); 427 struct dst_entry *dst = skb_dst(skb); 428 struct ipv6hdr *hdr = ipv6_hdr(skb); 429 struct inet6_skb_parm *opt = IP6CB(skb); 430 struct net *net = dev_net(dst->dev); 431 u32 mtu; 432 433 if (net->ipv6.devconf_all->forwarding == 0) 434 goto error; 435 436 if (skb->pkt_type != PACKET_HOST) 437 goto drop; 438 439 if (unlikely(skb->sk)) 440 goto drop; 441 442 if (skb_warn_if_lro(skb)) 443 goto drop; 444 445 if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) { 446 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); 447 goto drop; 448 } 449 450 skb_forward_csum(skb); 451 452 /* 453 * We DO NOT make any processing on 454 * RA packets, pushing them to user level AS IS 455 * without ane WARRANTY that application will be able 456 * to interpret them. The reason is that we 457 * cannot make anything clever here. 458 * 459 * We are not end-node, so that if packet contains 460 * AH/ESP, we cannot make anything. 461 * Defragmentation also would be mistake, RA packets 462 * cannot be fragmented, because there is no warranty 463 * that different fragments will go along one path. --ANK 464 */ 465 if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) { 466 if (ip6_call_ra_chain(skb, ntohs(opt->ra))) 467 return 0; 468 } 469 470 /* 471 * check and decrement ttl 472 */ 473 if (hdr->hop_limit <= 1) { 474 /* Force OUTPUT device used as source address */ 475 skb->dev = dst->dev; 476 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0); 477 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS); 478 479 kfree_skb(skb); 480 return -ETIMEDOUT; 481 } 482 483 /* XXX: idev->cnf.proxy_ndp? */ 484 if (net->ipv6.devconf_all->proxy_ndp && 485 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) { 486 int proxied = ip6_forward_proxy_check(skb); 487 if (proxied > 0) 488 return ip6_input(skb); 489 else if (proxied < 0) { 490 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); 491 goto drop; 492 } 493 } 494 495 if (!xfrm6_route_forward(skb)) { 496 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS); 497 goto drop; 498 } 499 dst = skb_dst(skb); 500 501 /* IPv6 specs say nothing about it, but it is clear that we cannot 502 send redirects to source routed frames. 503 We don't send redirects to frames decapsulated from IPsec. 504 */ 505 if (IP6CB(skb)->iif == dst->dev->ifindex && 506 opt->srcrt == 0 && !skb_sec_path(skb)) { 507 struct in6_addr *target = NULL; 508 struct inet_peer *peer; 509 struct rt6_info *rt; 510 511 /* 512 * incoming and outgoing devices are the same 513 * send a redirect. 514 */ 515 516 rt = (struct rt6_info *) dst; 517 if (rt->rt6i_flags & RTF_GATEWAY) 518 target = &rt->rt6i_gateway; 519 else 520 target = &hdr->daddr; 521 522 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1); 523 524 /* Limit redirects both by destination (here) 525 and by source (inside ndisc_send_redirect) 526 */ 527 if (inet_peer_xrlim_allow(peer, 1*HZ)) 528 ndisc_send_redirect(skb, target); 529 if (peer) 530 inet_putpeer(peer); 531 } else { 532 int addrtype = ipv6_addr_type(&hdr->saddr); 533 534 /* This check is security critical. */ 535 if (addrtype == IPV6_ADDR_ANY || 536 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK)) 537 goto error; 538 if (addrtype & IPV6_ADDR_LINKLOCAL) { 539 icmpv6_send(skb, ICMPV6_DEST_UNREACH, 540 ICMPV6_NOT_NEIGHBOUR, 0); 541 goto error; 542 } 543 } 544 545 mtu = ip6_dst_mtu_forward(dst); 546 if (mtu < IPV6_MIN_MTU) 547 mtu = IPV6_MIN_MTU; 548 549 if (ip6_pkt_too_big(skb, mtu)) { 550 /* Again, force OUTPUT device used as source address */ 551 skb->dev = dst->dev; 552 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 553 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS); 554 __IP6_INC_STATS(net, ip6_dst_idev(dst), 555 IPSTATS_MIB_FRAGFAILS); 556 kfree_skb(skb); 557 return -EMSGSIZE; 558 } 559 560 if (skb_cow(skb, dst->dev->hard_header_len)) { 561 __IP6_INC_STATS(net, ip6_dst_idev(dst), 562 IPSTATS_MIB_OUTDISCARDS); 563 goto drop; 564 } 565 566 hdr = ipv6_hdr(skb); 567 568 /* Mangling hops number delayed to point after skb COW */ 569 570 hdr->hop_limit--; 571 572 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, 573 net, NULL, skb, skb->dev, dst->dev, 574 ip6_forward_finish); 575 576 error: 577 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS); 578 drop: 579 kfree_skb(skb); 580 return -EINVAL; 581 } 582 583 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from) 584 { 585 to->pkt_type = from->pkt_type; 586 to->priority = from->priority; 587 to->protocol = from->protocol; 588 skb_dst_drop(to); 589 skb_dst_set(to, dst_clone(skb_dst(from))); 590 to->dev = from->dev; 591 to->mark = from->mark; 592 593 skb_copy_hash(to, from); 594 595 #ifdef CONFIG_NET_SCHED 596 to->tc_index = from->tc_index; 597 #endif 598 nf_copy(to, from); 599 skb_ext_copy(to, from); 600 skb_copy_secmark(to, from); 601 } 602 603 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr, 604 u8 nexthdr, __be32 frag_id, 605 struct ip6_fraglist_iter *iter) 606 { 607 unsigned int first_len; 608 struct frag_hdr *fh; 609 610 /* BUILD HEADER */ 611 *prevhdr = NEXTHDR_FRAGMENT; 612 iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC); 613 if (!iter->tmp_hdr) 614 return -ENOMEM; 615 616 iter->frag_list = skb_shinfo(skb)->frag_list; 617 iter->frag = iter->frag_list; 618 skb_frag_list_init(skb); 619 620 iter->offset = 0; 621 iter->hlen = hlen; 622 iter->frag_id = frag_id; 623 iter->nexthdr = nexthdr; 624 625 __skb_pull(skb, hlen); 626 fh = __skb_push(skb, sizeof(struct frag_hdr)); 627 __skb_push(skb, hlen); 628 skb_reset_network_header(skb); 629 memcpy(skb_network_header(skb), iter->tmp_hdr, hlen); 630 631 fh->nexthdr = nexthdr; 632 fh->reserved = 0; 633 fh->frag_off = htons(IP6_MF); 634 fh->identification = frag_id; 635 636 first_len = skb_pagelen(skb); 637 skb->data_len = first_len - skb_headlen(skb); 638 skb->len = first_len; 639 ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr)); 640 641 return 0; 642 } 643 EXPORT_SYMBOL(ip6_fraglist_init); 644 645 void ip6_fraglist_prepare(struct sk_buff *skb, 646 struct ip6_fraglist_iter *iter) 647 { 648 struct sk_buff *frag = iter->frag; 649 unsigned int hlen = iter->hlen; 650 struct frag_hdr *fh; 651 652 frag->ip_summed = CHECKSUM_NONE; 653 skb_reset_transport_header(frag); 654 fh = __skb_push(frag, sizeof(struct frag_hdr)); 655 __skb_push(frag, hlen); 656 skb_reset_network_header(frag); 657 memcpy(skb_network_header(frag), iter->tmp_hdr, hlen); 658 iter->offset += skb->len - hlen - sizeof(struct frag_hdr); 659 fh->nexthdr = iter->nexthdr; 660 fh->reserved = 0; 661 fh->frag_off = htons(iter->offset); 662 if (frag->next) 663 fh->frag_off |= htons(IP6_MF); 664 fh->identification = iter->frag_id; 665 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); 666 ip6_copy_metadata(frag, skb); 667 } 668 EXPORT_SYMBOL(ip6_fraglist_prepare); 669 670 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu, 671 unsigned short needed_tailroom, int hdr_room, u8 *prevhdr, 672 u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state) 673 { 674 state->prevhdr = prevhdr; 675 state->nexthdr = nexthdr; 676 state->frag_id = frag_id; 677 678 state->hlen = hlen; 679 state->mtu = mtu; 680 681 state->left = skb->len - hlen; /* Space per frame */ 682 state->ptr = hlen; /* Where to start from */ 683 684 state->hroom = hdr_room; 685 state->troom = needed_tailroom; 686 687 state->offset = 0; 688 } 689 EXPORT_SYMBOL(ip6_frag_init); 690 691 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state) 692 { 693 u8 *prevhdr = state->prevhdr, *fragnexthdr_offset; 694 struct sk_buff *frag; 695 struct frag_hdr *fh; 696 unsigned int len; 697 698 len = state->left; 699 /* IF: it doesn't fit, use 'mtu' - the data space left */ 700 if (len > state->mtu) 701 len = state->mtu; 702 /* IF: we are not sending up to and including the packet end 703 then align the next start on an eight byte boundary */ 704 if (len < state->left) 705 len &= ~7; 706 707 /* Allocate buffer */ 708 frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) + 709 state->hroom + state->troom, GFP_ATOMIC); 710 if (!frag) 711 return ERR_PTR(-ENOMEM); 712 713 /* 714 * Set up data on packet 715 */ 716 717 ip6_copy_metadata(frag, skb); 718 skb_reserve(frag, state->hroom); 719 skb_put(frag, len + state->hlen + sizeof(struct frag_hdr)); 720 skb_reset_network_header(frag); 721 fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen); 722 frag->transport_header = (frag->network_header + state->hlen + 723 sizeof(struct frag_hdr)); 724 725 /* 726 * Charge the memory for the fragment to any owner 727 * it might possess 728 */ 729 if (skb->sk) 730 skb_set_owner_w(frag, skb->sk); 731 732 /* 733 * Copy the packet header into the new buffer. 734 */ 735 skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen); 736 737 fragnexthdr_offset = skb_network_header(frag); 738 fragnexthdr_offset += prevhdr - skb_network_header(skb); 739 *fragnexthdr_offset = NEXTHDR_FRAGMENT; 740 741 /* 742 * Build fragment header. 743 */ 744 fh->nexthdr = state->nexthdr; 745 fh->reserved = 0; 746 fh->identification = state->frag_id; 747 748 /* 749 * Copy a block of the IP datagram. 750 */ 751 BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag), 752 len)); 753 state->left -= len; 754 755 fh->frag_off = htons(state->offset); 756 if (state->left > 0) 757 fh->frag_off |= htons(IP6_MF); 758 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr)); 759 760 state->ptr += len; 761 state->offset += len; 762 763 return frag; 764 } 765 EXPORT_SYMBOL(ip6_frag_next); 766 767 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, 768 int (*output)(struct net *, struct sock *, struct sk_buff *)) 769 { 770 struct sk_buff *frag; 771 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); 772 struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? 773 inet6_sk(skb->sk) : NULL; 774 struct ip6_frag_state state; 775 unsigned int mtu, hlen, nexthdr_offset; 776 int hroom, err = 0; 777 __be32 frag_id; 778 u8 *prevhdr, nexthdr = 0; 779 780 err = ip6_find_1stfragopt(skb, &prevhdr); 781 if (err < 0) 782 goto fail; 783 hlen = err; 784 nexthdr = *prevhdr; 785 nexthdr_offset = prevhdr - skb_network_header(skb); 786 787 mtu = ip6_skb_dst_mtu(skb); 788 789 /* We must not fragment if the socket is set to force MTU discovery 790 * or if the skb it not generated by a local socket. 791 */ 792 if (unlikely(!skb->ignore_df && skb->len > mtu)) 793 goto fail_toobig; 794 795 if (IP6CB(skb)->frag_max_size) { 796 if (IP6CB(skb)->frag_max_size > mtu) 797 goto fail_toobig; 798 799 /* don't send fragments larger than what we received */ 800 mtu = IP6CB(skb)->frag_max_size; 801 if (mtu < IPV6_MIN_MTU) 802 mtu = IPV6_MIN_MTU; 803 } 804 805 if (np && np->frag_size < mtu) { 806 if (np->frag_size) 807 mtu = np->frag_size; 808 } 809 if (mtu < hlen + sizeof(struct frag_hdr) + 8) 810 goto fail_toobig; 811 mtu -= hlen + sizeof(struct frag_hdr); 812 813 frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr, 814 &ipv6_hdr(skb)->saddr); 815 816 if (skb->ip_summed == CHECKSUM_PARTIAL && 817 (err = skb_checksum_help(skb))) 818 goto fail; 819 820 prevhdr = skb_network_header(skb) + nexthdr_offset; 821 hroom = LL_RESERVED_SPACE(rt->dst.dev); 822 if (skb_has_frag_list(skb)) { 823 unsigned int first_len = skb_pagelen(skb); 824 struct ip6_fraglist_iter iter; 825 struct sk_buff *frag2; 826 827 if (first_len - hlen > mtu || 828 ((first_len - hlen) & 7) || 829 skb_cloned(skb) || 830 skb_headroom(skb) < (hroom + sizeof(struct frag_hdr))) 831 goto slow_path; 832 833 skb_walk_frags(skb, frag) { 834 /* Correct geometry. */ 835 if (frag->len > mtu || 836 ((frag->len & 7) && frag->next) || 837 skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr))) 838 goto slow_path_clean; 839 840 /* Partially cloned skb? */ 841 if (skb_shared(frag)) 842 goto slow_path_clean; 843 844 BUG_ON(frag->sk); 845 if (skb->sk) { 846 frag->sk = skb->sk; 847 frag->destructor = sock_wfree; 848 } 849 skb->truesize -= frag->truesize; 850 } 851 852 err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id, 853 &iter); 854 if (err < 0) 855 goto fail; 856 857 for (;;) { 858 /* Prepare header of the next frame, 859 * before previous one went down. */ 860 if (iter.frag) 861 ip6_fraglist_prepare(skb, &iter); 862 863 err = output(net, sk, skb); 864 if (!err) 865 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), 866 IPSTATS_MIB_FRAGCREATES); 867 868 if (err || !iter.frag) 869 break; 870 871 skb = ip6_fraglist_next(&iter); 872 } 873 874 kfree(iter.tmp_hdr); 875 876 if (err == 0) { 877 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), 878 IPSTATS_MIB_FRAGOKS); 879 return 0; 880 } 881 882 kfree_skb_list(iter.frag_list); 883 884 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), 885 IPSTATS_MIB_FRAGFAILS); 886 return err; 887 888 slow_path_clean: 889 skb_walk_frags(skb, frag2) { 890 if (frag2 == frag) 891 break; 892 frag2->sk = NULL; 893 frag2->destructor = NULL; 894 skb->truesize += frag2->truesize; 895 } 896 } 897 898 slow_path: 899 /* 900 * Fragment the datagram. 901 */ 902 903 ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom, 904 LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id, 905 &state); 906 907 /* 908 * Keep copying data until we run out. 909 */ 910 911 while (state.left > 0) { 912 frag = ip6_frag_next(skb, &state); 913 if (IS_ERR(frag)) { 914 err = PTR_ERR(frag); 915 goto fail; 916 } 917 918 /* 919 * Put this fragment into the sending queue. 920 */ 921 err = output(net, sk, frag); 922 if (err) 923 goto fail; 924 925 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 926 IPSTATS_MIB_FRAGCREATES); 927 } 928 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 929 IPSTATS_MIB_FRAGOKS); 930 consume_skb(skb); 931 return err; 932 933 fail_toobig: 934 if (skb->sk && dst_allfrag(skb_dst(skb))) 935 sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK); 936 937 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); 938 err = -EMSGSIZE; 939 940 fail: 941 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), 942 IPSTATS_MIB_FRAGFAILS); 943 kfree_skb(skb); 944 return err; 945 } 946 947 static inline int ip6_rt_check(const struct rt6key *rt_key, 948 const struct in6_addr *fl_addr, 949 const struct in6_addr *addr_cache) 950 { 951 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) && 952 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache)); 953 } 954 955 static struct dst_entry *ip6_sk_dst_check(struct sock *sk, 956 struct dst_entry *dst, 957 const struct flowi6 *fl6) 958 { 959 struct ipv6_pinfo *np = inet6_sk(sk); 960 struct rt6_info *rt; 961 962 if (!dst) 963 goto out; 964 965 if (dst->ops->family != AF_INET6) { 966 dst_release(dst); 967 return NULL; 968 } 969 970 rt = (struct rt6_info *)dst; 971 /* Yes, checking route validity in not connected 972 * case is not very simple. Take into account, 973 * that we do not support routing by source, TOS, 974 * and MSG_DONTROUTE --ANK (980726) 975 * 976 * 1. ip6_rt_check(): If route was host route, 977 * check that cached destination is current. 978 * If it is network route, we still may 979 * check its validity using saved pointer 980 * to the last used address: daddr_cache. 981 * We do not want to save whole address now, 982 * (because main consumer of this service 983 * is tcp, which has not this problem), 984 * so that the last trick works only on connected 985 * sockets. 986 * 2. oif also should be the same. 987 */ 988 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) || 989 #ifdef CONFIG_IPV6_SUBTREES 990 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) || 991 #endif 992 (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) && 993 (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) { 994 dst_release(dst); 995 dst = NULL; 996 } 997 998 out: 999 return dst; 1000 } 1001 1002 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk, 1003 struct dst_entry **dst, struct flowi6 *fl6) 1004 { 1005 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD 1006 struct neighbour *n; 1007 struct rt6_info *rt; 1008 #endif 1009 int err; 1010 int flags = 0; 1011 1012 /* The correct way to handle this would be to do 1013 * ip6_route_get_saddr, and then ip6_route_output; however, 1014 * the route-specific preferred source forces the 1015 * ip6_route_output call _before_ ip6_route_get_saddr. 1016 * 1017 * In source specific routing (no src=any default route), 1018 * ip6_route_output will fail given src=any saddr, though, so 1019 * that's why we try it again later. 1020 */ 1021 if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) { 1022 struct fib6_info *from; 1023 struct rt6_info *rt; 1024 bool had_dst = *dst != NULL; 1025 1026 if (!had_dst) 1027 *dst = ip6_route_output(net, sk, fl6); 1028 rt = (*dst)->error ? NULL : (struct rt6_info *)*dst; 1029 1030 rcu_read_lock(); 1031 from = rt ? rcu_dereference(rt->from) : NULL; 1032 err = ip6_route_get_saddr(net, from, &fl6->daddr, 1033 sk ? inet6_sk(sk)->srcprefs : 0, 1034 &fl6->saddr); 1035 rcu_read_unlock(); 1036 1037 if (err) 1038 goto out_err_release; 1039 1040 /* If we had an erroneous initial result, pretend it 1041 * never existed and let the SA-enabled version take 1042 * over. 1043 */ 1044 if (!had_dst && (*dst)->error) { 1045 dst_release(*dst); 1046 *dst = NULL; 1047 } 1048 1049 if (fl6->flowi6_oif) 1050 flags |= RT6_LOOKUP_F_IFACE; 1051 } 1052 1053 if (!*dst) 1054 *dst = ip6_route_output_flags(net, sk, fl6, flags); 1055 1056 err = (*dst)->error; 1057 if (err) 1058 goto out_err_release; 1059 1060 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD 1061 /* 1062 * Here if the dst entry we've looked up 1063 * has a neighbour entry that is in the INCOMPLETE 1064 * state and the src address from the flow is 1065 * marked as OPTIMISTIC, we release the found 1066 * dst entry and replace it instead with the 1067 * dst entry of the nexthop router 1068 */ 1069 rt = (struct rt6_info *) *dst; 1070 rcu_read_lock_bh(); 1071 n = __ipv6_neigh_lookup_noref(rt->dst.dev, 1072 rt6_nexthop(rt, &fl6->daddr)); 1073 err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0; 1074 rcu_read_unlock_bh(); 1075 1076 if (err) { 1077 struct inet6_ifaddr *ifp; 1078 struct flowi6 fl_gw6; 1079 int redirect; 1080 1081 ifp = ipv6_get_ifaddr(net, &fl6->saddr, 1082 (*dst)->dev, 1); 1083 1084 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC); 1085 if (ifp) 1086 in6_ifa_put(ifp); 1087 1088 if (redirect) { 1089 /* 1090 * We need to get the dst entry for the 1091 * default router instead 1092 */ 1093 dst_release(*dst); 1094 memcpy(&fl_gw6, fl6, sizeof(struct flowi6)); 1095 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr)); 1096 *dst = ip6_route_output(net, sk, &fl_gw6); 1097 err = (*dst)->error; 1098 if (err) 1099 goto out_err_release; 1100 } 1101 } 1102 #endif 1103 if (ipv6_addr_v4mapped(&fl6->saddr) && 1104 !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) { 1105 err = -EAFNOSUPPORT; 1106 goto out_err_release; 1107 } 1108 1109 return 0; 1110 1111 out_err_release: 1112 dst_release(*dst); 1113 *dst = NULL; 1114 1115 if (err == -ENETUNREACH) 1116 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES); 1117 return err; 1118 } 1119 1120 /** 1121 * ip6_dst_lookup - perform route lookup on flow 1122 * @sk: socket which provides route info 1123 * @dst: pointer to dst_entry * for result 1124 * @fl6: flow to lookup 1125 * 1126 * This function performs a route lookup on the given flow. 1127 * 1128 * It returns zero on success, or a standard errno code on error. 1129 */ 1130 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst, 1131 struct flowi6 *fl6) 1132 { 1133 *dst = NULL; 1134 return ip6_dst_lookup_tail(net, sk, dst, fl6); 1135 } 1136 EXPORT_SYMBOL_GPL(ip6_dst_lookup); 1137 1138 /** 1139 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec 1140 * @sk: socket which provides route info 1141 * @fl6: flow to lookup 1142 * @final_dst: final destination address for ipsec lookup 1143 * 1144 * This function performs a route lookup on the given flow. 1145 * 1146 * It returns a valid dst pointer on success, or a pointer encoded 1147 * error code. 1148 */ 1149 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6, 1150 const struct in6_addr *final_dst) 1151 { 1152 struct dst_entry *dst = NULL; 1153 int err; 1154 1155 err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6); 1156 if (err) 1157 return ERR_PTR(err); 1158 if (final_dst) 1159 fl6->daddr = *final_dst; 1160 1161 return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0); 1162 } 1163 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow); 1164 1165 /** 1166 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow 1167 * @sk: socket which provides the dst cache and route info 1168 * @fl6: flow to lookup 1169 * @final_dst: final destination address for ipsec lookup 1170 * @connected: whether @sk is connected or not 1171 * 1172 * This function performs a route lookup on the given flow with the 1173 * possibility of using the cached route in the socket if it is valid. 1174 * It will take the socket dst lock when operating on the dst cache. 1175 * As a result, this function can only be used in process context. 1176 * 1177 * In addition, for a connected socket, cache the dst in the socket 1178 * if the current cache is not valid. 1179 * 1180 * It returns a valid dst pointer on success, or a pointer encoded 1181 * error code. 1182 */ 1183 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, 1184 const struct in6_addr *final_dst, 1185 bool connected) 1186 { 1187 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie); 1188 1189 dst = ip6_sk_dst_check(sk, dst, fl6); 1190 if (dst) 1191 return dst; 1192 1193 dst = ip6_dst_lookup_flow(sk, fl6, final_dst); 1194 if (connected && !IS_ERR(dst)) 1195 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6); 1196 1197 return dst; 1198 } 1199 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow); 1200 1201 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src, 1202 gfp_t gfp) 1203 { 1204 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL; 1205 } 1206 1207 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src, 1208 gfp_t gfp) 1209 { 1210 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL; 1211 } 1212 1213 static void ip6_append_data_mtu(unsigned int *mtu, 1214 int *maxfraglen, 1215 unsigned int fragheaderlen, 1216 struct sk_buff *skb, 1217 struct rt6_info *rt, 1218 unsigned int orig_mtu) 1219 { 1220 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) { 1221 if (!skb) { 1222 /* first fragment, reserve header_len */ 1223 *mtu = orig_mtu - rt->dst.header_len; 1224 1225 } else { 1226 /* 1227 * this fragment is not first, the headers 1228 * space is regarded as data space. 1229 */ 1230 *mtu = orig_mtu; 1231 } 1232 *maxfraglen = ((*mtu - fragheaderlen) & ~7) 1233 + fragheaderlen - sizeof(struct frag_hdr); 1234 } 1235 } 1236 1237 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, 1238 struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6, 1239 struct rt6_info *rt, struct flowi6 *fl6) 1240 { 1241 struct ipv6_pinfo *np = inet6_sk(sk); 1242 unsigned int mtu; 1243 struct ipv6_txoptions *opt = ipc6->opt; 1244 1245 /* 1246 * setup for corking 1247 */ 1248 if (opt) { 1249 if (WARN_ON(v6_cork->opt)) 1250 return -EINVAL; 1251 1252 v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation); 1253 if (unlikely(!v6_cork->opt)) 1254 return -ENOBUFS; 1255 1256 v6_cork->opt->tot_len = sizeof(*opt); 1257 v6_cork->opt->opt_flen = opt->opt_flen; 1258 v6_cork->opt->opt_nflen = opt->opt_nflen; 1259 1260 v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt, 1261 sk->sk_allocation); 1262 if (opt->dst0opt && !v6_cork->opt->dst0opt) 1263 return -ENOBUFS; 1264 1265 v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt, 1266 sk->sk_allocation); 1267 if (opt->dst1opt && !v6_cork->opt->dst1opt) 1268 return -ENOBUFS; 1269 1270 v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt, 1271 sk->sk_allocation); 1272 if (opt->hopopt && !v6_cork->opt->hopopt) 1273 return -ENOBUFS; 1274 1275 v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt, 1276 sk->sk_allocation); 1277 if (opt->srcrt && !v6_cork->opt->srcrt) 1278 return -ENOBUFS; 1279 1280 /* need source address above miyazawa*/ 1281 } 1282 dst_hold(&rt->dst); 1283 cork->base.dst = &rt->dst; 1284 cork->fl.u.ip6 = *fl6; 1285 v6_cork->hop_limit = ipc6->hlimit; 1286 v6_cork->tclass = ipc6->tclass; 1287 if (rt->dst.flags & DST_XFRM_TUNNEL) 1288 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? 1289 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst); 1290 else 1291 mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? 1292 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst)); 1293 if (np->frag_size < mtu) { 1294 if (np->frag_size) 1295 mtu = np->frag_size; 1296 } 1297 if (mtu < IPV6_MIN_MTU) 1298 return -EINVAL; 1299 cork->base.fragsize = mtu; 1300 cork->base.gso_size = ipc6->gso_size; 1301 cork->base.tx_flags = 0; 1302 sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags); 1303 1304 if (dst_allfrag(xfrm_dst_path(&rt->dst))) 1305 cork->base.flags |= IPCORK_ALLFRAG; 1306 cork->base.length = 0; 1307 1308 cork->base.transmit_time = ipc6->sockc.transmit_time; 1309 1310 return 0; 1311 } 1312 1313 static int __ip6_append_data(struct sock *sk, 1314 struct flowi6 *fl6, 1315 struct sk_buff_head *queue, 1316 struct inet_cork *cork, 1317 struct inet6_cork *v6_cork, 1318 struct page_frag *pfrag, 1319 int getfrag(void *from, char *to, int offset, 1320 int len, int odd, struct sk_buff *skb), 1321 void *from, int length, int transhdrlen, 1322 unsigned int flags, struct ipcm6_cookie *ipc6) 1323 { 1324 struct sk_buff *skb, *skb_prev = NULL; 1325 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu; 1326 struct ubuf_info *uarg = NULL; 1327 int exthdrlen = 0; 1328 int dst_exthdrlen = 0; 1329 int hh_len; 1330 int copy; 1331 int err; 1332 int offset = 0; 1333 u32 tskey = 0; 1334 struct rt6_info *rt = (struct rt6_info *)cork->dst; 1335 struct ipv6_txoptions *opt = v6_cork->opt; 1336 int csummode = CHECKSUM_NONE; 1337 unsigned int maxnonfragsize, headersize; 1338 unsigned int wmem_alloc_delta = 0; 1339 bool paged, extra_uref = false; 1340 1341 skb = skb_peek_tail(queue); 1342 if (!skb) { 1343 exthdrlen = opt ? opt->opt_flen : 0; 1344 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len; 1345 } 1346 1347 paged = !!cork->gso_size; 1348 mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize; 1349 orig_mtu = mtu; 1350 1351 if (cork->tx_flags & SKBTX_ANY_SW_TSTAMP && 1352 sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) 1353 tskey = sk->sk_tskey++; 1354 1355 hh_len = LL_RESERVED_SPACE(rt->dst.dev); 1356 1357 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len + 1358 (opt ? opt->opt_nflen : 0); 1359 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - 1360 sizeof(struct frag_hdr); 1361 1362 headersize = sizeof(struct ipv6hdr) + 1363 (opt ? opt->opt_flen + opt->opt_nflen : 0) + 1364 (dst_allfrag(&rt->dst) ? 1365 sizeof(struct frag_hdr) : 0) + 1366 rt->rt6i_nfheader_len; 1367 1368 /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit 1369 * the first fragment 1370 */ 1371 if (headersize + transhdrlen > mtu) 1372 goto emsgsize; 1373 1374 if (cork->length + length > mtu - headersize && ipc6->dontfrag && 1375 (sk->sk_protocol == IPPROTO_UDP || 1376 sk->sk_protocol == IPPROTO_RAW)) { 1377 ipv6_local_rxpmtu(sk, fl6, mtu - headersize + 1378 sizeof(struct ipv6hdr)); 1379 goto emsgsize; 1380 } 1381 1382 if (ip6_sk_ignore_df(sk)) 1383 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN; 1384 else 1385 maxnonfragsize = mtu; 1386 1387 if (cork->length + length > maxnonfragsize - headersize) { 1388 emsgsize: 1389 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0); 1390 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu); 1391 return -EMSGSIZE; 1392 } 1393 1394 /* CHECKSUM_PARTIAL only with no extension headers and when 1395 * we are not going to fragment 1396 */ 1397 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP && 1398 headersize == sizeof(struct ipv6hdr) && 1399 length <= mtu - headersize && 1400 (!(flags & MSG_MORE) || cork->gso_size) && 1401 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM)) 1402 csummode = CHECKSUM_PARTIAL; 1403 1404 if (flags & MSG_ZEROCOPY && length && sock_flag(sk, SOCK_ZEROCOPY)) { 1405 uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb)); 1406 if (!uarg) 1407 return -ENOBUFS; 1408 extra_uref = !skb; /* only extra ref if !MSG_MORE */ 1409 if (rt->dst.dev->features & NETIF_F_SG && 1410 csummode == CHECKSUM_PARTIAL) { 1411 paged = true; 1412 } else { 1413 uarg->zerocopy = 0; 1414 skb_zcopy_set(skb, uarg, &extra_uref); 1415 } 1416 } 1417 1418 /* 1419 * Let's try using as much space as possible. 1420 * Use MTU if total length of the message fits into the MTU. 1421 * Otherwise, we need to reserve fragment header and 1422 * fragment alignment (= 8-15 octects, in total). 1423 * 1424 * Note that we may need to "move" the data from the tail of 1425 * of the buffer to the new fragment when we split 1426 * the message. 1427 * 1428 * FIXME: It may be fragmented into multiple chunks 1429 * at once if non-fragmentable extension headers 1430 * are too large. 1431 * --yoshfuji 1432 */ 1433 1434 cork->length += length; 1435 if (!skb) 1436 goto alloc_new_skb; 1437 1438 while (length > 0) { 1439 /* Check if the remaining data fits into current packet. */ 1440 copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len; 1441 if (copy < length) 1442 copy = maxfraglen - skb->len; 1443 1444 if (copy <= 0) { 1445 char *data; 1446 unsigned int datalen; 1447 unsigned int fraglen; 1448 unsigned int fraggap; 1449 unsigned int alloclen; 1450 unsigned int pagedlen; 1451 alloc_new_skb: 1452 /* There's no room in the current skb */ 1453 if (skb) 1454 fraggap = skb->len - maxfraglen; 1455 else 1456 fraggap = 0; 1457 /* update mtu and maxfraglen if necessary */ 1458 if (!skb || !skb_prev) 1459 ip6_append_data_mtu(&mtu, &maxfraglen, 1460 fragheaderlen, skb, rt, 1461 orig_mtu); 1462 1463 skb_prev = skb; 1464 1465 /* 1466 * If remaining data exceeds the mtu, 1467 * we know we need more fragment(s). 1468 */ 1469 datalen = length + fraggap; 1470 1471 if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen) 1472 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len; 1473 fraglen = datalen + fragheaderlen; 1474 pagedlen = 0; 1475 1476 if ((flags & MSG_MORE) && 1477 !(rt->dst.dev->features&NETIF_F_SG)) 1478 alloclen = mtu; 1479 else if (!paged) 1480 alloclen = fraglen; 1481 else { 1482 alloclen = min_t(int, fraglen, MAX_HEADER); 1483 pagedlen = fraglen - alloclen; 1484 } 1485 1486 alloclen += dst_exthdrlen; 1487 1488 if (datalen != length + fraggap) { 1489 /* 1490 * this is not the last fragment, the trailer 1491 * space is regarded as data space. 1492 */ 1493 datalen += rt->dst.trailer_len; 1494 } 1495 1496 alloclen += rt->dst.trailer_len; 1497 fraglen = datalen + fragheaderlen; 1498 1499 /* 1500 * We just reserve space for fragment header. 1501 * Note: this may be overallocation if the message 1502 * (without MSG_MORE) fits into the MTU. 1503 */ 1504 alloclen += sizeof(struct frag_hdr); 1505 1506 copy = datalen - transhdrlen - fraggap - pagedlen; 1507 if (copy < 0) { 1508 err = -EINVAL; 1509 goto error; 1510 } 1511 if (transhdrlen) { 1512 skb = sock_alloc_send_skb(sk, 1513 alloclen + hh_len, 1514 (flags & MSG_DONTWAIT), &err); 1515 } else { 1516 skb = NULL; 1517 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <= 1518 2 * sk->sk_sndbuf) 1519 skb = alloc_skb(alloclen + hh_len, 1520 sk->sk_allocation); 1521 if (unlikely(!skb)) 1522 err = -ENOBUFS; 1523 } 1524 if (!skb) 1525 goto error; 1526 /* 1527 * Fill in the control structures 1528 */ 1529 skb->protocol = htons(ETH_P_IPV6); 1530 skb->ip_summed = csummode; 1531 skb->csum = 0; 1532 /* reserve for fragmentation and ipsec header */ 1533 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) + 1534 dst_exthdrlen); 1535 1536 /* 1537 * Find where to start putting bytes 1538 */ 1539 data = skb_put(skb, fraglen - pagedlen); 1540 skb_set_network_header(skb, exthdrlen); 1541 data += fragheaderlen; 1542 skb->transport_header = (skb->network_header + 1543 fragheaderlen); 1544 if (fraggap) { 1545 skb->csum = skb_copy_and_csum_bits( 1546 skb_prev, maxfraglen, 1547 data + transhdrlen, fraggap, 0); 1548 skb_prev->csum = csum_sub(skb_prev->csum, 1549 skb->csum); 1550 data += fraggap; 1551 pskb_trim_unique(skb_prev, maxfraglen); 1552 } 1553 if (copy > 0 && 1554 getfrag(from, data + transhdrlen, offset, 1555 copy, fraggap, skb) < 0) { 1556 err = -EFAULT; 1557 kfree_skb(skb); 1558 goto error; 1559 } 1560 1561 offset += copy; 1562 length -= copy + transhdrlen; 1563 transhdrlen = 0; 1564 exthdrlen = 0; 1565 dst_exthdrlen = 0; 1566 1567 /* Only the initial fragment is time stamped */ 1568 skb_shinfo(skb)->tx_flags = cork->tx_flags; 1569 cork->tx_flags = 0; 1570 skb_shinfo(skb)->tskey = tskey; 1571 tskey = 0; 1572 skb_zcopy_set(skb, uarg, &extra_uref); 1573 1574 if ((flags & MSG_CONFIRM) && !skb_prev) 1575 skb_set_dst_pending_confirm(skb, 1); 1576 1577 /* 1578 * Put the packet on the pending queue 1579 */ 1580 if (!skb->destructor) { 1581 skb->destructor = sock_wfree; 1582 skb->sk = sk; 1583 wmem_alloc_delta += skb->truesize; 1584 } 1585 __skb_queue_tail(queue, skb); 1586 continue; 1587 } 1588 1589 if (copy > length) 1590 copy = length; 1591 1592 if (!(rt->dst.dev->features&NETIF_F_SG) && 1593 skb_tailroom(skb) >= copy) { 1594 unsigned int off; 1595 1596 off = skb->len; 1597 if (getfrag(from, skb_put(skb, copy), 1598 offset, copy, off, skb) < 0) { 1599 __skb_trim(skb, off); 1600 err = -EFAULT; 1601 goto error; 1602 } 1603 } else if (!uarg || !uarg->zerocopy) { 1604 int i = skb_shinfo(skb)->nr_frags; 1605 1606 err = -ENOMEM; 1607 if (!sk_page_frag_refill(sk, pfrag)) 1608 goto error; 1609 1610 if (!skb_can_coalesce(skb, i, pfrag->page, 1611 pfrag->offset)) { 1612 err = -EMSGSIZE; 1613 if (i == MAX_SKB_FRAGS) 1614 goto error; 1615 1616 __skb_fill_page_desc(skb, i, pfrag->page, 1617 pfrag->offset, 0); 1618 skb_shinfo(skb)->nr_frags = ++i; 1619 get_page(pfrag->page); 1620 } 1621 copy = min_t(int, copy, pfrag->size - pfrag->offset); 1622 if (getfrag(from, 1623 page_address(pfrag->page) + pfrag->offset, 1624 offset, copy, skb->len, skb) < 0) 1625 goto error_efault; 1626 1627 pfrag->offset += copy; 1628 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); 1629 skb->len += copy; 1630 skb->data_len += copy; 1631 skb->truesize += copy; 1632 wmem_alloc_delta += copy; 1633 } else { 1634 err = skb_zerocopy_iter_dgram(skb, from, copy); 1635 if (err < 0) 1636 goto error; 1637 } 1638 offset += copy; 1639 length -= copy; 1640 } 1641 1642 if (wmem_alloc_delta) 1643 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); 1644 return 0; 1645 1646 error_efault: 1647 err = -EFAULT; 1648 error: 1649 if (uarg) 1650 sock_zerocopy_put_abort(uarg, extra_uref); 1651 cork->length -= length; 1652 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS); 1653 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc); 1654 return err; 1655 } 1656 1657 int ip6_append_data(struct sock *sk, 1658 int getfrag(void *from, char *to, int offset, int len, 1659 int odd, struct sk_buff *skb), 1660 void *from, int length, int transhdrlen, 1661 struct ipcm6_cookie *ipc6, struct flowi6 *fl6, 1662 struct rt6_info *rt, unsigned int flags) 1663 { 1664 struct inet_sock *inet = inet_sk(sk); 1665 struct ipv6_pinfo *np = inet6_sk(sk); 1666 int exthdrlen; 1667 int err; 1668 1669 if (flags&MSG_PROBE) 1670 return 0; 1671 if (skb_queue_empty(&sk->sk_write_queue)) { 1672 /* 1673 * setup for corking 1674 */ 1675 err = ip6_setup_cork(sk, &inet->cork, &np->cork, 1676 ipc6, rt, fl6); 1677 if (err) 1678 return err; 1679 1680 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0); 1681 length += exthdrlen; 1682 transhdrlen += exthdrlen; 1683 } else { 1684 fl6 = &inet->cork.fl.u.ip6; 1685 transhdrlen = 0; 1686 } 1687 1688 return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base, 1689 &np->cork, sk_page_frag(sk), getfrag, 1690 from, length, transhdrlen, flags, ipc6); 1691 } 1692 EXPORT_SYMBOL_GPL(ip6_append_data); 1693 1694 static void ip6_cork_release(struct inet_cork_full *cork, 1695 struct inet6_cork *v6_cork) 1696 { 1697 if (v6_cork->opt) { 1698 kfree(v6_cork->opt->dst0opt); 1699 kfree(v6_cork->opt->dst1opt); 1700 kfree(v6_cork->opt->hopopt); 1701 kfree(v6_cork->opt->srcrt); 1702 kfree(v6_cork->opt); 1703 v6_cork->opt = NULL; 1704 } 1705 1706 if (cork->base.dst) { 1707 dst_release(cork->base.dst); 1708 cork->base.dst = NULL; 1709 cork->base.flags &= ~IPCORK_ALLFRAG; 1710 } 1711 memset(&cork->fl, 0, sizeof(cork->fl)); 1712 } 1713 1714 struct sk_buff *__ip6_make_skb(struct sock *sk, 1715 struct sk_buff_head *queue, 1716 struct inet_cork_full *cork, 1717 struct inet6_cork *v6_cork) 1718 { 1719 struct sk_buff *skb, *tmp_skb; 1720 struct sk_buff **tail_skb; 1721 struct in6_addr final_dst_buf, *final_dst = &final_dst_buf; 1722 struct ipv6_pinfo *np = inet6_sk(sk); 1723 struct net *net = sock_net(sk); 1724 struct ipv6hdr *hdr; 1725 struct ipv6_txoptions *opt = v6_cork->opt; 1726 struct rt6_info *rt = (struct rt6_info *)cork->base.dst; 1727 struct flowi6 *fl6 = &cork->fl.u.ip6; 1728 unsigned char proto = fl6->flowi6_proto; 1729 1730 skb = __skb_dequeue(queue); 1731 if (!skb) 1732 goto out; 1733 tail_skb = &(skb_shinfo(skb)->frag_list); 1734 1735 /* move skb->data to ip header from ext header */ 1736 if (skb->data < skb_network_header(skb)) 1737 __skb_pull(skb, skb_network_offset(skb)); 1738 while ((tmp_skb = __skb_dequeue(queue)) != NULL) { 1739 __skb_pull(tmp_skb, skb_network_header_len(skb)); 1740 *tail_skb = tmp_skb; 1741 tail_skb = &(tmp_skb->next); 1742 skb->len += tmp_skb->len; 1743 skb->data_len += tmp_skb->len; 1744 skb->truesize += tmp_skb->truesize; 1745 tmp_skb->destructor = NULL; 1746 tmp_skb->sk = NULL; 1747 } 1748 1749 /* Allow local fragmentation. */ 1750 skb->ignore_df = ip6_sk_ignore_df(sk); 1751 1752 *final_dst = fl6->daddr; 1753 __skb_pull(skb, skb_network_header_len(skb)); 1754 if (opt && opt->opt_flen) 1755 ipv6_push_frag_opts(skb, opt, &proto); 1756 if (opt && opt->opt_nflen) 1757 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr); 1758 1759 skb_push(skb, sizeof(struct ipv6hdr)); 1760 skb_reset_network_header(skb); 1761 hdr = ipv6_hdr(skb); 1762 1763 ip6_flow_hdr(hdr, v6_cork->tclass, 1764 ip6_make_flowlabel(net, skb, fl6->flowlabel, 1765 ip6_autoflowlabel(net, np), fl6)); 1766 hdr->hop_limit = v6_cork->hop_limit; 1767 hdr->nexthdr = proto; 1768 hdr->saddr = fl6->saddr; 1769 hdr->daddr = *final_dst; 1770 1771 skb->priority = sk->sk_priority; 1772 skb->mark = sk->sk_mark; 1773 1774 skb->tstamp = cork->base.transmit_time; 1775 1776 skb_dst_set(skb, dst_clone(&rt->dst)); 1777 IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len); 1778 if (proto == IPPROTO_ICMPV6) { 1779 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); 1780 1781 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type); 1782 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS); 1783 } 1784 1785 ip6_cork_release(cork, v6_cork); 1786 out: 1787 return skb; 1788 } 1789 1790 int ip6_send_skb(struct sk_buff *skb) 1791 { 1792 struct net *net = sock_net(skb->sk); 1793 struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); 1794 int err; 1795 1796 err = ip6_local_out(net, skb->sk, skb); 1797 if (err) { 1798 if (err > 0) 1799 err = net_xmit_errno(err); 1800 if (err) 1801 IP6_INC_STATS(net, rt->rt6i_idev, 1802 IPSTATS_MIB_OUTDISCARDS); 1803 } 1804 1805 return err; 1806 } 1807 1808 int ip6_push_pending_frames(struct sock *sk) 1809 { 1810 struct sk_buff *skb; 1811 1812 skb = ip6_finish_skb(sk); 1813 if (!skb) 1814 return 0; 1815 1816 return ip6_send_skb(skb); 1817 } 1818 EXPORT_SYMBOL_GPL(ip6_push_pending_frames); 1819 1820 static void __ip6_flush_pending_frames(struct sock *sk, 1821 struct sk_buff_head *queue, 1822 struct inet_cork_full *cork, 1823 struct inet6_cork *v6_cork) 1824 { 1825 struct sk_buff *skb; 1826 1827 while ((skb = __skb_dequeue_tail(queue)) != NULL) { 1828 if (skb_dst(skb)) 1829 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)), 1830 IPSTATS_MIB_OUTDISCARDS); 1831 kfree_skb(skb); 1832 } 1833 1834 ip6_cork_release(cork, v6_cork); 1835 } 1836 1837 void ip6_flush_pending_frames(struct sock *sk) 1838 { 1839 __ip6_flush_pending_frames(sk, &sk->sk_write_queue, 1840 &inet_sk(sk)->cork, &inet6_sk(sk)->cork); 1841 } 1842 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames); 1843 1844 struct sk_buff *ip6_make_skb(struct sock *sk, 1845 int getfrag(void *from, char *to, int offset, 1846 int len, int odd, struct sk_buff *skb), 1847 void *from, int length, int transhdrlen, 1848 struct ipcm6_cookie *ipc6, struct flowi6 *fl6, 1849 struct rt6_info *rt, unsigned int flags, 1850 struct inet_cork_full *cork) 1851 { 1852 struct inet6_cork v6_cork; 1853 struct sk_buff_head queue; 1854 int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0); 1855 int err; 1856 1857 if (flags & MSG_PROBE) 1858 return NULL; 1859 1860 __skb_queue_head_init(&queue); 1861 1862 cork->base.flags = 0; 1863 cork->base.addr = 0; 1864 cork->base.opt = NULL; 1865 cork->base.dst = NULL; 1866 v6_cork.opt = NULL; 1867 err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt, fl6); 1868 if (err) { 1869 ip6_cork_release(cork, &v6_cork); 1870 return ERR_PTR(err); 1871 } 1872 if (ipc6->dontfrag < 0) 1873 ipc6->dontfrag = inet6_sk(sk)->dontfrag; 1874 1875 err = __ip6_append_data(sk, fl6, &queue, &cork->base, &v6_cork, 1876 ¤t->task_frag, getfrag, from, 1877 length + exthdrlen, transhdrlen + exthdrlen, 1878 flags, ipc6); 1879 if (err) { 1880 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork); 1881 return ERR_PTR(err); 1882 } 1883 1884 return __ip6_make_skb(sk, &queue, cork, &v6_cork); 1885 } 1886