xref: /linux/net/ipv6/ip6_output.c (revision ca55b2fef3a9373fcfc30f82fd26bc7fccbda732)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	:	frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44 
45 #include <net/sock.h>
46 #include <net/snmp.h>
47 
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58 
59 static int ip6_finish_output2(struct sock *sk, struct sk_buff *skb)
60 {
61 	struct dst_entry *dst = skb_dst(skb);
62 	struct net_device *dev = dst->dev;
63 	struct neighbour *neigh;
64 	struct in6_addr *nexthop;
65 	int ret;
66 
67 	skb->protocol = htons(ETH_P_IPV6);
68 	skb->dev = dev;
69 
70 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
71 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
72 
73 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
74 		    ((mroute6_socket(dev_net(dev), skb) &&
75 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
76 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
77 					 &ipv6_hdr(skb)->saddr))) {
78 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
79 
80 			/* Do not check for IFF_ALLMULTI; multicast routing
81 			   is not supported in any case.
82 			 */
83 			if (newskb)
84 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
85 					sk, newskb, NULL, newskb->dev,
86 					dev_loopback_xmit);
87 
88 			if (ipv6_hdr(skb)->hop_limit == 0) {
89 				IP6_INC_STATS(dev_net(dev), idev,
90 					      IPSTATS_MIB_OUTDISCARDS);
91 				kfree_skb(skb);
92 				return 0;
93 			}
94 		}
95 
96 		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
97 				skb->len);
98 
99 		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
100 		    IPV6_ADDR_SCOPE_NODELOCAL &&
101 		    !(dev->flags & IFF_LOOPBACK)) {
102 			kfree_skb(skb);
103 			return 0;
104 		}
105 	}
106 
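	/* Resolve the next hop: rt6_nexthop() yields the gateway address
	 * for gateway routes, otherwise the packet's own destination.
	 * The neighbour entry is looked up, or created on demand, under
	 * rcu_read_lock_bh() without taking a reference on it.
	 */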
107 	rcu_read_lock_bh();
108 	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
109 	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
110 	if (unlikely(!neigh))
111 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
112 	if (!IS_ERR(neigh)) {
113 		ret = dst_neigh_output(dst, neigh, skb);
114 		rcu_read_unlock_bh();
115 		return ret;
116 	}
117 	rcu_read_unlock_bh();
118 
119 	IP6_INC_STATS(dev_net(dst->dev),
120 		      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
121 	kfree_skb(skb);
122 	return -EINVAL;
123 }
124 
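/* Decide whether the packet must be fragmented before output: it is
 * fragmented when it exceeds the dst MTU and is not GSO (GSO packets
 * are segmented further down the stack), when the dst demands a
 * fragment header on every packet (dst_allfrag(), set when the peer
 * reported a path MTU below IPV6_MIN_MTU), or when conntrack defrag
 * recorded frag_max_size and the packet exceeds it.
 */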
125 static int ip6_finish_output(struct sock *sk, struct sk_buff *skb)
126 {
127 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
128 	    dst_allfrag(skb_dst(skb)) ||
129 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
130 		return ip6_fragment(sk, skb, ip6_finish_output2);
131 	else
132 		return ip6_finish_output2(sk, skb);
133 }
134 
135 int ip6_output(struct sock *sk, struct sk_buff *skb)
136 {
137 	struct net_device *dev = skb_dst(skb)->dev;
138 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
139 	if (unlikely(idev->cnf.disable_ipv6)) {
140 		IP6_INC_STATS(dev_net(dev), idev,
141 			      IPSTATS_MIB_OUTDISCARDS);
142 		kfree_skb(skb);
143 		return 0;
144 	}
145 
146 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, sk, skb,
147 			    NULL, dev,
148 			    ip6_finish_output,
149 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
150 }
151 
152 /*
153  *	xmit an sk_buff (used by TCP, SCTP and DCCP)
154  */
155 
156 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
157 	     struct ipv6_txoptions *opt, int tclass)
158 {
159 	struct net *net = sock_net(sk);
160 	struct ipv6_pinfo *np = inet6_sk(sk);
161 	struct in6_addr *first_hop = &fl6->daddr;
162 	struct dst_entry *dst = skb_dst(skb);
163 	struct ipv6hdr *hdr;
164 	u8  proto = fl6->flowi6_proto;
165 	int seg_len = skb->len;
166 	int hlimit = -1;
167 	u32 mtu;
168 
169 	if (opt) {
170 		unsigned int head_room;
171 
172 		/* First: exthdrs may take lots of space (~8K for now);
173 		   MAX_HEADER is not enough.
174 		 */
175 		head_room = opt->opt_nflen + opt->opt_flen;
176 		seg_len += head_room;
177 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
178 
179 		if (skb_headroom(skb) < head_room) {
180 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
181 			if (!skb2) {
182 				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
183 					      IPSTATS_MIB_OUTDISCARDS);
184 				kfree_skb(skb);
185 				return -ENOBUFS;
186 			}
187 			consume_skb(skb);
188 			skb = skb2;
189 			skb_set_owner_w(skb, sk);
190 		}
191 		if (opt->opt_flen)
192 			ipv6_push_frag_opts(skb, opt, &proto);
193 		if (opt->opt_nflen)
194 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
195 	}
196 
197 	skb_push(skb, sizeof(struct ipv6hdr));
198 	skb_reset_network_header(skb);
199 	hdr = ipv6_hdr(skb);
200 
201 	/*
202 	 *	Fill in the IPv6 header
203 	 */
204 	if (np)
205 		hlimit = np->hop_limit;
206 	if (hlimit < 0)
207 		hlimit = ip6_dst_hoplimit(dst);
208 
209 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
210 						     np->autoflowlabel, fl6));
211 
212 	hdr->payload_len = htons(seg_len);
213 	hdr->nexthdr = proto;
214 	hdr->hop_limit = hlimit;
215 
216 	hdr->saddr = fl6->saddr;
217 	hdr->daddr = *first_hop;
218 
219 	skb->protocol = htons(ETH_P_IPV6);
220 	skb->priority = sk->sk_priority;
221 	skb->mark = sk->sk_mark;
222 
223 	mtu = dst_mtu(dst);
224 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
225 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
226 			      IPSTATS_MIB_OUT, skb->len);
227 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, sk, skb,
228 			       NULL, dst->dev, dst_output_sk);
229 	}
230 
231 	skb->dev = dst->dev;
232 	ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
233 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
234 	kfree_skb(skb);
235 	return -EMSGSIZE;
236 }
237 EXPORT_SYMBOL(ip6_xmit);
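/* Sketch of a typical caller (roughly what inet6_csk_xmit() does for
 * TCP; illustrative only, route and dst setup omitted):
 *
 *	struct ipv6_pinfo *np = inet6_sk(sk);
 *	struct flowi6 fl6;
 *
 *	memset(&fl6, 0, sizeof(fl6));
 *	fl6.flowi6_proto = sk->sk_protocol;
 *	fl6.daddr = sk->sk_v6_daddr;
 *	fl6.saddr = np->saddr;
 *	...
 *	res = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass);
 */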
238 
239 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
240 {
241 	struct ip6_ra_chain *ra;
242 	struct sock *last = NULL;
243 
244 	read_lock(&ip6_ra_lock);
245 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
246 		struct sock *sk = ra->sk;
247 		if (sk && ra->sel == sel &&
248 		    (!sk->sk_bound_dev_if ||
249 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
250 			if (last) {
251 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
252 				if (skb2)
253 					rawv6_rcv(last, skb2);
254 			}
255 			last = sk;
256 		}
257 	}
258 
259 	if (last) {
260 		rawv6_rcv(last, skb);
261 		read_unlock(&ip6_ra_lock);
262 		return 1;
263 	}
264 	read_unlock(&ip6_ra_lock);
265 	return 0;
266 }
267 
268 static int ip6_forward_proxy_check(struct sk_buff *skb)
269 {
270 	struct ipv6hdr *hdr = ipv6_hdr(skb);
271 	u8 nexthdr = hdr->nexthdr;
272 	__be16 frag_off;
273 	int offset;
274 
275 	if (ipv6_ext_hdr(nexthdr)) {
276 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
277 		if (offset < 0)
278 			return 0;
279 	} else
280 		offset = sizeof(struct ipv6hdr);
281 
282 	if (nexthdr == IPPROTO_ICMPV6) {
283 		struct icmp6hdr *icmp6;
284 
285 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
286 					 offset + 1 - skb->data)))
287 			return 0;
288 
289 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
290 
291 		switch (icmp6->icmp6_type) {
292 		case NDISC_ROUTER_SOLICITATION:
293 		case NDISC_ROUTER_ADVERTISEMENT:
294 		case NDISC_NEIGHBOUR_SOLICITATION:
295 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
296 		case NDISC_REDIRECT:
297 			/* A unicast neighbour discovery message destined
298 			 * to the proxied address needs a local reaction,
299 			 * so pass it to the input function.
300 			 */
301 			return 1;
302 		default:
303 			break;
304 		}
305 	}
306 
307 	/*
308 	 * The proxying router can't forward traffic sent to a link-local
309 	 * address, so signal the sender and discard the packet. This
310 	 * behavior is clarified by the MIPv6 specification.
311 	 */
312 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
313 		dst_link_failure(skb);
314 		return -1;
315 	}
316 
317 	return 0;
318 }
319 
320 static inline int ip6_forward_finish(struct sock *sk, struct sk_buff *skb)
321 {
322 	skb_sender_cpu_clear(skb);
323 	return dst_output_sk(sk, skb);
324 }
325 
326 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
327 {
328 	unsigned int mtu;
329 	struct inet6_dev *idev;
330 
331 	if (dst_metric_locked(dst, RTAX_MTU)) {
332 		mtu = dst_metric_raw(dst, RTAX_MTU);
333 		if (mtu)
334 			return mtu;
335 	}
336 
337 	mtu = IPV6_MIN_MTU;
338 	rcu_read_lock();
339 	idev = __in6_dev_get(dst->dev);
340 	if (idev)
341 		mtu = idev->cnf.mtu6;
342 	rcu_read_unlock();
343 
344 	return mtu;
345 }
346 
347 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
348 {
349 	if (skb->len <= mtu)
350 		return false;
351 
352 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
353 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
354 		return true;
355 
356 	if (skb->ignore_df)
357 		return false;
358 
359 	if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
360 		return false;
361 
362 	return true;
363 }
364 
365 int ip6_forward(struct sk_buff *skb)
366 {
367 	struct dst_entry *dst = skb_dst(skb);
368 	struct ipv6hdr *hdr = ipv6_hdr(skb);
369 	struct inet6_skb_parm *opt = IP6CB(skb);
370 	struct net *net = dev_net(dst->dev);
371 	u32 mtu;
372 
373 	if (net->ipv6.devconf_all->forwarding == 0)
374 		goto error;
375 
376 	if (skb->pkt_type != PACKET_HOST)
377 		goto drop;
378 
379 	if (unlikely(skb->sk))
380 		goto drop;
381 
382 	if (skb_warn_if_lro(skb))
383 		goto drop;
384 
385 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
386 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
387 				 IPSTATS_MIB_INDISCARDS);
388 		goto drop;
389 	}
390 
391 	skb_forward_csum(skb);
392 
393 	/*
394 	 *	We do no processing on RA packets, pushing them
395 	 *	to user level AS IS, without any warranty that
396 	 *	the application will be able to interpret them.
397 	 *	The reason is that we cannot make anything
398 	 *	clever here.
399 	 *
400 	 *	We are not an end node, so if the packet contains
401 	 *	AH/ESP we cannot do anything with it.
402 	 *	Defragmentation would also be a mistake; RA packets
403 	 *	must not be fragmented, because there is no guarantee
404 	 *	that different fragments will go along one path. --ANK
405 	 */
406 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
407 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
408 			return 0;
409 	}
410 
411 	/*
412 	 *	check and decrement hop limit
413 	 */
414 	if (hdr->hop_limit <= 1) {
415 		/* Force the output device to be used for source address selection */
416 		skb->dev = dst->dev;
417 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
418 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
419 				 IPSTATS_MIB_INHDRERRORS);
420 
421 		kfree_skb(skb);
422 		return -ETIMEDOUT;
423 	}
424 
425 	/* XXX: idev->cnf.proxy_ndp? */
426 	if (net->ipv6.devconf_all->proxy_ndp &&
427 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
428 		int proxied = ip6_forward_proxy_check(skb);
429 		if (proxied > 0)
430 			return ip6_input(skb);
431 		else if (proxied < 0) {
432 			IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
433 					 IPSTATS_MIB_INDISCARDS);
434 			goto drop;
435 		}
436 	}
437 
438 	if (!xfrm6_route_forward(skb)) {
439 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
440 				 IPSTATS_MIB_INDISCARDS);
441 		goto drop;
442 	}
443 	dst = skb_dst(skb);
444 
445 	/* IPv6 specs say nothing about it, but it is clear that we cannot
446 	   send redirects to source routed frames.
447 	   We don't send redirects to frames decapsulated from IPsec.
448 	 */
449 	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
450 		struct in6_addr *target = NULL;
451 		struct inet_peer *peer;
452 		struct rt6_info *rt;
453 
454 		/*
455 		 *	incoming and outgoing devices are the same,
456 		 *	so send a redirect.
457 		 */
458 
459 		rt = (struct rt6_info *) dst;
460 		if (rt->rt6i_flags & RTF_GATEWAY)
461 			target = &rt->rt6i_gateway;
462 		else
463 			target = &hdr->daddr;
464 
465 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
466 
467 		/* Limit redirects both by destination (here)
468 		   and by source (inside ndisc_send_redirect)
469 		 */
470 		if (inet_peer_xrlim_allow(peer, 1*HZ))
471 			ndisc_send_redirect(skb, target);
472 		if (peer)
473 			inet_putpeer(peer);
474 	} else {
475 		int addrtype = ipv6_addr_type(&hdr->saddr);
476 
477 		/* This check is security critical. */
478 		if (addrtype == IPV6_ADDR_ANY ||
479 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
480 			goto error;
481 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
482 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
483 				    ICMPV6_NOT_NEIGHBOUR, 0);
484 			goto error;
485 		}
486 	}
487 
488 	mtu = ip6_dst_mtu_forward(dst);
489 	if (mtu < IPV6_MIN_MTU)
490 		mtu = IPV6_MIN_MTU;
491 
492 	if (ip6_pkt_too_big(skb, mtu)) {
493 		/* Again, force the output device to be used for source address selection */
494 		skb->dev = dst->dev;
495 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
496 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
497 				 IPSTATS_MIB_INTOOBIGERRORS);
498 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
499 				 IPSTATS_MIB_FRAGFAILS);
500 		kfree_skb(skb);
501 		return -EMSGSIZE;
502 	}
503 
504 	if (skb_cow(skb, dst->dev->hard_header_len)) {
505 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
506 				 IPSTATS_MIB_OUTDISCARDS);
507 		goto drop;
508 	}
509 
510 	hdr = ipv6_hdr(skb);
511 
512 	/* Mangling the hop limit is delayed until after the skb COW */
513 
514 	hdr->hop_limit--;
515 
516 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
517 	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
518 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, NULL, skb,
519 		       skb->dev, dst->dev,
520 		       ip6_forward_finish);
521 
522 error:
523 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
524 drop:
525 	kfree_skb(skb);
526 	return -EINVAL;
527 }
528 
529 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
530 {
531 	to->pkt_type = from->pkt_type;
532 	to->priority = from->priority;
533 	to->protocol = from->protocol;
534 	skb_dst_drop(to);
535 	skb_dst_set(to, dst_clone(skb_dst(from)));
536 	to->dev = from->dev;
537 	to->mark = from->mark;
538 
539 #ifdef CONFIG_NET_SCHED
540 	to->tc_index = from->tc_index;
541 #endif
542 	nf_copy(to, from);
543 	skb_copy_secmark(to, from);
544 }
545 
546 int ip6_fragment(struct sock *sk, struct sk_buff *skb,
547 		 int (*output)(struct sock *, struct sk_buff *))
548 {
549 	struct sk_buff *frag;
550 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
551 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
552 				inet6_sk(skb->sk) : NULL;
553 	struct ipv6hdr *tmp_hdr;
554 	struct frag_hdr *fh;
555 	unsigned int mtu, hlen, left, len;
556 	int hroom, troom;
557 	__be32 frag_id;
558 	int ptr, offset = 0, err = 0;
559 	u8 *prevhdr, nexthdr = 0;
560 	struct net *net = dev_net(skb_dst(skb)->dev);
561 
562 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
563 	nexthdr = *prevhdr;
564 
565 	mtu = ip6_skb_dst_mtu(skb);
566 
567 	/* We must not fragment if the socket is set to force MTU discovery
568 	 * or if the skb was not generated by a local socket.
569 	 */
570 	if (unlikely(!skb->ignore_df && skb->len > mtu))
571 		goto fail_toobig;
572 
573 	if (IP6CB(skb)->frag_max_size) {
574 		if (IP6CB(skb)->frag_max_size > mtu)
575 			goto fail_toobig;
576 
577 		/* don't send fragments larger than what we received */
578 		mtu = IP6CB(skb)->frag_max_size;
579 		if (mtu < IPV6_MIN_MTU)
580 			mtu = IPV6_MIN_MTU;
581 	}
582 
583 	if (np && np->frag_size < mtu) {
584 		if (np->frag_size)
585 			mtu = np->frag_size;
586 	}
587 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
588 		goto fail_toobig;
589 	mtu -= hlen + sizeof(struct frag_hdr);
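	/* e.g. with a 1500 byte MTU and no extension headers (hlen == 40),
	 * each fragment may carry 1500 - 40 - 8 = 1452 bytes of payload;
	 * the slow path below further rounds non-final fragments down to
	 * a multiple of 8, i.e. 1448 bytes.
	 */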
590 
591 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
592 				    &ipv6_hdr(skb)->saddr);
593 
594 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
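	/* Fast path: if the skb already carries a frag list whose geometry
	 * matches what we would produce anyway (each chunk fits the MTU,
	 * is 8-byte aligned except the last, and has headroom for the
	 * fragment header), turn the list members into fragments in
	 * place. Otherwise fall back to the copying slow path.
	 */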
595 	if (skb_has_frag_list(skb)) {
596 		int first_len = skb_pagelen(skb);
597 		struct sk_buff *frag2;
598 
599 		if (first_len - hlen > mtu ||
600 		    ((first_len - hlen) & 7) ||
601 		    skb_cloned(skb) ||
602 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
603 			goto slow_path;
604 
605 		skb_walk_frags(skb, frag) {
606 			/* Correct geometry. */
607 			if (frag->len > mtu ||
608 			    ((frag->len & 7) && frag->next) ||
609 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
610 				goto slow_path_clean;
611 
612 			/* Partially cloned skb? */
613 			if (skb_shared(frag))
614 				goto slow_path_clean;
615 
616 			BUG_ON(frag->sk);
617 			if (skb->sk) {
618 				frag->sk = skb->sk;
619 				frag->destructor = sock_wfree;
620 			}
621 			skb->truesize -= frag->truesize;
622 		}
623 
624 		err = 0;
625 		offset = 0;
626 		/* BUILD HEADER */
627 
628 		*prevhdr = NEXTHDR_FRAGMENT;
629 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
630 		if (!tmp_hdr) {
631 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
632 				      IPSTATS_MIB_FRAGFAILS);
633 			err = -ENOMEM;
634 			goto fail;
635 		}
636 		frag = skb_shinfo(skb)->frag_list;
637 		skb_frag_list_init(skb);
638 
639 		__skb_pull(skb, hlen);
640 		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
641 		__skb_push(skb, hlen);
642 		skb_reset_network_header(skb);
643 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
644 
645 		fh->nexthdr = nexthdr;
646 		fh->reserved = 0;
647 		fh->frag_off = htons(IP6_MF);
648 		fh->identification = frag_id;
649 
650 		first_len = skb_pagelen(skb);
651 		skb->data_len = first_len - skb_headlen(skb);
652 		skb->len = first_len;
653 		ipv6_hdr(skb)->payload_len = htons(first_len -
654 						   sizeof(struct ipv6hdr));
655 
656 		dst_hold(&rt->dst);
657 
658 		for (;;) {
659 			/* Prepare the header of the next frame,
660 			 * before the previous one goes down. */
661 			if (frag) {
662 				frag->ip_summed = CHECKSUM_NONE;
663 				skb_reset_transport_header(frag);
664 				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
665 				__skb_push(frag, hlen);
666 				skb_reset_network_header(frag);
667 				memcpy(skb_network_header(frag), tmp_hdr,
668 				       hlen);
669 				offset += skb->len - hlen - sizeof(struct frag_hdr);
670 				fh->nexthdr = nexthdr;
671 				fh->reserved = 0;
672 				fh->frag_off = htons(offset);
673 				if (frag->next)
674 					fh->frag_off |= htons(IP6_MF);
675 				fh->identification = frag_id;
676 				ipv6_hdr(frag)->payload_len =
677 						htons(frag->len -
678 						      sizeof(struct ipv6hdr));
679 				ip6_copy_metadata(frag, skb);
680 			}
681 
682 			err = output(sk, skb);
683 			if (!err)
684 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
685 					      IPSTATS_MIB_FRAGCREATES);
686 
687 			if (err || !frag)
688 				break;
689 
690 			skb = frag;
691 			frag = skb->next;
692 			skb->next = NULL;
693 		}
694 
695 		kfree(tmp_hdr);
696 
697 		if (err == 0) {
698 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
699 				      IPSTATS_MIB_FRAGOKS);
700 			ip6_rt_put(rt);
701 			return 0;
702 		}
703 
704 		kfree_skb_list(frag);
705 
706 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
707 			      IPSTATS_MIB_FRAGFAILS);
708 		ip6_rt_put(rt);
709 		return err;
710 
711 slow_path_clean:
712 		skb_walk_frags(skb, frag2) {
713 			if (frag2 == frag)
714 				break;
715 			frag2->sk = NULL;
716 			frag2->destructor = NULL;
717 			skb->truesize += frag2->truesize;
718 		}
719 	}
720 
721 slow_path:
722 	if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
723 	    skb_checksum_help(skb))
724 		goto fail;
725 
726 	left = skb->len - hlen;		/* Space per frame */
727 	ptr = hlen;			/* Where to start from */
728 
729 	/*
730 	 *	Fragment the datagram.
731 	 */
732 
733 	*prevhdr = NEXTHDR_FRAGMENT;
734 	troom = rt->dst.dev->needed_tailroom;
735 
736 	/*
737 	 *	Keep copying data until we run out.
738 	 */
739 	while (left > 0)	{
740 		len = left;
741 		/* IF: it doesn't fit, use 'mtu', the per-fragment data space */
742 		if (len > mtu)
743 			len = mtu;
744 		/* IF: we are not sending up to and including the packet end,
745 		   align the next start on an eight byte boundary */
746 		if (len < left)	{
747 			len &= ~7;
748 		}
749 
750 		/* Allocate buffer */
751 		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
752 				 hroom + troom, GFP_ATOMIC);
753 		if (!frag) {
754 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
755 				      IPSTATS_MIB_FRAGFAILS);
756 			err = -ENOMEM;
757 			goto fail;
758 		}
759 
760 		/*
761 		 *	Set up data on packet
762 		 */
763 
764 		ip6_copy_metadata(frag, skb);
765 		skb_reserve(frag, hroom);
766 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
767 		skb_reset_network_header(frag);
768 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
769 		frag->transport_header = (frag->network_header + hlen +
770 					  sizeof(struct frag_hdr));
771 
772 		/*
773 		 *	Charge the memory for the fragment to any owner
774 		 *	it might possess
775 		 */
776 		if (skb->sk)
777 			skb_set_owner_w(frag, skb->sk);
778 
779 		/*
780 		 *	Copy the packet header into the new buffer.
781 		 */
782 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
783 
784 		/*
785 		 *	Build fragment header.
786 		 */
787 		fh->nexthdr = nexthdr;
788 		fh->reserved = 0;
789 		fh->identification = frag_id;
790 
791 		/*
792 		 *	Copy a block of the IP datagram.
793 		 */
794 		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
795 				     len));
796 		left -= len;
797 
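		/* The frag_off field carries the 13-bit offset in 8-octet
		 * units in its upper bits; since 'offset' is a byte count
		 * that is always a multiple of 8, htons(offset) already is
		 * the wire encoding, and IP6_MF occupies the low-order
		 * "more fragments" bit.
		 */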
798 		fh->frag_off = htons(offset);
799 		if (left > 0)
800 			fh->frag_off |= htons(IP6_MF);
801 		ipv6_hdr(frag)->payload_len = htons(frag->len -
802 						    sizeof(struct ipv6hdr));
803 
804 		ptr += len;
805 		offset += len;
806 
807 		/*
808 		 *	Put this fragment into the sending queue.
809 		 */
810 		err = output(sk, frag);
811 		if (err)
812 			goto fail;
813 
814 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
815 			      IPSTATS_MIB_FRAGCREATES);
816 	}
817 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
818 		      IPSTATS_MIB_FRAGOKS);
819 	consume_skb(skb);
820 	return err;
821 
822 fail_toobig:
823 	if (skb->sk && dst_allfrag(skb_dst(skb)))
824 		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
825 
826 	skb->dev = skb_dst(skb)->dev;
827 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
828 	err = -EMSGSIZE;
829 
830 fail:
831 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
832 		      IPSTATS_MIB_FRAGFAILS);
833 	kfree_skb(skb);
834 	return err;
835 }
836 
837 static inline int ip6_rt_check(const struct rt6key *rt_key,
838 			       const struct in6_addr *fl_addr,
839 			       const struct in6_addr *addr_cache)
840 {
841 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
842 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
843 }
844 
845 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
846 					  struct dst_entry *dst,
847 					  const struct flowi6 *fl6)
848 {
849 	struct ipv6_pinfo *np = inet6_sk(sk);
850 	struct rt6_info *rt;
851 
852 	if (!dst)
853 		goto out;
854 
855 	if (dst->ops->family != AF_INET6) {
856 		dst_release(dst);
857 		return NULL;
858 	}
859 
860 	rt = (struct rt6_info *)dst;
861 	/* Yes, checking route validity in the unconnected
862 	 * case is not very simple. Take into account
863 	 * that we do not support routing by source, TOS,
864 	 * or MSG_DONTROUTE		--ANK (980726)
865 	 *
866 	 * 1. ip6_rt_check(): if the route is a host route,
867 	 *    check that the cached destination is current.
868 	 *    If it is a network route, we still may
869 	 *    check its validity using a saved pointer
870 	 *    to the last used address: daddr_cache.
871 	 *    We do not want to save the whole address now
872 	 *    (because the main consumer of this service
873 	 *    is TCP, which does not have this problem),
874 	 *    so the last trick works only on connected
875 	 *    sockets.
876 	 * 2. oif also should be the same.
877 	 */
878 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
879 #ifdef CONFIG_IPV6_SUBTREES
880 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
881 #endif
882 	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
883 	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
884 		dst_release(dst);
885 		dst = NULL;
886 	}
887 
888 out:
889 	return dst;
890 }
891 
892 static int ip6_dst_lookup_tail(struct net *net, struct sock *sk,
893 			       struct dst_entry **dst, struct flowi6 *fl6)
894 {
895 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
896 	struct neighbour *n;
897 	struct rt6_info *rt;
898 #endif
899 	int err;
900 
901 	/* The correct way to handle this would be to do
902 	 * ip6_route_get_saddr, and then ip6_route_output; however,
903 	 * the route-specific preferred source forces the
904 	 * ip6_route_output call _before_ ip6_route_get_saddr.
905 	 *
906 	 * In source specific routing (no src=any default route),
907 	 * ip6_route_output will fail given src=any saddr, though, so
908 	 * that's why we try it again later.
909 	 */
910 	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
911 		struct rt6_info *rt;
912 		bool had_dst = *dst != NULL;
913 
914 		if (!had_dst)
915 			*dst = ip6_route_output(net, sk, fl6);
916 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
917 		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
918 					  sk ? inet6_sk(sk)->srcprefs : 0,
919 					  &fl6->saddr);
920 		if (err)
921 			goto out_err_release;
922 
923 		/* If we had an erroneous initial result, pretend it
924 		 * never existed and let the SA-enabled version take
925 		 * over.
926 		 */
927 		if (!had_dst && (*dst)->error) {
928 			dst_release(*dst);
929 			*dst = NULL;
930 		}
931 	}
932 
933 	if (!*dst)
934 		*dst = ip6_route_output(net, sk, fl6);
935 
936 	err = (*dst)->error;
937 	if (err)
938 		goto out_err_release;
939 
940 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
941 	/*
942 	 * If the dst entry we've looked up has a neighbour
943 	 * entry that is in the INCOMPLETE state and the
944 	 * source address from the flow is marked as
945 	 * OPTIMISTIC, we release the found dst entry and
946 	 * replace it with the dst entry of the nexthop
947 	 * router instead.
948 	 */
949 	rt = (struct rt6_info *) *dst;
950 	rcu_read_lock_bh();
951 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
952 				      rt6_nexthop(rt, &fl6->daddr));
953 	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
954 	rcu_read_unlock_bh();
955 
956 	if (err) {
957 		struct inet6_ifaddr *ifp;
958 		struct flowi6 fl_gw6;
959 		int redirect;
960 
961 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
962 				      (*dst)->dev, 1);
963 
964 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
965 		if (ifp)
966 			in6_ifa_put(ifp);
967 
968 		if (redirect) {
969 			/*
970 			 * We need to get the dst entry for the
971 			 * default router instead
972 			 */
973 			dst_release(*dst);
974 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
975 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
976 			*dst = ip6_route_output(net, sk, &fl_gw6);
977 			err = (*dst)->error;
978 			if (err)
979 				goto out_err_release;
980 		}
981 	}
982 #endif
983 
984 	return 0;
985 
986 out_err_release:
987 	if (err == -ENETUNREACH)
988 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
989 	dst_release(*dst);
990 	*dst = NULL;
991 	return err;
992 }
993 
994 /**
995  *	ip6_dst_lookup - perform route lookup on flow
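 *	@net: network namespace this lookup is performed in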
996  *	@sk: socket which provides route info
997  *	@dst: pointer to dst_entry * for result
998  *	@fl6: flow to lookup
999  *
1000  *	This function performs a route lookup on the given flow.
1001  *
1002  *	It returns zero on success, or a standard errno code on error.
1003  */
1004 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1005 		   struct flowi6 *fl6)
1006 {
1007 	*dst = NULL;
1008 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1009 }
1010 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1011 
1012 /**
1013  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1014  *	@sk: socket which provides route info
1015  *	@fl6: flow to lookup
1016  *	@final_dst: final destination address for ipsec lookup
1017  *
1018  *	This function performs a route lookup on the given flow.
1019  *
1020  *	It returns a valid dst pointer on success, or a pointer encoded
1021  *	error code.
1022  */
1023 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1024 				      const struct in6_addr *final_dst)
1025 {
1026 	struct dst_entry *dst = NULL;
1027 	int err;
1028 
1029 	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1030 	if (err)
1031 		return ERR_PTR(err);
1032 	if (final_dst)
1033 		fl6->daddr = *final_dst;
1034 	if (!fl6->flowi6_oif)
1035 		fl6->flowi6_oif = dst->dev->ifindex;
1036 
1037 	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1038 }
1039 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1040 
1041 /**
1042  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1043  *	@sk: socket which provides the dst cache and route info
1044  *	@fl6: flow to lookup
1045  *	@final_dst: final destination address for ipsec lookup
1046  *
1047  *	This function performs a route lookup on the given flow with the
1048  *	possibility of using the cached route in the socket if it is valid.
1049  *	It will take the socket dst lock when operating on the dst cache.
1050  *	As a result, this function can only be used in process context.
1051  *
1052  *	It returns a valid dst pointer on success, or a pointer encoded
1053  *	error code.
1054  */
1055 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1056 					 const struct in6_addr *final_dst)
1057 {
1058 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1059 	int err;
1060 
1061 	dst = ip6_sk_dst_check(sk, dst, fl6);
1062 
1063 	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1064 	if (err)
1065 		return ERR_PTR(err);
1066 	if (final_dst)
1067 		fl6->daddr = *final_dst;
1068 
1069 	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1070 }
1071 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1072 
1073 static inline int ip6_ufo_append_data(struct sock *sk,
1074 			struct sk_buff_head *queue,
1075 			int getfrag(void *from, char *to, int offset, int len,
1076 			int odd, struct sk_buff *skb),
1077 			void *from, int length, int hh_len, int fragheaderlen,
1078 			int transhdrlen, int mtu, unsigned int flags,
1079 			const struct flowi6 *fl6)
1080 
1081 {
1082 	struct sk_buff *skb;
1083 	int err;
1084 
1085 	/* The network device supports UDP large send offload,
1086 	 * so create one single skb containing the complete
1087 	 * UDP datagram.
1088 	 */
1089 	skb = skb_peek_tail(queue);
1090 	if (!skb) {
1091 		skb = sock_alloc_send_skb(sk,
1092 			hh_len + fragheaderlen + transhdrlen + 20,
1093 			(flags & MSG_DONTWAIT), &err);
1094 		if (!skb)
1095 			return err;
1096 
1097 		/* reserve space for Hardware header */
1098 		skb_reserve(skb, hh_len);
1099 
1100 		/* create space for UDP/IP header */
1101 		skb_put(skb, fragheaderlen + transhdrlen);
1102 
1103 		/* initialize network header pointer */
1104 		skb_reset_network_header(skb);
1105 
1106 		/* initialize protocol header pointer */
1107 		skb->transport_header = skb->network_header + fragheaderlen;
1108 
1109 		skb->protocol = htons(ETH_P_IPV6);
1110 		skb->csum = 0;
1111 
1112 		__skb_queue_tail(queue, skb);
1113 	} else if (skb_is_gso(skb)) {
1114 		goto append;
1115 	}
1116 
1117 	skb->ip_summed = CHECKSUM_PARTIAL;
1118 	/* Specify the length of each IPv6 datagram fragment.
1119 	 * It has to be a multiple of 8.
1120 	 */
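	/* e.g. mtu 1500 with a plain 40 byte IPv6 header
	 * (fragheaderlen == 40): (1500 - 40 - 8) & ~7 == 1448
	 * bytes of payload per fragment.
	 */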
1121 	skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1122 				     sizeof(struct frag_hdr)) & ~7;
1123 	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1124 	skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
1125 							 &fl6->daddr,
1126 							 &fl6->saddr);
1127 
1128 append:
1129 	return skb_append_datato_frags(sk, skb, getfrag, from,
1130 				       (length - transhdrlen));
1131 }
1132 
1133 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1134 					       gfp_t gfp)
1135 {
1136 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1137 }
1138 
1139 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1140 						gfp_t gfp)
1141 {
1142 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1143 }
1144 
1145 static void ip6_append_data_mtu(unsigned int *mtu,
1146 				int *maxfraglen,
1147 				unsigned int fragheaderlen,
1148 				struct sk_buff *skb,
1149 				struct rt6_info *rt,
1150 				unsigned int orig_mtu)
1151 {
1152 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1153 		if (!skb) {
1154 			/* first fragment, reserve header_len */
1155 			*mtu = orig_mtu - rt->dst.header_len;
1156 
1157 		} else {
1158 			/*
1159 			 * this fragment is not the first, so the header
1160 			 * space is regarded as data space.
1161 			 */
1162 			*mtu = orig_mtu;
1163 		}
1164 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1165 			      + fragheaderlen - sizeof(struct frag_hdr);
1166 	}
1167 }
1168 
1169 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1170 			  struct inet6_cork *v6_cork,
1171 			  int hlimit, int tclass, struct ipv6_txoptions *opt,
1172 			  struct rt6_info *rt, struct flowi6 *fl6)
1173 {
1174 	struct ipv6_pinfo *np = inet6_sk(sk);
1175 	unsigned int mtu;
1176 
1177 	/*
1178 	 * setup for corking
1179 	 */
1180 	if (opt) {
1181 		if (WARN_ON(v6_cork->opt))
1182 			return -EINVAL;
1183 
1184 		v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
1185 		if (unlikely(!v6_cork->opt))
1186 			return -ENOBUFS;
1187 
1188 		v6_cork->opt->tot_len = opt->tot_len;
1189 		v6_cork->opt->opt_flen = opt->opt_flen;
1190 		v6_cork->opt->opt_nflen = opt->opt_nflen;
1191 
1192 		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1193 						    sk->sk_allocation);
1194 		if (opt->dst0opt && !v6_cork->opt->dst0opt)
1195 			return -ENOBUFS;
1196 
1197 		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1198 						    sk->sk_allocation);
1199 		if (opt->dst1opt && !v6_cork->opt->dst1opt)
1200 			return -ENOBUFS;
1201 
1202 		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1203 						   sk->sk_allocation);
1204 		if (opt->hopopt && !v6_cork->opt->hopopt)
1205 			return -ENOBUFS;
1206 
1207 		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1208 						    sk->sk_allocation);
1209 		if (opt->srcrt && !v6_cork->opt->srcrt)
1210 			return -ENOBUFS;
1211 
1212 		/* need source address above --miyazawa */
1213 	}
1214 	dst_hold(&rt->dst);
1215 	cork->base.dst = &rt->dst;
1216 	cork->fl.u.ip6 = *fl6;
1217 	v6_cork->hop_limit = hlimit;
1218 	v6_cork->tclass = tclass;
1219 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1220 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1221 		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
1222 	else
1223 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1224 		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1225 	if (np->frag_size < mtu) {
1226 		if (np->frag_size)
1227 			mtu = np->frag_size;
1228 	}
1229 	cork->base.fragsize = mtu;
1230 	if (dst_allfrag(rt->dst.path))
1231 		cork->base.flags |= IPCORK_ALLFRAG;
1232 	cork->base.length = 0;
1233 
1234 	return 0;
1235 }
1236 
1237 static int __ip6_append_data(struct sock *sk,
1238 			     struct flowi6 *fl6,
1239 			     struct sk_buff_head *queue,
1240 			     struct inet_cork *cork,
1241 			     struct inet6_cork *v6_cork,
1242 			     struct page_frag *pfrag,
1243 			     int getfrag(void *from, char *to, int offset,
1244 					 int len, int odd, struct sk_buff *skb),
1245 			     void *from, int length, int transhdrlen,
1246 			     unsigned int flags, int dontfrag)
1247 {
1248 	struct sk_buff *skb, *skb_prev = NULL;
1249 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
1250 	int exthdrlen = 0;
1251 	int dst_exthdrlen = 0;
1252 	int hh_len;
1253 	int copy;
1254 	int err;
1255 	int offset = 0;
1256 	__u8 tx_flags = 0;
1257 	u32 tskey = 0;
1258 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1259 	struct ipv6_txoptions *opt = v6_cork->opt;
1260 	int csummode = CHECKSUM_NONE;
1261 
1262 	skb = skb_peek_tail(queue);
1263 	if (!skb) {
1264 		exthdrlen = opt ? opt->opt_flen : 0;
1265 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1266 	}
1267 
1268 	mtu = cork->fragsize;
1269 	orig_mtu = mtu;
1270 
1271 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1272 
1273 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1274 			(opt ? opt->opt_nflen : 0);
1275 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1276 		     sizeof(struct frag_hdr);
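	/* e.g. mtu 1500, fragheaderlen 40: maxfraglen is
	 * ((1460 & ~7) + 40) - 8 == 1488 bytes of IPv6 packet per
	 * fragment, i.e. 1496 bytes on the wire once the 8 byte
	 * fragment header is pushed back in.
	 */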
1277 
1278 	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1279 		unsigned int maxnonfragsize, headersize;
1280 
1281 		headersize = sizeof(struct ipv6hdr) +
1282 			     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1283 			     (dst_allfrag(&rt->dst) ?
1284 			      sizeof(struct frag_hdr) : 0) +
1285 			     rt->rt6i_nfheader_len;
1286 
1287 		if (ip6_sk_ignore_df(sk))
1288 			maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1289 		else
1290 			maxnonfragsize = mtu;
1291 
1292 		/* dontfrag active */
1293 		if ((cork->length + length > mtu - headersize) && dontfrag &&
1294 		    (sk->sk_protocol == IPPROTO_UDP ||
1295 		     sk->sk_protocol == IPPROTO_RAW)) {
1296 			ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1297 						   sizeof(struct ipv6hdr));
1298 			goto emsgsize;
1299 		}
1300 
1301 		if (cork->length + length > maxnonfragsize - headersize) {
1302 emsgsize:
1303 			ipv6_local_error(sk, EMSGSIZE, fl6,
1304 					 mtu - headersize +
1305 					 sizeof(struct ipv6hdr));
1306 			return -EMSGSIZE;
1307 		}
1308 	}
1309 
1310 	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1311 		sock_tx_timestamp(sk, &tx_flags);
1312 		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1313 		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1314 			tskey = sk->sk_tskey++;
1315 	}
1316 
1317 	/* If this is the first and only packet and device
1318 	 * supports checksum offloading, let's use it.
1319 	 * Use transhdrlen, same as IPv4, because partial
1320 	 * sums only work when transhdrlen is set.
1321 	 */
1322 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1323 	    length + fragheaderlen < mtu &&
1324 	    rt->dst.dev->features & NETIF_F_V6_CSUM &&
1325 	    !exthdrlen)
1326 		csummode = CHECKSUM_PARTIAL;
1327 	/*
1328 	 * Let's try using as much space as possible.
1329 	 * Use MTU if total length of the message fits into the MTU.
1330 	 * Otherwise, we need to reserve fragment header and
1331 	 * fragment alignment (= 8-15 octets, in total).
1332 	 *
1333 	 * Note that we may need to "move" the data from the tail
1334 	 * of the buffer to the new fragment when we split
1335 	 * the message.
1336 	 *
1337 	 * FIXME: It may be fragmented into multiple chunks
1338 	 *        at once if non-fragmentable extension headers
1339 	 *        are too large.
1340 	 * --yoshfuji
1341 	 */
1342 
1343 	cork->length += length;
1344 	if (((length > mtu) ||
1345 	     (skb && skb_is_gso(skb))) &&
1346 	    (sk->sk_protocol == IPPROTO_UDP) &&
1347 	    (rt->dst.dev->features & NETIF_F_UFO) &&
1348 	    (sk->sk_type == SOCK_DGRAM)) {
1349 		err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1350 					  hh_len, fragheaderlen,
1351 					  transhdrlen, mtu, flags, fl6);
1352 		if (err)
1353 			goto error;
1354 		return 0;
1355 	}
1356 
1357 	if (!skb)
1358 		goto alloc_new_skb;
1359 
1360 	while (length > 0) {
1361 		/* Check if the remaining data fits into current packet. */
1362 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1363 		if (copy < length)
1364 			copy = maxfraglen - skb->len;
1365 
1366 		if (copy <= 0) {
1367 			char *data;
1368 			unsigned int datalen;
1369 			unsigned int fraglen;
1370 			unsigned int fraggap;
1371 			unsigned int alloclen;
1372 alloc_new_skb:
1373 			/* There's no room in the current skb */
1374 			if (skb)
1375 				fraggap = skb->len - maxfraglen;
1376 			else
1377 				fraggap = 0;
1378 			/* update mtu and maxfraglen if necessary */
1379 			if (!skb || !skb_prev)
1380 				ip6_append_data_mtu(&mtu, &maxfraglen,
1381 						    fragheaderlen, skb, rt,
1382 						    orig_mtu);
1383 
1384 			skb_prev = skb;
1385 
1386 			/*
1387 			 * If remaining data exceeds the mtu,
1388 			 * we know we need more fragment(s).
1389 			 */
1390 			datalen = length + fraggap;
1391 
1392 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1393 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1394 			if ((flags & MSG_MORE) &&
1395 			    !(rt->dst.dev->features&NETIF_F_SG))
1396 				alloclen = mtu;
1397 			else
1398 				alloclen = datalen + fragheaderlen;
1399 
1400 			alloclen += dst_exthdrlen;
1401 
1402 			if (datalen != length + fraggap) {
1403 				/*
1404 				 * this is not the last fragment, so the trailer
1405 				 * space is regarded as data space.
1406 				 */
1407 				datalen += rt->dst.trailer_len;
1408 			}
1409 
1410 			alloclen += rt->dst.trailer_len;
1411 			fraglen = datalen + fragheaderlen;
1412 
1413 			/*
1414 			 * We just reserve space for the fragment header.
1415 			 * Note: this may be an overallocation if the message
1416 			 * (without MSG_MORE) fits into the MTU.
1417 			 */
1418 			alloclen += sizeof(struct frag_hdr);
1419 
1420 			if (transhdrlen) {
1421 				skb = sock_alloc_send_skb(sk,
1422 						alloclen + hh_len,
1423 						(flags & MSG_DONTWAIT), &err);
1424 			} else {
1425 				skb = NULL;
1426 				if (atomic_read(&sk->sk_wmem_alloc) <=
1427 				    2 * sk->sk_sndbuf)
1428 					skb = sock_wmalloc(sk,
1429 							   alloclen + hh_len, 1,
1430 							   sk->sk_allocation);
1431 				if (unlikely(!skb))
1432 					err = -ENOBUFS;
1433 			}
1434 			if (!skb)
1435 				goto error;
1436 			/*
1437 			 *	Fill in the control structures
1438 			 */
1439 			skb->protocol = htons(ETH_P_IPV6);
1440 			skb->ip_summed = csummode;
1441 			skb->csum = 0;
1442 			/* reserve for fragmentation and ipsec header */
1443 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1444 				    dst_exthdrlen);
1445 
1446 			/* Only the initial fragment is time stamped */
1447 			skb_shinfo(skb)->tx_flags = tx_flags;
1448 			tx_flags = 0;
1449 			skb_shinfo(skb)->tskey = tskey;
1450 			tskey = 0;
1451 
1452 			/*
1453 			 *	Find where to start putting bytes
1454 			 */
1455 			data = skb_put(skb, fraglen);
1456 			skb_set_network_header(skb, exthdrlen);
1457 			data += fragheaderlen;
1458 			skb->transport_header = (skb->network_header +
1459 						 fragheaderlen);
1460 			if (fraggap) {
1461 				skb->csum = skb_copy_and_csum_bits(
1462 					skb_prev, maxfraglen,
1463 					data + transhdrlen, fraggap, 0);
1464 				skb_prev->csum = csum_sub(skb_prev->csum,
1465 							  skb->csum);
1466 				data += fraggap;
1467 				pskb_trim_unique(skb_prev, maxfraglen);
1468 			}
1469 			copy = datalen - transhdrlen - fraggap;
1470 
1471 			if (copy < 0) {
1472 				err = -EINVAL;
1473 				kfree_skb(skb);
1474 				goto error;
1475 			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1476 				err = -EFAULT;
1477 				kfree_skb(skb);
1478 				goto error;
1479 			}
1480 
1481 			offset += copy;
1482 			length -= datalen - fraggap;
1483 			transhdrlen = 0;
1484 			exthdrlen = 0;
1485 			dst_exthdrlen = 0;
1486 
1487 			/*
1488 			 * Put the packet on the pending queue
1489 			 */
1490 			__skb_queue_tail(queue, skb);
1491 			continue;
1492 		}
1493 
1494 		if (copy > length)
1495 			copy = length;
1496 
1497 		if (!(rt->dst.dev->features&NETIF_F_SG)) {
1498 			unsigned int off;
1499 
1500 			off = skb->len;
1501 			if (getfrag(from, skb_put(skb, copy),
1502 						offset, copy, off, skb) < 0) {
1503 				__skb_trim(skb, off);
1504 				err = -EFAULT;
1505 				goto error;
1506 			}
1507 		} else {
1508 			int i = skb_shinfo(skb)->nr_frags;
1509 
1510 			err = -ENOMEM;
1511 			if (!sk_page_frag_refill(sk, pfrag))
1512 				goto error;
1513 
1514 			if (!skb_can_coalesce(skb, i, pfrag->page,
1515 					      pfrag->offset)) {
1516 				err = -EMSGSIZE;
1517 				if (i == MAX_SKB_FRAGS)
1518 					goto error;
1519 
1520 				__skb_fill_page_desc(skb, i, pfrag->page,
1521 						     pfrag->offset, 0);
1522 				skb_shinfo(skb)->nr_frags = ++i;
1523 				get_page(pfrag->page);
1524 			}
1525 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1526 			if (getfrag(from,
1527 				    page_address(pfrag->page) + pfrag->offset,
1528 				    offset, copy, skb->len, skb) < 0)
1529 				goto error_efault;
1530 
1531 			pfrag->offset += copy;
1532 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1533 			skb->len += copy;
1534 			skb->data_len += copy;
1535 			skb->truesize += copy;
1536 			atomic_add(copy, &sk->sk_wmem_alloc);
1537 		}
1538 		offset += copy;
1539 		length -= copy;
1540 	}
1541 
1542 	return 0;
1543 
1544 error_efault:
1545 	err = -EFAULT;
1546 error:
1547 	cork->length -= length;
1548 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1549 	return err;
1550 }
1551 
1552 int ip6_append_data(struct sock *sk,
1553 		    int getfrag(void *from, char *to, int offset, int len,
1554 				int odd, struct sk_buff *skb),
1555 		    void *from, int length, int transhdrlen, int hlimit,
1556 		    int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1557 		    struct rt6_info *rt, unsigned int flags, int dontfrag)
1558 {
1559 	struct inet_sock *inet = inet_sk(sk);
1560 	struct ipv6_pinfo *np = inet6_sk(sk);
1561 	int exthdrlen;
1562 	int err;
1563 
1564 	if (flags&MSG_PROBE)
1565 		return 0;
1566 	if (skb_queue_empty(&sk->sk_write_queue)) {
1567 		/*
1568 		 * setup for corking
1569 		 */
1570 		err = ip6_setup_cork(sk, &inet->cork, &np->cork, hlimit,
1571 				     tclass, opt, rt, fl6);
1572 		if (err)
1573 			return err;
1574 
1575 		exthdrlen = (opt ? opt->opt_flen : 0);
1576 		length += exthdrlen;
1577 		transhdrlen += exthdrlen;
1578 	} else {
1579 		fl6 = &inet->cork.fl.u.ip6;
1580 		transhdrlen = 0;
1581 	}
1582 
1583 	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1584 				 &np->cork, sk_page_frag(sk), getfrag,
1585 				 from, length, transhdrlen, flags, dontfrag);
1586 }
1587 EXPORT_SYMBOL_GPL(ip6_append_data);
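/* Sketch of the typical corked-send pattern (cf. l2tp_ip6_sendmsg();
 * UDP and raw sockets follow the same shape but push through wrappers
 * that fill in their own checksum first; illustrative only, flow and
 * route setup omitted):
 *
 *	err = ip6_append_data(sk, ip_generic_getfrag, msg, len,
 *			      transhdrlen, hlimit, tclass, opt, &fl6,
 *			      rt, msg->msg_flags, dontfrag);
 *	if (err)
 *		ip6_flush_pending_frames(sk);
 *	else if (!(msg->msg_flags & MSG_MORE))
 *		err = ip6_push_pending_frames(sk);
 */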
1588 
1589 static void ip6_cork_release(struct inet_cork_full *cork,
1590 			     struct inet6_cork *v6_cork)
1591 {
1592 	if (v6_cork->opt) {
1593 		kfree(v6_cork->opt->dst0opt);
1594 		kfree(v6_cork->opt->dst1opt);
1595 		kfree(v6_cork->opt->hopopt);
1596 		kfree(v6_cork->opt->srcrt);
1597 		kfree(v6_cork->opt);
1598 		v6_cork->opt = NULL;
1599 	}
1600 
1601 	if (cork->base.dst) {
1602 		dst_release(cork->base.dst);
1603 		cork->base.dst = NULL;
1604 		cork->base.flags &= ~IPCORK_ALLFRAG;
1605 	}
1606 	memset(&cork->fl, 0, sizeof(cork->fl));
1607 }
1608 
1609 struct sk_buff *__ip6_make_skb(struct sock *sk,
1610 			       struct sk_buff_head *queue,
1611 			       struct inet_cork_full *cork,
1612 			       struct inet6_cork *v6_cork)
1613 {
1614 	struct sk_buff *skb, *tmp_skb;
1615 	struct sk_buff **tail_skb;
1616 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1617 	struct ipv6_pinfo *np = inet6_sk(sk);
1618 	struct net *net = sock_net(sk);
1619 	struct ipv6hdr *hdr;
1620 	struct ipv6_txoptions *opt = v6_cork->opt;
1621 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1622 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1623 	unsigned char proto = fl6->flowi6_proto;
1624 
1625 	skb = __skb_dequeue(queue);
1626 	if (!skb)
1627 		goto out;
1628 	tail_skb = &(skb_shinfo(skb)->frag_list);
1629 
1630 	/* move skb->data to ip header from ext header */
1631 	if (skb->data < skb_network_header(skb))
1632 		__skb_pull(skb, skb_network_offset(skb));
1633 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1634 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1635 		*tail_skb = tmp_skb;
1636 		tail_skb = &(tmp_skb->next);
1637 		skb->len += tmp_skb->len;
1638 		skb->data_len += tmp_skb->len;
1639 		skb->truesize += tmp_skb->truesize;
1640 		tmp_skb->destructor = NULL;
1641 		tmp_skb->sk = NULL;
1642 	}
1643 
1644 	/* Allow local fragmentation. */
1645 	skb->ignore_df = ip6_sk_ignore_df(sk);
1646 
1647 	*final_dst = fl6->daddr;
1648 	__skb_pull(skb, skb_network_header_len(skb));
1649 	if (opt && opt->opt_flen)
1650 		ipv6_push_frag_opts(skb, opt, &proto);
1651 	if (opt && opt->opt_nflen)
1652 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1653 
1654 	skb_push(skb, sizeof(struct ipv6hdr));
1655 	skb_reset_network_header(skb);
1656 	hdr = ipv6_hdr(skb);
1657 
1658 	ip6_flow_hdr(hdr, v6_cork->tclass,
1659 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1660 					np->autoflowlabel, fl6));
1661 	hdr->hop_limit = v6_cork->hop_limit;
1662 	hdr->nexthdr = proto;
1663 	hdr->saddr = fl6->saddr;
1664 	hdr->daddr = *final_dst;
1665 
1666 	skb->priority = sk->sk_priority;
1667 	skb->mark = sk->sk_mark;
1668 
1669 	skb_dst_set(skb, dst_clone(&rt->dst));
1670 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1671 	if (proto == IPPROTO_ICMPV6) {
1672 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1673 
1674 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1675 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1676 	}
1677 
1678 	ip6_cork_release(cork, v6_cork);
1679 out:
1680 	return skb;
1681 }
1682 
1683 int ip6_send_skb(struct sk_buff *skb)
1684 {
1685 	struct net *net = sock_net(skb->sk);
1686 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1687 	int err;
1688 
1689 	err = ip6_local_out(skb);
1690 	if (err) {
1691 		if (err > 0)
1692 			err = net_xmit_errno(err);
1693 		if (err)
1694 			IP6_INC_STATS(net, rt->rt6i_idev,
1695 				      IPSTATS_MIB_OUTDISCARDS);
1696 	}
1697 
1698 	return err;
1699 }
1700 
1701 int ip6_push_pending_frames(struct sock *sk)
1702 {
1703 	struct sk_buff *skb;
1704 
1705 	skb = ip6_finish_skb(sk);
1706 	if (!skb)
1707 		return 0;
1708 
1709 	return ip6_send_skb(skb);
1710 }
1711 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1712 
1713 static void __ip6_flush_pending_frames(struct sock *sk,
1714 				       struct sk_buff_head *queue,
1715 				       struct inet_cork_full *cork,
1716 				       struct inet6_cork *v6_cork)
1717 {
1718 	struct sk_buff *skb;
1719 
1720 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1721 		if (skb_dst(skb))
1722 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1723 				      IPSTATS_MIB_OUTDISCARDS);
1724 		kfree_skb(skb);
1725 	}
1726 
1727 	ip6_cork_release(cork, v6_cork);
1728 }
1729 
1730 void ip6_flush_pending_frames(struct sock *sk)
1731 {
1732 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1733 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1734 }
1735 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1736 
1737 struct sk_buff *ip6_make_skb(struct sock *sk,
1738 			     int getfrag(void *from, char *to, int offset,
1739 					 int len, int odd, struct sk_buff *skb),
1740 			     void *from, int length, int transhdrlen,
1741 			     int hlimit, int tclass,
1742 			     struct ipv6_txoptions *opt, struct flowi6 *fl6,
1743 			     struct rt6_info *rt, unsigned int flags,
1744 			     int dontfrag)
1745 {
1746 	struct inet_cork_full cork;
1747 	struct inet6_cork v6_cork;
1748 	struct sk_buff_head queue;
1749 	int exthdrlen = (opt ? opt->opt_flen : 0);
1750 	int err;
1751 
1752 	if (flags & MSG_PROBE)
1753 		return NULL;
1754 
1755 	__skb_queue_head_init(&queue);
1756 
1757 	cork.base.flags = 0;
1758 	cork.base.addr = 0;
1759 	cork.base.opt = NULL;
1760 	v6_cork.opt = NULL;
1761 	err = ip6_setup_cork(sk, &cork, &v6_cork, hlimit, tclass, opt, rt, fl6);
1762 	if (err)
1763 		return ERR_PTR(err);
1764 
1765 	if (dontfrag < 0)
1766 		dontfrag = inet6_sk(sk)->dontfrag;
1767 
1768 	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1769 				&current->task_frag, getfrag, from,
1770 				length + exthdrlen, transhdrlen + exthdrlen,
1771 				flags, dontfrag);
1772 	if (err) {
1773 		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1774 		return ERR_PTR(err);
1775 	}
1776 
1777 	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1778 }
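
/* ip6_make_skb() is the lock-free, single-shot counterpart of the
 * corked path: it collects the datagram on a private queue and hands
 * back the finished skb (or an ERR_PTR). Roughly the udpv6_sendmsg()
 * fast path (illustrative only):
 *
 *	skb = ip6_make_skb(sk, getfrag, msg, ulen, sizeof(struct udphdr),
 *			   hlimit, tclass, opt, &fl6, rt,
 *			   msg->msg_flags, dontfrag);
 *	err = PTR_ERR(skb);
 *	if (!IS_ERR_OR_NULL(skb))
 *		err = udp_v6_send_skb(skb, &fl6);
 */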
1779