xref: /linux/net/ipv6/ip6_output.c (revision 89e47d3b8a273b0eac21e4bf6d7fdb86b654fa16)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	: 	frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44 
45 #include <net/sock.h>
46 #include <net/snmp.h>
47 
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58 
59 static int ip6_finish_output2(struct sk_buff *skb)
60 {
61 	struct dst_entry *dst = skb_dst(skb);
62 	struct net_device *dev = dst->dev;
63 	struct neighbour *neigh;
64 	struct in6_addr *nexthop;
65 	int ret;
66 
67 	skb->protocol = htons(ETH_P_IPV6);
68 	skb->dev = dev;
69 
70 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
71 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
72 
73 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
74 		    ((mroute6_socket(dev_net(dev), skb) &&
75 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
76 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
77 					 &ipv6_hdr(skb)->saddr))) {
78 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
79 
80 			/* Do not check for IFF_ALLMULTI; multicast routing
81 			 * is not supported in any case.
82 			 */
83 			if (newskb)
84 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
85 					newskb, NULL, newskb->dev,
86 					dev_loopback_xmit);
87 
88 			if (ipv6_hdr(skb)->hop_limit == 0) {
89 				IP6_INC_STATS(dev_net(dev), idev,
90 					      IPSTATS_MIB_OUTDISCARDS);
91 				kfree_skb(skb);
92 				return 0;
93 			}
94 		}
95 
96 		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
97 				skb->len);
98 
99 		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
100 		    IPV6_ADDR_SCOPE_NODELOCAL &&
101 		    !(dev->flags & IFF_LOOPBACK)) {
102 			kfree_skb(skb);
103 			return 0;
104 		}
105 	}
106 
107 	rcu_read_lock_bh();
108 	nexthop = rt6_nexthop((struct rt6_info *)dst);
109 	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
110 	if (unlikely(!neigh))
111 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
112 	if (!IS_ERR(neigh)) {
113 		ret = dst_neigh_output(dst, neigh, skb);
114 		rcu_read_unlock_bh();
115 		return ret;
116 	}
117 	rcu_read_unlock_bh();
118 
119 	IP6_INC_STATS(dev_net(dst->dev),
120 		      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
121 	kfree_skb(skb);
122 	return -EINVAL;
123 }
124 
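/*
 * Choose between direct transmission and fragmentation. We fragment
 * when the packet exceeds the path MTU and is not GSO (GSO packets are
 * segmented further down the stack), when the route demands a fragment
 * header on every packet (dst_allfrag(), typically set after a Packet
 * Too Big reported an MTU below IPV6_MIN_MTU), or when a previously
 * defragmented packet (frag_max_size set, e.g. by netfilter defrag) is
 * larger than the biggest fragment it arrived in and must be
 * re-fragmented on output.
 */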
125 static int ip6_finish_output(struct sk_buff *skb)
126 {
127 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
128 	    dst_allfrag(skb_dst(skb)) ||
129 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
130 		return ip6_fragment(skb, ip6_finish_output2);
131 	else
132 		return ip6_finish_output2(skb);
133 }
134 
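/*
 * IP6SKB_REROUTED marks packets that already traversed the
 * POST_ROUTING hook once and were then rerouted (typically on the xfrm
 * output path); for those, NF_HOOK_COND skips the hook and calls
 * ip6_finish_output() directly rather than running the same chain
 * twice.
 */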
135 int ip6_output(struct sk_buff *skb)
136 {
137 	struct net_device *dev = skb_dst(skb)->dev;
138 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
139 	if (unlikely(idev->cnf.disable_ipv6)) {
140 		IP6_INC_STATS(dev_net(dev), idev,
141 			      IPSTATS_MIB_OUTDISCARDS);
142 		kfree_skb(skb);
143 		return 0;
144 	}
145 
146 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
147 			    ip6_finish_output,
148 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
149 }
150 
151 /*
152  *	Transmit an sk_buff (used by TCP, SCTP and DCCP).
153  */
154 
155 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
156 	     struct ipv6_txoptions *opt, int tclass)
157 {
158 	struct net *net = sock_net(sk);
159 	struct ipv6_pinfo *np = inet6_sk(sk);
160 	struct in6_addr *first_hop = &fl6->daddr;
161 	struct dst_entry *dst = skb_dst(skb);
162 	struct ipv6hdr *hdr;
163 	u8  proto = fl6->flowi6_proto;
164 	int seg_len = skb->len;
165 	int hlimit = -1;
166 	u32 mtu;
167 
168 	if (opt) {
169 		unsigned int head_room;
170 
171 		/* First: extension headers may take a lot of space
172 		 * (~8K for now); MAX_HEADER is not enough.
173 		 */
174 		head_room = opt->opt_nflen + opt->opt_flen;
175 		seg_len += head_room;
176 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
177 
178 		if (skb_headroom(skb) < head_room) {
179 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
180 			if (skb2 == NULL) {
181 				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
182 					      IPSTATS_MIB_OUTDISCARDS);
183 				kfree_skb(skb);
184 				return -ENOBUFS;
185 			}
186 			consume_skb(skb);
187 			skb = skb2;
188 			skb_set_owner_w(skb, sk);
189 		}
190 		if (opt->opt_flen)
191 			ipv6_push_frag_opts(skb, opt, &proto);
192 		if (opt->opt_nflen)
193 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
194 	}
195 
196 	skb_push(skb, sizeof(struct ipv6hdr));
197 	skb_reset_network_header(skb);
198 	hdr = ipv6_hdr(skb);
199 
200 	/*
201 	 *	Fill in the IPv6 header
202 	 */
203 	if (np)
204 		hlimit = np->hop_limit;
205 	if (hlimit < 0)
206 		hlimit = ip6_dst_hoplimit(dst);
207 
208 	ip6_flow_hdr(hdr, tclass, fl6->flowlabel);
209 
210 	hdr->payload_len = htons(seg_len);
211 	hdr->nexthdr = proto;
212 	hdr->hop_limit = hlimit;
213 
214 	hdr->saddr = fl6->saddr;
215 	hdr->daddr = *first_hop;
216 
217 	skb->protocol = htons(ETH_P_IPV6);
218 	skb->priority = sk->sk_priority;
219 	skb->mark = sk->sk_mark;
220 
221 	mtu = dst_mtu(dst);
222 	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
223 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
224 			      IPSTATS_MIB_OUT, skb->len);
225 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
226 			       dst->dev, dst_output);
227 	}
228 
229 	skb->dev = dst->dev;
230 	ipv6_local_error(sk, EMSGSIZE, fl6, mtu);
231 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
232 	kfree_skb(skb);
233 	return -EMSGSIZE;
234 }
235 
236 EXPORT_SYMBOL(ip6_xmit);
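/*
 * Illustrative sketch (not compiled): roughly how a connection-oriented
 * transport hands a fully built segment to ip6_xmit(). The caller is
 * responsible for attaching a routed dst to the skb first; the name
 * example_xmit and the surrounding setup are hypothetical.
 */
#if 0
static int example_xmit(struct sock *sk, struct sk_buff *skb,
			struct flowi6 *fl6, struct dst_entry *dst)
{
	struct ipv6_pinfo *np = inet6_sk(sk);

	/* ip6_xmit() reads the route from skb_dst(), so attach it */
	skb_dst_set(skb, dst_clone(dst));

	/* prepends the IPv6 header, then hands the packet to
	 * NF_INET_LOCAL_OUT and dst_output() */
	return ip6_xmit(sk, skb, fl6, np->opt, np->tclass);
}
#endif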
237 
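/*
 * Deliver a Router Alert packet to every raw socket registered for
 * this alert value. Every matching socket except the last receives a
 * clone; the last match gets the original skb, saving one copy.
 * Returns 1 if the skb was consumed by a socket, 0 otherwise.
 */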
238 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
239 {
240 	struct ip6_ra_chain *ra;
241 	struct sock *last = NULL;
242 
243 	read_lock(&ip6_ra_lock);
244 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
245 		struct sock *sk = ra->sk;
246 		if (sk && ra->sel == sel &&
247 		    (!sk->sk_bound_dev_if ||
248 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
249 			if (last) {
250 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
251 				if (skb2)
252 					rawv6_rcv(last, skb2);
253 			}
254 			last = sk;
255 		}
256 	}
257 
258 	if (last) {
259 		rawv6_rcv(last, skb);
260 		read_unlock(&ip6_ra_lock);
261 		return 1;
262 	}
263 	read_unlock(&ip6_ra_lock);
264 	return 0;
265 }
266 
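/*
 * Classify a packet whose destination we proxy: returns 1 to hand it
 * to local input (unicast neighbour discovery), 0 to keep forwarding
 * it, and -1 when it must be discarded (link-local destination, after
 * dst_link_failure() has signalled the sender).
 */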
267 static int ip6_forward_proxy_check(struct sk_buff *skb)
268 {
269 	struct ipv6hdr *hdr = ipv6_hdr(skb);
270 	u8 nexthdr = hdr->nexthdr;
271 	__be16 frag_off;
272 	int offset;
273 
274 	if (ipv6_ext_hdr(nexthdr)) {
275 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
276 		if (offset < 0)
277 			return 0;
278 	} else
279 		offset = sizeof(struct ipv6hdr);
280 
281 	if (nexthdr == IPPROTO_ICMPV6) {
282 		struct icmp6hdr *icmp6;
283 
284 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
285 					 offset + 1 - skb->data)))
286 			return 0;
287 
288 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
289 
290 		switch (icmp6->icmp6_type) {
291 		case NDISC_ROUTER_SOLICITATION:
292 		case NDISC_ROUTER_ADVERTISEMENT:
293 		case NDISC_NEIGHBOUR_SOLICITATION:
294 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
295 		case NDISC_REDIRECT:
296 			/* Unicast neighbour discovery messages destined
297 			 * to the proxied address are passed to the input
298 			 * function.
299 			 */
300 			return 1;
301 		default:
302 			break;
303 		}
304 	}
305 
306 	/*
307 	 * The proxying router can't forward traffic sent to a link-local
308 	 * address, so signal the sender and discard the packet. This
309 	 * behavior is clarified by the MIPv6 specification.
310 	 */
311 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
312 		dst_link_failure(skb);
313 		return -1;
314 	}
315 
316 	return 0;
317 }
318 
319 static inline int ip6_forward_finish(struct sk_buff *skb)
320 {
321 	return dst_output(skb);
322 }
323 
324 int ip6_forward(struct sk_buff *skb)
325 {
326 	struct dst_entry *dst = skb_dst(skb);
327 	struct ipv6hdr *hdr = ipv6_hdr(skb);
328 	struct inet6_skb_parm *opt = IP6CB(skb);
329 	struct net *net = dev_net(dst->dev);
330 	u32 mtu;
331 
332 	if (net->ipv6.devconf_all->forwarding == 0)
333 		goto error;
334 
335 	if (skb_warn_if_lro(skb))
336 		goto drop;
337 
338 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
339 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
340 				 IPSTATS_MIB_INDISCARDS);
341 		goto drop;
342 	}
343 
344 	if (skb->pkt_type != PACKET_HOST)
345 		goto drop;
346 
347 	skb_forward_csum(skb);
348 
349 	/*
350 	 *	We do NOT do any processing on RA packets;
351 	 *	we push them to user level AS IS, with no
352 	 *	guarantee that the application will be able
353 	 *	to interpret them. The reason is that we
354 	 *	cannot do anything clever here.
355 	 *
356 	 *	We are not the end node, so if the packet
357 	 *	contains AH/ESP, we cannot do anything with it.
358 	 *	Defragmentation would also be a mistake; RA packets
359 	 *	cannot be fragmented, because there is no guarantee
360 	 *	that different fragments will go along one path. --ANK
361 	 */
362 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
363 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
364 			return 0;
365 	}
366 
367 	/*
368 	 *	check and decrement ttl
369 	 */
370 	if (hdr->hop_limit <= 1) {
371 		/* Force OUTPUT device used as source address */
372 		skb->dev = dst->dev;
373 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
374 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
375 				 IPSTATS_MIB_INHDRERRORS);
376 
377 		kfree_skb(skb);
378 		return -ETIMEDOUT;
379 	}
380 
381 	/* XXX: idev->cnf.proxy_ndp? */
382 	if (net->ipv6.devconf_all->proxy_ndp &&
383 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
384 		int proxied = ip6_forward_proxy_check(skb);
385 		if (proxied > 0)
386 			return ip6_input(skb);
387 		else if (proxied < 0) {
388 			IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
389 					 IPSTATS_MIB_INDISCARDS);
390 			goto drop;
391 		}
392 	}
393 
394 	if (!xfrm6_route_forward(skb)) {
395 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
396 				 IPSTATS_MIB_INDISCARDS);
397 		goto drop;
398 	}
399 	dst = skb_dst(skb);
400 
401 	/* The IPv6 specs say nothing about it, but it is clear that we
402 	 * cannot send redirects to source-routed frames.
403 	 * We don't send redirects to frames decapsulated from IPsec.
404 	 */
405 	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
406 		struct in6_addr *target = NULL;
407 		struct inet_peer *peer;
408 		struct rt6_info *rt;
409 
410 		/*
411 		 *	the incoming and outgoing devices are the same;
412 		 *	send a redirect.
413 		 */
414 
415 		rt = (struct rt6_info *) dst;
416 		if (rt->rt6i_flags & RTF_GATEWAY)
417 			target = &rt->rt6i_gateway;
418 		else
419 			target = &hdr->daddr;
420 
421 		peer = inet_getpeer_v6(net->ipv6.peers, &rt->rt6i_dst.addr, 1);
422 
423 		/* Limit redirects both by destination (here)
424 		   and by source (inside ndisc_send_redirect)
425 		 */
426 		if (inet_peer_xrlim_allow(peer, 1*HZ))
427 			ndisc_send_redirect(skb, target);
428 		if (peer)
429 			inet_putpeer(peer);
430 	} else {
431 		int addrtype = ipv6_addr_type(&hdr->saddr);
432 
433 		/* This check is security critical. */
434 		if (addrtype == IPV6_ADDR_ANY ||
435 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
436 			goto error;
437 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
438 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
439 				    ICMPV6_NOT_NEIGHBOUR, 0);
440 			goto error;
441 		}
442 	}
443 
444 	mtu = dst_mtu(dst);
445 	if (mtu < IPV6_MIN_MTU)
446 		mtu = IPV6_MIN_MTU;
447 
448 	if ((!skb->local_df && skb->len > mtu && !skb_is_gso(skb)) ||
449 	    (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)) {
450 		/* Again, force OUTPUT device used as source address */
451 		skb->dev = dst->dev;
452 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
453 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
454 				 IPSTATS_MIB_INTOOBIGERRORS);
455 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
456 				 IPSTATS_MIB_FRAGFAILS);
457 		kfree_skb(skb);
458 		return -EMSGSIZE;
459 	}
460 
461 	if (skb_cow(skb, dst->dev->hard_header_len)) {
462 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
463 				 IPSTATS_MIB_OUTDISCARDS);
464 		goto drop;
465 	}
466 
467 	hdr = ipv6_hdr(skb);
468 
469 	/* Mangling the hop count is delayed to the point after skb COW */
470 
471 	hdr->hop_limit--;
472 
473 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
474 	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
475 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
476 		       ip6_forward_finish);
477 
478 error:
479 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
480 drop:
481 	kfree_skb(skb);
482 	return -EINVAL;
483 }
484 
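/*
 * Copy per-packet metadata from the original skb to a freshly built
 * fragment so that each fragment is routed, scheduled and tracked
 * exactly like its parent.
 */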
485 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
486 {
487 	to->pkt_type = from->pkt_type;
488 	to->priority = from->priority;
489 	to->protocol = from->protocol;
490 	skb_dst_drop(to);
491 	skb_dst_set(to, dst_clone(skb_dst(from)));
492 	to->dev = from->dev;
493 	to->mark = from->mark;
494 
495 #ifdef CONFIG_NET_SCHED
496 	to->tc_index = from->tc_index;
497 #endif
498 	nf_copy(to, from);
499 #if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
500 	to->nf_trace = from->nf_trace;
501 #endif
502 	skb_copy_secmark(to, from);
503 }
504 
505 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
506 {
507 	struct sk_buff *frag;
508 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
509 	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
510 	struct ipv6hdr *tmp_hdr;
511 	struct frag_hdr *fh;
512 	unsigned int mtu, hlen, left, len;
513 	int hroom, troom;
514 	__be32 frag_id = 0;
515 	int ptr, offset = 0, err = 0;
516 	u8 *prevhdr, nexthdr = 0;
517 	struct net *net = dev_net(skb_dst(skb)->dev);
518 
519 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
520 	nexthdr = *prevhdr;
521 
522 	mtu = ip6_skb_dst_mtu(skb);
523 
524 	/* We must not fragment if the socket is set to force MTU discovery
525 	 * or if the skb is not generated by a local socket.
526 	 */
527 	if (unlikely(!skb->local_df && skb->len > mtu) ||
528 		     (IP6CB(skb)->frag_max_size &&
529 		      IP6CB(skb)->frag_max_size > mtu)) {
530 		if (skb->sk && dst_allfrag(skb_dst(skb)))
531 			sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
532 
533 		skb->dev = skb_dst(skb)->dev;
534 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
535 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
536 			      IPSTATS_MIB_FRAGFAILS);
537 		kfree_skb(skb);
538 		return -EMSGSIZE;
539 	}
540 
541 	if (np && np->frag_size < mtu) {
542 		if (np->frag_size)
543 			mtu = np->frag_size;
544 	}
545 	mtu -= hlen + sizeof(struct frag_hdr);
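	/* For example, with a 1500 byte MTU, a bare 40 byte IPv6
	 * header (hlen == 40) and the 8 byte fragment header, this
	 * leaves 1452 bytes of payload per fragment, which the 8-byte
	 * alignment below rounds down to 1448 on non-final fragments.
	 */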
546 
547 	if (skb_has_frag_list(skb)) {
548 		int first_len = skb_pagelen(skb);
549 		struct sk_buff *frag2;
550 
551 		if (first_len - hlen > mtu ||
552 		    ((first_len - hlen) & 7) ||
553 		    skb_cloned(skb))
554 			goto slow_path;
555 
556 		skb_walk_frags(skb, frag) {
557 			/* Correct geometry. */
558 			if (frag->len > mtu ||
559 			    ((frag->len & 7) && frag->next) ||
560 			    skb_headroom(frag) < hlen)
561 				goto slow_path_clean;
562 
563 			/* Partially cloned skb? */
564 			if (skb_shared(frag))
565 				goto slow_path_clean;
566 
567 			BUG_ON(frag->sk);
568 			if (skb->sk) {
569 				frag->sk = skb->sk;
570 				frag->destructor = sock_wfree;
571 			}
572 			skb->truesize -= frag->truesize;
573 		}
574 
575 		err = 0;
576 		offset = 0;
577 		frag = skb_shinfo(skb)->frag_list;
578 		skb_frag_list_init(skb);
579 		/* BUILD HEADER */
580 
581 		*prevhdr = NEXTHDR_FRAGMENT;
582 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
583 		if (!tmp_hdr) {
584 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
585 				      IPSTATS_MIB_FRAGFAILS);
586 			return -ENOMEM;
587 		}
588 
589 		__skb_pull(skb, hlen);
590 		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
591 		__skb_push(skb, hlen);
592 		skb_reset_network_header(skb);
593 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
594 
595 		ipv6_select_ident(fh, rt);
596 		fh->nexthdr = nexthdr;
597 		fh->reserved = 0;
598 		fh->frag_off = htons(IP6_MF);
599 		frag_id = fh->identification;
600 
601 		first_len = skb_pagelen(skb);
602 		skb->data_len = first_len - skb_headlen(skb);
603 		skb->len = first_len;
604 		ipv6_hdr(skb)->payload_len = htons(first_len -
605 						   sizeof(struct ipv6hdr));
606 
607 		dst_hold(&rt->dst);
608 
609 		for (;;) {
610 			/* Prepare header of the next frame,
611 			 * before previous one went down. */
612 			if (frag) {
613 				frag->ip_summed = CHECKSUM_NONE;
614 				skb_reset_transport_header(frag);
615 				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
616 				__skb_push(frag, hlen);
617 				skb_reset_network_header(frag);
618 				memcpy(skb_network_header(frag), tmp_hdr,
619 				       hlen);
620 				offset += skb->len - hlen - sizeof(struct frag_hdr);
621 				fh->nexthdr = nexthdr;
622 				fh->reserved = 0;
623 				fh->frag_off = htons(offset);
624 				if (frag->next != NULL)
625 					fh->frag_off |= htons(IP6_MF);
626 				fh->identification = frag_id;
627 				ipv6_hdr(frag)->payload_len =
628 						htons(frag->len -
629 						      sizeof(struct ipv6hdr));
630 				ip6_copy_metadata(frag, skb);
631 			}
632 
633 			err = output(skb);
634 			if (!err)
635 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
636 					      IPSTATS_MIB_FRAGCREATES);
637 
638 			if (err || !frag)
639 				break;
640 
641 			skb = frag;
642 			frag = skb->next;
643 			skb->next = NULL;
644 		}
645 
646 		kfree(tmp_hdr);
647 
648 		if (err == 0) {
649 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
650 				      IPSTATS_MIB_FRAGOKS);
651 			ip6_rt_put(rt);
652 			return 0;
653 		}
654 
655 		while (frag) {
656 			skb = frag->next;
657 			kfree_skb(frag);
658 			frag = skb;
659 		}
660 
661 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
662 			      IPSTATS_MIB_FRAGFAILS);
663 		ip6_rt_put(rt);
664 		return err;
665 
666 slow_path_clean:
667 		skb_walk_frags(skb, frag2) {
668 			if (frag2 == frag)
669 				break;
670 			frag2->sk = NULL;
671 			frag2->destructor = NULL;
672 			skb->truesize += frag2->truesize;
673 		}
674 	}
675 
676 slow_path:
677 	if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
678 	    skb_checksum_help(skb))
679 		goto fail;
680 
681 	left = skb->len - hlen;		/* Space per frame */
682 	ptr = hlen;			/* Where to start from */
683 
684 	/*
685 	 *	Fragment the datagram.
686 	 */
687 
688 	*prevhdr = NEXTHDR_FRAGMENT;
689 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
690 	troom = rt->dst.dev->needed_tailroom;
691 
692 	/*
693 	 *	Keep copying data until we run out.
694 	 */
695 	while (left > 0) {
696 		len = left;
697 		/* IF: it doesn't fit, use 'mtu' - the data space left */
698 		if (len > mtu)
699 			len = mtu;
700 		/* IF: we are not sending up to and including the packet end,
701 		 * then align the next start on an eight-byte boundary */
702 		if (len < left)	{
703 			len &= ~7;
704 		}
705 		/*
706 		 *	Allocate buffer.
707 		 */
708 
709 		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
710 				      hroom + troom, GFP_ATOMIC)) == NULL) {
711 			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
712 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
713 				      IPSTATS_MIB_FRAGFAILS);
714 			err = -ENOMEM;
715 			goto fail;
716 		}
717 
718 		/*
719 		 *	Set up data on packet
720 		 */
721 
722 		ip6_copy_metadata(frag, skb);
723 		skb_reserve(frag, hroom);
724 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
725 		skb_reset_network_header(frag);
726 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
727 		frag->transport_header = (frag->network_header + hlen +
728 					  sizeof(struct frag_hdr));
729 
730 		/*
731 		 *	Charge the memory for the fragment to any owner
732 		 *	it might possess
733 		 */
734 		if (skb->sk)
735 			skb_set_owner_w(frag, skb->sk);
736 
737 		/*
738 		 *	Copy the packet header into the new buffer.
739 		 */
740 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
741 
742 		/*
743 		 *	Build fragment header.
744 		 */
745 		fh->nexthdr = nexthdr;
746 		fh->reserved = 0;
747 		if (!frag_id) {
748 			ipv6_select_ident(fh, rt);
749 			frag_id = fh->identification;
750 		} else
751 			fh->identification = frag_id;
752 
753 		/*
754 		 *	Copy a block of the IP datagram.
755 		 */
756 		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
757 			BUG();
758 		left -= len;
759 
760 		fh->frag_off = htons(offset);
761 		if (left > 0)
762 			fh->frag_off |= htons(IP6_MF);
763 		ipv6_hdr(frag)->payload_len = htons(frag->len -
764 						    sizeof(struct ipv6hdr));
765 
766 		ptr += len;
767 		offset += len;
768 
769 		/*
770 		 *	Put this fragment into the sending queue.
771 		 */
772 		err = output(frag);
773 		if (err)
774 			goto fail;
775 
776 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
777 			      IPSTATS_MIB_FRAGCREATES);
778 	}
779 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
780 		      IPSTATS_MIB_FRAGOKS);
781 	consume_skb(skb);
782 	return err;
783 
784 fail:
785 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
786 		      IPSTATS_MIB_FRAGFAILS);
787 	kfree_skb(skb);
788 	return err;
789 }
790 
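/*
 * Returns nonzero when the cached route cannot be revalidated against
 * @fl_addr: neither an exact /128 host-route key nor the socket's
 * cached last-used address (@addr_cache) matches the flow's address.
 */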
791 static inline int ip6_rt_check(const struct rt6key *rt_key,
792 			       const struct in6_addr *fl_addr,
793 			       const struct in6_addr *addr_cache)
794 {
795 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
796 		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
797 }
798 
799 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
800 					  struct dst_entry *dst,
801 					  const struct flowi6 *fl6)
802 {
803 	struct ipv6_pinfo *np = inet6_sk(sk);
804 	struct rt6_info *rt;
805 
806 	if (!dst)
807 		goto out;
808 
809 	if (dst->ops->family != AF_INET6) {
810 		dst_release(dst);
811 		return NULL;
812 	}
813 
814 	rt = (struct rt6_info *)dst;
815 	/* Yes, checking route validity in the not-connected
816 	 * case is not very simple. Take into account
817 	 * that we do not support routing by source, TOS,
818 	 * or MSG_DONTROUTE		--ANK (980726)
819 	 *
820 	 * 1. ip6_rt_check(): If the route was a host route,
821 	 *    check that the cached destination is current.
822 	 *    If it is a network route, we may still
823 	 *    check its validity using a saved pointer
824 	 *    to the last used address: daddr_cache.
825 	 *    We do not want to save the whole address now
826 	 *    (because the main consumer of this service
827 	 *    is TCP, which does not have this problem),
828 	 *    so this last trick works only on connected
829 	 *    sockets.
830 	 * 2. The oif should also be the same.
831 	 */
832 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
833 #ifdef CONFIG_IPV6_SUBTREES
834 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
835 #endif
836 	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
837 		dst_release(dst);
838 		dst = NULL;
839 	}
840 
841 out:
842 	return dst;
843 }
844 
845 static int ip6_dst_lookup_tail(struct sock *sk,
846 			       struct dst_entry **dst, struct flowi6 *fl6)
847 {
848 	struct net *net = sock_net(sk);
849 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
850 	struct neighbour *n;
851 	struct rt6_info *rt;
852 #endif
853 	int err;
854 
855 	if (*dst == NULL)
856 		*dst = ip6_route_output(net, sk, fl6);
857 
858 	if ((err = (*dst)->error))
859 		goto out_err_release;
860 
861 	if (ipv6_addr_any(&fl6->saddr)) {
862 		struct rt6_info *rt = (struct rt6_info *) *dst;
863 		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
864 					  sk ? inet6_sk(sk)->srcprefs : 0,
865 					  &fl6->saddr);
866 		if (err)
867 			goto out_err_release;
868 	}
869 
870 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
871 	/*
872 	 * Here, if the dst entry we've looked up has a
873 	 * neighbour entry that is not yet valid (NUD_VALID
874 	 * is not set) and the source address from the flow
875 	 * is marked as OPTIMISTIC, we release the found
876 	 * dst entry and replace it with the dst entry of
877 	 * the nexthop router instead.
878 	 */
879 	rt = (struct rt6_info *) *dst;
880 	rcu_read_lock_bh();
881 	n = __ipv6_neigh_lookup_noref(rt->dst.dev, rt6_nexthop(rt));
882 	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
883 	rcu_read_unlock_bh();
884 
885 	if (err) {
886 		struct inet6_ifaddr *ifp;
887 		struct flowi6 fl_gw6;
888 		int redirect;
889 
890 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
891 				      (*dst)->dev, 1);
892 
893 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
894 		if (ifp)
895 			in6_ifa_put(ifp);
896 
897 		if (redirect) {
898 			/*
899 			 * We need to get the dst entry for the
900 			 * default router instead
901 			 */
902 			dst_release(*dst);
903 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
904 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
905 			*dst = ip6_route_output(net, sk, &fl_gw6);
906 			if ((err = (*dst)->error))
907 				goto out_err_release;
908 		}
909 	}
910 #endif
911 
912 	return 0;
913 
914 out_err_release:
915 	if (err == -ENETUNREACH)
916 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
917 	dst_release(*dst);
918 	*dst = NULL;
919 	return err;
920 }
921 
922 /**
923  *	ip6_dst_lookup - perform route lookup on flow
924  *	@sk: socket which provides route info
925  *	@dst: pointer to dst_entry * for result
926  *	@fl6: flow to lookup
927  *
928  *	This function performs a route lookup on the given flow.
929  *
930  *	It returns zero on success, or a standard errno code on error.
931  */
932 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
933 {
934 	*dst = NULL;
935 	return ip6_dst_lookup_tail(sk, dst, fl6);
936 }
937 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
938 
939 /**
940  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
941  *	@sk: socket which provides route info
942  *	@fl6: flow to lookup
943  *	@final_dst: final destination address for ipsec lookup
944  *	@can_sleep: we are in a sleepable context
945  *
946  *	This function performs a route lookup on the given flow.
947  *
948  *	It returns a valid dst pointer on success, or a pointer encoded
949  *	error code.
950  */
951 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
952 				      const struct in6_addr *final_dst,
953 				      bool can_sleep)
954 {
955 	struct dst_entry *dst = NULL;
956 	int err;
957 
958 	err = ip6_dst_lookup_tail(sk, &dst, fl6);
959 	if (err)
960 		return ERR_PTR(err);
961 	if (final_dst)
962 		fl6->daddr = *final_dst;
963 	if (can_sleep)
964 		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
965 
966 	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
967 }
968 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
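/*
 * Illustrative sketch (not compiled): a typical caller of
 * ip6_dst_lookup_flow(). Errors are encoded in the returned pointer
 * and must be unwrapped with IS_ERR()/PTR_ERR(); example_route_flow
 * is a hypothetical name.
 */
#if 0
static int example_route_flow(struct sock *sk, struct flowi6 *fl6,
			      const struct in6_addr *final)
{
	struct dst_entry *dst;

	dst = ip6_dst_lookup_flow(sk, fl6, final, false);
	if (IS_ERR(dst))
		return PTR_ERR(dst);	/* e.g. -ENETUNREACH */

	/* ... attach dst to an skb with skb_dst_set(skb, dst) ... */
	return 0;
}
#endif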
969 
970 /**
971  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
972  *	@sk: socket which provides the dst cache and route info
973  *	@fl6: flow to lookup
974  *	@final_dst: final destination address for ipsec lookup
975  *	@can_sleep: we are in a sleepable context
976  *
977  *	This function performs a route lookup on the given flow with the
978  *	possibility of using the cached route in the socket if it is valid.
979  *	It will take the socket dst lock when operating on the dst cache.
980  *	As a result, this function can only be used in process context.
981  *
982  *	It returns a valid dst pointer on success, or a pointer encoded
983  *	error code.
984  */
985 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
986 					 const struct in6_addr *final_dst,
987 					 bool can_sleep)
988 {
989 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
990 	int err;
991 
992 	dst = ip6_sk_dst_check(sk, dst, fl6);
993 
994 	err = ip6_dst_lookup_tail(sk, &dst, fl6);
995 	if (err)
996 		return ERR_PTR(err);
997 	if (final_dst)
998 		fl6->daddr = *final_dst;
999 	if (can_sleep)
1000 		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1001 
1002 	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1003 }
1004 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1005 
1006 static inline int ip6_ufo_append_data(struct sock *sk,
1007 			int getfrag(void *from, char *to, int offset, int len,
1008 			int odd, struct sk_buff *skb),
1009 			void *from, int length, int hh_len, int fragheaderlen,
1010 			int transhdrlen, int mtu, unsigned int flags,
1011 			struct rt6_info *rt)
1012 
1013 {
1014 	struct sk_buff *skb;
1015 	struct frag_hdr fhdr;
1016 	int err;
1017 
1018 	/* The network device supports UDP large send offload, so
1019 	 * create one single skb containing the complete
1020 	 * UDP datagram.
1021 	 */
1022 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1023 		skb = sock_alloc_send_skb(sk,
1024 			hh_len + fragheaderlen + transhdrlen + 20,
1025 			(flags & MSG_DONTWAIT), &err);
1026 		if (skb == NULL)
1027 			return err;
1028 
1029 		/* reserve space for Hardware header */
1030 		skb_reserve(skb, hh_len);
1031 
1032 		/* create space for UDP/IP header */
1033 		skb_put(skb, fragheaderlen + transhdrlen);
1034 
1035 		/* initialize network header pointer */
1036 		skb_reset_network_header(skb);
1037 
1038 		/* initialize protocol header pointer */
1039 		skb->transport_header = skb->network_header + fragheaderlen;
1040 
1041 		skb->protocol = htons(ETH_P_IPV6);
1042 		skb->csum = 0;
1043 
1044 		__skb_queue_tail(&sk->sk_write_queue, skb);
1045 	} else if (skb_is_gso(skb)) {
1046 		goto append;
1047 	}
1048 
1049 	skb->ip_summed = CHECKSUM_PARTIAL;
1050 	/* Specify the length of each IPv6 datagram fragment.
1051 	 * It has to be a multiple of 8.
1052 	 */
1053 	skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1054 				     sizeof(struct frag_hdr)) & ~7;
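	/* e.g. a 1500 byte MTU with a bare 40 byte IPv6 header gives
	 * gso_size = (1500 - 40 - 8) & ~7 = 1448 */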
1055 	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1056 	ipv6_select_ident(&fhdr, rt);
1057 	skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1058 
1059 append:
1060 	return skb_append_datato_frags(sk, skb, getfrag, from,
1061 				       (length - transhdrlen));
1062 }
1063 
1064 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1065 					       gfp_t gfp)
1066 {
1067 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1068 }
1069 
1070 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1071 						gfp_t gfp)
1072 {
1073 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1074 }
1075 
1076 static void ip6_append_data_mtu(unsigned int *mtu,
1077 				int *maxfraglen,
1078 				unsigned int fragheaderlen,
1079 				struct sk_buff *skb,
1080 				struct rt6_info *rt,
1081 				bool pmtuprobe)
1082 {
1083 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1084 		if (skb == NULL) {
1085 			/* first fragment, reserve header_len */
1086 			*mtu = *mtu - rt->dst.header_len;
1087 
1088 		} else {
1089 			/*
1090 			 * this fragment is not the first one; the header
1091 			 * space is regarded as data space.
1092 			 */
1093 			*mtu = min(*mtu, pmtuprobe ?
1094 				   rt->dst.dev->mtu :
1095 				   dst_mtu(rt->dst.path));
1096 		}
1097 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1098 			      + fragheaderlen - sizeof(struct frag_hdr);
1099 	}
1100 }
1101 
1102 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1103 	int offset, int len, int odd, struct sk_buff *skb),
1104 	void *from, int length, int transhdrlen,
1105 	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1106 	struct rt6_info *rt, unsigned int flags, int dontfrag)
1107 {
1108 	struct inet_sock *inet = inet_sk(sk);
1109 	struct ipv6_pinfo *np = inet6_sk(sk);
1110 	struct inet_cork *cork;
1111 	struct sk_buff *skb, *skb_prev = NULL;
1112 	unsigned int maxfraglen, fragheaderlen, mtu;
1113 	int exthdrlen;
1114 	int dst_exthdrlen;
1115 	int hh_len;
1116 	int copy;
1117 	int err;
1118 	int offset = 0;
1119 	__u8 tx_flags = 0;
1120 
1121 	if (flags&MSG_PROBE)
1122 		return 0;
1123 	cork = &inet->cork.base;
1124 	if (skb_queue_empty(&sk->sk_write_queue)) {
1125 		/*
1126 		 * setup for corking
1127 		 */
1128 		if (opt) {
1129 			if (WARN_ON(np->cork.opt))
1130 				return -EINVAL;
1131 
1132 			np->cork.opt = kzalloc(opt->tot_len, sk->sk_allocation);
1133 			if (unlikely(np->cork.opt == NULL))
1134 				return -ENOBUFS;
1135 
1136 			np->cork.opt->tot_len = opt->tot_len;
1137 			np->cork.opt->opt_flen = opt->opt_flen;
1138 			np->cork.opt->opt_nflen = opt->opt_nflen;
1139 
1140 			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1141 							    sk->sk_allocation);
1142 			if (opt->dst0opt && !np->cork.opt->dst0opt)
1143 				return -ENOBUFS;
1144 
1145 			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1146 							    sk->sk_allocation);
1147 			if (opt->dst1opt && !np->cork.opt->dst1opt)
1148 				return -ENOBUFS;
1149 
1150 			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1151 							   sk->sk_allocation);
1152 			if (opt->hopopt && !np->cork.opt->hopopt)
1153 				return -ENOBUFS;
1154 
1155 			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1156 							    sk->sk_allocation);
1157 			if (opt->srcrt && !np->cork.opt->srcrt)
1158 				return -ENOBUFS;
1159 
1160 			/* need source address above --miyazawa */
1161 		}
1162 		dst_hold(&rt->dst);
1163 		cork->dst = &rt->dst;
1164 		inet->cork.fl.u.ip6 = *fl6;
1165 		np->cork.hop_limit = hlimit;
1166 		np->cork.tclass = tclass;
1167 		if (rt->dst.flags & DST_XFRM_TUNNEL)
1168 			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1169 			      rt->dst.dev->mtu : dst_mtu(&rt->dst);
1170 		else
1171 			mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1172 			      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1173 		if (np->frag_size < mtu) {
1174 			if (np->frag_size)
1175 				mtu = np->frag_size;
1176 		}
1177 		cork->fragsize = mtu;
1178 		if (dst_allfrag(rt->dst.path))
1179 			cork->flags |= IPCORK_ALLFRAG;
1180 		cork->length = 0;
1181 		exthdrlen = (opt ? opt->opt_flen : 0);
1182 		length += exthdrlen;
1183 		transhdrlen += exthdrlen;
1184 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1185 	} else {
1186 		rt = (struct rt6_info *)cork->dst;
1187 		fl6 = &inet->cork.fl.u.ip6;
1188 		opt = np->cork.opt;
1189 		transhdrlen = 0;
1190 		exthdrlen = 0;
1191 		dst_exthdrlen = 0;
1192 		mtu = cork->fragsize;
1193 	}
1194 
1195 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1196 
1197 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1198 			(opt ? opt->opt_nflen : 0);
1199 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
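	/* e.g. mtu 1500 and fragheaderlen 40 give maxfraglen 1488:
	 * 1448 bytes of 8-byte-aligned payload per queued skb, leaving
	 * room for the 8 byte fragment header within the MTU */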
1200 
1201 	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1202 		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1203 			ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1204 			return -EMSGSIZE;
1205 		}
1206 	}
1207 
1208 	/* For UDP, check if TX timestamp is enabled */
1209 	if (sk->sk_type == SOCK_DGRAM)
1210 		sock_tx_timestamp(sk, &tx_flags);
1211 
1212 	/*
1213 	 * Let's try using as much space as possible.
1214 	 * Use MTU if total length of the message fits into the MTU.
1215 	 * Otherwise, we need to reserve fragment header and
1216 	 * fragment alignment (= 8-15 octets, in total).
1217 	 *
1218 	 * Note that we may need to "move" the data from the tail
1219 	 * of the buffer to the new fragment when we split
1220 	 * the message.
1221 	 *
1222 	 * FIXME: It may be fragmented into multiple chunks
1223 	 *        at once if non-fragmentable extension headers
1224 	 *        are too large.
1225 	 * --yoshfuji
1226 	 */
1227 
1228 	if ((length > mtu) && dontfrag && (sk->sk_protocol == IPPROTO_UDP ||
1229 					   sk->sk_protocol == IPPROTO_RAW)) {
1230 		ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1231 		return -EMSGSIZE;
1232 	}
1233 
1234 	skb = skb_peek_tail(&sk->sk_write_queue);
1235 	cork->length += length;
1236 	if (((length > mtu) ||
1237 	     (skb && skb_is_gso(skb))) &&
1238 	    (sk->sk_protocol == IPPROTO_UDP) &&
1239 	    (rt->dst.dev->features & NETIF_F_UFO)) {
1240 		err = ip6_ufo_append_data(sk, getfrag, from, length,
1241 					  hh_len, fragheaderlen,
1242 					  transhdrlen, mtu, flags, rt);
1243 		if (err)
1244 			goto error;
1245 		return 0;
1246 	}
1247 
1248 	if (!skb)
1249 		goto alloc_new_skb;
1250 
1251 	while (length > 0) {
1252 		/* Check if the remaining data fits into current packet. */
1253 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1254 		if (copy < length)
1255 			copy = maxfraglen - skb->len;
1256 
1257 		if (copy <= 0) {
1258 			char *data;
1259 			unsigned int datalen;
1260 			unsigned int fraglen;
1261 			unsigned int fraggap;
1262 			unsigned int alloclen;
1263 alloc_new_skb:
1264 			/* There's no room in the current skb */
1265 			if (skb)
1266 				fraggap = skb->len - maxfraglen;
1267 			else
1268 				fraggap = 0;
1269 			/* update mtu and maxfraglen if necessary */
1270 			if (skb == NULL || skb_prev == NULL)
1271 				ip6_append_data_mtu(&mtu, &maxfraglen,
1272 						    fragheaderlen, skb, rt,
1273 						    np->pmtudisc ==
1274 						    IPV6_PMTUDISC_PROBE);
1275 
1276 			skb_prev = skb;
1277 
1278 			/*
1279 			 * If remaining data exceeds the mtu,
1280 			 * we know we need more fragment(s).
1281 			 */
1282 			datalen = length + fraggap;
1283 
1284 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1285 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1286 			if ((flags & MSG_MORE) &&
1287 			    !(rt->dst.dev->features&NETIF_F_SG))
1288 				alloclen = mtu;
1289 			else
1290 				alloclen = datalen + fragheaderlen;
1291 
1292 			alloclen += dst_exthdrlen;
1293 
1294 			if (datalen != length + fraggap) {
1295 				/*
1296 				 * this is not the last fragment; the trailer
1297 				 * space is regarded as data space.
1298 				 */
1299 				datalen += rt->dst.trailer_len;
1300 			}
1301 
1302 			alloclen += rt->dst.trailer_len;
1303 			fraglen = datalen + fragheaderlen;
1304 
1305 			/*
1306 			 * We just reserve space for the fragment header.
1307 			 * Note: this may be overallocation if the message
1308 			 * (without MSG_MORE) fits into the MTU.
1309 			 */
1310 			alloclen += sizeof(struct frag_hdr);
1311 
1312 			if (transhdrlen) {
1313 				skb = sock_alloc_send_skb(sk,
1314 						alloclen + hh_len,
1315 						(flags & MSG_DONTWAIT), &err);
1316 			} else {
1317 				skb = NULL;
1318 				if (atomic_read(&sk->sk_wmem_alloc) <=
1319 				    2 * sk->sk_sndbuf)
1320 					skb = sock_wmalloc(sk,
1321 							   alloclen + hh_len, 1,
1322 							   sk->sk_allocation);
1323 				if (unlikely(skb == NULL))
1324 					err = -ENOBUFS;
1325 				else {
1326 					/* Only the initial fragment
1327 					 * is time stamped.
1328 					 */
1329 					tx_flags = 0;
1330 				}
1331 			}
1332 			if (skb == NULL)
1333 				goto error;
1334 			/*
1335 			 *	Fill in the control structures
1336 			 */
1337 			skb->protocol = htons(ETH_P_IPV6);
1338 			skb->ip_summed = CHECKSUM_NONE;
1339 			skb->csum = 0;
1340 			/* reserve for fragmentation and ipsec header */
1341 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1342 				    dst_exthdrlen);
1343 
1344 			if (sk->sk_type == SOCK_DGRAM)
1345 				skb_shinfo(skb)->tx_flags = tx_flags;
1346 
1347 			/*
1348 			 *	Find where to start putting bytes
1349 			 */
1350 			data = skb_put(skb, fraglen);
1351 			skb_set_network_header(skb, exthdrlen);
1352 			data += fragheaderlen;
1353 			skb->transport_header = (skb->network_header +
1354 						 fragheaderlen);
1355 			if (fraggap) {
1356 				skb->csum = skb_copy_and_csum_bits(
1357 					skb_prev, maxfraglen,
1358 					data + transhdrlen, fraggap, 0);
1359 				skb_prev->csum = csum_sub(skb_prev->csum,
1360 							  skb->csum);
1361 				data += fraggap;
1362 				pskb_trim_unique(skb_prev, maxfraglen);
1363 			}
1364 			copy = datalen - transhdrlen - fraggap;
1365 
1366 			if (copy < 0) {
1367 				err = -EINVAL;
1368 				kfree_skb(skb);
1369 				goto error;
1370 			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1371 				err = -EFAULT;
1372 				kfree_skb(skb);
1373 				goto error;
1374 			}
1375 
1376 			offset += copy;
1377 			length -= datalen - fraggap;
1378 			transhdrlen = 0;
1379 			exthdrlen = 0;
1380 			dst_exthdrlen = 0;
1381 
1382 			/*
1383 			 * Put the packet on the pending queue
1384 			 */
1385 			__skb_queue_tail(&sk->sk_write_queue, skb);
1386 			continue;
1387 		}
1388 
1389 		if (copy > length)
1390 			copy = length;
1391 
1392 		if (!(rt->dst.dev->features&NETIF_F_SG)) {
1393 			unsigned int off;
1394 
1395 			off = skb->len;
1396 			if (getfrag(from, skb_put(skb, copy),
1397 						offset, copy, off, skb) < 0) {
1398 				__skb_trim(skb, off);
1399 				err = -EFAULT;
1400 				goto error;
1401 			}
1402 		} else {
1403 			int i = skb_shinfo(skb)->nr_frags;
1404 			struct page_frag *pfrag = sk_page_frag(sk);
1405 
1406 			err = -ENOMEM;
1407 			if (!sk_page_frag_refill(sk, pfrag))
1408 				goto error;
1409 
1410 			if (!skb_can_coalesce(skb, i, pfrag->page,
1411 					      pfrag->offset)) {
1412 				err = -EMSGSIZE;
1413 				if (i == MAX_SKB_FRAGS)
1414 					goto error;
1415 
1416 				__skb_fill_page_desc(skb, i, pfrag->page,
1417 						     pfrag->offset, 0);
1418 				skb_shinfo(skb)->nr_frags = ++i;
1419 				get_page(pfrag->page);
1420 			}
1421 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1422 			if (getfrag(from,
1423 				    page_address(pfrag->page) + pfrag->offset,
1424 				    offset, copy, skb->len, skb) < 0)
1425 				goto error_efault;
1426 
1427 			pfrag->offset += copy;
1428 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1429 			skb->len += copy;
1430 			skb->data_len += copy;
1431 			skb->truesize += copy;
1432 			atomic_add(copy, &sk->sk_wmem_alloc);
1433 		}
1434 		offset += copy;
1435 		length -= copy;
1436 	}
1437 
1438 	return 0;
1439 
1440 error_efault:
1441 	err = -EFAULT;
1442 error:
1443 	cork->length -= length;
1444 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1445 	return err;
1446 }
1447 EXPORT_SYMBOL_GPL(ip6_append_data);
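/*
 * Illustrative sketch (not compiled): the corked datagram path as used
 * by UDP-style senders. ip6_append_data() queues the payload on
 * sk_write_queue; ip6_push_pending_frames() prepends the IPv6 header
 * and transmits, and ip6_flush_pending_frames() discards the queue on
 * error. example_getfrag and example_send_datagram are hypothetical,
 * and checksum accumulation is ignored for brevity.
 */
#if 0
static int example_getfrag(void *from, char *to, int offset, int len,
			   int odd, struct sk_buff *skb)
{
	/* copy from a linear kernel buffer; a real getfrag (such as
	 * ip_generic_getfrag) also folds the copied bytes into
	 * skb->csum when the device cannot checksum */
	memcpy(to, (char *)from + offset, len);
	return 0;
}

static int example_send_datagram(struct sock *sk, void *data, int len,
				 struct flowi6 *fl6, struct rt6_info *rt)
{
	int hlimit = ip6_dst_hoplimit(&rt->dst);
	int err;

	err = ip6_append_data(sk, example_getfrag, data, len,
			      0 /* transhdrlen */, hlimit, 0 /* tclass */,
			      NULL /* opt */, fl6, rt, MSG_DONTWAIT,
			      0 /* dontfrag */);
	if (err)
		ip6_flush_pending_frames(sk);
	else
		err = ip6_push_pending_frames(sk);
	return err;
}
#endif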
1448 
1449 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1450 {
1451 	if (np->cork.opt) {
1452 		kfree(np->cork.opt->dst0opt);
1453 		kfree(np->cork.opt->dst1opt);
1454 		kfree(np->cork.opt->hopopt);
1455 		kfree(np->cork.opt->srcrt);
1456 		kfree(np->cork.opt);
1457 		np->cork.opt = NULL;
1458 	}
1459 
1460 	if (inet->cork.base.dst) {
1461 		dst_release(inet->cork.base.dst);
1462 		inet->cork.base.dst = NULL;
1463 		inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1464 	}
1465 	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1466 }
1467 
1468 int ip6_push_pending_frames(struct sock *sk)
1469 {
1470 	struct sk_buff *skb, *tmp_skb;
1471 	struct sk_buff **tail_skb;
1472 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1473 	struct inet_sock *inet = inet_sk(sk);
1474 	struct ipv6_pinfo *np = inet6_sk(sk);
1475 	struct net *net = sock_net(sk);
1476 	struct ipv6hdr *hdr;
1477 	struct ipv6_txoptions *opt = np->cork.opt;
1478 	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1479 	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1480 	unsigned char proto = fl6->flowi6_proto;
1481 	int err = 0;
1482 
1483 	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1484 		goto out;
1485 	tail_skb = &(skb_shinfo(skb)->frag_list);
1486 
1487 	/* move skb->data to ip header from ext header */
1488 	if (skb->data < skb_network_header(skb))
1489 		__skb_pull(skb, skb_network_offset(skb));
1490 	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1491 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1492 		*tail_skb = tmp_skb;
1493 		tail_skb = &(tmp_skb->next);
1494 		skb->len += tmp_skb->len;
1495 		skb->data_len += tmp_skb->len;
1496 		skb->truesize += tmp_skb->truesize;
1497 		tmp_skb->destructor = NULL;
1498 		tmp_skb->sk = NULL;
1499 	}
1500 
1501 	/* Allow local fragmentation. */
1502 	if (np->pmtudisc < IPV6_PMTUDISC_DO)
1503 		skb->local_df = 1;
1504 
1505 	*final_dst = fl6->daddr;
1506 	__skb_pull(skb, skb_network_header_len(skb));
1507 	if (opt && opt->opt_flen)
1508 		ipv6_push_frag_opts(skb, opt, &proto);
1509 	if (opt && opt->opt_nflen)
1510 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1511 
1512 	skb_push(skb, sizeof(struct ipv6hdr));
1513 	skb_reset_network_header(skb);
1514 	hdr = ipv6_hdr(skb);
1515 
1516 	ip6_flow_hdr(hdr, np->cork.tclass, fl6->flowlabel);
1517 	hdr->hop_limit = np->cork.hop_limit;
1518 	hdr->nexthdr = proto;
1519 	hdr->saddr = fl6->saddr;
1520 	hdr->daddr = *final_dst;
1521 
1522 	skb->priority = sk->sk_priority;
1523 	skb->mark = sk->sk_mark;
1524 
1525 	skb_dst_set(skb, dst_clone(&rt->dst));
1526 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1527 	if (proto == IPPROTO_ICMPV6) {
1528 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1529 
1530 		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1531 		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1532 	}
1533 
1534 	err = ip6_local_out(skb);
1535 	if (err) {
1536 		if (err > 0)
1537 			err = net_xmit_errno(err);
1538 		if (err)
1539 			goto error;
1540 	}
1541 
1542 out:
1543 	ip6_cork_release(inet, np);
1544 	return err;
1545 error:
1546 	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1547 	goto out;
1548 }
1549 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1550 
1551 void ip6_flush_pending_frames(struct sock *sk)
1552 {
1553 	struct sk_buff *skb;
1554 
1555 	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1556 		if (skb_dst(skb))
1557 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1558 				      IPSTATS_MIB_OUTDISCARDS);
1559 		kfree_skb(skb);
1560 	}
1561 
1562 	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1563 }
1564 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1565