xref: /linux/net/ipv6/ip6_output.c (revision a8fe58cec351c25e09c393bf46117c0c47b5a17c)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	airthmetics in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	:	frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44 
45 #include <net/sock.h>
46 #include <net/snmp.h>
47 
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58 #include <net/l3mdev.h>
59 
60 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
61 {
62 	struct dst_entry *dst = skb_dst(skb);
63 	struct net_device *dev = dst->dev;
64 	struct neighbour *neigh;
65 	struct in6_addr *nexthop;
66 	int ret;
67 
68 	skb->protocol = htons(ETH_P_IPV6);
69 	skb->dev = dev;
70 
71 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
72 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
73 
74 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
75 		    ((mroute6_socket(net, skb) &&
76 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
77 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
78 					 &ipv6_hdr(skb)->saddr))) {
79 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
80 
81 			/* Do not check for IFF_ALLMULTI; multicast routing
82 			   is not supported in any case.
83 			 */
84 			if (newskb)
85 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
86 					net, sk, newskb, NULL, newskb->dev,
87 					dev_loopback_xmit);
88 
89 			if (ipv6_hdr(skb)->hop_limit == 0) {
90 				IP6_INC_STATS(net, idev,
91 					      IPSTATS_MIB_OUTDISCARDS);
92 				kfree_skb(skb);
93 				return 0;
94 			}
95 		}
96 
97 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
98 
99 		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
100 		    IPV6_ADDR_SCOPE_NODELOCAL &&
101 		    !(dev->flags & IFF_LOOPBACK)) {
102 			kfree_skb(skb);
103 			return 0;
104 		}
105 	}
106 
107 	rcu_read_lock_bh();
108 	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
109 	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
110 	if (unlikely(!neigh))
111 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
112 	if (!IS_ERR(neigh)) {
113 		ret = dst_neigh_output(dst, neigh, skb);
114 		rcu_read_unlock_bh();
115 		return ret;
116 	}
117 	rcu_read_unlock_bh();
118 
119 	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
120 	kfree_skb(skb);
121 	return -EINVAL;
122 }
123 
124 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
125 {
126 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
127 	    dst_allfrag(skb_dst(skb)) ||
128 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
129 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
130 	else
131 		return ip6_finish_output2(net, sk, skb);
132 }
133 
134 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
135 {
136 	struct net_device *dev = skb_dst(skb)->dev;
137 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
138 
139 	if (unlikely(idev->cnf.disable_ipv6)) {
140 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
141 		kfree_skb(skb);
142 		return 0;
143 	}
144 
145 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
146 			    net, sk, skb, NULL, dev,
147 			    ip6_finish_output,
148 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
149 }
150 
151 /*
152  * xmit an sk_buff (used by TCP, SCTP and DCCP)
153  * Note : socket lock is not held for SYNACK packets, but might be modified
154  * by calls to skb_set_owner_w() and ipv6_local_error(),
155  * which are using proper atomic operations or spinlocks.
156  */
157 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
158 	     struct ipv6_txoptions *opt, int tclass)
159 {
160 	struct net *net = sock_net(sk);
161 	const struct ipv6_pinfo *np = inet6_sk(sk);
162 	struct in6_addr *first_hop = &fl6->daddr;
163 	struct dst_entry *dst = skb_dst(skb);
164 	struct ipv6hdr *hdr;
165 	u8  proto = fl6->flowi6_proto;
166 	int seg_len = skb->len;
167 	int hlimit = -1;
168 	u32 mtu;
169 
170 	if (opt) {
171 		unsigned int head_room;
172 
173 		/* First: exthdrs may take lots of space (~8K for now)
174 		   MAX_HEADER is not enough.
175 		 */
176 		head_room = opt->opt_nflen + opt->opt_flen;
177 		seg_len += head_room;
178 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
179 
180 		if (skb_headroom(skb) < head_room) {
181 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
182 			if (!skb2) {
183 				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
184 					      IPSTATS_MIB_OUTDISCARDS);
185 				kfree_skb(skb);
186 				return -ENOBUFS;
187 			}
188 			consume_skb(skb);
189 			skb = skb2;
190 			/* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
191 			 * it is safe to call in our context (socket lock not held)
192 			 */
193 			skb_set_owner_w(skb, (struct sock *)sk);
194 		}
195 		if (opt->opt_flen)
196 			ipv6_push_frag_opts(skb, opt, &proto);
197 		if (opt->opt_nflen)
198 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
199 	}
200 
201 	skb_push(skb, sizeof(struct ipv6hdr));
202 	skb_reset_network_header(skb);
203 	hdr = ipv6_hdr(skb);
204 
205 	/*
206 	 *	Fill in the IPv6 header
207 	 */
208 	if (np)
209 		hlimit = np->hop_limit;
210 	if (hlimit < 0)
211 		hlimit = ip6_dst_hoplimit(dst);
212 
213 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
214 						     np->autoflowlabel, fl6));
215 
216 	hdr->payload_len = htons(seg_len);
217 	hdr->nexthdr = proto;
218 	hdr->hop_limit = hlimit;
219 
220 	hdr->saddr = fl6->saddr;
221 	hdr->daddr = *first_hop;
222 
223 	skb->protocol = htons(ETH_P_IPV6);
224 	skb->priority = sk->sk_priority;
225 	skb->mark = sk->sk_mark;
226 
227 	mtu = dst_mtu(dst);
228 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
229 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
230 			      IPSTATS_MIB_OUT, skb->len);
231 		/* hooks should never assume socket lock is held.
232 		 * we promote our socket to non const
233 		 */
234 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
235 			       net, (struct sock *)sk, skb, NULL, dst->dev,
236 			       dst_output);
237 	}
238 
239 	skb->dev = dst->dev;
240 	/* ipv6_local_error() does not require socket lock,
241 	 * we promote our socket to non const
242 	 */
243 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
244 
245 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
246 	kfree_skb(skb);
247 	return -EMSGSIZE;
248 }
249 EXPORT_SYMBOL(ip6_xmit);
250 
251 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
252 {
253 	struct ip6_ra_chain *ra;
254 	struct sock *last = NULL;
255 
256 	read_lock(&ip6_ra_lock);
257 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
258 		struct sock *sk = ra->sk;
259 		if (sk && ra->sel == sel &&
260 		    (!sk->sk_bound_dev_if ||
261 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
262 			if (last) {
263 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
264 				if (skb2)
265 					rawv6_rcv(last, skb2);
266 			}
267 			last = sk;
268 		}
269 	}
270 
271 	if (last) {
272 		rawv6_rcv(last, skb);
273 		read_unlock(&ip6_ra_lock);
274 		return 1;
275 	}
276 	read_unlock(&ip6_ra_lock);
277 	return 0;
278 }
279 
280 static int ip6_forward_proxy_check(struct sk_buff *skb)
281 {
282 	struct ipv6hdr *hdr = ipv6_hdr(skb);
283 	u8 nexthdr = hdr->nexthdr;
284 	__be16 frag_off;
285 	int offset;
286 
287 	if (ipv6_ext_hdr(nexthdr)) {
288 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
289 		if (offset < 0)
290 			return 0;
291 	} else
292 		offset = sizeof(struct ipv6hdr);
293 
294 	if (nexthdr == IPPROTO_ICMPV6) {
295 		struct icmp6hdr *icmp6;
296 
297 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
298 					 offset + 1 - skb->data)))
299 			return 0;
300 
301 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
302 
303 		switch (icmp6->icmp6_type) {
304 		case NDISC_ROUTER_SOLICITATION:
305 		case NDISC_ROUTER_ADVERTISEMENT:
306 		case NDISC_NEIGHBOUR_SOLICITATION:
307 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
308 		case NDISC_REDIRECT:
309 			/* For reaction involving unicast neighbor discovery
310 			 * message destined to the proxied address, pass it to
311 			 * input function.
312 			 */
313 			return 1;
314 		default:
315 			break;
316 		}
317 	}
318 
319 	/*
320 	 * The proxying router can't forward traffic sent to a link-local
321 	 * address, so signal the sender and discard the packet. This
322 	 * behavior is clarified by the MIPv6 specification.
323 	 */
324 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
325 		dst_link_failure(skb);
326 		return -1;
327 	}
328 
329 	return 0;
330 }
331 
/*
 * Continuation of ip6_forward() after the FORWARD netfilter hook:
 * clear the sender CPU recorded in the skb and pass the packet to the
 * dst output path.  Returns whatever dst_output() returns.
 */
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	int rc;

	skb_sender_cpu_clear(skb);
	rc = dst_output(net, sk, skb);

	return rc;
}
338 
339 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
340 {
341 	unsigned int mtu;
342 	struct inet6_dev *idev;
343 
344 	if (dst_metric_locked(dst, RTAX_MTU)) {
345 		mtu = dst_metric_raw(dst, RTAX_MTU);
346 		if (mtu)
347 			return mtu;
348 	}
349 
350 	mtu = IPV6_MIN_MTU;
351 	rcu_read_lock();
352 	idev = __in6_dev_get(dst->dev);
353 	if (idev)
354 		mtu = idev->cnf.mtu6;
355 	rcu_read_unlock();
356 
357 	return mtu;
358 }
359 
360 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
361 {
362 	if (skb->len <= mtu)
363 		return false;
364 
365 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
366 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
367 		return true;
368 
369 	if (skb->ignore_df)
370 		return false;
371 
372 	if (skb_is_gso(skb) && skb_gso_network_seglen(skb) <= mtu)
373 		return false;
374 
375 	return true;
376 }
377 
378 int ip6_forward(struct sk_buff *skb)
379 {
380 	struct dst_entry *dst = skb_dst(skb);
381 	struct ipv6hdr *hdr = ipv6_hdr(skb);
382 	struct inet6_skb_parm *opt = IP6CB(skb);
383 	struct net *net = dev_net(dst->dev);
384 	u32 mtu;
385 
386 	if (net->ipv6.devconf_all->forwarding == 0)
387 		goto error;
388 
389 	if (skb->pkt_type != PACKET_HOST)
390 		goto drop;
391 
392 	if (unlikely(skb->sk))
393 		goto drop;
394 
395 	if (skb_warn_if_lro(skb))
396 		goto drop;
397 
398 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
399 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
400 				 IPSTATS_MIB_INDISCARDS);
401 		goto drop;
402 	}
403 
404 	skb_forward_csum(skb);
405 
406 	/*
407 	 *	We DO NOT make any processing on
408 	 *	RA packets, pushing them to user level AS IS
409 	 *	without ane WARRANTY that application will be able
410 	 *	to interpret them. The reason is that we
411 	 *	cannot make anything clever here.
412 	 *
413 	 *	We are not end-node, so that if packet contains
414 	 *	AH/ESP, we cannot make anything.
415 	 *	Defragmentation also would be mistake, RA packets
416 	 *	cannot be fragmented, because there is no warranty
417 	 *	that different fragments will go along one path. --ANK
418 	 */
419 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
420 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
421 			return 0;
422 	}
423 
424 	/*
425 	 *	check and decrement ttl
426 	 */
427 	if (hdr->hop_limit <= 1) {
428 		/* Force OUTPUT device used as source address */
429 		skb->dev = dst->dev;
430 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
431 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
432 				 IPSTATS_MIB_INHDRERRORS);
433 
434 		kfree_skb(skb);
435 		return -ETIMEDOUT;
436 	}
437 
438 	/* XXX: idev->cnf.proxy_ndp? */
439 	if (net->ipv6.devconf_all->proxy_ndp &&
440 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
441 		int proxied = ip6_forward_proxy_check(skb);
442 		if (proxied > 0)
443 			return ip6_input(skb);
444 		else if (proxied < 0) {
445 			IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
446 					 IPSTATS_MIB_INDISCARDS);
447 			goto drop;
448 		}
449 	}
450 
451 	if (!xfrm6_route_forward(skb)) {
452 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
453 				 IPSTATS_MIB_INDISCARDS);
454 		goto drop;
455 	}
456 	dst = skb_dst(skb);
457 
458 	/* IPv6 specs say nothing about it, but it is clear that we cannot
459 	   send redirects to source routed frames.
460 	   We don't send redirects to frames decapsulated from IPsec.
461 	 */
462 	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
463 		struct in6_addr *target = NULL;
464 		struct inet_peer *peer;
465 		struct rt6_info *rt;
466 
467 		/*
468 		 *	incoming and outgoing devices are the same
469 		 *	send a redirect.
470 		 */
471 
472 		rt = (struct rt6_info *) dst;
473 		if (rt->rt6i_flags & RTF_GATEWAY)
474 			target = &rt->rt6i_gateway;
475 		else
476 			target = &hdr->daddr;
477 
478 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
479 
480 		/* Limit redirects both by destination (here)
481 		   and by source (inside ndisc_send_redirect)
482 		 */
483 		if (inet_peer_xrlim_allow(peer, 1*HZ))
484 			ndisc_send_redirect(skb, target);
485 		if (peer)
486 			inet_putpeer(peer);
487 	} else {
488 		int addrtype = ipv6_addr_type(&hdr->saddr);
489 
490 		/* This check is security critical. */
491 		if (addrtype == IPV6_ADDR_ANY ||
492 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
493 			goto error;
494 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
495 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
496 				    ICMPV6_NOT_NEIGHBOUR, 0);
497 			goto error;
498 		}
499 	}
500 
501 	mtu = ip6_dst_mtu_forward(dst);
502 	if (mtu < IPV6_MIN_MTU)
503 		mtu = IPV6_MIN_MTU;
504 
505 	if (ip6_pkt_too_big(skb, mtu)) {
506 		/* Again, force OUTPUT device used as source address */
507 		skb->dev = dst->dev;
508 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
509 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
510 				 IPSTATS_MIB_INTOOBIGERRORS);
511 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
512 				 IPSTATS_MIB_FRAGFAILS);
513 		kfree_skb(skb);
514 		return -EMSGSIZE;
515 	}
516 
517 	if (skb_cow(skb, dst->dev->hard_header_len)) {
518 		IP6_INC_STATS_BH(net, ip6_dst_idev(dst),
519 				 IPSTATS_MIB_OUTDISCARDS);
520 		goto drop;
521 	}
522 
523 	hdr = ipv6_hdr(skb);
524 
525 	/* Mangling hops number delayed to point after skb COW */
526 
527 	hdr->hop_limit--;
528 
529 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
530 	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
531 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
532 		       net, NULL, skb, skb->dev, dst->dev,
533 		       ip6_forward_finish);
534 
535 error:
536 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
537 drop:
538 	kfree_skb(skb);
539 	return -EINVAL;
540 }
541 
542 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
543 {
544 	to->pkt_type = from->pkt_type;
545 	to->priority = from->priority;
546 	to->protocol = from->protocol;
547 	skb_dst_drop(to);
548 	skb_dst_set(to, dst_clone(skb_dst(from)));
549 	to->dev = from->dev;
550 	to->mark = from->mark;
551 
552 #ifdef CONFIG_NET_SCHED
553 	to->tc_index = from->tc_index;
554 #endif
555 	nf_copy(to, from);
556 	skb_copy_secmark(to, from);
557 }
558 
559 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
560 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
561 {
562 	struct sk_buff *frag;
563 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
564 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
565 				inet6_sk(skb->sk) : NULL;
566 	struct ipv6hdr *tmp_hdr;
567 	struct frag_hdr *fh;
568 	unsigned int mtu, hlen, left, len;
569 	int hroom, troom;
570 	__be32 frag_id;
571 	int ptr, offset = 0, err = 0;
572 	u8 *prevhdr, nexthdr = 0;
573 
574 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
575 	nexthdr = *prevhdr;
576 
577 	mtu = ip6_skb_dst_mtu(skb);
578 
579 	/* We must not fragment if the socket is set to force MTU discovery
580 	 * or if the skb it not generated by a local socket.
581 	 */
582 	if (unlikely(!skb->ignore_df && skb->len > mtu))
583 		goto fail_toobig;
584 
585 	if (IP6CB(skb)->frag_max_size) {
586 		if (IP6CB(skb)->frag_max_size > mtu)
587 			goto fail_toobig;
588 
589 		/* don't send fragments larger than what we received */
590 		mtu = IP6CB(skb)->frag_max_size;
591 		if (mtu < IPV6_MIN_MTU)
592 			mtu = IPV6_MIN_MTU;
593 	}
594 
595 	if (np && np->frag_size < mtu) {
596 		if (np->frag_size)
597 			mtu = np->frag_size;
598 	}
599 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
600 		goto fail_toobig;
601 	mtu -= hlen + sizeof(struct frag_hdr);
602 
603 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
604 				    &ipv6_hdr(skb)->saddr);
605 
606 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
607 	    (err = skb_checksum_help(skb)))
608 		goto fail;
609 
610 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
611 	if (skb_has_frag_list(skb)) {
612 		int first_len = skb_pagelen(skb);
613 		struct sk_buff *frag2;
614 
615 		if (first_len - hlen > mtu ||
616 		    ((first_len - hlen) & 7) ||
617 		    skb_cloned(skb) ||
618 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
619 			goto slow_path;
620 
621 		skb_walk_frags(skb, frag) {
622 			/* Correct geometry. */
623 			if (frag->len > mtu ||
624 			    ((frag->len & 7) && frag->next) ||
625 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
626 				goto slow_path_clean;
627 
628 			/* Partially cloned skb? */
629 			if (skb_shared(frag))
630 				goto slow_path_clean;
631 
632 			BUG_ON(frag->sk);
633 			if (skb->sk) {
634 				frag->sk = skb->sk;
635 				frag->destructor = sock_wfree;
636 			}
637 			skb->truesize -= frag->truesize;
638 		}
639 
640 		err = 0;
641 		offset = 0;
642 		/* BUILD HEADER */
643 
644 		*prevhdr = NEXTHDR_FRAGMENT;
645 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
646 		if (!tmp_hdr) {
647 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
648 				      IPSTATS_MIB_FRAGFAILS);
649 			err = -ENOMEM;
650 			goto fail;
651 		}
652 		frag = skb_shinfo(skb)->frag_list;
653 		skb_frag_list_init(skb);
654 
655 		__skb_pull(skb, hlen);
656 		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
657 		__skb_push(skb, hlen);
658 		skb_reset_network_header(skb);
659 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
660 
661 		fh->nexthdr = nexthdr;
662 		fh->reserved = 0;
663 		fh->frag_off = htons(IP6_MF);
664 		fh->identification = frag_id;
665 
666 		first_len = skb_pagelen(skb);
667 		skb->data_len = first_len - skb_headlen(skb);
668 		skb->len = first_len;
669 		ipv6_hdr(skb)->payload_len = htons(first_len -
670 						   sizeof(struct ipv6hdr));
671 
672 		dst_hold(&rt->dst);
673 
674 		for (;;) {
675 			/* Prepare header of the next frame,
676 			 * before previous one went down. */
677 			if (frag) {
678 				frag->ip_summed = CHECKSUM_NONE;
679 				skb_reset_transport_header(frag);
680 				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
681 				__skb_push(frag, hlen);
682 				skb_reset_network_header(frag);
683 				memcpy(skb_network_header(frag), tmp_hdr,
684 				       hlen);
685 				offset += skb->len - hlen - sizeof(struct frag_hdr);
686 				fh->nexthdr = nexthdr;
687 				fh->reserved = 0;
688 				fh->frag_off = htons(offset);
689 				if (frag->next)
690 					fh->frag_off |= htons(IP6_MF);
691 				fh->identification = frag_id;
692 				ipv6_hdr(frag)->payload_len =
693 						htons(frag->len -
694 						      sizeof(struct ipv6hdr));
695 				ip6_copy_metadata(frag, skb);
696 			}
697 
698 			err = output(net, sk, skb);
699 			if (!err)
700 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
701 					      IPSTATS_MIB_FRAGCREATES);
702 
703 			if (err || !frag)
704 				break;
705 
706 			skb = frag;
707 			frag = skb->next;
708 			skb->next = NULL;
709 		}
710 
711 		kfree(tmp_hdr);
712 
713 		if (err == 0) {
714 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
715 				      IPSTATS_MIB_FRAGOKS);
716 			ip6_rt_put(rt);
717 			return 0;
718 		}
719 
720 		kfree_skb_list(frag);
721 
722 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
723 			      IPSTATS_MIB_FRAGFAILS);
724 		ip6_rt_put(rt);
725 		return err;
726 
727 slow_path_clean:
728 		skb_walk_frags(skb, frag2) {
729 			if (frag2 == frag)
730 				break;
731 			frag2->sk = NULL;
732 			frag2->destructor = NULL;
733 			skb->truesize += frag2->truesize;
734 		}
735 	}
736 
737 slow_path:
738 	left = skb->len - hlen;		/* Space per frame */
739 	ptr = hlen;			/* Where to start from */
740 
741 	/*
742 	 *	Fragment the datagram.
743 	 */
744 
745 	*prevhdr = NEXTHDR_FRAGMENT;
746 	troom = rt->dst.dev->needed_tailroom;
747 
748 	/*
749 	 *	Keep copying data until we run out.
750 	 */
751 	while (left > 0)	{
752 		len = left;
753 		/* IF: it doesn't fit, use 'mtu' - the data space left */
754 		if (len > mtu)
755 			len = mtu;
756 		/* IF: we are not sending up to and including the packet end
757 		   then align the next start on an eight byte boundary */
758 		if (len < left)	{
759 			len &= ~7;
760 		}
761 
762 		/* Allocate buffer */
763 		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
764 				 hroom + troom, GFP_ATOMIC);
765 		if (!frag) {
766 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
767 				      IPSTATS_MIB_FRAGFAILS);
768 			err = -ENOMEM;
769 			goto fail;
770 		}
771 
772 		/*
773 		 *	Set up data on packet
774 		 */
775 
776 		ip6_copy_metadata(frag, skb);
777 		skb_reserve(frag, hroom);
778 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
779 		skb_reset_network_header(frag);
780 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
781 		frag->transport_header = (frag->network_header + hlen +
782 					  sizeof(struct frag_hdr));
783 
784 		/*
785 		 *	Charge the memory for the fragment to any owner
786 		 *	it might possess
787 		 */
788 		if (skb->sk)
789 			skb_set_owner_w(frag, skb->sk);
790 
791 		/*
792 		 *	Copy the packet header into the new buffer.
793 		 */
794 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
795 
796 		/*
797 		 *	Build fragment header.
798 		 */
799 		fh->nexthdr = nexthdr;
800 		fh->reserved = 0;
801 		fh->identification = frag_id;
802 
803 		/*
804 		 *	Copy a block of the IP datagram.
805 		 */
806 		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
807 				     len));
808 		left -= len;
809 
810 		fh->frag_off = htons(offset);
811 		if (left > 0)
812 			fh->frag_off |= htons(IP6_MF);
813 		ipv6_hdr(frag)->payload_len = htons(frag->len -
814 						    sizeof(struct ipv6hdr));
815 
816 		ptr += len;
817 		offset += len;
818 
819 		/*
820 		 *	Put this fragment into the sending queue.
821 		 */
822 		err = output(net, sk, frag);
823 		if (err)
824 			goto fail;
825 
826 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
827 			      IPSTATS_MIB_FRAGCREATES);
828 	}
829 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
830 		      IPSTATS_MIB_FRAGOKS);
831 	consume_skb(skb);
832 	return err;
833 
834 fail_toobig:
835 	if (skb->sk && dst_allfrag(skb_dst(skb)))
836 		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
837 
838 	skb->dev = skb_dst(skb)->dev;
839 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
840 	err = -EMSGSIZE;
841 
842 fail:
843 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
844 		      IPSTATS_MIB_FRAGFAILS);
845 	kfree_skb(skb);
846 	return err;
847 }
848 
849 static inline int ip6_rt_check(const struct rt6key *rt_key,
850 			       const struct in6_addr *fl_addr,
851 			       const struct in6_addr *addr_cache)
852 {
853 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
854 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
855 }
856 
857 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
858 					  struct dst_entry *dst,
859 					  const struct flowi6 *fl6)
860 {
861 	struct ipv6_pinfo *np = inet6_sk(sk);
862 	struct rt6_info *rt;
863 
864 	if (!dst)
865 		goto out;
866 
867 	if (dst->ops->family != AF_INET6) {
868 		dst_release(dst);
869 		return NULL;
870 	}
871 
872 	rt = (struct rt6_info *)dst;
873 	/* Yes, checking route validity in not connected
874 	 * case is not very simple. Take into account,
875 	 * that we do not support routing by source, TOS,
876 	 * and MSG_DONTROUTE		--ANK (980726)
877 	 *
878 	 * 1. ip6_rt_check(): If route was host route,
879 	 *    check that cached destination is current.
880 	 *    If it is network route, we still may
881 	 *    check its validity using saved pointer
882 	 *    to the last used address: daddr_cache.
883 	 *    We do not want to save whole address now,
884 	 *    (because main consumer of this service
885 	 *    is tcp, which has not this problem),
886 	 *    so that the last trick works only on connected
887 	 *    sockets.
888 	 * 2. oif also should be the same.
889 	 */
890 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
891 #ifdef CONFIG_IPV6_SUBTREES
892 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
893 #endif
894 	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
895 	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
896 		dst_release(dst);
897 		dst = NULL;
898 	}
899 
900 out:
901 	return dst;
902 }
903 
904 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
905 			       struct dst_entry **dst, struct flowi6 *fl6)
906 {
907 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
908 	struct neighbour *n;
909 	struct rt6_info *rt;
910 #endif
911 	int err;
912 	int flags = 0;
913 
914 	/* The correct way to handle this would be to do
915 	 * ip6_route_get_saddr, and then ip6_route_output; however,
916 	 * the route-specific preferred source forces the
917 	 * ip6_route_output call _before_ ip6_route_get_saddr.
918 	 *
919 	 * In source specific routing (no src=any default route),
920 	 * ip6_route_output will fail given src=any saddr, though, so
921 	 * that's why we try it again later.
922 	 */
923 	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
924 		struct rt6_info *rt;
925 		bool had_dst = *dst != NULL;
926 
927 		if (!had_dst)
928 			*dst = ip6_route_output(net, sk, fl6);
929 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
930 		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
931 					  sk ? inet6_sk(sk)->srcprefs : 0,
932 					  &fl6->saddr);
933 		if (err)
934 			goto out_err_release;
935 
936 		/* If we had an erroneous initial result, pretend it
937 		 * never existed and let the SA-enabled version take
938 		 * over.
939 		 */
940 		if (!had_dst && (*dst)->error) {
941 			dst_release(*dst);
942 			*dst = NULL;
943 		}
944 
945 		if (fl6->flowi6_oif)
946 			flags |= RT6_LOOKUP_F_IFACE;
947 	}
948 
949 	if (!*dst)
950 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
951 
952 	err = (*dst)->error;
953 	if (err)
954 		goto out_err_release;
955 
956 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
957 	/*
958 	 * Here if the dst entry we've looked up
959 	 * has a neighbour entry that is in the INCOMPLETE
960 	 * state and the src address from the flow is
961 	 * marked as OPTIMISTIC, we release the found
962 	 * dst entry and replace it instead with the
963 	 * dst entry of the nexthop router
964 	 */
965 	rt = (struct rt6_info *) *dst;
966 	rcu_read_lock_bh();
967 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
968 				      rt6_nexthop(rt, &fl6->daddr));
969 	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
970 	rcu_read_unlock_bh();
971 
972 	if (err) {
973 		struct inet6_ifaddr *ifp;
974 		struct flowi6 fl_gw6;
975 		int redirect;
976 
977 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
978 				      (*dst)->dev, 1);
979 
980 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
981 		if (ifp)
982 			in6_ifa_put(ifp);
983 
984 		if (redirect) {
985 			/*
986 			 * We need to get the dst entry for the
987 			 * default router instead
988 			 */
989 			dst_release(*dst);
990 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
991 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
992 			*dst = ip6_route_output(net, sk, &fl_gw6);
993 			err = (*dst)->error;
994 			if (err)
995 				goto out_err_release;
996 		}
997 	}
998 #endif
999 
1000 	return 0;
1001 
1002 out_err_release:
1003 	if (err == -ENETUNREACH)
1004 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1005 	dst_release(*dst);
1006 	*dst = NULL;
1007 	return err;
1008 }
1009 
1010 /**
1011  *	ip6_dst_lookup - perform route lookup on flow
1012  *	@sk: socket which provides route info
1013  *	@dst: pointer to dst_entry * for result
1014  *	@fl6: flow to lookup
1015  *
1016  *	This function performs a route lookup on the given flow.
1017  *
1018  *	It returns zero on success, or a standard errno code on error.
1019  */
1020 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1021 		   struct flowi6 *fl6)
1022 {
1023 	*dst = NULL;
1024 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1025 }
1026 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1027 
1028 /**
1029  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1030  *	@sk: socket which provides route info
1031  *	@fl6: flow to lookup
1032  *	@final_dst: final destination address for ipsec lookup
1033  *
1034  *	This function performs a route lookup on the given flow.
1035  *
1036  *	It returns a valid dst pointer on success, or a pointer encoded
1037  *	error code.
1038  */
1039 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1040 				      const struct in6_addr *final_dst)
1041 {
1042 	struct dst_entry *dst = NULL;
1043 	int err;
1044 
1045 	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1046 	if (err)
1047 		return ERR_PTR(err);
1048 	if (final_dst)
1049 		fl6->daddr = *final_dst;
1050 	if (!fl6->flowi6_oif)
1051 		fl6->flowi6_oif = l3mdev_fib_oif(dst->dev);
1052 
1053 	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1054 }
1055 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1056 
1057 /**
1058  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1059  *	@sk: socket which provides the dst cache and route info
1060  *	@fl6: flow to lookup
1061  *	@final_dst: final destination address for ipsec lookup
1062  *
1063  *	This function performs a route lookup on the given flow with the
1064  *	possibility of using the cached route in the socket if it is valid.
1065  *	It will take the socket dst lock when operating on the dst cache.
1066  *	As a result, this function can only be used in process context.
1067  *
1068  *	It returns a valid dst pointer on success, or a pointer encoded
1069  *	error code.
1070  */
1071 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1072 					 const struct in6_addr *final_dst)
1073 {
1074 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1075 	int err;
1076 
1077 	dst = ip6_sk_dst_check(sk, dst, fl6);
1078 
1079 	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1080 	if (err)
1081 		return ERR_PTR(err);
1082 	if (final_dst)
1083 		fl6->daddr = *final_dst;
1084 
1085 	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1086 }
1087 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1088 
1089 static inline int ip6_ufo_append_data(struct sock *sk,
1090 			struct sk_buff_head *queue,
1091 			int getfrag(void *from, char *to, int offset, int len,
1092 			int odd, struct sk_buff *skb),
1093 			void *from, int length, int hh_len, int fragheaderlen,
1094 			int transhdrlen, int mtu, unsigned int flags,
1095 			const struct flowi6 *fl6)
1096 
1097 {
1098 	struct sk_buff *skb;
1099 	int err;
1100 
1101 	/* There is support for UDP large send offload by network
1102 	 * device, so create one single skb packet containing complete
1103 	 * udp datagram
1104 	 */
1105 	skb = skb_peek_tail(queue);
1106 	if (!skb) {
1107 		skb = sock_alloc_send_skb(sk,
1108 			hh_len + fragheaderlen + transhdrlen + 20,
1109 			(flags & MSG_DONTWAIT), &err);
1110 		if (!skb)
1111 			return err;
1112 
1113 		/* reserve space for Hardware header */
1114 		skb_reserve(skb, hh_len);
1115 
1116 		/* create space for UDP/IP header */
1117 		skb_put(skb, fragheaderlen + transhdrlen);
1118 
1119 		/* initialize network header pointer */
1120 		skb_reset_network_header(skb);
1121 
1122 		/* initialize protocol header pointer */
1123 		skb->transport_header = skb->network_header + fragheaderlen;
1124 
1125 		skb->protocol = htons(ETH_P_IPV6);
1126 		skb->csum = 0;
1127 
1128 		__skb_queue_tail(queue, skb);
1129 	} else if (skb_is_gso(skb)) {
1130 		goto append;
1131 	}
1132 
1133 	skb->ip_summed = CHECKSUM_PARTIAL;
1134 	/* Specify the length of each IPv6 datagram fragment.
1135 	 * It has to be a multiple of 8.
1136 	 */
1137 	skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1138 				     sizeof(struct frag_hdr)) & ~7;
1139 	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1140 	skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
1141 							 &fl6->daddr,
1142 							 &fl6->saddr);
1143 
1144 append:
1145 	return skb_append_datato_frags(sk, skb, getfrag, from,
1146 				       (length - transhdrlen));
1147 }
1148 
1149 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1150 					       gfp_t gfp)
1151 {
1152 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1153 }
1154 
1155 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1156 						gfp_t gfp)
1157 {
1158 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1159 }
1160 
1161 static void ip6_append_data_mtu(unsigned int *mtu,
1162 				int *maxfraglen,
1163 				unsigned int fragheaderlen,
1164 				struct sk_buff *skb,
1165 				struct rt6_info *rt,
1166 				unsigned int orig_mtu)
1167 {
1168 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1169 		if (!skb) {
1170 			/* first fragment, reserve header_len */
1171 			*mtu = orig_mtu - rt->dst.header_len;
1172 
1173 		} else {
1174 			/*
1175 			 * this fragment is not first, the headers
1176 			 * space is regarded as data space.
1177 			 */
1178 			*mtu = orig_mtu;
1179 		}
1180 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1181 			      + fragheaderlen - sizeof(struct frag_hdr);
1182 	}
1183 }
1184 
1185 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1186 			  struct inet6_cork *v6_cork,
1187 			  int hlimit, int tclass, struct ipv6_txoptions *opt,
1188 			  struct rt6_info *rt, struct flowi6 *fl6)
1189 {
1190 	struct ipv6_pinfo *np = inet6_sk(sk);
1191 	unsigned int mtu;
1192 
1193 	/*
1194 	 * setup for corking
1195 	 */
1196 	if (opt) {
1197 		if (WARN_ON(v6_cork->opt))
1198 			return -EINVAL;
1199 
1200 		v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
1201 		if (unlikely(!v6_cork->opt))
1202 			return -ENOBUFS;
1203 
1204 		v6_cork->opt->tot_len = opt->tot_len;
1205 		v6_cork->opt->opt_flen = opt->opt_flen;
1206 		v6_cork->opt->opt_nflen = opt->opt_nflen;
1207 
1208 		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1209 						    sk->sk_allocation);
1210 		if (opt->dst0opt && !v6_cork->opt->dst0opt)
1211 			return -ENOBUFS;
1212 
1213 		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1214 						    sk->sk_allocation);
1215 		if (opt->dst1opt && !v6_cork->opt->dst1opt)
1216 			return -ENOBUFS;
1217 
1218 		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1219 						   sk->sk_allocation);
1220 		if (opt->hopopt && !v6_cork->opt->hopopt)
1221 			return -ENOBUFS;
1222 
1223 		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1224 						    sk->sk_allocation);
1225 		if (opt->srcrt && !v6_cork->opt->srcrt)
1226 			return -ENOBUFS;
1227 
1228 		/* need source address above miyazawa*/
1229 	}
1230 	dst_hold(&rt->dst);
1231 	cork->base.dst = &rt->dst;
1232 	cork->fl.u.ip6 = *fl6;
1233 	v6_cork->hop_limit = hlimit;
1234 	v6_cork->tclass = tclass;
1235 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1236 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1237 		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
1238 	else
1239 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1240 		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1241 	if (np->frag_size < mtu) {
1242 		if (np->frag_size)
1243 			mtu = np->frag_size;
1244 	}
1245 	cork->base.fragsize = mtu;
1246 	if (dst_allfrag(rt->dst.path))
1247 		cork->base.flags |= IPCORK_ALLFRAG;
1248 	cork->base.length = 0;
1249 
1250 	return 0;
1251 }
1252 
1253 static int __ip6_append_data(struct sock *sk,
1254 			     struct flowi6 *fl6,
1255 			     struct sk_buff_head *queue,
1256 			     struct inet_cork *cork,
1257 			     struct inet6_cork *v6_cork,
1258 			     struct page_frag *pfrag,
1259 			     int getfrag(void *from, char *to, int offset,
1260 					 int len, int odd, struct sk_buff *skb),
1261 			     void *from, int length, int transhdrlen,
1262 			     unsigned int flags, int dontfrag)
1263 {
1264 	struct sk_buff *skb, *skb_prev = NULL;
1265 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
1266 	int exthdrlen = 0;
1267 	int dst_exthdrlen = 0;
1268 	int hh_len;
1269 	int copy;
1270 	int err;
1271 	int offset = 0;
1272 	__u8 tx_flags = 0;
1273 	u32 tskey = 0;
1274 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1275 	struct ipv6_txoptions *opt = v6_cork->opt;
1276 	int csummode = CHECKSUM_NONE;
1277 	unsigned int maxnonfragsize, headersize;
1278 
1279 	skb = skb_peek_tail(queue);
1280 	if (!skb) {
1281 		exthdrlen = opt ? opt->opt_flen : 0;
1282 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1283 	}
1284 
1285 	mtu = cork->fragsize;
1286 	orig_mtu = mtu;
1287 
1288 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1289 
1290 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1291 			(opt ? opt->opt_nflen : 0);
1292 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1293 		     sizeof(struct frag_hdr);
1294 
1295 	headersize = sizeof(struct ipv6hdr) +
1296 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1297 		     (dst_allfrag(&rt->dst) ?
1298 		      sizeof(struct frag_hdr) : 0) +
1299 		     rt->rt6i_nfheader_len;
1300 
1301 	if (cork->length + length > mtu - headersize && dontfrag &&
1302 	    (sk->sk_protocol == IPPROTO_UDP ||
1303 	     sk->sk_protocol == IPPROTO_RAW)) {
1304 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1305 				sizeof(struct ipv6hdr));
1306 		goto emsgsize;
1307 	}
1308 
1309 	if (ip6_sk_ignore_df(sk))
1310 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1311 	else
1312 		maxnonfragsize = mtu;
1313 
1314 	if (cork->length + length > maxnonfragsize - headersize) {
1315 emsgsize:
1316 		ipv6_local_error(sk, EMSGSIZE, fl6,
1317 				 mtu - headersize +
1318 				 sizeof(struct ipv6hdr));
1319 		return -EMSGSIZE;
1320 	}
1321 
1322 	/* CHECKSUM_PARTIAL only with no extension headers and when
1323 	 * we are not going to fragment
1324 	 */
1325 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1326 	    headersize == sizeof(struct ipv6hdr) &&
1327 	    length < mtu - headersize &&
1328 	    !(flags & MSG_MORE) &&
1329 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1330 		csummode = CHECKSUM_PARTIAL;
1331 
1332 	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1333 		sock_tx_timestamp(sk, &tx_flags);
1334 		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1335 		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1336 			tskey = sk->sk_tskey++;
1337 	}
1338 
1339 	/*
1340 	 * Let's try using as much space as possible.
1341 	 * Use MTU if total length of the message fits into the MTU.
1342 	 * Otherwise, we need to reserve fragment header and
1343 	 * fragment alignment (= 8-15 octects, in total).
1344 	 *
1345 	 * Note that we may need to "move" the data from the tail of
1346 	 * of the buffer to the new fragment when we split
1347 	 * the message.
1348 	 *
1349 	 * FIXME: It may be fragmented into multiple chunks
1350 	 *        at once if non-fragmentable extension headers
1351 	 *        are too large.
1352 	 * --yoshfuji
1353 	 */
1354 
1355 	cork->length += length;
1356 	if (((length > mtu) ||
1357 	     (skb && skb_is_gso(skb))) &&
1358 	    (sk->sk_protocol == IPPROTO_UDP) &&
1359 	    (rt->dst.dev->features & NETIF_F_UFO) &&
1360 	    (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) {
1361 		err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1362 					  hh_len, fragheaderlen,
1363 					  transhdrlen, mtu, flags, fl6);
1364 		if (err)
1365 			goto error;
1366 		return 0;
1367 	}
1368 
1369 	if (!skb)
1370 		goto alloc_new_skb;
1371 
1372 	while (length > 0) {
1373 		/* Check if the remaining data fits into current packet. */
1374 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1375 		if (copy < length)
1376 			copy = maxfraglen - skb->len;
1377 
1378 		if (copy <= 0) {
1379 			char *data;
1380 			unsigned int datalen;
1381 			unsigned int fraglen;
1382 			unsigned int fraggap;
1383 			unsigned int alloclen;
1384 alloc_new_skb:
1385 			/* There's no room in the current skb */
1386 			if (skb)
1387 				fraggap = skb->len - maxfraglen;
1388 			else
1389 				fraggap = 0;
1390 			/* update mtu and maxfraglen if necessary */
1391 			if (!skb || !skb_prev)
1392 				ip6_append_data_mtu(&mtu, &maxfraglen,
1393 						    fragheaderlen, skb, rt,
1394 						    orig_mtu);
1395 
1396 			skb_prev = skb;
1397 
1398 			/*
1399 			 * If remaining data exceeds the mtu,
1400 			 * we know we need more fragment(s).
1401 			 */
1402 			datalen = length + fraggap;
1403 
1404 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1405 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1406 			if ((flags & MSG_MORE) &&
1407 			    !(rt->dst.dev->features&NETIF_F_SG))
1408 				alloclen = mtu;
1409 			else
1410 				alloclen = datalen + fragheaderlen;
1411 
1412 			alloclen += dst_exthdrlen;
1413 
1414 			if (datalen != length + fraggap) {
1415 				/*
1416 				 * this is not the last fragment, the trailer
1417 				 * space is regarded as data space.
1418 				 */
1419 				datalen += rt->dst.trailer_len;
1420 			}
1421 
1422 			alloclen += rt->dst.trailer_len;
1423 			fraglen = datalen + fragheaderlen;
1424 
1425 			/*
1426 			 * We just reserve space for fragment header.
1427 			 * Note: this may be overallocation if the message
1428 			 * (without MSG_MORE) fits into the MTU.
1429 			 */
1430 			alloclen += sizeof(struct frag_hdr);
1431 
1432 			if (transhdrlen) {
1433 				skb = sock_alloc_send_skb(sk,
1434 						alloclen + hh_len,
1435 						(flags & MSG_DONTWAIT), &err);
1436 			} else {
1437 				skb = NULL;
1438 				if (atomic_read(&sk->sk_wmem_alloc) <=
1439 				    2 * sk->sk_sndbuf)
1440 					skb = sock_wmalloc(sk,
1441 							   alloclen + hh_len, 1,
1442 							   sk->sk_allocation);
1443 				if (unlikely(!skb))
1444 					err = -ENOBUFS;
1445 			}
1446 			if (!skb)
1447 				goto error;
1448 			/*
1449 			 *	Fill in the control structures
1450 			 */
1451 			skb->protocol = htons(ETH_P_IPV6);
1452 			skb->ip_summed = csummode;
1453 			skb->csum = 0;
1454 			/* reserve for fragmentation and ipsec header */
1455 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1456 				    dst_exthdrlen);
1457 
1458 			/* Only the initial fragment is time stamped */
1459 			skb_shinfo(skb)->tx_flags = tx_flags;
1460 			tx_flags = 0;
1461 			skb_shinfo(skb)->tskey = tskey;
1462 			tskey = 0;
1463 
1464 			/*
1465 			 *	Find where to start putting bytes
1466 			 */
1467 			data = skb_put(skb, fraglen);
1468 			skb_set_network_header(skb, exthdrlen);
1469 			data += fragheaderlen;
1470 			skb->transport_header = (skb->network_header +
1471 						 fragheaderlen);
1472 			if (fraggap) {
1473 				skb->csum = skb_copy_and_csum_bits(
1474 					skb_prev, maxfraglen,
1475 					data + transhdrlen, fraggap, 0);
1476 				skb_prev->csum = csum_sub(skb_prev->csum,
1477 							  skb->csum);
1478 				data += fraggap;
1479 				pskb_trim_unique(skb_prev, maxfraglen);
1480 			}
1481 			copy = datalen - transhdrlen - fraggap;
1482 
1483 			if (copy < 0) {
1484 				err = -EINVAL;
1485 				kfree_skb(skb);
1486 				goto error;
1487 			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1488 				err = -EFAULT;
1489 				kfree_skb(skb);
1490 				goto error;
1491 			}
1492 
1493 			offset += copy;
1494 			length -= datalen - fraggap;
1495 			transhdrlen = 0;
1496 			exthdrlen = 0;
1497 			dst_exthdrlen = 0;
1498 
1499 			/*
1500 			 * Put the packet on the pending queue
1501 			 */
1502 			__skb_queue_tail(queue, skb);
1503 			continue;
1504 		}
1505 
1506 		if (copy > length)
1507 			copy = length;
1508 
1509 		if (!(rt->dst.dev->features&NETIF_F_SG)) {
1510 			unsigned int off;
1511 
1512 			off = skb->len;
1513 			if (getfrag(from, skb_put(skb, copy),
1514 						offset, copy, off, skb) < 0) {
1515 				__skb_trim(skb, off);
1516 				err = -EFAULT;
1517 				goto error;
1518 			}
1519 		} else {
1520 			int i = skb_shinfo(skb)->nr_frags;
1521 
1522 			err = -ENOMEM;
1523 			if (!sk_page_frag_refill(sk, pfrag))
1524 				goto error;
1525 
1526 			if (!skb_can_coalesce(skb, i, pfrag->page,
1527 					      pfrag->offset)) {
1528 				err = -EMSGSIZE;
1529 				if (i == MAX_SKB_FRAGS)
1530 					goto error;
1531 
1532 				__skb_fill_page_desc(skb, i, pfrag->page,
1533 						     pfrag->offset, 0);
1534 				skb_shinfo(skb)->nr_frags = ++i;
1535 				get_page(pfrag->page);
1536 			}
1537 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1538 			if (getfrag(from,
1539 				    page_address(pfrag->page) + pfrag->offset,
1540 				    offset, copy, skb->len, skb) < 0)
1541 				goto error_efault;
1542 
1543 			pfrag->offset += copy;
1544 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1545 			skb->len += copy;
1546 			skb->data_len += copy;
1547 			skb->truesize += copy;
1548 			atomic_add(copy, &sk->sk_wmem_alloc);
1549 		}
1550 		offset += copy;
1551 		length -= copy;
1552 	}
1553 
1554 	return 0;
1555 
1556 error_efault:
1557 	err = -EFAULT;
1558 error:
1559 	cork->length -= length;
1560 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1561 	return err;
1562 }
1563 
1564 int ip6_append_data(struct sock *sk,
1565 		    int getfrag(void *from, char *to, int offset, int len,
1566 				int odd, struct sk_buff *skb),
1567 		    void *from, int length, int transhdrlen, int hlimit,
1568 		    int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1569 		    struct rt6_info *rt, unsigned int flags, int dontfrag)
1570 {
1571 	struct inet_sock *inet = inet_sk(sk);
1572 	struct ipv6_pinfo *np = inet6_sk(sk);
1573 	int exthdrlen;
1574 	int err;
1575 
1576 	if (flags&MSG_PROBE)
1577 		return 0;
1578 	if (skb_queue_empty(&sk->sk_write_queue)) {
1579 		/*
1580 		 * setup for corking
1581 		 */
1582 		err = ip6_setup_cork(sk, &inet->cork, &np->cork, hlimit,
1583 				     tclass, opt, rt, fl6);
1584 		if (err)
1585 			return err;
1586 
1587 		exthdrlen = (opt ? opt->opt_flen : 0);
1588 		length += exthdrlen;
1589 		transhdrlen += exthdrlen;
1590 	} else {
1591 		fl6 = &inet->cork.fl.u.ip6;
1592 		transhdrlen = 0;
1593 	}
1594 
1595 	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1596 				 &np->cork, sk_page_frag(sk), getfrag,
1597 				 from, length, transhdrlen, flags, dontfrag);
1598 }
1599 EXPORT_SYMBOL_GPL(ip6_append_data);
1600 
1601 static void ip6_cork_release(struct inet_cork_full *cork,
1602 			     struct inet6_cork *v6_cork)
1603 {
1604 	if (v6_cork->opt) {
1605 		kfree(v6_cork->opt->dst0opt);
1606 		kfree(v6_cork->opt->dst1opt);
1607 		kfree(v6_cork->opt->hopopt);
1608 		kfree(v6_cork->opt->srcrt);
1609 		kfree(v6_cork->opt);
1610 		v6_cork->opt = NULL;
1611 	}
1612 
1613 	if (cork->base.dst) {
1614 		dst_release(cork->base.dst);
1615 		cork->base.dst = NULL;
1616 		cork->base.flags &= ~IPCORK_ALLFRAG;
1617 	}
1618 	memset(&cork->fl, 0, sizeof(cork->fl));
1619 }
1620 
1621 struct sk_buff *__ip6_make_skb(struct sock *sk,
1622 			       struct sk_buff_head *queue,
1623 			       struct inet_cork_full *cork,
1624 			       struct inet6_cork *v6_cork)
1625 {
1626 	struct sk_buff *skb, *tmp_skb;
1627 	struct sk_buff **tail_skb;
1628 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1629 	struct ipv6_pinfo *np = inet6_sk(sk);
1630 	struct net *net = sock_net(sk);
1631 	struct ipv6hdr *hdr;
1632 	struct ipv6_txoptions *opt = v6_cork->opt;
1633 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1634 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1635 	unsigned char proto = fl6->flowi6_proto;
1636 
1637 	skb = __skb_dequeue(queue);
1638 	if (!skb)
1639 		goto out;
1640 	tail_skb = &(skb_shinfo(skb)->frag_list);
1641 
1642 	/* move skb->data to ip header from ext header */
1643 	if (skb->data < skb_network_header(skb))
1644 		__skb_pull(skb, skb_network_offset(skb));
1645 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1646 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1647 		*tail_skb = tmp_skb;
1648 		tail_skb = &(tmp_skb->next);
1649 		skb->len += tmp_skb->len;
1650 		skb->data_len += tmp_skb->len;
1651 		skb->truesize += tmp_skb->truesize;
1652 		tmp_skb->destructor = NULL;
1653 		tmp_skb->sk = NULL;
1654 	}
1655 
1656 	/* Allow local fragmentation. */
1657 	skb->ignore_df = ip6_sk_ignore_df(sk);
1658 
1659 	*final_dst = fl6->daddr;
1660 	__skb_pull(skb, skb_network_header_len(skb));
1661 	if (opt && opt->opt_flen)
1662 		ipv6_push_frag_opts(skb, opt, &proto);
1663 	if (opt && opt->opt_nflen)
1664 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1665 
1666 	skb_push(skb, sizeof(struct ipv6hdr));
1667 	skb_reset_network_header(skb);
1668 	hdr = ipv6_hdr(skb);
1669 
1670 	ip6_flow_hdr(hdr, v6_cork->tclass,
1671 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1672 					np->autoflowlabel, fl6));
1673 	hdr->hop_limit = v6_cork->hop_limit;
1674 	hdr->nexthdr = proto;
1675 	hdr->saddr = fl6->saddr;
1676 	hdr->daddr = *final_dst;
1677 
1678 	skb->priority = sk->sk_priority;
1679 	skb->mark = sk->sk_mark;
1680 
1681 	skb_dst_set(skb, dst_clone(&rt->dst));
1682 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1683 	if (proto == IPPROTO_ICMPV6) {
1684 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1685 
1686 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1687 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1688 	}
1689 
1690 	ip6_cork_release(cork, v6_cork);
1691 out:
1692 	return skb;
1693 }
1694 
1695 int ip6_send_skb(struct sk_buff *skb)
1696 {
1697 	struct net *net = sock_net(skb->sk);
1698 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1699 	int err;
1700 
1701 	err = ip6_local_out(net, skb->sk, skb);
1702 	if (err) {
1703 		if (err > 0)
1704 			err = net_xmit_errno(err);
1705 		if (err)
1706 			IP6_INC_STATS(net, rt->rt6i_idev,
1707 				      IPSTATS_MIB_OUTDISCARDS);
1708 	}
1709 
1710 	return err;
1711 }
1712 
/*
 * Build the pending data of @sk into one skb with ip6_finish_skb() and
 * send it.  Returns 0 when there was nothing to send, otherwise the
 * result of ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *pending = ip6_finish_skb(sk);

	return pending ? ip6_send_skb(pending) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1724 
1725 static void __ip6_flush_pending_frames(struct sock *sk,
1726 				       struct sk_buff_head *queue,
1727 				       struct inet_cork_full *cork,
1728 				       struct inet6_cork *v6_cork)
1729 {
1730 	struct sk_buff *skb;
1731 
1732 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1733 		if (skb_dst(skb))
1734 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1735 				      IPSTATS_MIB_OUTDISCARDS);
1736 		kfree_skb(skb);
1737 	}
1738 
1739 	ip6_cork_release(cork, v6_cork);
1740 }
1741 
1742 void ip6_flush_pending_frames(struct sock *sk)
1743 {
1744 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1745 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1746 }
1747 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1748 
1749 struct sk_buff *ip6_make_skb(struct sock *sk,
1750 			     int getfrag(void *from, char *to, int offset,
1751 					 int len, int odd, struct sk_buff *skb),
1752 			     void *from, int length, int transhdrlen,
1753 			     int hlimit, int tclass,
1754 			     struct ipv6_txoptions *opt, struct flowi6 *fl6,
1755 			     struct rt6_info *rt, unsigned int flags,
1756 			     int dontfrag)
1757 {
1758 	struct inet_cork_full cork;
1759 	struct inet6_cork v6_cork;
1760 	struct sk_buff_head queue;
1761 	int exthdrlen = (opt ? opt->opt_flen : 0);
1762 	int err;
1763 
1764 	if (flags & MSG_PROBE)
1765 		return NULL;
1766 
1767 	__skb_queue_head_init(&queue);
1768 
1769 	cork.base.flags = 0;
1770 	cork.base.addr = 0;
1771 	cork.base.opt = NULL;
1772 	v6_cork.opt = NULL;
1773 	err = ip6_setup_cork(sk, &cork, &v6_cork, hlimit, tclass, opt, rt, fl6);
1774 	if (err)
1775 		return ERR_PTR(err);
1776 
1777 	if (dontfrag < 0)
1778 		dontfrag = inet6_sk(sk)->dontfrag;
1779 
1780 	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1781 				&current->task_frag, getfrag, from,
1782 				length + exthdrlen, transhdrlen + exthdrlen,
1783 				flags, dontfrag);
1784 	if (err) {
1785 		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1786 		return ERR_PTR(err);
1787 	}
1788 
1789 	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1790 }
1791