xref: /linux/net/ipv6/ip6_output.c (revision 9ee0034b8f49aaaa7e7c2da8db1038915db99c19)
1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	:	frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44 
45 #include <net/sock.h>
46 #include <net/snmp.h>
47 
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58 #include <net/l3mdev.h>
59 #include <net/lwtunnel.h>
60 
61 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
62 {
63 	struct dst_entry *dst = skb_dst(skb);
64 	struct net_device *dev = dst->dev;
65 	struct neighbour *neigh;
66 	struct in6_addr *nexthop;
67 	int ret;
68 
69 	skb->protocol = htons(ETH_P_IPV6);
70 	skb->dev = dev;
71 
72 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
73 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
74 
75 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
76 		    ((mroute6_socket(net, skb) &&
77 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
78 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
79 					 &ipv6_hdr(skb)->saddr))) {
80 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
81 
82 			/* Do not check for IFF_ALLMULTI; multicast routing
83 			   is not supported in any case.
84 			 */
85 			if (newskb)
86 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
87 					net, sk, newskb, NULL, newskb->dev,
88 					dev_loopback_xmit);
89 
90 			if (ipv6_hdr(skb)->hop_limit == 0) {
91 				IP6_INC_STATS(net, idev,
92 					      IPSTATS_MIB_OUTDISCARDS);
93 				kfree_skb(skb);
94 				return 0;
95 			}
96 		}
97 
98 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
99 
100 		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
101 		    IPV6_ADDR_SCOPE_NODELOCAL &&
102 		    !(dev->flags & IFF_LOOPBACK)) {
103 			kfree_skb(skb);
104 			return 0;
105 		}
106 	}
107 
108 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
109 		int res = lwtunnel_xmit(skb);
110 
111 		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
112 			return res;
113 	}
114 
115 	rcu_read_lock_bh();
116 	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
117 	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
118 	if (unlikely(!neigh))
119 		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
120 	if (!IS_ERR(neigh)) {
121 		ret = dst_neigh_output(dst, neigh, skb);
122 		rcu_read_unlock_bh();
123 		return ret;
124 	}
125 	rcu_read_unlock_bh();
126 
127 	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
128 	kfree_skb(skb);
129 	return -EINVAL;
130 }
131 
132 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
133 {
134 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
135 	    dst_allfrag(skb_dst(skb)) ||
136 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
137 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
138 	else
139 		return ip6_finish_output2(net, sk, skb);
140 }
141 
142 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
143 {
144 	struct net_device *dev = skb_dst(skb)->dev;
145 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
146 
147 	if (unlikely(idev->cnf.disable_ipv6)) {
148 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
149 		kfree_skb(skb);
150 		return 0;
151 	}
152 
153 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
154 			    net, sk, skb, NULL, dev,
155 			    ip6_finish_output,
156 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
157 }
158 
159 /*
160  * xmit an sk_buff (used by TCP, SCTP and DCCP)
161  * Note : socket lock is not held for SYNACK packets, but might be modified
162  * by calls to skb_set_owner_w() and ipv6_local_error(),
163  * which are using proper atomic operations or spinlocks.
164  */
165 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
166 	     struct ipv6_txoptions *opt, int tclass)
167 {
168 	struct net *net = sock_net(sk);
169 	const struct ipv6_pinfo *np = inet6_sk(sk);
170 	struct in6_addr *first_hop = &fl6->daddr;
171 	struct dst_entry *dst = skb_dst(skb);
172 	struct ipv6hdr *hdr;
173 	u8  proto = fl6->flowi6_proto;
174 	int seg_len = skb->len;
175 	int hlimit = -1;
176 	u32 mtu;
177 
178 	if (opt) {
179 		unsigned int head_room;
180 
181 		/* First: exthdrs may take lots of space (~8K for now)
182 		   MAX_HEADER is not enough.
183 		 */
184 		head_room = opt->opt_nflen + opt->opt_flen;
185 		seg_len += head_room;
186 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
187 
188 		if (skb_headroom(skb) < head_room) {
189 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
190 			if (!skb2) {
191 				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
192 					      IPSTATS_MIB_OUTDISCARDS);
193 				kfree_skb(skb);
194 				return -ENOBUFS;
195 			}
196 			consume_skb(skb);
197 			skb = skb2;
198 			/* skb_set_owner_w() changes sk->sk_wmem_alloc atomically,
199 			 * it is safe to call in our context (socket lock not held)
200 			 */
201 			skb_set_owner_w(skb, (struct sock *)sk);
202 		}
203 		if (opt->opt_flen)
204 			ipv6_push_frag_opts(skb, opt, &proto);
205 		if (opt->opt_nflen)
206 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
207 	}
208 
209 	skb_push(skb, sizeof(struct ipv6hdr));
210 	skb_reset_network_header(skb);
211 	hdr = ipv6_hdr(skb);
212 
213 	/*
214 	 *	Fill in the IPv6 header
215 	 */
216 	if (np)
217 		hlimit = np->hop_limit;
218 	if (hlimit < 0)
219 		hlimit = ip6_dst_hoplimit(dst);
220 
221 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
222 						     np->autoflowlabel, fl6));
223 
224 	hdr->payload_len = htons(seg_len);
225 	hdr->nexthdr = proto;
226 	hdr->hop_limit = hlimit;
227 
228 	hdr->saddr = fl6->saddr;
229 	hdr->daddr = *first_hop;
230 
231 	skb->protocol = htons(ETH_P_IPV6);
232 	skb->priority = sk->sk_priority;
233 	skb->mark = sk->sk_mark;
234 
235 	mtu = dst_mtu(dst);
236 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
237 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
238 			      IPSTATS_MIB_OUT, skb->len);
239 		/* hooks should never assume socket lock is held.
240 		 * we promote our socket to non const
241 		 */
242 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
243 			       net, (struct sock *)sk, skb, NULL, dst->dev,
244 			       dst_output);
245 	}
246 
247 	skb->dev = dst->dev;
248 	/* ipv6_local_error() does not require socket lock,
249 	 * we promote our socket to non const
250 	 */
251 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
252 
253 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
254 	kfree_skb(skb);
255 	return -EMSGSIZE;
256 }
257 EXPORT_SYMBOL(ip6_xmit);
258 
259 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
260 {
261 	struct ip6_ra_chain *ra;
262 	struct sock *last = NULL;
263 
264 	read_lock(&ip6_ra_lock);
265 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
266 		struct sock *sk = ra->sk;
267 		if (sk && ra->sel == sel &&
268 		    (!sk->sk_bound_dev_if ||
269 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
270 			if (last) {
271 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
272 				if (skb2)
273 					rawv6_rcv(last, skb2);
274 			}
275 			last = sk;
276 		}
277 	}
278 
279 	if (last) {
280 		rawv6_rcv(last, skb);
281 		read_unlock(&ip6_ra_lock);
282 		return 1;
283 	}
284 	read_unlock(&ip6_ra_lock);
285 	return 0;
286 }
287 
288 static int ip6_forward_proxy_check(struct sk_buff *skb)
289 {
290 	struct ipv6hdr *hdr = ipv6_hdr(skb);
291 	u8 nexthdr = hdr->nexthdr;
292 	__be16 frag_off;
293 	int offset;
294 
295 	if (ipv6_ext_hdr(nexthdr)) {
296 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
297 		if (offset < 0)
298 			return 0;
299 	} else
300 		offset = sizeof(struct ipv6hdr);
301 
302 	if (nexthdr == IPPROTO_ICMPV6) {
303 		struct icmp6hdr *icmp6;
304 
305 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
306 					 offset + 1 - skb->data)))
307 			return 0;
308 
309 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
310 
311 		switch (icmp6->icmp6_type) {
312 		case NDISC_ROUTER_SOLICITATION:
313 		case NDISC_ROUTER_ADVERTISEMENT:
314 		case NDISC_NEIGHBOUR_SOLICITATION:
315 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
316 		case NDISC_REDIRECT:
317 			/* For reaction involving unicast neighbor discovery
318 			 * message destined to the proxied address, pass it to
319 			 * input function.
320 			 */
321 			return 1;
322 		default:
323 			break;
324 		}
325 	}
326 
327 	/*
328 	 * The proxying router can't forward traffic sent to a link-local
329 	 * address, so signal the sender and discard the packet. This
330 	 * behavior is clarified by the MIPv6 specification.
331 	 */
332 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
333 		dst_link_failure(skb);
334 		return -1;
335 	}
336 
337 	return 0;
338 }
339 
/* Continuation of the FORWARD hook: hand the packet to the route's output. */
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
	int rc = dst_output(net, sk, skb);

	return rc;
}
345 
346 static unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst)
347 {
348 	unsigned int mtu;
349 	struct inet6_dev *idev;
350 
351 	if (dst_metric_locked(dst, RTAX_MTU)) {
352 		mtu = dst_metric_raw(dst, RTAX_MTU);
353 		if (mtu)
354 			return mtu;
355 	}
356 
357 	mtu = IPV6_MIN_MTU;
358 	rcu_read_lock();
359 	idev = __in6_dev_get(dst->dev);
360 	if (idev)
361 		mtu = idev->cnf.mtu6;
362 	rcu_read_unlock();
363 
364 	return mtu;
365 }
366 
367 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
368 {
369 	if (skb->len <= mtu)
370 		return false;
371 
372 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
373 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
374 		return true;
375 
376 	if (skb->ignore_df)
377 		return false;
378 
379 	if (skb_is_gso(skb) && skb_gso_validate_mtu(skb, mtu))
380 		return false;
381 
382 	return true;
383 }
384 
385 int ip6_forward(struct sk_buff *skb)
386 {
387 	struct dst_entry *dst = skb_dst(skb);
388 	struct ipv6hdr *hdr = ipv6_hdr(skb);
389 	struct inet6_skb_parm *opt = IP6CB(skb);
390 	struct net *net = dev_net(dst->dev);
391 	u32 mtu;
392 
393 	if (net->ipv6.devconf_all->forwarding == 0)
394 		goto error;
395 
396 	if (skb->pkt_type != PACKET_HOST)
397 		goto drop;
398 
399 	if (unlikely(skb->sk))
400 		goto drop;
401 
402 	if (skb_warn_if_lro(skb))
403 		goto drop;
404 
405 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
406 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
407 				IPSTATS_MIB_INDISCARDS);
408 		goto drop;
409 	}
410 
411 	skb_forward_csum(skb);
412 
413 	/*
414 	 *	We DO NOT make any processing on
415 	 *	RA packets, pushing them to user level AS IS
416 	 *	without ane WARRANTY that application will be able
417 	 *	to interpret them. The reason is that we
418 	 *	cannot make anything clever here.
419 	 *
420 	 *	We are not end-node, so that if packet contains
421 	 *	AH/ESP, we cannot make anything.
422 	 *	Defragmentation also would be mistake, RA packets
423 	 *	cannot be fragmented, because there is no warranty
424 	 *	that different fragments will go along one path. --ANK
425 	 */
426 	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
427 		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
428 			return 0;
429 	}
430 
431 	/*
432 	 *	check and decrement ttl
433 	 */
434 	if (hdr->hop_limit <= 1) {
435 		/* Force OUTPUT device used as source address */
436 		skb->dev = dst->dev;
437 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
438 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
439 				IPSTATS_MIB_INHDRERRORS);
440 
441 		kfree_skb(skb);
442 		return -ETIMEDOUT;
443 	}
444 
445 	/* XXX: idev->cnf.proxy_ndp? */
446 	if (net->ipv6.devconf_all->proxy_ndp &&
447 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
448 		int proxied = ip6_forward_proxy_check(skb);
449 		if (proxied > 0)
450 			return ip6_input(skb);
451 		else if (proxied < 0) {
452 			__IP6_INC_STATS(net, ip6_dst_idev(dst),
453 					IPSTATS_MIB_INDISCARDS);
454 			goto drop;
455 		}
456 	}
457 
458 	if (!xfrm6_route_forward(skb)) {
459 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
460 				IPSTATS_MIB_INDISCARDS);
461 		goto drop;
462 	}
463 	dst = skb_dst(skb);
464 
465 	/* IPv6 specs say nothing about it, but it is clear that we cannot
466 	   send redirects to source routed frames.
467 	   We don't send redirects to frames decapsulated from IPsec.
468 	 */
469 	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
470 		struct in6_addr *target = NULL;
471 		struct inet_peer *peer;
472 		struct rt6_info *rt;
473 
474 		/*
475 		 *	incoming and outgoing devices are the same
476 		 *	send a redirect.
477 		 */
478 
479 		rt = (struct rt6_info *) dst;
480 		if (rt->rt6i_flags & RTF_GATEWAY)
481 			target = &rt->rt6i_gateway;
482 		else
483 			target = &hdr->daddr;
484 
485 		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);
486 
487 		/* Limit redirects both by destination (here)
488 		   and by source (inside ndisc_send_redirect)
489 		 */
490 		if (inet_peer_xrlim_allow(peer, 1*HZ))
491 			ndisc_send_redirect(skb, target);
492 		if (peer)
493 			inet_putpeer(peer);
494 	} else {
495 		int addrtype = ipv6_addr_type(&hdr->saddr);
496 
497 		/* This check is security critical. */
498 		if (addrtype == IPV6_ADDR_ANY ||
499 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
500 			goto error;
501 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
502 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
503 				    ICMPV6_NOT_NEIGHBOUR, 0);
504 			goto error;
505 		}
506 	}
507 
508 	mtu = ip6_dst_mtu_forward(dst);
509 	if (mtu < IPV6_MIN_MTU)
510 		mtu = IPV6_MIN_MTU;
511 
512 	if (ip6_pkt_too_big(skb, mtu)) {
513 		/* Again, force OUTPUT device used as source address */
514 		skb->dev = dst->dev;
515 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
516 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
517 				IPSTATS_MIB_INTOOBIGERRORS);
518 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
519 				IPSTATS_MIB_FRAGFAILS);
520 		kfree_skb(skb);
521 		return -EMSGSIZE;
522 	}
523 
524 	if (skb_cow(skb, dst->dev->hard_header_len)) {
525 		__IP6_INC_STATS(net, ip6_dst_idev(dst),
526 				IPSTATS_MIB_OUTDISCARDS);
527 		goto drop;
528 	}
529 
530 	hdr = ipv6_hdr(skb);
531 
532 	/* Mangling hops number delayed to point after skb COW */
533 
534 	hdr->hop_limit--;
535 
536 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
537 	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
538 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
539 		       net, NULL, skb, skb->dev, dst->dev,
540 		       ip6_forward_finish);
541 
542 error:
543 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
544 drop:
545 	kfree_skb(skb);
546 	return -EINVAL;
547 }
548 
549 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
550 {
551 	to->pkt_type = from->pkt_type;
552 	to->priority = from->priority;
553 	to->protocol = from->protocol;
554 	skb_dst_drop(to);
555 	skb_dst_set(to, dst_clone(skb_dst(from)));
556 	to->dev = from->dev;
557 	to->mark = from->mark;
558 
559 #ifdef CONFIG_NET_SCHED
560 	to->tc_index = from->tc_index;
561 #endif
562 	nf_copy(to, from);
563 	skb_copy_secmark(to, from);
564 }
565 
566 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
567 		 int (*output)(struct net *, struct sock *, struct sk_buff *))
568 {
569 	struct sk_buff *frag;
570 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
571 	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
572 				inet6_sk(skb->sk) : NULL;
573 	struct ipv6hdr *tmp_hdr;
574 	struct frag_hdr *fh;
575 	unsigned int mtu, hlen, left, len;
576 	int hroom, troom;
577 	__be32 frag_id;
578 	int ptr, offset = 0, err = 0;
579 	u8 *prevhdr, nexthdr = 0;
580 
581 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
582 	nexthdr = *prevhdr;
583 
584 	mtu = ip6_skb_dst_mtu(skb);
585 
586 	/* We must not fragment if the socket is set to force MTU discovery
587 	 * or if the skb it not generated by a local socket.
588 	 */
589 	if (unlikely(!skb->ignore_df && skb->len > mtu))
590 		goto fail_toobig;
591 
592 	if (IP6CB(skb)->frag_max_size) {
593 		if (IP6CB(skb)->frag_max_size > mtu)
594 			goto fail_toobig;
595 
596 		/* don't send fragments larger than what we received */
597 		mtu = IP6CB(skb)->frag_max_size;
598 		if (mtu < IPV6_MIN_MTU)
599 			mtu = IPV6_MIN_MTU;
600 	}
601 
602 	if (np && np->frag_size < mtu) {
603 		if (np->frag_size)
604 			mtu = np->frag_size;
605 	}
606 	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
607 		goto fail_toobig;
608 	mtu -= hlen + sizeof(struct frag_hdr);
609 
610 	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
611 				    &ipv6_hdr(skb)->saddr);
612 
613 	if (skb->ip_summed == CHECKSUM_PARTIAL &&
614 	    (err = skb_checksum_help(skb)))
615 		goto fail;
616 
617 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
618 	if (skb_has_frag_list(skb)) {
619 		int first_len = skb_pagelen(skb);
620 		struct sk_buff *frag2;
621 
622 		if (first_len - hlen > mtu ||
623 		    ((first_len - hlen) & 7) ||
624 		    skb_cloned(skb) ||
625 		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
626 			goto slow_path;
627 
628 		skb_walk_frags(skb, frag) {
629 			/* Correct geometry. */
630 			if (frag->len > mtu ||
631 			    ((frag->len & 7) && frag->next) ||
632 			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
633 				goto slow_path_clean;
634 
635 			/* Partially cloned skb? */
636 			if (skb_shared(frag))
637 				goto slow_path_clean;
638 
639 			BUG_ON(frag->sk);
640 			if (skb->sk) {
641 				frag->sk = skb->sk;
642 				frag->destructor = sock_wfree;
643 			}
644 			skb->truesize -= frag->truesize;
645 		}
646 
647 		err = 0;
648 		offset = 0;
649 		/* BUILD HEADER */
650 
651 		*prevhdr = NEXTHDR_FRAGMENT;
652 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
653 		if (!tmp_hdr) {
654 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
655 				      IPSTATS_MIB_FRAGFAILS);
656 			err = -ENOMEM;
657 			goto fail;
658 		}
659 		frag = skb_shinfo(skb)->frag_list;
660 		skb_frag_list_init(skb);
661 
662 		__skb_pull(skb, hlen);
663 		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
664 		__skb_push(skb, hlen);
665 		skb_reset_network_header(skb);
666 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
667 
668 		fh->nexthdr = nexthdr;
669 		fh->reserved = 0;
670 		fh->frag_off = htons(IP6_MF);
671 		fh->identification = frag_id;
672 
673 		first_len = skb_pagelen(skb);
674 		skb->data_len = first_len - skb_headlen(skb);
675 		skb->len = first_len;
676 		ipv6_hdr(skb)->payload_len = htons(first_len -
677 						   sizeof(struct ipv6hdr));
678 
679 		dst_hold(&rt->dst);
680 
681 		for (;;) {
682 			/* Prepare header of the next frame,
683 			 * before previous one went down. */
684 			if (frag) {
685 				frag->ip_summed = CHECKSUM_NONE;
686 				skb_reset_transport_header(frag);
687 				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
688 				__skb_push(frag, hlen);
689 				skb_reset_network_header(frag);
690 				memcpy(skb_network_header(frag), tmp_hdr,
691 				       hlen);
692 				offset += skb->len - hlen - sizeof(struct frag_hdr);
693 				fh->nexthdr = nexthdr;
694 				fh->reserved = 0;
695 				fh->frag_off = htons(offset);
696 				if (frag->next)
697 					fh->frag_off |= htons(IP6_MF);
698 				fh->identification = frag_id;
699 				ipv6_hdr(frag)->payload_len =
700 						htons(frag->len -
701 						      sizeof(struct ipv6hdr));
702 				ip6_copy_metadata(frag, skb);
703 			}
704 
705 			err = output(net, sk, skb);
706 			if (!err)
707 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
708 					      IPSTATS_MIB_FRAGCREATES);
709 
710 			if (err || !frag)
711 				break;
712 
713 			skb = frag;
714 			frag = skb->next;
715 			skb->next = NULL;
716 		}
717 
718 		kfree(tmp_hdr);
719 
720 		if (err == 0) {
721 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
722 				      IPSTATS_MIB_FRAGOKS);
723 			ip6_rt_put(rt);
724 			return 0;
725 		}
726 
727 		kfree_skb_list(frag);
728 
729 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
730 			      IPSTATS_MIB_FRAGFAILS);
731 		ip6_rt_put(rt);
732 		return err;
733 
734 slow_path_clean:
735 		skb_walk_frags(skb, frag2) {
736 			if (frag2 == frag)
737 				break;
738 			frag2->sk = NULL;
739 			frag2->destructor = NULL;
740 			skb->truesize += frag2->truesize;
741 		}
742 	}
743 
744 slow_path:
745 	left = skb->len - hlen;		/* Space per frame */
746 	ptr = hlen;			/* Where to start from */
747 
748 	/*
749 	 *	Fragment the datagram.
750 	 */
751 
752 	*prevhdr = NEXTHDR_FRAGMENT;
753 	troom = rt->dst.dev->needed_tailroom;
754 
755 	/*
756 	 *	Keep copying data until we run out.
757 	 */
758 	while (left > 0)	{
759 		len = left;
760 		/* IF: it doesn't fit, use 'mtu' - the data space left */
761 		if (len > mtu)
762 			len = mtu;
763 		/* IF: we are not sending up to and including the packet end
764 		   then align the next start on an eight byte boundary */
765 		if (len < left)	{
766 			len &= ~7;
767 		}
768 
769 		/* Allocate buffer */
770 		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
771 				 hroom + troom, GFP_ATOMIC);
772 		if (!frag) {
773 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
774 				      IPSTATS_MIB_FRAGFAILS);
775 			err = -ENOMEM;
776 			goto fail;
777 		}
778 
779 		/*
780 		 *	Set up data on packet
781 		 */
782 
783 		ip6_copy_metadata(frag, skb);
784 		skb_reserve(frag, hroom);
785 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
786 		skb_reset_network_header(frag);
787 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
788 		frag->transport_header = (frag->network_header + hlen +
789 					  sizeof(struct frag_hdr));
790 
791 		/*
792 		 *	Charge the memory for the fragment to any owner
793 		 *	it might possess
794 		 */
795 		if (skb->sk)
796 			skb_set_owner_w(frag, skb->sk);
797 
798 		/*
799 		 *	Copy the packet header into the new buffer.
800 		 */
801 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
802 
803 		/*
804 		 *	Build fragment header.
805 		 */
806 		fh->nexthdr = nexthdr;
807 		fh->reserved = 0;
808 		fh->identification = frag_id;
809 
810 		/*
811 		 *	Copy a block of the IP datagram.
812 		 */
813 		BUG_ON(skb_copy_bits(skb, ptr, skb_transport_header(frag),
814 				     len));
815 		left -= len;
816 
817 		fh->frag_off = htons(offset);
818 		if (left > 0)
819 			fh->frag_off |= htons(IP6_MF);
820 		ipv6_hdr(frag)->payload_len = htons(frag->len -
821 						    sizeof(struct ipv6hdr));
822 
823 		ptr += len;
824 		offset += len;
825 
826 		/*
827 		 *	Put this fragment into the sending queue.
828 		 */
829 		err = output(net, sk, frag);
830 		if (err)
831 			goto fail;
832 
833 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
834 			      IPSTATS_MIB_FRAGCREATES);
835 	}
836 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
837 		      IPSTATS_MIB_FRAGOKS);
838 	consume_skb(skb);
839 	return err;
840 
841 fail_toobig:
842 	if (skb->sk && dst_allfrag(skb_dst(skb)))
843 		sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
844 
845 	skb->dev = skb_dst(skb)->dev;
846 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
847 	err = -EMSGSIZE;
848 
849 fail:
850 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
851 		      IPSTATS_MIB_FRAGFAILS);
852 	kfree_skb(skb);
853 	return err;
854 }
855 
856 static inline int ip6_rt_check(const struct rt6key *rt_key,
857 			       const struct in6_addr *fl_addr,
858 			       const struct in6_addr *addr_cache)
859 {
860 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
861 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
862 }
863 
864 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
865 					  struct dst_entry *dst,
866 					  const struct flowi6 *fl6)
867 {
868 	struct ipv6_pinfo *np = inet6_sk(sk);
869 	struct rt6_info *rt;
870 
871 	if (!dst)
872 		goto out;
873 
874 	if (dst->ops->family != AF_INET6) {
875 		dst_release(dst);
876 		return NULL;
877 	}
878 
879 	rt = (struct rt6_info *)dst;
880 	/* Yes, checking route validity in not connected
881 	 * case is not very simple. Take into account,
882 	 * that we do not support routing by source, TOS,
883 	 * and MSG_DONTROUTE		--ANK (980726)
884 	 *
885 	 * 1. ip6_rt_check(): If route was host route,
886 	 *    check that cached destination is current.
887 	 *    If it is network route, we still may
888 	 *    check its validity using saved pointer
889 	 *    to the last used address: daddr_cache.
890 	 *    We do not want to save whole address now,
891 	 *    (because main consumer of this service
892 	 *    is tcp, which has not this problem),
893 	 *    so that the last trick works only on connected
894 	 *    sockets.
895 	 * 2. oif also should be the same.
896 	 */
897 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
898 #ifdef CONFIG_IPV6_SUBTREES
899 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
900 #endif
901 	   (!(fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF) &&
902 	      (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex))) {
903 		dst_release(dst);
904 		dst = NULL;
905 	}
906 
907 out:
908 	return dst;
909 }
910 
911 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
912 			       struct dst_entry **dst, struct flowi6 *fl6)
913 {
914 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
915 	struct neighbour *n;
916 	struct rt6_info *rt;
917 #endif
918 	int err;
919 	int flags = 0;
920 
921 	if (ipv6_addr_any(&fl6->saddr) && fl6->flowi6_oif &&
922 	    (!*dst || !(*dst)->error)) {
923 		err = l3mdev_get_saddr6(net, sk, fl6);
924 		if (err)
925 			goto out_err;
926 	}
927 
928 	/* The correct way to handle this would be to do
929 	 * ip6_route_get_saddr, and then ip6_route_output; however,
930 	 * the route-specific preferred source forces the
931 	 * ip6_route_output call _before_ ip6_route_get_saddr.
932 	 *
933 	 * In source specific routing (no src=any default route),
934 	 * ip6_route_output will fail given src=any saddr, though, so
935 	 * that's why we try it again later.
936 	 */
937 	if (ipv6_addr_any(&fl6->saddr) && (!*dst || !(*dst)->error)) {
938 		struct rt6_info *rt;
939 		bool had_dst = *dst != NULL;
940 
941 		if (!had_dst)
942 			*dst = ip6_route_output(net, sk, fl6);
943 		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;
944 		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
945 					  sk ? inet6_sk(sk)->srcprefs : 0,
946 					  &fl6->saddr);
947 		if (err)
948 			goto out_err_release;
949 
950 		/* If we had an erroneous initial result, pretend it
951 		 * never existed and let the SA-enabled version take
952 		 * over.
953 		 */
954 		if (!had_dst && (*dst)->error) {
955 			dst_release(*dst);
956 			*dst = NULL;
957 		}
958 
959 		if (fl6->flowi6_oif)
960 			flags |= RT6_LOOKUP_F_IFACE;
961 	}
962 
963 	if (!*dst)
964 		*dst = ip6_route_output_flags(net, sk, fl6, flags);
965 
966 	err = (*dst)->error;
967 	if (err)
968 		goto out_err_release;
969 
970 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
971 	/*
972 	 * Here if the dst entry we've looked up
973 	 * has a neighbour entry that is in the INCOMPLETE
974 	 * state and the src address from the flow is
975 	 * marked as OPTIMISTIC, we release the found
976 	 * dst entry and replace it instead with the
977 	 * dst entry of the nexthop router
978 	 */
979 	rt = (struct rt6_info *) *dst;
980 	rcu_read_lock_bh();
981 	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
982 				      rt6_nexthop(rt, &fl6->daddr));
983 	err = n && !(n->nud_state & NUD_VALID) ? -EINVAL : 0;
984 	rcu_read_unlock_bh();
985 
986 	if (err) {
987 		struct inet6_ifaddr *ifp;
988 		struct flowi6 fl_gw6;
989 		int redirect;
990 
991 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
992 				      (*dst)->dev, 1);
993 
994 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
995 		if (ifp)
996 			in6_ifa_put(ifp);
997 
998 		if (redirect) {
999 			/*
1000 			 * We need to get the dst entry for the
1001 			 * default router instead
1002 			 */
1003 			dst_release(*dst);
1004 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1005 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1006 			*dst = ip6_route_output(net, sk, &fl_gw6);
1007 			err = (*dst)->error;
1008 			if (err)
1009 				goto out_err_release;
1010 		}
1011 	}
1012 #endif
1013 
1014 	return 0;
1015 
1016 out_err_release:
1017 	dst_release(*dst);
1018 	*dst = NULL;
1019 out_err:
1020 	if (err == -ENETUNREACH)
1021 		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1022 	return err;
1023 }
1024 
1025 /**
1026  *	ip6_dst_lookup - perform route lookup on flow
1027  *	@sk: socket which provides route info
1028  *	@dst: pointer to dst_entry * for result
1029  *	@fl6: flow to lookup
1030  *
1031  *	This function performs a route lookup on the given flow.
1032  *
1033  *	It returns zero on success, or a standard errno code on error.
1034  */
1035 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1036 		   struct flowi6 *fl6)
1037 {
1038 	*dst = NULL;
1039 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1040 }
1041 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1042 
1043 /**
1044  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1045  *	@sk: socket which provides route info
1046  *	@fl6: flow to lookup
1047  *	@final_dst: final destination address for ipsec lookup
1048  *
1049  *	This function performs a route lookup on the given flow.
1050  *
1051  *	It returns a valid dst pointer on success, or a pointer encoded
1052  *	error code.
1053  */
1054 struct dst_entry *ip6_dst_lookup_flow(const struct sock *sk, struct flowi6 *fl6,
1055 				      const struct in6_addr *final_dst)
1056 {
1057 	struct dst_entry *dst = NULL;
1058 	int err;
1059 
1060 	err = ip6_dst_lookup_tail(sock_net(sk), sk, &dst, fl6);
1061 	if (err)
1062 		return ERR_PTR(err);
1063 	if (final_dst)
1064 		fl6->daddr = *final_dst;
1065 	if (!fl6->flowi6_oif)
1066 		fl6->flowi6_oif = l3mdev_fib_oif(dst->dev);
1067 
1068 	return xfrm_lookup_route(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1069 }
1070 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1071 
1072 /**
1073  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1074  *	@sk: socket which provides the dst cache and route info
1075  *	@fl6: flow to lookup
1076  *	@final_dst: final destination address for ipsec lookup
1077  *
1078  *	This function performs a route lookup on the given flow with the
1079  *	possibility of using the cached route in the socket if it is valid.
1080  *	It will take the socket dst lock when operating on the dst cache.
1081  *	As a result, this function can only be used in process context.
1082  *
1083  *	It returns a valid dst pointer on success, or a pointer encoded
1084  *	error code.
1085  */
1086 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1087 					 const struct in6_addr *final_dst)
1088 {
1089 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1090 
1091 	dst = ip6_sk_dst_check(sk, dst, fl6);
1092 	if (!dst)
1093 		dst = ip6_dst_lookup_flow(sk, fl6, final_dst);
1094 
1095 	return dst;
1096 }
1097 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1098 
1099 static inline int ip6_ufo_append_data(struct sock *sk,
1100 			struct sk_buff_head *queue,
1101 			int getfrag(void *from, char *to, int offset, int len,
1102 			int odd, struct sk_buff *skb),
1103 			void *from, int length, int hh_len, int fragheaderlen,
1104 			int exthdrlen, int transhdrlen, int mtu,
1105 			unsigned int flags, const struct flowi6 *fl6)
1106 
1107 {
1108 	struct sk_buff *skb;
1109 	int err;
1110 
1111 	/* There is support for UDP large send offload by network
1112 	 * device, so create one single skb packet containing complete
1113 	 * udp datagram
1114 	 */
1115 	skb = skb_peek_tail(queue);
1116 	if (!skb) {
1117 		skb = sock_alloc_send_skb(sk,
1118 			hh_len + fragheaderlen + transhdrlen + 20,
1119 			(flags & MSG_DONTWAIT), &err);
1120 		if (!skb)
1121 			return err;
1122 
1123 		/* reserve space for Hardware header */
1124 		skb_reserve(skb, hh_len);
1125 
1126 		/* create space for UDP/IP header */
1127 		skb_put(skb, fragheaderlen + transhdrlen);
1128 
1129 		/* initialize network header pointer */
1130 		skb_set_network_header(skb, exthdrlen);
1131 
1132 		/* initialize protocol header pointer */
1133 		skb->transport_header = skb->network_header + fragheaderlen;
1134 
1135 		skb->protocol = htons(ETH_P_IPV6);
1136 		skb->csum = 0;
1137 
1138 		__skb_queue_tail(queue, skb);
1139 	} else if (skb_is_gso(skb)) {
1140 		goto append;
1141 	}
1142 
1143 	skb->ip_summed = CHECKSUM_PARTIAL;
1144 	/* Specify the length of each IPv6 datagram fragment.
1145 	 * It has to be a multiple of 8.
1146 	 */
1147 	skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1148 				     sizeof(struct frag_hdr)) & ~7;
1149 	skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1150 	skb_shinfo(skb)->ip6_frag_id = ipv6_select_ident(sock_net(sk),
1151 							 &fl6->daddr,
1152 							 &fl6->saddr);
1153 
1154 append:
1155 	return skb_append_datato_frags(sk, skb, getfrag, from,
1156 				       (length - transhdrlen));
1157 }
1158 
1159 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1160 					       gfp_t gfp)
1161 {
1162 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1163 }
1164 
1165 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1166 						gfp_t gfp)
1167 {
1168 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1169 }
1170 
1171 static void ip6_append_data_mtu(unsigned int *mtu,
1172 				int *maxfraglen,
1173 				unsigned int fragheaderlen,
1174 				struct sk_buff *skb,
1175 				struct rt6_info *rt,
1176 				unsigned int orig_mtu)
1177 {
1178 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1179 		if (!skb) {
1180 			/* first fragment, reserve header_len */
1181 			*mtu = orig_mtu - rt->dst.header_len;
1182 
1183 		} else {
1184 			/*
1185 			 * this fragment is not first, the headers
1186 			 * space is regarded as data space.
1187 			 */
1188 			*mtu = orig_mtu;
1189 		}
1190 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1191 			      + fragheaderlen - sizeof(struct frag_hdr);
1192 	}
1193 }
1194 
1195 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1196 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1197 			  struct rt6_info *rt, struct flowi6 *fl6)
1198 {
1199 	struct ipv6_pinfo *np = inet6_sk(sk);
1200 	unsigned int mtu;
1201 	struct ipv6_txoptions *opt = ipc6->opt;
1202 
1203 	/*
1204 	 * setup for corking
1205 	 */
1206 	if (opt) {
1207 		if (WARN_ON(v6_cork->opt))
1208 			return -EINVAL;
1209 
1210 		v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
1211 		if (unlikely(!v6_cork->opt))
1212 			return -ENOBUFS;
1213 
1214 		v6_cork->opt->tot_len = opt->tot_len;
1215 		v6_cork->opt->opt_flen = opt->opt_flen;
1216 		v6_cork->opt->opt_nflen = opt->opt_nflen;
1217 
1218 		v6_cork->opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1219 						    sk->sk_allocation);
1220 		if (opt->dst0opt && !v6_cork->opt->dst0opt)
1221 			return -ENOBUFS;
1222 
1223 		v6_cork->opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1224 						    sk->sk_allocation);
1225 		if (opt->dst1opt && !v6_cork->opt->dst1opt)
1226 			return -ENOBUFS;
1227 
1228 		v6_cork->opt->hopopt = ip6_opt_dup(opt->hopopt,
1229 						   sk->sk_allocation);
1230 		if (opt->hopopt && !v6_cork->opt->hopopt)
1231 			return -ENOBUFS;
1232 
1233 		v6_cork->opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1234 						    sk->sk_allocation);
1235 		if (opt->srcrt && !v6_cork->opt->srcrt)
1236 			return -ENOBUFS;
1237 
1238 		/* need source address above miyazawa*/
1239 	}
1240 	dst_hold(&rt->dst);
1241 	cork->base.dst = &rt->dst;
1242 	cork->fl.u.ip6 = *fl6;
1243 	v6_cork->hop_limit = ipc6->hlimit;
1244 	v6_cork->tclass = ipc6->tclass;
1245 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1246 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1247 		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
1248 	else
1249 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1250 		      rt->dst.dev->mtu : dst_mtu(rt->dst.path);
1251 	if (np->frag_size < mtu) {
1252 		if (np->frag_size)
1253 			mtu = np->frag_size;
1254 	}
1255 	cork->base.fragsize = mtu;
1256 	if (dst_allfrag(rt->dst.path))
1257 		cork->base.flags |= IPCORK_ALLFRAG;
1258 	cork->base.length = 0;
1259 
1260 	return 0;
1261 }
1262 
1263 static int __ip6_append_data(struct sock *sk,
1264 			     struct flowi6 *fl6,
1265 			     struct sk_buff_head *queue,
1266 			     struct inet_cork *cork,
1267 			     struct inet6_cork *v6_cork,
1268 			     struct page_frag *pfrag,
1269 			     int getfrag(void *from, char *to, int offset,
1270 					 int len, int odd, struct sk_buff *skb),
1271 			     void *from, int length, int transhdrlen,
1272 			     unsigned int flags, struct ipcm6_cookie *ipc6,
1273 			     const struct sockcm_cookie *sockc)
1274 {
1275 	struct sk_buff *skb, *skb_prev = NULL;
1276 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu;
1277 	int exthdrlen = 0;
1278 	int dst_exthdrlen = 0;
1279 	int hh_len;
1280 	int copy;
1281 	int err;
1282 	int offset = 0;
1283 	__u8 tx_flags = 0;
1284 	u32 tskey = 0;
1285 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1286 	struct ipv6_txoptions *opt = v6_cork->opt;
1287 	int csummode = CHECKSUM_NONE;
1288 	unsigned int maxnonfragsize, headersize;
1289 
1290 	skb = skb_peek_tail(queue);
1291 	if (!skb) {
1292 		exthdrlen = opt ? opt->opt_flen : 0;
1293 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1294 	}
1295 
1296 	mtu = cork->fragsize;
1297 	orig_mtu = mtu;
1298 
1299 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1300 
1301 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1302 			(opt ? opt->opt_nflen : 0);
1303 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1304 		     sizeof(struct frag_hdr);
1305 
1306 	headersize = sizeof(struct ipv6hdr) +
1307 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1308 		     (dst_allfrag(&rt->dst) ?
1309 		      sizeof(struct frag_hdr) : 0) +
1310 		     rt->rt6i_nfheader_len;
1311 
1312 	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1313 	    (sk->sk_protocol == IPPROTO_UDP ||
1314 	     sk->sk_protocol == IPPROTO_RAW)) {
1315 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1316 				sizeof(struct ipv6hdr));
1317 		goto emsgsize;
1318 	}
1319 
1320 	if (ip6_sk_ignore_df(sk))
1321 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1322 	else
1323 		maxnonfragsize = mtu;
1324 
1325 	if (cork->length + length > maxnonfragsize - headersize) {
1326 emsgsize:
1327 		ipv6_local_error(sk, EMSGSIZE, fl6,
1328 				 mtu - headersize +
1329 				 sizeof(struct ipv6hdr));
1330 		return -EMSGSIZE;
1331 	}
1332 
1333 	/* CHECKSUM_PARTIAL only with no extension headers and when
1334 	 * we are not going to fragment
1335 	 */
1336 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1337 	    headersize == sizeof(struct ipv6hdr) &&
1338 	    length < mtu - headersize &&
1339 	    !(flags & MSG_MORE) &&
1340 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1341 		csummode = CHECKSUM_PARTIAL;
1342 
1343 	if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
1344 		sock_tx_timestamp(sk, sockc->tsflags, &tx_flags);
1345 		if (tx_flags & SKBTX_ANY_SW_TSTAMP &&
1346 		    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1347 			tskey = sk->sk_tskey++;
1348 	}
1349 
1350 	/*
1351 	 * Let's try using as much space as possible.
1352 	 * Use MTU if total length of the message fits into the MTU.
1353 	 * Otherwise, we need to reserve fragment header and
1354 	 * fragment alignment (= 8-15 octects, in total).
1355 	 *
1356 	 * Note that we may need to "move" the data from the tail of
1357 	 * of the buffer to the new fragment when we split
1358 	 * the message.
1359 	 *
1360 	 * FIXME: It may be fragmented into multiple chunks
1361 	 *        at once if non-fragmentable extension headers
1362 	 *        are too large.
1363 	 * --yoshfuji
1364 	 */
1365 
1366 	cork->length += length;
1367 	if (((length > mtu) ||
1368 	     (skb && skb_is_gso(skb))) &&
1369 	    (sk->sk_protocol == IPPROTO_UDP) &&
1370 	    (rt->dst.dev->features & NETIF_F_UFO) &&
1371 	    (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) {
1372 		err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
1373 					  hh_len, fragheaderlen, exthdrlen,
1374 					  transhdrlen, mtu, flags, fl6);
1375 		if (err)
1376 			goto error;
1377 		return 0;
1378 	}
1379 
1380 	if (!skb)
1381 		goto alloc_new_skb;
1382 
1383 	while (length > 0) {
1384 		/* Check if the remaining data fits into current packet. */
1385 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1386 		if (copy < length)
1387 			copy = maxfraglen - skb->len;
1388 
1389 		if (copy <= 0) {
1390 			char *data;
1391 			unsigned int datalen;
1392 			unsigned int fraglen;
1393 			unsigned int fraggap;
1394 			unsigned int alloclen;
1395 alloc_new_skb:
1396 			/* There's no room in the current skb */
1397 			if (skb)
1398 				fraggap = skb->len - maxfraglen;
1399 			else
1400 				fraggap = 0;
1401 			/* update mtu and maxfraglen if necessary */
1402 			if (!skb || !skb_prev)
1403 				ip6_append_data_mtu(&mtu, &maxfraglen,
1404 						    fragheaderlen, skb, rt,
1405 						    orig_mtu);
1406 
1407 			skb_prev = skb;
1408 
1409 			/*
1410 			 * If remaining data exceeds the mtu,
1411 			 * we know we need more fragment(s).
1412 			 */
1413 			datalen = length + fraggap;
1414 
1415 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1416 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1417 			if ((flags & MSG_MORE) &&
1418 			    !(rt->dst.dev->features&NETIF_F_SG))
1419 				alloclen = mtu;
1420 			else
1421 				alloclen = datalen + fragheaderlen;
1422 
1423 			alloclen += dst_exthdrlen;
1424 
1425 			if (datalen != length + fraggap) {
1426 				/*
1427 				 * this is not the last fragment, the trailer
1428 				 * space is regarded as data space.
1429 				 */
1430 				datalen += rt->dst.trailer_len;
1431 			}
1432 
1433 			alloclen += rt->dst.trailer_len;
1434 			fraglen = datalen + fragheaderlen;
1435 
1436 			/*
1437 			 * We just reserve space for fragment header.
1438 			 * Note: this may be overallocation if the message
1439 			 * (without MSG_MORE) fits into the MTU.
1440 			 */
1441 			alloclen += sizeof(struct frag_hdr);
1442 
1443 			if (transhdrlen) {
1444 				skb = sock_alloc_send_skb(sk,
1445 						alloclen + hh_len,
1446 						(flags & MSG_DONTWAIT), &err);
1447 			} else {
1448 				skb = NULL;
1449 				if (atomic_read(&sk->sk_wmem_alloc) <=
1450 				    2 * sk->sk_sndbuf)
1451 					skb = sock_wmalloc(sk,
1452 							   alloclen + hh_len, 1,
1453 							   sk->sk_allocation);
1454 				if (unlikely(!skb))
1455 					err = -ENOBUFS;
1456 			}
1457 			if (!skb)
1458 				goto error;
1459 			/*
1460 			 *	Fill in the control structures
1461 			 */
1462 			skb->protocol = htons(ETH_P_IPV6);
1463 			skb->ip_summed = csummode;
1464 			skb->csum = 0;
1465 			/* reserve for fragmentation and ipsec header */
1466 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1467 				    dst_exthdrlen);
1468 
1469 			/* Only the initial fragment is time stamped */
1470 			skb_shinfo(skb)->tx_flags = tx_flags;
1471 			tx_flags = 0;
1472 			skb_shinfo(skb)->tskey = tskey;
1473 			tskey = 0;
1474 
1475 			/*
1476 			 *	Find where to start putting bytes
1477 			 */
1478 			data = skb_put(skb, fraglen);
1479 			skb_set_network_header(skb, exthdrlen);
1480 			data += fragheaderlen;
1481 			skb->transport_header = (skb->network_header +
1482 						 fragheaderlen);
1483 			if (fraggap) {
1484 				skb->csum = skb_copy_and_csum_bits(
1485 					skb_prev, maxfraglen,
1486 					data + transhdrlen, fraggap, 0);
1487 				skb_prev->csum = csum_sub(skb_prev->csum,
1488 							  skb->csum);
1489 				data += fraggap;
1490 				pskb_trim_unique(skb_prev, maxfraglen);
1491 			}
1492 			copy = datalen - transhdrlen - fraggap;
1493 
1494 			if (copy < 0) {
1495 				err = -EINVAL;
1496 				kfree_skb(skb);
1497 				goto error;
1498 			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1499 				err = -EFAULT;
1500 				kfree_skb(skb);
1501 				goto error;
1502 			}
1503 
1504 			offset += copy;
1505 			length -= datalen - fraggap;
1506 			transhdrlen = 0;
1507 			exthdrlen = 0;
1508 			dst_exthdrlen = 0;
1509 
1510 			/*
1511 			 * Put the packet on the pending queue
1512 			 */
1513 			__skb_queue_tail(queue, skb);
1514 			continue;
1515 		}
1516 
1517 		if (copy > length)
1518 			copy = length;
1519 
1520 		if (!(rt->dst.dev->features&NETIF_F_SG)) {
1521 			unsigned int off;
1522 
1523 			off = skb->len;
1524 			if (getfrag(from, skb_put(skb, copy),
1525 						offset, copy, off, skb) < 0) {
1526 				__skb_trim(skb, off);
1527 				err = -EFAULT;
1528 				goto error;
1529 			}
1530 		} else {
1531 			int i = skb_shinfo(skb)->nr_frags;
1532 
1533 			err = -ENOMEM;
1534 			if (!sk_page_frag_refill(sk, pfrag))
1535 				goto error;
1536 
1537 			if (!skb_can_coalesce(skb, i, pfrag->page,
1538 					      pfrag->offset)) {
1539 				err = -EMSGSIZE;
1540 				if (i == MAX_SKB_FRAGS)
1541 					goto error;
1542 
1543 				__skb_fill_page_desc(skb, i, pfrag->page,
1544 						     pfrag->offset, 0);
1545 				skb_shinfo(skb)->nr_frags = ++i;
1546 				get_page(pfrag->page);
1547 			}
1548 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1549 			if (getfrag(from,
1550 				    page_address(pfrag->page) + pfrag->offset,
1551 				    offset, copy, skb->len, skb) < 0)
1552 				goto error_efault;
1553 
1554 			pfrag->offset += copy;
1555 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1556 			skb->len += copy;
1557 			skb->data_len += copy;
1558 			skb->truesize += copy;
1559 			atomic_add(copy, &sk->sk_wmem_alloc);
1560 		}
1561 		offset += copy;
1562 		length -= copy;
1563 	}
1564 
1565 	return 0;
1566 
1567 error_efault:
1568 	err = -EFAULT;
1569 error:
1570 	cork->length -= length;
1571 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1572 	return err;
1573 }
1574 
1575 int ip6_append_data(struct sock *sk,
1576 		    int getfrag(void *from, char *to, int offset, int len,
1577 				int odd, struct sk_buff *skb),
1578 		    void *from, int length, int transhdrlen,
1579 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1580 		    struct rt6_info *rt, unsigned int flags,
1581 		    const struct sockcm_cookie *sockc)
1582 {
1583 	struct inet_sock *inet = inet_sk(sk);
1584 	struct ipv6_pinfo *np = inet6_sk(sk);
1585 	int exthdrlen;
1586 	int err;
1587 
1588 	if (flags&MSG_PROBE)
1589 		return 0;
1590 	if (skb_queue_empty(&sk->sk_write_queue)) {
1591 		/*
1592 		 * setup for corking
1593 		 */
1594 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1595 				     ipc6, rt, fl6);
1596 		if (err)
1597 			return err;
1598 
1599 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1600 		length += exthdrlen;
1601 		transhdrlen += exthdrlen;
1602 	} else {
1603 		fl6 = &inet->cork.fl.u.ip6;
1604 		transhdrlen = 0;
1605 	}
1606 
1607 	return __ip6_append_data(sk, fl6, &sk->sk_write_queue, &inet->cork.base,
1608 				 &np->cork, sk_page_frag(sk), getfrag,
1609 				 from, length, transhdrlen, flags, ipc6, sockc);
1610 }
1611 EXPORT_SYMBOL_GPL(ip6_append_data);
1612 
1613 static void ip6_cork_release(struct inet_cork_full *cork,
1614 			     struct inet6_cork *v6_cork)
1615 {
1616 	if (v6_cork->opt) {
1617 		kfree(v6_cork->opt->dst0opt);
1618 		kfree(v6_cork->opt->dst1opt);
1619 		kfree(v6_cork->opt->hopopt);
1620 		kfree(v6_cork->opt->srcrt);
1621 		kfree(v6_cork->opt);
1622 		v6_cork->opt = NULL;
1623 	}
1624 
1625 	if (cork->base.dst) {
1626 		dst_release(cork->base.dst);
1627 		cork->base.dst = NULL;
1628 		cork->base.flags &= ~IPCORK_ALLFRAG;
1629 	}
1630 	memset(&cork->fl, 0, sizeof(cork->fl));
1631 }
1632 
1633 struct sk_buff *__ip6_make_skb(struct sock *sk,
1634 			       struct sk_buff_head *queue,
1635 			       struct inet_cork_full *cork,
1636 			       struct inet6_cork *v6_cork)
1637 {
1638 	struct sk_buff *skb, *tmp_skb;
1639 	struct sk_buff **tail_skb;
1640 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1641 	struct ipv6_pinfo *np = inet6_sk(sk);
1642 	struct net *net = sock_net(sk);
1643 	struct ipv6hdr *hdr;
1644 	struct ipv6_txoptions *opt = v6_cork->opt;
1645 	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
1646 	struct flowi6 *fl6 = &cork->fl.u.ip6;
1647 	unsigned char proto = fl6->flowi6_proto;
1648 
1649 	skb = __skb_dequeue(queue);
1650 	if (!skb)
1651 		goto out;
1652 	tail_skb = &(skb_shinfo(skb)->frag_list);
1653 
1654 	/* move skb->data to ip header from ext header */
1655 	if (skb->data < skb_network_header(skb))
1656 		__skb_pull(skb, skb_network_offset(skb));
1657 	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1658 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1659 		*tail_skb = tmp_skb;
1660 		tail_skb = &(tmp_skb->next);
1661 		skb->len += tmp_skb->len;
1662 		skb->data_len += tmp_skb->len;
1663 		skb->truesize += tmp_skb->truesize;
1664 		tmp_skb->destructor = NULL;
1665 		tmp_skb->sk = NULL;
1666 	}
1667 
1668 	/* Allow local fragmentation. */
1669 	skb->ignore_df = ip6_sk_ignore_df(sk);
1670 
1671 	*final_dst = fl6->daddr;
1672 	__skb_pull(skb, skb_network_header_len(skb));
1673 	if (opt && opt->opt_flen)
1674 		ipv6_push_frag_opts(skb, opt, &proto);
1675 	if (opt && opt->opt_nflen)
1676 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1677 
1678 	skb_push(skb, sizeof(struct ipv6hdr));
1679 	skb_reset_network_header(skb);
1680 	hdr = ipv6_hdr(skb);
1681 
1682 	ip6_flow_hdr(hdr, v6_cork->tclass,
1683 		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
1684 					np->autoflowlabel, fl6));
1685 	hdr->hop_limit = v6_cork->hop_limit;
1686 	hdr->nexthdr = proto;
1687 	hdr->saddr = fl6->saddr;
1688 	hdr->daddr = *final_dst;
1689 
1690 	skb->priority = sk->sk_priority;
1691 	skb->mark = sk->sk_mark;
1692 
1693 	skb_dst_set(skb, dst_clone(&rt->dst));
1694 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1695 	if (proto == IPPROTO_ICMPV6) {
1696 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1697 
1698 		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_hdr(skb)->icmp6_type);
1699 		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1700 	}
1701 
1702 	ip6_cork_release(cork, v6_cork);
1703 out:
1704 	return skb;
1705 }
1706 
1707 int ip6_send_skb(struct sk_buff *skb)
1708 {
1709 	struct net *net = sock_net(skb->sk);
1710 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
1711 	int err;
1712 
1713 	err = ip6_local_out(net, skb->sk, skb);
1714 	if (err) {
1715 		if (err > 0)
1716 			err = net_xmit_errno(err);
1717 		if (err)
1718 			IP6_INC_STATS(net, rt->rt6i_idev,
1719 				      IPSTATS_MIB_OUTDISCARDS);
1720 	}
1721 
1722 	return err;
1723 }
1724 
/*
 * Finish the pending data of @sk via ip6_finish_skb() and send the result.
 * Returns 0 when no skb was produced, else the result of ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1736 
1737 static void __ip6_flush_pending_frames(struct sock *sk,
1738 				       struct sk_buff_head *queue,
1739 				       struct inet_cork_full *cork,
1740 				       struct inet6_cork *v6_cork)
1741 {
1742 	struct sk_buff *skb;
1743 
1744 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
1745 		if (skb_dst(skb))
1746 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1747 				      IPSTATS_MIB_OUTDISCARDS);
1748 		kfree_skb(skb);
1749 	}
1750 
1751 	ip6_cork_release(cork, v6_cork);
1752 }
1753 
1754 void ip6_flush_pending_frames(struct sock *sk)
1755 {
1756 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
1757 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
1758 }
1759 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1760 
1761 struct sk_buff *ip6_make_skb(struct sock *sk,
1762 			     int getfrag(void *from, char *to, int offset,
1763 					 int len, int odd, struct sk_buff *skb),
1764 			     void *from, int length, int transhdrlen,
1765 			     struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1766 			     struct rt6_info *rt, unsigned int flags,
1767 			     const struct sockcm_cookie *sockc)
1768 {
1769 	struct inet_cork_full cork;
1770 	struct inet6_cork v6_cork;
1771 	struct sk_buff_head queue;
1772 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1773 	int err;
1774 
1775 	if (flags & MSG_PROBE)
1776 		return NULL;
1777 
1778 	__skb_queue_head_init(&queue);
1779 
1780 	cork.base.flags = 0;
1781 	cork.base.addr = 0;
1782 	cork.base.opt = NULL;
1783 	v6_cork.opt = NULL;
1784 	err = ip6_setup_cork(sk, &cork, &v6_cork, ipc6, rt, fl6);
1785 	if (err)
1786 		return ERR_PTR(err);
1787 
1788 	if (ipc6->dontfrag < 0)
1789 		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
1790 
1791 	err = __ip6_append_data(sk, fl6, &queue, &cork.base, &v6_cork,
1792 				&current->task_frag, getfrag, from,
1793 				length + exthdrlen, transhdrlen + exthdrlen,
1794 				flags, ipc6, sockc);
1795 	if (err) {
1796 		__ip6_flush_pending_frames(sk, &queue, &cork, &v6_cork);
1797 		return ERR_PTR(err);
1798 	}
1799 
1800 	return __ip6_make_skb(sk, &queue, &cork, &v6_cork);
1801 }
1802