1 /*
2  *	IPv6 output functions
3  *	Linux INET6 implementation
4  *
5  *	Authors:
6  *	Pedro Roque		<roque@di.fc.ul.pt>
7  *
8  *	Based on linux/net/ipv4/ip_output.c
9  *
10  *	This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  *
15  *	Changes:
16  *	A.N.Kuznetsov	:	arithmetic in fragmentation.
17  *				extension headers are implemented.
18  *				route changes now work.
19  *				ip6_forward does not confuse sniffers.
20  *				etc.
21  *
22  *      H. von Brand    :       Added missing #include <linux/string.h>
23  *	Imran Patel	: 	frag id should be in NBO
24  *      Kazunori MIYAZAWA @USAGI
25  *			:       add ip6_append_data and related functions
26  *				for datagram xmit
27  */
28 
29 #include <linux/errno.h>
30 #include <linux/kernel.h>
31 #include <linux/string.h>
32 #include <linux/socket.h>
33 #include <linux/net.h>
34 #include <linux/netdevice.h>
35 #include <linux/if_arp.h>
36 #include <linux/in6.h>
37 #include <linux/tcp.h>
38 #include <linux/route.h>
39 #include <linux/module.h>
40 #include <linux/slab.h>
41 
42 #include <linux/netfilter.h>
43 #include <linux/netfilter_ipv6.h>
44 
45 #include <net/sock.h>
46 #include <net/snmp.h>
47 
48 #include <net/ipv6.h>
49 #include <net/ndisc.h>
50 #include <net/protocol.h>
51 #include <net/ip6_route.h>
52 #include <net/addrconf.h>
53 #include <net/rawv6.h>
54 #include <net/icmp.h>
55 #include <net/xfrm.h>
56 #include <net/checksum.h>
57 #include <linux/mroute6.h>
58 
59 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
60 
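/*
 * Set the IPv6 payload length (zero if it would exceed IPV6_MAXPLEN)
 * and run the netfilter LOCAL_OUT hook; ip6_local_out() then passes
 * the packet on to dst_output() when the hook lets it through.
 */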
61 int __ip6_local_out(struct sk_buff *skb)
62 {
63 	int len;
64 
65 	len = skb->len - sizeof(struct ipv6hdr);
66 	if (len > IPV6_MAXPLEN)
67 		len = 0;
68 	ipv6_hdr(skb)->payload_len = htons(len);
69 
70 	return nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
71 		       skb_dst(skb)->dev, dst_output);
72 }
73 
74 int ip6_local_out(struct sk_buff *skb)
75 {
76 	int err;
77 
78 	err = __ip6_local_out(skb);
79 	if (likely(err == 1))
80 		err = dst_output(skb);
81 
82 	return err;
83 }
84 EXPORT_SYMBOL_GPL(ip6_local_out);
85 
86 /* dev_loopback_xmit for use with netfilter. */
87 static int ip6_dev_loopback_xmit(struct sk_buff *newskb)
88 {
89 	skb_reset_mac_header(newskb);
90 	__skb_pull(newskb, skb_network_offset(newskb));
91 	newskb->pkt_type = PACKET_LOOPBACK;
92 	newskb->ip_summed = CHECKSUM_UNNECESSARY;
93 	WARN_ON(!skb_dst(newskb));
94 
95 	netif_rx_ni(newskb);
96 	return 0;
97 }
98 
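/*
 * Final output step: for multicast destinations, loop a copy back to
 * local listeners when required and update the OUTMCAST counters, then
 * hand the packet to the neighbour layer for transmission.
 */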
99 static int ip6_finish_output2(struct sk_buff *skb)
100 {
101 	struct dst_entry *dst = skb_dst(skb);
102 	struct net_device *dev = dst->dev;
103 	struct neighbour *neigh;
104 
105 	skb->protocol = htons(ETH_P_IPV6);
106 	skb->dev = dev;
107 
108 	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
109 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
110 
111 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(skb->sk) &&
112 		    ((mroute6_socket(dev_net(dev), skb) &&
113 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
114 		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
115 					 &ipv6_hdr(skb)->saddr))) {
116 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
117 
118 			/* Do not check for IFF_ALLMULTI; multicast routing
119 			   is not supported in any case.
120 			 */
121 			if (newskb)
122 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
123 					newskb, NULL, newskb->dev,
124 					ip6_dev_loopback_xmit);
125 
126 			if (ipv6_hdr(skb)->hop_limit == 0) {
127 				IP6_INC_STATS(dev_net(dev), idev,
128 					      IPSTATS_MIB_OUTDISCARDS);
129 				kfree_skb(skb);
130 				return 0;
131 			}
132 		}
133 
134 		IP6_UPD_PO_STATS(dev_net(dev), idev, IPSTATS_MIB_OUTMCAST,
135 				skb->len);
136 	}
137 
138 	rcu_read_lock();
139 	neigh = dst_get_neighbour_noref(dst);
140 	if (neigh) {
141 		int res = neigh_output(neigh, skb);
142 
143 		rcu_read_unlock();
144 		return res;
145 	}
146 	rcu_read_unlock();
147 	IP6_INC_STATS_BH(dev_net(dst->dev),
148 			 ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
149 	kfree_skb(skb);
150 	return -EINVAL;
151 }
152 
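/*
 * Fragment the packet if it exceeds the path MTU and is not GSO, or if
 * the route requires fragmentation of all packets (dst_allfrag);
 * otherwise transmit it directly via ip6_finish_output2().
 */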
153 static int ip6_finish_output(struct sk_buff *skb)
154 {
155 	if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
156 	    dst_allfrag(skb_dst(skb)))
157 		return ip6_fragment(skb, ip6_finish_output2);
158 	else
159 		return ip6_finish_output2(skb);
160 }
161 
162 int ip6_output(struct sk_buff *skb)
163 {
164 	struct net_device *dev = skb_dst(skb)->dev;
165 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
166 	if (unlikely(idev->cnf.disable_ipv6)) {
167 		IP6_INC_STATS(dev_net(dev), idev,
168 			      IPSTATS_MIB_OUTDISCARDS);
169 		kfree_skb(skb);
170 		return 0;
171 	}
172 
173 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, skb, NULL, dev,
174 			    ip6_finish_output,
175 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
176 }
177 
178 /*
179  *	xmit an sk_buff (used by TCP, SCTP and DCCP)
180  */
181 
182 int ip6_xmit(struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
183 	     struct ipv6_txoptions *opt, int tclass)
184 {
185 	struct net *net = sock_net(sk);
186 	struct ipv6_pinfo *np = inet6_sk(sk);
187 	struct in6_addr *first_hop = &fl6->daddr;
188 	struct dst_entry *dst = skb_dst(skb);
189 	struct ipv6hdr *hdr;
190 	u8  proto = fl6->flowi6_proto;
191 	int seg_len = skb->len;
192 	int hlimit = -1;
193 	u32 mtu;
194 
195 	if (opt) {
196 		unsigned int head_room;
197 
198 		/* First: exthdrs may take lots of space (~8K for now);
199 		   MAX_HEADER is not enough.
200 		 */
201 		head_room = opt->opt_nflen + opt->opt_flen;
202 		seg_len += head_room;
203 		head_room += sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
204 
205 		if (skb_headroom(skb) < head_room) {
206 			struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
207 			if (skb2 == NULL) {
208 				IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
209 					      IPSTATS_MIB_OUTDISCARDS);
210 				kfree_skb(skb);
211 				return -ENOBUFS;
212 			}
213 			consume_skb(skb);
214 			skb = skb2;
215 			skb_set_owner_w(skb, sk);
216 		}
217 		if (opt->opt_flen)
218 			ipv6_push_frag_opts(skb, opt, &proto);
219 		if (opt->opt_nflen)
220 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop);
221 	}
222 
223 	skb_push(skb, sizeof(struct ipv6hdr));
224 	skb_reset_network_header(skb);
225 	hdr = ipv6_hdr(skb);
226 
227 	/*
228 	 *	Fill in the IPv6 header
229 	 */
230 	if (np)
231 		hlimit = np->hop_limit;
232 	if (hlimit < 0)
233 		hlimit = ip6_dst_hoplimit(dst);
234 
235 	*(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | fl6->flowlabel;
236 
237 	hdr->payload_len = htons(seg_len);
238 	hdr->nexthdr = proto;
239 	hdr->hop_limit = hlimit;
240 
241 	hdr->saddr = fl6->saddr;
242 	hdr->daddr = *first_hop;
243 
244 	skb->priority = sk->sk_priority;
245 	skb->mark = sk->sk_mark;
246 
247 	mtu = dst_mtu(dst);
248 	if ((skb->len <= mtu) || skb->local_df || skb_is_gso(skb)) {
249 		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
250 			      IPSTATS_MIB_OUT, skb->len);
251 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT, skb, NULL,
252 			       dst->dev, dst_output);
253 	}
254 
255 	net_dbg_ratelimited("IPv6: sending pkt_too_big to self\n");
256 	skb->dev = dst->dev;
257 	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
258 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
259 	kfree_skb(skb);
260 	return -EMSGSIZE;
261 }
262 
263 EXPORT_SYMBOL(ip6_xmit);
264 
265 /*
266  *	To avoid extra problems, ND packets are sent through this
267  *	routine. It's code duplication, but I really want to avoid
268  *	extra checks, since ipv6_build_header is used by TCP (which
269  *	is performance-critical for us).
270  */
271 
272 int ip6_nd_hdr(struct sock *sk, struct sk_buff *skb, struct net_device *dev,
273 	       const struct in6_addr *saddr, const struct in6_addr *daddr,
274 	       int proto, int len)
275 {
276 	struct ipv6_pinfo *np = inet6_sk(sk);
277 	struct ipv6hdr *hdr;
278 
279 	skb->protocol = htons(ETH_P_IPV6);
280 	skb->dev = dev;
281 
282 	skb_reset_network_header(skb);
283 	skb_put(skb, sizeof(struct ipv6hdr));
284 	hdr = ipv6_hdr(skb);
285 
286 	*(__be32 *)hdr = htonl(0x60000000);
287 
288 	hdr->payload_len = htons(len);
289 	hdr->nexthdr = proto;
290 	hdr->hop_limit = np->hop_limit;
291 
292 	hdr->saddr = *saddr;
293 	hdr->daddr = *daddr;
294 
295 	return 0;
296 }
297 
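/*
 * Deliver a packet carrying a Router Alert option to every raw socket
 * on the ip6_ra_chain whose selector matches and whose bound device
 * (if any) matches the incoming device. Returns 1 if the packet was
 * consumed by at least one socket, 0 otherwise.
 */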
298 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
299 {
300 	struct ip6_ra_chain *ra;
301 	struct sock *last = NULL;
302 
303 	read_lock(&ip6_ra_lock);
304 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
305 		struct sock *sk = ra->sk;
306 		if (sk && ra->sel == sel &&
307 		    (!sk->sk_bound_dev_if ||
308 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
309 			if (last) {
310 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
311 				if (skb2)
312 					rawv6_rcv(last, skb2);
313 			}
314 			last = sk;
315 		}
316 	}
317 
318 	if (last) {
319 		rawv6_rcv(last, skb);
320 		read_unlock(&ip6_ra_lock);
321 		return 1;
322 	}
323 	read_unlock(&ip6_ra_lock);
324 	return 0;
325 }
326 
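/*
 * Decide what to do with a packet addressed to a proxied address:
 * returns 1 for NDISC messages that must be handed to local input,
 * -1 when the packet must be dropped (link-local destination), and
 * 0 when it may be forwarded normally.
 */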
327 static int ip6_forward_proxy_check(struct sk_buff *skb)
328 {
329 	struct ipv6hdr *hdr = ipv6_hdr(skb);
330 	u8 nexthdr = hdr->nexthdr;
331 	__be16 frag_off;
332 	int offset;
333 
334 	if (ipv6_ext_hdr(nexthdr)) {
335 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
336 		if (offset < 0)
337 			return 0;
338 	} else
339 		offset = sizeof(struct ipv6hdr);
340 
341 	if (nexthdr == IPPROTO_ICMPV6) {
342 		struct icmp6hdr *icmp6;
343 
344 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
345 					 offset + 1 - skb->data)))
346 			return 0;
347 
348 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
349 
350 		switch (icmp6->icmp6_type) {
351 		case NDISC_ROUTER_SOLICITATION:
352 		case NDISC_ROUTER_ADVERTISEMENT:
353 		case NDISC_NEIGHBOUR_SOLICITATION:
354 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
355 		case NDISC_REDIRECT:
356 			/* A unicast neighbour discovery message destined
357 			 * to the proxied address must be passed to the
358 			 * input function.
359 			 */
360 			return 1;
361 		default:
362 			break;
363 		}
364 	}
365 
366 	/*
367 	 * The proxying router can't forward traffic sent to a link-local
368 	 * address, so signal the sender and discard the packet. This
369 	 * behavior is clarified by the MIPv6 specification.
370 	 */
371 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
372 		dst_link_failure(skb);
373 		return -1;
374 	}
375 
376 	return 0;
377 }
378 
379 static inline int ip6_forward_finish(struct sk_buff *skb)
380 {
381 	return dst_output(skb);
382 }
383 
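/*
 * Forward a packet on behalf of another host: validate that forwarding
 * is enabled and the packet is eligible (hop limit, xfrm policy, proxy
 * NDP), emit redirects or errors where required, then decrement the
 * hop limit and pass the packet through the FORWARD netfilter hook.
 */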
384 int ip6_forward(struct sk_buff *skb)
385 {
386 	struct dst_entry *dst = skb_dst(skb);
387 	struct ipv6hdr *hdr = ipv6_hdr(skb);
388 	struct inet6_skb_parm *opt = IP6CB(skb);
389 	struct net *net = dev_net(dst->dev);
390 	u32 mtu;
391 
392 	if (net->ipv6.devconf_all->forwarding == 0)
393 		goto error;
394 
395 	if (skb_warn_if_lro(skb))
396 		goto drop;
397 
398 	if (!xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
399 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
400 		goto drop;
401 	}
402 
403 	if (skb->pkt_type != PACKET_HOST)
404 		goto drop;
405 
406 	skb_forward_csum(skb);
407 
408 	/*
409 	 *	We do not do any processing on RA packets;
410 	 *	they are pushed to user level AS IS, without any
411 	 *	warranty that an application will be able to
412 	 *	interpret them. The reason is that we cannot do
413 	 *	anything clever here.
414 	 *
415 	 *	We are not the end node, so if a packet contains
416 	 *	AH/ESP, we cannot do anything with it.
417 	 *	Defragmentation would also be a mistake: RA packets
418 	 *	must not be fragmented, because there is no guarantee
419 	 *	that different fragments will travel along one path. --ANK
420 	 */
421 	if (opt->ra) {
422 		u8 *ptr = skb_network_header(skb) + opt->ra;
423 		if (ip6_call_ra_chain(skb, (ptr[2]<<8) + ptr[3]))
424 			return 0;
425 	}
426 
427 	/*
428 	 *	check and decrement ttl
429 	 *	check and decrement hop limit
430 	if (hdr->hop_limit <= 1) {
431 		/* Force the OUTPUT device to be used for the source address */
432 		skb->dev = dst->dev;
433 		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
434 		IP6_INC_STATS_BH(net,
435 				 ip6_dst_idev(dst), IPSTATS_MIB_INHDRERRORS);
436 
437 		kfree_skb(skb);
438 		return -ETIMEDOUT;
439 	}
440 
441 	/* XXX: idev->cnf.proxy_ndp? */
442 	if (net->ipv6.devconf_all->proxy_ndp &&
443 	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
444 		int proxied = ip6_forward_proxy_check(skb);
445 		if (proxied > 0)
446 			return ip6_input(skb);
447 		else if (proxied < 0) {
448 			IP6_INC_STATS(net, ip6_dst_idev(dst),
449 				      IPSTATS_MIB_INDISCARDS);
450 			goto drop;
451 		}
452 	}
453 
454 	if (!xfrm6_route_forward(skb)) {
455 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_INDISCARDS);
456 		goto drop;
457 	}
458 	dst = skb_dst(skb);
459 
460 	/* IPv6 specs say nothing about it, but it is clear that we cannot
461 	   send redirects to source-routed frames.
462 	   We don't send redirects to frames decapsulated from IPsec.
463 	 */
464 	if (skb->dev == dst->dev && opt->srcrt == 0 && !skb_sec_path(skb)) {
465 		struct in6_addr *target = NULL;
466 		struct rt6_info *rt;
467 
468 		/*
469 		 *	incoming and outgoing devices are the same;
470 		 *	send a redirect.
471 		 */
472 
473 		rt = (struct rt6_info *) dst;
474 		if (rt->rt6i_flags & RTF_GATEWAY)
475 			target = &rt->rt6i_gateway;
476 		else
477 			target = &hdr->daddr;
478 
479 		if (!rt->rt6i_peer)
480 			rt6_bind_peer(rt, 1);
481 
482 		/* Limit redirects both by destination (here)
483 		   and by source (inside ndisc_send_redirect)
484 		 */
485 		if (inet_peer_xrlim_allow(rt->rt6i_peer, 1*HZ))
486 			ndisc_send_redirect(skb, target);
487 	} else {
488 		int addrtype = ipv6_addr_type(&hdr->saddr);
489 
490 		/* This check is security critical. */
491 		if (addrtype == IPV6_ADDR_ANY ||
492 		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
493 			goto error;
494 		if (addrtype & IPV6_ADDR_LINKLOCAL) {
495 			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
496 				    ICMPV6_NOT_NEIGHBOUR, 0);
497 			goto error;
498 		}
499 	}
500 
501 	mtu = dst_mtu(dst);
502 	if (mtu < IPV6_MIN_MTU)
503 		mtu = IPV6_MIN_MTU;
504 
505 	if (skb->len > mtu && !skb_is_gso(skb)) {
506 		/* Again, force the OUTPUT device to be used for the source address */
507 		skb->dev = dst->dev;
508 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
509 		IP6_INC_STATS_BH(net,
510 				 ip6_dst_idev(dst), IPSTATS_MIB_INTOOBIGERRORS);
511 		IP6_INC_STATS_BH(net,
512 				 ip6_dst_idev(dst), IPSTATS_MIB_FRAGFAILS);
513 		kfree_skb(skb);
514 		return -EMSGSIZE;
515 	}
516 
517 	if (skb_cow(skb, dst->dev->hard_header_len)) {
518 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
519 		goto drop;
520 	}
521 
522 	hdr = ipv6_hdr(skb);
523 
524 	/* Mangling the hop count is delayed until after the skb COW */
525 
526 	hdr->hop_limit--;
527 
528 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
529 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, skb, skb->dev, dst->dev,
530 		       ip6_forward_finish);
531 
532 error:
533 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_INADDRERRORS);
534 drop:
535 	kfree_skb(skb);
536 	return -EINVAL;
537 }
538 
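/*
 * Propagate per-packet metadata (packet type, priority, protocol, dst,
 * device, mark, tc index, netfilter state and secmark) from the
 * original skb to a freshly built fragment.
 */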
539 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
540 {
541 	to->pkt_type = from->pkt_type;
542 	to->priority = from->priority;
543 	to->protocol = from->protocol;
544 	skb_dst_drop(to);
545 	skb_dst_set(to, dst_clone(skb_dst(from)));
546 	to->dev = from->dev;
547 	to->mark = from->mark;
548 
549 #ifdef CONFIG_NET_SCHED
550 	to->tc_index = from->tc_index;
551 #endif
552 	nf_copy(to, from);
553 #if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
554     defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
555 	to->nf_trace = from->nf_trace;
556 #endif
557 	skb_copy_secmark(to, from);
558 }
559 
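/*
 * Walk the extension header chain to find the point where a Fragment
 * header must be inserted: after any hop-by-hop, routing and
 * destination options that must precede it. Returns the offset of
 * that point and leaves *nexthdr pointing at the Next Header field
 * that will precede the Fragment header.
 */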
560 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
561 {
562 	u16 offset = sizeof(struct ipv6hdr);
563 	struct ipv6_opt_hdr *exthdr =
564 				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
565 	unsigned int packet_len = skb->tail - skb->network_header;
566 	int found_rhdr = 0;
567 	*nexthdr = &ipv6_hdr(skb)->nexthdr;
568 
569 	while (offset + 1 <= packet_len) {
570 
571 		switch (**nexthdr) {
572 
573 		case NEXTHDR_HOP:
574 			break;
575 		case NEXTHDR_ROUTING:
576 			found_rhdr = 1;
577 			break;
578 		case NEXTHDR_DEST:
579 #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE)
580 			if (ipv6_find_tlv(skb, offset, IPV6_TLV_HAO) >= 0)
581 				break;
582 #endif
583 			if (found_rhdr)
584 				return offset;
585 			break;
586 		default:
587 			return offset;
588 		}
589 
590 		offset += ipv6_optlen(exthdr);
591 		*nexthdr = &exthdr->nexthdr;
592 		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
593 						 offset);
594 	}
595 
596 	return offset;
597 }
598 
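/*
 * Choose the fragmentation identification: per-destination from the
 * route's inet_peer when one is available, otherwise from a global
 * counter that skips the value zero.
 */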
599 void ipv6_select_ident(struct frag_hdr *fhdr, struct rt6_info *rt)
600 {
601 	static atomic_t ipv6_fragmentation_id;
602 	int old, new;
603 
604 	if (rt && !(rt->dst.flags & DST_NOPEER)) {
605 		struct inet_peer *peer;
606 
607 		if (!rt->rt6i_peer)
608 			rt6_bind_peer(rt, 1);
609 		peer = rt->rt6i_peer;
610 		if (peer) {
611 			fhdr->identification = htonl(inet_getid(peer, 0));
612 			return;
613 		}
614 	}
615 	do {
616 		old = atomic_read(&ipv6_fragmentation_id);
617 		new = old + 1;
618 		if (!new)
619 			new = 1;
620 	} while (atomic_cmpxchg(&ipv6_fragmentation_id, old, new) != old);
621 	fhdr->identification = htonl(new);
622 }
623 
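/*
 * ip6_fragment() has two paths: a fast path that turns an existing,
 * well-formed frag_list into a fragment chain by prepending a copy of
 * the unfragmentable headers plus a Fragment header to each piece, and
 * a slow path that copies the payload into newly allocated fragments
 * of at most 'mtu' bytes each.
 */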
624 int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
625 {
626 	struct sk_buff *frag;
627 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
628 	struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
629 	struct ipv6hdr *tmp_hdr;
630 	struct frag_hdr *fh;
631 	unsigned int mtu, hlen, left, len;
632 	int hroom, troom;
633 	__be32 frag_id = 0;
634 	int ptr, offset = 0, err = 0;
635 	u8 *prevhdr, nexthdr = 0;
636 	struct net *net = dev_net(skb_dst(skb)->dev);
637 
638 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
639 	nexthdr = *prevhdr;
640 
641 	mtu = ip6_skb_dst_mtu(skb);
642 
643 	/* We must not fragment if the socket is set to force MTU discovery
644 	 * or if the skb was not generated by a local socket.
645 	 */
646 	if (unlikely(!skb->local_df && skb->len > mtu)) {
647 		if (skb->sk && dst_allfrag(skb_dst(skb)))
648 			sk_nocaps_add(skb->sk, NETIF_F_GSO_MASK);
649 
650 		skb->dev = skb_dst(skb)->dev;
651 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
652 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
653 			      IPSTATS_MIB_FRAGFAILS);
654 		kfree_skb(skb);
655 		return -EMSGSIZE;
656 	}
657 
658 	if (np && np->frag_size < mtu) {
659 		if (np->frag_size)
660 			mtu = np->frag_size;
661 	}
662 	mtu -= hlen + sizeof(struct frag_hdr);
663 
664 	if (skb_has_frag_list(skb)) {
665 		int first_len = skb_pagelen(skb);
666 		struct sk_buff *frag2;
667 
668 		if (first_len - hlen > mtu ||
669 		    ((first_len - hlen) & 7) ||
670 		    skb_cloned(skb))
671 			goto slow_path;
672 
673 		skb_walk_frags(skb, frag) {
674 			/* Correct geometry. */
675 			if (frag->len > mtu ||
676 			    ((frag->len & 7) && frag->next) ||
677 			    skb_headroom(frag) < hlen)
678 				goto slow_path_clean;
679 
680 			/* Partially cloned skb? */
681 			if (skb_shared(frag))
682 				goto slow_path_clean;
683 
684 			BUG_ON(frag->sk);
685 			if (skb->sk) {
686 				frag->sk = skb->sk;
687 				frag->destructor = sock_wfree;
688 			}
689 			skb->truesize -= frag->truesize;
690 		}
691 
692 		err = 0;
693 		offset = 0;
694 		frag = skb_shinfo(skb)->frag_list;
695 		skb_frag_list_init(skb);
696 		/* BUILD HEADER */
697 
698 		*prevhdr = NEXTHDR_FRAGMENT;
699 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
700 		if (!tmp_hdr) {
701 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
702 				      IPSTATS_MIB_FRAGFAILS);
703 			return -ENOMEM;
704 		}
705 
706 		__skb_pull(skb, hlen);
707 		fh = (struct frag_hdr *)__skb_push(skb, sizeof(struct frag_hdr));
708 		__skb_push(skb, hlen);
709 		skb_reset_network_header(skb);
710 		memcpy(skb_network_header(skb), tmp_hdr, hlen);
711 
712 		ipv6_select_ident(fh, rt);
713 		fh->nexthdr = nexthdr;
714 		fh->reserved = 0;
715 		fh->frag_off = htons(IP6_MF);
716 		frag_id = fh->identification;
717 
718 		first_len = skb_pagelen(skb);
719 		skb->data_len = first_len - skb_headlen(skb);
720 		skb->len = first_len;
721 		ipv6_hdr(skb)->payload_len = htons(first_len -
722 						   sizeof(struct ipv6hdr));
723 
724 		dst_hold(&rt->dst);
725 
726 		for (;;) {
727 			/* Prepare the header of the next frame
728 			 * before the previous one has gone down. */
729 			if (frag) {
730 				frag->ip_summed = CHECKSUM_NONE;
731 				skb_reset_transport_header(frag);
732 				fh = (struct frag_hdr *)__skb_push(frag, sizeof(struct frag_hdr));
733 				__skb_push(frag, hlen);
734 				skb_reset_network_header(frag);
735 				memcpy(skb_network_header(frag), tmp_hdr,
736 				       hlen);
737 				offset += skb->len - hlen - sizeof(struct frag_hdr);
738 				fh->nexthdr = nexthdr;
739 				fh->reserved = 0;
740 				fh->frag_off = htons(offset);
741 				if (frag->next != NULL)
742 					fh->frag_off |= htons(IP6_MF);
743 				fh->identification = frag_id;
744 				ipv6_hdr(frag)->payload_len =
745 						htons(frag->len -
746 						      sizeof(struct ipv6hdr));
747 				ip6_copy_metadata(frag, skb);
748 			}
749 
750 			err = output(skb);
751 			if (!err)
752 				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
753 					      IPSTATS_MIB_FRAGCREATES);
754 
755 			if (err || !frag)
756 				break;
757 
758 			skb = frag;
759 			frag = skb->next;
760 			skb->next = NULL;
761 		}
762 
763 		kfree(tmp_hdr);
764 
765 		if (err == 0) {
766 			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
767 				      IPSTATS_MIB_FRAGOKS);
768 			dst_release(&rt->dst);
769 			return 0;
770 		}
771 
772 		while (frag) {
773 			skb = frag->next;
774 			kfree_skb(frag);
775 			frag = skb;
776 		}
777 
778 		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
779 			      IPSTATS_MIB_FRAGFAILS);
780 		dst_release(&rt->dst);
781 		return err;
782 
783 slow_path_clean:
784 		skb_walk_frags(skb, frag2) {
785 			if (frag2 == frag)
786 				break;
787 			frag2->sk = NULL;
788 			frag2->destructor = NULL;
789 			skb->truesize += frag2->truesize;
790 		}
791 	}
792 
793 slow_path:
794 	if ((skb->ip_summed == CHECKSUM_PARTIAL) &&
795 	    skb_checksum_help(skb))
796 		goto fail;
797 
798 	left = skb->len - hlen;		/* Space per frame */
799 	ptr = hlen;			/* Where to start from */
800 
801 	/*
802 	 *	Fragment the datagram.
803 	 */
804 
805 	*prevhdr = NEXTHDR_FRAGMENT;
806 	hroom = LL_RESERVED_SPACE(rt->dst.dev);
807 	troom = rt->dst.dev->needed_tailroom;
808 
809 	/*
810 	 *	Keep copying data until we run out.
811 	 */
812 	while (left > 0) {
813 		len = left;
814 		/* IF: it doesn't fit, use 'mtu' - the data space left */
815 		if (len > mtu)
816 			len = mtu;
817 		/* IF: we are not sending up to and including the packet end
818 		   then align the next start on an eight-byte boundary */
819 		if (len < left)	{
820 			len &= ~7;
821 		}
822 		/*
823 		 *	Allocate buffer.
824 		 */
825 
826 		if ((frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
827 				      hroom + troom, GFP_ATOMIC)) == NULL) {
828 			NETDEBUG(KERN_INFO "IPv6: frag: no memory for new fragment!\n");
829 			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
830 				      IPSTATS_MIB_FRAGFAILS);
831 			err = -ENOMEM;
832 			goto fail;
833 		}
834 
835 		/*
836 		 *	Set up data on packet
837 		 */
838 
839 		ip6_copy_metadata(frag, skb);
840 		skb_reserve(frag, hroom);
841 		skb_put(frag, len + hlen + sizeof(struct frag_hdr));
842 		skb_reset_network_header(frag);
843 		fh = (struct frag_hdr *)(skb_network_header(frag) + hlen);
844 		frag->transport_header = (frag->network_header + hlen +
845 					  sizeof(struct frag_hdr));
846 
847 		/*
848 		 *	Charge the memory for the fragment to any owner
849 		 *	it might possess
850 		 */
851 		if (skb->sk)
852 			skb_set_owner_w(frag, skb->sk);
853 
854 		/*
855 		 *	Copy the packet header into the new buffer.
856 		 */
857 		skb_copy_from_linear_data(skb, skb_network_header(frag), hlen);
858 
859 		/*
860 		 *	Build fragment header.
861 		 */
862 		fh->nexthdr = nexthdr;
863 		fh->reserved = 0;
864 		if (!frag_id) {
865 			ipv6_select_ident(fh, rt);
866 			frag_id = fh->identification;
867 		} else
868 			fh->identification = frag_id;
869 
870 		/*
871 		 *	Copy a block of the IP datagram.
872 		 */
873 		if (skb_copy_bits(skb, ptr, skb_transport_header(frag), len))
874 			BUG();
875 		left -= len;
876 
877 		fh->frag_off = htons(offset);
878 		if (left > 0)
879 			fh->frag_off |= htons(IP6_MF);
880 		ipv6_hdr(frag)->payload_len = htons(frag->len -
881 						    sizeof(struct ipv6hdr));
882 
883 		ptr += len;
884 		offset += len;
885 
886 		/*
887 		 *	Put this fragment into the sending queue.
888 		 */
889 		err = output(frag);
890 		if (err)
891 			goto fail;
892 
893 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
894 			      IPSTATS_MIB_FRAGCREATES);
895 	}
896 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
897 		      IPSTATS_MIB_FRAGOKS);
898 	consume_skb(skb);
899 	return err;
900 
901 fail:
902 	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
903 		      IPSTATS_MIB_FRAGFAILS);
904 	kfree_skb(skb);
905 	return err;
906 }
907 
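/*
 * Returns nonzero when the cached route cannot be revalidated against
 * the flow address: it is neither a host route for that address nor
 * backed by a matching cached address.
 */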
908 static inline int ip6_rt_check(const struct rt6key *rt_key,
909 			       const struct in6_addr *fl_addr,
910 			       const struct in6_addr *addr_cache)
911 {
912 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
913 		(addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache));
914 }
915 
916 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
917 					  struct dst_entry *dst,
918 					  const struct flowi6 *fl6)
919 {
920 	struct ipv6_pinfo *np = inet6_sk(sk);
921 	struct rt6_info *rt = (struct rt6_info *)dst;
922 
923 	if (!dst)
924 		goto out;
925 
926 	/* Yes, checking route validity in the not-connected
927 	 * case is not very simple. Take into account
928 	 * that we do not support routing by source, TOS,
929 	 * or MSG_DONTROUTE 		--ANK (980726)
930 	 *
931 	 * 1. ip6_rt_check(): If the route was a host route,
932 	 *    check that the cached destination is current.
933 	 *    If it is a network route, we may still
934 	 *    check its validity using a saved pointer
935 	 *    to the last used address: daddr_cache.
936 	 *    We do not want to save the whole address now
937 	 *    (because the main consumer of this service
938 	 *    is TCP, which does not have this problem),
939 	 *    so this last trick works only on connected
940 	 *    sockets.
941 	 * 2. oif should also be the same.
942 	 */
943 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
944 #ifdef CONFIG_IPV6_SUBTREES
945 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
946 #endif
947 	    (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
948 		dst_release(dst);
949 		dst = NULL;
950 	}
951 
952 out:
953 	return dst;
954 }
955 
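/*
 * Common tail of the dst lookups: resolve the route if the caller did
 * not supply one, pick a source address when the flow has none, and
 * (with optimistic DAD) redo the lookup towards the default router if
 * the chosen source address is optimistic and the neighbour entry is
 * not yet valid.
 */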
956 static int ip6_dst_lookup_tail(struct sock *sk,
957 			       struct dst_entry **dst, struct flowi6 *fl6)
958 {
959 	struct net *net = sock_net(sk);
960 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
961 	struct neighbour *n;
962 #endif
963 	int err;
964 
965 	if (*dst == NULL)
966 		*dst = ip6_route_output(net, sk, fl6);
967 
968 	if ((err = (*dst)->error))
969 		goto out_err_release;
970 
971 	if (ipv6_addr_any(&fl6->saddr)) {
972 		struct rt6_info *rt = (struct rt6_info *) *dst;
973 		err = ip6_route_get_saddr(net, rt, &fl6->daddr,
974 					  sk ? inet6_sk(sk)->srcprefs : 0,
975 					  &fl6->saddr);
976 		if (err)
977 			goto out_err_release;
978 	}
979 
980 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
981 	/*
982 	 * Here, if the dst entry we've looked up
983 	 * has a neighbour entry that is in the INCOMPLETE
984 	 * state and the src address from the flow is
985 	 * marked as OPTIMISTIC, we release the found
986 	 * dst entry and replace it with the dst
987 	 * entry of the nexthop router instead.
988 	 */
989 	rcu_read_lock();
990 	n = dst_get_neighbour_noref(*dst);
991 	if (n && !(n->nud_state & NUD_VALID)) {
992 		struct inet6_ifaddr *ifp;
993 		struct flowi6 fl_gw6;
994 		int redirect;
995 
996 		rcu_read_unlock();
997 		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
998 				      (*dst)->dev, 1);
999 
1000 		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1001 		if (ifp)
1002 			in6_ifa_put(ifp);
1003 
1004 		if (redirect) {
1005 			/*
1006 			 * We need to get the dst entry for the
1007 			 * default router instead
1008 			 */
1009 			dst_release(*dst);
1010 			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1011 			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1012 			*dst = ip6_route_output(net, sk, &fl_gw6);
1013 			if ((err = (*dst)->error))
1014 				goto out_err_release;
1015 		}
1016 	} else {
1017 		rcu_read_unlock();
1018 	}
1019 #endif
1020 
1021 	return 0;
1022 
1023 out_err_release:
1024 	if (err == -ENETUNREACH)
1025 		IP6_INC_STATS_BH(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1026 	dst_release(*dst);
1027 	*dst = NULL;
1028 	return err;
1029 }
1030 
1031 /**
1032  *	ip6_dst_lookup - perform route lookup on flow
1033  *	@sk: socket which provides route info
1034  *	@dst: pointer to dst_entry * for result
1035  *	@fl6: flow to lookup
1036  *
1037  *	This function performs a route lookup on the given flow.
1038  *
1039  *	It returns zero on success, or a standard errno code on error.
1040  */
1041 int ip6_dst_lookup(struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6)
1042 {
1043 	*dst = NULL;
1044 	return ip6_dst_lookup_tail(sk, dst, fl6);
1045 }
1046 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1047 
1048 /**
1049  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1050  *	@sk: socket which provides route info
1051  *	@fl6: flow to lookup
1052  *	@final_dst: final destination address for ipsec lookup
1053  *	@can_sleep: we are in a sleepable context
1054  *
1055  *	This function performs a route lookup on the given flow.
1056  *
1057  *	It returns a valid dst pointer on success, or a pointer encoded
1058  *	error code.
1059  */
1060 struct dst_entry *ip6_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1061 				      const struct in6_addr *final_dst,
1062 				      bool can_sleep)
1063 {
1064 	struct dst_entry *dst = NULL;
1065 	int err;
1066 
1067 	err = ip6_dst_lookup_tail(sk, &dst, fl6);
1068 	if (err)
1069 		return ERR_PTR(err);
1070 	if (final_dst)
1071 		fl6->daddr = *final_dst;
1072 	if (can_sleep)
1073 		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1074 
1075 	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1076 }
1077 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1078 
1079 /**
1080  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1081  *	@sk: socket which provides the dst cache and route info
1082  *	@fl6: flow to lookup
1083  *	@final_dst: final destination address for ipsec lookup
1084  *	@can_sleep: we are in a sleepable context
1085  *
1086  *	This function performs a route lookup on the given flow with the
1087  *	possibility of using the cached route in the socket if it is valid.
1088  *	It will take the socket dst lock when operating on the dst cache.
1089  *	As a result, this function can only be used in process context.
1090  *
1091  *	It returns a valid dst pointer on success, or a pointer encoded
1092  *	error code.
1093  */
1094 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1095 					 const struct in6_addr *final_dst,
1096 					 bool can_sleep)
1097 {
1098 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1099 	int err;
1100 
1101 	dst = ip6_sk_dst_check(sk, dst, fl6);
1102 
1103 	err = ip6_dst_lookup_tail(sk, &dst, fl6);
1104 	if (err)
1105 		return ERR_PTR(err);
1106 	if (final_dst)
1107 		fl6->daddr = *final_dst;
1108 	if (can_sleep)
1109 		fl6->flowi6_flags |= FLOWI_FLAG_CAN_SLEEP;
1110 
1111 	return xfrm_lookup(sock_net(sk), dst, flowi6_to_flowi(fl6), sk, 0);
1112 }
1113 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1114 
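/*
 * UFO path of ip6_append_data(): build (or extend) a single large skb
 * whose payload lives in page fragments, and let the device segment it
 * into 8-byte-aligned fragments using the gso_size set here.
 */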
1115 static inline int ip6_ufo_append_data(struct sock *sk,
1116 			int getfrag(void *from, char *to, int offset, int len,
1117 			int odd, struct sk_buff *skb),
1118 			void *from, int length, int hh_len, int fragheaderlen,
1119 			int transhdrlen, int mtu, unsigned int flags,
1120 			struct rt6_info *rt)
1121 
1122 {
1123 	struct sk_buff *skb;
1124 	int err;
1125 
1126 	/* The network device supports UDP large send offload, so
1127 	 * create one single skb packet containing the complete
1128 	 * UDP datagram.
1129 	 */
1130 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
1131 		skb = sock_alloc_send_skb(sk,
1132 			hh_len + fragheaderlen + transhdrlen + 20,
1133 			(flags & MSG_DONTWAIT), &err);
1134 		if (skb == NULL)
1135 			return err;
1136 
1137 		/* reserve space for Hardware header */
1138 		skb_reserve(skb, hh_len);
1139 
1140 		/* create space for UDP/IP header */
1141 		skb_put(skb, fragheaderlen + transhdrlen);
1142 
1143 		/* initialize network header pointer */
1144 		skb_reset_network_header(skb);
1145 
1146 		/* initialize protocol header pointer */
1147 		skb->transport_header = skb->network_header + fragheaderlen;
1148 
1149 		skb->ip_summed = CHECKSUM_PARTIAL;
1150 		skb->csum = 0;
1151 	}
1152 
1153 	err = skb_append_datato_frags(sk, skb, getfrag, from,
1154 				      (length - transhdrlen));
1155 	if (!err) {
1156 		struct frag_hdr fhdr;
1157 
1158 		/* Specify the length of each IPv6 datagram fragment.
1159 		 * It has to be a multiple of 8.
1160 		 */
1161 		skb_shinfo(skb)->gso_size = (mtu - fragheaderlen -
1162 					     sizeof(struct frag_hdr)) & ~7;
1163 		skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
1164 		ipv6_select_ident(&fhdr, rt);
1165 		skb_shinfo(skb)->ip6_frag_id = fhdr.identification;
1166 		__skb_queue_tail(&sk->sk_write_queue, skb);
1167 
1168 		return 0;
1169 	}
1170 	/* There is not enough support to do UDP LSO,
1171 	 * so follow the normal path.
1172 	 */
1173 	kfree_skb(skb);
1174 
1175 	return err;
1176 }
1177 
1178 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1179 					       gfp_t gfp)
1180 {
1181 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1182 }
1183 
1184 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1185 						gfp_t gfp)
1186 {
1187 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1188 }
1189 
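/*
 * Queue data on sk->sk_write_queue, either appending to the tail skb
 * or allocating new buffers sized to the (corked) MTU. The IPv6 and
 * extension headers are only prepended later, when the queue is
 * flushed by ip6_push_pending_frames().
 */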
1190 int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
1191 	int offset, int len, int odd, struct sk_buff *skb),
1192 	void *from, int length, int transhdrlen,
1193 	int hlimit, int tclass, struct ipv6_txoptions *opt, struct flowi6 *fl6,
1194 	struct rt6_info *rt, unsigned int flags, int dontfrag)
1195 {
1196 	struct inet_sock *inet = inet_sk(sk);
1197 	struct ipv6_pinfo *np = inet6_sk(sk);
1198 	struct inet_cork *cork;
1199 	struct sk_buff *skb;
1200 	unsigned int maxfraglen, fragheaderlen;
1201 	int exthdrlen;
1202 	int dst_exthdrlen;
1203 	int hh_len;
1204 	int mtu;
1205 	int copy;
1206 	int err;
1207 	int offset = 0;
1208 	__u8 tx_flags = 0;
1209 
1210 	if (flags&MSG_PROBE)
1211 		return 0;
1212 	cork = &inet->cork.base;
1213 	if (skb_queue_empty(&sk->sk_write_queue)) {
1214 		/*
1215 		 * setup for corking
1216 		 */
1217 		if (opt) {
1218 			if (WARN_ON(np->cork.opt))
1219 				return -EINVAL;
1220 
1221 			np->cork.opt = kmalloc(opt->tot_len, sk->sk_allocation);
1222 			if (unlikely(np->cork.opt == NULL))
1223 				return -ENOBUFS;
1224 
1225 			np->cork.opt->tot_len = opt->tot_len;
1226 			np->cork.opt->opt_flen = opt->opt_flen;
1227 			np->cork.opt->opt_nflen = opt->opt_nflen;
1228 
1229 			np->cork.opt->dst0opt = ip6_opt_dup(opt->dst0opt,
1230 							    sk->sk_allocation);
1231 			if (opt->dst0opt && !np->cork.opt->dst0opt)
1232 				return -ENOBUFS;
1233 
1234 			np->cork.opt->dst1opt = ip6_opt_dup(opt->dst1opt,
1235 							    sk->sk_allocation);
1236 			if (opt->dst1opt && !np->cork.opt->dst1opt)
1237 				return -ENOBUFS;
1238 
1239 			np->cork.opt->hopopt = ip6_opt_dup(opt->hopopt,
1240 							   sk->sk_allocation);
1241 			if (opt->hopopt && !np->cork.opt->hopopt)
1242 				return -ENOBUFS;
1243 
1244 			np->cork.opt->srcrt = ip6_rthdr_dup(opt->srcrt,
1245 							    sk->sk_allocation);
1246 			if (opt->srcrt && !np->cork.opt->srcrt)
1247 				return -ENOBUFS;
1248 
1249 			/* need source address above --miyazawa */
1250 		}
1251 		dst_hold(&rt->dst);
1252 		cork->dst = &rt->dst;
1253 		inet->cork.fl.u.ip6 = *fl6;
1254 		np->cork.hop_limit = hlimit;
1255 		np->cork.tclass = tclass;
1256 		mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
1257 		      rt->dst.dev->mtu : dst_mtu(&rt->dst);
1258 		if (np->frag_size < mtu) {
1259 			if (np->frag_size)
1260 				mtu = np->frag_size;
1261 		}
1262 		cork->fragsize = mtu;
1263 		if (dst_allfrag(rt->dst.path))
1264 			cork->flags |= IPCORK_ALLFRAG;
1265 		cork->length = 0;
1266 		sk->sk_sndmsg_page = NULL;
1267 		sk->sk_sndmsg_off = 0;
1268 		exthdrlen = (opt ? opt->opt_flen : 0) - rt->rt6i_nfheader_len;
1269 		length += exthdrlen;
1270 		transhdrlen += exthdrlen;
1271 		dst_exthdrlen = rt->dst.header_len;
1272 	} else {
1273 		rt = (struct rt6_info *)cork->dst;
1274 		fl6 = &inet->cork.fl.u.ip6;
1275 		opt = np->cork.opt;
1276 		transhdrlen = 0;
1277 		exthdrlen = 0;
1278 		dst_exthdrlen = 0;
1279 		mtu = cork->fragsize;
1280 	}
1281 
1282 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1283 
1284 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1285 			(opt ? opt->opt_nflen : 0);
1286 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
1287 
1288 	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
1289 		if (cork->length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
1290 			ipv6_local_error(sk, EMSGSIZE, fl6, mtu-exthdrlen);
1291 			return -EMSGSIZE;
1292 		}
1293 	}
1294 
1295 	/* For UDP, check if TX timestamp is enabled */
1296 	if (sk->sk_type == SOCK_DGRAM) {
1297 		err = sock_tx_timestamp(sk, &tx_flags);
1298 		if (err)
1299 			goto error;
1300 	}
1301 
1302 	/*
1303 	 * Let's try using as much space as possible.
1304 	 * Use MTU if total length of the message fits into the MTU.
1305 	 * Otherwise, we need to reserve fragment header and
1306 	 * fragment alignment (= 8-15 octets, in total).
1307 	 *
1308 	 * Note that we may need to "move" the data from the tail
1309 	 * of the buffer to the new fragment when we split
1310 	 * the message.
1311 	 *
1312 	 * FIXME: It may be fragmented into multiple chunks
1313 	 *        at once if non-fragmentable extension headers
1314 	 *        are too large.
1315 	 * --yoshfuji
1316 	 */
1317 
1318 	cork->length += length;
1319 	if (length > mtu) {
1320 		int proto = sk->sk_protocol;
1321 		if (dontfrag && (proto == IPPROTO_UDP || proto == IPPROTO_RAW)) {
1322 			ipv6_local_rxpmtu(sk, fl6, mtu-exthdrlen);
1323 			return -EMSGSIZE;
1324 		}
1325 
1326 		if (proto == IPPROTO_UDP &&
1327 		    (rt->dst.dev->features & NETIF_F_UFO)) {
1328 
1329 			err = ip6_ufo_append_data(sk, getfrag, from, length,
1330 						  hh_len, fragheaderlen,
1331 						  transhdrlen, mtu, flags, rt);
1332 			if (err)
1333 				goto error;
1334 			return 0;
1335 		}
1336 	}
1337 
1338 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
1339 		goto alloc_new_skb;
1340 
1341 	while (length > 0) {
1342 		/* Check if the remaining data fits into current packet. */
1343 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1344 		if (copy < length)
1345 			copy = maxfraglen - skb->len;
1346 
1347 		if (copy <= 0) {
1348 			char *data;
1349 			unsigned int datalen;
1350 			unsigned int fraglen;
1351 			unsigned int fraggap;
1352 			unsigned int alloclen;
1353 			struct sk_buff *skb_prev;
1354 alloc_new_skb:
1355 			skb_prev = skb;
1356 
1357 			/* There's no room in the current skb */
1358 			if (skb_prev)
1359 				fraggap = skb_prev->len - maxfraglen;
1360 			else
1361 				fraggap = 0;
1362 
1363 			/*
1364 			 * If remaining data exceeds the mtu,
1365 			 * we know we need more fragment(s).
1366 			 */
1367 			datalen = length + fraggap;
1368 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1369 				datalen = maxfraglen - fragheaderlen;
1370 
1371 			fraglen = datalen + fragheaderlen;
1372 			if ((flags & MSG_MORE) &&
1373 			    !(rt->dst.dev->features&NETIF_F_SG))
1374 				alloclen = mtu;
1375 			else
1376 				alloclen = datalen + fragheaderlen;
1377 
1378 			alloclen += dst_exthdrlen;
1379 
1380 			/*
1381 			 * The last fragment gets additional space at tail.
1382 			 * Note: we overallocate on fragments with MSG_MORE
1383 			 * because we have no idea if we're the last one.
1384 			 */
1385 			if (datalen == length + fraggap)
1386 				alloclen += rt->dst.trailer_len;
1387 
1388 			/*
1389 			 * We just reserve space for the fragment header.
1390 			 * Note: this may be overallocation if the message
1391 			 * (without MSG_MORE) fits into the MTU.
1392 			 */
1393 			alloclen += sizeof(struct frag_hdr);
1394 
1395 			if (transhdrlen) {
1396 				skb = sock_alloc_send_skb(sk,
1397 						alloclen + hh_len,
1398 						(flags & MSG_DONTWAIT), &err);
1399 			} else {
1400 				skb = NULL;
1401 				if (atomic_read(&sk->sk_wmem_alloc) <=
1402 				    2 * sk->sk_sndbuf)
1403 					skb = sock_wmalloc(sk,
1404 							   alloclen + hh_len, 1,
1405 							   sk->sk_allocation);
1406 				if (unlikely(skb == NULL))
1407 					err = -ENOBUFS;
1408 				else {
1409 					/* Only the initial fragment
1410 					 * is time stamped.
1411 					 */
1412 					tx_flags = 0;
1413 				}
1414 			}
1415 			if (skb == NULL)
1416 				goto error;
1417 			/*
1418 			 *	Fill in the control structures
1419 			 */
1420 			skb->ip_summed = CHECKSUM_NONE;
1421 			skb->csum = 0;
1422 			/* reserve for fragmentation and ipsec header */
1423 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1424 				    dst_exthdrlen);
1425 
1426 			if (sk->sk_type == SOCK_DGRAM)
1427 				skb_shinfo(skb)->tx_flags = tx_flags;
1428 
1429 			/*
1430 			 *	Find where to start putting bytes
1431 			 */
1432 			data = skb_put(skb, fraglen);
1433 			skb_set_network_header(skb, exthdrlen);
1434 			data += fragheaderlen;
1435 			skb->transport_header = (skb->network_header +
1436 						 fragheaderlen);
1437 			if (fraggap) {
1438 				skb->csum = skb_copy_and_csum_bits(
1439 					skb_prev, maxfraglen,
1440 					data + transhdrlen, fraggap, 0);
1441 				skb_prev->csum = csum_sub(skb_prev->csum,
1442 							  skb->csum);
1443 				data += fraggap;
1444 				pskb_trim_unique(skb_prev, maxfraglen);
1445 			}
1446 			copy = datalen - transhdrlen - fraggap;
1447 
1448 			if (copy < 0) {
1449 				err = -EINVAL;
1450 				kfree_skb(skb);
1451 				goto error;
1452 			} else if (copy > 0 && getfrag(from, data + transhdrlen, offset, copy, fraggap, skb) < 0) {
1453 				err = -EFAULT;
1454 				kfree_skb(skb);
1455 				goto error;
1456 			}
1457 
1458 			offset += copy;
1459 			length -= datalen - fraggap;
1460 			transhdrlen = 0;
1461 			exthdrlen = 0;
1462 			dst_exthdrlen = 0;
1463 
1464 			/*
1465 			 * Put the packet on the pending queue
1466 			 */
1467 			__skb_queue_tail(&sk->sk_write_queue, skb);
1468 			continue;
1469 		}
1470 
1471 		if (copy > length)
1472 			copy = length;
1473 
1474 		if (!(rt->dst.dev->features&NETIF_F_SG)) {
1475 			unsigned int off;
1476 
1477 			off = skb->len;
1478 			if (getfrag(from, skb_put(skb, copy),
1479 						offset, copy, off, skb) < 0) {
1480 				__skb_trim(skb, off);
1481 				err = -EFAULT;
1482 				goto error;
1483 			}
1484 		} else {
1485 			int i = skb_shinfo(skb)->nr_frags;
1486 			skb_frag_t *frag = &skb_shinfo(skb)->frags[i-1];
1487 			struct page *page = sk->sk_sndmsg_page;
1488 			int off = sk->sk_sndmsg_off;
1489 			unsigned int left;
1490 
1491 			if (page && (left = PAGE_SIZE - off) > 0) {
1492 				if (copy >= left)
1493 					copy = left;
1494 				if (page != skb_frag_page(frag)) {
1495 					if (i == MAX_SKB_FRAGS) {
1496 						err = -EMSGSIZE;
1497 						goto error;
1498 					}
1499 					skb_fill_page_desc(skb, i, page, sk->sk_sndmsg_off, 0);
1500 					skb_frag_ref(skb, i);
1501 					frag = &skb_shinfo(skb)->frags[i];
1502 				}
1503 			} else if (i < MAX_SKB_FRAGS) {
1504 				if (copy > PAGE_SIZE)
1505 					copy = PAGE_SIZE;
1506 				page = alloc_pages(sk->sk_allocation, 0);
1507 				if (page == NULL) {
1508 					err = -ENOMEM;
1509 					goto error;
1510 				}
1511 				sk->sk_sndmsg_page = page;
1512 				sk->sk_sndmsg_off = 0;
1513 
1514 				skb_fill_page_desc(skb, i, page, 0, 0);
1515 				frag = &skb_shinfo(skb)->frags[i];
1516 			} else {
1517 				err = -EMSGSIZE;
1518 				goto error;
1519 			}
1520 			if (getfrag(from,
1521 				    skb_frag_address(frag) + skb_frag_size(frag),
1522 				    offset, copy, skb->len, skb) < 0) {
1523 				err = -EFAULT;
1524 				goto error;
1525 			}
1526 			sk->sk_sndmsg_off += copy;
1527 			skb_frag_size_add(frag, copy);
1528 			skb->len += copy;
1529 			skb->data_len += copy;
1530 			skb->truesize += copy;
1531 			atomic_add(copy, &sk->sk_wmem_alloc);
1532 		}
1533 		offset += copy;
1534 		length -= copy;
1535 	}
1536 	return 0;
1537 error:
1538 	cork->length -= length;
1539 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1540 	return err;
1541 }
1542 EXPORT_SYMBOL_GPL(ip6_append_data);
1543 
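/* Free the duplicated cork options and drop the cork's dst reference. */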
1544 static void ip6_cork_release(struct inet_sock *inet, struct ipv6_pinfo *np)
1545 {
1546 	if (np->cork.opt) {
1547 		kfree(np->cork.opt->dst0opt);
1548 		kfree(np->cork.opt->dst1opt);
1549 		kfree(np->cork.opt->hopopt);
1550 		kfree(np->cork.opt->srcrt);
1551 		kfree(np->cork.opt);
1552 		np->cork.opt = NULL;
1553 	}
1554 
1555 	if (inet->cork.base.dst) {
1556 		dst_release(inet->cork.base.dst);
1557 		inet->cork.base.dst = NULL;
1558 		inet->cork.base.flags &= ~IPCORK_ALLFRAG;
1559 	}
1560 	memset(&inet->cork.fl, 0, sizeof(inet->cork.fl));
1561 }
1562 
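/*
 * Collapse the write queue into a single skb with a frag_list, prepend
 * the queued extension headers and the IPv6 header built from the
 * corked flow, then send the result through ip6_local_out().
 */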
1563 int ip6_push_pending_frames(struct sock *sk)
1564 {
1565 	struct sk_buff *skb, *tmp_skb;
1566 	struct sk_buff **tail_skb;
1567 	struct in6_addr final_dst_buf, *final_dst = &final_dst_buf;
1568 	struct inet_sock *inet = inet_sk(sk);
1569 	struct ipv6_pinfo *np = inet6_sk(sk);
1570 	struct net *net = sock_net(sk);
1571 	struct ipv6hdr *hdr;
1572 	struct ipv6_txoptions *opt = np->cork.opt;
1573 	struct rt6_info *rt = (struct rt6_info *)inet->cork.base.dst;
1574 	struct flowi6 *fl6 = &inet->cork.fl.u.ip6;
1575 	unsigned char proto = fl6->flowi6_proto;
1576 	int err = 0;
1577 
1578 	if ((skb = __skb_dequeue(&sk->sk_write_queue)) == NULL)
1579 		goto out;
1580 	tail_skb = &(skb_shinfo(skb)->frag_list);
1581 
1582 	/* move skb->data to ip header from ext header */
1583 	if (skb->data < skb_network_header(skb))
1584 		__skb_pull(skb, skb_network_offset(skb));
1585 	while ((tmp_skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) {
1586 		__skb_pull(tmp_skb, skb_network_header_len(skb));
1587 		*tail_skb = tmp_skb;
1588 		tail_skb = &(tmp_skb->next);
1589 		skb->len += tmp_skb->len;
1590 		skb->data_len += tmp_skb->len;
1591 		skb->truesize += tmp_skb->truesize;
1592 		tmp_skb->destructor = NULL;
1593 		tmp_skb->sk = NULL;
1594 	}
1595 
1596 	/* Allow local fragmentation. */
1597 	if (np->pmtudisc < IPV6_PMTUDISC_DO)
1598 		skb->local_df = 1;
1599 
1600 	*final_dst = fl6->daddr;
1601 	__skb_pull(skb, skb_network_header_len(skb));
1602 	if (opt && opt->opt_flen)
1603 		ipv6_push_frag_opts(skb, opt, &proto);
1604 	if (opt && opt->opt_nflen)
1605 		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst);
1606 
1607 	skb_push(skb, sizeof(struct ipv6hdr));
1608 	skb_reset_network_header(skb);
1609 	hdr = ipv6_hdr(skb);
1610 
1611 	*(__be32 *)hdr = fl6->flowlabel |
1612 		     htonl(0x60000000 | ((int)np->cork.tclass << 20));
1613 
1614 	hdr->hop_limit = np->cork.hop_limit;
1615 	hdr->nexthdr = proto;
1616 	hdr->saddr = fl6->saddr;
1617 	hdr->daddr = *final_dst;
1618 
1619 	skb->priority = sk->sk_priority;
1620 	skb->mark = sk->sk_mark;
1621 
1622 	skb_dst_set(skb, dst_clone(&rt->dst));
1623 	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
1624 	if (proto == IPPROTO_ICMPV6) {
1625 		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1626 
1627 		ICMP6MSGOUT_INC_STATS_BH(net, idev, icmp6_hdr(skb)->icmp6_type);
1628 		ICMP6_INC_STATS_BH(net, idev, ICMP6_MIB_OUTMSGS);
1629 	}
1630 
1631 	err = ip6_local_out(skb);
1632 	if (err) {
1633 		if (err > 0)
1634 			err = net_xmit_errno(err);
1635 		if (err)
1636 			goto error;
1637 	}
1638 
1639 out:
1640 	ip6_cork_release(inet, np);
1641 	return err;
1642 error:
1643 	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1644 	goto out;
1645 }
1646 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
1647 
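/* Discard everything queued for the cork without transmitting it. */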
1648 void ip6_flush_pending_frames(struct sock *sk)
1649 {
1650 	struct sk_buff *skb;
1651 
1652 	while ((skb = __skb_dequeue_tail(&sk->sk_write_queue)) != NULL) {
1653 		if (skb_dst(skb))
1654 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
1655 				      IPSTATS_MIB_OUTDISCARDS);
1656 		kfree_skb(skb);
1657 	}
1658 
1659 	ip6_cork_release(inet_sk(sk), inet6_sk(sk));
1660 }
1661 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
1662