xref: /linux/net/ipv6/ip6_output.c (revision 47aab53331effedd3f5a6136854bd1da011f94b6)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	IPv6 output functions
4  *	Linux INET6 implementation
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  *
9  *	Based on linux/net/ipv4/ip_output.c
10  *
11  *	Changes:
12  *	A.N.Kuznetsov	:	airthmetics in fragmentation.
13  *				extension headers are implemented.
14  *				route changes now work.
15  *				ip6_forward does not confuse sniffers.
16  *				etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *	Imran Patel	:	frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *			:       add ip6_append_data and related functions
22  *				for datagram xmit
23  */
24 
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37 
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41 
42 #include <net/sock.h>
43 #include <net/snmp.h>
44 
45 #include <net/gso.h>
46 #include <net/ipv6.h>
47 #include <net/ndisc.h>
48 #include <net/protocol.h>
49 #include <net/ip6_route.h>
50 #include <net/addrconf.h>
51 #include <net/rawv6.h>
52 #include <net/icmp.h>
53 #include <net/xfrm.h>
54 #include <net/checksum.h>
55 #include <linux/mroute6.h>
56 #include <net/l3mdev.h>
57 #include <net/lwtunnel.h>
58 #include <net/ip_tunnels.h>
59 
60 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
61 {
62 	struct dst_entry *dst = skb_dst(skb);
63 	struct net_device *dev = dst->dev;
64 	struct inet6_dev *idev = ip6_dst_idev(dst);
65 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
66 	const struct in6_addr *daddr, *nexthop;
67 	struct ipv6hdr *hdr;
68 	struct neighbour *neigh;
69 	int ret;
70 
71 	/* Be paranoid, rather than too clever. */
72 	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
73 		skb = skb_expand_head(skb, hh_len);
74 		if (!skb) {
75 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
76 			return -ENOMEM;
77 		}
78 	}
79 
80 	hdr = ipv6_hdr(skb);
81 	daddr = &hdr->daddr;
82 	if (ipv6_addr_is_multicast(daddr)) {
83 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
84 		    ((mroute6_is_socket(net, skb) &&
85 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
86 		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
87 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
88 
89 			/* Do not check for IFF_ALLMULTI; multicast routing
90 			   is not supported in any case.
91 			 */
92 			if (newskb)
93 				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
94 					net, sk, newskb, NULL, newskb->dev,
95 					dev_loopback_xmit);
96 
97 			if (hdr->hop_limit == 0) {
98 				IP6_INC_STATS(net, idev,
99 					      IPSTATS_MIB_OUTDISCARDS);
100 				kfree_skb(skb);
101 				return 0;
102 			}
103 		}
104 
105 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
106 		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
107 		    !(dev->flags & IFF_LOOPBACK)) {
108 			kfree_skb(skb);
109 			return 0;
110 		}
111 	}
112 
113 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
114 		int res = lwtunnel_xmit(skb);
115 
116 		if (res < 0 || res == LWTUNNEL_XMIT_DONE)
117 			return res;
118 	}
119 
120 	rcu_read_lock();
121 	nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
122 	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
123 
124 	if (unlikely(IS_ERR_OR_NULL(neigh))) {
125 		if (unlikely(!neigh))
126 			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
127 		if (IS_ERR(neigh)) {
128 			rcu_read_unlock();
129 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
130 			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
131 			return -EINVAL;
132 		}
133 	}
134 	sock_confirm_neigh(skb, neigh);
135 	ret = neigh_output(neigh, skb, false);
136 	rcu_read_unlock();
137 	return ret;
138 }
139 
140 static int
141 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
142 				    struct sk_buff *skb, unsigned int mtu)
143 {
144 	struct sk_buff *segs, *nskb;
145 	netdev_features_t features;
146 	int ret = 0;
147 
148 	/* Please see corresponding comment in ip_finish_output_gso
149 	 * describing the cases where GSO segment length exceeds the
150 	 * egress MTU.
151 	 */
152 	features = netif_skb_features(skb);
153 	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
154 	if (IS_ERR_OR_NULL(segs)) {
155 		kfree_skb(skb);
156 		return -ENOMEM;
157 	}
158 
159 	consume_skb(skb);
160 
161 	skb_list_walk_safe(segs, segs, nskb) {
162 		int err;
163 
164 		skb_mark_not_on_list(segs);
165 		err = ip6_fragment(net, sk, segs, ip6_finish_output2);
166 		if (err && ret == 0)
167 			ret = err;
168 	}
169 
170 	return ret;
171 }
172 
173 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
174 {
175 	unsigned int mtu;
176 
177 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
178 	/* Policy lookup after SNAT yielded a new policy */
179 	if (skb_dst(skb)->xfrm) {
180 		IP6CB(skb)->flags |= IP6SKB_REROUTED;
181 		return dst_output(net, sk, skb);
182 	}
183 #endif
184 
185 	mtu = ip6_skb_dst_mtu(skb);
186 	if (skb_is_gso(skb) &&
187 	    !(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
188 	    !skb_gso_validate_network_len(skb, mtu))
189 		return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
190 
191 	if ((skb->len > mtu && !skb_is_gso(skb)) ||
192 	    dst_allfrag(skb_dst(skb)) ||
193 	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
194 		return ip6_fragment(net, sk, skb, ip6_finish_output2);
195 	else
196 		return ip6_finish_output2(net, sk, skb);
197 }
198 
199 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
200 {
201 	int ret;
202 
203 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
204 	switch (ret) {
205 	case NET_XMIT_SUCCESS:
206 	case NET_XMIT_CN:
207 		return __ip6_finish_output(net, sk, skb) ? : ret;
208 	default:
209 		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
210 		return ret;
211 	}
212 }
213 
214 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
215 {
216 	struct net_device *dev = skb_dst(skb)->dev, *indev = skb->dev;
217 	struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
218 
219 	skb->protocol = htons(ETH_P_IPV6);
220 	skb->dev = dev;
221 
222 	if (unlikely(idev->cnf.disable_ipv6)) {
223 		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
224 		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
225 		return 0;
226 	}
227 
228 	return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
229 			    net, sk, skb, indev, dev,
230 			    ip6_finish_output,
231 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
232 }
233 EXPORT_SYMBOL(ip6_output);
234 
235 bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
236 {
237 	if (!np->autoflowlabel_set)
238 		return ip6_default_np_autolabel(net);
239 	else
240 		return np->autoflowlabel;
241 }
242 
243 /*
244  * xmit an sk_buff (used by TCP, SCTP and DCCP)
245  * Note : socket lock is not held for SYNACK packets, but might be modified
246  * by calls to skb_set_owner_w() and ipv6_local_error(),
247  * which are using proper atomic operations or spinlocks.
248  */
249 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
250 	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
251 {
252 	struct net *net = sock_net(sk);
253 	const struct ipv6_pinfo *np = inet6_sk(sk);
254 	struct in6_addr *first_hop = &fl6->daddr;
255 	struct dst_entry *dst = skb_dst(skb);
256 	struct net_device *dev = dst->dev;
257 	struct inet6_dev *idev = ip6_dst_idev(dst);
258 	struct hop_jumbo_hdr *hop_jumbo;
259 	int hoplen = sizeof(*hop_jumbo);
260 	unsigned int head_room;
261 	struct ipv6hdr *hdr;
262 	u8  proto = fl6->flowi6_proto;
263 	int seg_len = skb->len;
264 	int hlimit = -1;
265 	u32 mtu;
266 
267 	head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
268 	if (opt)
269 		head_room += opt->opt_nflen + opt->opt_flen;
270 
271 	if (unlikely(head_room > skb_headroom(skb))) {
272 		skb = skb_expand_head(skb, head_room);
273 		if (!skb) {
274 			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
275 			return -ENOBUFS;
276 		}
277 	}
278 
279 	if (opt) {
280 		seg_len += opt->opt_nflen + opt->opt_flen;
281 
282 		if (opt->opt_flen)
283 			ipv6_push_frag_opts(skb, opt, &proto);
284 
285 		if (opt->opt_nflen)
286 			ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
287 					     &fl6->saddr);
288 	}
289 
290 	if (unlikely(seg_len > IPV6_MAXPLEN)) {
291 		hop_jumbo = skb_push(skb, hoplen);
292 
293 		hop_jumbo->nexthdr = proto;
294 		hop_jumbo->hdrlen = 0;
295 		hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
296 		hop_jumbo->tlv_len = 4;
297 		hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);
298 
299 		proto = IPPROTO_HOPOPTS;
300 		seg_len = 0;
301 		IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
302 	}
303 
304 	skb_push(skb, sizeof(struct ipv6hdr));
305 	skb_reset_network_header(skb);
306 	hdr = ipv6_hdr(skb);
307 
308 	/*
309 	 *	Fill in the IPv6 header
310 	 */
311 	if (np)
312 		hlimit = np->hop_limit;
313 	if (hlimit < 0)
314 		hlimit = ip6_dst_hoplimit(dst);
315 
316 	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
317 				ip6_autoflowlabel(net, np), fl6));
318 
319 	hdr->payload_len = htons(seg_len);
320 	hdr->nexthdr = proto;
321 	hdr->hop_limit = hlimit;
322 
323 	hdr->saddr = fl6->saddr;
324 	hdr->daddr = *first_hop;
325 
326 	skb->protocol = htons(ETH_P_IPV6);
327 	skb->priority = priority;
328 	skb->mark = mark;
329 
330 	mtu = dst_mtu(dst);
331 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
332 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
333 
334 		/* if egress device is enslaved to an L3 master device pass the
335 		 * skb to its handler for processing
336 		 */
337 		skb = l3mdev_ip6_out((struct sock *)sk, skb);
338 		if (unlikely(!skb))
339 			return 0;
340 
341 		/* hooks should never assume socket lock is held.
342 		 * we promote our socket to non const
343 		 */
344 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
345 			       net, (struct sock *)sk, skb, NULL, dev,
346 			       dst_output);
347 	}
348 
349 	skb->dev = dev;
350 	/* ipv6_local_error() does not require socket lock,
351 	 * we promote our socket to non const
352 	 */
353 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
354 
355 	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
356 	kfree_skb(skb);
357 	return -EMSGSIZE;
358 }
359 EXPORT_SYMBOL(ip6_xmit);
360 
361 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
362 {
363 	struct ip6_ra_chain *ra;
364 	struct sock *last = NULL;
365 
366 	read_lock(&ip6_ra_lock);
367 	for (ra = ip6_ra_chain; ra; ra = ra->next) {
368 		struct sock *sk = ra->sk;
369 		if (sk && ra->sel == sel &&
370 		    (!sk->sk_bound_dev_if ||
371 		     sk->sk_bound_dev_if == skb->dev->ifindex)) {
372 			struct ipv6_pinfo *np = inet6_sk(sk);
373 
374 			if (np && np->rtalert_isolate &&
375 			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
376 				continue;
377 			}
378 			if (last) {
379 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
380 				if (skb2)
381 					rawv6_rcv(last, skb2);
382 			}
383 			last = sk;
384 		}
385 	}
386 
387 	if (last) {
388 		rawv6_rcv(last, skb);
389 		read_unlock(&ip6_ra_lock);
390 		return 1;
391 	}
392 	read_unlock(&ip6_ra_lock);
393 	return 0;
394 }
395 
396 static int ip6_forward_proxy_check(struct sk_buff *skb)
397 {
398 	struct ipv6hdr *hdr = ipv6_hdr(skb);
399 	u8 nexthdr = hdr->nexthdr;
400 	__be16 frag_off;
401 	int offset;
402 
403 	if (ipv6_ext_hdr(nexthdr)) {
404 		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
405 		if (offset < 0)
406 			return 0;
407 	} else
408 		offset = sizeof(struct ipv6hdr);
409 
410 	if (nexthdr == IPPROTO_ICMPV6) {
411 		struct icmp6hdr *icmp6;
412 
413 		if (!pskb_may_pull(skb, (skb_network_header(skb) +
414 					 offset + 1 - skb->data)))
415 			return 0;
416 
417 		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
418 
419 		switch (icmp6->icmp6_type) {
420 		case NDISC_ROUTER_SOLICITATION:
421 		case NDISC_ROUTER_ADVERTISEMENT:
422 		case NDISC_NEIGHBOUR_SOLICITATION:
423 		case NDISC_NEIGHBOUR_ADVERTISEMENT:
424 		case NDISC_REDIRECT:
425 			/* For reaction involving unicast neighbor discovery
426 			 * message destined to the proxied address, pass it to
427 			 * input function.
428 			 */
429 			return 1;
430 		default:
431 			break;
432 		}
433 	}
434 
435 	/*
436 	 * The proxying router can't forward traffic sent to a link-local
437 	 * address, so signal the sender and discard the packet. This
438 	 * behavior is clarified by the MIPv6 specification.
439 	 */
440 	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
441 		dst_link_failure(skb);
442 		return -1;
443 	}
444 
445 	return 0;
446 }
447 
448 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
449 				     struct sk_buff *skb)
450 {
451 	struct dst_entry *dst = skb_dst(skb);
452 
453 	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
454 	__IP6_ADD_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
455 
456 #ifdef CONFIG_NET_SWITCHDEV
457 	if (skb->offload_l3_fwd_mark) {
458 		consume_skb(skb);
459 		return 0;
460 	}
461 #endif
462 
463 	skb_clear_tstamp(skb);
464 	return dst_output(net, sk, skb);
465 }
466 
467 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
468 {
469 	if (skb->len <= mtu)
470 		return false;
471 
472 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
473 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
474 		return true;
475 
476 	if (skb->ignore_df)
477 		return false;
478 
479 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
480 		return false;
481 
482 	return true;
483 }
484 
/*
 * Forward a received IPv6 packet towards the next hop of its dst.
 *
 * In order, the function: checks that forwarding is enabled and that
 * the packet is eligible (PACKET_HOST, no owning socket, not LRO, passes
 * the xfrm forward policy); offers Router Alert packets to registered
 * sockets; enforces the hop limit; diverts packets for proxied
 * neighbours; lets xfrm pick a new route; sends a redirect or validates
 * the source address; checks the MTU; makes the header writable,
 * decrements hop_limit and runs the NF_INET_FORWARD hook with
 * ip6_forward_finish() as continuation.
 *
 * The skb is always consumed.  Returns 0 when a Router Alert socket
 * took the packet, -ETIMEDOUT on hop limit expiry, -EMSGSIZE when the
 * packet is too big, -EINVAL for the other drops, or the result of
 * ip6_input() / the netfilter hook.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst->dev);
	struct inet6_dev *idev;
	/* Declares the drop reason used at the "drop" label; its initial
	 * value comes from the SKB_DR() macro (defined outside this file).
	 */
	SKB_DR(reason);
	u32 mtu;

	/* Input-side inet6 device, used for the In* counters; may be NULL
	 * (see the !idev test below).
	 */
	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	if (net->ipv6.devconf_all->forwarding == 0)
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	/* The xfrm forward policy check is skipped when disable_policy is
	 * set globally or on the input device.
	 */
	if (!net->ipv6.devconf_all->disable_policy &&
	    (!idev || !idev->cnf.disable_policy) &&
	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 *	(the decrement itself happens further down, after skb_cow())
	 */
	if (hdr->hop_limit <= 1) {
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (net->ipv6.devconf_all->proxy_ndp &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev, 0)) {
		/* NOTE(review): ip6_forward_proxy_check() calls
		 * pskb_may_pull(), while "hdr" from the top of this function
		 * is still used below - confirm it cannot go stale here.
		 */
		int proxied = ip6_forward_proxy_check(skb);
		if (proxied > 0) {
			/* It's tempting to decrease the hop limit
			 * here by 1, as we do at the end of the
			 * function too.
			 *
			 * But that would be incorrect, as proxying is
			 * not forwarding.  The ip6_input function
			 * will handle this packet locally, and it
			 * depends on the hop limit being unchanged.
			 *
			 * One example is the NDP hop limit, that
			 * always has to stay 255, but other would be
			 * similar checks around RA packets, where the
			 * user can even change the desired limit.
			 */
			return ip6_input(skb);
		} else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		SKB_DR_SET(reason, XFRM_POLICY);
		goto drop;
	}
	/* Re-read the dst after xfrm6_route_forward(); presumably it can
	 * replace the skb's dst - confirm against xfrm.
	 */
	dst = skb_dst(skb);

	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dst->dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		/* Redirect target: the gateway for gateway routes, otherwise
		 * the destination itself.
		 */
		rt = (struct rt6_info *) dst;
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr, 1);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		if (peer)
			inet_putpeer(peer);
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		/* A link-local source must not be forwarded to another link. */
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	/* Never use an MTU below the IPv6 minimum. */
	mtu = ip6_dst_mtu_maybe_forward(dst, true);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (ip6_pkt_too_big(skb, mtu)) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dst->dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dst->dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	/* Reload the header pointer after skb_cow(). */
	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dst->dev,
		       ip6_forward_finish);

error:
	/* Address error: count it, set the drop reason, fall through. */
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
	SKB_DR_SET(reason, IP_INADDRERRORS);
drop:
	kfree_skb_reason(skb, reason);
	return -EINVAL;
}
661 
662 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
663 {
664 	to->pkt_type = from->pkt_type;
665 	to->priority = from->priority;
666 	to->protocol = from->protocol;
667 	skb_dst_drop(to);
668 	skb_dst_set(to, dst_clone(skb_dst(from)));
669 	to->dev = from->dev;
670 	to->mark = from->mark;
671 
672 	skb_copy_hash(to, from);
673 
674 #ifdef CONFIG_NET_SCHED
675 	to->tc_index = from->tc_index;
676 #endif
677 	nf_copy(to, from);
678 	skb_ext_copy(to, from);
679 	skb_copy_secmark(to, from);
680 }
681 
/*
 * Prepare fast-path fragmentation of an skb that already carries its
 * payload as a frag_list: turn the head skb into the first fragment and
 * set up @iter for walking the remaining ones.
 *
 * @skb:	head skb; becomes the first fragment
 * @hlen:	length of the header part that is copied into every fragment
 * @prevhdr:	points at the "next header" byte that gets overwritten
 *		with NEXTHDR_FRAGMENT
 * @nexthdr:	value stored in the fragment header's nexthdr field
 * @frag_id:	fragment identification stored in every fragment header
 * @iter:	iterator state, filled in here
 *
 * Returns 0 on success or -ENOMEM if the header copy cannot be
 * allocated.  On success iter->tmp_hdr is a kmemdup() allocation
 * (ip6_fragment() kfree()s it when done).
 */
int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
		      u8 nexthdr, __be32 frag_id,
		      struct ip6_fraglist_iter *iter)
{
	unsigned int first_len;
	struct frag_hdr *fh;

	/* BUILD HEADER */
	*prevhdr = NEXTHDR_FRAGMENT;
	/* Keep a private copy of the first hlen header bytes (already
	 * patched above); it is replicated into each fragment.
	 */
	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
	if (!iter->tmp_hdr)
		return -ENOMEM;

	/* Detach the frag list from the head skb; the iterator owns it now. */
	iter->frag = skb_shinfo(skb)->frag_list;
	skb_frag_list_init(skb);

	iter->offset = 0;
	iter->hlen = hlen;
	iter->frag_id = frag_id;
	iter->nexthdr = nexthdr;

	/* Open a gap for the fragment header right after the first hlen
	 * bytes: pull them, push the fragment header, push hlen bytes
	 * again and restore them from the saved copy.
	 */
	__skb_pull(skb, hlen);
	fh = __skb_push(skb, sizeof(struct frag_hdr));
	__skb_push(skb, hlen);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

	/* First fragment: offset 0, more fragments follow. */
	fh->nexthdr = nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(IP6_MF);
	fh->identification = frag_id;

	/* The head skb now covers only its own linear and paged data. */
	first_len = skb_pagelen(skb);
	skb->data_len = first_len - skb_headlen(skb);
	skb->len = first_len;
	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

	return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);
722 
/*
 * Turn the next frag_list member (iter->frag) into a complete fragment:
 * prepend the saved headers and a fragment header, fill in offset and
 * lengths, and copy the metadata from @skb.
 *
 * @skb:	the fragment built before this one; its length advances
 *		iter->offset and its metadata is copied
 * @iter:	iterator set up by ip6_fraglist_init()
 */
void ip6_fraglist_prepare(struct sk_buff *skb,
			  struct ip6_fraglist_iter *iter)
{
	struct sk_buff *frag = iter->frag;
	unsigned int hlen = iter->hlen;
	struct frag_hdr *fh;

	frag->ip_summed = CHECKSUM_NONE;
	/* Current data start becomes the transport header; the fragment
	 * header and the copied headers go in front of it.
	 */
	skb_reset_transport_header(frag);
	fh = __skb_push(frag, sizeof(struct frag_hdr));
	__skb_push(frag, hlen);
	skb_reset_network_header(frag);
	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
	/* Advance by the payload bytes carried by the previous fragment. */
	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
	fh->nexthdr = iter->nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(iter->offset);
	/* Every fragment but the last one carries the MF flag. */
	if (frag->next)
		fh->frag_off |= htons(IP6_MF);
	fh->identification = iter->frag_id;
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
	ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);
747 
748 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
749 		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
750 		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
751 {
752 	state->prevhdr = prevhdr;
753 	state->nexthdr = nexthdr;
754 	state->frag_id = frag_id;
755 
756 	state->hlen = hlen;
757 	state->mtu = mtu;
758 
759 	state->left = skb->len - hlen;	/* Space per frame */
760 	state->ptr = hlen;		/* Where to start from */
761 
762 	state->hroom = hdr_room;
763 	state->troom = needed_tailroom;
764 
765 	state->offset = 0;
766 }
767 EXPORT_SYMBOL(ip6_frag_init);
768 
/*
 * Slow-path fragmentation step: allocate a new skb and fill it with the
 * copied headers, a fragment header and the next chunk of payload taken
 * from @skb.
 *
 * @skb:	packet being fragmented (not modified here)
 * @state:	progress state from ip6_frag_init(); left, ptr and offset
 *		are advanced by the amount of payload copied
 *
 * Returns the new fragment, or ERR_PTR(-ENOMEM) if allocation fails.
 */
struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
	struct sk_buff *frag;
	struct frag_hdr *fh;
	unsigned int len;

	len = state->left;
	/* IF: it doesn't fit, use 'mtu' - the data space left */
	if (len > state->mtu)
		len = state->mtu;
	/* IF: we are not sending up to and including the packet end
	   then align the next start on an eight byte boundary */
	if (len < state->left)
		len &= ~7;

	/* Allocate buffer */
	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
			 state->hroom + state->troom, GFP_ATOMIC);
	if (!frag)
		return ERR_PTR(-ENOMEM);

	/*
	 *	Set up data on packet
	 */

	ip6_copy_metadata(frag, skb);
	skb_reserve(frag, state->hroom);
	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
	skb_reset_network_header(frag);
	/* Layout: [copied headers: hlen][fragment header][payload: len] */
	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
	frag->transport_header = (frag->network_header + state->hlen +
				  sizeof(struct frag_hdr));

	/*
	 *	Charge the memory for the fragment to any owner
	 *	it might possess
	 */
	if (skb->sk)
		skb_set_owner_w(frag, skb->sk);

	/*
	 *	Copy the packet header into the new buffer.
	 */
	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

	/* Patch the "next header" byte at the same offset as @prevhdr has
	 * in the original packet, so that it announces a fragment header.
	 */
	fragnexthdr_offset = skb_network_header(frag);
	fragnexthdr_offset += prevhdr - skb_network_header(skb);
	*fragnexthdr_offset = NEXTHDR_FRAGMENT;

	/*
	 *	Build fragment header.
	 */
	fh->nexthdr = state->nexthdr;
	fh->reserved = 0;
	fh->identification = state->frag_id;

	/*
	 *	Copy a block of the IP datagram.
	 */
	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
			     len));
	state->left -= len;

	fh->frag_off = htons(state->offset);
	/* More payload remains: set the MF flag. */
	if (state->left > 0)
		fh->frag_off |= htons(IP6_MF);
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

	state->ptr += len;
	state->offset += len;

	return frag;
}
EXPORT_SYMBOL(ip6_frag_next);
844 
/*
 * Fragment @skb so that every piece fits the path MTU and pass each
 * fragment to @output.
 *
 * Two strategies are used:
 *  - fast path: if the skb has a frag_list whose geometry already
 *    matches (sizes, 8-byte alignment, headroom, not cloned/shared), the
 *    list members are turned into fragments in place;
 *  - slow path: otherwise new skbs are allocated and the payload is
 *    copied into them chunk by chunk.
 *
 * @net:	network namespace, used for the ident and the statistics
 * @sk:		socket passed through to @output
 * @skb:	packet to fragment; always consumed
 * @output:	called once per fragment
 *
 * Returns 0 on success, -EMSGSIZE if the packet may not be fragmented
 * or does not fit (an ICMPv6 Packet Too Big is sent), or the error from
 * header parsing, checksumming, allocation or @output.
 */
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
	/* Per-socket settings are only consulted outside of device
	 * transmit recursion.
	 */
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	/* Saved so that every fragment gets the same delivery time. */
	bool mono_delivery_time = skb->mono_delivery_time;
	struct ip6_frag_state state;
	unsigned int mtu, hlen, nexthdr_offset;
	ktime_t tstamp = skb->tstamp;
	int hroom, err = 0;
	__be32 frag_id;
	u8 *prevhdr, nexthdr = 0;

	/* On success the return value is the length of the headers that
	 * precede the fragment header; prevhdr points at the "next header"
	 * byte to patch.
	 */
	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	/* Remember prevhdr as an offset; the pointer is recomputed after
	 * skb_checksum_help() below.
	 */
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb is not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	/* A non-zero per-socket frag_size below the MTU takes precedence. */
	if (np && np->frag_size < mtu) {
		if (np->frag_size)
			mtu = np->frag_size;
	}
	/* Need room for the headers, the fragment header and at least
	 * 8 bytes of payload.
	 */
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	/* From here on, mtu is the payload space per fragment. */
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	/* Resolve a pending partial checksum before splitting the payload. */
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct ip6_fraglist_iter iter;
		struct sk_buff *frag2;

		/* The head skb must itself be a valid first fragment. */
		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			/* Hand socket ownership and truesize accounting over
			 * to the individual fragment.
			 */
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
					&iter);
		if (err < 0)
			goto fail;

		/* We prevent @rt from being freed. */
		rcu_read_lock();

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (iter.frag)
				ip6_fraglist_prepare(skb, &iter);

			skb_set_delivery_time(skb, tstamp, mono_delivery_time);
			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !iter.frag)
				break;

			skb = ip6_fraglist_next(&iter);
		}

		kfree(iter.tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			rcu_read_unlock();
			return 0;
		}

		/* Output failed: free the fragments not yet sent. */
		kfree_skb_list(iter.frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		rcu_read_unlock();
		return err;

slow_path_clean:
		/* Undo the ownership/truesize changes made to the list
		 * members visited before the offending one.
		 */
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	/*
	 *	Fragment the datagram.
	 */

	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
		      &state);

	/*
	 *	Keep copying data until we run out.
	 */

	while (state.left > 0) {
		frag = ip6_frag_next(skb, &state);
		if (IS_ERR(frag)) {
			err = PTR_ERR(frag);
			goto fail;
		}

		/*
		 *	Put this fragment into the sending queue.
		 */
		skb_set_delivery_time(frag, tstamp, mono_delivery_time);
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	/* All payload was copied into fragments; release the original. */
	consume_skb(skb);
	return err;

fail_toobig:
	if (skb->sk && dst_allfrag(skb_dst(skb)))
		sk_gso_disable(skb->sk);

	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
1033 
1034 static inline int ip6_rt_check(const struct rt6key *rt_key,
1035 			       const struct in6_addr *fl_addr,
1036 			       const struct in6_addr *addr_cache)
1037 {
1038 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1039 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1040 }
1041 
1042 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1043 					  struct dst_entry *dst,
1044 					  const struct flowi6 *fl6)
1045 {
1046 	struct ipv6_pinfo *np = inet6_sk(sk);
1047 	struct rt6_info *rt;
1048 
1049 	if (!dst)
1050 		goto out;
1051 
1052 	if (dst->ops->family != AF_INET6) {
1053 		dst_release(dst);
1054 		return NULL;
1055 	}
1056 
1057 	rt = (struct rt6_info *)dst;
1058 	/* Yes, checking route validity in not connected
1059 	 * case is not very simple. Take into account,
1060 	 * that we do not support routing by source, TOS,
1061 	 * and MSG_DONTROUTE		--ANK (980726)
1062 	 *
1063 	 * 1. ip6_rt_check(): If route was host route,
1064 	 *    check that cached destination is current.
1065 	 *    If it is network route, we still may
1066 	 *    check its validity using saved pointer
1067 	 *    to the last used address: daddr_cache.
1068 	 *    We do not want to save whole address now,
1069 	 *    (because main consumer of this service
1070 	 *    is tcp, which has not this problem),
1071 	 *    so that the last trick works only on connected
1072 	 *    sockets.
1073 	 * 2. oif also should be the same.
1074 	 */
1075 	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1076 #ifdef CONFIG_IPV6_SUBTREES
1077 	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1078 #endif
1079 	   (fl6->flowi6_oif && fl6->flowi6_oif != dst->dev->ifindex)) {
1080 		dst_release(dst);
1081 		dst = NULL;
1082 	}
1083 
1084 out:
1085 	return dst;
1086 }
1087 
/*
 * Common tail of the route lookup helpers.
 *
 * @net: network namespace for the lookup
 * @sk:  socket providing route info, may be NULL (checked before use)
 * @dst: in/out slot for the result; an existing non-NULL entry is kept
 *       unless the flow has no source address yet
 * @fl6: flow to look up; fl6->saddr is filled in when it was unspecified
 *
 * Returns 0 with *dst set on success.  On failure the dst is released,
 * *dst is set to NULL and a negative errno is returned; -ENETUNREACH
 * additionally bumps IPSTATS_MIB_OUTNOROUTES.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr)) {
		struct fib6_info *from;
		struct rt6_info *rt;

		*dst = ip6_route_output(net, sk, fl6);
		/* Only a successful lookup can supply the fib entry. */
		rt = (*dst)->error ? NULL : (struct rt6_info *)*dst;

		/* rt->from is RCU protected; hold the read lock while using it. */
		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? inet6_sk(sk)->srcprefs : 0,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if ((*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	/* Second chance lookup, now with a source address in the flow. */
	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = (struct rt6_info *) *dst;
	rcu_read_lock();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	/* err is used here as a flag: neighbour exists but is not NUD_VALID. */
	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			/* Same flow, but with the destination zeroed out. */
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	/* A v4-mapped source only pairs with a v4-mapped or unspecified destination. */
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}
1203 
1204 /**
1205  *	ip6_dst_lookup - perform route lookup on flow
1206  *	@net: Network namespace to perform lookup in
1207  *	@sk: socket which provides route info
1208  *	@dst: pointer to dst_entry * for result
1209  *	@fl6: flow to lookup
1210  *
1211  *	This function performs a route lookup on the given flow.
1212  *
1213  *	It returns zero on success, or a standard errno code on error.
1214  */
1215 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1216 		   struct flowi6 *fl6)
1217 {
1218 	*dst = NULL;
1219 	return ip6_dst_lookup_tail(net, sk, dst, fl6);
1220 }
1221 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1222 
1223 /**
1224  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1225  *	@net: Network namespace to perform lookup in
1226  *	@sk: socket which provides route info
1227  *	@fl6: flow to lookup
1228  *	@final_dst: final destination address for ipsec lookup
1229  *
1230  *	This function performs a route lookup on the given flow.
1231  *
1232  *	It returns a valid dst pointer on success, or a pointer encoded
1233  *	error code.
1234  */
1235 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1236 				      const struct in6_addr *final_dst)
1237 {
1238 	struct dst_entry *dst = NULL;
1239 	int err;
1240 
1241 	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1242 	if (err)
1243 		return ERR_PTR(err);
1244 	if (final_dst)
1245 		fl6->daddr = *final_dst;
1246 
1247 	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1248 }
1249 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1250 
1251 /**
1252  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1253  *	@sk: socket which provides the dst cache and route info
1254  *	@fl6: flow to lookup
1255  *	@final_dst: final destination address for ipsec lookup
1256  *	@connected: whether @sk is connected or not
1257  *
1258  *	This function performs a route lookup on the given flow with the
1259  *	possibility of using the cached route in the socket if it is valid.
1260  *	It will take the socket dst lock when operating on the dst cache.
1261  *	As a result, this function can only be used in process context.
1262  *
1263  *	In addition, for a connected socket, cache the dst in the socket
1264  *	if the current cache is not valid.
1265  *
1266  *	It returns a valid dst pointer on success, or a pointer encoded
1267  *	error code.
1268  */
1269 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1270 					 const struct in6_addr *final_dst,
1271 					 bool connected)
1272 {
1273 	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1274 
1275 	dst = ip6_sk_dst_check(sk, dst, fl6);
1276 	if (dst)
1277 		return dst;
1278 
1279 	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1280 	if (connected && !IS_ERR(dst))
1281 		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1282 
1283 	return dst;
1284 }
1285 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1286 
1287 /**
1288  *      ip6_dst_lookup_tunnel - perform route lookup on tunnel
1289  *      @skb: Packet for which lookup is done
1290  *      @dev: Tunnel device
1291  *      @net: Network namespace of tunnel device
1292  *      @sock: Socket which provides route info
1293  *      @saddr: Memory to store the src ip address
1294  *      @info: Tunnel information
1295  *      @protocol: IP protocol
1296  *      @use_cache: Flag to enable cache usage
1297  *      This function performs a route lookup on a tunnel
1298  *
1299  *      It returns a valid dst pointer and stores src address to be used in
1300  *      tunnel in param saddr on success, else a pointer encoded error code.
1301  */
1302 
1303 struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb,
1304 					struct net_device *dev,
1305 					struct net *net,
1306 					struct socket *sock,
1307 					struct in6_addr *saddr,
1308 					const struct ip_tunnel_info *info,
1309 					u8 protocol,
1310 					bool use_cache)
1311 {
1312 	struct dst_entry *dst = NULL;
1313 #ifdef CONFIG_DST_CACHE
1314 	struct dst_cache *dst_cache;
1315 #endif
1316 	struct flowi6 fl6;
1317 	__u8 prio;
1318 
1319 #ifdef CONFIG_DST_CACHE
1320 	dst_cache = (struct dst_cache *)&info->dst_cache;
1321 	if (use_cache) {
1322 		dst = dst_cache_get_ip6(dst_cache, saddr);
1323 		if (dst)
1324 			return dst;
1325 	}
1326 #endif
1327 	memset(&fl6, 0, sizeof(fl6));
1328 	fl6.flowi6_mark = skb->mark;
1329 	fl6.flowi6_proto = protocol;
1330 	fl6.daddr = info->key.u.ipv6.dst;
1331 	fl6.saddr = info->key.u.ipv6.src;
1332 	prio = info->key.tos;
1333 	fl6.flowlabel = ip6_make_flowinfo(prio, info->key.label);
1334 
1335 	dst = ipv6_stub->ipv6_dst_lookup_flow(net, sock->sk, &fl6,
1336 					      NULL);
1337 	if (IS_ERR(dst)) {
1338 		netdev_dbg(dev, "no route to %pI6\n", &fl6.daddr);
1339 		return ERR_PTR(-ENETUNREACH);
1340 	}
1341 	if (dst->dev == dev) { /* is this necessary? */
1342 		netdev_dbg(dev, "circular route to %pI6\n", &fl6.daddr);
1343 		dst_release(dst);
1344 		return ERR_PTR(-ELOOP);
1345 	}
1346 #ifdef CONFIG_DST_CACHE
1347 	if (use_cache)
1348 		dst_cache_set_ip6(dst_cache, dst, &fl6.saddr);
1349 #endif
1350 	*saddr = fl6.saddr;
1351 	return dst;
1352 }
1353 EXPORT_SYMBOL_GPL(ip6_dst_lookup_tunnel);
1354 
1355 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1356 					       gfp_t gfp)
1357 {
1358 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1359 }
1360 
1361 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1362 						gfp_t gfp)
1363 {
1364 	return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1365 }
1366 
1367 static void ip6_append_data_mtu(unsigned int *mtu,
1368 				int *maxfraglen,
1369 				unsigned int fragheaderlen,
1370 				struct sk_buff *skb,
1371 				struct rt6_info *rt,
1372 				unsigned int orig_mtu)
1373 {
1374 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1375 		if (!skb) {
1376 			/* first fragment, reserve header_len */
1377 			*mtu = orig_mtu - rt->dst.header_len;
1378 
1379 		} else {
1380 			/*
1381 			 * this fragment is not first, the headers
1382 			 * space is regarded as data space.
1383 			 */
1384 			*mtu = orig_mtu;
1385 		}
1386 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1387 			      + fragheaderlen - sizeof(struct frag_hdr);
1388 	}
1389 }
1390 
1391 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1392 			  struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1393 			  struct rt6_info *rt)
1394 {
1395 	struct ipv6_pinfo *np = inet6_sk(sk);
1396 	unsigned int mtu;
1397 	struct ipv6_txoptions *nopt, *opt = ipc6->opt;
1398 
1399 	/* callers pass dst together with a reference, set it first so
1400 	 * ip6_cork_release() can put it down even in case of an error.
1401 	 */
1402 	cork->base.dst = &rt->dst;
1403 
1404 	/*
1405 	 * setup for corking
1406 	 */
1407 	if (opt) {
1408 		if (WARN_ON(v6_cork->opt))
1409 			return -EINVAL;
1410 
1411 		nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1412 		if (unlikely(!nopt))
1413 			return -ENOBUFS;
1414 
1415 		nopt->tot_len = sizeof(*opt);
1416 		nopt->opt_flen = opt->opt_flen;
1417 		nopt->opt_nflen = opt->opt_nflen;
1418 
1419 		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
1420 		if (opt->dst0opt && !nopt->dst0opt)
1421 			return -ENOBUFS;
1422 
1423 		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
1424 		if (opt->dst1opt && !nopt->dst1opt)
1425 			return -ENOBUFS;
1426 
1427 		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
1428 		if (opt->hopopt && !nopt->hopopt)
1429 			return -ENOBUFS;
1430 
1431 		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
1432 		if (opt->srcrt && !nopt->srcrt)
1433 			return -ENOBUFS;
1434 
1435 		/* need source address above miyazawa*/
1436 	}
1437 	v6_cork->hop_limit = ipc6->hlimit;
1438 	v6_cork->tclass = ipc6->tclass;
1439 	if (rt->dst.flags & DST_XFRM_TUNNEL)
1440 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1441 		      READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1442 	else
1443 		mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ?
1444 			READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1445 	if (np->frag_size < mtu) {
1446 		if (np->frag_size)
1447 			mtu = np->frag_size;
1448 	}
1449 	cork->base.fragsize = mtu;
1450 	cork->base.gso_size = ipc6->gso_size;
1451 	cork->base.tx_flags = 0;
1452 	cork->base.mark = ipc6->sockc.mark;
1453 	sock_tx_timestamp(sk, ipc6->sockc.tsflags, &cork->base.tx_flags);
1454 
1455 	if (dst_allfrag(xfrm_dst_path(&rt->dst)))
1456 		cork->base.flags |= IPCORK_ALLFRAG;
1457 	cork->base.length = 0;
1458 
1459 	cork->base.transmit_time = ipc6->sockc.transmit_time;
1460 
1461 	return 0;
1462 }
1463 
1464 static int __ip6_append_data(struct sock *sk,
1465 			     struct sk_buff_head *queue,
1466 			     struct inet_cork_full *cork_full,
1467 			     struct inet6_cork *v6_cork,
1468 			     struct page_frag *pfrag,
1469 			     int getfrag(void *from, char *to, int offset,
1470 					 int len, int odd, struct sk_buff *skb),
1471 			     void *from, size_t length, int transhdrlen,
1472 			     unsigned int flags, struct ipcm6_cookie *ipc6)
1473 {
1474 	struct sk_buff *skb, *skb_prev = NULL;
1475 	struct inet_cork *cork = &cork_full->base;
1476 	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
1477 	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1478 	struct ubuf_info *uarg = NULL;
1479 	int exthdrlen = 0;
1480 	int dst_exthdrlen = 0;
1481 	int hh_len;
1482 	int copy;
1483 	int err;
1484 	int offset = 0;
1485 	bool zc = false;
1486 	u32 tskey = 0;
1487 	struct rt6_info *rt = (struct rt6_info *)cork->dst;
1488 	struct ipv6_txoptions *opt = v6_cork->opt;
1489 	int csummode = CHECKSUM_NONE;
1490 	unsigned int maxnonfragsize, headersize;
1491 	unsigned int wmem_alloc_delta = 0;
1492 	bool paged, extra_uref = false;
1493 
1494 	skb = skb_peek_tail(queue);
1495 	if (!skb) {
1496 		exthdrlen = opt ? opt->opt_flen : 0;
1497 		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1498 	}
1499 
1500 	paged = !!cork->gso_size;
1501 	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1502 	orig_mtu = mtu;
1503 
1504 	if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
1505 	    sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)
1506 		tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1507 
1508 	hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1509 
1510 	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1511 			(opt ? opt->opt_nflen : 0);
1512 
1513 	headersize = sizeof(struct ipv6hdr) +
1514 		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1515 		     (dst_allfrag(&rt->dst) ?
1516 		      sizeof(struct frag_hdr) : 0) +
1517 		     rt->rt6i_nfheader_len;
1518 
1519 	if (mtu <= fragheaderlen ||
1520 	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1521 		goto emsgsize;
1522 
1523 	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1524 		     sizeof(struct frag_hdr);
1525 
1526 	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1527 	 * the first fragment
1528 	 */
1529 	if (headersize + transhdrlen > mtu)
1530 		goto emsgsize;
1531 
1532 	if (cork->length + length > mtu - headersize && ipc6->dontfrag &&
1533 	    (sk->sk_protocol == IPPROTO_UDP ||
1534 	     sk->sk_protocol == IPPROTO_ICMPV6 ||
1535 	     sk->sk_protocol == IPPROTO_RAW)) {
1536 		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1537 				sizeof(struct ipv6hdr));
1538 		goto emsgsize;
1539 	}
1540 
1541 	if (ip6_sk_ignore_df(sk))
1542 		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1543 	else
1544 		maxnonfragsize = mtu;
1545 
1546 	if (cork->length + length > maxnonfragsize - headersize) {
1547 emsgsize:
1548 		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1549 		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1550 		return -EMSGSIZE;
1551 	}
1552 
1553 	/* CHECKSUM_PARTIAL only with no extension headers and when
1554 	 * we are not going to fragment
1555 	 */
1556 	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1557 	    headersize == sizeof(struct ipv6hdr) &&
1558 	    length <= mtu - headersize &&
1559 	    (!(flags & MSG_MORE) || cork->gso_size) &&
1560 	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1561 		csummode = CHECKSUM_PARTIAL;
1562 
1563 	if ((flags & MSG_ZEROCOPY) && length) {
1564 		struct msghdr *msg = from;
1565 
1566 		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
1567 			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
1568 				return -EINVAL;
1569 
1570 			/* Leave uarg NULL if can't zerocopy, callers should
1571 			 * be able to handle it.
1572 			 */
1573 			if ((rt->dst.dev->features & NETIF_F_SG) &&
1574 			    csummode == CHECKSUM_PARTIAL) {
1575 				paged = true;
1576 				zc = true;
1577 				uarg = msg->msg_ubuf;
1578 			}
1579 		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
1580 			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb));
1581 			if (!uarg)
1582 				return -ENOBUFS;
1583 			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
1584 			if (rt->dst.dev->features & NETIF_F_SG &&
1585 			    csummode == CHECKSUM_PARTIAL) {
1586 				paged = true;
1587 				zc = true;
1588 			} else {
1589 				uarg_to_msgzc(uarg)->zerocopy = 0;
1590 				skb_zcopy_set(skb, uarg, &extra_uref);
1591 			}
1592 		}
1593 	} else if ((flags & MSG_SPLICE_PAGES) && length) {
1594 		if (inet_sk(sk)->hdrincl)
1595 			return -EPERM;
1596 		if (rt->dst.dev->features & NETIF_F_SG &&
1597 		    getfrag == ip_generic_getfrag)
1598 			/* We need an empty buffer to attach stuff to */
1599 			paged = true;
1600 		else
1601 			flags &= ~MSG_SPLICE_PAGES;
1602 	}
1603 
1604 	/*
1605 	 * Let's try using as much space as possible.
1606 	 * Use MTU if total length of the message fits into the MTU.
1607 	 * Otherwise, we need to reserve fragment header and
1608 	 * fragment alignment (= 8-15 octects, in total).
1609 	 *
1610 	 * Note that we may need to "move" the data from the tail
1611 	 * of the buffer to the new fragment when we split
1612 	 * the message.
1613 	 *
1614 	 * FIXME: It may be fragmented into multiple chunks
1615 	 *        at once if non-fragmentable extension headers
1616 	 *        are too large.
1617 	 * --yoshfuji
1618 	 */
1619 
1620 	cork->length += length;
1621 	if (!skb)
1622 		goto alloc_new_skb;
1623 
1624 	while (length > 0) {
1625 		/* Check if the remaining data fits into current packet. */
1626 		copy = (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - skb->len;
1627 		if (copy < length)
1628 			copy = maxfraglen - skb->len;
1629 
1630 		if (copy <= 0) {
1631 			char *data;
1632 			unsigned int datalen;
1633 			unsigned int fraglen;
1634 			unsigned int fraggap;
1635 			unsigned int alloclen, alloc_extra;
1636 			unsigned int pagedlen;
1637 alloc_new_skb:
1638 			/* There's no room in the current skb */
1639 			if (skb)
1640 				fraggap = skb->len - maxfraglen;
1641 			else
1642 				fraggap = 0;
1643 			/* update mtu and maxfraglen if necessary */
1644 			if (!skb || !skb_prev)
1645 				ip6_append_data_mtu(&mtu, &maxfraglen,
1646 						    fragheaderlen, skb, rt,
1647 						    orig_mtu);
1648 
1649 			skb_prev = skb;
1650 
1651 			/*
1652 			 * If remaining data exceeds the mtu,
1653 			 * we know we need more fragment(s).
1654 			 */
1655 			datalen = length + fraggap;
1656 
1657 			if (datalen > (cork->length <= mtu && !(cork->flags & IPCORK_ALLFRAG) ? mtu : maxfraglen) - fragheaderlen)
1658 				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1659 			fraglen = datalen + fragheaderlen;
1660 			pagedlen = 0;
1661 
1662 			alloc_extra = hh_len;
1663 			alloc_extra += dst_exthdrlen;
1664 			alloc_extra += rt->dst.trailer_len;
1665 
1666 			/* We just reserve space for fragment header.
1667 			 * Note: this may be overallocation if the message
1668 			 * (without MSG_MORE) fits into the MTU.
1669 			 */
1670 			alloc_extra += sizeof(struct frag_hdr);
1671 
1672 			if ((flags & MSG_MORE) &&
1673 			    !(rt->dst.dev->features&NETIF_F_SG))
1674 				alloclen = mtu;
1675 			else if (!paged &&
1676 				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1677 				  !(rt->dst.dev->features & NETIF_F_SG)))
1678 				alloclen = fraglen;
1679 			else {
1680 				alloclen = fragheaderlen + transhdrlen;
1681 				pagedlen = datalen - transhdrlen;
1682 			}
1683 			alloclen += alloc_extra;
1684 
1685 			if (datalen != length + fraggap) {
1686 				/*
1687 				 * this is not the last fragment, the trailer
1688 				 * space is regarded as data space.
1689 				 */
1690 				datalen += rt->dst.trailer_len;
1691 			}
1692 
1693 			fraglen = datalen + fragheaderlen;
1694 
1695 			copy = datalen - transhdrlen - fraggap - pagedlen;
1696 			if (copy < 0) {
1697 				err = -EINVAL;
1698 				goto error;
1699 			}
1700 			if (transhdrlen) {
1701 				skb = sock_alloc_send_skb(sk, alloclen,
1702 						(flags & MSG_DONTWAIT), &err);
1703 			} else {
1704 				skb = NULL;
1705 				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1706 				    2 * sk->sk_sndbuf)
1707 					skb = alloc_skb(alloclen,
1708 							sk->sk_allocation);
1709 				if (unlikely(!skb))
1710 					err = -ENOBUFS;
1711 			}
1712 			if (!skb)
1713 				goto error;
1714 			/*
1715 			 *	Fill in the control structures
1716 			 */
1717 			skb->protocol = htons(ETH_P_IPV6);
1718 			skb->ip_summed = csummode;
1719 			skb->csum = 0;
1720 			/* reserve for fragmentation and ipsec header */
1721 			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1722 				    dst_exthdrlen);
1723 
1724 			/*
1725 			 *	Find where to start putting bytes
1726 			 */
1727 			data = skb_put(skb, fraglen - pagedlen);
1728 			skb_set_network_header(skb, exthdrlen);
1729 			data += fragheaderlen;
1730 			skb->transport_header = (skb->network_header +
1731 						 fragheaderlen);
1732 			if (fraggap) {
1733 				skb->csum = skb_copy_and_csum_bits(
1734 					skb_prev, maxfraglen,
1735 					data + transhdrlen, fraggap);
1736 				skb_prev->csum = csum_sub(skb_prev->csum,
1737 							  skb->csum);
1738 				data += fraggap;
1739 				pskb_trim_unique(skb_prev, maxfraglen);
1740 			}
1741 			if (copy > 0 &&
1742 			    getfrag(from, data + transhdrlen, offset,
1743 				    copy, fraggap, skb) < 0) {
1744 				err = -EFAULT;
1745 				kfree_skb(skb);
1746 				goto error;
1747 			}
1748 
1749 			offset += copy;
1750 			length -= copy + transhdrlen;
1751 			transhdrlen = 0;
1752 			exthdrlen = 0;
1753 			dst_exthdrlen = 0;
1754 
1755 			/* Only the initial fragment is time stamped */
1756 			skb_shinfo(skb)->tx_flags = cork->tx_flags;
1757 			cork->tx_flags = 0;
1758 			skb_shinfo(skb)->tskey = tskey;
1759 			tskey = 0;
1760 			skb_zcopy_set(skb, uarg, &extra_uref);
1761 
1762 			if ((flags & MSG_CONFIRM) && !skb_prev)
1763 				skb_set_dst_pending_confirm(skb, 1);
1764 
1765 			/*
1766 			 * Put the packet on the pending queue
1767 			 */
1768 			if (!skb->destructor) {
1769 				skb->destructor = sock_wfree;
1770 				skb->sk = sk;
1771 				wmem_alloc_delta += skb->truesize;
1772 			}
1773 			__skb_queue_tail(queue, skb);
1774 			continue;
1775 		}
1776 
1777 		if (copy > length)
1778 			copy = length;
1779 
1780 		if (!(rt->dst.dev->features&NETIF_F_SG) &&
1781 		    skb_tailroom(skb) >= copy) {
1782 			unsigned int off;
1783 
1784 			off = skb->len;
1785 			if (getfrag(from, skb_put(skb, copy),
1786 						offset, copy, off, skb) < 0) {
1787 				__skb_trim(skb, off);
1788 				err = -EFAULT;
1789 				goto error;
1790 			}
1791 		} else if (flags & MSG_SPLICE_PAGES) {
1792 			struct msghdr *msg = from;
1793 
1794 			err = skb_splice_from_iter(skb, &msg->msg_iter, copy,
1795 						   sk->sk_allocation);
1796 			if (err < 0)
1797 				goto error;
1798 			copy = err;
1799 			wmem_alloc_delta += copy;
1800 		} else if (!zc) {
1801 			int i = skb_shinfo(skb)->nr_frags;
1802 
1803 			err = -ENOMEM;
1804 			if (!sk_page_frag_refill(sk, pfrag))
1805 				goto error;
1806 
1807 			skb_zcopy_downgrade_managed(skb);
1808 			if (!skb_can_coalesce(skb, i, pfrag->page,
1809 					      pfrag->offset)) {
1810 				err = -EMSGSIZE;
1811 				if (i == MAX_SKB_FRAGS)
1812 					goto error;
1813 
1814 				__skb_fill_page_desc(skb, i, pfrag->page,
1815 						     pfrag->offset, 0);
1816 				skb_shinfo(skb)->nr_frags = ++i;
1817 				get_page(pfrag->page);
1818 			}
1819 			copy = min_t(int, copy, pfrag->size - pfrag->offset);
1820 			if (getfrag(from,
1821 				    page_address(pfrag->page) + pfrag->offset,
1822 				    offset, copy, skb->len, skb) < 0)
1823 				goto error_efault;
1824 
1825 			pfrag->offset += copy;
1826 			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1827 			skb->len += copy;
1828 			skb->data_len += copy;
1829 			skb->truesize += copy;
1830 			wmem_alloc_delta += copy;
1831 		} else {
1832 			err = skb_zerocopy_iter_dgram(skb, from, copy);
1833 			if (err < 0)
1834 				goto error;
1835 		}
1836 		offset += copy;
1837 		length -= copy;
1838 	}
1839 
1840 	if (wmem_alloc_delta)
1841 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1842 	return 0;
1843 
1844 error_efault:
1845 	err = -EFAULT;
1846 error:
1847 	net_zcopy_put_abort(uarg, extra_uref);
1848 	cork->length -= length;
1849 	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1850 	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1851 	return err;
1852 }
1853 
1854 int ip6_append_data(struct sock *sk,
1855 		    int getfrag(void *from, char *to, int offset, int len,
1856 				int odd, struct sk_buff *skb),
1857 		    void *from, size_t length, int transhdrlen,
1858 		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1859 		    struct rt6_info *rt, unsigned int flags)
1860 {
1861 	struct inet_sock *inet = inet_sk(sk);
1862 	struct ipv6_pinfo *np = inet6_sk(sk);
1863 	int exthdrlen;
1864 	int err;
1865 
1866 	if (flags&MSG_PROBE)
1867 		return 0;
1868 	if (skb_queue_empty(&sk->sk_write_queue)) {
1869 		/*
1870 		 * setup for corking
1871 		 */
1872 		dst_hold(&rt->dst);
1873 		err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1874 				     ipc6, rt);
1875 		if (err)
1876 			return err;
1877 
1878 		inet->cork.fl.u.ip6 = *fl6;
1879 		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1880 		length += exthdrlen;
1881 		transhdrlen += exthdrlen;
1882 	} else {
1883 		transhdrlen = 0;
1884 	}
1885 
1886 	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
1887 				 &np->cork, sk_page_frag(sk), getfrag,
1888 				 from, length, transhdrlen, flags, ipc6);
1889 }
1890 EXPORT_SYMBOL_GPL(ip6_append_data);
1891 
1892 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1893 {
1894 	struct dst_entry *dst = cork->base.dst;
1895 
1896 	cork->base.dst = NULL;
1897 	cork->base.flags &= ~IPCORK_ALLFRAG;
1898 	skb_dst_set(skb, dst);
1899 }
1900 
1901 static void ip6_cork_release(struct inet_cork_full *cork,
1902 			     struct inet6_cork *v6_cork)
1903 {
1904 	if (v6_cork->opt) {
1905 		struct ipv6_txoptions *opt = v6_cork->opt;
1906 
1907 		kfree(opt->dst0opt);
1908 		kfree(opt->dst1opt);
1909 		kfree(opt->hopopt);
1910 		kfree(opt->srcrt);
1911 		kfree(opt);
1912 		v6_cork->opt = NULL;
1913 	}
1914 
1915 	if (cork->base.dst) {
1916 		dst_release(cork->base.dst);
1917 		cork->base.dst = NULL;
1918 		cork->base.flags &= ~IPCORK_ALLFRAG;
1919 	}
1920 }
1921 
/*
 * __ip6_make_skb - turn the queued skbs into one packet with an IPv6 header
 * @sk:      sending socket
 * @queue:   queue of pending skbs
 * @cork:    cork state holding the dst, flow, mark and transmit time
 * @v6_cork: IPv6 cork state holding tx options, tclass and hop limit
 *
 * The first skb on @queue becomes the head; every following skb is
 * chained onto its frag_list.  Extension headers and the IPv6 header are
 * then pushed in front of the head, the dst is moved from the cork to
 * the skb and the cork is released.
 *
 * Returns the assembled skb, or NULL when @queue is empty (in that case
 * the cork is left untouched).
 */
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork,
			       struct inet6_cork *v6_cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr *final_dst;
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt = v6_cork->opt;
	/* Read before ip6_cork_steal_dst() clears cork->base.dst below. */
	struct rt6_info *rt = (struct rt6_info *)cork->base.dst;
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Chain the remaining skbs onto the head's frag_list and fold their
	 * sizes into the head; they no longer carry a socket or destructor
	 * of their own.
	 */
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);
	__skb_pull(skb, skb_network_header_len(skb));

	/* The push helpers may update proto and, for the non-fragmentable
	 * options, final_dst.
	 */
	final_dst = &fl6->daddr;
	if (opt && opt->opt_flen)
		ipv6_push_frag_opts(skb, opt, &proto);
	if (opt && opt->opt_nflen)
		ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);

	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, v6_cork->tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, np), fl6));
	hdr->hop_limit = v6_cork->hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	/* NOTE(review): sk->sk_priority is a plain read here; if this path
	 * can run without the socket lock, confirm whether READ_ONCE() is
	 * wanted.
	 */
	skb->priority = sk->sk_priority;
	skb->mark = cork->base.mark;
	skb->tstamp = cork->base.transmit_time;

	ip6_cork_steal_dst(skb, cork);
	IP6_UPD_PO_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUT, skb->len);
	if (proto == IPPROTO_ICMPV6) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
		u8 icmp6_type;

		/* For raw sockets without hdrincl the type comes from the
		 * flow; otherwise it is read from the packet itself.
		 */
		if (sk->sk_socket->type == SOCK_RAW && !inet_sk(sk)->hdrincl)
			icmp6_type = fl6->fl6_icmp_type;
		else
			icmp6_type = icmp6_hdr(skb)->icmp6_type;
		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	/* The dst was stolen above, so this only frees the options. */
	ip6_cork_release(cork, v6_cork);
out:
	return skb;
}
2001 
2002 int ip6_send_skb(struct sk_buff *skb)
2003 {
2004 	struct net *net = sock_net(skb->sk);
2005 	struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
2006 	int err;
2007 
2008 	err = ip6_local_out(net, skb->sk, skb);
2009 	if (err) {
2010 		if (err > 0)
2011 			err = net_xmit_errno(err);
2012 		if (err)
2013 			IP6_INC_STATS(net, rt->rt6i_idev,
2014 				      IPSTATS_MIB_OUTDISCARDS);
2015 	}
2016 
2017 	return err;
2018 }
2019 
/*
 * Finish the socket's pending message and send it.  Returns 0 when there
 * was nothing to send, otherwise the result of ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
2031 
2032 static void __ip6_flush_pending_frames(struct sock *sk,
2033 				       struct sk_buff_head *queue,
2034 				       struct inet_cork_full *cork,
2035 				       struct inet6_cork *v6_cork)
2036 {
2037 	struct sk_buff *skb;
2038 
2039 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
2040 		if (skb_dst(skb))
2041 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
2042 				      IPSTATS_MIB_OUTDISCARDS);
2043 		kfree_skb(skb);
2044 	}
2045 
2046 	ip6_cork_release(cork, v6_cork);
2047 }
2048 
2049 void ip6_flush_pending_frames(struct sock *sk)
2050 {
2051 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
2052 				   &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
2053 }
2054 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2055 
2056 struct sk_buff *ip6_make_skb(struct sock *sk,
2057 			     int getfrag(void *from, char *to, int offset,
2058 					 int len, int odd, struct sk_buff *skb),
2059 			     void *from, size_t length, int transhdrlen,
2060 			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
2061 			     unsigned int flags, struct inet_cork_full *cork)
2062 {
2063 	struct inet6_cork v6_cork;
2064 	struct sk_buff_head queue;
2065 	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2066 	int err;
2067 
2068 	if (flags & MSG_PROBE) {
2069 		dst_release(&rt->dst);
2070 		return NULL;
2071 	}
2072 
2073 	__skb_queue_head_init(&queue);
2074 
2075 	cork->base.flags = 0;
2076 	cork->base.addr = 0;
2077 	cork->base.opt = NULL;
2078 	v6_cork.opt = NULL;
2079 	err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
2080 	if (err) {
2081 		ip6_cork_release(cork, &v6_cork);
2082 		return ERR_PTR(err);
2083 	}
2084 	if (ipc6->dontfrag < 0)
2085 		ipc6->dontfrag = inet6_sk(sk)->dontfrag;
2086 
2087 	err = __ip6_append_data(sk, &queue, cork, &v6_cork,
2088 				&current->task_frag, getfrag, from,
2089 				length + exthdrlen, transhdrlen + exthdrlen,
2090 				flags, ipc6);
2091 	if (err) {
2092 		__ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2093 		return ERR_PTR(err);
2094 	}
2095 
2096 	return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2097 }
2098