xref: /linux/net/ipv6/ip6_output.c (revision fcee7d82f27d6a8b1ddc5bbefda59b4e441e9bc0)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *	IPv6 output functions
4  *	Linux INET6 implementation
5  *
6  *	Authors:
7  *	Pedro Roque		<roque@di.fc.ul.pt>
8  *
9  *	Based on linux/net/ipv4/ip_output.c
10  *
11  *	Changes:
 12  *	A.N.Kuznetsov	:	arithmetics in fragmentation.
13  *				extension headers are implemented.
14  *				route changes now work.
15  *				ip6_forward does not confuse sniffers.
16  *				etc.
17  *
18  *      H. von Brand    :       Added missing #include <linux/string.h>
19  *	Imran Patel	:	frag id should be in NBO
20  *      Kazunori MIYAZAWA @USAGI
21  *			:       add ip6_append_data and related functions
22  *				for datagram xmit
23  */
24 
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37 
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41 
42 #include <net/sock.h>
43 #include <net/snmp.h>
44 
45 #include <net/gso.h>
46 #include <net/ipv6.h>
47 #include <net/ndisc.h>
48 #include <net/protocol.h>
49 #include <net/ip6_route.h>
50 #include <net/addrconf.h>
51 #include <net/rawv6.h>
52 #include <net/icmp.h>
53 #include <net/xfrm.h>
54 #include <net/checksum.h>
55 #include <linux/mroute6.h>
56 #include <net/l3mdev.h>
57 #include <net/lwtunnel.h>
58 #include <net/ip_tunnels.h>
59 
/* Transmit @skb on the device attached to its dst: resolve the L2
 * neighbour for the IPv6 next hop and hand the packet to it.  Also
 * handles multicast loopback/scope filtering and lwtunnel redirects.
 * Runs under rcu_read_lock() (required by dst_dev_rcu()).
 */
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst_dev_rcu(dst);
	struct inet6_dev *idev = ip6_dst_idev(dst);
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	const struct in6_addr *daddr, *nexthop;
	struct ipv6hdr *hdr;
	struct neighbour *neigh;
	int ret;

	/* Be paranoid, rather than too clever. */
	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
		/* Grow headroom for the link-layer header;
		 * skb_expand_head() frees the skb on failure.
		 * idev stays alive because we hold rcu_read_lock().
		 */
		skb = skb_expand_head(skb, hh_len);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			return -ENOMEM;
		}
	}

	hdr = ipv6_hdr(skb);
	daddr = &hdr->daddr;
	if (unlikely(ipv6_addr_is_multicast(daddr))) {
		/* Loop a copy back to local listeners when the socket
		 * enables multicast loopback and either a multicast-router
		 * socket will consume it or this device has a member of
		 * the destination group.
		 */
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			/* Hop limit 0: the sender only wanted the
			 * looped-back copy; never put it on the wire.
			 */
			if (hdr->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
		/* Node-local scope multicast must never leave the host. */
		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		/* Anything but LWTUNNEL_XMIT_CONTINUE means the tunnel
		 * consumed (or failed) the packet.
		 */
		if (res != LWTUNNEL_XMIT_CONTINUE)
			return res;
	}

	IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);

	nexthop = rt6_nexthop(dst_rt6_info(dst), daddr);
	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);

	if (IS_ERR_OR_NULL(neigh)) {
		/* No cached neighbour entry: create one (refcount-less,
		 * we are under RCU).
		 */
		if (unlikely(!neigh))
			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
		if (IS_ERR(neigh)) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
			return -EINVAL;
		}
	}
	sock_confirm_neigh(skb, neigh);
	ret = neigh_output(neigh, skb, false);
	return ret;
}
139 
/* GSO packet whose segments exceed the egress MTU: segment it in
 * software, then push each segment through ip6_finish_output2(),
 * fragmenting any segment still larger than @mtu.
 * Returns the first error encountered, or 0 if all segments went out.
 */
static int
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
				    struct sk_buff *skb, unsigned int mtu)
{
	struct sk_buff *segs, *nskb;
	netdev_features_t features;
	int ret = 0;

	/* Please see corresponding comment in ip_finish_output_gso
	 * describing the cases where GSO segment length exceeds the
	 * egress MTU.
	 */
	features = netif_skb_features(skb);
	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
	if (IS_ERR_OR_NULL(segs)) {
		kfree_skb(skb);
		return -ENOMEM;
	}

	/* The original skb is fully represented by the segment list now. */
	consume_skb(skb);

	skb_list_walk_safe(segs, segs, nskb) {
		int err;

		skb_mark_not_on_list(segs);
		/* Last GSO segment can be smaller than gso_size (and MTU).
		 * Adding a fragment header would produce an "atomic fragment",
		 * which is considered harmful (RFC-8021). Avoid that.
		 */
		err = segs->len > mtu ?
			ip6_fragment(net, sk, segs, ip6_finish_output2) :
			ip6_finish_output2(net, sk, segs);
		/* Remember the first failure but keep sending the rest. */
		if (err && ret == 0)
			ret = err;
	}

	return ret;
}
178 
/* Dispatch a GSO packet: fast path when every segment fits @mtu,
 * otherwise software-segment and fragment via the slow path.
 */
static int ip6_finish_output_gso(struct net *net, struct sock *sk,
				 struct sk_buff *skb, unsigned int mtu)
{
	/* Common case: all segments of this GSO packet fit the MTU. */
	if (likely(skb_gso_validate_network_len(skb, mtu)))
		return ip6_finish_output2(net, sk, skb);

	return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
}
187 
/* Post-POST_ROUTING output step: re-route packets that gained an XFRM
 * policy after SNAT, then transmit directly, via GSO handling, or via
 * fragmentation depending on packet size vs. the path MTU.
 */
static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	unsigned int mtu;

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		/* Mark the skb so the POST_ROUTING hook is skipped when
		 * dst_output() re-enters ip6_output().
		 */
		IP6CB(skb)->flags |= IP6SKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	mtu = ip6_skb_dst_mtu(skb);
	if (skb_is_gso(skb))
		return ip6_finish_output_gso(net, sk, skb, mtu);

	/* Fragment when the packet exceeds the MTU, or when conntrack
	 * defrag recorded a smaller original fragment size.
	 */
	if (unlikely(skb->len > mtu ||
	    (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);

	return ip6_finish_output2(net, sk, skb);
}
210 
ip6_finish_output(struct net * net,struct sock * sk,struct sk_buff * skb)211 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
212 {
213 	int ret;
214 
215 	ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
216 	switch (ret) {
217 	case NET_XMIT_SUCCESS:
218 	case NET_XMIT_CN:
219 		return __ip6_finish_output(net, sk, skb) ? : ret;
220 	default:
221 		kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
222 		return ret;
223 	}
224 }
225 
/* Network-layer output entry point (dst->output) for IPv6 packets.
 * Drops the packet if IPv6 is disabled on the egress device, otherwise
 * runs the NF_INET_POST_ROUTING hook and continues in
 * ip6_finish_output().  Holds rcu_read_lock() across the whole path so
 * dev/idev obtained from the dst stay valid.
 */
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev, *indev = skb->dev;
	struct inet6_dev *idev;
	int ret;

	skb->protocol = htons(ETH_P_IPV6);
	rcu_read_lock();
	dev = dst_dev_rcu(dst);
	idev = ip6_dst_idev(dst);
	skb->dev = dev;

	if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		rcu_read_unlock();
		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
		return 0;
	}

	/* Skip the hook for packets already re-routed by
	 * __ip6_finish_output() (IP6SKB_REROUTED set).
	 */
	ret = NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			   net, sk, skb, indev, dev,
			   ip6_finish_output,
			   !(IP6CB(skb)->flags & IP6SKB_REROUTED));
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(ip6_output);
254 
ip6_autoflowlabel(struct net * net,const struct sock * sk)255 bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
256 {
257 	if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk))
258 		return ip6_default_np_autolabel(net);
259 	return inet6_test_bit(AUTOFLOWLABEL, sk);
260 }
261 
ip6_dst_hoplimit(struct dst_entry * dst)262 int ip6_dst_hoplimit(struct dst_entry *dst)
263 {
264 	int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
265 
266 	rcu_read_lock();
267 	if (hoplimit == 0) {
268 		struct net_device *dev = dst_dev_rcu(dst);
269 		struct inet6_dev *idev;
270 
271 		idev = __in6_dev_get(dev);
272 		if (idev)
273 			hoplimit = READ_ONCE(idev->cnf.hop_limit);
274 		else
275 			hoplimit = READ_ONCE(dev_net(dev)->ipv6.devconf_all->hop_limit);
276 	}
277 	rcu_read_unlock();
278 
279 	return hoplimit;
280 }
281 EXPORT_SYMBOL(ip6_dst_hoplimit);
282 
283 /*
284  * xmit an sk_buff (used by TCP and SCTP)
285  * Note : socket lock is not held for SYNACK packets, but might be modified
286  * by calls to skb_set_owner_w() and ipv6_local_error(),
287  * which are using proper atomic operations or spinlocks.
288  */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct inet6_dev *idev = ip6_dst_idev(dst);
	struct net *net = sock_net(sk);
	unsigned int head_room;
	struct net_device *dev;
	struct ipv6hdr *hdr;
	u8  proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int ret, hlimit = -1;
	u32 mtu;

	rcu_read_lock();

	dev = dst_dev_rcu(dst);
	/* Reserve room for the IPv6 header, the link-layer header and
	 * any extension headers carried in @opt.
	 */
	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(head_room > skb_headroom(skb))) {
		/* idev stays alive while we hold rcu_read_lock(). */
		skb = skb_expand_head(skb, head_room);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			ret = -ENOBUFS;
			goto unlock;
		}
	}

	if (unlikely(opt)) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		/* Push the fragmentable-part options... */
		if (opt->opt_flen)
			proto = ipv6_push_frag_opts(skb, opt, proto);

		/* ...then the non-fragmentable ones; this may rewrite
		 * first_hop (e.g. when a routing header is present).
		 */
		if (opt->opt_nflen)
			proto = ipv6_push_nfrag_opts(skb, opt, proto,
						     &first_hop,
						     &fl6->saddr);
	}

	/* Payloads above IPV6_MAXPLEN cannot be expressed in
	 * payload_len; 0 signals that case.
	 */
	if (unlikely(seg_len > IPV6_MAXPLEN))
		seg_len = 0;

	__skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 *	Fill in the IPv6 header
	 */
	if (np)
		hlimit = READ_ONCE(np->hop_limit);
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, sk), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = priority;
	skb->mark = mark;

	mtu = dst6_mtu(dst);
	if (likely((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb))) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb)) {
			/* l3mdev consumed the skb. */
			ret = 0;
			goto unlock;
		}

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		ret = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			      net, (struct sock *)sk, skb, NULL, dev,
			      dst_output);
		goto unlock;
	}

	/* Packet exceeds the MTU and must not be fragmented here:
	 * report EMSGSIZE to the sender and drop.
	 */
	ret = -EMSGSIZE;
	skb->dev = dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
	kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
unlock:
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(ip6_xmit);
399 
/* Deliver a Router Alert packet to every raw socket on ip6_ra_chain
 * whose selector matches @sel.  All matches but the last get a clone;
 * the last consumes the original skb.
 * Returns 1 if at least one socket took the packet, 0 otherwise
 * (caller still owns the skb in that case).
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {

			/* Honour RTALERT_ISOLATE: never deliver across
			 * network namespaces.
			 */
			if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
433 
/* Decide what to do with a to-be-forwarded packet whose destination is
 * an NDP-proxied address.  Returns 1 when the packet is a neighbour-
 * discovery ICMPv6 message that should be delivered locally instead,
 * -1 when it must be dropped (link-local destination), and 0 to
 * continue normal forwarding.
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	/* Skip extension headers to locate the transport header. */
	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		/* Need at least the ICMPv6 type byte in linear data. */
		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
		/* Re-read: pskb_may_pull() may have moved the header. */
		hdr = ipv6_hdr(skb);
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
486 
/* Final forwarding step after the NF_INET_FORWARD hook: clear the
 * receive timestamp and push the packet to the output path.
 */
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
#ifdef CONFIG_NET_SWITCHDEV
	/* Packet was already forwarded in hardware; release our copy. */
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	skb_clear_tstamp(skb);
	return dst_output(net, sk, skb);
}
500 
ip6_pkt_too_big(const struct sk_buff * skb,unsigned int mtu)501 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
502 {
503 	if (skb->len <= mtu)
504 		return false;
505 
506 	/* ipv6 conntrack defrag sets max_frag_size + ignore_df */
507 	if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
508 		return true;
509 
510 	if (skb->ignore_df)
511 		return false;
512 
513 	if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
514 		return false;
515 
516 	return true;
517 }
518 
/* Forward an IPv6 packet received on one interface out another.
 * Performs forwarding-policy, hop-limit, NDP-proxy, XFRM and MTU
 * checks, sends ICMPv6 errors/redirects where required, decrements the
 * hop limit and hands the packet to the NF_INET_FORWARD hook.
 * The skb is always either queued or freed.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst_dev(dst));
	struct net_device *dev;
	struct inet6_dev *idev;
	SKB_DR(reason);
	u32 mtu;

	/* idev of the *ingress* device, for stats and per-device knobs. */
	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	if (!READ_ONCE(net->ipv6.devconf_all->forwarding) &&
	    (!idev || !READ_ONCE(idev->cnf.force_forwarding)))
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	/* Socket-owned skbs must not be forwarded. */
	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!READ_ONCE(net->ipv6.devconf_all->disable_policy) &&
	    (!idev || !READ_ONCE(idev->cnf.disable_policy)) &&
	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 *	We DO NOT make any processing on
	 *	RA packets, pushing them to user level AS IS
	 *	without any WARRANTY that application will be able
	 *	to interpret them. The reason is that we
	 *	cannot make anything clever here.
	 *
	 *	We are not end-node, so that if packet contains
	 *	AH/ESP, we cannot make anything.
	 *	Defragmentation also would be mistake, RA packets
	 *	cannot be fragmented, because there is no warranty
	 *	that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 *	check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) {
		int proxied = ip6_forward_proxy_check(skb);

		/* Re-read: the proxy check may have pulled skb data. */
		hdr = ipv6_hdr(skb);
		if (proxied > 0) {
			/* It's tempting to decrease the hop limit
			 * here by 1, as we do at the end of the
			 * function too.
			 *
			 * But that would be incorrect, as proxying is
			 * not forwarding.  The ip6_input function
			 * will handle this packet locally, and it
			 * depends on the hop limit being unchanged.
			 *
			 * One example is the NDP hop limit, that
			 * always has to stay 255, but other would be
			 * similar checks around RA packets, where the
			 * user can even change the desired limit.
			 */
			return ip6_input(skb);
		} else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		SKB_DR_SET(reason, XFRM_POLICY);
		goto drop;
	}
	/* xfrm6_route_forward() may have installed a new dst. */
	dst = skb_dst(skb);
	dev = dst_dev(dst);
	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = dst_rt6_info(dst);
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		rcu_read_lock();
		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		rcu_read_unlock();
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);

	mtu = ip6_dst_mtu_maybe_forward(dst, true);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	if (unlikely(ip6_pkt_too_big(skb, mtu))) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
	SKB_DR_SET(reason, IP_INADDRERRORS);
drop:
	kfree_skb_reason(skb, reason);
	return -EINVAL;
}
701 
/* Copy per-packet metadata (routing, QoS, netfilter and security
 * state) from the original skb to a freshly built fragment.
 */
static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
{
	to->pkt_type = from->pkt_type;
	to->priority = from->priority;
	to->protocol = from->protocol;
	/* Replace any dst on @to with a reference to @from's. */
	skb_dst_drop(to);
	skb_dst_set(to, dst_clone(skb_dst(from)));
	to->dev = from->dev;
	to->mark = from->mark;

	skb_copy_hash(to, from);

#ifdef CONFIG_NET_SCHED
	to->tc_index = from->tc_index;
#endif
	nf_copy(to, from);
	skb_ext_copy(to, from);
	skb_copy_secmark(to, from);
}
721 
/* Set up fast-path fragmentation over an existing frag_list: save a
 * copy of the unfragmentable headers in @iter, detach the frag list,
 * and turn @skb into the first fragment by inserting a fragment header
 * (IP6_MF set, offset 0) between those headers and the payload.
 * Returns 0 or -ENOMEM.
 */
int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
		      u8 nexthdr, __be32 frag_id,
		      struct ip6_fraglist_iter *iter)
{
	unsigned int first_len;
	struct frag_hdr *fh;

	/* BUILD HEADER */
	*prevhdr = NEXTHDR_FRAGMENT;
	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
	if (!iter->tmp_hdr)
		return -ENOMEM;

	/* Take over the frag list; the skbs on it become the tail
	 * fragments handled by ip6_fraglist_prepare()/_next().
	 */
	iter->frag = skb_shinfo(skb)->frag_list;
	skb_frag_list_init(skb);

	iter->offset = 0;
	iter->hlen = hlen;
	iter->frag_id = frag_id;
	iter->nexthdr = nexthdr;

	/* Open a gap for the fragment header, then restore the saved
	 * headers in front of it.
	 */
	__skb_pull(skb, hlen);
	fh = __skb_push(skb, sizeof(struct frag_hdr));
	__skb_push(skb, hlen);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

	fh->nexthdr = nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(IP6_MF);
	fh->identification = frag_id;

	/* First fragment now covers only the head + page frags. */
	first_len = skb_pagelen(skb);
	skb->data_len = first_len - skb_headlen(skb);
	skb->len = first_len;
	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

	return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);
762 
/* Turn the next skb on the iterator's frag list into a standalone
 * fragment: prepend the saved unfragmentable headers and a fragment
 * header carrying the running offset, and copy packet metadata from
 * @skb (the previous fragment).
 */
void ip6_fraglist_prepare(struct sk_buff *skb,
			  struct ip6_fraglist_iter *iter)
{
	struct sk_buff *frag = iter->frag;
	unsigned int hlen = iter->hlen;
	struct frag_hdr *fh;

	frag->ip_summed = CHECKSUM_NONE;
	skb_reset_transport_header(frag);
	fh = __skb_push(frag, sizeof(struct frag_hdr));
	__skb_push(frag, hlen);
	skb_reset_network_header(frag);
	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
	/* Advance by the payload length of the previous fragment. */
	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
	fh->nexthdr = iter->nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(iter->offset);
	/* More-fragments bit on all but the last fragment. */
	if (frag->next)
		fh->frag_off |= htons(IP6_MF);
	fh->identification = iter->frag_id;
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
	ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);
787 
/* Initialize slow-path fragmentation state for @skb.
 * @hlen is the length of the unfragmentable header part, @mtu the
 * maximum payload per fragment, and @hdr_room/@needed_tailroom the
 * head/tail space to reserve in each fragment skb.
 */
void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
{
	state->prevhdr = prevhdr;
	state->nexthdr = nexthdr;
	state->frag_id = frag_id;

	state->hlen = hlen;
	state->mtu = mtu;

	state->left = skb->len - hlen;	/* Space per frame */
	state->ptr = hlen;		/* Where to start from */

	state->hroom = hdr_room;
	state->troom = needed_tailroom;

	state->offset = 0;
}
EXPORT_SYMBOL(ip6_frag_init);
808 
/* Allocate and build the next slow-path fragment: copy the
 * unfragmentable headers, insert a fragment header, and copy up to
 * state->mtu payload bytes (8-byte aligned except in the last
 * fragment).  Returns the new fragment or ERR_PTR(-ENOMEM).
 */
struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
	struct sk_buff *frag;
	struct frag_hdr *fh;
	unsigned int len;

	len = state->left;
	/* IF: it doesn't fit, use 'mtu' - the data space left */
	if (len > state->mtu)
		len = state->mtu;
	/* IF: we are not sending up to and including the packet end
	   then align the next start on an eight byte boundary */
	if (len < state->left)
		len &= ~7;

	/* Allocate buffer */
	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
			 state->hroom + state->troom, GFP_ATOMIC);
	if (!frag)
		return ERR_PTR(-ENOMEM);

	/*
	 *	Set up data on packet
	 */

	ip6_copy_metadata(frag, skb);
	skb_reserve(frag, state->hroom);
	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
	skb_reset_network_header(frag);
	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
	frag->transport_header = (frag->network_header + state->hlen +
				  sizeof(struct frag_hdr));

	/*
	 *	Charge the memory for the fragment to any owner
	 *	it might possess
	 */
	if (skb->sk)
		skb_set_owner_w(frag, skb->sk);

	/*
	 *	Copy the packet header into the new buffer.
	 */
	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

	/* Patch the copied header chain so the header that used to
	 * precede the payload now points at the fragment header.
	 */
	fragnexthdr_offset = skb_network_header(frag);
	fragnexthdr_offset += prevhdr - skb_network_header(skb);
	*fragnexthdr_offset = NEXTHDR_FRAGMENT;

	/*
	 *	Build fragment header.
	 */
	fh->nexthdr = state->nexthdr;
	fh->reserved = 0;
	fh->identification = state->frag_id;

	/*
	 *	Copy a block of the IP datagram.
	 */
	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
			     len));
	state->left -= len;

	fh->frag_off = htons(state->offset);
	/* More fragments follow unless the payload is exhausted. */
	if (state->left > 0)
		fh->frag_off |= htons(IP6_MF);
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

	state->ptr += len;
	state->offset += len;

	return frag;
}
EXPORT_SYMBOL(ip6_frag_next);
884 
/* Fragment @skb to fit the path MTU and send each fragment with
 * @output.  Uses the fast path (reusing an existing frag_list) when
 * the geometry allows, otherwise a slow path that allocates and copies
 * each fragment.  Consumes @skb in all cases.
 */
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
	/* np is only consulted for the per-socket frag_size, and only
	 * when not called from within device recursion.
	 */
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	u8 tstamp_type = skb->tstamp_type;
	struct ip6_frag_state state;
	unsigned int mtu, hlen, nexthdr_offset;
	ktime_t tstamp = skb->tstamp;
	int hroom, err = 0;
	__be32 frag_id;
	u8 *prevhdr, nexthdr = 0;

	if (!ipv6_mod_enabled()) {
		kfree_skb(skb);
		return -EAFNOSUPPORT;
	}

	/* Locate the end of the unfragmentable header part. */
	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb it not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np) {
		u32 frag_size = READ_ONCE(np->frag_size);

		if (frag_size && frag_size < mtu)
			mtu = frag_size;
	}
	/* Each fragment must carry the headers plus at least 8 bytes. */
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	/* Finalize the checksum before the payload is split up. */
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	/* Re-derive prevhdr: skb_checksum_help() may move skb data. */
	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct ip6_fraglist_iter iter;
		struct sk_buff *frag2;

		/* Fast path only when every piece already has
		 * fragment-sized geometry and enough headroom.
		 */
		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			/* Transfer socket ownership to each fragment. */
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
					&iter);
		if (err < 0)
			goto fail;

		/* We prevent @rt from being freed. */
		rcu_read_lock();

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (iter.frag)
				ip6_fraglist_prepare(skb, &iter);

			skb_set_delivery_time(skb, tstamp, tstamp_type);
			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !iter.frag)
				break;

			skb = ip6_fraglist_next(&iter);
		}

		kfree(iter.tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			rcu_read_unlock();
			return 0;
		}

		/* Output failed: free the fragments not yet sent. */
		kfree_skb_list(iter.frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		rcu_read_unlock();
		return err;

slow_path_clean:
		/* Undo the partial ownership transfer done above. */
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	/*
	 *	Fragment the datagram.
	 */

	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
		      &state);

	/*
	 *	Keep copying data until we run out.
	 */

	while (state.left > 0) {
		frag = ip6_frag_next(skb, &state);
		if (IS_ERR(frag)) {
			err = PTR_ERR(frag);
			goto fail;
		}

		/*
		 *	Put this fragment into the sending queue.
		 */
		skb_set_delivery_time(frag, tstamp, tstamp_type);
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
EXPORT_SYMBOL_GPL(ip6_fragment);
1078 
ip6_rt_check(const struct rt6key * rt_key,const struct in6_addr * fl_addr,const struct in6_addr * addr_cache)1079 static inline int ip6_rt_check(const struct rt6key *rt_key,
1080 			       const struct in6_addr *fl_addr,
1081 			       const struct in6_addr *addr_cache)
1082 {
1083 	return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1084 		(!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1085 }
1086 
/* Validate a socket-cached dst against flow @fl6.
 *
 * Consumes the caller's reference on @dst: returns @dst if it is still
 * usable for this flow, otherwise drops the reference and returns NULL
 * so the caller performs a fresh route lookup.  A non-IPv6 dst (e.g. a
 * cached IPv4 route on a dual-stack socket) is always rejected.
 */
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = dst_rt6_info(dst);
	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr,
			 np->daddr_cache ? &sk->sk_v6_daddr : NULL) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr,
			 np->saddr_cache ? &np->saddr : NULL) ||
#endif
	   (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}
1134 
/* Common tail of the ip6_dst_lookup*() family.
 *
 * Resolves a route for @fl6 into *@dst, selecting a source address first
 * when the flow has none, and (with CONFIG_IPV6_OPTIMISTIC_DAD) redirecting
 * to the default router when the chosen source is an optimistic address
 * whose nexthop neighbour is not yet valid.
 *
 * Returns 0 with *@dst holding a referenced route, or a negative errno
 * with *@dst set to NULL.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr)) {
		struct fib6_info *from;
		struct rt6_info *rt;

		*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : dst_rt6_info(*dst);

		/* rt->from is RCU-protected; hold the read lock across
		 * the dereference and the saddr selection that uses it.
		 */
		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
					  fl6->flowi6_l3mdev,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if ((*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = dst_rt6_info(*dst);
	rcu_read_lock();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	/* An IPv4-mapped source cannot reach a native IPv6 destination. */
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}
1251 
1252 /**
1253  *	ip6_dst_lookup - perform route lookup on flow
1254  *	@net: Network namespace to perform lookup in
1255  *	@sk: socket which provides route info
1256  *	@dst: pointer to dst_entry * for result
1257  *	@fl6: flow to lookup
1258  *
1259  *	This function performs a route lookup on the given flow.
1260  *
1261  *	It returns zero on success, or a standard errno code on error.
1262  */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	/* Start from a clean slate; ip6_dst_lookup_tail() fills *dst
	 * (or leaves it NULL on error).
	 */
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1270 
1271 /**
1272  *	ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1273  *	@net: Network namespace to perform lookup in
1274  *	@sk: socket which provides route info
1275  *	@fl6: flow to lookup
1276  *	@final_dst: final destination address for ipsec lookup
1277  *
1278  *	This function performs a route lookup on the given flow.
1279  *
1280  *	It returns a valid dst pointer on success, or a pointer encoded
1281  *	error code.
1282  */
struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
				      const struct in6_addr *final_dst)
{
	struct dst_entry *dst = NULL;
	int err;

	if (!ipv6_mod_enabled())
		return ERR_PTR(-EAFNOSUPPORT);
	err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
	if (err)
		return ERR_PTR(err);
	/* For IPsec, the route may have been looked up toward an
	 * intermediate address; restore the real destination before
	 * the xfrm lookup.
	 */
	if (final_dst)
		fl6->daddr = *final_dst;

	/* May wrap or replace @dst with an xfrm bundle; consumes @dst. */
	return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1300 
1301 /**
1302  *	ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1303  *	@sk: socket which provides the dst cache and route info
1304  *	@fl6: flow to lookup
1305  *	@final_dst: final destination address for ipsec lookup
1306  *	@connected: whether @sk is connected or not
1307  *
1308  *	This function performs a route lookup on the given flow with the
1309  *	possibility of using the cached route in the socket if it is valid.
1310  *	It will take the socket dst lock when operating on the dst cache.
1311  *	As a result, this function can only be used in process context.
1312  *
1313  *	In addition, for a connected socket, cache the dst in the socket
1314  *	if the current cache is not valid.
1315  *
1316  *	It returns a valid dst pointer on success, or a pointer encoded
1317  *	error code.
1318  */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	/* sk_dst_check() gives us a reference iff the cached dst cookie
	 * is still current.
	 */
	struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);

	/* Consumes the reference; NULL means the cache didn't match @fl6. */
	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
	/* Re-populate the socket cache for connected sockets; the clone
	 * keeps our return value's reference intact.
	 */
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1336 
/* Duplicate an IPv6 extension header option block, or return NULL when
 * there is none.  hdrlen counts 8-octet units beyond the first one.
 */
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	if (!src)
		return NULL;

	return kmemdup(src, (src->hdrlen + 1) * 8, gfp);
}
1342 
/* Duplicate an IPv6 routing header, or return NULL when there is none.
 * hdrlen counts 8-octet units beyond the first one.
 */
static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	if (!src)
		return NULL;

	return kmemdup(src, (src->hdrlen + 1) * 8, gfp);
}
1348 
ip6_append_data_mtu(unsigned int * mtu,int * maxfraglen,unsigned int fragheaderlen,struct sk_buff * skb,struct rt6_info * rt,unsigned int orig_mtu)1349 static void ip6_append_data_mtu(unsigned int *mtu,
1350 				int *maxfraglen,
1351 				unsigned int fragheaderlen,
1352 				struct sk_buff *skb,
1353 				struct rt6_info *rt,
1354 				unsigned int orig_mtu)
1355 {
1356 	if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1357 		if (!skb) {
1358 			/* first fragment, reserve header_len */
1359 			*mtu = orig_mtu - rt->dst.header_len;
1360 
1361 		} else {
1362 			/*
1363 			 * this fragment is not first, the headers
1364 			 * space is regarded as data space.
1365 			 */
1366 			*mtu = orig_mtu;
1367 		}
1368 		*maxfraglen = ((*mtu - fragheaderlen) & ~7)
1369 			      + fragheaderlen - sizeof(struct frag_hdr);
1370 	}
1371 }
1372 
/* Initialise the cork state for a new corked transmission.
 *
 * Takes ownership of the caller's reference on @rt (stored in the cork
 * immediately, so ip6_cork_release() can drop it even on error paths).
 * Duplicates @ipc6->opt into the cork and records hop limit, traffic
 * class, MTU/fragment size and timestamp/priority bookkeeping.
 *
 * Returns 0 or a negative errno; on partial option-duplication failure
 * the already-copied pieces are left for ip6_cork_release() to free.
 */
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt)
{
	struct ipv6_txoptions *nopt, *opt = ipc6->opt;
	struct inet6_cork *v6_cork = &cork->base6;
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu, frag_size;

	/* callers pass dst together with a reference, set it first so
	 * ip6_cork_release() can put it down even in case of an error.
	 */
	cork->base.dst = &rt->dst;

	/*
	 * setup for corking
	 */
	if (unlikely(opt)) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		nopt = v6_cork->opt = kzalloc_obj(*opt, sk->sk_allocation);
		if (unlikely(!nopt))
			return -ENOBUFS;

		nopt->tot_len = sizeof(*opt);
		nopt->opt_flen = opt->opt_flen;
		nopt->opt_nflen = opt->opt_nflen;

		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
		if (opt->dst0opt && !nopt->dst0opt)
			return -ENOBUFS;

		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
		if (opt->dst1opt && !nopt->dst1opt)
			return -ENOBUFS;

		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
		if (opt->hopopt && !nopt->hopopt)
			return -ENOBUFS;

		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
		if (opt->srcrt && !nopt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa*/
	}
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	v6_cork->dontfrag = ipc6->dontfrag;
	/* With PMTUDISC_PROBE the device MTU is used verbatim; otherwise
	 * use the path MTU (for XFRM tunnels, of the tunnel dst itself,
	 * else of the underlying path).
	 */
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst6_mtu(&rt->dst);
	else
		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
			READ_ONCE(rt->dst.dev->mtu) : dst6_mtu(xfrm_dst_path(&rt->dst));

	/* A user-set IPV6_MTU smaller than the path MTU wins. */
	frag_size = READ_ONCE(np->frag_size);
	if (frag_size && frag_size < mtu)
		mtu = frag_size;

	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	cork->base.mark = ipc6->sockc.mark;
	cork->base.priority = ipc6->sockc.priority;
	sock_tx_timestamp(sk, &ipc6->sockc, &cork->base.tx_flags);
	if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) {
		cork->base.flags |= IPCORK_TS_OPT_ID;
		cork->base.ts_opt_id = ipc6->sockc.ts_opt_id;
	}
	cork->base.length = 0;
	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}
1449 
/* Core of ip6_append_data()/ip6_make_skb(): append @length bytes, pulled
 * via @getfrag from @from, onto the pending queue @queue.  New skbs are
 * allocated so that each one fits the corked MTU with room reserved for
 * the IPv6 header chain, a fragment header and link-layer/IPsec headers.
 * Supports MSG_MORE coalescing, UDP GSO (cork->gso_size), MSG_ZEROCOPY
 * and MSG_SPLICE_PAGES.
 *
 * Returns 0 on success or a negative errno; on error the data already
 * queued stays on @queue and cork->length is rolled back by the
 * un-appended remainder.
 */
static int __ip6_append_data(struct sock *sk,
			     struct sk_buff_head *queue,
			     struct inet_cork_full *cork_full,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     unsigned int flags)
{
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct inet6_cork *v6_cork = &cork_full->base6;
	struct inet_cork *cork = &cork_full->base;
	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
	struct sk_buff *skb, *skb_prev = NULL;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	bool zc = false;
	u32 tskey = 0;
	struct rt6_info *rt = dst_rt6_info(cork->dst);
	bool paged, hold_tskey = false, extra_uref = false;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;

	skb = skb_peek_tail(queue);
	if (!skb) {
		/* First skb of this cork: reserve space for fragmentable
		 * extension headers and for any IPsec headers in the dst.
		 */
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	/* Per-fragment header overhead: IPv6 header, non-fragmentable
	 * extension headers and nexthop-level headers.
	 */
	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     rt->rt6i_nfheader_len;

	if (mtu <= fragheaderlen ||
	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
		goto emsgsize;

	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	/* IPV6_DONTFRAG: report the path MTU to the application instead
	 * of fragmenting.
	 */
	if (cork->length + length > mtu - headersize && v6_cork->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_ICMPV6 ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if ((flags & MSG_ZEROCOPY) && length) {
		struct msghdr *msg = from;

		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
			/* Cannot mix two different ubuf_info on one skb. */
			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
				return -EINVAL;

			/* Leave uarg NULL if can't zerocopy, callers should
			 * be able to handle it.
			 */
			if ((rt->dst.dev->features & NETIF_F_SG) &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
				uarg = msg->msg_ubuf;
			}
		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb),
						    false);
			if (!uarg)
				return -ENOBUFS;
			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
			if (rt->dst.dev->features & NETIF_F_SG &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
			} else {
				/* Device can't do zerocopy: fall back to
				 * copying, but keep completion notification.
				 */
				uarg_to_msgzc(uarg)->zerocopy = 0;
				skb_zcopy_set(skb, uarg, &extra_uref);
			}
		}
	} else if ((flags & MSG_SPLICE_PAGES) && length) {
		if (inet_test_bit(HDRINCL, sk))
			return -EPERM;
		if (rt->dst.dev->features & NETIF_F_SG &&
		    getfrag == ip_generic_getfrag)
			/* We need an empty buffer to attach stuff to */
			paged = true;
		else
			flags &= ~MSG_SPLICE_PAGES;
	}

	/* Pick the timestamp key: either the user-supplied one from the
	 * cork, or the socket's running counter (held until success/error).
	 */
	if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
	    READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
		if (cork->flags & IPCORK_TS_OPT_ID) {
			tskey = cork->ts_opt_id;
		} else {
			tskey = atomic_inc_return(&sk->sk_tskey) - 1;
			hold_tskey = true;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen, alloc_extra;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			alloc_extra = hh_len;
			alloc_extra += dst_exthdrlen;
			alloc_extra += rt->dst.trailer_len;

			/* We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloc_extra += sizeof(struct frag_hdr);

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged &&
				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
				  !(rt->dst.dev->features & NETIF_F_SG)))
				alloclen = fraglen;
			else {
				/* Paged path: only headers go in the linear
				 * area; the payload lands in page frags.
				 */
				alloclen = fragheaderlen + transhdrlen;
				pagedlen = datalen - transhdrlen;
			}
			alloclen += alloc_extra;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			fraglen = datalen + fragheaderlen;

			copy = datalen - transhdrlen - fraggap - pagedlen;
			/* [!] NOTE: copy may be negative if pagedlen>0
			 * because then the equation may reduce to -fraggap.
			 */
			if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk, alloclen,
						(flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/* Move the 8-byte-misaligned tail of the
				 * previous skb into this one, fixing up the
				 * checksums.
				 */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
					   from, data + transhdrlen, offset,
					   copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			} else if (flags & MSG_SPLICE_PAGES) {
				copy = 0;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			/* Non-SG device: copy into the skb's linear tail. */
			unsigned int off;

			off = skb->len;
			if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
					    from, skb_put(skb, copy),
					    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (flags & MSG_SPLICE_PAGES) {
			struct msghdr *msg = from;

			err = -EIO;
			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
				goto error;

			err = skb_splice_from_iter(skb, &msg->msg_iter, copy);
			if (err < 0)
				goto error;
			copy = err;
			if (!(flags & MSG_NO_SHARED_FRAGS))
				skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
			wmem_alloc_delta += copy;
		} else if (!zc) {
			/* Copy into the socket's page frag and attach it
			 * as (or coalesce it with) the last skb frag.
			 */
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			skb_zcopy_downgrade_managed(skb);
			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
				    from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			/* True zerocopy: pin the user pages directly. */
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	net_zcopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	if (hold_tskey)
		atomic_dec(&sk->sk_tskey);
	return err;
}
1859 
/* Append data to the socket's pending write queue, setting up the cork
 * on the first call (empty queue).  Takes an additional reference on
 * @rt for the cork.  MSG_PROBE requests are a no-op returning 0.
 *
 * Returns 0 on success or a negative errno.
 */
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, size_t length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	int exthdrlen;
	int err;

	if (flags&MSG_PROBE)
		return 0;
	if (skb_queue_empty(&sk->sk_write_queue)) {
		/*
		 * setup for corking
		 */
		dst_hold(&rt->dst);
		err = ip6_setup_cork(sk, &inet->cork,
				     ipc6, rt);
		if (err)
			return err;

		inet->cork.fl.u.ip6 = *fl6;
		/* The first call also accounts for the fragmentable
		 * extension headers in both length and transhdrlen.
		 */
		exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
		length += exthdrlen;
		transhdrlen += exthdrlen;
	} else {
		/* Transport header was written by the first call. */
		transhdrlen = 0;
	}

	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
				 sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
1896 
ip6_cork_steal_dst(struct sk_buff * skb,struct inet_cork_full * cork)1897 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1898 {
1899 	struct dst_entry *dst = cork->base.dst;
1900 
1901 	cork->base.dst = NULL;
1902 	skb_dst_set(skb, dst);
1903 }
1904 
ip6_cork_release(struct inet_cork_full * cork)1905 static void ip6_cork_release(struct inet_cork_full *cork)
1906 {
1907 	struct inet6_cork *v6_cork = &cork->base6;
1908 
1909 	if (unlikely(v6_cork->opt)) {
1910 		struct ipv6_txoptions *opt = v6_cork->opt;
1911 
1912 		kfree(opt->dst0opt);
1913 		kfree(opt->dst1opt);
1914 		kfree(opt->hopopt);
1915 		kfree(opt->srcrt);
1916 		kfree(opt);
1917 		v6_cork->opt = NULL;
1918 	}
1919 
1920 	if (cork->base.dst) {
1921 		dst_release(cork->base.dst);
1922 		cork->base.dst = NULL;
1923 	}
1924 }
1925 
/* Collapse the pending skbs on @queue into a single packet: the first
 * skb becomes the head and the rest are chained onto its frag_list.
 * Push the corked extension headers and the IPv6 header, attach the
 * cork's dst to the skb and release the cork.
 *
 * Returns the finished skb ready for ip6_send_skb(), or NULL if the
 * queue was empty.
 */
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr *final_dst;
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt;
	struct rt6_info *rt = dst_rt6_info(cork->base.dst);
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Chain the remaining skbs on the head's frag_list, folding their
	 * accounting into the head and detaching them from the socket.
	 */
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);
	__skb_pull(skb, skb_network_header_len(skb));

	final_dst = &fl6->daddr;
	opt = cork->base6.opt;
	if (unlikely(opt)) {
		/* Pushing a routing header may rewrite final_dst. */
		if (opt->opt_flen)
			proto = ipv6_push_frag_opts(skb, opt, proto);
		if (opt->opt_nflen)
			proto = ipv6_push_nfrag_opts(skb, opt, proto,
						     &final_dst, &fl6->saddr);
	}
	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, cork->base6.tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, sk), fl6));
	hdr->hop_limit = cork->base6.hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = cork->base.priority;
	skb->mark = cork->base.mark;
	if (sk_is_tcp(sk))
		skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC);
	else
		skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid);

	ip6_cork_steal_dst(skb, cork);
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
	if (unlikely(proto == IPPROTO_ICMPV6)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
		u8 icmp6_type;

		if (sk->sk_socket->type == SOCK_RAW &&
		   !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH))
			icmp6_type = fl6->fl6_icmp_type;
		else
			icmp6_type = icmp6_hdr(skb)->icmp6_type;
		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork);
out:
	return skb;
}
2010 
ip6_send_skb(struct sk_buff * skb)2011 int ip6_send_skb(struct sk_buff *skb)
2012 {
2013 	struct net *net = sock_net(skb->sk);
2014 	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
2015 	int err;
2016 
2017 	rcu_read_lock();
2018 	err = ip6_local_out(net, skb->sk, skb);
2019 	if (err) {
2020 		if (err > 0)
2021 			err = net_xmit_errno(err);
2022 		if (err)
2023 			IP6_INC_STATS(net, rt->rt6i_idev,
2024 				      IPSTATS_MIB_OUTDISCARDS);
2025 	}
2026 
2027 	rcu_read_unlock();
2028 	return err;
2029 }
2030 
/*
 * Finalize the socket's corked write queue into a single packet and
 * send it.  An empty queue is not an error: return 0.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
2042 
__ip6_flush_pending_frames(struct sock * sk,struct sk_buff_head * queue,struct inet_cork_full * cork)2043 static void __ip6_flush_pending_frames(struct sock *sk,
2044 				       struct sk_buff_head *queue,
2045 				       struct inet_cork_full *cork)
2046 {
2047 	struct sk_buff *skb;
2048 
2049 	while ((skb = __skb_dequeue_tail(queue)) != NULL) {
2050 		if (skb_dst(skb))
2051 			IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
2052 				      IPSTATS_MIB_OUTDISCARDS);
2053 		kfree_skb(skb);
2054 	}
2055 
2056 	ip6_cork_release(cork);
2057 }
2058 
ip6_flush_pending_frames(struct sock * sk)2059 void ip6_flush_pending_frames(struct sock *sk)
2060 {
2061 	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
2062 				   &inet_sk(sk)->cork);
2063 }
2064 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2065 
/*
 * Build a complete IPv6 packet in one shot on a private queue (no
 * socket corking): set up @cork from @ipc6/@rt, append @length bytes of
 * payload via @getfrag, and collapse the result with __ip6_make_skb().
 *
 * Returns the finished skb, an ERR_PTR on failure, or NULL for
 * MSG_PROBE (path-probe only, nothing is sent).  Ownership of the
 * route reference held in @rt is consumed on every path.
 */
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
			     unsigned int flags, struct inet_cork_full *cork)
{
	/* Fragmentable extension headers add to the payload length;
	 * presumably consumed later when __ip6_make_skb() pushes them.
	 */
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	struct sk_buff_head queue;
	int err;

	if (flags & MSG_PROBE) {
		/* Probe only: drop the route ref and build nothing. */
		dst_release(&rt->dst);
		return NULL;
	}

	__skb_queue_head_init(&queue);

	/* Clear caller-visible cork fields before ip6_setup_cork() so
	 * ip6_cork_release() is safe on any error path.
	 */
	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	cork->base6.opt = NULL;
	err = ip6_setup_cork(sk, cork, ipc6, rt);
	if (err) {
		ip6_cork_release(cork);
		return ERR_PTR(err);
	}

	err = __ip6_append_data(sk, &queue, cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags);
	if (err) {
		/* Frees anything partially queued and releases the cork. */
		__ip6_flush_pending_frames(sk, &queue, cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork);
}
2105