1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * IPv6 output functions
4 * Linux INET6 implementation
5 *
6 * Authors:
7 * Pedro Roque <roque@di.fc.ul.pt>
8 *
9 * Based on linux/net/ipv4/ip_output.c
10 *
11 * Changes:
12 * A.N.Kuznetsov : airthmetics in fragmentation.
13 * extension headers are implemented.
14 * route changes now work.
15 * ip6_forward does not confuse sniffers.
16 * etc.
17 *
18 * H. von Brand : Added missing #include <linux/string.h>
19 * Imran Patel : frag id should be in NBO
20 * Kazunori MIYAZAWA @USAGI
21 * : add ip6_append_data and related functions
22 * for datagram xmit
23 */
24
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41
42 #include <net/sock.h>
43 #include <net/snmp.h>
44
45 #include <net/gso.h>
46 #include <net/ipv6.h>
47 #include <net/ndisc.h>
48 #include <net/protocol.h>
49 #include <net/ip6_route.h>
50 #include <net/addrconf.h>
51 #include <net/rawv6.h>
52 #include <net/icmp.h>
53 #include <net/xfrm.h>
54 #include <net/checksum.h>
55 #include <linux/mroute6.h>
56 #include <net/l3mdev.h>
57 #include <net/lwtunnel.h>
58 #include <net/ip_tunnels.h>
59
/* Final transmit step for IPv6: resolve (or create) the neighbour entry
 * for the route's nexthop and hand the packet to the neighbour layer,
 * after handling multicast loopback/scoping and lwtunnel redirection.
 * Runs with rcu_read_lock() held (dst, dev and idev lifetimes rely on it).
 * Returns the neigh_output() result or a negative errno on failure.
 */
static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev = dst_dev_rcu(dst);
	struct inet6_dev *idev = ip6_dst_idev(dst);
	unsigned int hh_len = LL_RESERVED_SPACE(dev);
	const struct in6_addr *daddr, *nexthop;
	struct ipv6hdr *hdr;
	struct neighbour *neigh;
	int ret;

	/* Be paranoid, rather than too clever. */
	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
		/* idev stays alive because we hold rcu_read_lock(). */
		skb = skb_expand_head(skb, hh_len);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			return -ENOMEM;
		}
	}

	hdr = ipv6_hdr(skb);
	daddr = &hdr->daddr;
	if (unlikely(ipv6_addr_is_multicast(daddr))) {
		/* Loop a copy back to local listeners when the socket asked
		 * for multicast loopback and we either are a multicast-router
		 * source or have a local member of the destination group. */
		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
		    ((mroute6_is_socket(net, skb) &&
		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);

			/* Do not check for IFF_ALLMULTI; multicast routing
			   is not supported in any case.
			 */
			if (newskb)
				NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
					net, sk, newskb, NULL, newskb->dev,
					dev_loopback_xmit);

			/* hop_limit 0: the packet must not leave this host;
			 * the loopback copy above is all that is delivered. */
			if (hdr->hop_limit == 0) {
				IP6_INC_STATS(net, idev,
					      IPSTATS_MIB_OUTDISCARDS);
				kfree_skb(skb);
				return 0;
			}
		}

		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
		/* Node-local scoped multicast must never hit the wire. */
		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
		    !(dev->flags & IFF_LOOPBACK)) {
			kfree_skb(skb);
			return 0;
		}
	}

	/* A lightweight tunnel may take over transmission entirely. */
	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
		int res = lwtunnel_xmit(skb);

		if (res != LWTUNNEL_XMIT_CONTINUE)
			return res;
	}

	IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);

	nexthop = rt6_nexthop(dst_rt6_info(dst), daddr);
	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);

	if (IS_ERR_OR_NULL(neigh)) {
		if (unlikely(!neigh))
			neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
		if (IS_ERR(neigh)) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
			kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
			return -EINVAL;
		}
	}
	sock_confirm_neigh(skb, neigh);
	ret = neigh_output(neigh, skb, false);
	return ret;
}
139
/* Software-segment a GSO packet whose segments exceed the egress MTU,
 * then push every resulting segment through ip6_finish_output2(),
 * fragmenting the ones that are still larger than @mtu.
 * Returns 0 on success or the first error encountered.
 */
static int
ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
				    struct sk_buff *skb, unsigned int mtu)
{
	struct sk_buff *segs, *nskb;
	netdev_features_t features;
	int ret = 0;

	/* Please see corresponding comment in ip_finish_output_gso
	 * describing the cases where GSO segment length exceeds the
	 * egress MTU.
	 */
	features = netif_skb_features(skb);
	/* Mask out GSO features to force full software segmentation. */
	segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
	if (IS_ERR_OR_NULL(segs)) {
		kfree_skb(skb);
		return -ENOMEM;
	}

	/* The original skb lives on in its segments. */
	consume_skb(skb);

	skb_list_walk_safe(segs, segs, nskb) {
		int err;

		skb_mark_not_on_list(segs);
		/* Last GSO segment can be smaller than gso_size (and MTU).
		 * Adding a fragment header would produce an "atomic fragment",
		 * which is considered harmful (RFC-8021). Avoid that.
		 */
		err = segs->len > mtu ?
			ip6_fragment(net, sk, segs, ip6_finish_output2) :
			ip6_finish_output2(net, sk, segs);
		/* Keep sending remaining segments, but report first failure. */
		if (err && ret == 0)
			ret = err;
	}

	return ret;
}
178
/* Transmit a GSO packet: take the direct path when every resulting
 * segment will fit the egress MTU, otherwise fall back to software
 * segmentation (and per-segment fragmentation).
 */
static int ip6_finish_output_gso(struct net *net, struct sock *sk,
				 struct sk_buff *skb, unsigned int mtu)
{
	/* Fast path: hardware/driver segmentation keeps segments <= MTU. */
	if (likely(skb_gso_validate_network_len(skb, mtu)))
		return ip6_finish_output2(net, sk, skb);

	return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
}
187
/* Post-POSTROUTING output: re-enter dst_output() when an XFRM policy
 * attached a new dst after SNAT, otherwise transmit directly or via
 * GSO/fragmentation depending on packet length vs. the path MTU.
 */
static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	unsigned int mtu;

#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
	/* Policy lookup after SNAT yielded a new policy */
	if (skb_dst(skb)->xfrm) {
		/* Mark so ip6_output()'s NF_HOOK_COND skips POST_ROUTING
		 * on the second pass through the stack. */
		IP6CB(skb)->flags |= IP6SKB_REROUTED;
		return dst_output(net, sk, skb);
	}
#endif

	mtu = ip6_skb_dst_mtu(skb);
	if (skb_is_gso(skb))
		return ip6_finish_output_gso(net, sk, skb, mtu);

	/* Fragment when the packet exceeds the MTU, or when conntrack
	 * defrag recorded a smaller maximum original fragment size. */
	if (unlikely(skb->len > mtu ||
		     (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)))
		return ip6_fragment(net, sk, skb, ip6_finish_output2);

	return ip6_finish_output2(net, sk, skb);
}
210
ip6_finish_output(struct net * net,struct sock * sk,struct sk_buff * skb)211 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
212 {
213 int ret;
214
215 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
216 switch (ret) {
217 case NET_XMIT_SUCCESS:
218 case NET_XMIT_CN:
219 return __ip6_finish_output(net, sk, skb) ? : ret;
220 default:
221 kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
222 return ret;
223 }
224 }
225
/* dst_output() entry point for IPv6: run the NF_INET_POST_ROUTING hook
 * and then ip6_finish_output(), unless the packet was already rerouted
 * after SNAT (IP6SKB_REROUTED), in which case the hook is skipped via
 * NF_HOOK_COND.  Drops the packet when IPv6 is administratively
 * disabled on the egress device.  dev/idev are protected by the RCU
 * read-side section held across the hook invocation.
 */
int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct net_device *dev, *indev = skb->dev;
	struct inet6_dev *idev;
	int ret;

	skb->protocol = htons(ETH_P_IPV6);
	rcu_read_lock();
	dev = dst_dev_rcu(dst);
	idev = ip6_dst_idev(dst);
	skb->dev = dev;

	if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
		rcu_read_unlock();
		kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
		return 0;
	}

	ret = NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
			   net, sk, skb, indev, dev,
			   ip6_finish_output,
			   !(IP6CB(skb)->flags & IP6SKB_REROUTED));
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(ip6_output);
254
ip6_autoflowlabel(struct net * net,const struct sock * sk)255 bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
256 {
257 if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk))
258 return ip6_default_np_autolabel(net);
259 return inet6_test_bit(AUTOFLOWLABEL, sk);
260 }
261
ip6_dst_hoplimit(struct dst_entry * dst)262 int ip6_dst_hoplimit(struct dst_entry *dst)
263 {
264 int hoplimit = dst_metric_raw(dst, RTAX_HOPLIMIT);
265
266 rcu_read_lock();
267 if (hoplimit == 0) {
268 struct net_device *dev = dst_dev_rcu(dst);
269 struct inet6_dev *idev;
270
271 idev = __in6_dev_get(dev);
272 if (idev)
273 hoplimit = READ_ONCE(idev->cnf.hop_limit);
274 else
275 hoplimit = READ_ONCE(dev_net(dev)->ipv6.devconf_all->hop_limit);
276 }
277 rcu_read_unlock();
278
279 return hoplimit;
280 }
281 EXPORT_SYMBOL(ip6_dst_hoplimit);
282
/*
 * xmit an sk_buff (used by TCP and SCTP)
 * Note : socket lock is not held for SYNACK packets, but might be modified
 * by calls to skb_set_owner_w() and ipv6_local_error(),
 * which are using proper atomic operations or spinlocks.
 *
 * Builds the IPv6 header (and any extension headers from @opt) in front
 * of the payload already in @skb, then sends the packet through the
 * NF_INET_LOCAL_OUT hook towards dst_output().  Packets exceeding the
 * path MTU (unless ignore_df or GSO) are rejected with -EMSGSIZE and an
 * error is queued on the socket via ipv6_local_error().
 */
int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
	     __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
{
	const struct ipv6_pinfo *np = inet6_sk(sk);
	struct in6_addr *first_hop = &fl6->daddr;
	struct dst_entry *dst = skb_dst(skb);
	struct inet6_dev *idev = ip6_dst_idev(dst);
	struct net *net = sock_net(sk);
	unsigned int head_room;
	struct net_device *dev;
	struct ipv6hdr *hdr;
	u8 proto = fl6->flowi6_proto;
	int seg_len = skb->len;
	int ret, hlimit = -1;
	u32 mtu;

	rcu_read_lock();

	dev = dst_dev_rcu(dst);
	/* Room for the IPv6 header, the link-layer header, and any
	 * extension headers carried in @opt. */
	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev);
	if (opt)
		head_room += opt->opt_nflen + opt->opt_flen;

	if (unlikely(head_room > skb_headroom(skb))) {
		/* idev stays alive while we hold rcu_read_lock(). */
		skb = skb_expand_head(skb, head_room);
		if (!skb) {
			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
			ret = -ENOBUFS;
			goto unlock;
		}
	}

	if (unlikely(opt)) {
		seg_len += opt->opt_nflen + opt->opt_flen;

		if (opt->opt_flen)
			proto = ipv6_push_frag_opts(skb, opt, proto);

		/* May rewrite first_hop (e.g. via a routing header). */
		if (opt->opt_nflen)
			proto = ipv6_push_nfrag_opts(skb, opt, proto,
						     &first_hop,
						     &fl6->saddr);
	}

	/* payload_len is 16 bits; larger payloads are encoded as 0
	 * (jumbogram convention, RFC 2675). */
	if (unlikely(seg_len > IPV6_MAXPLEN))
		seg_len = 0;

	__skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	/*
	 * Fill in the IPv6 header
	 */
	if (np)
		hlimit = READ_ONCE(np->hop_limit);
	/* Negative per-socket hop limit means "use the route's default". */
	if (hlimit < 0)
		hlimit = ip6_dst_hoplimit(dst);

	ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
				ip6_autoflowlabel(net, sk), fl6));

	hdr->payload_len = htons(seg_len);
	hdr->nexthdr = proto;
	hdr->hop_limit = hlimit;

	hdr->saddr = fl6->saddr;
	hdr->daddr = *first_hop;

	skb->protocol = htons(ETH_P_IPV6);
	skb->priority = priority;
	skb->mark = mark;

	mtu = dst6_mtu(dst);
	if (likely((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb))) {
		IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);

		/* if egress device is enslaved to an L3 master device pass the
		 * skb to its handler for processing
		 */
		skb = l3mdev_ip6_out((struct sock *)sk, skb);
		if (unlikely(!skb)) {
			ret = 0;
			goto unlock;
		}

		/* hooks should never assume socket lock is held.
		 * we promote our socket to non const
		 */
		ret = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
			      net, (struct sock *)sk, skb, NULL, dev,
			      dst_output);
		goto unlock;
	}

	/* Packet is too big and may not be fragmented here: report
	 * EMSGSIZE to the caller and queue the error on the socket. */
	ret = -EMSGSIZE;
	skb->dev = dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);

	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
	kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
unlock:
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(ip6_xmit);
399
/* Deliver a Router Alert packet to every registered RA listener whose
 * selector matches @sel.  Every matching socket but the last receives a
 * clone; the last one consumes @skb itself via rawv6_rcv().
 * Returns 1 when the packet was delivered (skb consumed), 0 otherwise.
 */
static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
{
	struct ip6_ra_chain *ra;
	struct sock *last = NULL;

	read_lock(&ip6_ra_lock);
	for (ra = ip6_ra_chain; ra; ra = ra->next) {
		struct sock *sk = ra->sk;
		/* Respect device binding: only match sockets bound to the
		 * ingress device, or not bound to any device at all. */
		if (sk && ra->sel == sel &&
		    (!sk->sk_bound_dev_if ||
		     sk->sk_bound_dev_if == skb->dev->ifindex)) {

			/* RTALERT_ISOLATE: listener only wants packets
			 * from its own network namespace. */
			if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
			    !net_eq(sock_net(sk), dev_net(skb->dev))) {
				continue;
			}
			/* Deferred delivery: hand the previous match a
			 * clone so the last match can take @skb itself. */
			if (last) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					rawv6_rcv(last, skb2);
			}
			last = sk;
		}
	}

	if (last) {
		rawv6_rcv(last, skb);
		read_unlock(&ip6_ra_lock);
		return 1;
	}
	read_unlock(&ip6_ra_lock);
	return 0;
}
433
/* Classify a packet destined to an NDP-proxied address.
 * Returns 1 when it is a neighbour-discovery ICMPv6 message that must be
 * handled by local input, 0 when it may be forwarded normally, and -1
 * when it must be dropped (link-local destinations cannot be proxied;
 * dst_link_failure() has already signalled the sender).
 */
static int ip6_forward_proxy_check(struct sk_buff *skb)
{
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	u8 nexthdr = hdr->nexthdr;
	__be16 frag_off;
	int offset;

	/* Walk past any extension headers to the transport protocol. */
	if (ipv6_ext_hdr(nexthdr)) {
		offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
		if (offset < 0)
			return 0;
	} else
		offset = sizeof(struct ipv6hdr);

	if (nexthdr == IPPROTO_ICMPV6) {
		struct icmp6hdr *icmp6;

		/* Make sure at least the ICMPv6 type byte is linear. */
		if (!pskb_may_pull(skb, (skb_network_header(skb) +
					 offset + 1 - skb->data)))
			return 0;

		icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);

		switch (icmp6->icmp6_type) {
		case NDISC_ROUTER_SOLICITATION:
		case NDISC_ROUTER_ADVERTISEMENT:
		case NDISC_NEIGHBOUR_SOLICITATION:
		case NDISC_NEIGHBOUR_ADVERTISEMENT:
		case NDISC_REDIRECT:
			/* For reaction involving unicast neighbor discovery
			 * message destined to the proxied address, pass it to
			 * input function.
			 */
			return 1;
		default:
			break;
		}
		/* Reload: pskb_may_pull() may have moved the header. */
		hdr = ipv6_hdr(skb);
	}

	/*
	 * The proxying router can't forward traffic sent to a link-local
	 * address, so signal the sender and discard the packet. This
	 * behavior is clarified by the MIPv6 specification.
	 */
	if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
		dst_link_failure(skb);
		return -1;
	}

	return 0;
}
486
/* Final forwarding step, run after the NF_INET_FORWARD hook: clear the
 * receive timestamp and hand the packet to dst_output().
 */
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
				     struct sk_buff *skb)
{
#ifdef CONFIG_NET_SWITCHDEV
	/* Already forwarded in hardware (L3 offload): just release it. */
	if (skb->offload_l3_fwd_mark) {
		consume_skb(skb);
		return 0;
	}
#endif

	/* A forwarded packet must not carry a stale RX timestamp. */
	skb_clear_tstamp(skb);
	return dst_output(net, sk, skb);
}
500
ip6_pkt_too_big(const struct sk_buff * skb,unsigned int mtu)501 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
502 {
503 if (skb->len <= mtu)
504 return false;
505
506 /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
507 if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
508 return true;
509
510 if (skb->ignore_df)
511 return false;
512
513 if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
514 return false;
515
516 return true;
517 }
518
/* Forward an IPv6 packet received on another interface: check that
 * forwarding is enabled and permitted, handle Router Alert and NDP-proxy
 * special cases, enforce hop limit and path MTU, possibly emit an ICMPv6
 * redirect, then decrement hop_limit and pass the packet through the
 * NF_INET_FORWARD hook to ip6_forward_finish().
 * Returns 0 (or the netfilter verdict) on success, negative errno on drop.
 */
int ip6_forward(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct inet6_skb_parm *opt = IP6CB(skb);
	struct net *net = dev_net(dst_dev(dst));
	struct net_device *dev;
	struct inet6_dev *idev;
	SKB_DR(reason);
	u32 mtu;

	/* idev of the *ingress* device: several policy knobs below are
	 * evaluated per input interface. */
	idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
	if (!READ_ONCE(net->ipv6.devconf_all->forwarding) &&
	    (!idev || !READ_ONCE(idev->cnf.force_forwarding)))
		goto error;

	if (skb->pkt_type != PACKET_HOST)
		goto drop;

	/* A packet still owned by a local socket must not be forwarded. */
	if (unlikely(skb->sk))
		goto drop;

	if (skb_warn_if_lro(skb))
		goto drop;

	if (!READ_ONCE(net->ipv6.devconf_all->disable_policy) &&
	    (!idev || !READ_ONCE(idev->cnf.disable_policy)) &&
	    !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		goto drop;
	}

	skb_forward_csum(skb);

	/*
	 * We DO NOT make any processing on
	 * RA packets, pushing them to user level AS IS
	 * without ane WARRANTY that application will be able
	 * to interpret them. The reason is that we
	 * cannot make anything clever here.
	 *
	 * We are not end-node, so that if packet contains
	 * AH/ESP, we cannot make anything.
	 * Defragmentation also would be mistake, RA packets
	 * cannot be fragmented, because there is no warranty
	 * that different fragments will go along one path. --ANK
	 */
	if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
		if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
			return 0;
	}

	/*
	 * check and decrement ttl
	 */
	if (hdr->hop_limit <= 1) {
		icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);

		kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
		return -ETIMEDOUT;
	}

	/* XXX: idev->cnf.proxy_ndp? */
	if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
	    pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) {
		int proxied = ip6_forward_proxy_check(skb);

		/* reload: proxy check may have pulled/moved the header */
		hdr = ipv6_hdr(skb);
		if (proxied > 0) {
			/* It's tempting to decrease the hop limit
			 * here by 1, as we do at the end of the
			 * function too.
			 *
			 * But that would be incorrect, as proxying is
			 * not forwarding.  The ip6_input function
			 * will handle this packet locally, and it
			 * depends on the hop limit being unchanged.
			 *
			 * One example is the NDP hop limit, that
			 * always has to stay 255, but other would be
			 * similar checks around RA packets, where the
			 * user can even change the desired limit.
			 */
			return ip6_input(skb);
		} else if (proxied < 0) {
			__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
			goto drop;
		}
	}

	if (!xfrm6_route_forward(skb)) {
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
		SKB_DR_SET(reason, XFRM_POLICY);
		goto drop;
	}
	/* xfrm6_route_forward() may have re-routed the packet. */
	dst = skb_dst(skb);
	dev = dst_dev(dst);
	/* IPv6 specs say nothing about it, but it is clear that we cannot
	   send redirects to source routed frames.
	   We don't send redirects to frames decapsulated from IPsec.
	 */
	if (IP6CB(skb)->iif == dev->ifindex &&
	    opt->srcrt == 0 && !skb_sec_path(skb)) {
		struct in6_addr *target = NULL;
		struct inet_peer *peer;
		struct rt6_info *rt;

		/*
		 *	incoming and outgoing devices are the same
		 *	send a redirect.
		 */

		rt = dst_rt6_info(dst);
		if (rt->rt6i_flags & RTF_GATEWAY)
			target = &rt->rt6i_gateway;
		else
			target = &hdr->daddr;

		rcu_read_lock();
		peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr);

		/* Limit redirects both by destination (here)
		   and by source (inside ndisc_send_redirect)
		 */
		if (inet_peer_xrlim_allow(peer, 1*HZ))
			ndisc_send_redirect(skb, target);
		rcu_read_unlock();
	} else {
		int addrtype = ipv6_addr_type(&hdr->saddr);

		/* This check is security critical. */
		if (addrtype == IPV6_ADDR_ANY ||
		    addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
			goto error;
		if (addrtype & IPV6_ADDR_LINKLOCAL) {
			icmpv6_send(skb, ICMPV6_DEST_UNREACH,
				    ICMPV6_NOT_NEIGHBOUR, 0);
			goto error;
		}
	}

	__IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);

	mtu = ip6_dst_mtu_maybe_forward(dst, true);
	if (mtu < IPV6_MIN_MTU)
		mtu = IPV6_MIN_MTU;

	/* Routers never fragment: too-big packets get an ICMP error. */
	if (unlikely(ip6_pkt_too_big(skb, mtu))) {
		/* Again, force OUTPUT device used as source address */
		skb->dev = dev;
		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
		__IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_FRAGFAILS);
		kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
		return -EMSGSIZE;
	}

	if (skb_cow(skb, dev->hard_header_len)) {
		__IP6_INC_STATS(net, ip6_dst_idev(dst),
				IPSTATS_MIB_OUTDISCARDS);
		goto drop;
	}

	/* reload: skb_cow() may have copied the header */
	hdr = ipv6_hdr(skb);

	/* Mangling hops number delayed to point after skb COW */

	hdr->hop_limit--;

	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
		       net, NULL, skb, skb->dev, dev,
		       ip6_forward_finish);

error:
	__IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
	SKB_DR_SET(reason, IP_INADDRERRORS);
drop:
	kfree_skb_reason(skb, reason);
	return -EINVAL;
}
701
ip6_copy_metadata(struct sk_buff * to,struct sk_buff * from)702 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
703 {
704 to->pkt_type = from->pkt_type;
705 to->priority = from->priority;
706 to->protocol = from->protocol;
707 skb_dst_drop(to);
708 skb_dst_set(to, dst_clone(skb_dst(from)));
709 to->dev = from->dev;
710 to->mark = from->mark;
711
712 skb_copy_hash(to, from);
713
714 #ifdef CONFIG_NET_SCHED
715 to->tc_index = from->tc_index;
716 #endif
717 nf_copy(to, from);
718 skb_ext_copy(to, from);
719 skb_copy_secmark(to, from);
720 }
721
/* Set up fast-path fragmentation over an existing frag_list: detach the
 * frag list into @iter, keep a private copy of the unfragmentable header
 * chain (@hlen bytes; the caller frees iter->tmp_hdr), and turn @skb
 * itself into the first fragment by inserting a fragment header.
 * Returns 0 on success or -ENOMEM.
 */
int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
		      u8 nexthdr, __be32 frag_id,
		      struct ip6_fraglist_iter *iter)
{
	unsigned int first_len;
	struct frag_hdr *fh;

	/* BUILD HEADER */
	*prevhdr = NEXTHDR_FRAGMENT;
	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
	if (!iter->tmp_hdr)
		return -ENOMEM;

	/* Take ownership of the frag list; each entry becomes a fragment. */
	iter->frag = skb_shinfo(skb)->frag_list;
	skb_frag_list_init(skb);

	iter->offset = 0;
	iter->hlen = hlen;
	iter->frag_id = frag_id;
	iter->nexthdr = nexthdr;

	/* Open a gap for the fragment header between the unfragmentable
	 * headers and the payload, then restore the saved headers. */
	__skb_pull(skb, hlen);
	fh = __skb_push(skb, sizeof(struct frag_hdr));
	__skb_push(skb, hlen);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

	fh->nexthdr = nexthdr;
	fh->reserved = 0;
	/* First fragment: offset 0, more-fragments set. */
	fh->frag_off = htons(IP6_MF);
	fh->identification = frag_id;

	/* The first fragment carries only the head (linear + page frags);
	 * trim the lengths accordingly. */
	first_len = skb_pagelen(skb);
	skb->data_len = first_len - skb_headlen(skb);
	skb->len = first_len;
	ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));

	return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);
762
/* Turn the next skb on the fraglist iterator into a proper fragment:
 * prepend a fragment header plus a copy of the unfragmentable headers
 * saved in iter->tmp_hdr, set the offset/MF bits and payload length,
 * and copy routing metadata from @skb (the previous fragment).  Called
 * by ip6_fragment() before the previous fragment is transmitted.
 */
void ip6_fraglist_prepare(struct sk_buff *skb,
			  struct ip6_fraglist_iter *iter)
{
	struct sk_buff *frag = iter->frag;
	unsigned int hlen = iter->hlen;
	struct frag_hdr *fh;

	frag->ip_summed = CHECKSUM_NONE;
	skb_reset_transport_header(frag);
	fh = __skb_push(frag, sizeof(struct frag_hdr));
	__skb_push(frag, hlen);
	skb_reset_network_header(frag);
	memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
	/* Advance by the payload bytes the previous fragment carried. */
	iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
	fh->nexthdr = iter->nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(iter->offset);
	/* More-fragments unless this is the last entry on the list. */
	if (frag->next)
		fh->frag_off |= htons(IP6_MF);
	fh->identification = iter->frag_id;
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
	ip6_copy_metadata(frag, skb);
}
EXPORT_SYMBOL(ip6_fraglist_prepare);
787
/* Initialize the slow-path fragmentation cursor in @state for @skb:
 * record the header bookkeeping and set the payload cursor so that
 * ip6_frag_next() can carve off one fragment per call.
 */
void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
{
	/* Header-chain bookkeeping carried into every fragment. */
	state->prevhdr = prevhdr;
	state->nexthdr = nexthdr;
	state->frag_id = frag_id;
	state->hlen = hlen;
	state->mtu = mtu;

	/* Payload cursor: everything after the unfragmentable part. */
	state->ptr = hlen;		/* where the next chunk starts */
	state->left = skb->len - hlen;	/* payload bytes still to send */
	state->offset = 0;		/* fragment offset of next chunk */

	/* Head/tail room each fragment skb must reserve. */
	state->hroom = hdr_room;
	state->troom = needed_tailroom;
}
EXPORT_SYMBOL(ip6_frag_init);
808
/* Slow-path fragmentation: allocate and fill the next fragment of @skb
 * according to the cursor in @state (set up by ip6_frag_init()).
 * Copies the unfragmentable headers, inserts a fragment header and the
 * next chunk of payload, then advances the cursor.
 * Returns the new fragment or ERR_PTR(-ENOMEM).
 */
struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
{
	u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
	struct sk_buff *frag;
	struct frag_hdr *fh;
	unsigned int len;

	len = state->left;
	/* IF: it doesn't fit, use 'mtu' - the data space left */
	if (len > state->mtu)
		len = state->mtu;
	/* IF: we are not sending up to and including the packet end
	   then align the next start on an eight byte boundary */
	if (len < state->left)
		len &= ~7;

	/* Allocate buffer */
	frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
			 state->hroom + state->troom, GFP_ATOMIC);
	if (!frag)
		return ERR_PTR(-ENOMEM);

	/*
	 *	Set up data on packet
	 */

	ip6_copy_metadata(frag, skb);
	skb_reserve(frag, state->hroom);
	skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
	skb_reset_network_header(frag);
	fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
	frag->transport_header = (frag->network_header + state->hlen +
				  sizeof(struct frag_hdr));

	/*
	 *	Charge the memory for the fragment to any owner
	 *	it might possess
	 */
	if (skb->sk)
		skb_set_owner_w(frag, skb->sk);

	/*
	 *	Copy the packet header into the new buffer.
	 */
	skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);

	/* Patch the next-header byte that used to precede the payload so
	 * it now announces the fragment header. */
	fragnexthdr_offset = skb_network_header(frag);
	fragnexthdr_offset += prevhdr - skb_network_header(skb);
	*fragnexthdr_offset = NEXTHDR_FRAGMENT;

	/*
	 *	Build fragment header.
	 */
	fh->nexthdr = state->nexthdr;
	fh->reserved = 0;
	fh->identification = state->frag_id;

	/*
	 *	Copy a block of the IP datagram.
	 */
	BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
			     len));
	state->left -= len;

	fh->frag_off = htons(state->offset);
	/* More fragments follow unless this chunk exhausted the payload. */
	if (state->left > 0)
		fh->frag_off |= htons(IP6_MF);
	ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));

	state->ptr += len;
	state->offset += len;

	return frag;
}
EXPORT_SYMBOL(ip6_frag_next);
884
/* Fragment @skb to fit the path MTU and send each fragment via @output.
 * Two strategies: a fast path that reuses an existing frag_list when its
 * geometry already matches (each entry becomes one fragment), and a slow
 * path that allocates and copies new fragment skbs (ip6_frag_next()).
 * Consumes @skb in all cases.  Returns 0 on success, negative errno on
 * failure (-EMSGSIZE when fragmentation is not permitted).
 */
int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
		 int (*output)(struct net *, struct sock *, struct sk_buff *))
{
	struct sk_buff *frag;
	struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
	/* Only honor per-socket frag_size for locally generated packets
	 * (not when re-entered via a tunnel/recursive transmit). */
	struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
				inet6_sk(skb->sk) : NULL;
	u8 tstamp_type = skb->tstamp_type;
	struct ip6_frag_state state;
	unsigned int mtu, hlen, nexthdr_offset;
	ktime_t tstamp = skb->tstamp;
	int hroom, err = 0;
	__be32 frag_id;
	u8 *prevhdr, nexthdr = 0;

	if (!ipv6_mod_enabled()) {
		kfree_skb(skb);
		return -EAFNOSUPPORT;
	}

	/* Locate the last unfragmentable header; fragment header goes
	 * right after it. */
	err = ip6_find_1stfragopt(skb, &prevhdr);
	if (err < 0)
		goto fail;
	hlen = err;
	nexthdr = *prevhdr;
	nexthdr_offset = prevhdr - skb_network_header(skb);

	mtu = ip6_skb_dst_mtu(skb);

	/* We must not fragment if the socket is set to force MTU discovery
	 * or if the skb it not generated by a local socket.
	 */
	if (unlikely(!skb->ignore_df && skb->len > mtu))
		goto fail_toobig;

	if (IP6CB(skb)->frag_max_size) {
		if (IP6CB(skb)->frag_max_size > mtu)
			goto fail_toobig;

		/* don't send fragments larger than what we received */
		mtu = IP6CB(skb)->frag_max_size;
		if (mtu < IPV6_MIN_MTU)
			mtu = IPV6_MIN_MTU;
	}

	if (np) {
		u32 frag_size = READ_ONCE(np->frag_size);

		if (frag_size && frag_size < mtu)
			mtu = frag_size;
	}
	/* Need room for the fragment header and at least 8 payload bytes. */
	if (mtu < hlen + sizeof(struct frag_hdr) + 8)
		goto fail_toobig;
	/* From here on, mtu is the per-fragment payload budget. */
	mtu -= hlen + sizeof(struct frag_hdr);

	frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
				    &ipv6_hdr(skb)->saddr);

	/* Checksum must be finalized before the payload is split. */
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    (err = skb_checksum_help(skb)))
		goto fail;

	/* skb_checksum_help() may have reallocated; recompute prevhdr. */
	prevhdr = skb_network_header(skb) + nexthdr_offset;
	hroom = LL_RESERVED_SPACE(rt->dst.dev);
	if (skb_has_frag_list(skb)) {
		unsigned int first_len = skb_pagelen(skb);
		struct ip6_fraglist_iter iter;
		struct sk_buff *frag2;

		/* Fast path only if the frag-list geometry already
		 * matches the fragment layout we need. */
		if (first_len - hlen > mtu ||
		    ((first_len - hlen) & 7) ||
		    skb_cloned(skb) ||
		    skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
			goto slow_path;

		skb_walk_frags(skb, frag) {
			/* Correct geometry. */
			if (frag->len > mtu ||
			    ((frag->len & 7) && frag->next) ||
			    skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
				goto slow_path_clean;

			/* Partially cloned skb? */
			if (skb_shared(frag))
				goto slow_path_clean;

			BUG_ON(frag->sk);
			/* Transfer socket accounting to the fragments. */
			if (skb->sk) {
				frag->sk = skb->sk;
				frag->destructor = sock_wfree;
			}
			skb->truesize -= frag->truesize;
		}

		err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
					&iter);
		if (err < 0)
			goto fail;

		/* We prevent @rt from being freed. */
		rcu_read_lock();

		for (;;) {
			/* Prepare header of the next frame,
			 * before previous one went down. */
			if (iter.frag)
				ip6_fraglist_prepare(skb, &iter);

			skb_set_delivery_time(skb, tstamp, tstamp_type);
			err = output(net, sk, skb);
			if (!err)
				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
					      IPSTATS_MIB_FRAGCREATES);

			if (err || !iter.frag)
				break;

			skb = ip6_fraglist_next(&iter);
		}

		kfree(iter.tmp_hdr);

		if (err == 0) {
			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
				      IPSTATS_MIB_FRAGOKS);
			rcu_read_unlock();
			return 0;
		}

		/* Output failed mid-stream: drop the unsent fragments. */
		kfree_skb_list(iter.frag);

		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
			      IPSTATS_MIB_FRAGFAILS);
		rcu_read_unlock();
		return err;

slow_path_clean:
		/* Undo the socket-accounting transfer done so far. */
		skb_walk_frags(skb, frag2) {
			if (frag2 == frag)
				break;
			frag2->sk = NULL;
			frag2->destructor = NULL;
			skb->truesize += frag2->truesize;
		}
	}

slow_path:
	/*
	 *	Fragment the datagram.
	 */

	ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
		      LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
		      &state);

	/*
	 *	Keep copying data until we run out.
	 */

	while (state.left > 0) {
		frag = ip6_frag_next(skb, &state);
		if (IS_ERR(frag)) {
			err = PTR_ERR(frag);
			goto fail;
		}

		/*
		 *	Put this fragment into the sending queue.
		 */
		skb_set_delivery_time(frag, tstamp, tstamp_type);
		err = output(net, sk, frag);
		if (err)
			goto fail;

		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
			      IPSTATS_MIB_FRAGCREATES);
	}
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGOKS);
	consume_skb(skb);
	return err;

fail_toobig:
	icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
	err = -EMSGSIZE;

fail:
	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
		      IPSTATS_MIB_FRAGFAILS);
	kfree_skb(skb);
	return err;
}
EXPORT_SYMBOL_GPL(ip6_fragment);
1078
ip6_rt_check(const struct rt6key * rt_key,const struct in6_addr * fl_addr,const struct in6_addr * addr_cache)1079 static inline int ip6_rt_check(const struct rt6key *rt_key,
1080 const struct in6_addr *fl_addr,
1081 const struct in6_addr *addr_cache)
1082 {
1083 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1084 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1085 }
1086
/* Validate a socket's cached @dst against the flow @fl6.  Releases the
 * dst and returns NULL when it is not an IPv6 route, no longer matches
 * the flow's destination/source (see the long comment below), or the
 * requested output interface differs.  Otherwise returns @dst unchanged.
 */
static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
					  struct dst_entry *dst,
					  const struct flowi6 *fl6)
{
	struct ipv6_pinfo *np = inet6_sk(sk);
	struct rt6_info *rt;

	if (!dst)
		goto out;

	/* A cached IPv4-mapped route (e.g. after connect() to a v4-mapped
	 * address) cannot be validated here; force a fresh lookup. */
	if (dst->ops->family != AF_INET6) {
		dst_release(dst);
		return NULL;
	}

	rt = dst_rt6_info(dst);
	/* Yes, checking route validity in not connected
	 * case is not very simple. Take into account,
	 * that we do not support routing by source, TOS,
	 * and MSG_DONTROUTE		--ANK (980726)
	 *
	 * 1. ip6_rt_check(): If route was host route,
	 *    check that cached destination is current.
	 *    If it is network route, we still may
	 *    check its validity using saved pointer
	 *    to the last used address: daddr_cache.
	 *    We do not want to save whole address now,
	 *    (because main consumer of this service
	 *    is tcp, which has not this problem),
	 *    so that the last trick works only on connected
	 *    sockets.
	 * 2. oif also should be the same.
	 */
	if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr,
			 np->daddr_cache ? &sk->sk_v6_daddr : NULL) ||
#ifdef CONFIG_IPV6_SUBTREES
	    ip6_rt_check(&rt->rt6i_src, &fl6->saddr,
			 np->saddr_cache ? &np->saddr : NULL) ||
#endif
	    (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) {
		dst_release(dst);
		dst = NULL;
	}

out:
	return dst;
}
1134
/* Core route lookup for an IPv6 flow.
 *
 * Resolves @fl6 into *@dst, performing source address selection first
 * when fl6->saddr is unspecified. Returns 0 on success; on failure
 * *@dst is released, set to NULL, and a negative errno is returned.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr)) {
		struct fib6_info *from;
		struct rt6_info *rt;

		*dst = ip6_route_output(net, sk, fl6);
		rt = (*dst)->error ? NULL : dst_rt6_info(*dst);

		/* rt->from is RCU-protected; hold the read lock across
		 * the dereference and the saddr selection that uses it.
		 */
		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
					  fl6->flowi6_l3mdev,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if ((*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = dst_rt6_info(*dst);
	rcu_read_lock();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	/* A v4-mapped source address is only usable together with a
	 * v4-mapped (or unspecified) destination.
	 */
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}
1251
1252 /**
1253 * ip6_dst_lookup - perform route lookup on flow
1254 * @net: Network namespace to perform lookup in
1255 * @sk: socket which provides route info
1256 * @dst: pointer to dst_entry * for result
1257 * @fl6: flow to lookup
1258 *
1259 * This function performs a route lookup on the given flow.
1260 *
1261 * It returns zero on success, or a standard errno code on error.
1262 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	/* Clear the result slot so ip6_dst_lookup_tail() performs a
	 * fresh routing lookup rather than treating *dst as cached.
	 */
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1270
1271 /**
1272 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1273 * @net: Network namespace to perform lookup in
1274 * @sk: socket which provides route info
1275 * @fl6: flow to lookup
1276 * @final_dst: final destination address for ipsec lookup
1277 *
1278 * This function performs a route lookup on the given flow.
1279 *
1280 * It returns a valid dst pointer on success, or a pointer encoded
1281 * error code.
1282 */
ip6_dst_lookup_flow(struct net * net,const struct sock * sk,struct flowi6 * fl6,const struct in6_addr * final_dst)1283 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1284 const struct in6_addr *final_dst)
1285 {
1286 struct dst_entry *dst = NULL;
1287 int err;
1288
1289 if (!ipv6_mod_enabled())
1290 return ERR_PTR(-EAFNOSUPPORT);
1291 err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1292 if (err)
1293 return ERR_PTR(err);
1294 if (final_dst)
1295 fl6->daddr = *final_dst;
1296
1297 return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1298 }
1299 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1300
1301 /**
1302 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1303 * @sk: socket which provides the dst cache and route info
1304 * @fl6: flow to lookup
1305 * @final_dst: final destination address for ipsec lookup
1306 * @connected: whether @sk is connected or not
1307 *
1308 * This function performs a route lookup on the given flow with the
1309 * possibility of using the cached route in the socket if it is valid.
1310 * It will take the socket dst lock when operating on the dst cache.
1311 * As a result, this function can only be used in process context.
1312 *
1313 * In addition, for a connected socket, cache the dst in the socket
1314 * if the current cache is not valid.
1315 *
1316 * It returns a valid dst pointer on success, or a pointer encoded
1317 * error code.
1318 */
struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
					 const struct in6_addr *final_dst,
					 bool connected)
{
	struct dst_entry *dst;

	/* Try the socket's cached dst first; ip6_sk_dst_check() releases
	 * it and returns NULL when it no longer matches the flow.
	 */
	dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
	dst = ip6_sk_dst_check(sk, dst, fl6);
	if (dst)
		return dst;

	/* Cache miss: do a full lookup and, for connected sockets,
	 * store an extra reference back into the socket cache.
	 */
	dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
	if (connected && !IS_ERR(dst))
		ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1336
/* Duplicate an option header, or return NULL when there is none.
 * hdrlen counts 8-octet units beyond the first, hence the (hdrlen+1)*8.
 */
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	if (!src)
		return NULL;
	return kmemdup(src, (src->hdrlen + 1) * 8, gfp);
}
1342
/* Duplicate a routing header, or return NULL when there is none.
 * Same length encoding as ip6_opt_dup(): (hdrlen + 1) * 8 octets.
 */
static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	if (!src)
		return NULL;
	return kmemdup(src, (src->hdrlen + 1) * 8, gfp);
}
1348
ip6_append_data_mtu(unsigned int * mtu,int * maxfraglen,unsigned int fragheaderlen,struct sk_buff * skb,struct rt6_info * rt,unsigned int orig_mtu)1349 static void ip6_append_data_mtu(unsigned int *mtu,
1350 int *maxfraglen,
1351 unsigned int fragheaderlen,
1352 struct sk_buff *skb,
1353 struct rt6_info *rt,
1354 unsigned int orig_mtu)
1355 {
1356 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1357 if (!skb) {
1358 /* first fragment, reserve header_len */
1359 *mtu = orig_mtu - rt->dst.header_len;
1360
1361 } else {
1362 /*
1363 * this fragment is not first, the headers
1364 * space is regarded as data space.
1365 */
1366 *mtu = orig_mtu;
1367 }
1368 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1369 + fragheaderlen - sizeof(struct frag_hdr);
1370 }
1371 }
1372
/* Initialise @cork for a new corking sequence.
 *
 * Duplicates the caller-supplied tx options into the cork and records
 * the effective MTU plus per-packet metadata (mark, priority, timestamp
 * flags) from @ipc6. The dst reference carried by @rt is stored in the
 * cork first so ip6_cork_release() can drop it even on error.
 * Returns 0 on success or a negative errno.
 */
static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
			  struct ipcm6_cookie *ipc6,
			  struct rt6_info *rt)
{
	struct ipv6_txoptions *nopt, *opt = ipc6->opt;
	struct inet6_cork *v6_cork = &cork->base6;
	struct ipv6_pinfo *np = inet6_sk(sk);
	unsigned int mtu, frag_size;

	/* callers pass dst together with a reference, set it first so
	 * ip6_cork_release() can put it down even in case of an error.
	 */
	cork->base.dst = &rt->dst;

	/*
	 * setup for corking
	 */
	if (unlikely(opt)) {
		if (WARN_ON(v6_cork->opt))
			return -EINVAL;

		nopt = v6_cork->opt = kzalloc_obj(*opt, sk->sk_allocation);
		if (unlikely(!nopt))
			return -ENOBUFS;

		nopt->tot_len = sizeof(*opt);
		nopt->opt_flen = opt->opt_flen;
		nopt->opt_nflen = opt->opt_nflen;

		/* Partially duplicated options stay attached to the cork
		 * on failure; ip6_cork_release() frees them.
		 */
		nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
		if (opt->dst0opt && !nopt->dst0opt)
			return -ENOBUFS;

		nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
		if (opt->dst1opt && !nopt->dst1opt)
			return -ENOBUFS;

		nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
		if (opt->hopopt && !nopt->hopopt)
			return -ENOBUFS;

		nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
		if (opt->srcrt && !nopt->srcrt)
			return -ENOBUFS;

		/* need source address above miyazawa*/
	}
	v6_cork->hop_limit = ipc6->hlimit;
	v6_cork->tclass = ipc6->tclass;
	v6_cork->dontfrag = ipc6->dontfrag;
	/* MTU comes from the device when PMTU discovery is in probe mode,
	 * otherwise from the dst (the xfrm path dst for non-tunnel stacks).
	 */
	if (rt->dst.flags & DST_XFRM_TUNNEL)
		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst6_mtu(&rt->dst);
	else
		mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
		      READ_ONCE(rt->dst.dev->mtu) : dst6_mtu(xfrm_dst_path(&rt->dst));

	/* A smaller user-configured fragment size lowers the MTU further. */
	frag_size = READ_ONCE(np->frag_size);
	if (frag_size && frag_size < mtu)
		mtu = frag_size;

	cork->base.fragsize = mtu;
	cork->base.gso_size = ipc6->gso_size;
	cork->base.tx_flags = 0;
	cork->base.mark = ipc6->sockc.mark;
	cork->base.priority = ipc6->sockc.priority;
	sock_tx_timestamp(sk, &ipc6->sockc, &cork->base.tx_flags);
	if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) {
		cork->base.flags |= IPCORK_TS_OPT_ID;
		cork->base.ts_opt_id = ipc6->sockc.ts_opt_id;
	}
	cork->base.length = 0;
	cork->base.transmit_time = ipc6->sockc.transmit_time;

	return 0;
}
1449
/* Append user data to the pending queue, building MTU-sized fragments.
 *
 * Data is pulled in via @getfrag (or spliced / zero-copied depending on
 * @flags), new skbs are allocated as packets fill up, and the socket's
 * write memory is charged via wmem_alloc_delta. Returns 0 on success or
 * a negative errno; on error the already-queued skbs are left on @queue
 * for the caller to flush.
 */
static int __ip6_append_data(struct sock *sk,
			     struct sk_buff_head *queue,
			     struct inet_cork_full *cork_full,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     unsigned int flags)
{
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct inet6_cork *v6_cork = &cork_full->base6;
	struct inet_cork *cork = &cork_full->base;
	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
	struct sk_buff *skb, *skb_prev = NULL;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	bool zc = false;
	u32 tskey = 0;
	struct rt6_info *rt = dst_rt6_info(cork->dst);
	bool paged, hold_tskey = false, extra_uref = false;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;

	skb = skb_peek_tail(queue);
	if (!skb) {
		/* First call for this cork: account for fragmentable
		 * destination options and extra dst header space.
		 */
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);

	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     rt->rt6i_nfheader_len;

	/* MTU too small to carry even the per-fragment headers. */
	if (mtu <= fragheaderlen ||
	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
		goto emsgsize;

	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	if (cork->length + length > mtu - headersize && v6_cork->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_ICMPV6 ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		/* IPV6_DONTFRAG: report the path MTU instead of fragmenting. */
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				  sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if ((flags & MSG_ZEROCOPY) && length) {
		struct msghdr *msg = from;

		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
			/* A queued skb must use the same ubuf as this call. */
			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
				return -EINVAL;

			/* Leave uarg NULL if can't zerocopy, callers should
			 * be able to handle it.
			 */
			if ((rt->dst.dev->features & NETIF_F_SG) &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
				uarg = msg->msg_ubuf;
			}
		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb),
						    false);
			if (!uarg)
				return -ENOBUFS;
			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
			if (rt->dst.dev->features & NETIF_F_SG &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
			} else {
				/* Device can't do zerocopy here; fall back
				 * to copying but keep the completion uarg.
				 */
				uarg_to_msgzc(uarg)->zerocopy = 0;
				skb_zcopy_set(skb, uarg, &extra_uref);
			}
		}
	} else if ((flags & MSG_SPLICE_PAGES) && length) {
		if (inet_test_bit(HDRINCL, sk))
			return -EPERM;
		if (rt->dst.dev->features & NETIF_F_SG &&
		    getfrag == ip_generic_getfrag)
			/* We need an empty buffer to attach stuff to */
			paged = true;
		else
			flags &= ~MSG_SPLICE_PAGES;
	}

	if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
	    READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
		if (cork->flags & IPCORK_TS_OPT_ID) {
			tskey = cork->ts_opt_id;
		} else {
			tskey = atomic_inc_return(&sk->sk_tskey) - 1;
			hold_tskey = true;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octects, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen, alloc_extra;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			alloc_extra = hh_len;
			alloc_extra += dst_exthdrlen;
			alloc_extra += rt->dst.trailer_len;

			/* We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloc_extra += sizeof(struct frag_hdr);

			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged &&
				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
				  !(rt->dst.dev->features & NETIF_F_SG)))
				alloclen = fraglen;
			else {
				/* Paged path: only headers go in the linear
				 * area; the rest lands in page frags.
				 */
				alloclen = fragheaderlen + transhdrlen;
				pagedlen = datalen - transhdrlen;
			}
			alloclen += alloc_extra;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			fraglen = datalen + fragheaderlen;

			copy = datalen - transhdrlen - fraggap - pagedlen;
			/* [!] NOTE: copy may be negative if pagedlen>0
			 * because then the equation may reduces to -fraggap.
			 */
			if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk, alloclen,
							  (flags & MSG_DONTWAIT), &err);
			} else {
				skb = NULL;
				/* Enforce the send-buffer limit manually
				 * since alloc_skb() does not charge sk.
				 */
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 * Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 * Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/* Move the 8-byte-alignment overhang from the
				 * previous skb into this one, fixing csums.
				 */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
					    from, data + transhdrlen, offset,
					    copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			} else if (flags & MSG_SPLICE_PAGES) {
				copy = 0;
			}

			offset += copy;
			length -= copy + transhdrlen;
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			/* No scatter-gather: append into the linear area. */
			unsigned int off;

			off = skb->len;
			if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
					    from, skb_put(skb, copy),
					    offset, copy, off, skb) < 0) {
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (flags & MSG_SPLICE_PAGES) {
			struct msghdr *msg = from;

			err = -EIO;
			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
				goto error;

			err = skb_splice_from_iter(skb, &msg->msg_iter, copy);
			if (err < 0)
				goto error;
			copy = err;
			if (!(flags & MSG_NO_SHARED_FRAGS))
				skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG;
			wmem_alloc_delta += copy;
		} else if (!zc) {
			/* Copy into the socket's page_frag and attach it
			 * (or coalesce with the previous frag).
			 */
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			skb_zcopy_downgrade_managed(skb);
			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
					    from,
					    page_address(pfrag->page) + pfrag->offset,
					    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	net_zcopy_put_abort(uarg, extra_uref);
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	if (hold_tskey)
		atomic_dec(&sk->sk_tskey);
	return err;
}
1859
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, size_t length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	int exthdrlen;
	int err;

	if (flags & MSG_PROBE)
		return 0;

	if (!skb_queue_empty(&sk->sk_write_queue)) {
		/* Already corking: the transport header and extension
		 * headers were accounted for on the first call.
		 */
		transhdrlen = 0;
	} else {
		/* First call for this message: take a dst reference for
		 * the cork and record flow/options state.
		 */
		dst_hold(&rt->dst);
		err = ip6_setup_cork(sk, &inet->cork, ipc6, rt);
		if (err)
			return err;

		inet->cork.fl.u.ip6 = *fl6;
		exthdrlen = ipc6->opt ? ipc6->opt->opt_flen : 0;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	}

	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
				 sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
1896
ip6_cork_steal_dst(struct sk_buff * skb,struct inet_cork_full * cork)1897 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1898 {
1899 struct dst_entry *dst = cork->base.dst;
1900
1901 cork->base.dst = NULL;
1902 skb_dst_set(skb, dst);
1903 }
1904
ip6_cork_release(struct inet_cork_full * cork)1905 static void ip6_cork_release(struct inet_cork_full *cork)
1906 {
1907 struct inet6_cork *v6_cork = &cork->base6;
1908
1909 if (unlikely(v6_cork->opt)) {
1910 struct ipv6_txoptions *opt = v6_cork->opt;
1911
1912 kfree(opt->dst0opt);
1913 kfree(opt->dst1opt);
1914 kfree(opt->hopopt);
1915 kfree(opt->srcrt);
1916 kfree(opt);
1917 v6_cork->opt = NULL;
1918 }
1919
1920 if (cork->base.dst) {
1921 dst_release(cork->base.dst);
1922 cork->base.dst = NULL;
1923 }
1924 }
1925
/* Collapse the queued skbs into one packet and finish its headers.
 *
 * The first queued skb becomes the packet; the rest are chained on its
 * frag_list. Extension headers and the IPv6 header are pushed, the
 * cork's dst is transferred to the skb, output stats are bumped, and
 * the cork is released. Returns the skb, or NULL if the queue is empty.
 */
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr *final_dst;
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt;
	struct rt6_info *rt = dst_rt6_info(cork->base.dst);
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	/* Chain the remaining skbs on frag_list, accumulating sizes and
	 * dropping their individual socket ownership.
	 */
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);
	__skb_pull(skb, skb_network_header_len(skb));

	final_dst = &fl6->daddr;
	opt = cork->base6.opt;
	if (unlikely(opt)) {
		/* Pushing a routing header may redirect final_dst to the
		 * first hop listed in it.
		 */
		if (opt->opt_flen)
			proto = ipv6_push_frag_opts(skb, opt, proto);
		if (opt->opt_nflen)
			proto = ipv6_push_nfrag_opts(skb, opt, proto,
						     &final_dst, &fl6->saddr);
	}
	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, cork->base6.tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, sk), fl6));
	hdr->hop_limit = cork->base6.hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = cork->base.priority;
	skb->mark = cork->base.mark;
	if (sk_is_tcp(sk))
		skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC);
	else
		skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid);

	ip6_cork_steal_dst(skb, cork);
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
	if (unlikely(proto == IPPROTO_ICMPV6)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
		u8 icmp6_type;

		if (sk->sk_socket->type == SOCK_RAW &&
		    !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH))
			icmp6_type = fl6->fl6_icmp_type;
		else
			icmp6_type = icmp6_hdr(skb)->icmp6_type;
		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	ip6_cork_release(cork);
out:
	return skb;
}
2010
ip6_send_skb(struct sk_buff * skb)2011 int ip6_send_skb(struct sk_buff *skb)
2012 {
2013 struct net *net = sock_net(skb->sk);
2014 struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
2015 int err;
2016
2017 rcu_read_lock();
2018 err = ip6_local_out(net, skb->sk, skb);
2019 if (err) {
2020 if (err > 0)
2021 err = net_xmit_errno(err);
2022 if (err)
2023 IP6_INC_STATS(net, rt->rt6i_idev,
2024 IPSTATS_MIB_OUTDISCARDS);
2025 }
2026
2027 rcu_read_unlock();
2028 return err;
2029 }
2030
/* Build the pending write queue into a packet and send it.
 * An empty queue is not an error.
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
2042
__ip6_flush_pending_frames(struct sock * sk,struct sk_buff_head * queue,struct inet_cork_full * cork)2043 static void __ip6_flush_pending_frames(struct sock *sk,
2044 struct sk_buff_head *queue,
2045 struct inet_cork_full *cork)
2046 {
2047 struct sk_buff *skb;
2048
2049 while ((skb = __skb_dequeue_tail(queue)) != NULL) {
2050 if (skb_dst(skb))
2051 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
2052 IPSTATS_MIB_OUTDISCARDS);
2053 kfree_skb(skb);
2054 }
2055
2056 ip6_cork_release(cork);
2057 }
2058
/* Discard the socket's pending write queue and release its cork. */
void ip6_flush_pending_frames(struct sock *sk)
{
	__ip6_flush_pending_frames(sk, &sk->sk_write_queue,
				   &inet_sk(sk)->cork);
}
EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2065
/* One-shot variant of ip6_append_data()/ip6_finish_skb() that builds a
 * complete packet on a private queue using a caller-provided cork.
 *
 * Consumes the dst reference carried by @rt in all cases (stored in the
 * cork, released on error or transferred to the skb). Returns the built
 * skb, an ERR_PTR() on failure, or NULL for MSG_PROBE.
 */
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
			     unsigned int flags, struct inet_cork_full *cork)
{
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	struct sk_buff_head queue;
	int err;

	if (flags & MSG_PROBE) {
		dst_release(&rt->dst);
		return NULL;
	}

	__skb_queue_head_init(&queue);

	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	cork->base6.opt = NULL;
	err = ip6_setup_cork(sk, cork, ipc6, rt);
	if (err) {
		ip6_cork_release(cork);
		return ERR_PTR(err);
	}

	/* Fix for garbled source: "&current" had been mangled to the
	 * currency-sign character; the task's page_frag is intended here.
	 */
	err = __ip6_append_data(sk, &queue, cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork);
}
2105