1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * IPv6 output functions
4 * Linux INET6 implementation
5 *
6 * Authors:
7 * Pedro Roque <roque@di.fc.ul.pt>
8 *
9 * Based on linux/net/ipv4/ip_output.c
10 *
11 * Changes:
12 * A.N.Kuznetsov : arithmetic in fragmentation.
13 * extension headers are implemented.
14 * route changes now work.
15 * ip6_forward does not confuse sniffers.
16 * etc.
17 *
18 * H. von Brand : Added missing #include <linux/string.h>
19 * Imran Patel : frag id should be in NBO
20 * Kazunori MIYAZAWA @USAGI
21 * : add ip6_append_data and related functions
22 * for datagram xmit
23 */
24
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41
42 #include <net/sock.h>
43 #include <net/snmp.h>
44
45 #include <net/gso.h>
46 #include <net/ipv6.h>
47 #include <net/ndisc.h>
48 #include <net/protocol.h>
49 #include <net/ip6_route.h>
50 #include <net/addrconf.h>
51 #include <net/rawv6.h>
52 #include <net/icmp.h>
53 #include <net/xfrm.h>
54 #include <net/checksum.h>
55 #include <linux/mroute6.h>
56 #include <net/l3mdev.h>
57 #include <net/lwtunnel.h>
58 #include <net/ip_tunnels.h>
59
ip6_finish_output2(struct net * net,struct sock * sk,struct sk_buff * skb)60 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
61 {
62 struct dst_entry *dst = skb_dst(skb);
63 struct net_device *dev = dst_dev_rcu(dst);
64 struct inet6_dev *idev = ip6_dst_idev(dst);
65 unsigned int hh_len = LL_RESERVED_SPACE(dev);
66 const struct in6_addr *daddr, *nexthop;
67 struct ipv6hdr *hdr;
68 struct neighbour *neigh;
69 int ret;
70
71 /* Be paranoid, rather than too clever. */
72 if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
73 /* idev stays alive because we hold rcu_read_lock(). */
74 skb = skb_expand_head(skb, hh_len);
75 if (!skb) {
76 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
77 return -ENOMEM;
78 }
79 }
80
81 hdr = ipv6_hdr(skb);
82 daddr = &hdr->daddr;
83 if (unlikely(ipv6_addr_is_multicast(daddr))) {
84 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
85 ((mroute6_is_socket(net, skb) &&
86 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
87 ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
88 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
89
90 /* Do not check for IFF_ALLMULTI; multicast routing
91 is not supported in any case.
92 */
93 if (newskb)
94 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
95 net, sk, newskb, NULL, newskb->dev,
96 dev_loopback_xmit);
97
98 if (hdr->hop_limit == 0) {
99 IP6_INC_STATS(net, idev,
100 IPSTATS_MIB_OUTDISCARDS);
101 kfree_skb(skb);
102 return 0;
103 }
104 }
105
106 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
107 if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
108 !(dev->flags & IFF_LOOPBACK)) {
109 kfree_skb(skb);
110 return 0;
111 }
112 }
113
114 if (lwtunnel_xmit_redirect(dst->lwtstate)) {
115 int res = lwtunnel_xmit(skb);
116
117 if (res != LWTUNNEL_XMIT_CONTINUE)
118 return res;
119 }
120
121 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
122
123 nexthop = rt6_nexthop(dst_rt6_info(dst), daddr);
124 neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
125
126 if (IS_ERR_OR_NULL(neigh)) {
127 if (unlikely(!neigh))
128 neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
129 if (IS_ERR(neigh)) {
130 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
131 kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
132 return -EINVAL;
133 }
134 }
135 sock_confirm_neigh(skb, neigh);
136 ret = neigh_output(neigh, skb, false);
137 return ret;
138 }
139
140 static int
ip6_finish_output_gso_slowpath_drop(struct net * net,struct sock * sk,struct sk_buff * skb,unsigned int mtu)141 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
142 struct sk_buff *skb, unsigned int mtu)
143 {
144 struct sk_buff *segs, *nskb;
145 netdev_features_t features;
146 int ret = 0;
147
148 /* Please see corresponding comment in ip_finish_output_gso
149 * describing the cases where GSO segment length exceeds the
150 * egress MTU.
151 */
152 features = netif_skb_features(skb);
153 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
154 if (IS_ERR_OR_NULL(segs)) {
155 kfree_skb(skb);
156 return -ENOMEM;
157 }
158
159 consume_skb(skb);
160
161 skb_list_walk_safe(segs, segs, nskb) {
162 int err;
163
164 skb_mark_not_on_list(segs);
165 /* Last GSO segment can be smaller than gso_size (and MTU).
166 * Adding a fragment header would produce an "atomic fragment",
167 * which is considered harmful (RFC-8021). Avoid that.
168 */
169 err = segs->len > mtu ?
170 ip6_fragment(net, sk, segs, ip6_finish_output2) :
171 ip6_finish_output2(net, sk, segs);
172 if (err && ret == 0)
173 ret = err;
174 }
175
176 return ret;
177 }
178
/*
 * Output a GSO skb: send it as is when its segments pass the network
 * length check against @mtu, otherwise take the segmenting slow path.
 */
static int ip6_finish_output_gso(struct net *net, struct sock *sk,
				 struct sk_buff *skb, unsigned int mtu)
{
	if (likely(skb_gso_validate_network_len(skb, mtu)))
		return ip6_finish_output2(net, sk, skb);

	return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
}
187
__ip6_finish_output(struct net * net,struct sock * sk,struct sk_buff * skb)188 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
189 {
190 unsigned int mtu;
191
192 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
193 /* Policy lookup after SNAT yielded a new policy */
194 if (skb_dst(skb)->xfrm) {
195 IP6CB(skb)->flags |= IP6SKB_REROUTED;
196 return dst_output(net, sk, skb);
197 }
198 #endif
199
200 mtu = ip6_skb_dst_mtu(skb);
201 if (skb_is_gso(skb))
202 return ip6_finish_output_gso(net, sk, skb, mtu);
203
204 if (unlikely(skb->len > mtu ||
205 (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size)))
206 return ip6_fragment(net, sk, skb, ip6_finish_output2);
207
208 return ip6_finish_output2(net, sk, skb);
209 }
210
ip6_finish_output(struct net * net,struct sock * sk,struct sk_buff * skb)211 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
212 {
213 int ret;
214
215 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
216 switch (ret) {
217 case NET_XMIT_SUCCESS:
218 case NET_XMIT_CN:
219 return __ip6_finish_output(net, sk, skb) ? : ret;
220 default:
221 kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
222 return ret;
223 }
224 }
225
ip6_output(struct net * net,struct sock * sk,struct sk_buff * skb)226 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
227 {
228 struct dst_entry *dst = skb_dst(skb);
229 struct net_device *dev, *indev = skb->dev;
230 struct inet6_dev *idev;
231 int ret;
232
233 skb->protocol = htons(ETH_P_IPV6);
234 rcu_read_lock();
235 dev = dst_dev_rcu(dst);
236 idev = ip6_dst_idev(dst);
237 skb->dev = dev;
238
239 if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) {
240 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
241 rcu_read_unlock();
242 kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
243 return 0;
244 }
245
246 ret = NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
247 net, sk, skb, indev, dev,
248 ip6_finish_output,
249 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
250 rcu_read_unlock();
251 return ret;
252 }
253 EXPORT_SYMBOL(ip6_output);
254
ip6_autoflowlabel(struct net * net,const struct sock * sk)255 bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
256 {
257 if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk))
258 return ip6_default_np_autolabel(net);
259 return inet6_test_bit(AUTOFLOWLABEL, sk);
260 }
261
262 /*
263 * xmit an sk_buff (used by TCP and SCTP)
264 * Note : socket lock is not held for SYNACK packets, but might be modified
265 * by calls to skb_set_owner_w() and ipv6_local_error(),
266 * which are using proper atomic operations or spinlocks.
267 */
ip6_xmit(const struct sock * sk,struct sk_buff * skb,struct flowi6 * fl6,__u32 mark,struct ipv6_txoptions * opt,int tclass,u32 priority)268 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
269 __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
270 {
271 const struct ipv6_pinfo *np = inet6_sk(sk);
272 struct in6_addr *first_hop = &fl6->daddr;
273 struct dst_entry *dst = skb_dst(skb);
274 struct inet6_dev *idev = ip6_dst_idev(dst);
275 struct net *net = sock_net(sk);
276 unsigned int head_room;
277 struct net_device *dev;
278 struct ipv6hdr *hdr;
279 u8 proto = fl6->flowi6_proto;
280 int seg_len = skb->len;
281 int ret, hlimit = -1;
282 u32 mtu;
283
284 rcu_read_lock();
285
286 dev = dst_dev_rcu(dst);
287 head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev);
288 if (opt)
289 head_room += opt->opt_nflen + opt->opt_flen;
290
291 if (unlikely(head_room > skb_headroom(skb))) {
292 /* idev stays alive while we hold rcu_read_lock(). */
293 skb = skb_expand_head(skb, head_room);
294 if (!skb) {
295 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
296 ret = -ENOBUFS;
297 goto unlock;
298 }
299 }
300
301 if (unlikely(opt)) {
302 seg_len += opt->opt_nflen + opt->opt_flen;
303
304 if (opt->opt_flen)
305 proto = ipv6_push_frag_opts(skb, opt, proto);
306
307 if (opt->opt_nflen)
308 proto = ipv6_push_nfrag_opts(skb, opt, proto,
309 &first_hop,
310 &fl6->saddr);
311 }
312
313 if (unlikely(seg_len > IPV6_MAXPLEN))
314 seg_len = 0;
315
316 __skb_push(skb, sizeof(struct ipv6hdr));
317 skb_reset_network_header(skb);
318 hdr = ipv6_hdr(skb);
319
320 /*
321 * Fill in the IPv6 header
322 */
323 if (np)
324 hlimit = READ_ONCE(np->hop_limit);
325 if (hlimit < 0)
326 hlimit = ip6_dst_hoplimit(dst);
327
328 ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
329 ip6_autoflowlabel(net, sk), fl6));
330
331 hdr->payload_len = htons(seg_len);
332 hdr->nexthdr = proto;
333 hdr->hop_limit = hlimit;
334
335 hdr->saddr = fl6->saddr;
336 hdr->daddr = *first_hop;
337
338 skb->protocol = htons(ETH_P_IPV6);
339 skb->priority = priority;
340 skb->mark = mark;
341
342 mtu = dst6_mtu(dst);
343 if (likely((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb))) {
344 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
345
346 /* if egress device is enslaved to an L3 master device pass the
347 * skb to its handler for processing
348 */
349 skb = l3mdev_ip6_out((struct sock *)sk, skb);
350 if (unlikely(!skb)) {
351 ret = 0;
352 goto unlock;
353 }
354
355 /* hooks should never assume socket lock is held.
356 * we promote our socket to non const
357 */
358 ret = NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
359 net, (struct sock *)sk, skb, NULL, dev,
360 dst_output);
361 goto unlock;
362 }
363
364 ret = -EMSGSIZE;
365 skb->dev = dev;
366 /* ipv6_local_error() does not require socket lock,
367 * we promote our socket to non const
368 */
369 ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
370
371 IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
372 kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
373 unlock:
374 rcu_read_unlock();
375 return ret;
376 }
377 EXPORT_SYMBOL(ip6_xmit);
378
ip6_call_ra_chain(struct sk_buff * skb,int sel)379 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
380 {
381 struct ip6_ra_chain *ra;
382 struct sock *last = NULL;
383
384 read_lock(&ip6_ra_lock);
385 for (ra = ip6_ra_chain; ra; ra = ra->next) {
386 struct sock *sk = ra->sk;
387 if (sk && ra->sel == sel &&
388 (!sk->sk_bound_dev_if ||
389 sk->sk_bound_dev_if == skb->dev->ifindex)) {
390
391 if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
392 !net_eq(sock_net(sk), dev_net(skb->dev))) {
393 continue;
394 }
395 if (last) {
396 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
397 if (skb2)
398 rawv6_rcv(last, skb2);
399 }
400 last = sk;
401 }
402 }
403
404 if (last) {
405 rawv6_rcv(last, skb);
406 read_unlock(&ip6_ra_lock);
407 return 1;
408 }
409 read_unlock(&ip6_ra_lock);
410 return 0;
411 }
412
ip6_forward_proxy_check(struct sk_buff * skb)413 static int ip6_forward_proxy_check(struct sk_buff *skb)
414 {
415 struct ipv6hdr *hdr = ipv6_hdr(skb);
416 u8 nexthdr = hdr->nexthdr;
417 __be16 frag_off;
418 int offset;
419
420 if (ipv6_ext_hdr(nexthdr)) {
421 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
422 if (offset < 0)
423 return 0;
424 } else
425 offset = sizeof(struct ipv6hdr);
426
427 if (nexthdr == IPPROTO_ICMPV6) {
428 struct icmp6hdr *icmp6;
429
430 if (!pskb_may_pull(skb, (skb_network_header(skb) +
431 offset + 1 - skb->data)))
432 return 0;
433
434 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
435
436 switch (icmp6->icmp6_type) {
437 case NDISC_ROUTER_SOLICITATION:
438 case NDISC_ROUTER_ADVERTISEMENT:
439 case NDISC_NEIGHBOUR_SOLICITATION:
440 case NDISC_NEIGHBOUR_ADVERTISEMENT:
441 case NDISC_REDIRECT:
442 /* For reaction involving unicast neighbor discovery
443 * message destined to the proxied address, pass it to
444 * input function.
445 */
446 return 1;
447 default:
448 break;
449 }
450 }
451
452 /*
453 * The proxying router can't forward traffic sent to a link-local
454 * address, so signal the sender and discard the packet. This
455 * behavior is clarified by the MIPv6 specification.
456 */
457 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
458 dst_link_failure(skb);
459 return -1;
460 }
461
462 return 0;
463 }
464
ip6_forward_finish(struct net * net,struct sock * sk,struct sk_buff * skb)465 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
466 struct sk_buff *skb)
467 {
468 #ifdef CONFIG_NET_SWITCHDEV
469 if (skb->offload_l3_fwd_mark) {
470 consume_skb(skb);
471 return 0;
472 }
473 #endif
474
475 skb_clear_tstamp(skb);
476 return dst_output(net, sk, skb);
477 }
478
ip6_pkt_too_big(const struct sk_buff * skb,unsigned int mtu)479 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
480 {
481 if (skb->len <= mtu)
482 return false;
483
484 /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
485 if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
486 return true;
487
488 if (skb->ignore_df)
489 return false;
490
491 if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
492 return false;
493
494 return true;
495 }
496
ip6_forward(struct sk_buff * skb)497 int ip6_forward(struct sk_buff *skb)
498 {
499 struct dst_entry *dst = skb_dst(skb);
500 struct ipv6hdr *hdr = ipv6_hdr(skb);
501 struct inet6_skb_parm *opt = IP6CB(skb);
502 struct net *net = dev_net(dst_dev(dst));
503 struct net_device *dev;
504 struct inet6_dev *idev;
505 SKB_DR(reason);
506 u32 mtu;
507
508 idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
509 if (!READ_ONCE(net->ipv6.devconf_all->forwarding) &&
510 (!idev || !READ_ONCE(idev->cnf.force_forwarding)))
511 goto error;
512
513 if (skb->pkt_type != PACKET_HOST)
514 goto drop;
515
516 if (unlikely(skb->sk))
517 goto drop;
518
519 if (skb_warn_if_lro(skb))
520 goto drop;
521
522 if (!READ_ONCE(net->ipv6.devconf_all->disable_policy) &&
523 (!idev || !READ_ONCE(idev->cnf.disable_policy)) &&
524 !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
525 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
526 goto drop;
527 }
528
529 skb_forward_csum(skb);
530
531 /*
532 * We DO NOT make any processing on
533 * RA packets, pushing them to user level AS IS
534 * without ane WARRANTY that application will be able
535 * to interpret them. The reason is that we
536 * cannot make anything clever here.
537 *
538 * We are not end-node, so that if packet contains
539 * AH/ESP, we cannot make anything.
540 * Defragmentation also would be mistake, RA packets
541 * cannot be fragmented, because there is no warranty
542 * that different fragments will go along one path. --ANK
543 */
544 if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
545 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
546 return 0;
547 }
548
549 /*
550 * check and decrement ttl
551 */
552 if (hdr->hop_limit <= 1) {
553 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
554 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
555
556 kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
557 return -ETIMEDOUT;
558 }
559
560 /* XXX: idev->cnf.proxy_ndp? */
561 if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
562 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) {
563 int proxied = ip6_forward_proxy_check(skb);
564 if (proxied > 0) {
565 /* It's tempting to decrease the hop limit
566 * here by 1, as we do at the end of the
567 * function too.
568 *
569 * But that would be incorrect, as proxying is
570 * not forwarding. The ip6_input function
571 * will handle this packet locally, and it
572 * depends on the hop limit being unchanged.
573 *
574 * One example is the NDP hop limit, that
575 * always has to stay 255, but other would be
576 * similar checks around RA packets, where the
577 * user can even change the desired limit.
578 */
579 return ip6_input(skb);
580 } else if (proxied < 0) {
581 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
582 goto drop;
583 }
584 }
585
586 if (!xfrm6_route_forward(skb)) {
587 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
588 SKB_DR_SET(reason, XFRM_POLICY);
589 goto drop;
590 }
591 dst = skb_dst(skb);
592 dev = dst_dev(dst);
593 /* IPv6 specs say nothing about it, but it is clear that we cannot
594 send redirects to source routed frames.
595 We don't send redirects to frames decapsulated from IPsec.
596 */
597 if (IP6CB(skb)->iif == dev->ifindex &&
598 opt->srcrt == 0 && !skb_sec_path(skb)) {
599 struct in6_addr *target = NULL;
600 struct inet_peer *peer;
601 struct rt6_info *rt;
602
603 /*
604 * incoming and outgoing devices are the same
605 * send a redirect.
606 */
607
608 rt = dst_rt6_info(dst);
609 if (rt->rt6i_flags & RTF_GATEWAY)
610 target = &rt->rt6i_gateway;
611 else
612 target = &hdr->daddr;
613
614 rcu_read_lock();
615 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr);
616
617 /* Limit redirects both by destination (here)
618 and by source (inside ndisc_send_redirect)
619 */
620 if (inet_peer_xrlim_allow(peer, 1*HZ))
621 ndisc_send_redirect(skb, target);
622 rcu_read_unlock();
623 } else {
624 int addrtype = ipv6_addr_type(&hdr->saddr);
625
626 /* This check is security critical. */
627 if (addrtype == IPV6_ADDR_ANY ||
628 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
629 goto error;
630 if (addrtype & IPV6_ADDR_LINKLOCAL) {
631 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
632 ICMPV6_NOT_NEIGHBOUR, 0);
633 goto error;
634 }
635 }
636
637 __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
638
639 mtu = ip6_dst_mtu_maybe_forward(dst, true);
640 if (mtu < IPV6_MIN_MTU)
641 mtu = IPV6_MIN_MTU;
642
643 if (unlikely(ip6_pkt_too_big(skb, mtu))) {
644 /* Again, force OUTPUT device used as source address */
645 skb->dev = dev;
646 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
647 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
648 __IP6_INC_STATS(net, ip6_dst_idev(dst),
649 IPSTATS_MIB_FRAGFAILS);
650 kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
651 return -EMSGSIZE;
652 }
653
654 if (skb_cow(skb, dev->hard_header_len)) {
655 __IP6_INC_STATS(net, ip6_dst_idev(dst),
656 IPSTATS_MIB_OUTDISCARDS);
657 goto drop;
658 }
659
660 hdr = ipv6_hdr(skb);
661
662 /* Mangling hops number delayed to point after skb COW */
663
664 hdr->hop_limit--;
665
666 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
667 net, NULL, skb, skb->dev, dev,
668 ip6_forward_finish);
669
670 error:
671 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
672 SKB_DR_SET(reason, IP_INADDRERRORS);
673 drop:
674 kfree_skb_reason(skb, reason);
675 return -EINVAL;
676 }
677
ip6_copy_metadata(struct sk_buff * to,struct sk_buff * from)678 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
679 {
680 to->pkt_type = from->pkt_type;
681 to->priority = from->priority;
682 to->protocol = from->protocol;
683 skb_dst_drop(to);
684 skb_dst_set(to, dst_clone(skb_dst(from)));
685 to->dev = from->dev;
686 to->mark = from->mark;
687
688 skb_copy_hash(to, from);
689
690 #ifdef CONFIG_NET_SCHED
691 to->tc_index = from->tc_index;
692 #endif
693 nf_copy(to, from);
694 skb_ext_copy(to, from);
695 skb_copy_secmark(to, from);
696 }
697
/*
 * Prepare frag-list based fragmentation of @skb.
 *
 * Saves a copy of the first @hlen bytes of the network header in
 * @iter->tmp_hdr (after *@prevhdr has been rewritten to
 * NEXTHDR_FRAGMENT), detaches the frag list into @iter, and turns @skb
 * into the first fragment by inserting a fragment header behind those
 * @hlen bytes.
 *
 * Returns 0 on success or -ENOMEM if the header copy cannot be
 * allocated.  The caller frees @iter->tmp_hdr.
 */
int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
		      u8 nexthdr, __be32 frag_id,
		      struct ip6_fraglist_iter *iter)
{
	unsigned int head_len;
	struct frag_hdr *fh;

	/* Patch the chain first so the saved template already says so. */
	*prevhdr = NEXTHDR_FRAGMENT;

	iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
	if (!iter->tmp_hdr)
		return -ENOMEM;

	iter->frag = skb_shinfo(skb)->frag_list;
	skb_frag_list_init(skb);

	iter->hlen = hlen;
	iter->offset = 0;
	iter->nexthdr = nexthdr;
	iter->frag_id = frag_id;

	/* Open a gap for the fragment header behind the saved headers. */
	__skb_pull(skb, hlen);
	fh = __skb_push(skb, sizeof(*fh));
	__skb_push(skb, hlen);
	skb_reset_network_header(skb);
	memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);

	/* First fragment: offset 0, more fragments follow. */
	fh->nexthdr = nexthdr;
	fh->reserved = 0;
	fh->frag_off = htons(IP6_MF);
	fh->identification = frag_id;

	/* The head now accounts only for its own data. */
	head_len = skb_pagelen(skb);
	skb->data_len = head_len - skb_headlen(skb);
	skb->len = head_len;
	ipv6_hdr(skb)->payload_len = htons(head_len - sizeof(struct ipv6hdr));

	return 0;
}
EXPORT_SYMBOL(ip6_fraglist_init);
738
ip6_fraglist_prepare(struct sk_buff * skb,struct ip6_fraglist_iter * iter)739 void ip6_fraglist_prepare(struct sk_buff *skb,
740 struct ip6_fraglist_iter *iter)
741 {
742 struct sk_buff *frag = iter->frag;
743 unsigned int hlen = iter->hlen;
744 struct frag_hdr *fh;
745
746 frag->ip_summed = CHECKSUM_NONE;
747 skb_reset_transport_header(frag);
748 fh = __skb_push(frag, sizeof(struct frag_hdr));
749 __skb_push(frag, hlen);
750 skb_reset_network_header(frag);
751 memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
752 iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
753 fh->nexthdr = iter->nexthdr;
754 fh->reserved = 0;
755 fh->frag_off = htons(iter->offset);
756 if (frag->next)
757 fh->frag_off |= htons(IP6_MF);
758 fh->identification = iter->frag_id;
759 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
760 ip6_copy_metadata(frag, skb);
761 }
762 EXPORT_SYMBOL(ip6_fraglist_prepare);
763
/*
 * Initialise @state for copy-based (slow path) fragmentation of @skb:
 * record the header template parameters, the per-fragment payload limit
 * @mtu, the head/tail room to reserve, and start the cursor right after
 * the @hlen bytes of unfragmentable headers.
 */
void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
		   unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
		   u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
{
	/* Header chain information. */
	state->prevhdr = prevhdr;
	state->nexthdr = nexthdr;
	state->frag_id = frag_id;

	/* Geometry of every fragment. */
	state->hlen = hlen;
	state->mtu = mtu;
	state->hroom = hdr_room;
	state->troom = needed_tailroom;

	/* Progress through the original datagram. */
	state->ptr = hlen;			/* Where to start from */
	state->left = skb->len - hlen;		/* Bytes still to send */
	state->offset = 0;
}
EXPORT_SYMBOL(ip6_frag_init);
784
ip6_frag_next(struct sk_buff * skb,struct ip6_frag_state * state)785 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
786 {
787 u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
788 struct sk_buff *frag;
789 struct frag_hdr *fh;
790 unsigned int len;
791
792 len = state->left;
793 /* IF: it doesn't fit, use 'mtu' - the data space left */
794 if (len > state->mtu)
795 len = state->mtu;
796 /* IF: we are not sending up to and including the packet end
797 then align the next start on an eight byte boundary */
798 if (len < state->left)
799 len &= ~7;
800
801 /* Allocate buffer */
802 frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
803 state->hroom + state->troom, GFP_ATOMIC);
804 if (!frag)
805 return ERR_PTR(-ENOMEM);
806
807 /*
808 * Set up data on packet
809 */
810
811 ip6_copy_metadata(frag, skb);
812 skb_reserve(frag, state->hroom);
813 skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
814 skb_reset_network_header(frag);
815 fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
816 frag->transport_header = (frag->network_header + state->hlen +
817 sizeof(struct frag_hdr));
818
819 /*
820 * Charge the memory for the fragment to any owner
821 * it might possess
822 */
823 if (skb->sk)
824 skb_set_owner_w(frag, skb->sk);
825
826 /*
827 * Copy the packet header into the new buffer.
828 */
829 skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
830
831 fragnexthdr_offset = skb_network_header(frag);
832 fragnexthdr_offset += prevhdr - skb_network_header(skb);
833 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
834
835 /*
836 * Build fragment header.
837 */
838 fh->nexthdr = state->nexthdr;
839 fh->reserved = 0;
840 fh->identification = state->frag_id;
841
842 /*
843 * Copy a block of the IP datagram.
844 */
845 BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
846 len));
847 state->left -= len;
848
849 fh->frag_off = htons(state->offset);
850 if (state->left > 0)
851 fh->frag_off |= htons(IP6_MF);
852 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
853
854 state->ptr += len;
855 state->offset += len;
856
857 return frag;
858 }
859 EXPORT_SYMBOL(ip6_frag_next);
860
ip6_fragment(struct net * net,struct sock * sk,struct sk_buff * skb,int (* output)(struct net *,struct sock *,struct sk_buff *))861 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
862 int (*output)(struct net *, struct sock *, struct sk_buff *))
863 {
864 struct sk_buff *frag;
865 struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
866 struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
867 inet6_sk(skb->sk) : NULL;
868 u8 tstamp_type = skb->tstamp_type;
869 struct ip6_frag_state state;
870 unsigned int mtu, hlen, nexthdr_offset;
871 ktime_t tstamp = skb->tstamp;
872 int hroom, err = 0;
873 __be32 frag_id;
874 u8 *prevhdr, nexthdr = 0;
875
876 err = ip6_find_1stfragopt(skb, &prevhdr);
877 if (err < 0)
878 goto fail;
879 hlen = err;
880 nexthdr = *prevhdr;
881 nexthdr_offset = prevhdr - skb_network_header(skb);
882
883 mtu = ip6_skb_dst_mtu(skb);
884
885 /* We must not fragment if the socket is set to force MTU discovery
886 * or if the skb it not generated by a local socket.
887 */
888 if (unlikely(!skb->ignore_df && skb->len > mtu))
889 goto fail_toobig;
890
891 if (IP6CB(skb)->frag_max_size) {
892 if (IP6CB(skb)->frag_max_size > mtu)
893 goto fail_toobig;
894
895 /* don't send fragments larger than what we received */
896 mtu = IP6CB(skb)->frag_max_size;
897 if (mtu < IPV6_MIN_MTU)
898 mtu = IPV6_MIN_MTU;
899 }
900
901 if (np) {
902 u32 frag_size = READ_ONCE(np->frag_size);
903
904 if (frag_size && frag_size < mtu)
905 mtu = frag_size;
906 }
907 if (mtu < hlen + sizeof(struct frag_hdr) + 8)
908 goto fail_toobig;
909 mtu -= hlen + sizeof(struct frag_hdr);
910
911 frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
912 &ipv6_hdr(skb)->saddr);
913
914 if (skb->ip_summed == CHECKSUM_PARTIAL &&
915 (err = skb_checksum_help(skb)))
916 goto fail;
917
918 prevhdr = skb_network_header(skb) + nexthdr_offset;
919 hroom = LL_RESERVED_SPACE(rt->dst.dev);
920 if (skb_has_frag_list(skb)) {
921 unsigned int first_len = skb_pagelen(skb);
922 struct ip6_fraglist_iter iter;
923 struct sk_buff *frag2;
924
925 if (first_len - hlen > mtu ||
926 ((first_len - hlen) & 7) ||
927 skb_cloned(skb) ||
928 skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
929 goto slow_path;
930
931 skb_walk_frags(skb, frag) {
932 /* Correct geometry. */
933 if (frag->len > mtu ||
934 ((frag->len & 7) && frag->next) ||
935 skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
936 goto slow_path_clean;
937
938 /* Partially cloned skb? */
939 if (skb_shared(frag))
940 goto slow_path_clean;
941
942 BUG_ON(frag->sk);
943 if (skb->sk) {
944 frag->sk = skb->sk;
945 frag->destructor = sock_wfree;
946 }
947 skb->truesize -= frag->truesize;
948 }
949
950 err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
951 &iter);
952 if (err < 0)
953 goto fail;
954
955 /* We prevent @rt from being freed. */
956 rcu_read_lock();
957
958 for (;;) {
959 /* Prepare header of the next frame,
960 * before previous one went down. */
961 if (iter.frag)
962 ip6_fraglist_prepare(skb, &iter);
963
964 skb_set_delivery_time(skb, tstamp, tstamp_type);
965 err = output(net, sk, skb);
966 if (!err)
967 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
968 IPSTATS_MIB_FRAGCREATES);
969
970 if (err || !iter.frag)
971 break;
972
973 skb = ip6_fraglist_next(&iter);
974 }
975
976 kfree(iter.tmp_hdr);
977
978 if (err == 0) {
979 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
980 IPSTATS_MIB_FRAGOKS);
981 rcu_read_unlock();
982 return 0;
983 }
984
985 kfree_skb_list(iter.frag);
986
987 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
988 IPSTATS_MIB_FRAGFAILS);
989 rcu_read_unlock();
990 return err;
991
992 slow_path_clean:
993 skb_walk_frags(skb, frag2) {
994 if (frag2 == frag)
995 break;
996 frag2->sk = NULL;
997 frag2->destructor = NULL;
998 skb->truesize += frag2->truesize;
999 }
1000 }
1001
1002 slow_path:
1003 /*
1004 * Fragment the datagram.
1005 */
1006
1007 ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
1008 LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
1009 &state);
1010
1011 /*
1012 * Keep copying data until we run out.
1013 */
1014
1015 while (state.left > 0) {
1016 frag = ip6_frag_next(skb, &state);
1017 if (IS_ERR(frag)) {
1018 err = PTR_ERR(frag);
1019 goto fail;
1020 }
1021
1022 /*
1023 * Put this fragment into the sending queue.
1024 */
1025 skb_set_delivery_time(frag, tstamp, tstamp_type);
1026 err = output(net, sk, frag);
1027 if (err)
1028 goto fail;
1029
1030 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1031 IPSTATS_MIB_FRAGCREATES);
1032 }
1033 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1034 IPSTATS_MIB_FRAGOKS);
1035 consume_skb(skb);
1036 return err;
1037
1038 fail_toobig:
1039 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1040 err = -EMSGSIZE;
1041
1042 fail:
1043 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1044 IPSTATS_MIB_FRAGFAILS);
1045 kfree_skb(skb);
1046 return err;
1047 }
1048
ip6_rt_check(const struct rt6key * rt_key,const struct in6_addr * fl_addr,const struct in6_addr * addr_cache)1049 static inline int ip6_rt_check(const struct rt6key *rt_key,
1050 const struct in6_addr *fl_addr,
1051 const struct in6_addr *addr_cache)
1052 {
1053 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1054 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1055 }
1056
ip6_sk_dst_check(struct sock * sk,struct dst_entry * dst,const struct flowi6 * fl6)1057 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1058 struct dst_entry *dst,
1059 const struct flowi6 *fl6)
1060 {
1061 struct ipv6_pinfo *np = inet6_sk(sk);
1062 struct rt6_info *rt;
1063
1064 if (!dst)
1065 goto out;
1066
1067 if (dst->ops->family != AF_INET6) {
1068 dst_release(dst);
1069 return NULL;
1070 }
1071
1072 rt = dst_rt6_info(dst);
1073 /* Yes, checking route validity in not connected
1074 * case is not very simple. Take into account,
1075 * that we do not support routing by source, TOS,
1076 * and MSG_DONTROUTE --ANK (980726)
1077 *
1078 * 1. ip6_rt_check(): If route was host route,
1079 * check that cached destination is current.
1080 * If it is network route, we still may
1081 * check its validity using saved pointer
1082 * to the last used address: daddr_cache.
1083 * We do not want to save whole address now,
1084 * (because main consumer of this service
1085 * is tcp, which has not this problem),
1086 * so that the last trick works only on connected
1087 * sockets.
1088 * 2. oif also should be the same.
1089 */
1090 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr,
1091 np->daddr_cache ? &sk->sk_v6_daddr : NULL) ||
1092 #ifdef CONFIG_IPV6_SUBTREES
1093 ip6_rt_check(&rt->rt6i_src, &fl6->saddr,
1094 np->saddr_cache ? &np->saddr : NULL) ||
1095 #endif
1096 (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) {
1097 dst_release(dst);
1098 dst = NULL;
1099 }
1100
1101 out:
1102 return dst;
1103 }
1104
/*
 * Common route lookup used by ip6_dst_lookup() and ip6_dst_lookup_flow().
 *
 * @net: network namespace handed to the route and address lookups
 * @sk:  socket forwarded to the route lookups; the source-preference read
 *       below tolerates sk == NULL
 * @dst: in/out result pointer.  Both callers visible in this file pass
 *       *dst == NULL.  A non-NULL *dst skips the lookup at the "if (!*dst)"
 *       test when fl6->saddr is already set.
 * @fl6: flow to route; fl6->saddr is filled in here when it is unspecified
 *
 * Returns 0 with *dst set to the selected route.  On failure the dst is
 * released, *dst is set to NULL and a negative errno is returned;
 * -ENETUNREACH additionally bumps IPSTATS_MIB_OUTNOROUTES.
 */
static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
			       struct dst_entry **dst, struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	struct neighbour *n;
	struct rt6_info *rt;
#endif
	int err;
	int flags = 0;

	/* The correct way to handle this would be to do
	 * ip6_route_get_saddr, and then ip6_route_output; however,
	 * the route-specific preferred source forces the
	 * ip6_route_output call _before_ ip6_route_get_saddr.
	 *
	 * In source specific routing (no src=any default route),
	 * ip6_route_output will fail given src=any saddr, though, so
	 * that's why we try it again later.
	 */
	if (ipv6_addr_any(&fl6->saddr)) {
		struct fib6_info *from;
		struct rt6_info *rt;

		*dst = ip6_route_output(net, sk, fl6);
		/* Only a successful lookup can supply rt->from below. */
		rt = (*dst)->error ? NULL : dst_rt6_info(*dst);

		/* rt->from is RCU-dereferenced, hence the read-side section. */
		rcu_read_lock();
		from = rt ? rcu_dereference(rt->from) : NULL;
		err = ip6_route_get_saddr(net, from, &fl6->daddr,
					  sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
					  fl6->flowi6_l3mdev,
					  &fl6->saddr);
		rcu_read_unlock();

		if (err)
			goto out_err_release;

		/* If we had an erroneous initial result, pretend it
		 * never existed and let the SA-enabled version take
		 * over.
		 */
		if ((*dst)->error) {
			dst_release(*dst);
			*dst = NULL;
		}

		/* Only affects the second lookup below, when one is done. */
		if (fl6->flowi6_oif)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	/* No usable route yet: look up with the (now known) source address. */
	if (!*dst)
		*dst = ip6_route_output_flags(net, sk, fl6, flags);

	err = (*dst)->error;
	if (err)
		goto out_err_release;

#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
	/*
	 * Here if the dst entry we've looked up
	 * has a neighbour entry that is in the INCOMPLETE
	 * state and the src address from the flow is
	 * marked as OPTIMISTIC, we release the found
	 * dst entry and replace it instead with the
	 * dst entry of the nexthop router
	 */
	rt = dst_rt6_info(*dst);
	rcu_read_lock();
	n = __ipv6_neigh_lookup_noref(rt->dst.dev,
				      rt6_nexthop(rt, &fl6->daddr));
	/* err is only a local marker here: set when a neighbour entry
	 * exists and none of its NUD_VALID state bits are set.
	 */
	err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
	rcu_read_unlock();

	if (err) {
		struct inet6_ifaddr *ifp;
		struct flowi6 fl_gw6;
		int redirect;

		ifp = ipv6_get_ifaddr(net, &fl6->saddr,
				      (*dst)->dev, 1);

		/* Sample the flag before dropping the ifaddr reference. */
		redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
		if (ifp)
			in6_ifa_put(ifp);

		if (redirect) {
			/*
			 * We need to get the dst entry for the
			 * default router instead
			 */
			dst_release(*dst);
			/* Same flow, but with the destination zeroed. */
			memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
			memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
			*dst = ip6_route_output(net, sk, &fl_gw6);
			err = (*dst)->error;
			if (err)
				goto out_err_release;
		}
	}
#endif
	/* A v4-mapped source is only accepted together with a v4-mapped
	 * or unspecified destination.
	 */
	if (ipv6_addr_v4mapped(&fl6->saddr) &&
	    !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
		err = -EAFNOSUPPORT;
		goto out_err_release;
	}

	return 0;

out_err_release:
	dst_release(*dst);
	*dst = NULL;

	if (err == -ENETUNREACH)
		IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
	return err;
}
1221
/**
 *	ip6_dst_lookup - perform route lookup on flow
 *	@net: Network namespace to perform lookup in
 *	@sk: socket which provides route info
 *	@dst: pointer to dst_entry * for result
 *	@fl6: flow to lookup
 *
 *	This function performs a route lookup on the given flow.  Any value
 *	previously stored in @dst is overwritten with NULL before the lookup,
 *	without being released.
 *
 *	It returns zero on success, or a standard errno code on error.
 *	On error, @dst is left pointing at NULL.
 */
int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
		   struct flowi6 *fl6)
{
	/* Start from an empty result so the tail always does a fresh lookup. */
	*dst = NULL;
	return ip6_dst_lookup_tail(net, sk, dst, fl6);
}
EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1240
1241 /**
1242 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1243 * @net: Network namespace to perform lookup in
1244 * @sk: socket which provides route info
1245 * @fl6: flow to lookup
1246 * @final_dst: final destination address for ipsec lookup
1247 *
1248 * This function performs a route lookup on the given flow.
1249 *
1250 * It returns a valid dst pointer on success, or a pointer encoded
1251 * error code.
1252 */
ip6_dst_lookup_flow(struct net * net,const struct sock * sk,struct flowi6 * fl6,const struct in6_addr * final_dst)1253 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1254 const struct in6_addr *final_dst)
1255 {
1256 struct dst_entry *dst = NULL;
1257 int err;
1258
1259 err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1260 if (err)
1261 return ERR_PTR(err);
1262 if (final_dst)
1263 fl6->daddr = *final_dst;
1264
1265 return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1266 }
1267 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
1268
1269 /**
1270 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1271 * @sk: socket which provides the dst cache and route info
1272 * @fl6: flow to lookup
1273 * @final_dst: final destination address for ipsec lookup
1274 * @connected: whether @sk is connected or not
1275 *
1276 * This function performs a route lookup on the given flow with the
1277 * possibility of using the cached route in the socket if it is valid.
1278 * It will take the socket dst lock when operating on the dst cache.
1279 * As a result, this function can only be used in process context.
1280 *
1281 * In addition, for a connected socket, cache the dst in the socket
1282 * if the current cache is not valid.
1283 *
1284 * It returns a valid dst pointer on success, or a pointer encoded
1285 * error code.
1286 */
ip6_sk_dst_lookup_flow(struct sock * sk,struct flowi6 * fl6,const struct in6_addr * final_dst,bool connected)1287 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1288 const struct in6_addr *final_dst,
1289 bool connected)
1290 {
1291 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1292
1293 dst = ip6_sk_dst_check(sk, dst, fl6);
1294 if (dst)
1295 return dst;
1296
1297 dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1298 if (connected && !IS_ERR(dst))
1299 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1300
1301 return dst;
1302 }
1303 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1304
/*
 * Duplicate an options header with kmemdup().  The copied size is
 * (hdrlen + 1) * 8 bytes.  Returns NULL when @src is NULL or when the
 * allocation fails.
 */
static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
					       gfp_t gfp)
{
	if (!src)
		return NULL;

	return kmemdup(src, (src->hdrlen + 1) * 8, gfp);
}
1310
/*
 * Duplicate a routing header with kmemdup().  The copied size is
 * (hdrlen + 1) * 8 bytes.  Returns NULL when @src is NULL or when the
 * allocation fails.
 */
static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
						gfp_t gfp)
{
	if (!src)
		return NULL;

	return kmemdup(src, (src->hdrlen + 1) * 8, gfp);
}
1316
ip6_append_data_mtu(unsigned int * mtu,int * maxfraglen,unsigned int fragheaderlen,struct sk_buff * skb,struct rt6_info * rt,unsigned int orig_mtu)1317 static void ip6_append_data_mtu(unsigned int *mtu,
1318 int *maxfraglen,
1319 unsigned int fragheaderlen,
1320 struct sk_buff *skb,
1321 struct rt6_info *rt,
1322 unsigned int orig_mtu)
1323 {
1324 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1325 if (!skb) {
1326 /* first fragment, reserve header_len */
1327 *mtu = orig_mtu - rt->dst.header_len;
1328
1329 } else {
1330 /*
1331 * this fragment is not first, the headers
1332 * space is regarded as data space.
1333 */
1334 *mtu = orig_mtu;
1335 }
1336 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1337 + fragheaderlen - sizeof(struct frag_hdr);
1338 }
1339 }
1340
ip6_setup_cork(struct sock * sk,struct inet_cork_full * cork,struct ipcm6_cookie * ipc6,struct rt6_info * rt)1341 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1342 struct ipcm6_cookie *ipc6,
1343 struct rt6_info *rt)
1344 {
1345 struct ipv6_txoptions *nopt, *opt = ipc6->opt;
1346 struct inet6_cork *v6_cork = &cork->base6;
1347 struct ipv6_pinfo *np = inet6_sk(sk);
1348 unsigned int mtu, frag_size;
1349
1350 /* callers pass dst together with a reference, set it first so
1351 * ip6_cork_release() can put it down even in case of an error.
1352 */
1353 cork->base.dst = &rt->dst;
1354
1355 /*
1356 * setup for corking
1357 */
1358 if (unlikely(opt)) {
1359 if (WARN_ON(v6_cork->opt))
1360 return -EINVAL;
1361
1362 nopt = v6_cork->opt = kzalloc_obj(*opt, sk->sk_allocation);
1363 if (unlikely(!nopt))
1364 return -ENOBUFS;
1365
1366 nopt->tot_len = sizeof(*opt);
1367 nopt->opt_flen = opt->opt_flen;
1368 nopt->opt_nflen = opt->opt_nflen;
1369
1370 nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
1371 if (opt->dst0opt && !nopt->dst0opt)
1372 return -ENOBUFS;
1373
1374 nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
1375 if (opt->dst1opt && !nopt->dst1opt)
1376 return -ENOBUFS;
1377
1378 nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
1379 if (opt->hopopt && !nopt->hopopt)
1380 return -ENOBUFS;
1381
1382 nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
1383 if (opt->srcrt && !nopt->srcrt)
1384 return -ENOBUFS;
1385
1386 /* need source address above miyazawa*/
1387 }
1388 v6_cork->hop_limit = ipc6->hlimit;
1389 v6_cork->tclass = ipc6->tclass;
1390 v6_cork->dontfrag = ipc6->dontfrag;
1391 if (rt->dst.flags & DST_XFRM_TUNNEL)
1392 mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
1393 READ_ONCE(rt->dst.dev->mtu) : dst6_mtu(&rt->dst);
1394 else
1395 mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
1396 READ_ONCE(rt->dst.dev->mtu) : dst6_mtu(xfrm_dst_path(&rt->dst));
1397
1398 frag_size = READ_ONCE(np->frag_size);
1399 if (frag_size && frag_size < mtu)
1400 mtu = frag_size;
1401
1402 cork->base.fragsize = mtu;
1403 cork->base.gso_size = ipc6->gso_size;
1404 cork->base.tx_flags = 0;
1405 cork->base.mark = ipc6->sockc.mark;
1406 cork->base.priority = ipc6->sockc.priority;
1407 sock_tx_timestamp(sk, &ipc6->sockc, &cork->base.tx_flags);
1408 if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) {
1409 cork->base.flags |= IPCORK_TS_OPT_ID;
1410 cork->base.ts_opt_id = ipc6->sockc.ts_opt_id;
1411 }
1412 cork->base.length = 0;
1413 cork->base.transmit_time = ipc6->sockc.transmit_time;
1414
1415 return 0;
1416 }
1417
/*
 * Append @length bytes, fetched through @getfrag from @from, to the skbs
 * pending on @queue.  New skbs are allocated whenever the tail skb has no
 * room left.
 *
 * @sk:          sending socket
 * @queue:       list of pending skbs for the datagram being built
 * @cork_full:   cork state (route, options, fragment size, flow, ...)
 * @pfrag:       page fragment used for paged copies
 * @getfrag:     callback copying @len bytes at @offset of @from to @to
 * @from:        opaque source passed to @getfrag; treated as a
 *               struct msghdr on the MSG_ZEROCOPY and MSG_SPLICE_PAGES
 *               paths
 * @length:      number of bytes to append, including @transhdrlen
 * @transhdrlen: header bytes reserved at the start of the first skb and
 *               not filled by @getfrag; ip6_append_data() passes 0 when
 *               the queue already holds data
 * @flags:       MSG_* flags
 *
 * Returns 0 on success or a negative errno.  The -EMSGSIZE checks made
 * before cork->length is touched report the error with
 * ipv6_local_error().  Failures that jump to the "error" label undo the
 * part of cork->length that was not appended, count
 * IPSTATS_MIB_OUTDISCARDS and give back the timestamp key if one was
 * taken here.
 */
static int __ip6_append_data(struct sock *sk,
			     struct sk_buff_head *queue,
			     struct inet_cork_full *cork_full,
			     struct page_frag *pfrag,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     unsigned int flags)
{
	unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
	struct inet6_cork *v6_cork = &cork_full->base6;
	struct inet_cork *cork = &cork_full->base;
	struct flowi6 *fl6 = &cork_full->fl.u.ip6;
	struct sk_buff *skb, *skb_prev = NULL;
	struct ubuf_info *uarg = NULL;
	int exthdrlen = 0;
	int dst_exthdrlen = 0;
	int hh_len;
	int copy;
	int err;
	int offset = 0;
	bool zc = false;
	u32 tskey = 0;
	struct rt6_info *rt = dst_rt6_info(cork->dst);
	bool paged, hold_tskey = false, extra_uref = false;
	struct ipv6_txoptions *opt = v6_cork->opt;
	int csummode = CHECKSUM_NONE;
	unsigned int maxnonfragsize, headersize;
	unsigned int wmem_alloc_delta = 0;

	/* Extension header room is only accounted for on the very first skb. */
	skb = skb_peek_tail(queue);
	if (!skb) {
		exthdrlen = opt ? opt->opt_flen : 0;
		dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
	}

	/* With GSO the skb may grow up to IP6_MAX_MTU and uses paged data. */
	paged = !!cork->gso_size;
	mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
	orig_mtu = mtu;

	hh_len = LL_RESERVED_SPACE(rt->dst.dev);

	/* Headers repeated in every fragment. */
	fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
			(opt ? opt->opt_nflen : 0);

	/* All headers of the first fragment. */
	headersize = sizeof(struct ipv6hdr) +
		     (opt ? opt->opt_flen + opt->opt_nflen : 0) +
		     rt->rt6i_nfheader_len;

	/* Guard the unsigned maxfraglen computation below against underflow. */
	if (mtu <= fragheaderlen ||
	    ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
		goto emsgsize;

	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
		     sizeof(struct frag_hdr);

	/* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
	 * the first fragment
	 */
	if (headersize + transhdrlen > mtu)
		goto emsgsize;

	/* dontfrag: refuse oversized datagrams on these protocols and
	 * report the usable size through ipv6_local_rxpmtu().
	 */
	if (cork->length + length > mtu - headersize && v6_cork->dontfrag &&
	    (sk->sk_protocol == IPPROTO_UDP ||
	     sk->sk_protocol == IPPROTO_ICMPV6 ||
	     sk->sk_protocol == IPPROTO_RAW)) {
		ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
				sizeof(struct ipv6hdr));
		goto emsgsize;
	}

	if (ip6_sk_ignore_df(sk))
		maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
	else
		maxnonfragsize = mtu;

	if (cork->length + length > maxnonfragsize - headersize) {
emsgsize:
		/* Clamp at 0: the subtraction may go negative as an int. */
		pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
		ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
		return -EMSGSIZE;
	}

	/* CHECKSUM_PARTIAL only with no extension headers and when
	 * we are not going to fragment
	 */
	if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
	    headersize == sizeof(struct ipv6hdr) &&
	    length <= mtu - headersize &&
	    (!(flags & MSG_MORE) || cork->gso_size) &&
	    rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
		csummode = CHECKSUM_PARTIAL;

	if ((flags & MSG_ZEROCOPY) && length) {
		struct msghdr *msg = from;

		if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
			/* Caller-supplied ubuf must match the one on the tail skb. */
			if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
				return -EINVAL;

			/* Leave uarg NULL if can't zerocopy, callers should
			 * be able to handle it.
			 */
			if ((rt->dst.dev->features & NETIF_F_SG) &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
				uarg = msg->msg_ubuf;
			}
		} else if (sock_flag(sk, SOCK_ZEROCOPY)) {
			uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb),
						    false);
			if (!uarg)
				return -ENOBUFS;
			extra_uref = !skb_zcopy(skb);	/* only ref on new uarg */
			if (rt->dst.dev->features & NETIF_F_SG &&
			    csummode == CHECKSUM_PARTIAL) {
				paged = true;
				zc = true;
			} else {
				/* Data will be copied: clear the zerocopy mark. */
				uarg_to_msgzc(uarg)->zerocopy = 0;
				skb_zcopy_set(skb, uarg, &extra_uref);
			}
		}
	} else if ((flags & MSG_SPLICE_PAGES) && length) {
		if (inet_test_bit(HDRINCL, sk))
			return -EPERM;
		if (rt->dst.dev->features & NETIF_F_SG &&
		    getfrag == ip_generic_getfrag)
			/* We need an empty buffer to attach stuff to */
			paged = true;
		else
			flags &= ~MSG_SPLICE_PAGES;
	}

	/* Pick the timestamp key: either the one supplied through the cork
	 * or the next value of the per-socket counter (undone on error).
	 */
	if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
	    READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
		if (cork->flags & IPCORK_TS_OPT_ID) {
			tskey = cork->ts_opt_id;
		} else {
			tskey = atomic_inc_return(&sk->sk_tskey) - 1;
			hold_tskey = true;
		}
	}

	/*
	 * Let's try using as much space as possible.
	 * Use MTU if total length of the message fits into the MTU.
	 * Otherwise, we need to reserve fragment header and
	 * fragment alignment (= 8-15 octets, in total).
	 *
	 * Note that we may need to "move" the data from the tail
	 * of the buffer to the new fragment when we split
	 * the message.
	 *
	 * FIXME: It may be fragmented into multiple chunks
	 *        at once if non-fragmentable extension headers
	 *        are too large.
	 * --yoshfuji
	 */

	/* Accounted up front; the error path subtracts what was not appended. */
	cork->length += length;
	if (!skb)
		goto alloc_new_skb;

	while (length > 0) {
		/* Check if the remaining data fits into current packet. */
		copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len;
		if (copy < length)
			copy = maxfraglen - skb->len;

		if (copy <= 0) {
			char *data;
			unsigned int datalen;
			unsigned int fraglen;
			unsigned int fraggap;
			unsigned int alloclen, alloc_extra;
			unsigned int pagedlen;
alloc_new_skb:
			/* There's no room in the current skb */
			if (skb)
				/* Bytes of the tail skb beyond maxfraglen are
				 * moved into the new skb further down.
				 */
				fraggap = skb->len - maxfraglen;
			else
				fraggap = 0;
			/* update mtu and maxfraglen if necessary */
			if (!skb || !skb_prev)
				ip6_append_data_mtu(&mtu, &maxfraglen,
						    fragheaderlen, skb, rt,
						    orig_mtu);

			skb_prev = skb;

			/*
			 * If remaining data exceeds the mtu,
			 * we know we need more fragment(s).
			 */
			datalen = length + fraggap;

			if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen)
				datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
			fraglen = datalen + fragheaderlen;
			pagedlen = 0;

			alloc_extra = hh_len;
			alloc_extra += dst_exthdrlen;
			alloc_extra += rt->dst.trailer_len;

			/* We just reserve space for fragment header.
			 * Note: this may be overallocation if the message
			 * (without MSG_MORE) fits into the MTU.
			 */
			alloc_extra += sizeof(struct frag_hdr);

			/* Choose the linear allocation size: a full mtu when
			 * more data is expected and the device cannot do
			 * scatter-gather, the whole fragment when it is not
			 * paged, or just the headers with the rest paged.
			 */
			if ((flags & MSG_MORE) &&
			    !(rt->dst.dev->features&NETIF_F_SG))
				alloclen = mtu;
			else if (!paged &&
				 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
				  !(rt->dst.dev->features & NETIF_F_SG)))
				alloclen = fraglen;
			else {
				alloclen = fragheaderlen + transhdrlen;
				pagedlen = datalen - transhdrlen;
			}
			alloclen += alloc_extra;

			if (datalen != length + fraggap) {
				/*
				 * this is not the last fragment, the trailer
				 * space is regarded as data space.
				 */
				datalen += rt->dst.trailer_len;
			}

			fraglen = datalen + fragheaderlen;

			copy = datalen - transhdrlen - fraggap - pagedlen;
			/* [!] NOTE: copy may be negative if pagedlen>0
			 * because then the equation may reduce to -fraggap.
			 */
			if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
				err = -EINVAL;
				goto error;
			}
			if (transhdrlen) {
				skb = sock_alloc_send_skb(sk, alloclen,
						(flags & MSG_DONTWAIT), &err);
			} else {
				/* Later skbs: plain allocation, refused once the
				 * socket's write memory (including what this
				 * call added) exceeds twice sk_sndbuf.
				 */
				skb = NULL;
				if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
				    2 * sk->sk_sndbuf)
					skb = alloc_skb(alloclen,
							sk->sk_allocation);
				if (unlikely(!skb))
					err = -ENOBUFS;
			}
			if (!skb)
				goto error;
			/*
			 *	Fill in the control structures
			 */
			skb->protocol = htons(ETH_P_IPV6);
			skb->ip_summed = csummode;
			skb->csum = 0;
			/* reserve for fragmentation and ipsec header */
			skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
				    dst_exthdrlen);

			/*
			 *	Find where to start putting bytes
			 */
			data = skb_put(skb, fraglen - pagedlen);
			skb_set_network_header(skb, exthdrlen);
			data += fragheaderlen;
			skb->transport_header = (skb->network_header +
						 fragheaderlen);
			if (fraggap) {
				/* Move the overhang of the previous skb into
				 * this one, fix both checksums and trim the
				 * previous skb back to maxfraglen.
				 */
				skb->csum = skb_copy_and_csum_bits(
					skb_prev, maxfraglen,
					data + transhdrlen, fraggap);
				skb_prev->csum = csum_sub(skb_prev->csum,
							  skb->csum);
				data += fraggap;
				pskb_trim_unique(skb_prev, maxfraglen);
			}
			if (copy > 0 &&
			    INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
					   from, data + transhdrlen, offset,
					   copy, fraggap, skb) < 0) {
				err = -EFAULT;
				kfree_skb(skb);
				goto error;
			} else if (flags & MSG_SPLICE_PAGES) {
				/* Splice mode: do not count these bytes as
				 * consumed in the bookkeeping below.
				 */
				copy = 0;
			}

			offset += copy;
			length -= copy + transhdrlen;
			/* The remaining skbs carry no transport/extension header room. */
			transhdrlen = 0;
			exthdrlen = 0;
			dst_exthdrlen = 0;

			/* Only the initial fragment is time stamped */
			skb_shinfo(skb)->tx_flags = cork->tx_flags;
			cork->tx_flags = 0;
			skb_shinfo(skb)->tskey = tskey;
			tskey = 0;
			skb_zcopy_set(skb, uarg, &extra_uref);

			if ((flags & MSG_CONFIRM) && !skb_prev)
				skb_set_dst_pending_confirm(skb, 1);

			/*
			 * Put the packet on the pending queue
			 */
			if (!skb->destructor) {
				/* Charge the skb to the socket; the refcount
				 * itself is updated in one go on exit.
				 */
				skb->destructor = sock_wfree;
				skb->sk = sk;
				wmem_alloc_delta += skb->truesize;
			}
			__skb_queue_tail(queue, skb);
			continue;
		}

		if (copy > length)
			copy = length;

		if (!(rt->dst.dev->features&NETIF_F_SG) &&
		    skb_tailroom(skb) >= copy) {
			/* No scatter-gather: copy into the linear tailroom. */
			unsigned int off;

			off = skb->len;
			if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
					    from, skb_put(skb, copy),
					    offset, copy, off, skb) < 0) {
				/* Undo the skb_put() above. */
				__skb_trim(skb, off);
				err = -EFAULT;
				goto error;
			}
		} else if (flags & MSG_SPLICE_PAGES) {
			struct msghdr *msg = from;

			err = -EIO;
			if (WARN_ON_ONCE(copy > msg->msg_iter.count))
				goto error;

			err = skb_splice_from_iter(skb, &msg->msg_iter, copy);
			if (err < 0)
				goto error;
			/* The splice may have attached fewer bytes than asked. */
			copy = err;
			wmem_alloc_delta += copy;
		} else if (!zc) {
			/* Copy into the page fragment and attach it as a frag. */
			int i = skb_shinfo(skb)->nr_frags;

			err = -ENOMEM;
			if (!sk_page_frag_refill(sk, pfrag))
				goto error;

			skb_zcopy_downgrade_managed(skb);
			if (!skb_can_coalesce(skb, i, pfrag->page,
					      pfrag->offset)) {
				err = -EMSGSIZE;
				if (i == MAX_SKB_FRAGS)
					goto error;

				/* Start a new, initially empty frag slot. */
				__skb_fill_page_desc(skb, i, pfrag->page,
						     pfrag->offset, 0);
				skb_shinfo(skb)->nr_frags = ++i;
				get_page(pfrag->page);
			}
			copy = min_t(int, copy, pfrag->size - pfrag->offset);
			if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
				    from,
				    page_address(pfrag->page) + pfrag->offset,
				    offset, copy, skb->len, skb) < 0)
				goto error_efault;

			pfrag->offset += copy;
			skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
			skb->len += copy;
			skb->data_len += copy;
			skb->truesize += copy;
			wmem_alloc_delta += copy;
		} else {
			err = skb_zerocopy_iter_dgram(skb, from, copy);
			if (err < 0)
				goto error;
		}
		offset += copy;
		length -= copy;
	}

	if (wmem_alloc_delta)
		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	return 0;

error_efault:
	err = -EFAULT;
error:
	net_zcopy_put_abort(uarg, extra_uref);
	/* Only the bytes actually queued stay accounted in the cork. */
	cork->length -= length;
	IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
	/* skbs already queued remain charged to the socket. */
	refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
	if (hold_tskey)
		atomic_dec(&sk->sk_tskey);
	return err;
}
1825
/*
 * Append data to the datagram being built on sk->sk_write_queue.
 *
 * The first call for a datagram (empty write queue) takes an extra
 * reference on @rt, sets up the socket cork from @ipc6 and @fl6, and
 * grows both @length and @transhdrlen by the fragmentable options length.
 * Later calls reuse the existing cork and pass a zero @transhdrlen.
 *
 * Returns 0 on success (also for MSG_PROBE, which appends nothing) or a
 * negative errno.
 */
int ip6_append_data(struct sock *sk,
		    int getfrag(void *from, char *to, int offset, int len,
				int odd, struct sk_buff *skb),
		    void *from, size_t length, int transhdrlen,
		    struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
		    struct rt6_info *rt, unsigned int flags)
{
	struct inet_sock *inet = inet_sk(sk);
	int exthdrlen;
	int err;

	if (flags & MSG_PROBE)
		return 0;

	if (!skb_queue_empty(&sk->sk_write_queue)) {
		/* Data is already pending: no transport header this time. */
		transhdrlen = 0;
	} else {
		/* First chunk: set up the cork, which keeps its own dst ref. */
		dst_hold(&rt->dst);
		err = ip6_setup_cork(sk, &inet->cork, ipc6, rt);
		if (err)
			return err;

		inet->cork.fl.u.ip6 = *fl6;

		exthdrlen = ipc6->opt ? ipc6->opt->opt_flen : 0;
		length += exthdrlen;
		transhdrlen += exthdrlen;
	}

	return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
				 sk_page_frag(sk), getfrag,
				 from, length, transhdrlen, flags);
}
EXPORT_SYMBOL_GPL(ip6_append_data);
1862
ip6_cork_steal_dst(struct sk_buff * skb,struct inet_cork_full * cork)1863 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1864 {
1865 struct dst_entry *dst = cork->base.dst;
1866
1867 cork->base.dst = NULL;
1868 skb_dst_set(skb, dst);
1869 }
1870
ip6_cork_release(struct inet_cork_full * cork)1871 static void ip6_cork_release(struct inet_cork_full *cork)
1872 {
1873 struct inet6_cork *v6_cork = &cork->base6;
1874
1875 if (unlikely(v6_cork->opt)) {
1876 struct ipv6_txoptions *opt = v6_cork->opt;
1877
1878 kfree(opt->dst0opt);
1879 kfree(opt->dst1opt);
1880 kfree(opt->hopopt);
1881 kfree(opt->srcrt);
1882 kfree(opt);
1883 v6_cork->opt = NULL;
1884 }
1885
1886 if (cork->base.dst) {
1887 dst_release(cork->base.dst);
1888 cork->base.dst = NULL;
1889 }
1890 }
1891
/*
 * Turn the skbs pending on @queue into one IPv6 packet.
 *
 * The first queued skb becomes the head.  Every further skb is chained
 * onto the head's frag_list with its network header pulled off.  The
 * extension headers from the cork options and the IPv6 header are then
 * pushed in front of the head, the cork's dst is moved to the skb and the
 * cork is released.
 *
 * Returns the finished skb, or NULL if @queue was empty.  In the NULL
 * case the cork is left untouched.
 */
struct sk_buff *__ip6_make_skb(struct sock *sk,
			       struct sk_buff_head *queue,
			       struct inet_cork_full *cork)
{
	struct sk_buff *skb, *tmp_skb;
	struct sk_buff **tail_skb;
	struct in6_addr *final_dst;
	struct net *net = sock_net(sk);
	struct ipv6hdr *hdr;
	struct ipv6_txoptions *opt;
	struct rt6_info *rt = dst_rt6_info(cork->base.dst);
	struct flowi6 *fl6 = &cork->fl.u.ip6;
	unsigned char proto = fl6->flowi6_proto;

	skb = __skb_dequeue(queue);
	if (!skb)
		goto out;
	tail_skb = &(skb_shinfo(skb)->frag_list);

	/* move skb->data to ip header from ext header */
	if (skb->data < skb_network_header(skb))
		__skb_pull(skb, skb_network_offset(skb));
	while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
		/* Strip the per-fragment header room, then link the skb in
		 * and fold its sizes into the head skb.
		 */
		__skb_pull(tmp_skb, skb_network_header_len(skb));
		*tail_skb = tmp_skb;
		tail_skb = &(tmp_skb->next);
		skb->len += tmp_skb->len;
		skb->data_len += tmp_skb->len;
		skb->truesize += tmp_skb->truesize;
		/* Its truesize is now part of the head skb's truesize. */
		tmp_skb->destructor = NULL;
		tmp_skb->sk = NULL;
	}

	/* Allow local fragmentation. */
	skb->ignore_df = ip6_sk_ignore_df(sk);
	__skb_pull(skb, skb_network_header_len(skb));

	/* ipv6_push_nfrag_opts() is given &final_dst and may redirect it. */
	final_dst = &fl6->daddr;
	opt = cork->base6.opt;
	if (unlikely(opt)) {
		if (opt->opt_flen)
			proto = ipv6_push_frag_opts(skb, opt, proto);
		if (opt->opt_nflen)
			proto = ipv6_push_nfrag_opts(skb, opt, proto,
						     &final_dst, &fl6->saddr);
	}
	skb_push(skb, sizeof(struct ipv6hdr));
	skb_reset_network_header(skb);
	hdr = ipv6_hdr(skb);

	ip6_flow_hdr(hdr, cork->base6.tclass,
		     ip6_make_flowlabel(net, skb, fl6->flowlabel,
					ip6_autoflowlabel(net, sk), fl6));
	hdr->hop_limit = cork->base6.hop_limit;
	hdr->nexthdr = proto;
	hdr->saddr = fl6->saddr;
	hdr->daddr = *final_dst;

	skb->priority = cork->base.priority;
	skb->mark = cork->base.mark;
	/* TCP sockets use the monotonic clock; others use the socket's clockid. */
	if (sk_is_tcp(sk))
		skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC);
	else
		skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid);

	/* The skb takes over the cork's dst; rt was read from it above. */
	ip6_cork_steal_dst(skb, cork);
	IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
	if (unlikely(proto == IPPROTO_ICMPV6)) {
		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
		u8 icmp6_type;

		/* Raw sockets without FLOWI_FLAG_KNOWN_NH take the type from
		 * the flow; all others read it from the ICMPv6 header.
		 */
		if (sk->sk_socket->type == SOCK_RAW &&
		   !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH))
			icmp6_type = fl6->fl6_icmp_type;
		else
			icmp6_type = icmp6_hdr(skb)->icmp6_type;
		ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
		ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
	}

	/* Frees the copied options; the dst was already handed to the skb. */
	ip6_cork_release(cork);
out:
	return skb;
}
1976
ip6_send_skb(struct sk_buff * skb)1977 int ip6_send_skb(struct sk_buff *skb)
1978 {
1979 struct net *net = sock_net(skb->sk);
1980 struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
1981 int err;
1982
1983 rcu_read_lock();
1984 err = ip6_local_out(net, skb->sk, skb);
1985 if (err) {
1986 if (err > 0)
1987 err = net_xmit_errno(err);
1988 if (err)
1989 IP6_INC_STATS(net, rt->rt6i_idev,
1990 IPSTATS_MIB_OUTDISCARDS);
1991 }
1992
1993 rcu_read_unlock();
1994 return err;
1995 }
1996
/*
 * Finish the datagram pending on @sk and send it.
 *
 * Returns 0 when ip6_finish_skb() yields no skb, otherwise the result of
 * ip6_send_skb().
 */
int ip6_push_pending_frames(struct sock *sk)
{
	struct sk_buff *skb = ip6_finish_skb(sk);

	return skb ? ip6_send_skb(skb) : 0;
}
EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
2008
__ip6_flush_pending_frames(struct sock * sk,struct sk_buff_head * queue,struct inet_cork_full * cork)2009 static void __ip6_flush_pending_frames(struct sock *sk,
2010 struct sk_buff_head *queue,
2011 struct inet_cork_full *cork)
2012 {
2013 struct sk_buff *skb;
2014
2015 while ((skb = __skb_dequeue_tail(queue)) != NULL) {
2016 if (skb_dst(skb))
2017 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
2018 IPSTATS_MIB_OUTDISCARDS);
2019 kfree_skb(skb);
2020 }
2021
2022 ip6_cork_release(cork);
2023 }
2024
ip6_flush_pending_frames(struct sock * sk)2025 void ip6_flush_pending_frames(struct sock *sk)
2026 {
2027 __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
2028 &inet_sk(sk)->cork);
2029 }
2030 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2031
/*
 * Build a complete IPv6 packet in one go, using a private skb queue and a
 * caller-provided @cork instead of the socket's write queue and cork.
 *
 * @sk:          sending socket
 * @getfrag:     callback that copies payload from @from
 * @from:        opaque payload source handed to @getfrag
 * @length:      payload length including @transhdrlen
 * @transhdrlen: transport header length
 * @ipc6:        per-call parameters (options, hop limit, tclass, ...)
 * @rt:          route; the caller's reference is consumed on every path
 *               (dropped for MSG_PROBE, otherwise stored in @cork)
 * @flags:       MSG_* flags
 * @cork:        scratch cork state, (re)initialised here
 *
 * The fragmentable options length is added to both @length and
 * @transhdrlen before the data is appended.
 *
 * Returns NULL for MSG_PROBE, an ERR_PTR() on failure, and otherwise
 * whatever __ip6_make_skb() returns.  On failure the cork is released,
 * and any skbs already built are freed.
 */
struct sk_buff *ip6_make_skb(struct sock *sk,
			     int getfrag(void *from, char *to, int offset,
					 int len, int odd, struct sk_buff *skb),
			     void *from, size_t length, int transhdrlen,
			     struct ipcm6_cookie *ipc6, struct rt6_info *rt,
			     unsigned int flags, struct inet_cork_full *cork)
{
	int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
	struct sk_buff_head queue;
	int err;

	if (flags & MSG_PROBE) {
		dst_release(&rt->dst);
		return NULL;
	}

	__skb_queue_head_init(&queue);

	/* Start from a clean cork; ip6_setup_cork() fills in the rest. */
	cork->base.flags = 0;
	cork->base.addr = 0;
	cork->base.opt = NULL;
	cork->base6.opt = NULL;
	err = ip6_setup_cork(sk, cork, ipc6, rt);
	if (err) {
		/* Drops the dst and any partially copied options. */
		ip6_cork_release(cork);
		return ERR_PTR(err);
	}

	/* Paged copies use the current task's page fragment. */
	err = __ip6_append_data(sk, &queue, cork,
				&current->task_frag, getfrag, from,
				length + exthdrlen, transhdrlen + exthdrlen,
				flags);
	if (err) {
		__ip6_flush_pending_frames(sk, &queue, cork);
		return ERR_PTR(err);
	}

	return __ip6_make_skb(sk, &queue, cork);
}
2071