1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3 * IPv6 output functions
4 * Linux INET6 implementation
5 *
6 * Authors:
7 * Pedro Roque <roque@di.fc.ul.pt>
8 *
9 * Based on linux/net/ipv4/ip_output.c
10 *
11 * Changes:
12 * A.N.Kuznetsov : arithmetic in fragmentation.
13 * extension headers are implemented.
14 * route changes now work.
15 * ip6_forward does not confuse sniffers.
16 * etc.
17 *
18 * H. von Brand : Added missing #include <linux/string.h>
19 * Imran Patel : frag id should be in NBO
20 * Kazunori MIYAZAWA @USAGI
21 * : add ip6_append_data and related functions
22 * for datagram xmit
23 */
24
25 #include <linux/errno.h>
26 #include <linux/kernel.h>
27 #include <linux/string.h>
28 #include <linux/socket.h>
29 #include <linux/net.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/in6.h>
33 #include <linux/tcp.h>
34 #include <linux/route.h>
35 #include <linux/module.h>
36 #include <linux/slab.h>
37
38 #include <linux/bpf-cgroup.h>
39 #include <linux/netfilter.h>
40 #include <linux/netfilter_ipv6.h>
41
42 #include <net/sock.h>
43 #include <net/snmp.h>
44
45 #include <net/gso.h>
46 #include <net/ipv6.h>
47 #include <net/ndisc.h>
48 #include <net/protocol.h>
49 #include <net/ip6_route.h>
50 #include <net/addrconf.h>
51 #include <net/rawv6.h>
52 #include <net/icmp.h>
53 #include <net/xfrm.h>
54 #include <net/checksum.h>
55 #include <linux/mroute6.h>
56 #include <net/l3mdev.h>
57 #include <net/lwtunnel.h>
58 #include <net/ip_tunnels.h>
59
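/* Final step of the IPv6 output path: resolve (or create) the neighbour
 * entry for the route's next hop and hand the skb to neigh_output().
 * Multicast packets may additionally be looped back to local listeners
 * via dev_loopback_xmit(), and a lightweight tunnel state on the dst can
 * take over transmission entirely.
 */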
60 static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
61 {
62 struct dst_entry *dst = skb_dst(skb);
63 struct net_device *dev = dst_dev(dst);
64 struct inet6_dev *idev = ip6_dst_idev(dst);
65 unsigned int hh_len = LL_RESERVED_SPACE(dev);
66 const struct in6_addr *daddr, *nexthop;
67 struct ipv6hdr *hdr;
68 struct neighbour *neigh;
69 int ret;
70
71 /* Be paranoid, rather than too clever. */
72 if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
73 /* Make sure idev stays alive */
74 rcu_read_lock();
75 skb = skb_expand_head(skb, hh_len);
76 if (!skb) {
77 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
78 rcu_read_unlock();
79 return -ENOMEM;
80 }
81 rcu_read_unlock();
82 }
83
84 hdr = ipv6_hdr(skb);
85 daddr = &hdr->daddr;
86 if (ipv6_addr_is_multicast(daddr)) {
87 if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
88 ((mroute6_is_socket(net, skb) &&
89 !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
90 ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
91 struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
92
93 /* Do not check for IFF_ALLMULTI; multicast routing
94 is not supported in any case.
95 */
96 if (newskb)
97 NF_HOOK(NFPROTO_IPV6, NF_INET_POST_ROUTING,
98 net, sk, newskb, NULL, newskb->dev,
99 dev_loopback_xmit);
100
101 if (hdr->hop_limit == 0) {
102 IP6_INC_STATS(net, idev,
103 IPSTATS_MIB_OUTDISCARDS);
104 kfree_skb(skb);
105 return 0;
106 }
107 }
108
109 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
110 if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
111 !(dev->flags & IFF_LOOPBACK)) {
112 kfree_skb(skb);
113 return 0;
114 }
115 }
116
117 if (lwtunnel_xmit_redirect(dst->lwtstate)) {
118 int res = lwtunnel_xmit(skb);
119
120 if (res != LWTUNNEL_XMIT_CONTINUE)
121 return res;
122 }
123
124 IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
125
126 rcu_read_lock();
127 nexthop = rt6_nexthop(dst_rt6_info(dst), daddr);
128 neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
129
130 if (IS_ERR_OR_NULL(neigh)) {
131 if (unlikely(!neigh))
132 neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
133 if (IS_ERR(neigh)) {
134 rcu_read_unlock();
135 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
136 kfree_skb_reason(skb, SKB_DROP_REASON_NEIGH_CREATEFAIL);
137 return -EINVAL;
138 }
139 }
140 sock_confirm_neigh(skb, neigh);
141 ret = neigh_output(neigh, skb, false);
142 rcu_read_unlock();
143 return ret;
144 }
145
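/* Software-segment a GSO skb whose segments do not fit the egress MTU,
 * then send each segment individually, fragmenting any that still exceed
 * the MTU. All segments are attempted; the first error seen is returned.
 */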
146 static int
147 ip6_finish_output_gso_slowpath_drop(struct net *net, struct sock *sk,
148 struct sk_buff *skb, unsigned int mtu)
149 {
150 struct sk_buff *segs, *nskb;
151 netdev_features_t features;
152 int ret = 0;
153
154 /* Please see corresponding comment in ip_finish_output_gso
155 * describing the cases where GSO segment length exceeds the
156 * egress MTU.
157 */
158 features = netif_skb_features(skb);
159 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
160 if (IS_ERR_OR_NULL(segs)) {
161 kfree_skb(skb);
162 return -ENOMEM;
163 }
164
165 consume_skb(skb);
166
167 skb_list_walk_safe(segs, segs, nskb) {
168 int err;
169
170 skb_mark_not_on_list(segs);
171 /* Last GSO segment can be smaller than gso_size (and MTU).
172 * Adding a fragment header would produce an "atomic fragment",
173 * which is considered harmful (RFC-8021). Avoid that.
174 */
175 err = segs->len > mtu ?
176 ip6_fragment(net, sk, segs, ip6_finish_output2) :
177 ip6_finish_output2(net, sk, segs);
178 if (err && ret == 0)
179 ret = err;
180 }
181
182 return ret;
183 }
184
185 static int ip6_finish_output_gso(struct net *net, struct sock *sk,
186 struct sk_buff *skb, unsigned int mtu)
187 {
188 if (!(IP6CB(skb)->flags & IP6SKB_FAKEJUMBO) &&
189 !skb_gso_validate_network_len(skb, mtu))
190 return ip6_finish_output_gso_slowpath_drop(net, sk, skb, mtu);
191
192 return ip6_finish_output2(net, sk, skb);
193 }
194
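/* Pick the transmit path: re-enter dst_output() when a post-SNAT xfrm
 * policy rerouted the packet, use the GSO path for GSO skbs, fragment
 * oversized packets, and send everything else straight to
 * ip6_finish_output2().
 */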
195 static int __ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
196 {
197 unsigned int mtu;
198
199 #if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
200 /* Policy lookup after SNAT yielded a new policy */
201 if (skb_dst(skb)->xfrm) {
202 IP6CB(skb)->flags |= IP6SKB_REROUTED;
203 return dst_output(net, sk, skb);
204 }
205 #endif
206
207 mtu = ip6_skb_dst_mtu(skb);
208 if (skb_is_gso(skb))
209 return ip6_finish_output_gso(net, sk, skb, mtu);
210
211 if (skb->len > mtu ||
212 (IP6CB(skb)->frag_max_size && skb->len > IP6CB(skb)->frag_max_size))
213 return ip6_fragment(net, sk, skb, ip6_finish_output2);
214
215 return ip6_finish_output2(net, sk, skb);
216 }
217
218 static int ip6_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
219 {
220 int ret;
221
222 ret = BPF_CGROUP_RUN_PROG_INET_EGRESS(sk, skb);
223 switch (ret) {
224 case NET_XMIT_SUCCESS:
225 case NET_XMIT_CN:
226 return __ip6_finish_output(net, sk, skb) ? : ret;
227 default:
228 kfree_skb_reason(skb, SKB_DROP_REASON_BPF_CGROUP_EGRESS);
229 return ret;
230 }
231 }
232
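/* dst->output entry point for locally generated and forwarded IPv6
 * packets: drop if IPv6 is disabled on the egress device, then run the
 * NF_INET_POST_ROUTING hook (unless the packet was already rerouted) and
 * finish via ip6_finish_output().
 */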
233 int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb)
234 {
235 struct dst_entry *dst = skb_dst(skb);
236 struct net_device *dev = dst_dev(dst), *indev = skb->dev;
237 struct inet6_dev *idev = ip6_dst_idev(dst);
238
239 skb->protocol = htons(ETH_P_IPV6);
240 skb->dev = dev;
241
242 if (unlikely(!idev || READ_ONCE(idev->cnf.disable_ipv6))) {
243 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
244 kfree_skb_reason(skb, SKB_DROP_REASON_IPV6DISABLED);
245 return 0;
246 }
247
248 return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING,
249 net, sk, skb, indev, dev,
250 ip6_finish_output,
251 !(IP6CB(skb)->flags & IP6SKB_REROUTED));
252 }
253 EXPORT_SYMBOL(ip6_output);
254
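/* Whether an automatic flow label should be generated for this socket:
 * an explicit IPV6_AUTOFLOWLABEL setsockopt wins, otherwise the
 * net.ipv6.auto_flowlabels sysctl default applies.
 */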
255 bool ip6_autoflowlabel(struct net *net, const struct sock *sk)
256 {
257 if (!inet6_test_bit(AUTOFLOWLABEL_SET, sk))
258 return ip6_default_np_autolabel(net);
259 return inet6_test_bit(AUTOFLOWLABEL, sk);
260 }
261
262 /*
263 * xmit an sk_buff (used by TCP and SCTP)
264 * Note: the socket lock is not held for SYNACK packets, but the skb might
265 * be modified by calls to skb_set_owner_w() and ipv6_local_error(),
266 * which use proper atomic operations or spinlocks.
267 */
268 int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
269 __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority)
270 {
271 struct net *net = sock_net(sk);
272 const struct ipv6_pinfo *np = inet6_sk(sk);
273 struct in6_addr *first_hop = &fl6->daddr;
274 struct dst_entry *dst = skb_dst(skb);
275 struct net_device *dev = dst_dev(dst);
276 struct inet6_dev *idev = ip6_dst_idev(dst);
277 struct hop_jumbo_hdr *hop_jumbo;
278 int hoplen = sizeof(*hop_jumbo);
279 unsigned int head_room;
280 struct ipv6hdr *hdr;
281 u8 proto = fl6->flowi6_proto;
282 int seg_len = skb->len;
283 int hlimit = -1;
284 u32 mtu;
285
286 head_room = sizeof(struct ipv6hdr) + hoplen + LL_RESERVED_SPACE(dev);
287 if (opt)
288 head_room += opt->opt_nflen + opt->opt_flen;
289
290 if (unlikely(head_room > skb_headroom(skb))) {
291 /* Make sure idev stays alive */
292 rcu_read_lock();
293 skb = skb_expand_head(skb, head_room);
294 if (!skb) {
295 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
296 rcu_read_unlock();
297 return -ENOBUFS;
298 }
299 rcu_read_unlock();
300 }
301
302 if (opt) {
303 seg_len += opt->opt_nflen + opt->opt_flen;
304
305 if (opt->opt_flen)
306 ipv6_push_frag_opts(skb, opt, &proto);
307
308 if (opt->opt_nflen)
309 ipv6_push_nfrag_opts(skb, opt, &proto, &first_hop,
310 &fl6->saddr);
311 }
312
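/* Payloads larger than IPV6_MAXPLEN (64 KiB - 1) cannot be expressed in
 * the 16-bit payload_len field. In that case a Hop-by-Hop jumbo option
 * (RFC 2675) carrying the real length is prepended, payload_len is set to
 * zero, and the skb is flagged IP6SKB_FAKEJUMBO so later stages know the
 * jumbogram is only an internal construct (this is how IPv6 BIG TCP
 * packets leave the stack).
 */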
313 if (unlikely(seg_len > IPV6_MAXPLEN)) {
314 hop_jumbo = skb_push(skb, hoplen);
315
316 hop_jumbo->nexthdr = proto;
317 hop_jumbo->hdrlen = 0;
318 hop_jumbo->tlv_type = IPV6_TLV_JUMBO;
319 hop_jumbo->tlv_len = 4;
320 hop_jumbo->jumbo_payload_len = htonl(seg_len + hoplen);
321
322 proto = IPPROTO_HOPOPTS;
323 seg_len = 0;
324 IP6CB(skb)->flags |= IP6SKB_FAKEJUMBO;
325 }
326
327 skb_push(skb, sizeof(struct ipv6hdr));
328 skb_reset_network_header(skb);
329 hdr = ipv6_hdr(skb);
330
331 /*
332 * Fill in the IPv6 header
333 */
334 if (np)
335 hlimit = READ_ONCE(np->hop_limit);
336 if (hlimit < 0)
337 hlimit = ip6_dst_hoplimit(dst);
338
339 ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
340 ip6_autoflowlabel(net, sk), fl6));
341
342 hdr->payload_len = htons(seg_len);
343 hdr->nexthdr = proto;
344 hdr->hop_limit = hlimit;
345
346 hdr->saddr = fl6->saddr;
347 hdr->daddr = *first_hop;
348
349 skb->protocol = htons(ETH_P_IPV6);
350 skb->priority = priority;
351 skb->mark = mark;
352
353 mtu = dst_mtu(dst);
354 if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
355 IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTREQUESTS);
356
357 /* if egress device is enslaved to an L3 master device pass the
358 * skb to its handler for processing
359 */
360 skb = l3mdev_ip6_out((struct sock *)sk, skb);
361 if (unlikely(!skb))
362 return 0;
363
364 /* hooks should never assume socket lock is held.
365 * we promote our socket to non const
366 */
367 return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
368 net, (struct sock *)sk, skb, NULL, dev,
369 dst_output);
370 }
371
372 skb->dev = dev;
373 /* ipv6_local_error() does not require socket lock,
374 * we promote our socket to non const
375 */
376 ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
377
378 IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
379 kfree_skb(skb);
380 return -EMSGSIZE;
381 }
382 EXPORT_SYMBOL(ip6_xmit);
383
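/* Deliver a Router Alert packet to every raw socket that registered for
 * this alert value via setsockopt(IPV6_ROUTER_ALERT). Returns 1 if the
 * skb was consumed by at least one socket, 0 otherwise.
 */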
384 static int ip6_call_ra_chain(struct sk_buff *skb, int sel)
385 {
386 struct ip6_ra_chain *ra;
387 struct sock *last = NULL;
388
389 read_lock(&ip6_ra_lock);
390 for (ra = ip6_ra_chain; ra; ra = ra->next) {
391 struct sock *sk = ra->sk;
392 if (sk && ra->sel == sel &&
393 (!sk->sk_bound_dev_if ||
394 sk->sk_bound_dev_if == skb->dev->ifindex)) {
395
396 if (inet6_test_bit(RTALERT_ISOLATE, sk) &&
397 !net_eq(sock_net(sk), dev_net(skb->dev))) {
398 continue;
399 }
400 if (last) {
401 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
402 if (skb2)
403 rawv6_rcv(last, skb2);
404 }
405 last = sk;
406 }
407 }
408
409 if (last) {
410 rawv6_rcv(last, skb);
411 read_unlock(&ip6_ra_lock);
412 return 1;
413 }
414 read_unlock(&ip6_ra_lock);
415 return 0;
416 }
417
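/* Decide what to do with a packet whose destination we only proxy NDP
 * for: returns 1 to deliver it locally (unicast neighbor discovery
 * messages), -1 to drop it (link-local destination), 0 to forward it.
 */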
418 static int ip6_forward_proxy_check(struct sk_buff *skb)
419 {
420 struct ipv6hdr *hdr = ipv6_hdr(skb);
421 u8 nexthdr = hdr->nexthdr;
422 __be16 frag_off;
423 int offset;
424
425 if (ipv6_ext_hdr(nexthdr)) {
426 offset = ipv6_skip_exthdr(skb, sizeof(*hdr), &nexthdr, &frag_off);
427 if (offset < 0)
428 return 0;
429 } else
430 offset = sizeof(struct ipv6hdr);
431
432 if (nexthdr == IPPROTO_ICMPV6) {
433 struct icmp6hdr *icmp6;
434
435 if (!pskb_may_pull(skb, (skb_network_header(skb) +
436 offset + 1 - skb->data)))
437 return 0;
438
439 icmp6 = (struct icmp6hdr *)(skb_network_header(skb) + offset);
440
441 switch (icmp6->icmp6_type) {
442 case NDISC_ROUTER_SOLICITATION:
443 case NDISC_ROUTER_ADVERTISEMENT:
444 case NDISC_NEIGHBOUR_SOLICITATION:
445 case NDISC_NEIGHBOUR_ADVERTISEMENT:
446 case NDISC_REDIRECT:
447 /* For unicast neighbor discovery messages destined
448 * to the proxied address, pass the packet to the
449 * input function.
450 */
451 return 1;
452 default:
453 break;
454 }
455 }
456
457 /*
458 * The proxying router can't forward traffic sent to a link-local
459 * address, so signal the sender and discard the packet. This
460 * behavior is clarified by the MIPv6 specification.
461 */
462 if (ipv6_addr_type(&hdr->daddr) & IPV6_ADDR_LINKLOCAL) {
463 dst_link_failure(skb);
464 return -1;
465 }
466
467 return 0;
468 }
469
470 static inline int ip6_forward_finish(struct net *net, struct sock *sk,
471 struct sk_buff *skb)
472 {
473 #ifdef CONFIG_NET_SWITCHDEV
474 if (skb->offload_l3_fwd_mark) {
475 consume_skb(skb);
476 return 0;
477 }
478 #endif
479
480 skb_clear_tstamp(skb);
481 return dst_output(net, sk, skb);
482 }
483
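/* Is the packet too big to forward without fragmenting? Conntrack-
 * defragmented packets are judged by their largest original fragment
 * (frag_max_size) and GSO packets by whether their segments fit the MTU.
 */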
484 static bool ip6_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
485 {
486 if (skb->len <= mtu)
487 return false;
488
489 /* ipv6 conntrack defrag sets max_frag_size + ignore_df */
490 if (IP6CB(skb)->frag_max_size && IP6CB(skb)->frag_max_size > mtu)
491 return true;
492
493 if (skb->ignore_df)
494 return false;
495
496 if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu))
497 return false;
498
499 return true;
500 }
501
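/* Forwarding path: check that forwarding is enabled, validate the hop
 * limit, honour Router Alert and NDP-proxy special cases, run the xfrm
 * forward policy, possibly emit an ICMPv6 redirect or "packet too big",
 * then decrement hop_limit and hand the packet to the NF_INET_FORWARD
 * hook and ip6_forward_finish().
 */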
502 int ip6_forward(struct sk_buff *skb)
503 {
504 struct dst_entry *dst = skb_dst(skb);
505 struct ipv6hdr *hdr = ipv6_hdr(skb);
506 struct inet6_skb_parm *opt = IP6CB(skb);
507 struct net *net = dev_net(dst_dev(dst));
508 struct net_device *dev;
509 struct inet6_dev *idev;
510 SKB_DR(reason);
511 u32 mtu;
512
513 idev = __in6_dev_get_safely(dev_get_by_index_rcu(net, IP6CB(skb)->iif));
514 if (!READ_ONCE(net->ipv6.devconf_all->forwarding) &&
515 (!idev || !READ_ONCE(idev->cnf.force_forwarding)))
516 goto error;
517
518 if (skb->pkt_type != PACKET_HOST)
519 goto drop;
520
521 if (unlikely(skb->sk))
522 goto drop;
523
524 if (skb_warn_if_lro(skb))
525 goto drop;
526
527 if (!READ_ONCE(net->ipv6.devconf_all->disable_policy) &&
528 (!idev || !READ_ONCE(idev->cnf.disable_policy)) &&
529 !xfrm6_policy_check(NULL, XFRM_POLICY_FWD, skb)) {
530 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
531 goto drop;
532 }
533
534 skb_forward_csum(skb);
535
536 /*
537 * We do NOT do any processing on
538 * RA packets; we push them to user level AS IS
539 * without any WARRANTY that the application will be able
540 * to interpret them. The reason is that we
541 * cannot do anything clever here.
542 *
543 * We are not the end node, so if the packet contains
544 * AH/ESP, we cannot do anything with it.
545 * Defragmentation would also be a mistake; RA packets
546 * cannot be fragmented, because there is no guarantee
547 * that different fragments will go along one path. --ANK
548 */
549 if (unlikely(opt->flags & IP6SKB_ROUTERALERT)) {
550 if (ip6_call_ra_chain(skb, ntohs(opt->ra)))
551 return 0;
552 }
553
554 /*
555 * check and decrement ttl
556 */
557 if (hdr->hop_limit <= 1) {
558 icmpv6_send(skb, ICMPV6_TIME_EXCEED, ICMPV6_EXC_HOPLIMIT, 0);
559 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INHDRERRORS);
560
561 kfree_skb_reason(skb, SKB_DROP_REASON_IP_INHDR);
562 return -ETIMEDOUT;
563 }
564
565 /* XXX: idev->cnf.proxy_ndp? */
566 if (READ_ONCE(net->ipv6.devconf_all->proxy_ndp) &&
567 pneigh_lookup(&nd_tbl, net, &hdr->daddr, skb->dev)) {
568 int proxied = ip6_forward_proxy_check(skb);
569 if (proxied > 0) {
570 /* It's tempting to decrease the hop limit
571 * here by 1, as we do at the end of the
572 * function too.
573 *
574 * But that would be incorrect, as proxying is
575 * not forwarding. The ip6_input function
576 * will handle this packet locally, and it
577 * depends on the hop limit being unchanged.
578 *
579 * One example is the NDP hop limit, that
580 * always has to stay 255, but other would be
581 * similar checks around RA packets, where the
582 * user can even change the desired limit.
583 */
584 return ip6_input(skb);
585 } else if (proxied < 0) {
586 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
587 goto drop;
588 }
589 }
590
591 if (!xfrm6_route_forward(skb)) {
592 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INDISCARDS);
593 SKB_DR_SET(reason, XFRM_POLICY);
594 goto drop;
595 }
596 dst = skb_dst(skb);
597 dev = dst_dev(dst);
598 /* IPv6 specs say nothing about it, but it is clear that we cannot
599 send redirects to source routed frames.
600 We don't send redirects to frames decapsulated from IPsec.
601 */
602 if (IP6CB(skb)->iif == dev->ifindex &&
603 opt->srcrt == 0 && !skb_sec_path(skb)) {
604 struct in6_addr *target = NULL;
605 struct inet_peer *peer;
606 struct rt6_info *rt;
607
608 /*
609 * incoming and outgoing devices are the same
610 * send a redirect.
611 */
612
613 rt = dst_rt6_info(dst);
614 if (rt->rt6i_flags & RTF_GATEWAY)
615 target = &rt->rt6i_gateway;
616 else
617 target = &hdr->daddr;
618
619 rcu_read_lock();
620 peer = inet_getpeer_v6(net->ipv6.peers, &hdr->daddr);
621
622 /* Limit redirects both by destination (here)
623 and by source (inside ndisc_send_redirect)
624 */
625 if (inet_peer_xrlim_allow(peer, 1*HZ))
626 ndisc_send_redirect(skb, target);
627 rcu_read_unlock();
628 } else {
629 int addrtype = ipv6_addr_type(&hdr->saddr);
630
631 /* This check is security critical. */
632 if (addrtype == IPV6_ADDR_ANY ||
633 addrtype & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LOOPBACK))
634 goto error;
635 if (addrtype & IPV6_ADDR_LINKLOCAL) {
636 icmpv6_send(skb, ICMPV6_DEST_UNREACH,
637 ICMPV6_NOT_NEIGHBOUR, 0);
638 goto error;
639 }
640 }
641
642 __IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
643
644 mtu = ip6_dst_mtu_maybe_forward(dst, true);
645 if (mtu < IPV6_MIN_MTU)
646 mtu = IPV6_MIN_MTU;
647
648 if (ip6_pkt_too_big(skb, mtu)) {
649 /* Again, force OUTPUT device used as source address */
650 skb->dev = dev;
651 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
652 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INTOOBIGERRORS);
653 __IP6_INC_STATS(net, ip6_dst_idev(dst),
654 IPSTATS_MIB_FRAGFAILS);
655 kfree_skb_reason(skb, SKB_DROP_REASON_PKT_TOO_BIG);
656 return -EMSGSIZE;
657 }
658
659 if (skb_cow(skb, dev->hard_header_len)) {
660 __IP6_INC_STATS(net, ip6_dst_idev(dst),
661 IPSTATS_MIB_OUTDISCARDS);
662 goto drop;
663 }
664
665 hdr = ipv6_hdr(skb);
666
667 /* Mangling hops number delayed to point after skb COW */
668
669 hdr->hop_limit--;
670
671 return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD,
672 net, NULL, skb, skb->dev, dev,
673 ip6_forward_finish);
674
675 error:
676 __IP6_INC_STATS(net, idev, IPSTATS_MIB_INADDRERRORS);
677 SKB_DR_SET(reason, IP_INADDRERRORS);
678 drop:
679 kfree_skb_reason(skb, reason);
680 return -EINVAL;
681 }
682
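/* Propagate per-packet metadata (dst, device, mark, priority, hash,
 * conntrack state, extensions, secmark) from the original skb to a newly
 * built fragment.
 */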
683 static void ip6_copy_metadata(struct sk_buff *to, struct sk_buff *from)
684 {
685 to->pkt_type = from->pkt_type;
686 to->priority = from->priority;
687 to->protocol = from->protocol;
688 skb_dst_drop(to);
689 skb_dst_set(to, dst_clone(skb_dst(from)));
690 to->dev = from->dev;
691 to->mark = from->mark;
692
693 skb_copy_hash(to, from);
694
695 #ifdef CONFIG_NET_SCHED
696 to->tc_index = from->tc_index;
697 #endif
698 nf_copy(to, from);
699 skb_ext_copy(to, from);
700 skb_copy_secmark(to, from);
701 }
702
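/* Fast-path fragmentation helpers: when the skb already carries a
 * frag_list of suitably sized buffers, those buffers are reused as the
 * fragments instead of copying. ip6_fraglist_init() turns the head skb
 * into the first fragment; ip6_fraglist_prepare() fixes up each
 * subsequent one.
 */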
703 int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr,
704 u8 nexthdr, __be32 frag_id,
705 struct ip6_fraglist_iter *iter)
706 {
707 unsigned int first_len;
708 struct frag_hdr *fh;
709
710 /* BUILD HEADER */
711 *prevhdr = NEXTHDR_FRAGMENT;
712 iter->tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
713 if (!iter->tmp_hdr)
714 return -ENOMEM;
715
716 iter->frag = skb_shinfo(skb)->frag_list;
717 skb_frag_list_init(skb);
718
719 iter->offset = 0;
720 iter->hlen = hlen;
721 iter->frag_id = frag_id;
722 iter->nexthdr = nexthdr;
723
724 __skb_pull(skb, hlen);
725 fh = __skb_push(skb, sizeof(struct frag_hdr));
726 __skb_push(skb, hlen);
727 skb_reset_network_header(skb);
728 memcpy(skb_network_header(skb), iter->tmp_hdr, hlen);
729
730 fh->nexthdr = nexthdr;
731 fh->reserved = 0;
732 fh->frag_off = htons(IP6_MF);
733 fh->identification = frag_id;
734
735 first_len = skb_pagelen(skb);
736 skb->data_len = first_len - skb_headlen(skb);
737 skb->len = first_len;
738 ipv6_hdr(skb)->payload_len = htons(first_len - sizeof(struct ipv6hdr));
739
740 return 0;
741 }
742 EXPORT_SYMBOL(ip6_fraglist_init);
743
744 void ip6_fraglist_prepare(struct sk_buff *skb,
745 struct ip6_fraglist_iter *iter)
746 {
747 struct sk_buff *frag = iter->frag;
748 unsigned int hlen = iter->hlen;
749 struct frag_hdr *fh;
750
751 frag->ip_summed = CHECKSUM_NONE;
752 skb_reset_transport_header(frag);
753 fh = __skb_push(frag, sizeof(struct frag_hdr));
754 __skb_push(frag, hlen);
755 skb_reset_network_header(frag);
756 memcpy(skb_network_header(frag), iter->tmp_hdr, hlen);
757 iter->offset += skb->len - hlen - sizeof(struct frag_hdr);
758 fh->nexthdr = iter->nexthdr;
759 fh->reserved = 0;
760 fh->frag_off = htons(iter->offset);
761 if (frag->next)
762 fh->frag_off |= htons(IP6_MF);
763 fh->identification = iter->frag_id;
764 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
765 ip6_copy_metadata(frag, skb);
766 }
767 EXPORT_SYMBOL(ip6_fraglist_prepare);
768
769 void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu,
770 unsigned short needed_tailroom, int hdr_room, u8 *prevhdr,
771 u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state)
772 {
773 state->prevhdr = prevhdr;
774 state->nexthdr = nexthdr;
775 state->frag_id = frag_id;
776
777 state->hlen = hlen;
778 state->mtu = mtu;
779
780 state->left = skb->len - hlen; /* Space per frame */
781 state->ptr = hlen; /* Where to start from */
782
783 state->hroom = hdr_room;
784 state->troom = needed_tailroom;
785
786 state->offset = 0;
787 }
788 EXPORT_SYMBOL(ip6_frag_init);
789
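/* Slow-path fragmentation: allocate a new skb for the next fragment, copy
 * the per-fragment header chain and up to state->mtu bytes of payload
 * into it, and fill in the fragment header. Returns the new fragment or
 * an ERR_PTR() on allocation failure.
 */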
790 struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state)
791 {
792 u8 *prevhdr = state->prevhdr, *fragnexthdr_offset;
793 struct sk_buff *frag;
794 struct frag_hdr *fh;
795 unsigned int len;
796
797 len = state->left;
798 /* IF: it doesn't fit, use 'mtu' - the data space left */
799 if (len > state->mtu)
800 len = state->mtu;
801 /* IF: we are not sending up to and including the packet end
802 then align the next start on an eight byte boundary */
803 if (len < state->left)
804 len &= ~7;
805
806 /* Allocate buffer */
807 frag = alloc_skb(len + state->hlen + sizeof(struct frag_hdr) +
808 state->hroom + state->troom, GFP_ATOMIC);
809 if (!frag)
810 return ERR_PTR(-ENOMEM);
811
812 /*
813 * Set up data on packet
814 */
815
816 ip6_copy_metadata(frag, skb);
817 skb_reserve(frag, state->hroom);
818 skb_put(frag, len + state->hlen + sizeof(struct frag_hdr));
819 skb_reset_network_header(frag);
820 fh = (struct frag_hdr *)(skb_network_header(frag) + state->hlen);
821 frag->transport_header = (frag->network_header + state->hlen +
822 sizeof(struct frag_hdr));
823
824 /*
825 * Charge the memory for the fragment to any owner
826 * it might possess
827 */
828 if (skb->sk)
829 skb_set_owner_w(frag, skb->sk);
830
831 /*
832 * Copy the packet header into the new buffer.
833 */
834 skb_copy_from_linear_data(skb, skb_network_header(frag), state->hlen);
835
836 fragnexthdr_offset = skb_network_header(frag);
837 fragnexthdr_offset += prevhdr - skb_network_header(skb);
838 *fragnexthdr_offset = NEXTHDR_FRAGMENT;
839
840 /*
841 * Build fragment header.
842 */
843 fh->nexthdr = state->nexthdr;
844 fh->reserved = 0;
845 fh->identification = state->frag_id;
846
847 /*
848 * Copy a block of the IP datagram.
849 */
850 BUG_ON(skb_copy_bits(skb, state->ptr, skb_transport_header(frag),
851 len));
852 state->left -= len;
853
854 fh->frag_off = htons(state->offset);
855 if (state->left > 0)
856 fh->frag_off |= htons(IP6_MF);
857 ipv6_hdr(frag)->payload_len = htons(frag->len - sizeof(struct ipv6hdr));
858
859 state->ptr += len;
860 state->offset += len;
861
862 return frag;
863 }
864 EXPORT_SYMBOL(ip6_frag_next);
865
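/* Fragment an IPv6 packet and feed each fragment to @output. The fast
 * path reuses an existing frag_list when its geometry already matches the
 * MTU; otherwise the slow path copies the payload into freshly allocated
 * fragments via ip6_frag_init()/ip6_frag_next().
 */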
866 int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
867 int (*output)(struct net *, struct sock *, struct sk_buff *))
868 {
869 struct sk_buff *frag;
870 struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
871 struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
872 inet6_sk(skb->sk) : NULL;
873 u8 tstamp_type = skb->tstamp_type;
874 struct ip6_frag_state state;
875 unsigned int mtu, hlen, nexthdr_offset;
876 ktime_t tstamp = skb->tstamp;
877 int hroom, err = 0;
878 __be32 frag_id;
879 u8 *prevhdr, nexthdr = 0;
880
881 err = ip6_find_1stfragopt(skb, &prevhdr);
882 if (err < 0)
883 goto fail;
884 hlen = err;
885 nexthdr = *prevhdr;
886 nexthdr_offset = prevhdr - skb_network_header(skb);
887
888 mtu = ip6_skb_dst_mtu(skb);
889
890 /* We must not fragment if the socket is set to force MTU discovery
891 * or if the skb is not generated by a local socket.
892 */
893 if (unlikely(!skb->ignore_df && skb->len > mtu))
894 goto fail_toobig;
895
896 if (IP6CB(skb)->frag_max_size) {
897 if (IP6CB(skb)->frag_max_size > mtu)
898 goto fail_toobig;
899
900 /* don't send fragments larger than what we received */
901 mtu = IP6CB(skb)->frag_max_size;
902 if (mtu < IPV6_MIN_MTU)
903 mtu = IPV6_MIN_MTU;
904 }
905
906 if (np) {
907 u32 frag_size = READ_ONCE(np->frag_size);
908
909 if (frag_size && frag_size < mtu)
910 mtu = frag_size;
911 }
912 if (mtu < hlen + sizeof(struct frag_hdr) + 8)
913 goto fail_toobig;
914 mtu -= hlen + sizeof(struct frag_hdr);
915
916 frag_id = ipv6_select_ident(net, &ipv6_hdr(skb)->daddr,
917 &ipv6_hdr(skb)->saddr);
918
919 if (skb->ip_summed == CHECKSUM_PARTIAL &&
920 (err = skb_checksum_help(skb)))
921 goto fail;
922
923 prevhdr = skb_network_header(skb) + nexthdr_offset;
924 hroom = LL_RESERVED_SPACE(rt->dst.dev);
925 if (skb_has_frag_list(skb)) {
926 unsigned int first_len = skb_pagelen(skb);
927 struct ip6_fraglist_iter iter;
928 struct sk_buff *frag2;
929
930 if (first_len - hlen > mtu ||
931 ((first_len - hlen) & 7) ||
932 skb_cloned(skb) ||
933 skb_headroom(skb) < (hroom + sizeof(struct frag_hdr)))
934 goto slow_path;
935
936 skb_walk_frags(skb, frag) {
937 /* Correct geometry. */
938 if (frag->len > mtu ||
939 ((frag->len & 7) && frag->next) ||
940 skb_headroom(frag) < (hlen + hroom + sizeof(struct frag_hdr)))
941 goto slow_path_clean;
942
943 /* Partially cloned skb? */
944 if (skb_shared(frag))
945 goto slow_path_clean;
946
947 BUG_ON(frag->sk);
948 if (skb->sk) {
949 frag->sk = skb->sk;
950 frag->destructor = sock_wfree;
951 }
952 skb->truesize -= frag->truesize;
953 }
954
955 err = ip6_fraglist_init(skb, hlen, prevhdr, nexthdr, frag_id,
956 &iter);
957 if (err < 0)
958 goto fail;
959
960 /* We prevent @rt from being freed. */
961 rcu_read_lock();
962
963 for (;;) {
964 /* Prepare the header of the next frame
965 * before the previous one goes out. */
966 if (iter.frag)
967 ip6_fraglist_prepare(skb, &iter);
968
969 skb_set_delivery_time(skb, tstamp, tstamp_type);
970 err = output(net, sk, skb);
971 if (!err)
972 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
973 IPSTATS_MIB_FRAGCREATES);
974
975 if (err || !iter.frag)
976 break;
977
978 skb = ip6_fraglist_next(&iter);
979 }
980
981 kfree(iter.tmp_hdr);
982
983 if (err == 0) {
984 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
985 IPSTATS_MIB_FRAGOKS);
986 rcu_read_unlock();
987 return 0;
988 }
989
990 kfree_skb_list(iter.frag);
991
992 IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
993 IPSTATS_MIB_FRAGFAILS);
994 rcu_read_unlock();
995 return err;
996
997 slow_path_clean:
998 skb_walk_frags(skb, frag2) {
999 if (frag2 == frag)
1000 break;
1001 frag2->sk = NULL;
1002 frag2->destructor = NULL;
1003 skb->truesize += frag2->truesize;
1004 }
1005 }
1006
1007 slow_path:
1008 /*
1009 * Fragment the datagram.
1010 */
1011
1012 ip6_frag_init(skb, hlen, mtu, rt->dst.dev->needed_tailroom,
1013 LL_RESERVED_SPACE(rt->dst.dev), prevhdr, nexthdr, frag_id,
1014 &state);
1015
1016 /*
1017 * Keep copying data until we run out.
1018 */
1019
1020 while (state.left > 0) {
1021 frag = ip6_frag_next(skb, &state);
1022 if (IS_ERR(frag)) {
1023 err = PTR_ERR(frag);
1024 goto fail;
1025 }
1026
1027 /*
1028 * Put this fragment into the sending queue.
1029 */
1030 skb_set_delivery_time(frag, tstamp, tstamp_type);
1031 err = output(net, sk, frag);
1032 if (err)
1033 goto fail;
1034
1035 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1036 IPSTATS_MIB_FRAGCREATES);
1037 }
1038 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1039 IPSTATS_MIB_FRAGOKS);
1040 consume_skb(skb);
1041 return err;
1042
1043 fail_toobig:
1044 icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
1045 err = -EMSGSIZE;
1046
1047 fail:
1048 IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
1049 IPSTATS_MIB_FRAGFAILS);
1050 kfree_skb(skb);
1051 return err;
1052 }
1053
1054 static inline int ip6_rt_check(const struct rt6key *rt_key,
1055 const struct in6_addr *fl_addr,
1056 const struct in6_addr *addr_cache)
1057 {
1058 return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) &&
1059 (!addr_cache || !ipv6_addr_equal(fl_addr, addr_cache));
1060 }
1061
1062 static struct dst_entry *ip6_sk_dst_check(struct sock *sk,
1063 struct dst_entry *dst,
1064 const struct flowi6 *fl6)
1065 {
1066 struct ipv6_pinfo *np = inet6_sk(sk);
1067 struct rt6_info *rt;
1068
1069 if (!dst)
1070 goto out;
1071
1072 if (dst->ops->family != AF_INET6) {
1073 dst_release(dst);
1074 return NULL;
1075 }
1076
1077 rt = dst_rt6_info(dst);
1078 /* Yes, checking route validity in the not-connected
1079 * case is not very simple. Take into account
1080 * that we do not support routing by source, TOS,
1081 * or MSG_DONTROUTE --ANK (980726)
1082 *
1083 * 1. ip6_rt_check(): If the route was a host route,
1084 * check that the cached destination is current.
1085 * If it is a network route, we can still
1086 * check its validity using a saved pointer
1087 * to the last used address: daddr_cache.
1088 * We do not want to save the whole address now,
1089 * (because the main consumer of this service
1090 * is tcp, which does not have this problem),
1091 * so this last trick works only on connected
1092 * sockets.
1093 * 2. oif also should be the same.
1094 */
1095 if (ip6_rt_check(&rt->rt6i_dst, &fl6->daddr, np->daddr_cache) ||
1096 #ifdef CONFIG_IPV6_SUBTREES
1097 ip6_rt_check(&rt->rt6i_src, &fl6->saddr, np->saddr_cache) ||
1098 #endif
1099 (fl6->flowi6_oif && fl6->flowi6_oif != dst_dev(dst)->ifindex)) {
1100 dst_release(dst);
1101 dst = NULL;
1102 }
1103
1104 out:
1105 return dst;
1106 }
1107
1108 static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
1109 struct dst_entry **dst, struct flowi6 *fl6)
1110 {
1111 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1112 struct neighbour *n;
1113 struct rt6_info *rt;
1114 #endif
1115 int err;
1116 int flags = 0;
1117
1118 /* The correct way to handle this would be to do
1119 * ip6_route_get_saddr, and then ip6_route_output; however,
1120 * the route-specific preferred source forces the
1121 * ip6_route_output call _before_ ip6_route_get_saddr.
1122 *
1123 * In source specific routing (no src=any default route),
1124 * ip6_route_output will fail given src=any saddr, though, so
1125 * that's why we try it again later.
1126 */
1127 if (ipv6_addr_any(&fl6->saddr)) {
1128 struct fib6_info *from;
1129 struct rt6_info *rt;
1130
1131 *dst = ip6_route_output(net, sk, fl6);
1132 rt = (*dst)->error ? NULL : dst_rt6_info(*dst);
1133
1134 rcu_read_lock();
1135 from = rt ? rcu_dereference(rt->from) : NULL;
1136 err = ip6_route_get_saddr(net, from, &fl6->daddr,
1137 sk ? READ_ONCE(inet6_sk(sk)->srcprefs) : 0,
1138 fl6->flowi6_l3mdev,
1139 &fl6->saddr);
1140 rcu_read_unlock();
1141
1142 if (err)
1143 goto out_err_release;
1144
1145 /* If we had an erroneous initial result, pretend it
1146 * never existed and let the SA-enabled version take
1147 * over.
1148 */
1149 if ((*dst)->error) {
1150 dst_release(*dst);
1151 *dst = NULL;
1152 }
1153
1154 if (fl6->flowi6_oif)
1155 flags |= RT6_LOOKUP_F_IFACE;
1156 }
1157
1158 if (!*dst)
1159 *dst = ip6_route_output_flags(net, sk, fl6, flags);
1160
1161 err = (*dst)->error;
1162 if (err)
1163 goto out_err_release;
1164
1165 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
1166 /*
1167 * Here if the dst entry we've looked up
1168 * has a neighbour entry that is in the INCOMPLETE
1169 * state and the src address from the flow is
1170 * marked as OPTIMISTIC, we release the found
1171 * dst entry and replace it instead with the
1172 * dst entry of the nexthop router
1173 */
1174 rt = dst_rt6_info(*dst);
1175 rcu_read_lock();
1176 n = __ipv6_neigh_lookup_noref(rt->dst.dev,
1177 rt6_nexthop(rt, &fl6->daddr));
1178 err = n && !(READ_ONCE(n->nud_state) & NUD_VALID) ? -EINVAL : 0;
1179 rcu_read_unlock();
1180
1181 if (err) {
1182 struct inet6_ifaddr *ifp;
1183 struct flowi6 fl_gw6;
1184 int redirect;
1185
1186 ifp = ipv6_get_ifaddr(net, &fl6->saddr,
1187 (*dst)->dev, 1);
1188
1189 redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC);
1190 if (ifp)
1191 in6_ifa_put(ifp);
1192
1193 if (redirect) {
1194 /*
1195 * We need to get the dst entry for the
1196 * default router instead
1197 */
1198 dst_release(*dst);
1199 memcpy(&fl_gw6, fl6, sizeof(struct flowi6));
1200 memset(&fl_gw6.daddr, 0, sizeof(struct in6_addr));
1201 *dst = ip6_route_output(net, sk, &fl_gw6);
1202 err = (*dst)->error;
1203 if (err)
1204 goto out_err_release;
1205 }
1206 }
1207 #endif
1208 if (ipv6_addr_v4mapped(&fl6->saddr) &&
1209 !(ipv6_addr_v4mapped(&fl6->daddr) || ipv6_addr_any(&fl6->daddr))) {
1210 err = -EAFNOSUPPORT;
1211 goto out_err_release;
1212 }
1213
1214 return 0;
1215
1216 out_err_release:
1217 dst_release(*dst);
1218 *dst = NULL;
1219
1220 if (err == -ENETUNREACH)
1221 IP6_INC_STATS(net, NULL, IPSTATS_MIB_OUTNOROUTES);
1222 return err;
1223 }
1224
1225 /**
1226 * ip6_dst_lookup - perform route lookup on flow
1227 * @net: Network namespace to perform lookup in
1228 * @sk: socket which provides route info
1229 * @dst: pointer to dst_entry * for result
1230 * @fl6: flow to lookup
1231 *
1232 * This function performs a route lookup on the given flow.
1233 *
1234 * It returns zero on success, or a standard errno code on error.
1235 */
1236 int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst,
1237 struct flowi6 *fl6)
1238 {
1239 *dst = NULL;
1240 return ip6_dst_lookup_tail(net, sk, dst, fl6);
1241 }
1242 EXPORT_SYMBOL_GPL(ip6_dst_lookup);
1243
1244 /**
1245 * ip6_dst_lookup_flow - perform route lookup on flow with ipsec
1246 * @net: Network namespace to perform lookup in
1247 * @sk: socket which provides route info
1248 * @fl6: flow to lookup
1249 * @final_dst: final destination address for ipsec lookup
1250 *
1251 * This function performs a route lookup on the given flow.
1252 *
1253 * It returns a valid dst pointer on success, or a pointer encoded
1254 * error code.
1255 */
1256 struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6,
1257 const struct in6_addr *final_dst)
1258 {
1259 struct dst_entry *dst = NULL;
1260 int err;
1261
1262 err = ip6_dst_lookup_tail(net, sk, &dst, fl6);
1263 if (err)
1264 return ERR_PTR(err);
1265 if (final_dst)
1266 fl6->daddr = *final_dst;
1267
1268 return xfrm_lookup_route(net, dst, flowi6_to_flowi(fl6), sk, 0);
1269 }
1270 EXPORT_SYMBOL_GPL(ip6_dst_lookup_flow);
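/* A minimal caller sketch (identifiers are illustrative, not from this
 * file): the pointer-encoded error must be checked with IS_ERR() before
 * the returned dst is used:
 *
 *	dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p);
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);
 *	skb_dst_set(skb, dst);
 */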
1271
1272 /**
1273 * ip6_sk_dst_lookup_flow - perform socket cached route lookup on flow
1274 * @sk: socket which provides the dst cache and route info
1275 * @fl6: flow to lookup
1276 * @final_dst: final destination address for ipsec lookup
1277 * @connected: whether @sk is connected or not
1278 *
1279 * This function performs a route lookup on the given flow with the
1280 * possibility of using the cached route in the socket if it is valid.
1281 * It will take the socket dst lock when operating on the dst cache.
1282 * As a result, this function can only be used in process context.
1283 *
1284 * In addition, for a connected socket, cache the dst in the socket
1285 * if the current cache is not valid.
1286 *
1287 * It returns a valid dst pointer on success, or a pointer encoded
1288 * error code.
1289 */
1290 struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6,
1291 const struct in6_addr *final_dst,
1292 bool connected)
1293 {
1294 struct dst_entry *dst = sk_dst_check(sk, inet6_sk(sk)->dst_cookie);
1295
1296 dst = ip6_sk_dst_check(sk, dst, fl6);
1297 if (dst)
1298 return dst;
1299
1300 dst = ip6_dst_lookup_flow(sock_net(sk), sk, fl6, final_dst);
1301 if (connected && !IS_ERR(dst))
1302 ip6_sk_dst_store_flow(sk, dst_clone(dst), fl6);
1303
1304 return dst;
1305 }
1306 EXPORT_SYMBOL_GPL(ip6_sk_dst_lookup_flow);
1307
1308 static inline struct ipv6_opt_hdr *ip6_opt_dup(struct ipv6_opt_hdr *src,
1309 gfp_t gfp)
1310 {
1311 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1312 }
1313
1314 static inline struct ipv6_rt_hdr *ip6_rthdr_dup(struct ipv6_rt_hdr *src,
1315 gfp_t gfp)
1316 {
1317 return src ? kmemdup(src, (src->hdrlen + 1) * 8, gfp) : NULL;
1318 }
1319
1320 static void ip6_append_data_mtu(unsigned int *mtu,
1321 int *maxfraglen,
1322 unsigned int fragheaderlen,
1323 struct sk_buff *skb,
1324 struct rt6_info *rt,
1325 unsigned int orig_mtu)
1326 {
1327 if (!(rt->dst.flags & DST_XFRM_TUNNEL)) {
1328 if (!skb) {
1329 /* first fragment, reserve header_len */
1330 *mtu = orig_mtu - rt->dst.header_len;
1331
1332 } else {
1333 /*
1334 * this fragment is not the first one; the header
1335 * space is regarded as data space.
1336 */
1337 *mtu = orig_mtu;
1338 }
1339 *maxfraglen = ((*mtu - fragheaderlen) & ~7)
1340 + fragheaderlen - sizeof(struct frag_hdr);
1341 }
1342 }
1343
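/* Copy everything the deferred transmit will need into the cork: a
 * duplicated set of the tx options, hop limit, traffic class, dontfrag,
 * the effective path MTU (clamped by the socket's frag_size), and the
 * per-call cookie values (mark, priority, timestamping, transmit time).
 */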
1344 static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork,
1345 struct inet6_cork *v6_cork, struct ipcm6_cookie *ipc6,
1346 struct rt6_info *rt)
1347 {
1348 struct ipv6_pinfo *np = inet6_sk(sk);
1349 unsigned int mtu, frag_size;
1350 struct ipv6_txoptions *nopt, *opt = ipc6->opt;
1351
1352 /* callers pass dst together with a reference, set it first so
1353 * ip6_cork_release() can put it down even in case of an error.
1354 */
1355 cork->base.dst = &rt->dst;
1356
1357 /*
1358 * setup for corking
1359 */
1360 if (opt) {
1361 if (WARN_ON(v6_cork->opt))
1362 return -EINVAL;
1363
1364 nopt = v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
1365 if (unlikely(!nopt))
1366 return -ENOBUFS;
1367
1368 nopt->tot_len = sizeof(*opt);
1369 nopt->opt_flen = opt->opt_flen;
1370 nopt->opt_nflen = opt->opt_nflen;
1371
1372 nopt->dst0opt = ip6_opt_dup(opt->dst0opt, sk->sk_allocation);
1373 if (opt->dst0opt && !nopt->dst0opt)
1374 return -ENOBUFS;
1375
1376 nopt->dst1opt = ip6_opt_dup(opt->dst1opt, sk->sk_allocation);
1377 if (opt->dst1opt && !nopt->dst1opt)
1378 return -ENOBUFS;
1379
1380 nopt->hopopt = ip6_opt_dup(opt->hopopt, sk->sk_allocation);
1381 if (opt->hopopt && !nopt->hopopt)
1382 return -ENOBUFS;
1383
1384 nopt->srcrt = ip6_rthdr_dup(opt->srcrt, sk->sk_allocation);
1385 if (opt->srcrt && !nopt->srcrt)
1386 return -ENOBUFS;
1387
1388 /* need source address above miyazawa*/
1389 }
1390 v6_cork->hop_limit = ipc6->hlimit;
1391 v6_cork->tclass = ipc6->tclass;
1392 v6_cork->dontfrag = ipc6->dontfrag;
1393 if (rt->dst.flags & DST_XFRM_TUNNEL)
1394 mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
1395 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst);
1396 else
1397 mtu = READ_ONCE(np->pmtudisc) >= IPV6_PMTUDISC_PROBE ?
1398 READ_ONCE(rt->dst.dev->mtu) : dst_mtu(xfrm_dst_path(&rt->dst));
1399
1400 frag_size = READ_ONCE(np->frag_size);
1401 if (frag_size && frag_size < mtu)
1402 mtu = frag_size;
1403
1404 cork->base.fragsize = mtu;
1405 cork->base.gso_size = ipc6->gso_size;
1406 cork->base.tx_flags = 0;
1407 cork->base.mark = ipc6->sockc.mark;
1408 cork->base.priority = ipc6->sockc.priority;
1409 sock_tx_timestamp(sk, &ipc6->sockc, &cork->base.tx_flags);
1410 if (ipc6->sockc.tsflags & SOCKCM_FLAG_TS_OPT_ID) {
1411 cork->base.flags |= IPCORK_TS_OPT_ID;
1412 cork->base.ts_opt_id = ipc6->sockc.ts_opt_id;
1413 }
1414 cork->base.length = 0;
1415 cork->base.transmit_time = ipc6->sockc.transmit_time;
1416
1417 return 0;
1418 }
1419
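/* Workhorse behind ip6_append_data() and ip6_make_skb(): append @length
 * bytes obtained through @getfrag to @queue, extending the tail skb or
 * allocating new ones so that every queued buffer fits into a single
 * fragment (or GSO super-packet). Handles zerocopy (MSG_ZEROCOPY),
 * MSG_SPLICE_PAGES and page-frag coalescing.
 */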
1420 static int __ip6_append_data(struct sock *sk,
1421 struct sk_buff_head *queue,
1422 struct inet_cork_full *cork_full,
1423 struct inet6_cork *v6_cork,
1424 struct page_frag *pfrag,
1425 int getfrag(void *from, char *to, int offset,
1426 int len, int odd, struct sk_buff *skb),
1427 void *from, size_t length, int transhdrlen,
1428 unsigned int flags)
1429 {
1430 struct sk_buff *skb, *skb_prev = NULL;
1431 struct inet_cork *cork = &cork_full->base;
1432 struct flowi6 *fl6 = &cork_full->fl.u.ip6;
1433 unsigned int maxfraglen, fragheaderlen, mtu, orig_mtu, pmtu;
1434 struct ubuf_info *uarg = NULL;
1435 int exthdrlen = 0;
1436 int dst_exthdrlen = 0;
1437 int hh_len;
1438 int copy;
1439 int err;
1440 int offset = 0;
1441 bool zc = false;
1442 u32 tskey = 0;
1443 struct rt6_info *rt = dst_rt6_info(cork->dst);
1444 bool paged, hold_tskey = false, extra_uref = false;
1445 struct ipv6_txoptions *opt = v6_cork->opt;
1446 int csummode = CHECKSUM_NONE;
1447 unsigned int maxnonfragsize, headersize;
1448 unsigned int wmem_alloc_delta = 0;
1449
1450 skb = skb_peek_tail(queue);
1451 if (!skb) {
1452 exthdrlen = opt ? opt->opt_flen : 0;
1453 dst_exthdrlen = rt->dst.header_len - rt->rt6i_nfheader_len;
1454 }
1455
1456 paged = !!cork->gso_size;
1457 mtu = cork->gso_size ? IP6_MAX_MTU : cork->fragsize;
1458 orig_mtu = mtu;
1459
1460 hh_len = LL_RESERVED_SPACE(rt->dst.dev);
1461
1462 fragheaderlen = sizeof(struct ipv6hdr) + rt->rt6i_nfheader_len +
1463 (opt ? opt->opt_nflen : 0);
1464
1465 headersize = sizeof(struct ipv6hdr) +
1466 (opt ? opt->opt_flen + opt->opt_nflen : 0) +
1467 rt->rt6i_nfheader_len;
1468
1469 if (mtu <= fragheaderlen ||
1470 ((mtu - fragheaderlen) & ~7) + fragheaderlen <= sizeof(struct frag_hdr))
1471 goto emsgsize;
1472
1473 maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen -
1474 sizeof(struct frag_hdr);
1475
1476 /* as per RFC 7112 section 5, the entire IPv6 Header Chain must fit
1477 * the first fragment
1478 */
1479 if (headersize + transhdrlen > mtu)
1480 goto emsgsize;
1481
1482 if (cork->length + length > mtu - headersize && v6_cork->dontfrag &&
1483 (sk->sk_protocol == IPPROTO_UDP ||
1484 sk->sk_protocol == IPPROTO_ICMPV6 ||
1485 sk->sk_protocol == IPPROTO_RAW)) {
1486 ipv6_local_rxpmtu(sk, fl6, mtu - headersize +
1487 sizeof(struct ipv6hdr));
1488 goto emsgsize;
1489 }
1490
1491 if (ip6_sk_ignore_df(sk))
1492 maxnonfragsize = sizeof(struct ipv6hdr) + IPV6_MAXPLEN;
1493 else
1494 maxnonfragsize = mtu;
1495
1496 if (cork->length + length > maxnonfragsize - headersize) {
1497 emsgsize:
1498 pmtu = max_t(int, mtu - headersize + sizeof(struct ipv6hdr), 0);
1499 ipv6_local_error(sk, EMSGSIZE, fl6, pmtu);
1500 return -EMSGSIZE;
1501 }
1502
1503 /* CHECKSUM_PARTIAL only with no extension headers and when
1504 * we are not going to fragment
1505 */
1506 if (transhdrlen && sk->sk_protocol == IPPROTO_UDP &&
1507 headersize == sizeof(struct ipv6hdr) &&
1508 length <= mtu - headersize &&
1509 (!(flags & MSG_MORE) || cork->gso_size) &&
1510 rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
1511 csummode = CHECKSUM_PARTIAL;
1512
1513 if ((flags & MSG_ZEROCOPY) && length) {
1514 struct msghdr *msg = from;
1515
1516 if (getfrag == ip_generic_getfrag && msg->msg_ubuf) {
1517 if (skb_zcopy(skb) && msg->msg_ubuf != skb_zcopy(skb))
1518 return -EINVAL;
1519
1520 /* Leave uarg NULL if we can't do zerocopy; callers should
1521 * be able to handle it.
1522 */
1523 if ((rt->dst.dev->features & NETIF_F_SG) &&
1524 csummode == CHECKSUM_PARTIAL) {
1525 paged = true;
1526 zc = true;
1527 uarg = msg->msg_ubuf;
1528 }
1529 } else if (sock_flag(sk, SOCK_ZEROCOPY)) {
1530 uarg = msg_zerocopy_realloc(sk, length, skb_zcopy(skb),
1531 false);
1532 if (!uarg)
1533 return -ENOBUFS;
1534 extra_uref = !skb_zcopy(skb); /* only ref on new uarg */
1535 if (rt->dst.dev->features & NETIF_F_SG &&
1536 csummode == CHECKSUM_PARTIAL) {
1537 paged = true;
1538 zc = true;
1539 } else {
1540 uarg_to_msgzc(uarg)->zerocopy = 0;
1541 skb_zcopy_set(skb, uarg, &extra_uref);
1542 }
1543 }
1544 } else if ((flags & MSG_SPLICE_PAGES) && length) {
1545 if (inet_test_bit(HDRINCL, sk))
1546 return -EPERM;
1547 if (rt->dst.dev->features & NETIF_F_SG &&
1548 getfrag == ip_generic_getfrag)
1549 /* We need an empty buffer to attach stuff to */
1550 paged = true;
1551 else
1552 flags &= ~MSG_SPLICE_PAGES;
1553 }
1554
1555 if (cork->tx_flags & SKBTX_ANY_TSTAMP &&
1556 READ_ONCE(sk->sk_tsflags) & SOF_TIMESTAMPING_OPT_ID) {
1557 if (cork->flags & IPCORK_TS_OPT_ID) {
1558 tskey = cork->ts_opt_id;
1559 } else {
1560 tskey = atomic_inc_return(&sk->sk_tskey) - 1;
1561 hold_tskey = true;
1562 }
1563 }
1564
1565 /*
1566 * Let's try using as much space as possible.
1567 * Use MTU if total length of the message fits into the MTU.
1568 * Otherwise, we need to reserve fragment header and
1569 * fragment alignment (= 8-15 octets, in total).
1570 *
1571 * Note that we may need to "move" the data from the tail
1572 * of the buffer to the new fragment when we split
1573 * the message.
1574 *
1575 * FIXME: It may be fragmented into multiple chunks
1576 * at once if non-fragmentable extension headers
1577 * are too large.
1578 * --yoshfuji
1579 */
1580
1581 cork->length += length;
1582 if (!skb)
1583 goto alloc_new_skb;
1584
1585 while (length > 0) {
1586 /* Check if the remaining data fits into current packet. */
1587 copy = (cork->length <= mtu ? mtu : maxfraglen) - skb->len;
1588 if (copy < length)
1589 copy = maxfraglen - skb->len;
1590
1591 if (copy <= 0) {
1592 char *data;
1593 unsigned int datalen;
1594 unsigned int fraglen;
1595 unsigned int fraggap;
1596 unsigned int alloclen, alloc_extra;
1597 unsigned int pagedlen;
1598 alloc_new_skb:
1599 /* There's no room in the current skb */
1600 if (skb)
1601 fraggap = skb->len - maxfraglen;
1602 else
1603 fraggap = 0;
1604 /* update mtu and maxfraglen if necessary */
1605 if (!skb || !skb_prev)
1606 ip6_append_data_mtu(&mtu, &maxfraglen,
1607 fragheaderlen, skb, rt,
1608 orig_mtu);
1609
1610 skb_prev = skb;
1611
1612 /*
1613 * If remaining data exceeds the mtu,
1614 * we know we need more fragment(s).
1615 */
1616 datalen = length + fraggap;
1617
1618 if (datalen > (cork->length <= mtu ? mtu : maxfraglen) - fragheaderlen)
1619 datalen = maxfraglen - fragheaderlen - rt->dst.trailer_len;
1620 fraglen = datalen + fragheaderlen;
1621 pagedlen = 0;
1622
1623 alloc_extra = hh_len;
1624 alloc_extra += dst_exthdrlen;
1625 alloc_extra += rt->dst.trailer_len;
1626
1627 /* We just reserve space for fragment header.
1628 * Note: this may be overallocation if the message
1629 * (without MSG_MORE) fits into the MTU.
1630 */
1631 alloc_extra += sizeof(struct frag_hdr);
1632
1633 if ((flags & MSG_MORE) &&
1634 !(rt->dst.dev->features&NETIF_F_SG))
1635 alloclen = mtu;
1636 else if (!paged &&
1637 (fraglen + alloc_extra < SKB_MAX_ALLOC ||
1638 !(rt->dst.dev->features & NETIF_F_SG)))
1639 alloclen = fraglen;
1640 else {
1641 alloclen = fragheaderlen + transhdrlen;
1642 pagedlen = datalen - transhdrlen;
1643 }
1644 alloclen += alloc_extra;
1645
1646 if (datalen != length + fraggap) {
1647 /*
1648 * this is not the last fragment, the trailer
1649 * space is regarded as data space.
1650 */
1651 datalen += rt->dst.trailer_len;
1652 }
1653
1654 fraglen = datalen + fragheaderlen;
1655
1656 copy = datalen - transhdrlen - fraggap - pagedlen;
1657 /* [!] NOTE: copy may be negative if pagedlen>0
1658 * because then the equation may reduce to -fraggap.
1659 */
1660 if (copy < 0 && !(flags & MSG_SPLICE_PAGES)) {
1661 err = -EINVAL;
1662 goto error;
1663 }
1664 if (transhdrlen) {
1665 skb = sock_alloc_send_skb(sk, alloclen,
1666 (flags & MSG_DONTWAIT), &err);
1667 } else {
1668 skb = NULL;
1669 if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <=
1670 2 * sk->sk_sndbuf)
1671 skb = alloc_skb(alloclen,
1672 sk->sk_allocation);
1673 if (unlikely(!skb))
1674 err = -ENOBUFS;
1675 }
1676 if (!skb)
1677 goto error;
1678 /*
1679 * Fill in the control structures
1680 */
1681 skb->protocol = htons(ETH_P_IPV6);
1682 skb->ip_summed = csummode;
1683 skb->csum = 0;
1684 /* reserve for fragmentation and ipsec header */
1685 skb_reserve(skb, hh_len + sizeof(struct frag_hdr) +
1686 dst_exthdrlen);
1687
1688 /*
1689 * Find where to start putting bytes
1690 */
1691 data = skb_put(skb, fraglen - pagedlen);
1692 skb_set_network_header(skb, exthdrlen);
1693 data += fragheaderlen;
1694 skb->transport_header = (skb->network_header +
1695 fragheaderlen);
1696 if (fraggap) {
1697 skb->csum = skb_copy_and_csum_bits(
1698 skb_prev, maxfraglen,
1699 data + transhdrlen, fraggap);
1700 skb_prev->csum = csum_sub(skb_prev->csum,
1701 skb->csum);
1702 data += fraggap;
1703 pskb_trim_unique(skb_prev, maxfraglen);
1704 }
1705 if (copy > 0 &&
1706 INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
1707 from, data + transhdrlen, offset,
1708 copy, fraggap, skb) < 0) {
1709 err = -EFAULT;
1710 kfree_skb(skb);
1711 goto error;
1712 } else if (flags & MSG_SPLICE_PAGES) {
1713 copy = 0;
1714 }
1715
1716 offset += copy;
1717 length -= copy + transhdrlen;
1718 transhdrlen = 0;
1719 exthdrlen = 0;
1720 dst_exthdrlen = 0;
1721
1722 /* Only the initial fragment is time stamped */
1723 skb_shinfo(skb)->tx_flags = cork->tx_flags;
1724 cork->tx_flags = 0;
1725 skb_shinfo(skb)->tskey = tskey;
1726 tskey = 0;
1727 skb_zcopy_set(skb, uarg, &extra_uref);
1728
1729 if ((flags & MSG_CONFIRM) && !skb_prev)
1730 skb_set_dst_pending_confirm(skb, 1);
1731
1732 /*
1733 * Put the packet on the pending queue
1734 */
1735 if (!skb->destructor) {
1736 skb->destructor = sock_wfree;
1737 skb->sk = sk;
1738 wmem_alloc_delta += skb->truesize;
1739 }
1740 __skb_queue_tail(queue, skb);
1741 continue;
1742 }
1743
1744 if (copy > length)
1745 copy = length;
1746
1747 if (!(rt->dst.dev->features&NETIF_F_SG) &&
1748 skb_tailroom(skb) >= copy) {
1749 unsigned int off;
1750
1751 off = skb->len;
1752 if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
1753 from, skb_put(skb, copy),
1754 offset, copy, off, skb) < 0) {
1755 __skb_trim(skb, off);
1756 err = -EFAULT;
1757 goto error;
1758 }
1759 } else if (flags & MSG_SPLICE_PAGES) {
1760 struct msghdr *msg = from;
1761
1762 err = -EIO;
1763 if (WARN_ON_ONCE(copy > msg->msg_iter.count))
1764 goto error;
1765
1766 err = skb_splice_from_iter(skb, &msg->msg_iter, copy);
1767 if (err < 0)
1768 goto error;
1769 copy = err;
1770 wmem_alloc_delta += copy;
1771 } else if (!zc) {
1772 int i = skb_shinfo(skb)->nr_frags;
1773
1774 err = -ENOMEM;
1775 if (!sk_page_frag_refill(sk, pfrag))
1776 goto error;
1777
1778 skb_zcopy_downgrade_managed(skb);
1779 if (!skb_can_coalesce(skb, i, pfrag->page,
1780 pfrag->offset)) {
1781 err = -EMSGSIZE;
1782 if (i == MAX_SKB_FRAGS)
1783 goto error;
1784
1785 __skb_fill_page_desc(skb, i, pfrag->page,
1786 pfrag->offset, 0);
1787 skb_shinfo(skb)->nr_frags = ++i;
1788 get_page(pfrag->page);
1789 }
1790 copy = min_t(int, copy, pfrag->size - pfrag->offset);
1791 if (INDIRECT_CALL_1(getfrag, ip_generic_getfrag,
1792 from,
1793 page_address(pfrag->page) + pfrag->offset,
1794 offset, copy, skb->len, skb) < 0)
1795 goto error_efault;
1796
1797 pfrag->offset += copy;
1798 skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
1799 skb->len += copy;
1800 skb->data_len += copy;
1801 skb->truesize += copy;
1802 wmem_alloc_delta += copy;
1803 } else {
1804 err = skb_zerocopy_iter_dgram(skb, from, copy);
1805 if (err < 0)
1806 goto error;
1807 }
1808 offset += copy;
1809 length -= copy;
1810 }
1811
1812 if (wmem_alloc_delta)
1813 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1814 return 0;
1815
1816 error_efault:
1817 err = -EFAULT;
1818 error:
1819 net_zcopy_put_abort(uarg, extra_uref);
1820 cork->length -= length;
1821 IP6_INC_STATS(sock_net(sk), rt->rt6i_idev, IPSTATS_MIB_OUTDISCARDS);
1822 refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
1823 if (hold_tskey)
1824 atomic_dec(&sk->sk_tskey);
1825 return err;
1826 }
1827
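/* Corking API used by UDP, raw and ICMPv6 sockets: the first call on an
 * empty write queue sets up the cork from @ipc6 and @rt, subsequent calls
 * only append more data. The queued data becomes packets via
 * ip6_push_pending_frames() or is dropped by ip6_flush_pending_frames().
 */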
1828 int ip6_append_data(struct sock *sk,
1829 int getfrag(void *from, char *to, int offset, int len,
1830 int odd, struct sk_buff *skb),
1831 void *from, size_t length, int transhdrlen,
1832 struct ipcm6_cookie *ipc6, struct flowi6 *fl6,
1833 struct rt6_info *rt, unsigned int flags)
1834 {
1835 struct inet_sock *inet = inet_sk(sk);
1836 struct ipv6_pinfo *np = inet6_sk(sk);
1837 int exthdrlen;
1838 int err;
1839
1840 if (flags&MSG_PROBE)
1841 return 0;
1842 if (skb_queue_empty(&sk->sk_write_queue)) {
1843 /*
1844 * setup for corking
1845 */
1846 dst_hold(&rt->dst);
1847 err = ip6_setup_cork(sk, &inet->cork, &np->cork,
1848 ipc6, rt);
1849 if (err)
1850 return err;
1851
1852 inet->cork.fl.u.ip6 = *fl6;
1853 exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
1854 length += exthdrlen;
1855 transhdrlen += exthdrlen;
1856 } else {
1857 transhdrlen = 0;
1858 }
1859
1860 return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork,
1861 &np->cork, sk_page_frag(sk), getfrag,
1862 from, length, transhdrlen, flags);
1863 }
1864 EXPORT_SYMBOL_GPL(ip6_append_data);
1865
1866 static void ip6_cork_steal_dst(struct sk_buff *skb, struct inet_cork_full *cork)
1867 {
1868 struct dst_entry *dst = cork->base.dst;
1869
1870 cork->base.dst = NULL;
1871 skb_dst_set(skb, dst);
1872 }
1873
1874 static void ip6_cork_release(struct inet_cork_full *cork,
1875 struct inet6_cork *v6_cork)
1876 {
1877 if (v6_cork->opt) {
1878 struct ipv6_txoptions *opt = v6_cork->opt;
1879
1880 kfree(opt->dst0opt);
1881 kfree(opt->dst1opt);
1882 kfree(opt->hopopt);
1883 kfree(opt->srcrt);
1884 kfree(opt);
1885 v6_cork->opt = NULL;
1886 }
1887
1888 if (cork->base.dst) {
1889 dst_release(cork->base.dst);
1890 cork->base.dst = NULL;
1891 }
1892 }
1893
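/* Collapse the queued skbs into a single packet: chain the tail skbs onto
 * the head's frag_list, push the extension headers and the IPv6 header,
 * fill in flow label, hop limit and addresses from the cork, and steal
 * the cork's dst reference for the resulting skb.
 */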
1894 struct sk_buff *__ip6_make_skb(struct sock *sk,
1895 struct sk_buff_head *queue,
1896 struct inet_cork_full *cork,
1897 struct inet6_cork *v6_cork)
1898 {
1899 struct sk_buff *skb, *tmp_skb;
1900 struct sk_buff **tail_skb;
1901 struct in6_addr *final_dst;
1902 struct net *net = sock_net(sk);
1903 struct ipv6hdr *hdr;
1904 struct ipv6_txoptions *opt = v6_cork->opt;
1905 struct rt6_info *rt = dst_rt6_info(cork->base.dst);
1906 struct flowi6 *fl6 = &cork->fl.u.ip6;
1907 unsigned char proto = fl6->flowi6_proto;
1908
1909 skb = __skb_dequeue(queue);
1910 if (!skb)
1911 goto out;
1912 tail_skb = &(skb_shinfo(skb)->frag_list);
1913
1914 /* move skb->data to ip header from ext header */
1915 if (skb->data < skb_network_header(skb))
1916 __skb_pull(skb, skb_network_offset(skb));
1917 while ((tmp_skb = __skb_dequeue(queue)) != NULL) {
1918 __skb_pull(tmp_skb, skb_network_header_len(skb));
1919 *tail_skb = tmp_skb;
1920 tail_skb = &(tmp_skb->next);
1921 skb->len += tmp_skb->len;
1922 skb->data_len += tmp_skb->len;
1923 skb->truesize += tmp_skb->truesize;
1924 tmp_skb->destructor = NULL;
1925 tmp_skb->sk = NULL;
1926 }
1927
1928 /* Allow local fragmentation. */
1929 skb->ignore_df = ip6_sk_ignore_df(sk);
1930 __skb_pull(skb, skb_network_header_len(skb));
1931
1932 final_dst = &fl6->daddr;
1933 if (opt && opt->opt_flen)
1934 ipv6_push_frag_opts(skb, opt, &proto);
1935 if (opt && opt->opt_nflen)
1936 ipv6_push_nfrag_opts(skb, opt, &proto, &final_dst, &fl6->saddr);
1937
1938 skb_push(skb, sizeof(struct ipv6hdr));
1939 skb_reset_network_header(skb);
1940 hdr = ipv6_hdr(skb);
1941
1942 ip6_flow_hdr(hdr, v6_cork->tclass,
1943 ip6_make_flowlabel(net, skb, fl6->flowlabel,
1944 ip6_autoflowlabel(net, sk), fl6));
1945 hdr->hop_limit = v6_cork->hop_limit;
1946 hdr->nexthdr = proto;
1947 hdr->saddr = fl6->saddr;
1948 hdr->daddr = *final_dst;
1949
1950 skb->priority = cork->base.priority;
1951 skb->mark = cork->base.mark;
1952 if (sk_is_tcp(sk))
1953 skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC);
1954 else
1955 skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid);
1956
1957 ip6_cork_steal_dst(skb, cork);
1958 IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS);
1959 if (proto == IPPROTO_ICMPV6) {
1960 struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
1961 u8 icmp6_type;
1962
1963 if (sk->sk_socket->type == SOCK_RAW &&
1964 !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH))
1965 icmp6_type = fl6->fl6_icmp_type;
1966 else
1967 icmp6_type = icmp6_hdr(skb)->icmp6_type;
1968 ICMP6MSGOUT_INC_STATS(net, idev, icmp6_type);
1969 ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
1970 }
1971
1972 ip6_cork_release(cork, v6_cork);
1973 out:
1974 return skb;
1975 }
1976
1977 int ip6_send_skb(struct sk_buff *skb)
1978 {
1979 struct net *net = sock_net(skb->sk);
1980 struct rt6_info *rt = dst_rt6_info(skb_dst(skb));
1981 int err;
1982
1983 rcu_read_lock();
1984 err = ip6_local_out(net, skb->sk, skb);
1985 if (err) {
1986 if (err > 0)
1987 err = net_xmit_errno(err);
1988 if (err)
1989 IP6_INC_STATS(net, rt->rt6i_idev,
1990 IPSTATS_MIB_OUTDISCARDS);
1991 }
1992
1993 rcu_read_unlock();
1994 return err;
1995 }
1996
1997 int ip6_push_pending_frames(struct sock *sk)
1998 {
1999 struct sk_buff *skb;
2000
2001 skb = ip6_finish_skb(sk);
2002 if (!skb)
2003 return 0;
2004
2005 return ip6_send_skb(skb);
2006 }
2007 EXPORT_SYMBOL_GPL(ip6_push_pending_frames);
2008
2009 static void __ip6_flush_pending_frames(struct sock *sk,
2010 struct sk_buff_head *queue,
2011 struct inet_cork_full *cork,
2012 struct inet6_cork *v6_cork)
2013 {
2014 struct sk_buff *skb;
2015
2016 while ((skb = __skb_dequeue_tail(queue)) != NULL) {
2017 if (skb_dst(skb))
2018 IP6_INC_STATS(sock_net(sk), ip6_dst_idev(skb_dst(skb)),
2019 IPSTATS_MIB_OUTDISCARDS);
2020 kfree_skb(skb);
2021 }
2022
2023 ip6_cork_release(cork, v6_cork);
2024 }
2025
2026 void ip6_flush_pending_frames(struct sock *sk)
2027 {
2028 __ip6_flush_pending_frames(sk, &sk->sk_write_queue,
2029 &inet_sk(sk)->cork, &inet6_sk(sk)->cork);
2030 }
2031 EXPORT_SYMBOL_GPL(ip6_flush_pending_frames);
2032
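/* Non-corked variant of the above: build the whole packet in one call on
 * a private queue (used e.g. by UDP when no cork is in progress) and
 * return the finished skb, NULL for MSG_PROBE, or an ERR_PTR().
 */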
2033 struct sk_buff *ip6_make_skb(struct sock *sk,
2034 int getfrag(void *from, char *to, int offset,
2035 int len, int odd, struct sk_buff *skb),
2036 void *from, size_t length, int transhdrlen,
2037 struct ipcm6_cookie *ipc6, struct rt6_info *rt,
2038 unsigned int flags, struct inet_cork_full *cork)
2039 {
2040 struct inet6_cork v6_cork;
2041 struct sk_buff_head queue;
2042 int exthdrlen = (ipc6->opt ? ipc6->opt->opt_flen : 0);
2043 int err;
2044
2045 if (flags & MSG_PROBE) {
2046 dst_release(&rt->dst);
2047 return NULL;
2048 }
2049
2050 __skb_queue_head_init(&queue);
2051
2052 cork->base.flags = 0;
2053 cork->base.addr = 0;
2054 cork->base.opt = NULL;
2055 v6_cork.opt = NULL;
2056 err = ip6_setup_cork(sk, cork, &v6_cork, ipc6, rt);
2057 if (err) {
2058 ip6_cork_release(cork, &v6_cork);
2059 return ERR_PTR(err);
2060 }
2061
2062 err = __ip6_append_data(sk, &queue, cork, &v6_cork,
2063 &current->task_frag, getfrag, from,
2064 length + exthdrlen, transhdrlen + exthdrlen,
2065 flags);
2066 if (err) {
2067 __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork);
2068 return ERR_PTR(err);
2069 }
2070
2071 return __ip6_make_skb(sk, &queue, cork, &v6_cork);
2072 }
2073