xref: /linux/net/ipv4/ip_input.c (revision 05e352444b2430de4b183b4a988085381e5fd6ad)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET		An implementation of the TCP/IP protocol suite for the LINUX
4  *		operating system.  INET is implemented using the  BSD Socket
5  *		interface as the means of communication with the user level.
6  *
7  *		The Internet Protocol (IP) module.
8  *
9  * Authors:	Ross Biro
10  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11  *		Donald Becker, <becker@super.org>
12  *		Alan Cox, <alan@lxorguk.ukuu.org.uk>
13  *		Richard Underwood
14  *		Stefan Becker, <stefanb@yello.ping.de>
15  *		Jorge Cwik, <jorge@laser.satlink.net>
16  *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
17  *
18  * Fixes:
19  *		Alan Cox	:	Commented a couple of minor bits of surplus code
20  *		Alan Cox	:	Undefining IP_FORWARD doesn't include the code
21  *					(just stops a compiler warning).
22  *		Alan Cox	:	Frames with >=MAX_ROUTE record routes, strict routes or loose routes
23  *					are junked rather than corrupting things.
24  *		Alan Cox	:	Frames to bad broadcast subnets are dumped
25  *					We used to process them non broadcast and
26  *					boy could that cause havoc.
27  *		Alan Cox	:	ip_forward sets the free flag on the
28  *					new frame it queues. Still crap because
29  *					it copies the frame but at least it
30  *					doesn't eat memory too.
31  *		Alan Cox	:	Generic queue code and memory fixes.
32  *		Fred Van Kempen :	IP fragment support (borrowed from NET2E)
33  *		Gerhard Koerting:	Forward fragmented frames correctly.
34  *		Gerhard Koerting: 	Fixes to my fix of the above 8-).
35  *		Gerhard Koerting:	IP interface addressing fix.
36  *		Linus Torvalds	:	More robustness checks
37  *		Alan Cox	:	Even more checks: Still not as robust as it ought to be
38  *		Alan Cox	:	Save IP header pointer for later
39  *		Alan Cox	:	ip option setting
40  *		Alan Cox	:	Use ip_tos/ip_ttl settings
41  *		Alan Cox	:	Fragmentation bogosity removed
42  *					(Thanks to Mark.Bush@prg.ox.ac.uk)
43  *		Dmitry Gorodchanin :	Send of a raw packet crash fix.
44  *		Alan Cox	:	Silly ip bug when an overlength
45  *					fragment turns up. Now frees the
46  *					queue.
47  *		Linus Torvalds/ :	Memory leakage on fragmentation
48  *		Alan Cox	:	handling.
49  *		Gerhard Koerting:	Forwarding uses IP priority hints
50  *		Teemu Rantanen	:	Fragment problems.
51  *		Alan Cox	:	General cleanup, comments and reformat
52  *		Alan Cox	:	SNMP statistics
53  *		Alan Cox	:	BSD address rule semantics. Also see
54  *					UDP as there is a nasty checksum issue
55  *					if you do things the wrong way.
56  *		Alan Cox	:	Always defrag, moved IP_FORWARD to the config.in file
57  *		Alan Cox	: 	IP options adjust sk->priority.
58  *		Pedro Roque	:	Fix mtu/length error in ip_forward.
59  *		Alan Cox	:	Avoid ip_chk_addr when possible.
60  *	Richard Underwood	:	IP multicasting.
61  *		Alan Cox	:	Cleaned up multicast handlers.
62  *		Alan Cox	:	RAW sockets demultiplex in the BSD style.
63  *		Gunther Mayer	:	Fix the SNMP reporting typo
64  *		Alan Cox	:	Always in group 224.0.0.1
65  *	Pauline Middelink	:	Fast ip_checksum update when forwarding
66  *					Masquerading support.
67  *		Alan Cox	:	Multicast loopback error for 224.0.0.1
68  *		Alan Cox	:	IP_MULTICAST_LOOP option.
69  *		Alan Cox	:	Use notifiers.
70  *		Bjorn Ekwall	:	Removed ip_csum (from slhc.c too)
71  *		Bjorn Ekwall	:	Moved ip_fast_csum to ip.h (inline!)
72  *		Stefan Becker   :       Send out ICMP HOST REDIRECT
73  *	Arnt Gulbrandsen	:	ip_build_xmit
74  *		Alan Cox	:	Per socket routing cache
75  *		Alan Cox	:	Fixed routing cache, added header cache.
76  *		Alan Cox	:	Loopback didn't work right in original ip_build_xmit - fixed it.
77  *		Alan Cox	:	Only send ICMP_REDIRECT if src/dest are the same net.
78  *		Alan Cox	:	Incoming IP option handling.
79  *		Alan Cox	:	Set saddr on raw output frames as per BSD.
80  *		Alan Cox	:	Stopped broadcast source route explosions.
81  *		Alan Cox	:	Can disable source routing
82  *		Takeshi Sone    :	Masquerading didn't work.
83  *	Dave Bonn,Alan Cox	:	Faster IP forwarding whenever possible.
84  *		Alan Cox	:	Memory leaks, tramples, misc debugging.
85  *		Alan Cox	:	Fixed multicast (by popular demand 8))
86  *		Alan Cox	:	Fixed forwarding (by even more popular demand 8))
87  *		Alan Cox	:	Fixed SNMP statistics [I think]
88  *	Gerhard Koerting	:	IP fragmentation forwarding fix
89  *		Alan Cox	:	Device lock against page fault.
90  *		Alan Cox	:	IP_HDRINCL facility.
91  *	Werner Almesberger	:	Zero fragment bug
92  *		Alan Cox	:	RAW IP frame length bug
93  *		Alan Cox	:	Outgoing firewall on build_xmit
94  *		A.N.Kuznetsov	:	IP_OPTIONS support throughout the kernel
95  *		Alan Cox	:	Multicast routing hooks
96  *		Jos Vos		:	Do accounting *before* call_in_firewall
97  *	Willy Konynenberg	:	Transparent proxying support
98  *
99  * To Fix:
100  *		IP fragmentation wants rewriting cleanly. The RFC815 algorithm is much more efficient
101  *		and could be made very efficient with the addition of some virtual memory hacks to permit
102  *		the allocation of a buffer that can then be 'grown' by twiddling page tables.
103  *		Output fragmentation wants updating along with the buffer management to use a single
104  *		interleaved copy algorithm so that fragmenting has a one copy overhead. Actual packet
105  *		output should probably do its own fragmentation at the UDP/RAW layer. TCP shouldn't cause
106  *		fragmentation anyway.
107  */
108 
109 #define pr_fmt(fmt) "IPv4: " fmt
110 
111 #include <linux/module.h>
112 #include <linux/types.h>
113 #include <linux/kernel.h>
114 #include <linux/string.h>
115 #include <linux/errno.h>
116 #include <linux/slab.h>
117 
118 #include <linux/net.h>
119 #include <linux/socket.h>
120 #include <linux/sockios.h>
121 #include <linux/in.h>
122 #include <linux/inet.h>
123 #include <linux/inetdevice.h>
124 #include <linux/netdevice.h>
125 #include <linux/etherdevice.h>
126 #include <linux/indirect_call_wrapper.h>
127 
128 #include <net/snmp.h>
129 #include <net/ip.h>
130 #include <net/protocol.h>
131 #include <net/route.h>
132 #include <linux/skbuff.h>
133 #include <net/sock.h>
134 #include <net/arp.h>
135 #include <net/icmp.h>
136 #include <net/raw.h>
137 #include <net/checksum.h>
138 #include <net/inet_ecn.h>
139 #include <linux/netfilter_ipv4.h>
140 #include <net/xfrm.h>
141 #include <linux/mroute.h>
142 #include <linux/netlink.h>
143 #include <net/dst_metadata.h>
144 #include <net/udp.h>
145 #include <net/tcp.h>
146 
147 /*
148  *	Process Router Attention IP option (RFC 2113)
149  */
150 bool ip_call_ra_chain(struct sk_buff *skb)
151 {
152 	struct ip_ra_chain *ra;
153 	u8 protocol = ip_hdr(skb)->protocol;
154 	struct sock *last = NULL;
155 	struct net_device *dev = skb->dev;
156 	struct net *net = dev_net(dev);
157 
158 	for (ra = rcu_dereference(net->ipv4.ra_chain); ra; ra = rcu_dereference(ra->next)) {
159 		struct sock *sk = ra->sk;
160 
161 		/* If socket is bound to an interface, only report
162 		 * the packet if it came  from that interface.
163 		 */
164 		if (sk && inet_sk(sk)->inet_num == protocol &&
165 		    (!sk->sk_bound_dev_if ||
166 		     sk->sk_bound_dev_if == dev->ifindex)) {
167 			if (ip_is_fragment(ip_hdr(skb))) {
168 				if (ip_defrag(net, skb, IP_DEFRAG_CALL_RA_CHAIN))
169 					return true;
170 			}
171 			if (last) {
172 				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
173 				if (skb2)
174 					raw_rcv(last, skb2);
175 			}
176 			last = sk;
177 		}
178 	}
179 
180 	if (last) {
181 		raw_rcv(last, skb);
182 		return true;
183 	}
184 	return false;
185 }
186 
187 INDIRECT_CALLABLE_DECLARE(int udp_rcv(struct sk_buff *));
188 INDIRECT_CALLABLE_DECLARE(int tcp_v4_rcv(struct sk_buff *));
189 void ip_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int protocol)
190 {
191 	const struct net_protocol *ipprot;
192 	int raw, ret;
193 
194 resubmit:
195 	raw = raw_local_deliver(skb, protocol);
196 
197 	ipprot = rcu_dereference(inet_protos[protocol]);
198 	if (ipprot) {
199 		if (!ipprot->no_policy) {
200 			if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
201 				kfree_skb_reason(skb,
202 						 SKB_DROP_REASON_XFRM_POLICY);
203 				return;
204 			}
205 			nf_reset_ct(skb);
206 		}
207 		ret = INDIRECT_CALL_2(ipprot->handler, tcp_v4_rcv, udp_rcv,
208 				      skb);
209 		if (ret < 0) {
210 			protocol = -ret;
211 			goto resubmit;
212 		}
213 		__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
214 	} else {
215 		if (!raw) {
216 			if (xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
217 				__IP_INC_STATS(net, IPSTATS_MIB_INUNKNOWNPROTOS);
218 				icmp_send(skb, ICMP_DEST_UNREACH,
219 					  ICMP_PROT_UNREACH, 0);
220 			}
221 			kfree_skb_reason(skb, SKB_DROP_REASON_IP_NOPROTO);
222 		} else {
223 			__IP_INC_STATS(net, IPSTATS_MIB_INDELIVERS);
224 			consume_skb(skb);
225 		}
226 	}
227 }
228 
229 static int ip_local_deliver_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
230 {
231 	if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC))) {
232 		__IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS);
233 		kfree_skb_reason(skb, SKB_DROP_REASON_NOMEM);
234 		return 0;
235 	}
236 
237 	skb_clear_delivery_time(skb);
238 	__skb_pull(skb, skb_network_header_len(skb));
239 
240 	rcu_read_lock();
241 	ip_protocol_deliver_rcu(net, skb, ip_hdr(skb)->protocol);
242 	rcu_read_unlock();
243 
244 	return 0;
245 }
246 
247 /*
248  * 	Deliver IP Packets to the higher protocol layers.
249  */
250 int ip_local_deliver(struct sk_buff *skb)
251 {
252 	/*
253 	 *	Reassemble IP fragments.
254 	 */
255 	struct net *net = dev_net(skb->dev);
256 
257 	if (ip_is_fragment(ip_hdr(skb))) {
258 		if (ip_defrag(net, skb, IP_DEFRAG_LOCAL_DELIVER))
259 			return 0;
260 	}
261 
262 	return NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_IN,
263 		       net, NULL, skb, skb->dev, NULL,
264 		       ip_local_deliver_finish);
265 }
266 EXPORT_SYMBOL(ip_local_deliver);
267 
268 static inline enum skb_drop_reason
269 ip_rcv_options(struct sk_buff *skb, struct net_device *dev)
270 {
271 	const struct iphdr *iph;
272 	struct ip_options *opt;
273 
274 	/* It looks as overkill, because not all
275 	   IP options require packet mangling.
276 	   But it is the easiest for now, especially taking
277 	   into account that combination of IP options
278 	   and running sniffer is extremely rare condition.
279 					      --ANK (980813)
280 	*/
281 	if (skb_cow(skb, skb_headroom(skb))) {
282 		__IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INDISCARDS);
283 		return SKB_DROP_REASON_NOMEM;
284 	}
285 
286 	iph = ip_hdr(skb);
287 	opt = &(IPCB(skb)->opt);
288 	opt->optlen = iph->ihl*4 - sizeof(struct iphdr);
289 
290 	if (ip_options_compile(dev_net(dev), opt, skb)) {
291 		__IP_INC_STATS(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
292 		return SKB_DROP_REASON_IP_INHDR;
293 	}
294 
295 	if (unlikely(opt->srr)) {
296 		struct in_device *in_dev = __in_dev_get_rcu(dev);
297 
298 		if (in_dev) {
299 			if (!IN_DEV_SOURCE_ROUTE(in_dev)) {
300 				if (IN_DEV_LOG_MARTIANS(in_dev))
301 					net_info_ratelimited("source route option %pI4 -> %pI4\n",
302 							     &iph->saddr,
303 							     &iph->daddr);
304 				return SKB_DROP_REASON_NOT_SPECIFIED;
305 			}
306 		}
307 
308 		if (ip_options_rcv_srr(skb, dev))
309 			return SKB_DROP_REASON_NOT_SPECIFIED;
310 	}
311 
312 	return SKB_NOT_DROPPED_YET;
313 }
314 
315 static bool ip_can_use_hint(const struct sk_buff *skb, const struct iphdr *iph,
316 			    const struct sk_buff *hint)
317 {
318 	return hint && !skb_dst(skb) && ip_hdr(hint)->daddr == iph->daddr &&
319 	       ip_hdr(hint)->tos == iph->tos;
320 }
321 
322 static int tcp_v4_early_demux(struct sk_buff *skb)
323 {
324 	struct net *net = dev_net_rcu(skb->dev);
325 	const struct iphdr *iph;
326 	const struct tcphdr *th;
327 	struct sock *sk;
328 
329 	if (skb->pkt_type != PACKET_HOST)
330 		return 0;
331 
332 	if (!pskb_may_pull(skb, skb_transport_offset(skb) +
333 				sizeof(struct tcphdr)))
334 		return 0;
335 
336 	iph = ip_hdr(skb);
337 	th = tcp_hdr(skb);
338 
339 	if (th->doff < sizeof(struct tcphdr) / 4)
340 		return 0;
341 
342 	sk = __inet_lookup_established(net, iph->saddr, th->source,
343 				       iph->daddr, ntohs(th->dest),
344 				       skb->skb_iif, inet_sdif(skb));
345 	if (sk) {
346 		skb->sk = sk;
347 		skb->destructor = sock_edemux;
348 		if (sk_fullsock(sk)) {
349 			struct dst_entry *dst = rcu_dereference(sk->sk_rx_dst);
350 
351 			if (dst)
352 				dst = dst_check(dst, 0);
353 			if (dst &&
354 			    sk->sk_rx_dst_ifindex == skb->skb_iif)
355 				skb_dst_set_noref(skb, dst);
356 		}
357 	}
358 	return 0;
359 }
360 
361 static int ip_rcv_finish_core(struct net *net,
362 			      struct sk_buff *skb, struct net_device *dev,
363 			      const struct sk_buff *hint)
364 {
365 	const struct iphdr *iph = ip_hdr(skb);
366 	struct rtable *rt;
367 	int drop_reason;
368 
369 	if (ip_can_use_hint(skb, iph, hint)) {
370 		drop_reason = ip_route_use_hint(skb, iph->daddr, iph->saddr,
371 						ip4h_dscp(iph), dev, hint);
372 		if (unlikely(drop_reason))
373 			goto drop_error;
374 	}
375 
376 	if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) &&
377 	    !skb_dst(skb) &&
378 	    !skb->sk &&
379 	    !ip_is_fragment(iph)) {
380 		switch (iph->protocol) {
381 		case IPPROTO_TCP:
382 			if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux)) {
383 				tcp_v4_early_demux(skb);
384 
385 				/* must reload iph, skb->head might have changed */
386 				iph = ip_hdr(skb);
387 			}
388 			break;
389 		case IPPROTO_UDP:
390 			if (READ_ONCE(net->ipv4.sysctl_udp_early_demux)) {
391 				drop_reason = udp_v4_early_demux(skb);
392 				if (unlikely(drop_reason))
393 					goto drop_error;
394 
395 				/* must reload iph, skb->head might have changed */
396 				iph = ip_hdr(skb);
397 			}
398 			break;
399 		}
400 	}
401 
402 	/*
403 	 *	Initialise the virtual path cache for the packet. It describes
404 	 *	how the packet travels inside Linux networking.
405 	 */
406 	if (!skb_valid_dst(skb)) {
407 		drop_reason = ip_route_input_noref(skb, iph->daddr, iph->saddr,
408 						   ip4h_dscp(iph), dev);
409 		if (unlikely(drop_reason))
410 			goto drop_error;
411 	} else {
412 		struct in_device *in_dev = __in_dev_get_rcu(dev);
413 
414 		if (in_dev && IN_DEV_ORCONF(in_dev, NOPOLICY))
415 			IPCB(skb)->flags |= IPSKB_NOPOLICY;
416 	}
417 
418 #ifdef CONFIG_IP_ROUTE_CLASSID
419 	if (unlikely(skb_dst(skb)->tclassid)) {
420 		struct ip_rt_acct *st = this_cpu_ptr(ip_rt_acct);
421 		u32 idx = skb_dst(skb)->tclassid;
422 		st[idx&0xFF].o_packets++;
423 		st[idx&0xFF].o_bytes += skb->len;
424 		st[(idx>>16)&0xFF].i_packets++;
425 		st[(idx>>16)&0xFF].i_bytes += skb->len;
426 	}
427 #endif
428 
429 	if (iph->ihl > 5) {
430 		drop_reason = ip_rcv_options(skb, dev);
431 		if (drop_reason)
432 			goto drop;
433 	}
434 
435 	rt = skb_rtable(skb);
436 	if (rt->rt_type == RTN_MULTICAST) {
437 		__IP_UPD_PO_STATS(net, IPSTATS_MIB_INMCAST, skb->len);
438 	} else if (rt->rt_type == RTN_BROADCAST) {
439 		__IP_UPD_PO_STATS(net, IPSTATS_MIB_INBCAST, skb->len);
440 	} else if (skb->pkt_type == PACKET_BROADCAST ||
441 		   skb->pkt_type == PACKET_MULTICAST) {
442 		struct in_device *in_dev = __in_dev_get_rcu(dev);
443 
444 		/* RFC 1122 3.3.6:
445 		 *
446 		 *   When a host sends a datagram to a link-layer broadcast
447 		 *   address, the IP destination address MUST be a legal IP
448 		 *   broadcast or IP multicast address.
449 		 *
450 		 *   A host SHOULD silently discard a datagram that is received
451 		 *   via a link-layer broadcast (see Section 2.4) but does not
452 		 *   specify an IP multicast or broadcast destination address.
453 		 *
454 		 * This doesn't explicitly say L2 *broadcast*, but broadcast is
455 		 * in a way a form of multicast and the most common use case for
456 		 * this is 802.11 protecting against cross-station spoofing (the
457 		 * so-called "hole-196" attack) so do it for both.
458 		 */
459 		if (in_dev &&
460 		    IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST)) {
461 			drop_reason = SKB_DROP_REASON_UNICAST_IN_L2_MULTICAST;
462 			goto drop;
463 		}
464 	}
465 
466 	return NET_RX_SUCCESS;
467 
468 drop:
469 	kfree_skb_reason(skb, drop_reason);
470 	return NET_RX_DROP;
471 
472 drop_error:
473 	if (drop_reason == SKB_DROP_REASON_IP_RPFILTER)
474 		__NET_INC_STATS(net, LINUX_MIB_IPRPFILTER);
475 	goto drop;
476 }
477 
478 static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
479 {
480 	struct net_device *dev = skb->dev;
481 	int ret;
482 
483 	/* if ingress device is enslaved to an L3 master device pass the
484 	 * skb to its handler for processing
485 	 */
486 	skb = l3mdev_ip_rcv(skb);
487 	if (!skb)
488 		return NET_RX_SUCCESS;
489 
490 	ret = ip_rcv_finish_core(net, skb, dev, NULL);
491 	if (ret != NET_RX_DROP)
492 		ret = dst_input(skb);
493 	return ret;
494 }
495 
496 /*
497  * 	Main IP Receive routine.
498  */
499 static struct sk_buff *ip_rcv_core(struct sk_buff *skb, struct net *net)
500 {
501 	const struct iphdr *iph;
502 	int drop_reason;
503 	u32 len;
504 
505 	/* When the interface is in promisc. mode, drop all the crap
506 	 * that it receives, do not try to analyse it.
507 	 */
508 	if (skb->pkt_type == PACKET_OTHERHOST) {
509 		dev_core_stats_rx_otherhost_dropped_inc(skb->dev);
510 		drop_reason = SKB_DROP_REASON_OTHERHOST;
511 		goto drop;
512 	}
513 
514 	__IP_UPD_PO_STATS(net, IPSTATS_MIB_IN, skb->len);
515 
516 	skb = skb_share_check(skb, GFP_ATOMIC);
517 	if (!skb) {
518 		__IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS);
519 		goto out;
520 	}
521 
522 	drop_reason = SKB_DROP_REASON_NOT_SPECIFIED;
523 	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
524 		goto inhdr_error;
525 
526 	iph = ip_hdr(skb);
527 
528 	/*
529 	 *	RFC1122: 3.2.1.2 MUST silently discard any IP frame that fails the checksum.
530 	 *
531 	 *	Is the datagram acceptable?
532 	 *
533 	 *	1.	Length at least the size of an ip header
534 	 *	2.	Version of 4
535 	 *	3.	Checksums correctly. [Speed optimisation for later, skip loopback checksums]
536 	 *	4.	Doesn't have a bogus length
537 	 */
538 
539 	if (iph->ihl < 5 || iph->version != 4)
540 		goto inhdr_error;
541 
542 	BUILD_BUG_ON(IPSTATS_MIB_ECT1PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_1);
543 	BUILD_BUG_ON(IPSTATS_MIB_ECT0PKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_ECT_0);
544 	BUILD_BUG_ON(IPSTATS_MIB_CEPKTS != IPSTATS_MIB_NOECTPKTS + INET_ECN_CE);
545 	__IP_ADD_STATS(net,
546 		       IPSTATS_MIB_NOECTPKTS + (iph->tos & INET_ECN_MASK),
547 		       max_t(unsigned short, 1, skb_shinfo(skb)->gso_segs));
548 
549 	if (!pskb_may_pull(skb, iph->ihl*4))
550 		goto inhdr_error;
551 
552 	iph = ip_hdr(skb);
553 
554 	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
555 		goto csum_error;
556 
557 	len = iph_totlen(skb, iph);
558 	if (skb->len < len) {
559 		drop_reason = SKB_DROP_REASON_PKT_TOO_SMALL;
560 		__IP_INC_STATS(net, IPSTATS_MIB_INTRUNCATEDPKTS);
561 		goto drop;
562 	} else if (len < (iph->ihl*4))
563 		goto inhdr_error;
564 
565 	/* Our transport medium may have padded the buffer out. Now we know it
566 	 * is IP we can trim to the true length of the frame.
567 	 * Note this now means skb->len holds ntohs(iph->tot_len).
568 	 */
569 	if (pskb_trim_rcsum(skb, len)) {
570 		__IP_INC_STATS(net, IPSTATS_MIB_INDISCARDS);
571 		goto drop;
572 	}
573 
574 	iph = ip_hdr(skb);
575 	skb->transport_header = skb->network_header + iph->ihl*4;
576 
577 	/* Remove any debris in the socket control block */
578 	memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
579 	IPCB(skb)->iif = skb->skb_iif;
580 
581 	/* Must drop socket now because of tproxy. */
582 	if (!skb_sk_is_prefetched(skb))
583 		skb_orphan(skb);
584 
585 	return skb;
586 
587 csum_error:
588 	drop_reason = SKB_DROP_REASON_IP_CSUM;
589 	__IP_INC_STATS(net, IPSTATS_MIB_CSUMERRORS);
590 inhdr_error:
591 	if (drop_reason == SKB_DROP_REASON_NOT_SPECIFIED)
592 		drop_reason = SKB_DROP_REASON_IP_INHDR;
593 	__IP_INC_STATS(net, IPSTATS_MIB_INHDRERRORS);
594 drop:
595 	kfree_skb_reason(skb, drop_reason);
596 out:
597 	return NULL;
598 }
599 
600 /*
601  * IP receive entry point
602  */
603 int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt,
604 	   struct net_device *orig_dev)
605 {
606 	struct net *net = dev_net(dev);
607 
608 	skb = ip_rcv_core(skb, net);
609 	if (skb == NULL)
610 		return NET_RX_DROP;
611 
612 	return NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING,
613 		       net, NULL, skb, dev, NULL,
614 		       ip_rcv_finish);
615 }
616 
617 static void ip_sublist_rcv_finish(struct list_head *head)
618 {
619 	struct sk_buff *skb, *next;
620 
621 	list_for_each_entry_safe(skb, next, head, list) {
622 		skb_list_del_init(skb);
623 		dst_input(skb);
624 	}
625 }
626 
627 static struct sk_buff *ip_extract_route_hint(const struct net *net,
628 					     struct sk_buff *skb)
629 {
630 	const struct iphdr *iph = ip_hdr(skb);
631 
632 	if (fib4_has_custom_rules(net) ||
633 	    ipv4_is_lbcast(iph->daddr) ||
634 	    ipv4_is_zeronet(iph->daddr) ||
635 	    IPCB(skb)->flags & IPSKB_MULTIPATH)
636 		return NULL;
637 
638 	return skb;
639 }
640 
641 static void ip_list_rcv_finish(struct net *net, struct list_head *head)
642 {
643 	struct sk_buff *skb, *next, *hint = NULL;
644 	struct dst_entry *curr_dst = NULL;
645 	LIST_HEAD(sublist);
646 
647 	list_for_each_entry_safe(skb, next, head, list) {
648 		struct net_device *dev = skb->dev;
649 		struct dst_entry *dst;
650 
651 		skb_list_del_init(skb);
652 		/* if ingress device is enslaved to an L3 master device pass the
653 		 * skb to its handler for processing
654 		 */
655 		skb = l3mdev_ip_rcv(skb);
656 		if (!skb)
657 			continue;
658 		if (ip_rcv_finish_core(net, skb, dev, hint) == NET_RX_DROP)
659 			continue;
660 
661 		dst = skb_dst(skb);
662 		if (curr_dst != dst) {
663 			hint = ip_extract_route_hint(net, skb);
664 
665 			/* dispatch old sublist */
666 			if (!list_empty(&sublist))
667 				ip_sublist_rcv_finish(&sublist);
668 			/* start new sublist */
669 			INIT_LIST_HEAD(&sublist);
670 			curr_dst = dst;
671 		}
672 		list_add_tail(&skb->list, &sublist);
673 	}
674 	/* dispatch final sublist */
675 	ip_sublist_rcv_finish(&sublist);
676 }
677 
678 static void ip_sublist_rcv(struct list_head *head, struct net_device *dev,
679 			   struct net *net)
680 {
681 	NF_HOOK_LIST(NFPROTO_IPV4, NF_INET_PRE_ROUTING, net, NULL,
682 		     head, dev, NULL, ip_rcv_finish);
683 	ip_list_rcv_finish(net, head);
684 }
685 
686 /* Receive a list of IP packets */
687 void ip_list_rcv(struct list_head *head, struct packet_type *pt,
688 		 struct net_device *orig_dev)
689 {
690 	struct net_device *curr_dev = NULL;
691 	struct net *curr_net = NULL;
692 	struct sk_buff *skb, *next;
693 	LIST_HEAD(sublist);
694 
695 	list_for_each_entry_safe(skb, next, head, list) {
696 		struct net_device *dev = skb->dev;
697 		struct net *net = dev_net(dev);
698 
699 		skb_list_del_init(skb);
700 		skb = ip_rcv_core(skb, net);
701 		if (skb == NULL)
702 			continue;
703 
704 		if (curr_dev != dev || curr_net != net) {
705 			/* dispatch old sublist */
706 			if (!list_empty(&sublist))
707 				ip_sublist_rcv(&sublist, curr_dev, curr_net);
708 			/* start new sublist */
709 			INIT_LIST_HEAD(&sublist);
710 			curr_dev = dev;
711 			curr_net = net;
712 		}
713 		list_add_tail(&skb->list, &sublist);
714 	}
715 	/* dispatch final sublist */
716 	if (!list_empty(&sublist))
717 		ip_sublist_rcv(&sublist, curr_dev, curr_net);
718 }
719