xref: /linux/net/netfilter/nf_flow_table_core.c (revision fcee7d82f27d6a8b1ddc5bbefda59b4e441e9bc0)
1 // SPDX-License-Identifier: GPL-2.0-only
2 #include <linux/kernel.h>
3 #include <linux/init.h>
4 #include <linux/module.h>
5 #include <linux/netfilter.h>
6 #include <linux/rhashtable.h>
7 #include <linux/netdevice.h>
8 #include <net/ip.h>
9 #include <net/ip6_route.h>
10 #include <net/netfilter/nf_tables.h>
11 #include <net/netfilter/nf_flow_table.h>
12 #include <net/netfilter/nf_conntrack.h>
13 #include <net/netfilter/nf_conntrack_core.h>
14 #include <net/netfilter/nf_conntrack_l4proto.h>
15 #include <net/netfilter/nf_conntrack_tuple.h>
16 
17 static DEFINE_MUTEX(flowtable_lock);
18 static LIST_HEAD(flowtables);
19 static __read_mostly struct kmem_cache *flow_offload_cachep;
20 
21 static void
22 flow_offload_fill_dir(struct flow_offload *flow,
23 		      enum flow_offload_tuple_dir dir)
24 {
25 	struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple;
26 	struct nf_conntrack_tuple *ctt = &flow->ct->tuplehash[dir].tuple;
27 
28 	ft->dir = dir;
29 
30 	switch (ctt->src.l3num) {
31 	case NFPROTO_IPV4:
32 		ft->src_v4 = ctt->src.u3.in;
33 		ft->dst_v4 = ctt->dst.u3.in;
34 		break;
35 	case NFPROTO_IPV6:
36 		ft->src_v6 = ctt->src.u3.in6;
37 		ft->dst_v6 = ctt->dst.u3.in6;
38 		break;
39 	}
40 
41 	ft->l3proto = ctt->src.l3num;
42 	ft->l4proto = ctt->dst.protonum;
43 
44 	switch (ctt->dst.protonum) {
45 	case IPPROTO_TCP:
46 	case IPPROTO_UDP:
47 		ft->src_port = ctt->src.u.tcp.port;
48 		ft->dst_port = ctt->dst.u.tcp.port;
49 		break;
50 	}
51 }
52 
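/* Allocate a flow entry from the dedicated kmem cache, take a reference
 * on the conntrack entry and copy both tuple directions from it. The
 * NF_FLOW_SNAT/NF_FLOW_DNAT flags record whether the packet path has to
 * mangle addresses and ports later on.
 */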
53 struct flow_offload *flow_offload_alloc(struct nf_conn *ct)
54 {
55 	struct flow_offload *flow;
56 
57 	if (unlikely(nf_ct_is_dying(ct)))
58 		return NULL;
59 
60 	flow = kmem_cache_zalloc(flow_offload_cachep, GFP_ATOMIC);
61 	if (!flow)
62 		return NULL;
63 
64 	refcount_inc(&ct->ct_general.use);
65 	flow->ct = ct;
66 
67 	flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
68 	flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_REPLY);
69 
70 	if (ct->status & IPS_SRC_NAT)
71 		__set_bit(NF_FLOW_SNAT, &flow->flags);
72 	if (ct->status & IPS_DST_NAT)
73 		__set_bit(NF_FLOW_DNAT, &flow->flags);
74 
75 	return flow;
76 }
77 EXPORT_SYMBOL_GPL(flow_offload_alloc);
78 
79 static u32 flow_offload_dst_cookie(struct flow_offload_tuple *flow_tuple)
80 {
81 	if (flow_tuple->l3proto == NFPROTO_IPV6)
82 		return rt6_get_cookie(dst_rt6_info(flow_tuple->dst_cache));
83 
84 	return 0;
85 }
86 
87 static struct dst_entry *nft_route_dst_fetch(struct nf_flow_route *route,
88 					     enum flow_offload_tuple_dir dir)
89 {
90 	struct dst_entry *dst = route->tuple[dir].dst;
91 
92 	route->tuple[dir].dst = NULL;
93 
94 	return dst;
95 }
96 
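/* Copy the route information gathered by the caller into the flow tuple:
 * MTU, ingress interface and encapsulation headers. For the DIRECT xmit
 * path the destination/source MAC and output interface are cached and the
 * dst is released; for the NEIGH/XFRM xmit paths the dst itself is cached
 * together with a cookie (IPv6 only) used to detect stale routes.
 */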
97 static int flow_offload_fill_route(struct flow_offload *flow,
98 				   struct nf_flow_route *route,
99 				   enum flow_offload_tuple_dir dir)
100 {
101 	struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple;
102 	struct dst_entry *dst = nft_route_dst_fetch(route, dir);
103 	int i, j = 0;
104 
105 	switch (flow_tuple->l3proto) {
106 	case NFPROTO_IPV4:
107 		flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true);
108 		break;
109 	case NFPROTO_IPV6:
110 		flow_tuple->mtu = ip6_dst_mtu_maybe_forward(dst, true);
111 		break;
112 	}
113 
114 	flow_tuple->iifidx = route->tuple[dir].in.ifindex;
115 	for (i = route->tuple[dir].in.num_encaps - 1; i >= 0; i--) {
116 		flow_tuple->encap[j].id = route->tuple[dir].in.encap[i].id;
117 		flow_tuple->encap[j].proto = route->tuple[dir].in.encap[i].proto;
118 		if (route->tuple[dir].in.ingress_vlans & BIT(i))
119 			flow_tuple->in_vlan_ingress |= BIT(j);
120 		j++;
121 	}
122 
123 	flow_tuple->tun = route->tuple[dir].in.tun;
124 	flow_tuple->encap_num = route->tuple[dir].in.num_encaps;
125 	flow_tuple->needs_gso_segment = route->tuple[dir].out.needs_gso_segment;
126 	flow_tuple->tun_num = route->tuple[dir].in.num_tuns;
127 
128 	switch (route->tuple[dir].xmit_type) {
129 	case FLOW_OFFLOAD_XMIT_DIRECT:
130 		memcpy(flow_tuple->out.h_dest, route->tuple[dir].out.h_dest,
131 		       ETH_ALEN);
132 		memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source,
133 		       ETH_ALEN);
134 		flow_tuple->out.ifidx = route->tuple[dir].out.ifindex;
135 		dst_release(dst);
136 		break;
137 	case FLOW_OFFLOAD_XMIT_XFRM:
138 	case FLOW_OFFLOAD_XMIT_NEIGH:
139 		flow_tuple->ifidx = route->tuple[dir].out.ifindex;
140 		flow_tuple->dst_cache = dst;
141 		flow_tuple->dst_cookie = flow_offload_dst_cookie(flow_tuple);
142 		break;
143 	default:
144 		WARN_ON_ONCE(1);
145 		break;
146 	}
147 	flow_tuple->xmit_type = route->tuple[dir].xmit_type;
148 
149 	return 0;
150 }
151 
152 static void nft_flow_dst_release(struct flow_offload *flow,
153 				 enum flow_offload_tuple_dir dir)
154 {
155 	if (flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH ||
156 	    flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)
157 		dst_release(flow->tuplehash[dir].tuple.dst_cache);
158 }
159 
160 void flow_offload_route_init(struct flow_offload *flow,
161 			     struct nf_flow_route *route)
162 {
163 	flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL);
164 	flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY);
165 	flow->type = NF_FLOW_OFFLOAD_ROUTE;
166 }
167 EXPORT_SYMBOL_GPL(flow_offload_route_init);
168 
169 static inline bool nf_flow_has_expired(const struct flow_offload *flow)
170 {
171 	return nf_flow_timeout_delta(flow->timeout) <= 0;
172 }
173 
174 static void flow_offload_fixup_tcp(struct nf_conn *ct, u8 tcp_state)
175 {
176 	struct ip_ct_tcp *tcp = &ct->proto.tcp;
177 
178 	spin_lock_bh(&ct->lock);
179 	if (tcp->state != tcp_state)
180 		tcp->state = tcp_state;
181 
182 	/* syn packet triggers the TCP reopen case from conntrack. */
183 	if (tcp->state == TCP_CONNTRACK_CLOSE)
184 		ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
185 
186 	/* Conntrack state is outdated due to offload bypass.
187 	 * Clear IP_CT_TCP_FLAG_MAXACK_SET, otherwise conntrack's
188 	 * TCP reset validation will fail.
189 	 */
190 	tcp->seen[0].td_maxwin = 0;
191 	tcp->seen[0].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET;
192 	tcp->seen[1].td_maxwin = 0;
193 	tcp->seen[1].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET;
194 	spin_unlock_bh(&ct->lock);
195 }
196 
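/* Hand the connection back to classic conntrack once the flow leaves the
 * flowtable: resync the TCP state and pick a new conntrack timeout from
 * the per-protocol timeout tables, subtracting the offload grace period
 * if the flow entry had already expired. ct->timeout is only bumped if
 * the flow is closing or the new value exceeds what is currently left.
 */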
197 static void flow_offload_fixup_ct(struct flow_offload *flow)
198 {
199 	struct nf_conn *ct = flow->ct;
200 	struct net *net = nf_ct_net(ct);
201 	int l4num = nf_ct_protonum(ct);
202 	bool expired, closing = false;
203 	u32 offload_timeout = 0;
204 	s32 timeout;
205 
206 	if (l4num == IPPROTO_TCP) {
207 		const struct nf_tcp_net *tn = nf_tcp_pernet(net);
208 		u8 tcp_state;
209 
210 		/* Enter CLOSE state if fin/rst packet has been seen; this
211 		 * allows TCP reopen from conntrack. Otherwise, pick up from
212 		 * the last seen TCP state.
213 		 */
214 		closing = test_bit(NF_FLOW_CLOSING, &flow->flags);
215 		if (closing) {
216 			flow_offload_fixup_tcp(ct, TCP_CONNTRACK_CLOSE);
217 			timeout = READ_ONCE(tn->timeouts[TCP_CONNTRACK_CLOSE]);
218 			expired = false;
219 		} else {
220 			tcp_state = READ_ONCE(ct->proto.tcp.state);
221 			flow_offload_fixup_tcp(ct, tcp_state);
222 			timeout = READ_ONCE(tn->timeouts[tcp_state]);
223 			expired = nf_flow_has_expired(flow);
224 		}
225 		offload_timeout = READ_ONCE(tn->offload_timeout);
226 
227 	} else if (l4num == IPPROTO_UDP) {
228 		const struct nf_udp_net *tn = nf_udp_pernet(net);
229 		enum udp_conntrack state =
230 			test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ?
231 			UDP_CT_REPLIED : UDP_CT_UNREPLIED;
232 
233 		timeout = READ_ONCE(tn->timeouts[state]);
234 		expired = nf_flow_has_expired(flow);
235 		offload_timeout = READ_ONCE(tn->offload_timeout);
236 	} else {
237 		return;
238 	}
239 
240 	if (expired)
241 		timeout -= offload_timeout;
242 
243 	if (timeout < 0)
244 		timeout = 0;
245 
246 	if (closing ||
247 	    nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout)
248 		nf_ct_refresh(ct, timeout);
249 }
250 
251 static void flow_offload_route_release(struct flow_offload *flow)
252 {
253 	nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
254 	nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_REPLY);
255 }
256 
257 void flow_offload_free(struct flow_offload *flow)
258 {
259 	switch (flow->type) {
260 	case NF_FLOW_OFFLOAD_ROUTE:
261 		flow_offload_route_release(flow);
262 		break;
263 	default:
264 		break;
265 	}
266 	nf_ct_put(flow->ct);
267 	kfree_rcu(flow, rcu_head);
268 }
269 EXPORT_SYMBOL_GPL(flow_offload_free);
270 
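/* Both tuple directions are inserted as separate nodes into a single
 * rhashtable; hashing and comparison only cover the tuple fields that
 * precede the __hash marker.
 */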
271 static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
272 {
273 	const struct flow_offload_tuple *tuple = data;
274 
275 	return jhash(tuple, offsetof(struct flow_offload_tuple, __hash), seed);
276 }
277 
278 static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
279 {
280 	const struct flow_offload_tuple_rhash *tuplehash = data;
281 
282 	return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, __hash), seed);
283 }
284 
285 static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
286 					const void *ptr)
287 {
288 	const struct flow_offload_tuple *tuple = arg->key;
289 	const struct flow_offload_tuple_rhash *x = ptr;
290 
291 	if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, __hash)))
292 		return 1;
293 
294 	return 0;
295 }
296 
297 static const struct rhashtable_params nf_flow_offload_rhash_params = {
298 	.head_offset		= offsetof(struct flow_offload_tuple_rhash, node),
299 	.hashfn			= flow_offload_hash,
300 	.obj_hashfn		= flow_offload_hash_obj,
301 	.obj_cmpfn		= flow_offload_hash_cmp,
302 	.automatic_shrinking	= true,
303 };
304 
305 unsigned long flow_offload_get_timeout(struct flow_offload *flow)
306 {
307 	unsigned long timeout = NF_FLOW_TIMEOUT;
308 	struct net *net = nf_ct_net(flow->ct);
309 	int l4num = nf_ct_protonum(flow->ct);
310 
311 	if (l4num == IPPROTO_TCP) {
312 		struct nf_tcp_net *tn = nf_tcp_pernet(net);
313 
314 		timeout = tn->offload_timeout;
315 	} else if (l4num == IPPROTO_UDP) {
316 		struct nf_udp_net *tn = nf_udp_pernet(net);
317 
318 		timeout = tn->offload_timeout;
319 	}
320 
321 	return timeout;
322 }
323 
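/* Insert both tuple directions into the flowtable and push the entry to
 * hardware when the flowtable supports offload. The conntrack timeout is
 * bumped to NF_CT_DAY so the conntrack gc does not evict the entry while
 * packets bypass nf_conntrack_in(). Typical callers (e.g. the nft
 * flow_offload expression) allocate with flow_offload_alloc(), fill in
 * the routes via flow_offload_route_init() and then call this function.
 */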
324 int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
325 {
326 	int err;
327 
328 	flow->timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow);
329 
330 	err = rhashtable_insert_fast(&flow_table->rhashtable,
331 				     &flow->tuplehash[0].node,
332 				     nf_flow_offload_rhash_params);
333 	if (err < 0)
334 		return err;
335 
336 	err = rhashtable_insert_fast(&flow_table->rhashtable,
337 				     &flow->tuplehash[1].node,
338 				     nf_flow_offload_rhash_params);
339 	if (err < 0) {
340 		rhashtable_remove_fast(&flow_table->rhashtable,
341 				       &flow->tuplehash[0].node,
342 				       nf_flow_offload_rhash_params);
343 		return err;
344 	}
345 
346 	nf_ct_refresh(flow->ct, NF_CT_DAY);
347 
348 	if (nf_flowtable_hw_offload(flow_table)) {
349 		__set_bit(NF_FLOW_HW, &flow->flags);
350 		nf_flow_offload_add(flow_table, flow);
351 	}
352 
353 	return 0;
354 }
355 EXPORT_SYMBOL_GPL(flow_offload_add);
356 
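/* Called from the packet path: refresh the flow timeout, but only write
 * it when it moved by more than HZ (or when forced) to limit cache line
 * dirtying, and re-announce the flow to the hardware offload path unless
 * the flow is already closing.
 */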
357 void flow_offload_refresh(struct nf_flowtable *flow_table,
358 			  struct flow_offload *flow, bool force)
359 {
360 	u32 timeout;
361 
362 	timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow);
363 	if (force || timeout - READ_ONCE(flow->timeout) > HZ)
364 		WRITE_ONCE(flow->timeout, timeout);
365 	else
366 		return;
367 
368 	if (likely(!nf_flowtable_hw_offload(flow_table)) ||
369 	    test_bit(NF_FLOW_CLOSING, &flow->flags))
370 		return;
371 
372 	nf_flow_offload_add(flow_table, flow);
373 }
374 EXPORT_SYMBOL_GPL(flow_offload_refresh);
375 
376 static void flow_offload_del(struct nf_flowtable *flow_table,
377 			     struct flow_offload *flow)
378 {
379 	rhashtable_remove_fast(&flow_table->rhashtable,
380 			       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
381 			       nf_flow_offload_rhash_params);
382 	rhashtable_remove_fast(&flow_table->rhashtable,
383 			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
384 			       nf_flow_offload_rhash_params);
385 	flow_offload_free(flow);
386 }
387 
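/* Mark the flow for removal: clearing IPS_OFFLOAD_BIT lets conntrack
 * handle the connection normally again, and the first caller to set
 * NF_FLOW_TEARDOWN also restores the conntrack state and timeout.
 */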
388 void flow_offload_teardown(struct flow_offload *flow)
389 {
390 	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
391 	if (!test_and_set_bit(NF_FLOW_TEARDOWN, &flow->flags))
392 		flow_offload_fixup_ct(flow);
393 }
394 EXPORT_SYMBOL_GPL(flow_offload_teardown);
395 
396 struct flow_offload_tuple_rhash *
397 flow_offload_lookup(struct nf_flowtable *flow_table,
398 		    struct flow_offload_tuple *tuple)
399 {
400 	struct flow_offload_tuple_rhash *tuplehash;
401 	struct flow_offload *flow;
402 	int dir;
403 
404 	tuplehash = rhashtable_lookup(&flow_table->rhashtable, tuple,
405 				      nf_flow_offload_rhash_params);
406 	if (!tuplehash)
407 		return NULL;
408 
409 	dir = tuplehash->tuple.dir;
410 	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
411 	if (test_bit(NF_FLOW_TEARDOWN, &flow->flags))
412 		return NULL;
413 
414 	if (unlikely(nf_ct_is_dying(flow->ct)))
415 		return NULL;
416 
417 	return tuplehash;
418 }
419 EXPORT_SYMBOL_GPL(flow_offload_lookup);
420 
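/* Walk all flows in the table and invoke @iter once per flow: the
 * reply-direction hash nodes are skipped so every flow is visited only
 * once, and -EAGAIN from the rhashtable walker (table resize in
 * progress) is not treated as an error.
 */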
421 static int
422 nf_flow_table_iterate(struct nf_flowtable *flow_table,
423 		      void (*iter)(struct nf_flowtable *flowtable,
424 				   struct flow_offload *flow, void *data),
425 		      void *data)
426 {
427 	struct flow_offload_tuple_rhash *tuplehash;
428 	struct rhashtable_iter hti;
429 	struct flow_offload *flow;
430 	int err = 0;
431 
432 	rhashtable_walk_enter(&flow_table->rhashtable, &hti);
433 	rhashtable_walk_start(&hti);
434 
435 	while ((tuplehash = rhashtable_walk_next(&hti))) {
436 		if (IS_ERR(tuplehash)) {
437 			if (PTR_ERR(tuplehash) != -EAGAIN) {
438 				err = PTR_ERR(tuplehash);
439 				break;
440 			}
441 			continue;
442 		}
443 		if (tuplehash->tuple.dir)
444 			continue;
445 
446 		flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
447 
448 		iter(flow_table, flow, data);
449 	}
450 	rhashtable_walk_stop(&hti);
451 	rhashtable_walk_exit(&hti);
452 
453 	return err;
454 }
455 
456 static bool nf_flow_custom_gc(struct nf_flowtable *flow_table,
457 			      const struct flow_offload *flow)
458 {
459 	return flow_table->type->gc && flow_table->type->gc(flow);
460 }
461 
462 /**
463  * nf_flow_table_tcp_timeout() - new timeout of offloaded tcp entry
464  * @ct:		Flowtable offloaded tcp ct
465  *
466  * Return: timeout in jiffies after which the ct entry should expire.
467  */
468 static u32 nf_flow_table_tcp_timeout(const struct nf_conn *ct)
469 {
470 	u8 state = READ_ONCE(ct->proto.tcp.state);
471 
472 	switch (state) {
473 	case TCP_CONNTRACK_SYN_SENT:
474 	case TCP_CONNTRACK_SYN_RECV:
475 		return 0;
476 	case TCP_CONNTRACK_ESTABLISHED:
477 		return NF_CT_DAY;
478 	case TCP_CONNTRACK_FIN_WAIT:
479 	case TCP_CONNTRACK_CLOSE_WAIT:
480 	case TCP_CONNTRACK_LAST_ACK:
481 	case TCP_CONNTRACK_TIME_WAIT:
482 		return 5 * 60 * HZ;
483 	case TCP_CONNTRACK_CLOSE:
484 		return 0;
485 	}
486 
487 	return 0;
488 }
489 
490 /**
491  * nf_flow_table_extend_ct_timeout() - Extend ct timeout of offloaded conntrack entry
492  * @ct:		Flowtable offloaded ct
493  *
494  * Datapath lookups in the conntrack table will evict nf_conn entries
495  * if they have expired.
496  *
497  * Once nf_conn entries have been offloaded, nf_conntrack might not see any
498  * packets anymore.  Thus ct->timeout is no longer refreshed and ct can
499  * be evicted.
500  *
501  * To avoid the need for an additional check on the offload bit for every
502  * packet processed via nf_conntrack_in(), set an arbitrary timeout large
503  * enough not to ever expire; this saves us a check for the IPS_OFFLOAD_BIT
504  * from the packet path via nf_ct_is_expired().
505  */
506 static void nf_flow_table_extend_ct_timeout(struct nf_conn *ct)
507 {
508 	static const u32 min_timeout = 5 * 60 * HZ;
509 	u32 expires = nf_ct_expires(ct);
510 
511 	/* normal case: large enough timeout, nothing to do. */
512 	if (likely(expires >= min_timeout))
513 		return;
514 
515 	/* must check offload bit after this, we do not hold any locks.
516 	 * flowtable and ct entries could have been removed on another CPU.
517 	 */
518 	if (!refcount_inc_not_zero(&ct->ct_general.use))
519 		return;
520 
521 	/* load ct->status after refcount increase */
522 	smp_acquire__after_ctrl_dep();
523 
524 	if (nf_ct_is_confirmed(ct) &&
525 	    test_bit(IPS_OFFLOAD_BIT, &ct->status)) {
526 		u8 l4proto = nf_ct_protonum(ct);
527 		u32 new_timeout = true;
528 
529 		switch (l4proto) {
530 		case IPPROTO_UDP:
531 			new_timeout = NF_CT_DAY;
532 			break;
533 		case IPPROTO_TCP:
534 			new_timeout = nf_flow_table_tcp_timeout(ct);
535 			break;
536 		default:
537 			WARN_ON_ONCE(1);
538 			break;
539 		}
540 
541 		/* Update to ct->timeout from nf_conntrack happens
542 		 * without holding ct->lock.
543 		 *
544 		 * Use cmpxchg to ensure timeout extension doesn't
545 		 * happen when we race with conntrack datapath.
546 		 *
547 		 * The inverse -- datapath updating ->timeout right
548 		 * after this -- is fine, datapath is authoritative.
549 		 */
550 		if (new_timeout) {
551 			new_timeout += nfct_time_stamp;
552 			cmpxchg(&ct->timeout, expires, new_timeout);
553 		}
554 	}
555 
556 	nf_ct_put(ct);
557 }
558 
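/* Per-flow garbage collection: expired flows, flows whose conntrack is
 * dying and flows flagged by the flowtable type's custom gc callback are
 * torn down and freed (after the hardware deletion has completed when
 * NF_FLOW_HW is set). Flows in the closing state have their hardware
 * entry removed early; other live flows keep their conntrack timeout
 * extended and, if offloaded to hardware, have their stats collected.
 */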
559 static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table,
560 				    struct flow_offload *flow, void *data)
561 {
562 	bool teardown = test_bit(NF_FLOW_TEARDOWN, &flow->flags);
563 
564 	if (nf_flow_has_expired(flow) ||
565 	    nf_ct_is_dying(flow->ct) ||
566 	    nf_flow_custom_gc(flow_table, flow)) {
567 		flow_offload_teardown(flow);
568 		teardown = true;
569 	} else if (!teardown) {
570 		nf_flow_table_extend_ct_timeout(flow->ct);
571 	}
572 
573 	if (teardown) {
574 		if (test_bit(NF_FLOW_HW, &flow->flags)) {
575 			if (!test_bit(NF_FLOW_HW_DYING, &flow->flags))
576 				nf_flow_offload_del(flow_table, flow);
577 			else if (test_bit(NF_FLOW_HW_DEAD, &flow->flags))
578 				flow_offload_del(flow_table, flow);
579 		} else {
580 			flow_offload_del(flow_table, flow);
581 		}
582 	} else if (test_bit(NF_FLOW_CLOSING, &flow->flags) &&
583 		   test_bit(NF_FLOW_HW, &flow->flags) &&
584 		   !test_bit(NF_FLOW_HW_DYING, &flow->flags)) {
585 		nf_flow_offload_del(flow_table, flow);
586 	} else if (test_bit(NF_FLOW_HW, &flow->flags)) {
587 		nf_flow_offload_stats(flow_table, flow);
588 	}
589 }
590 
591 void nf_flow_table_gc_run(struct nf_flowtable *flow_table)
592 {
593 	nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, NULL);
594 }
595 
596 static void nf_flow_offload_work_gc(struct work_struct *work)
597 {
598 	struct nf_flowtable *flow_table;
599 
600 	flow_table = container_of(work, struct nf_flowtable, gc_work.work);
601 	nf_flow_table_gc_run(flow_table);
602 	queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
603 }
604 
605 static void nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
606 				 __be16 port, __be16 new_port)
607 {
608 	struct tcphdr *tcph;
609 
610 	tcph = (void *)(skb_network_header(skb) + thoff);
611 	inet_proto_csum_replace2(&tcph->check, skb, port, new_port, false);
612 }
613 
614 static void nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
615 				 __be16 port, __be16 new_port)
616 {
617 	struct udphdr *udph;
618 
619 	udph = (void *)(skb_network_header(skb) + thoff);
620 	if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
621 		inet_proto_csum_replace2(&udph->check, skb, port,
622 					 new_port, false);
623 		if (!udph->check)
624 			udph->check = CSUM_MANGLED_0;
625 	}
626 }
627 
628 static void nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
629 			     u8 protocol, __be16 port, __be16 new_port)
630 {
631 	switch (protocol) {
632 	case IPPROTO_TCP:
633 		nf_flow_nat_port_tcp(skb, thoff, port, new_port);
634 		break;
635 	case IPPROTO_UDP:
636 		nf_flow_nat_port_udp(skb, thoff, port, new_port);
637 		break;
638 	}
639 }
640 
641 void nf_flow_snat_port(const struct flow_offload *flow,
642 		       struct sk_buff *skb, unsigned int thoff,
643 		       u8 protocol, enum flow_offload_tuple_dir dir)
644 {
645 	struct flow_ports *hdr;
646 	__be16 port, new_port;
647 
648 	hdr = (void *)(skb_network_header(skb) + thoff);
649 
650 	switch (dir) {
651 	case FLOW_OFFLOAD_DIR_ORIGINAL:
652 		port = hdr->source;
653 		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port;
654 		hdr->source = new_port;
655 		break;
656 	case FLOW_OFFLOAD_DIR_REPLY:
657 		port = hdr->dest;
658 		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port;
659 		hdr->dest = new_port;
660 		break;
661 	}
662 
663 	nf_flow_nat_port(skb, thoff, protocol, port, new_port);
664 }
665 EXPORT_SYMBOL_GPL(nf_flow_snat_port);
666 
667 void nf_flow_dnat_port(const struct flow_offload *flow, struct sk_buff *skb,
668 		       unsigned int thoff, u8 protocol,
669 		       enum flow_offload_tuple_dir dir)
670 {
671 	struct flow_ports *hdr;
672 	__be16 port, new_port;
673 
674 	hdr = (void *)(skb_network_header(skb) + thoff);
675 
676 	switch (dir) {
677 	case FLOW_OFFLOAD_DIR_ORIGINAL:
678 		port = hdr->dest;
679 		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port;
680 		hdr->dest = new_port;
681 		break;
682 	case FLOW_OFFLOAD_DIR_REPLY:
683 		port = hdr->source;
684 		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port;
685 		hdr->source = new_port;
686 		break;
687 	}
688 
689 	nf_flow_nat_port(skb, thoff, protocol, port, new_port);
690 }
691 EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
692 
693 int nf_flow_table_init(struct nf_flowtable *flowtable)
694 {
695 	int err;
696 
697 	INIT_DELAYED_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
698 	flow_block_init(&flowtable->flow_block);
699 	init_rwsem(&flowtable->flow_block_lock);
700 
701 	err = rhashtable_init(&flowtable->rhashtable,
702 			      &nf_flow_offload_rhash_params);
703 	if (err < 0)
704 		return err;
705 
706 	queue_delayed_work(system_power_efficient_wq,
707 			   &flowtable->gc_work, HZ);
708 
709 	mutex_lock(&flowtable_lock);
710 	list_add(&flowtable->list, &flowtables);
711 	mutex_unlock(&flowtable_lock);
712 
713 	return 0;
714 }
715 EXPORT_SYMBOL_GPL(nf_flow_table_init);
716 
717 static void nf_flow_table_do_cleanup(struct nf_flowtable *flow_table,
718 				     struct flow_offload *flow, void *data)
719 {
720 	struct net_device *dev = data;
721 
722 	if (!dev) {
723 		flow_offload_teardown(flow);
724 		return;
725 	}
726 
727 	if (net_eq(nf_ct_net(flow->ct), dev_net(dev)) &&
728 	    (flow->tuplehash[0].tuple.iifidx == dev->ifindex ||
729 	     flow->tuplehash[1].tuple.iifidx == dev->ifindex))
730 		flow_offload_teardown(flow);
731 }
732 
733 void nf_flow_table_gc_cleanup(struct nf_flowtable *flowtable,
734 			      struct net_device *dev)
735 {
736 	nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev);
737 	flush_delayed_work(&flowtable->gc_work);
738 	nf_flow_table_offload_flush(flowtable);
739 }
740 
741 void nf_flow_table_cleanup(struct net_device *dev)
742 {
743 	struct nf_flowtable *flowtable;
744 
745 	mutex_lock(&flowtable_lock);
746 	list_for_each_entry(flowtable, &flowtables, list)
747 		nf_flow_table_gc_cleanup(flowtable, dev);
748 	mutex_unlock(&flowtable_lock);
749 }
750 EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);
751 
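/* Flowtable destruction: unlink from the global list, stop the gc work,
 * flush pending hardware offload work, tear down and free whatever flows
 * are left, then destroy the rhashtable.
 */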
752 void nf_flow_table_free(struct nf_flowtable *flow_table)
753 {
754 	mutex_lock(&flowtable_lock);
755 	list_del(&flow_table->list);
756 	mutex_unlock(&flowtable_lock);
757 
758 	cancel_delayed_work_sync(&flow_table->gc_work);
759 	nf_flow_table_offload_flush(flow_table);
760 	/* ... no more pending work after this stage ... */
761 	nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
762 	nf_flow_table_gc_run(flow_table);
763 	nf_flow_table_offload_flush_cleanup(flow_table);
764 	rhashtable_destroy(&flow_table->rhashtable);
765 }
766 EXPORT_SYMBOL_GPL(nf_flow_table_free);
767 
768 static int nf_flow_table_init_net(struct net *net)
769 {
770 	net->ft.stat = alloc_percpu(struct nf_flow_table_stat);
771 	return net->ft.stat ? 0 : -ENOMEM;
772 }
773 
774 static void nf_flow_table_fini_net(struct net *net)
775 {
776 	free_percpu(net->ft.stat);
777 }
778 
779 static int nf_flow_table_pernet_init(struct net *net)
780 {
781 	int ret;
782 
783 	ret = nf_flow_table_init_net(net);
784 	if (ret < 0)
785 		return ret;
786 
787 	ret = nf_flow_table_init_proc(net);
788 	if (ret < 0)
789 		goto out_proc;
790 
791 	return 0;
792 
793 out_proc:
794 	nf_flow_table_fini_net(net);
795 	return ret;
796 }
797 
798 static void nf_flow_table_pernet_exit(struct list_head *net_exit_list)
799 {
800 	struct net *net;
801 
802 	list_for_each_entry(net, net_exit_list, exit_list) {
803 		nf_flow_table_fini_proc(net);
804 		nf_flow_table_fini_net(net);
805 	}
806 }
807 
808 static struct pernet_operations nf_flow_table_net_ops = {
809 	.init = nf_flow_table_pernet_init,
810 	.exit_batch = nf_flow_table_pernet_exit,
811 };
812 
813 static int __init nf_flow_table_module_init(void)
814 {
815 	int ret;
816 
817 	flow_offload_cachep = KMEM_CACHE(flow_offload, SLAB_HWCACHE_ALIGN);
818 	if (!flow_offload_cachep)
819 		return -ENOMEM;
820 
821 	ret = register_pernet_subsys(&nf_flow_table_net_ops);
822 	if (ret < 0)
823 		goto out_pernet;
824 
825 	ret = nf_flow_table_offload_init();
826 	if (ret)
827 		goto out_offload;
828 
829 	ret = nf_flow_register_bpf();
830 	if (ret)
831 		goto out_bpf;
832 
833 	return 0;
834 
835 out_bpf:
836 	nf_flow_table_offload_exit();
837 out_offload:
838 	unregister_pernet_subsys(&nf_flow_table_net_ops);
839 out_pernet:
840 	kmem_cache_destroy(flow_offload_cachep);
841 	return ret;
842 }
843 
844 static void __exit nf_flow_table_module_exit(void)
845 {
846 	nf_flow_table_offload_exit();
847 	unregister_pernet_subsys(&nf_flow_table_net_ops);
848 	kmem_cache_destroy(flow_offload_cachep);
849 }
850 
851 module_init(nf_flow_table_module_init);
852 module_exit(nf_flow_table_module_exit);
853 
854 MODULE_LICENSE("GPL");
855 MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
856 MODULE_DESCRIPTION("Netfilter flow table module");
857