xref: /linux/net/netfilter/nf_flow_table_core.c (revision 53b3e60edb674b442b2b3bbdba484667b0f47a5d)
1 // SPDX-License-Identifier: GPL-2.0-only
2 #include <linux/kernel.h>
3 #include <linux/init.h>
4 #include <linux/module.h>
5 #include <linux/netfilter.h>
6 #include <linux/rhashtable.h>
7 #include <linux/netdevice.h>
8 #include <net/ip.h>
9 #include <net/ip6_route.h>
10 #include <net/netfilter/nf_tables.h>
11 #include <net/netfilter/nf_flow_table.h>
12 #include <net/netfilter/nf_conntrack.h>
13 #include <net/netfilter/nf_conntrack_core.h>
14 #include <net/netfilter/nf_conntrack_l4proto.h>
15 #include <net/netfilter/nf_conntrack_tuple.h>
16 
17 static DEFINE_MUTEX(flowtable_lock);
18 static LIST_HEAD(flowtables);
19 static __read_mostly struct kmem_cache *flow_offload_cachep;
20 
21 static void
22 flow_offload_fill_dir(struct flow_offload *flow,
23 		      enum flow_offload_tuple_dir dir)
24 {
25 	struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple;
26 	struct nf_conntrack_tuple *ctt = &flow->ct->tuplehash[dir].tuple;
27 
28 	ft->dir = dir;
29 
30 	switch (ctt->src.l3num) {
31 	case NFPROTO_IPV4:
32 		ft->src_v4 = ctt->src.u3.in;
33 		ft->dst_v4 = ctt->dst.u3.in;
34 		break;
35 	case NFPROTO_IPV6:
36 		ft->src_v6 = ctt->src.u3.in6;
37 		ft->dst_v6 = ctt->dst.u3.in6;
38 		break;
39 	}
40 
41 	ft->l3proto = ctt->src.l3num;
42 	ft->l4proto = ctt->dst.protonum;
43 
44 	switch (ctt->dst.protonum) {
45 	case IPPROTO_TCP:
46 	case IPPROTO_UDP:
47 		ft->src_port = ctt->src.u.tcp.port;
48 		ft->dst_port = ctt->dst.u.tcp.port;
49 		break;
50 	}
51 }
52 
53 struct flow_offload *flow_offload_alloc(struct nf_conn *ct)
54 {
55 	struct flow_offload *flow;
56 
57 	if (unlikely(nf_ct_is_dying(ct)))
58 		return NULL;
59 
60 	flow = kmem_cache_zalloc(flow_offload_cachep, GFP_ATOMIC);
61 	if (!flow)
62 		return NULL;
63 
64 	refcount_inc(&ct->ct_general.use);
65 	flow->ct = ct;
66 
67 	flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
68 	flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_REPLY);
69 
70 	if (ct->status & IPS_SRC_NAT)
71 		__set_bit(NF_FLOW_SNAT, &flow->flags);
72 	if (ct->status & IPS_DST_NAT)
73 		__set_bit(NF_FLOW_DNAT, &flow->flags);
74 
75 	return flow;
76 }
77 EXPORT_SYMBOL_GPL(flow_offload_alloc);
78 
79 static u32 flow_offload_dst_cookie(struct flow_offload_tuple *flow_tuple)
80 {
81 	if (flow_tuple->l3proto == NFPROTO_IPV6)
82 		return rt6_get_cookie(dst_rt6_info(flow_tuple->dst_cache));
83 
84 	return 0;
85 }
86 
87 static struct dst_entry *nft_route_dst_fetch(struct nf_flow_route *route,
88 					     enum flow_offload_tuple_dir dir)
89 {
90 	struct dst_entry *dst = route->tuple[dir].dst;
91 
92 	route->tuple[dir].dst = NULL;
93 
94 	return dst;
95 }
96 
97 static int flow_offload_fill_route(struct flow_offload *flow,
98 				   struct nf_flow_route *route,
99 				   enum flow_offload_tuple_dir dir)
100 {
101 	struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple;
102 	struct dst_entry *dst = nft_route_dst_fetch(route, dir);
103 	int i, j = 0;
104 
105 	switch (flow_tuple->l3proto) {
106 	case NFPROTO_IPV4:
107 		flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true);
108 		break;
109 	case NFPROTO_IPV6:
110 		flow_tuple->mtu = ip6_dst_mtu_maybe_forward(dst, true);
111 		break;
112 	}
113 
114 	flow_tuple->iifidx = route->tuple[dir].in.ifindex;
115 	for (i = route->tuple[dir].in.num_encaps - 1; i >= 0; i--) {
116 		flow_tuple->encap[j].id = route->tuple[dir].in.encap[i].id;
117 		flow_tuple->encap[j].proto = route->tuple[dir].in.encap[i].proto;
118 		if (route->tuple[dir].in.ingress_vlans & BIT(i))
119 			flow_tuple->in_vlan_ingress |= BIT(j);
120 		j++;
121 	}
122 
123 	flow_tuple->tun = route->tuple[dir].in.tun;
124 	flow_tuple->encap_num = route->tuple[dir].in.num_encaps;
125 	flow_tuple->needs_gso_segment = route->tuple[dir].out.needs_gso_segment;
126 	flow_tuple->tun_num = route->tuple[dir].in.num_tuns;
127 
128 	switch (route->tuple[dir].xmit_type) {
129 	case FLOW_OFFLOAD_XMIT_DIRECT:
130 		memcpy(flow_tuple->out.h_dest, route->tuple[dir].out.h_dest,
131 		       ETH_ALEN);
132 		memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source,
133 		       ETH_ALEN);
134 		flow_tuple->out.ifidx = route->tuple[dir].out.ifindex;
135 		dst_release(dst);
136 		break;
137 	case FLOW_OFFLOAD_XMIT_XFRM:
138 	case FLOW_OFFLOAD_XMIT_NEIGH:
139 		flow_tuple->ifidx = route->tuple[dir].out.ifindex;
140 		flow_tuple->dst_cache = dst;
141 		flow_tuple->dst_cookie = flow_offload_dst_cookie(flow_tuple);
142 		break;
143 	default:
144 		WARN_ON_ONCE(1);
145 		break;
146 	}
147 	flow_tuple->xmit_type = route->tuple[dir].xmit_type;
148 
149 	return 0;
150 }
151 
152 static void nft_flow_dst_release(struct flow_offload *flow,
153 				 enum flow_offload_tuple_dir dir)
154 {
155 	if (flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH ||
156 	    flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)
157 		dst_release(flow->tuplehash[dir].tuple.dst_cache);
158 }
159 
160 void flow_offload_route_init(struct flow_offload *flow,
161 			     struct nf_flow_route *route)
162 {
163 	flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL);
164 	flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY);
165 	flow->type = NF_FLOW_OFFLOAD_ROUTE;
166 }
167 EXPORT_SYMBOL_GPL(flow_offload_route_init);
168 
169 static inline bool nf_flow_has_expired(const struct flow_offload *flow)
170 {
171 	return nf_flow_timeout_delta(flow->timeout) <= 0;
172 }
173 
174 static void flow_offload_fixup_tcp(struct nf_conn *ct, u8 tcp_state)
175 {
176 	struct ip_ct_tcp *tcp = &ct->proto.tcp;
177 
178 	spin_lock_bh(&ct->lock);
179 	if (tcp->state != tcp_state)
180 		tcp->state = tcp_state;
181 
182 	/* syn packet triggers the TCP reopen case from conntrack. */
183 	if (tcp->state == TCP_CONNTRACK_CLOSE)
184 		ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
185 
186 	/* Conntrack state is outdated due to offload bypass.
187 	 * Clear IP_CT_TCP_FLAG_MAXACK_SET, otherwise conntracks
188 	 * TCP reset validation will fail.
189 	 */
190 	tcp->seen[0].td_maxwin = 0;
191 	tcp->seen[0].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET;
192 	tcp->seen[1].td_maxwin = 0;
193 	tcp->seen[1].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET;
194 	spin_unlock_bh(&ct->lock);
195 }
196 
197 static void flow_offload_fixup_ct(struct flow_offload *flow)
198 {
199 	struct nf_conn *ct = flow->ct;
200 	struct net *net = nf_ct_net(ct);
201 	int l4num = nf_ct_protonum(ct);
202 	bool expired, closing = false;
203 	u32 offload_timeout = 0;
204 	s32 timeout;
205 
206 	if (l4num == IPPROTO_TCP) {
207 		const struct nf_tcp_net *tn = nf_tcp_pernet(net);
208 		u8 tcp_state;
209 
210 		/* Enter CLOSE state if fin/rst packet has been seen, this
211 		 * allows TCP reopen from conntrack. Otherwise, pick up from
212 		 * the last seen TCP state.
213 		 */
214 		closing = test_bit(NF_FLOW_CLOSING, &flow->flags);
215 		if (closing) {
216 			flow_offload_fixup_tcp(ct, TCP_CONNTRACK_CLOSE);
217 			timeout = READ_ONCE(tn->timeouts[TCP_CONNTRACK_CLOSE]);
218 			expired = false;
219 		} else {
220 			tcp_state = READ_ONCE(ct->proto.tcp.state);
221 			flow_offload_fixup_tcp(ct, tcp_state);
222 			timeout = READ_ONCE(tn->timeouts[tcp_state]);
223 			expired = nf_flow_has_expired(flow);
224 		}
225 		offload_timeout = READ_ONCE(tn->offload_timeout);
226 
227 	} else if (l4num == IPPROTO_UDP) {
228 		const struct nf_udp_net *tn = nf_udp_pernet(net);
229 		enum udp_conntrack state =
230 			test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ?
231 			UDP_CT_REPLIED : UDP_CT_UNREPLIED;
232 
233 		timeout = READ_ONCE(tn->timeouts[state]);
234 		expired = nf_flow_has_expired(flow);
235 		offload_timeout = READ_ONCE(tn->offload_timeout);
236 	} else {
237 		return;
238 	}
239 
240 	if (expired)
241 		timeout -= offload_timeout;
242 
243 	if (timeout < 0)
244 		timeout = 0;
245 
246 	if (closing ||
247 	    nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout)
248 		nf_ct_refresh(ct, timeout);
249 }
250 
251 static void flow_offload_route_release(struct flow_offload *flow)
252 {
253 	nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
254 	nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_REPLY);
255 }
256 
257 void flow_offload_free(struct flow_offload *flow)
258 {
259 	switch (flow->type) {
260 	case NF_FLOW_OFFLOAD_ROUTE:
261 		flow_offload_route_release(flow);
262 		break;
263 	default:
264 		break;
265 	}
266 	nf_ct_put(flow->ct);
267 	kfree_rcu(flow, rcu_head);
268 }
269 EXPORT_SYMBOL_GPL(flow_offload_free);
270 
271 static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
272 {
273 	const struct flow_offload_tuple *tuple = data;
274 
275 	return jhash(tuple, offsetof(struct flow_offload_tuple, __hash), seed);
276 }
277 
278 static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
279 {
280 	const struct flow_offload_tuple_rhash *tuplehash = data;
281 
282 	return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, __hash), seed);
283 }
284 
285 static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
286 					const void *ptr)
287 {
288 	const struct flow_offload_tuple *tuple = arg->key;
289 	const struct flow_offload_tuple_rhash *x = ptr;
290 
291 	if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, __hash)))
292 		return 1;
293 
294 	return 0;
295 }
296 
297 static const struct rhashtable_params nf_flow_offload_rhash_params = {
298 	.head_offset		= offsetof(struct flow_offload_tuple_rhash, node),
299 	.hashfn			= flow_offload_hash,
300 	.obj_hashfn		= flow_offload_hash_obj,
301 	.obj_cmpfn		= flow_offload_hash_cmp,
302 	.automatic_shrinking	= true,
303 };
304 
305 unsigned long flow_offload_get_timeout(struct flow_offload *flow)
306 {
307 	unsigned long timeout = NF_FLOW_TIMEOUT;
308 	struct net *net = nf_ct_net(flow->ct);
309 	int l4num = nf_ct_protonum(flow->ct);
310 
311 	if (l4num == IPPROTO_TCP) {
312 		struct nf_tcp_net *tn = nf_tcp_pernet(net);
313 
314 		timeout = tn->offload_timeout;
315 	} else if (l4num == IPPROTO_UDP) {
316 		struct nf_udp_net *tn = nf_udp_pernet(net);
317 
318 		timeout = tn->offload_timeout;
319 	}
320 
321 	return timeout;
322 }
323 
324 int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
325 {
326 	int err;
327 
328 	flow->timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow);
329 
330 	err = rhashtable_insert_fast(&flow_table->rhashtable,
331 				     &flow->tuplehash[0].node,
332 				     nf_flow_offload_rhash_params);
333 	if (err < 0)
334 		return err;
335 
336 	err = rhashtable_insert_fast(&flow_table->rhashtable,
337 				     &flow->tuplehash[1].node,
338 				     nf_flow_offload_rhash_params);
339 	if (err < 0) {
340 		rhashtable_remove_fast(&flow_table->rhashtable,
341 				       &flow->tuplehash[0].node,
342 				       nf_flow_offload_rhash_params);
343 		return err;
344 	}
345 
346 	nf_ct_refresh(flow->ct, NF_CT_DAY);
347 
348 	if (nf_flowtable_hw_offload(flow_table)) {
349 		__set_bit(NF_FLOW_HW, &flow->flags);
350 		nf_flow_offload_add(flow_table, flow);
351 	}
352 
353 	return 0;
354 }
355 EXPORT_SYMBOL_GPL(flow_offload_add);
356 
357 void flow_offload_refresh(struct nf_flowtable *flow_table,
358 			  struct flow_offload *flow, bool force)
359 {
360 	u32 timeout;
361 
362 	timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow);
363 	if (force || timeout - READ_ONCE(flow->timeout) > HZ)
364 		WRITE_ONCE(flow->timeout, timeout);
365 	else
366 		return;
367 
368 	if (likely(!nf_flowtable_hw_offload(flow_table)) ||
369 	    test_bit(NF_FLOW_CLOSING, &flow->flags))
370 		return;
371 
372 	nf_flow_offload_add(flow_table, flow);
373 }
374 EXPORT_SYMBOL_GPL(flow_offload_refresh);
375 
376 static void flow_offload_del(struct nf_flowtable *flow_table,
377 			     struct flow_offload *flow)
378 {
379 	rhashtable_remove_fast(&flow_table->rhashtable,
380 			       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
381 			       nf_flow_offload_rhash_params);
382 	rhashtable_remove_fast(&flow_table->rhashtable,
383 			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
384 			       nf_flow_offload_rhash_params);
385 	flow_offload_free(flow);
386 }
387 
388 void flow_offload_teardown(struct flow_offload *flow)
389 {
390 	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
391 	if (!test_and_set_bit(NF_FLOW_TEARDOWN, &flow->flags))
392 		flow_offload_fixup_ct(flow);
393 }
394 EXPORT_SYMBOL_GPL(flow_offload_teardown);
395 
396 struct flow_offload_tuple_rhash *
397 flow_offload_lookup(struct nf_flowtable *flow_table,
398 		    struct flow_offload_tuple *tuple)
399 {
400 	struct flow_offload_tuple_rhash *tuplehash;
401 	struct flow_offload *flow;
402 	int dir;
403 
404 	tuplehash = rhashtable_lookup(&flow_table->rhashtable, tuple,
405 				      nf_flow_offload_rhash_params);
406 	if (!tuplehash)
407 		return NULL;
408 
409 	dir = tuplehash->tuple.dir;
410 	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
411 	if (test_bit(NF_FLOW_TEARDOWN, &flow->flags))
412 		return NULL;
413 
414 	if (unlikely(nf_ct_is_dying(flow->ct)))
415 		return NULL;
416 
417 	return tuplehash;
418 }
419 EXPORT_SYMBOL_GPL(flow_offload_lookup);
420 
421 static int
422 nf_flow_table_iterate(struct nf_flowtable *flow_table,
423 		      void (*iter)(struct nf_flowtable *flowtable,
424 				   struct flow_offload *flow, void *data),
425 		      void *data)
426 {
427 	struct flow_offload_tuple_rhash *tuplehash;
428 	struct rhashtable_iter hti;
429 	struct flow_offload *flow;
430 	int err = 0;
431 
432 	rhashtable_walk_enter(&flow_table->rhashtable, &hti);
433 	rhashtable_walk_start(&hti);
434 
435 	while ((tuplehash = rhashtable_walk_next(&hti))) {
436 		if (IS_ERR(tuplehash)) {
437 			if (PTR_ERR(tuplehash) != -EAGAIN) {
438 				err = PTR_ERR(tuplehash);
439 				break;
440 			}
441 			continue;
442 		}
443 		if (tuplehash->tuple.dir)
444 			continue;
445 
446 		flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
447 
448 		iter(flow_table, flow, data);
449 	}
450 	rhashtable_walk_stop(&hti);
451 	rhashtable_walk_exit(&hti);
452 
453 	return err;
454 }
455 
456 static bool nf_flow_custom_gc(struct nf_flowtable *flow_table,
457 			      const struct flow_offload *flow)
458 {
459 	return flow_table->type->gc && flow_table->type->gc(flow);
460 }
461 
462 /**
463  * nf_flow_table_tcp_timeout() - new timeout of offloaded tcp entry
464  * @ct:		Flowtable offloaded tcp ct
465  *
466  * Return: number of seconds when ct entry should expire.
467  */
468 static u32 nf_flow_table_tcp_timeout(const struct nf_conn *ct)
469 {
470 	u8 state = READ_ONCE(ct->proto.tcp.state);
471 
472 	switch (state) {
473 	case TCP_CONNTRACK_SYN_SENT:
474 	case TCP_CONNTRACK_SYN_RECV:
475 		return 0;
476 	case TCP_CONNTRACK_ESTABLISHED:
477 		return NF_CT_DAY;
478 	case TCP_CONNTRACK_FIN_WAIT:
479 	case TCP_CONNTRACK_CLOSE_WAIT:
480 	case TCP_CONNTRACK_LAST_ACK:
481 	case TCP_CONNTRACK_TIME_WAIT:
482 		return 5 * 60 * HZ;
483 	case TCP_CONNTRACK_CLOSE:
484 		return 0;
485 	}
486 
487 	return 0;
488 }
489 
490 /**
491  * nf_flow_table_extend_ct_timeout() - Extend ct timeout of offloaded conntrack entry
492  * @ct:		Flowtable offloaded ct
493  *
494  * Datapath lookups in the conntrack table will evict nf_conn entries
495  * if they have expired.
496  *
497  * Once nf_conn entries have been offloaded, nf_conntrack might not see any
498  * packets anymore.  Thus ct->timeout is no longer refreshed and ct can
499  * be evicted.
500  *
501  * To avoid the need for an additional check on the offload bit for every
502  * packet processed via nf_conntrack_in(), set an arbitrary timeout large
503  * enough not to ever expire, this save us a check for the IPS_OFFLOAD_BIT
504  * from the packet path via nf_ct_is_expired().
505  */
506 static void nf_flow_table_extend_ct_timeout(struct nf_conn *ct)
507 {
508 	static const s32 min_timeout = 5 * 60 * HZ;
509 	u32 ct_timeout = READ_ONCE(ct->timeout);
510 	s32 expires;
511 
512 	expires = ct_timeout - nfct_time_stamp;
513 	if (expires <= 0) /* already expired */
514 		return;
515 
516 	/* normal case: large enough timeout, nothing to do. */
517 	if (likely(expires >= min_timeout))
518 		return;
519 
520 	/* must check offload bit after this, we do not hold any locks.
521 	 * flowtable and ct entries could have been removed on another CPU.
522 	 */
523 	if (!refcount_inc_not_zero(&ct->ct_general.use))
524 		return;
525 
526 	/* load ct->status after refcount increase */
527 	smp_acquire__after_ctrl_dep();
528 
529 	if (nf_ct_is_confirmed(ct) &&
530 	    test_bit(IPS_OFFLOAD_BIT, &ct->status)) {
531 		u8 l4proto = nf_ct_protonum(ct);
532 		u32 new_timeout = 1;
533 
534 		switch (l4proto) {
535 		case IPPROTO_UDP:
536 			new_timeout = NF_CT_DAY;
537 			break;
538 		case IPPROTO_TCP:
539 			new_timeout = nf_flow_table_tcp_timeout(ct);
540 			break;
541 		default:
542 			WARN_ON_ONCE(1);
543 			break;
544 		}
545 
546 		/* Update to ct->timeout from nf_conntrack happens
547 		 * without holding ct->lock.
548 		 *
549 		 * Use cmpxchg to ensure timeout extension doesn't
550 		 * happen when we race with conntrack datapath.
551 		 *
552 		 * The inverse -- datapath updating ->timeout right
553 		 * after this -- is fine, datapath is authoritative.
554 		 */
555 		if (new_timeout) {
556 			new_timeout += nfct_time_stamp;
557 			cmpxchg(&ct->timeout, ct_timeout, new_timeout);
558 		}
559 	}
560 
561 	nf_ct_put(ct);
562 }
563 
564 static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table,
565 				    struct flow_offload *flow, void *data)
566 {
567 	bool teardown = test_bit(NF_FLOW_TEARDOWN, &flow->flags);
568 
569 	if (nf_flow_has_expired(flow) ||
570 	    nf_ct_is_dying(flow->ct) ||
571 	    nf_flow_custom_gc(flow_table, flow)) {
572 		flow_offload_teardown(flow);
573 		teardown = true;
574 	} else if (!teardown) {
575 		nf_flow_table_extend_ct_timeout(flow->ct);
576 	}
577 
578 	if (teardown) {
579 		if (test_bit(NF_FLOW_HW, &flow->flags)) {
580 			if (!test_bit(NF_FLOW_HW_DYING, &flow->flags))
581 				nf_flow_offload_del(flow_table, flow);
582 			else if (test_bit(NF_FLOW_HW_DEAD, &flow->flags))
583 				flow_offload_del(flow_table, flow);
584 		} else {
585 			flow_offload_del(flow_table, flow);
586 		}
587 	} else if (test_bit(NF_FLOW_CLOSING, &flow->flags) &&
588 		   test_bit(NF_FLOW_HW, &flow->flags) &&
589 		   !test_bit(NF_FLOW_HW_DYING, &flow->flags)) {
590 		nf_flow_offload_del(flow_table, flow);
591 	} else if (test_bit(NF_FLOW_HW, &flow->flags)) {
592 		nf_flow_offload_stats(flow_table, flow);
593 	}
594 }
595 
596 void nf_flow_table_gc_run(struct nf_flowtable *flow_table)
597 {
598 	nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, NULL);
599 }
600 
601 static void nf_flow_offload_work_gc(struct work_struct *work)
602 {
603 	struct nf_flowtable *flow_table;
604 
605 	flow_table = container_of(work, struct nf_flowtable, gc_work.work);
606 	nf_flow_table_gc_run(flow_table);
607 	queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
608 }
609 
610 static void nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
611 				 __be16 port, __be16 new_port)
612 {
613 	struct tcphdr *tcph;
614 
615 	tcph = (void *)(skb_network_header(skb) + thoff);
616 	inet_proto_csum_replace2(&tcph->check, skb, port, new_port, false);
617 }
618 
619 static void nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
620 				 __be16 port, __be16 new_port)
621 {
622 	struct udphdr *udph;
623 
624 	udph = (void *)(skb_network_header(skb) + thoff);
625 	if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
626 		inet_proto_csum_replace2(&udph->check, skb, port,
627 					 new_port, false);
628 		if (!udph->check)
629 			udph->check = CSUM_MANGLED_0;
630 	}
631 }
632 
633 static void nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
634 			     u8 protocol, __be16 port, __be16 new_port)
635 {
636 	switch (protocol) {
637 	case IPPROTO_TCP:
638 		nf_flow_nat_port_tcp(skb, thoff, port, new_port);
639 		break;
640 	case IPPROTO_UDP:
641 		nf_flow_nat_port_udp(skb, thoff, port, new_port);
642 		break;
643 	}
644 }
645 
646 void nf_flow_snat_port(const struct flow_offload *flow,
647 		       struct sk_buff *skb, unsigned int thoff,
648 		       u8 protocol, enum flow_offload_tuple_dir dir)
649 {
650 	struct flow_ports *hdr;
651 	__be16 port, new_port;
652 
653 	hdr = (void *)(skb_network_header(skb) + thoff);
654 
655 	switch (dir) {
656 	case FLOW_OFFLOAD_DIR_ORIGINAL:
657 		port = hdr->source;
658 		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port;
659 		hdr->source = new_port;
660 		break;
661 	case FLOW_OFFLOAD_DIR_REPLY:
662 		port = hdr->dest;
663 		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port;
664 		hdr->dest = new_port;
665 		break;
666 	}
667 
668 	nf_flow_nat_port(skb, thoff, protocol, port, new_port);
669 }
670 EXPORT_SYMBOL_GPL(nf_flow_snat_port);
671 
672 void nf_flow_dnat_port(const struct flow_offload *flow, struct sk_buff *skb,
673 		       unsigned int thoff, u8 protocol,
674 		       enum flow_offload_tuple_dir dir)
675 {
676 	struct flow_ports *hdr;
677 	__be16 port, new_port;
678 
679 	hdr = (void *)(skb_network_header(skb) + thoff);
680 
681 	switch (dir) {
682 	case FLOW_OFFLOAD_DIR_ORIGINAL:
683 		port = hdr->dest;
684 		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port;
685 		hdr->dest = new_port;
686 		break;
687 	case FLOW_OFFLOAD_DIR_REPLY:
688 		port = hdr->source;
689 		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port;
690 		hdr->source = new_port;
691 		break;
692 	}
693 
694 	nf_flow_nat_port(skb, thoff, protocol, port, new_port);
695 }
696 EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
697 
698 int nf_flow_table_init(struct nf_flowtable *flowtable)
699 {
700 	int err;
701 
702 	INIT_DELAYED_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
703 	flow_block_init(&flowtable->flow_block);
704 	init_rwsem(&flowtable->flow_block_lock);
705 
706 	err = rhashtable_init(&flowtable->rhashtable,
707 			      &nf_flow_offload_rhash_params);
708 	if (err < 0)
709 		return err;
710 
711 	queue_delayed_work(system_power_efficient_wq,
712 			   &flowtable->gc_work, HZ);
713 
714 	mutex_lock(&flowtable_lock);
715 	list_add(&flowtable->list, &flowtables);
716 	mutex_unlock(&flowtable_lock);
717 
718 	return 0;
719 }
720 EXPORT_SYMBOL_GPL(nf_flow_table_init);
721 
722 static void nf_flow_table_do_cleanup(struct nf_flowtable *flow_table,
723 				     struct flow_offload *flow, void *data)
724 {
725 	struct net_device *dev = data;
726 
727 	if (!dev) {
728 		flow_offload_teardown(flow);
729 		return;
730 	}
731 
732 	if (net_eq(nf_ct_net(flow->ct), dev_net(dev)) &&
733 	    (flow->tuplehash[0].tuple.iifidx == dev->ifindex ||
734 	     flow->tuplehash[1].tuple.iifidx == dev->ifindex))
735 		flow_offload_teardown(flow);
736 }
737 
738 void nf_flow_table_gc_cleanup(struct nf_flowtable *flowtable,
739 			      struct net_device *dev)
740 {
741 	nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev);
742 	flush_delayed_work(&flowtable->gc_work);
743 	nf_flow_table_offload_flush(flowtable);
744 }
745 
746 void nf_flow_table_cleanup(struct net_device *dev)
747 {
748 	struct nf_flowtable *flowtable;
749 
750 	mutex_lock(&flowtable_lock);
751 	list_for_each_entry(flowtable, &flowtables, list)
752 		nf_flow_table_gc_cleanup(flowtable, dev);
753 	mutex_unlock(&flowtable_lock);
754 }
755 EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);
756 
757 void nf_flow_table_free(struct nf_flowtable *flow_table)
758 {
759 	mutex_lock(&flowtable_lock);
760 	list_del(&flow_table->list);
761 	mutex_unlock(&flowtable_lock);
762 
763 	cancel_delayed_work_sync(&flow_table->gc_work);
764 	nf_flow_table_offload_flush(flow_table);
765 	/* ... no more pending work after this stage ... */
766 	nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
767 	nf_flow_table_gc_run(flow_table);
768 	nf_flow_table_offload_flush_cleanup(flow_table);
769 	rhashtable_destroy(&flow_table->rhashtable);
770 }
771 EXPORT_SYMBOL_GPL(nf_flow_table_free);
772 
773 static int nf_flow_table_init_net(struct net *net)
774 {
775 	net->ft.stat = alloc_percpu(struct nf_flow_table_stat);
776 	return net->ft.stat ? 0 : -ENOMEM;
777 }
778 
779 static void nf_flow_table_fini_net(struct net *net)
780 {
781 	free_percpu(net->ft.stat);
782 }
783 
/*
 * Per-netns init: allocate the statistics, then set up the proc side.
 * The statistics are freed again if the proc setup fails.
 *
 * Return: 0 on success, or the negative error of the failing step.
 */
static int nf_flow_table_pernet_init(struct net *net)
{
	int err;

	err = nf_flow_table_init_net(net);
	if (err < 0)
		return err;

	err = nf_flow_table_init_proc(net);
	if (err < 0) {
		nf_flow_table_fini_net(net);
		return err;
	}

	return 0;
}
802 
803 static void nf_flow_table_pernet_exit(struct list_head *net_exit_list)
804 {
805 	struct net *net;
806 
807 	list_for_each_entry(net, net_exit_list, exit_list) {
808 		nf_flow_table_fini_proc(net);
809 		nf_flow_table_fini_net(net);
810 	}
811 }
812 
813 static struct pernet_operations nf_flow_table_net_ops = {
814 	.init = nf_flow_table_pernet_init,
815 	.exit_batch = nf_flow_table_pernet_exit,
816 };
817 
818 static int __init nf_flow_table_module_init(void)
819 {
820 	int ret;
821 
822 	flow_offload_cachep = KMEM_CACHE(flow_offload, SLAB_HWCACHE_ALIGN);
823 	if (!flow_offload_cachep)
824 		return -ENOMEM;
825 
826 	ret = register_pernet_subsys(&nf_flow_table_net_ops);
827 	if (ret < 0)
828 		goto out_pernet;
829 
830 	ret = nf_flow_table_offload_init();
831 	if (ret)
832 		goto out_offload;
833 
834 	ret = nf_flow_register_bpf();
835 	if (ret)
836 		goto out_bpf;
837 
838 	return 0;
839 
840 out_bpf:
841 	nf_flow_table_offload_exit();
842 out_offload:
843 	unregister_pernet_subsys(&nf_flow_table_net_ops);
844 out_pernet:
845 	kmem_cache_destroy(flow_offload_cachep);
846 	return ret;
847 }
848 
849 static void __exit nf_flow_table_module_exit(void)
850 {
851 	nf_flow_table_offload_exit();
852 	unregister_pernet_subsys(&nf_flow_table_net_ops);
853 	kmem_cache_destroy(flow_offload_cachep);
854 }
855 
856 module_init(nf_flow_table_module_init);
857 module_exit(nf_flow_table_module_exit);
858 
859 MODULE_LICENSE("GPL");
860 MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
861 MODULE_DESCRIPTION("Netfilter flow table module");
862