xref: /linux/net/netfilter/nf_flow_table_core.c (revision 37a93dd5c49b5fda807fd204edf2547c3493319c)
1 // SPDX-License-Identifier: GPL-2.0-only
2 #include <linux/kernel.h>
3 #include <linux/init.h>
4 #include <linux/module.h>
5 #include <linux/netfilter.h>
6 #include <linux/rhashtable.h>
7 #include <linux/netdevice.h>
8 #include <net/ip.h>
9 #include <net/ip6_route.h>
10 #include <net/netfilter/nf_tables.h>
11 #include <net/netfilter/nf_flow_table.h>
12 #include <net/netfilter/nf_conntrack.h>
13 #include <net/netfilter/nf_conntrack_core.h>
14 #include <net/netfilter/nf_conntrack_l4proto.h>
15 #include <net/netfilter/nf_conntrack_tuple.h>
16 
/* Serializes add/remove on the global 'flowtables' list below. */
static DEFINE_MUTEX(flowtable_lock);
/* All registered flowtables; walked for netdevice cleanup. */
static LIST_HEAD(flowtables);
/* Slab cache for struct flow_offload entries. */
static __read_mostly struct kmem_cache *flow_offload_cachep;
20 
21 static void
22 flow_offload_fill_dir(struct flow_offload *flow,
23 		      enum flow_offload_tuple_dir dir)
24 {
25 	struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple;
26 	struct nf_conntrack_tuple *ctt = &flow->ct->tuplehash[dir].tuple;
27 
28 	ft->dir = dir;
29 
30 	switch (ctt->src.l3num) {
31 	case NFPROTO_IPV4:
32 		ft->src_v4 = ctt->src.u3.in;
33 		ft->dst_v4 = ctt->dst.u3.in;
34 		break;
35 	case NFPROTO_IPV6:
36 		ft->src_v6 = ctt->src.u3.in6;
37 		ft->dst_v6 = ctt->dst.u3.in6;
38 		break;
39 	}
40 
41 	ft->l3proto = ctt->src.l3num;
42 	ft->l4proto = ctt->dst.protonum;
43 
44 	switch (ctt->dst.protonum) {
45 	case IPPROTO_TCP:
46 	case IPPROTO_UDP:
47 		ft->src_port = ctt->src.u.tcp.port;
48 		ft->dst_port = ctt->dst.u.tcp.port;
49 		break;
50 	}
51 }
52 
53 struct flow_offload *flow_offload_alloc(struct nf_conn *ct)
54 {
55 	struct flow_offload *flow;
56 
57 	if (unlikely(nf_ct_is_dying(ct)))
58 		return NULL;
59 
60 	flow = kmem_cache_zalloc(flow_offload_cachep, GFP_ATOMIC);
61 	if (!flow)
62 		return NULL;
63 
64 	refcount_inc(&ct->ct_general.use);
65 	flow->ct = ct;
66 
67 	flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
68 	flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_REPLY);
69 
70 	if (ct->status & IPS_SRC_NAT)
71 		__set_bit(NF_FLOW_SNAT, &flow->flags);
72 	if (ct->status & IPS_DST_NAT)
73 		__set_bit(NF_FLOW_DNAT, &flow->flags);
74 
75 	return flow;
76 }
77 EXPORT_SYMBOL_GPL(flow_offload_alloc);
78 
79 static u32 flow_offload_dst_cookie(struct flow_offload_tuple *flow_tuple)
80 {
81 	if (flow_tuple->l3proto == NFPROTO_IPV6)
82 		return rt6_get_cookie(dst_rt6_info(flow_tuple->dst_cache));
83 
84 	return 0;
85 }
86 
87 static struct dst_entry *nft_route_dst_fetch(struct nf_flow_route *route,
88 					     enum flow_offload_tuple_dir dir)
89 {
90 	struct dst_entry *dst = route->tuple[dir].dst;
91 
92 	route->tuple[dir].dst = NULL;
93 
94 	return dst;
95 }
96 
/* Fill in routing information for @dir of @flow, consuming the dst
 * reference held by @route for that direction.
 *
 * DIRECT xmit copies the ethernet addresses and releases the dst here;
 * NEIGH/XFRM xmit caches the dst (and its IPv6 cookie) in the tuple,
 * released later via nft_flow_dst_release().
 */
static int flow_offload_fill_route(struct flow_offload *flow,
				   struct nf_flow_route *route,
				   enum flow_offload_tuple_dir dir)
{
	struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple;
	struct dst_entry *dst = nft_route_dst_fetch(route, dir);
	int i, j = 0;

	/* Cache the path MTU for this direction. */
	switch (flow_tuple->l3proto) {
	case NFPROTO_IPV4:
		flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true);
		break;
	case NFPROTO_IPV6:
		flow_tuple->mtu = ip6_dst_mtu_maybe_forward(dst, true);
		break;
	}

	flow_tuple->iifidx = route->tuple[dir].in.ifindex;
	/* Copy encapsulation headers into the tuple in reverse order,
	 * tracking which of them are ingress vlan tags.
	 */
	for (i = route->tuple[dir].in.num_encaps - 1; i >= 0; i--) {
		flow_tuple->encap[j].id = route->tuple[dir].in.encap[i].id;
		flow_tuple->encap[j].proto = route->tuple[dir].in.encap[i].proto;
		if (route->tuple[dir].in.ingress_vlans & BIT(i))
			flow_tuple->in_vlan_ingress |= BIT(j);
		j++;
	}

	flow_tuple->tun = route->tuple[dir].in.tun;
	flow_tuple->encap_num = route->tuple[dir].in.num_encaps;
	flow_tuple->tun_num = route->tuple[dir].in.num_tuns;

	switch (route->tuple[dir].xmit_type) {
	case FLOW_OFFLOAD_XMIT_DIRECT:
		memcpy(flow_tuple->out.h_dest, route->tuple[dir].out.h_dest,
		       ETH_ALEN);
		memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source,
		       ETH_ALEN);
		flow_tuple->out.ifidx = route->tuple[dir].out.ifindex;
		/* dst was only needed for the MTU lookup above. */
		dst_release(dst);
		break;
	case FLOW_OFFLOAD_XMIT_XFRM:
	case FLOW_OFFLOAD_XMIT_NEIGH:
		flow_tuple->ifidx = route->tuple[dir].out.ifindex;
		/* The tuple takes over the dst reference. */
		flow_tuple->dst_cache = dst;
		flow_tuple->dst_cookie = flow_offload_dst_cookie(flow_tuple);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}
	flow_tuple->xmit_type = route->tuple[dir].xmit_type;

	return 0;
}
150 
151 static void nft_flow_dst_release(struct flow_offload *flow,
152 				 enum flow_offload_tuple_dir dir)
153 {
154 	if (flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH ||
155 	    flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)
156 		dst_release(flow->tuplehash[dir].tuple.dst_cache);
157 }
158 
159 void flow_offload_route_init(struct flow_offload *flow,
160 			     struct nf_flow_route *route)
161 {
162 	flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL);
163 	flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY);
164 	flow->type = NF_FLOW_OFFLOAD_ROUTE;
165 }
166 EXPORT_SYMBOL_GPL(flow_offload_route_init);
167 
168 static inline bool nf_flow_has_expired(const struct flow_offload *flow)
169 {
170 	return nf_flow_timeout_delta(flow->timeout) <= 0;
171 }
172 
173 static void flow_offload_fixup_tcp(struct nf_conn *ct, u8 tcp_state)
174 {
175 	struct ip_ct_tcp *tcp = &ct->proto.tcp;
176 
177 	spin_lock_bh(&ct->lock);
178 	if (tcp->state != tcp_state)
179 		tcp->state = tcp_state;
180 
181 	/* syn packet triggers the TCP reopen case from conntrack. */
182 	if (tcp->state == TCP_CONNTRACK_CLOSE)
183 		ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
184 
185 	/* Conntrack state is outdated due to offload bypass.
186 	 * Clear IP_CT_TCP_FLAG_MAXACK_SET, otherwise conntracks
187 	 * TCP reset validation will fail.
188 	 */
189 	tcp->seen[0].td_maxwin = 0;
190 	tcp->seen[0].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET;
191 	tcp->seen[1].td_maxwin = 0;
192 	tcp->seen[1].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET;
193 	spin_unlock_bh(&ct->lock);
194 }
195 
/* Hand the connection back to classic conntrack: resync the conntrack
 * TCP state and recompute ct->timeout, which went stale while packets
 * bypassed conntrack via the flowtable.
 */
static void flow_offload_fixup_ct(struct flow_offload *flow)
{
	struct nf_conn *ct = flow->ct;
	struct net *net = nf_ct_net(ct);
	int l4num = nf_ct_protonum(ct);
	bool expired, closing = false;
	u32 offload_timeout = 0;
	s32 timeout;

	if (l4num == IPPROTO_TCP) {
		const struct nf_tcp_net *tn = nf_tcp_pernet(net);
		u8 tcp_state;

		/* Enter CLOSE state if fin/rst packet has been seen, this
		 * allows TCP reopen from conntrack. Otherwise, pick up from
		 * the last seen TCP state.
		 */
		closing = test_bit(NF_FLOW_CLOSING, &flow->flags);
		if (closing) {
			flow_offload_fixup_tcp(ct, TCP_CONNTRACK_CLOSE);
			timeout = READ_ONCE(tn->timeouts[TCP_CONNTRACK_CLOSE]);
			expired = false;
		} else {
			tcp_state = READ_ONCE(ct->proto.tcp.state);
			flow_offload_fixup_tcp(ct, tcp_state);
			timeout = READ_ONCE(tn->timeouts[tcp_state]);
			expired = nf_flow_has_expired(flow);
		}
		offload_timeout = READ_ONCE(tn->offload_timeout);

	} else if (l4num == IPPROTO_UDP) {
		const struct nf_udp_net *tn = nf_udp_pernet(net);
		enum udp_conntrack state =
			test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ?
			UDP_CT_REPLIED : UDP_CT_UNREPLIED;

		timeout = READ_ONCE(tn->timeouts[state]);
		expired = nf_flow_has_expired(flow);
		offload_timeout = READ_ONCE(tn->offload_timeout);
	} else {
		/* Only TCP and UDP flows are offloaded; nothing to fix up. */
		return;
	}

	/* The flow already expired while offloaded: do not grant the full
	 * protocol timeout, discount the time spent in offload state.
	 */
	if (expired)
		timeout -= offload_timeout;

	if (timeout < 0)
		timeout = 0;

	/* ct->timeout was inflated while offloaded; only refresh when that
	 * shrinks the remaining lifetime back to the protocol timeout, or
	 * unconditionally when the connection is closing.
	 */
	if (closing ||
	    nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout)
		nf_ct_refresh(ct, timeout);
}
249 
250 static void flow_offload_route_release(struct flow_offload *flow)
251 {
252 	nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
253 	nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_REPLY);
254 }
255 
256 void flow_offload_free(struct flow_offload *flow)
257 {
258 	switch (flow->type) {
259 	case NF_FLOW_OFFLOAD_ROUTE:
260 		flow_offload_route_release(flow);
261 		break;
262 	default:
263 		break;
264 	}
265 	nf_ct_put(flow->ct);
266 	kfree_rcu(flow, rcu_head);
267 }
268 EXPORT_SYMBOL_GPL(flow_offload_free);
269 
270 static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
271 {
272 	const struct flow_offload_tuple *tuple = data;
273 
274 	return jhash(tuple, offsetof(struct flow_offload_tuple, __hash), seed);
275 }
276 
277 static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
278 {
279 	const struct flow_offload_tuple_rhash *tuplehash = data;
280 
281 	return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, __hash), seed);
282 }
283 
284 static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
285 					const void *ptr)
286 {
287 	const struct flow_offload_tuple *tuple = arg->key;
288 	const struct flow_offload_tuple_rhash *x = ptr;
289 
290 	if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, __hash)))
291 		return 1;
292 
293 	return 0;
294 }
295 
/* Both direction tuples of a flow are hashed into the same table; keys
 * are the tuple bytes preceding the __hash marker field.
 */
static const struct rhashtable_params nf_flow_offload_rhash_params = {
	.head_offset		= offsetof(struct flow_offload_tuple_rhash, node),
	.hashfn			= flow_offload_hash,
	.obj_hashfn		= flow_offload_hash_obj,
	.obj_cmpfn		= flow_offload_hash_cmp,
	.automatic_shrinking	= true,
};
303 
304 unsigned long flow_offload_get_timeout(struct flow_offload *flow)
305 {
306 	unsigned long timeout = NF_FLOW_TIMEOUT;
307 	struct net *net = nf_ct_net(flow->ct);
308 	int l4num = nf_ct_protonum(flow->ct);
309 
310 	if (l4num == IPPROTO_TCP) {
311 		struct nf_tcp_net *tn = nf_tcp_pernet(net);
312 
313 		timeout = tn->offload_timeout;
314 	} else if (l4num == IPPROTO_UDP) {
315 		struct nf_udp_net *tn = nf_udp_pernet(net);
316 
317 		timeout = tn->offload_timeout;
318 	}
319 
320 	return timeout;
321 }
322 
/* Insert @flow into @flow_table (both direction tuples) and, if the
 * table supports hardware offload, push it to hardware as well.
 *
 * Return: 0 on success or a negative rhashtable insertion error.
 */
int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
{
	int err;

	flow->timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow);

	err = rhashtable_insert_fast(&flow_table->rhashtable,
				     &flow->tuplehash[0].node,
				     nf_flow_offload_rhash_params);
	if (err < 0)
		return err;

	err = rhashtable_insert_fast(&flow_table->rhashtable,
				     &flow->tuplehash[1].node,
				     nf_flow_offload_rhash_params);
	if (err < 0) {
		/* Roll back the original-direction insertion. */
		rhashtable_remove_fast(&flow_table->rhashtable,
				       &flow->tuplehash[0].node,
				       nf_flow_offload_rhash_params);
		return err;
	}

	/* Inflate the conntrack timeout so the ct entry cannot expire
	 * while packets bypass conntrack via the flowtable.
	 */
	nf_ct_refresh(flow->ct, NF_CT_DAY);

	if (nf_flowtable_hw_offload(flow_table)) {
		__set_bit(NF_FLOW_HW, &flow->flags);
		nf_flow_offload_add(flow_table, flow);
	}

	return 0;
}
EXPORT_SYMBOL_GPL(flow_offload_add);
355 
356 void flow_offload_refresh(struct nf_flowtable *flow_table,
357 			  struct flow_offload *flow, bool force)
358 {
359 	u32 timeout;
360 
361 	timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow);
362 	if (force || timeout - READ_ONCE(flow->timeout) > HZ)
363 		WRITE_ONCE(flow->timeout, timeout);
364 	else
365 		return;
366 
367 	if (likely(!nf_flowtable_hw_offload(flow_table)) ||
368 	    test_bit(NF_FLOW_CLOSING, &flow->flags))
369 		return;
370 
371 	nf_flow_offload_add(flow_table, flow);
372 }
373 EXPORT_SYMBOL_GPL(flow_offload_refresh);
374 
375 static void flow_offload_del(struct nf_flowtable *flow_table,
376 			     struct flow_offload *flow)
377 {
378 	rhashtable_remove_fast(&flow_table->rhashtable,
379 			       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
380 			       nf_flow_offload_rhash_params);
381 	rhashtable_remove_fast(&flow_table->rhashtable,
382 			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
383 			       nf_flow_offload_rhash_params);
384 	flow_offload_free(flow);
385 }
386 
/* Mark @flow for removal: clear the conntrack offload bit so conntrack
 * resumes handling the connection, then fix up conntrack state/timeout
 * exactly once, on the first teardown request for this flow.
 */
void flow_offload_teardown(struct flow_offload *flow)
{
	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
	if (!test_and_set_bit(NF_FLOW_TEARDOWN, &flow->flags))
		flow_offload_fixup_ct(flow);
}
EXPORT_SYMBOL_GPL(flow_offload_teardown);
394 
395 struct flow_offload_tuple_rhash *
396 flow_offload_lookup(struct nf_flowtable *flow_table,
397 		    struct flow_offload_tuple *tuple)
398 {
399 	struct flow_offload_tuple_rhash *tuplehash;
400 	struct flow_offload *flow;
401 	int dir;
402 
403 	tuplehash = rhashtable_lookup(&flow_table->rhashtable, tuple,
404 				      nf_flow_offload_rhash_params);
405 	if (!tuplehash)
406 		return NULL;
407 
408 	dir = tuplehash->tuple.dir;
409 	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
410 	if (test_bit(NF_FLOW_TEARDOWN, &flow->flags))
411 		return NULL;
412 
413 	if (unlikely(nf_ct_is_dying(flow->ct)))
414 		return NULL;
415 
416 	return tuplehash;
417 }
418 EXPORT_SYMBOL_GPL(flow_offload_lookup);
419 
/* Walk every flow in @flow_table and invoke @iter on it.
 *
 * Each flow is hashed twice (once per direction); only the original
 * direction entry (dir == 0) is visited so @iter sees each flow once.
 * -EAGAIN from the walker (concurrent resize) is skipped and the walk
 * continues; any other walker error aborts and is returned.
 */
static int
nf_flow_table_iterate(struct nf_flowtable *flow_table,
		      void (*iter)(struct nf_flowtable *flowtable,
				   struct flow_offload *flow, void *data),
		      void *data)
{
	struct flow_offload_tuple_rhash *tuplehash;
	struct rhashtable_iter hti;
	struct flow_offload *flow;
	int err = 0;

	rhashtable_walk_enter(&flow_table->rhashtable, &hti);
	rhashtable_walk_start(&hti);

	while ((tuplehash = rhashtable_walk_next(&hti))) {
		if (IS_ERR(tuplehash)) {
			if (PTR_ERR(tuplehash) != -EAGAIN) {
				err = PTR_ERR(tuplehash);
				break;
			}
			continue;
		}
		/* Skip the reply-direction entry of each flow. */
		if (tuplehash->tuple.dir)
			continue;

		flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);

		iter(flow_table, flow, data);
	}
	rhashtable_walk_stop(&hti);
	rhashtable_walk_exit(&hti);

	return err;
}
454 
455 static bool nf_flow_custom_gc(struct nf_flowtable *flow_table,
456 			      const struct flow_offload *flow)
457 {
458 	return flow_table->type->gc && flow_table->type->gc(flow);
459 }
460 
461 /**
462  * nf_flow_table_tcp_timeout() - new timeout of offloaded tcp entry
463  * @ct:		Flowtable offloaded tcp ct
464  *
465  * Return: number of seconds when ct entry should expire.
466  */
467 static u32 nf_flow_table_tcp_timeout(const struct nf_conn *ct)
468 {
469 	u8 state = READ_ONCE(ct->proto.tcp.state);
470 
471 	switch (state) {
472 	case TCP_CONNTRACK_SYN_SENT:
473 	case TCP_CONNTRACK_SYN_RECV:
474 		return 0;
475 	case TCP_CONNTRACK_ESTABLISHED:
476 		return NF_CT_DAY;
477 	case TCP_CONNTRACK_FIN_WAIT:
478 	case TCP_CONNTRACK_CLOSE_WAIT:
479 	case TCP_CONNTRACK_LAST_ACK:
480 	case TCP_CONNTRACK_TIME_WAIT:
481 		return 5 * 60 * HZ;
482 	case TCP_CONNTRACK_CLOSE:
483 		return 0;
484 	}
485 
486 	return 0;
487 }
488 
/**
 * nf_flow_table_extend_ct_timeout() - Extend ct timeout of offloaded conntrack entry
 * @ct:		Flowtable offloaded ct
 *
 * Datapath lookups in the conntrack table will evict nf_conn entries
 * if they have expired.
 *
 * Once nf_conn entries have been offloaded, nf_conntrack might not see any
 * packets anymore.  Thus ct->timeout is no longer refreshed and ct can
 * be evicted.
 *
 * To avoid the need for an additional check on the offload bit for every
 * packet processed via nf_conntrack_in(), set an arbitrary timeout large
 * enough not to ever expire, this save us a check for the IPS_OFFLOAD_BIT
 * from the packet path via nf_ct_is_expired().
 */
static void nf_flow_table_extend_ct_timeout(struct nf_conn *ct)
{
	static const u32 min_timeout = 5 * 60 * HZ;
	u32 expires = nf_ct_expires(ct);

	/* normal case: large enough timeout, nothing to do. */
	if (likely(expires >= min_timeout))
		return;

	/* must check offload bit after this, we do not hold any locks.
	 * flowtable and ct entries could have been removed on another CPU.
	 */
	if (!refcount_inc_not_zero(&ct->ct_general.use))
		return;

	/* load ct->status after refcount increase */
	smp_acquire__after_ctrl_dep();

	if (nf_ct_is_confirmed(ct) &&
	    test_bit(IPS_OFFLOAD_BIT, &ct->status)) {
		u8 l4proto = nf_ct_protonum(ct);
		/* NOTE(review): initialized to 'true' (i.e. 1 jiffy), so the
		 * WARN_ON_ONCE default branch below still takes the cmpxchg
		 * path with a near-immediate expiry — confirm this fallback
		 * is intentional; 0 would skip the update entirely.
		 */
		u32 new_timeout = true;

		switch (l4proto) {
		case IPPROTO_UDP:
			new_timeout = NF_CT_DAY;
			break;
		case IPPROTO_TCP:
			new_timeout = nf_flow_table_tcp_timeout(ct);
			break;
		default:
			WARN_ON_ONCE(1);
			break;
		}

		/* Update to ct->timeout from nf_conntrack happens
		 * without holding ct->lock.
		 *
		 * Use cmpxchg to ensure timeout extension doesn't
		 * happen when we race with conntrack datapath.
		 *
		 * The inverse -- datapath updating ->timeout right
		 * after this -- is fine, datapath is authoritative.
		 */
		if (new_timeout) {
			new_timeout += nfct_time_stamp;
			cmpxchg(&ct->timeout, expires, new_timeout);
		}
	}

	nf_ct_put(ct);
}
557 
/* Per-flow gc pass: decide whether @flow must be torn down, removed
 * from hardware, kept alive (conntrack timeout extension), or have its
 * hardware stats collected.
 */
static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table,
				    struct flow_offload *flow, void *data)
{
	bool teardown = test_bit(NF_FLOW_TEARDOWN, &flow->flags);

	if (nf_flow_has_expired(flow) ||
	    nf_ct_is_dying(flow->ct) ||
	    nf_flow_custom_gc(flow_table, flow)) {
		flow_offload_teardown(flow);
		teardown = true;
	} else if (!teardown) {
		/* Flow still active: keep the conntrack entry from expiring. */
		nf_flow_table_extend_ct_timeout(flow->ct);
	}

	if (teardown) {
		if (test_bit(NF_FLOW_HW, &flow->flags)) {
			/* Two-step hardware removal: request deletion first,
			 * free the software entry only once hardware has
			 * confirmed (HW_DYING -> HW_DEAD).
			 */
			if (!test_bit(NF_FLOW_HW_DYING, &flow->flags))
				nf_flow_offload_del(flow_table, flow);
			else if (test_bit(NF_FLOW_HW_DEAD, &flow->flags))
				flow_offload_del(flow_table, flow);
		} else {
			flow_offload_del(flow_table, flow);
		}
	} else if (test_bit(NF_FLOW_CLOSING, &flow->flags) &&
		   test_bit(NF_FLOW_HW, &flow->flags) &&
		   !test_bit(NF_FLOW_HW_DYING, &flow->flags)) {
		/* Closing flow: pull it out of hardware while it winds down. */
		nf_flow_offload_del(flow_table, flow);
	} else if (test_bit(NF_FLOW_HW, &flow->flags)) {
		nf_flow_offload_stats(flow_table, flow);
	}
}
589 
/* Run one full garbage-collection pass over @flow_table. */
void nf_flow_table_gc_run(struct nf_flowtable *flow_table)
{
	nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, NULL);
}
594 
595 static void nf_flow_offload_work_gc(struct work_struct *work)
596 {
597 	struct nf_flowtable *flow_table;
598 
599 	flow_table = container_of(work, struct nf_flowtable, gc_work.work);
600 	nf_flow_table_gc_run(flow_table);
601 	queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
602 }
603 
604 static void nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
605 				 __be16 port, __be16 new_port)
606 {
607 	struct tcphdr *tcph;
608 
609 	tcph = (void *)(skb_network_header(skb) + thoff);
610 	inet_proto_csum_replace2(&tcph->check, skb, port, new_port, false);
611 }
612 
613 static void nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
614 				 __be16 port, __be16 new_port)
615 {
616 	struct udphdr *udph;
617 
618 	udph = (void *)(skb_network_header(skb) + thoff);
619 	if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
620 		inet_proto_csum_replace2(&udph->check, skb, port,
621 					 new_port, false);
622 		if (!udph->check)
623 			udph->check = CSUM_MANGLED_0;
624 	}
625 }
626 
627 static void nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
628 			     u8 protocol, __be16 port, __be16 new_port)
629 {
630 	switch (protocol) {
631 	case IPPROTO_TCP:
632 		nf_flow_nat_port_tcp(skb, thoff, port, new_port);
633 		break;
634 	case IPPROTO_UDP:
635 		nf_flow_nat_port_udp(skb, thoff, port, new_port);
636 		break;
637 	}
638 }
639 
640 void nf_flow_snat_port(const struct flow_offload *flow,
641 		       struct sk_buff *skb, unsigned int thoff,
642 		       u8 protocol, enum flow_offload_tuple_dir dir)
643 {
644 	struct flow_ports *hdr;
645 	__be16 port, new_port;
646 
647 	hdr = (void *)(skb_network_header(skb) + thoff);
648 
649 	switch (dir) {
650 	case FLOW_OFFLOAD_DIR_ORIGINAL:
651 		port = hdr->source;
652 		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port;
653 		hdr->source = new_port;
654 		break;
655 	case FLOW_OFFLOAD_DIR_REPLY:
656 		port = hdr->dest;
657 		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port;
658 		hdr->dest = new_port;
659 		break;
660 	}
661 
662 	nf_flow_nat_port(skb, thoff, protocol, port, new_port);
663 }
664 EXPORT_SYMBOL_GPL(nf_flow_snat_port);
665 
666 void nf_flow_dnat_port(const struct flow_offload *flow, struct sk_buff *skb,
667 		       unsigned int thoff, u8 protocol,
668 		       enum flow_offload_tuple_dir dir)
669 {
670 	struct flow_ports *hdr;
671 	__be16 port, new_port;
672 
673 	hdr = (void *)(skb_network_header(skb) + thoff);
674 
675 	switch (dir) {
676 	case FLOW_OFFLOAD_DIR_ORIGINAL:
677 		port = hdr->dest;
678 		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port;
679 		hdr->dest = new_port;
680 		break;
681 	case FLOW_OFFLOAD_DIR_REPLY:
682 		port = hdr->source;
683 		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port;
684 		hdr->source = new_port;
685 		break;
686 	}
687 
688 	nf_flow_nat_port(skb, thoff, protocol, port, new_port);
689 }
690 EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
691 
692 int nf_flow_table_init(struct nf_flowtable *flowtable)
693 {
694 	int err;
695 
696 	INIT_DELAYED_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
697 	flow_block_init(&flowtable->flow_block);
698 	init_rwsem(&flowtable->flow_block_lock);
699 
700 	err = rhashtable_init(&flowtable->rhashtable,
701 			      &nf_flow_offload_rhash_params);
702 	if (err < 0)
703 		return err;
704 
705 	queue_delayed_work(system_power_efficient_wq,
706 			   &flowtable->gc_work, HZ);
707 
708 	mutex_lock(&flowtable_lock);
709 	list_add(&flowtable->list, &flowtables);
710 	mutex_unlock(&flowtable_lock);
711 
712 	return 0;
713 }
714 EXPORT_SYMBOL_GPL(nf_flow_table_init);
715 
716 static void nf_flow_table_do_cleanup(struct nf_flowtable *flow_table,
717 				     struct flow_offload *flow, void *data)
718 {
719 	struct net_device *dev = data;
720 
721 	if (!dev) {
722 		flow_offload_teardown(flow);
723 		return;
724 	}
725 
726 	if (net_eq(nf_ct_net(flow->ct), dev_net(dev)) &&
727 	    (flow->tuplehash[0].tuple.iifidx == dev->ifindex ||
728 	     flow->tuplehash[1].tuple.iifidx == dev->ifindex))
729 		flow_offload_teardown(flow);
730 }
731 
/* Tear down all flows of @flowtable bound to @dev (all flows if @dev is
 * NULL), wait for the pending gc pass to run, then flush outstanding
 * hardware offload work.
 */
void nf_flow_table_gc_cleanup(struct nf_flowtable *flowtable,
			      struct net_device *dev)
{
	nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev);
	flush_delayed_work(&flowtable->gc_work);
	nf_flow_table_offload_flush(flowtable);
}
739 
740 void nf_flow_table_cleanup(struct net_device *dev)
741 {
742 	struct nf_flowtable *flowtable;
743 
744 	mutex_lock(&flowtable_lock);
745 	list_for_each_entry(flowtable, &flowtables, list)
746 		nf_flow_table_gc_cleanup(flowtable, dev);
747 	mutex_unlock(&flowtable_lock);
748 }
749 EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);
750 
/* Final teardown of @flow_table: unlink it from the global list, stop
 * the periodic gc, flush pending offload work, tear down and free all
 * remaining flows, then destroy the hash table.
 */
void nf_flow_table_free(struct nf_flowtable *flow_table)
{
	mutex_lock(&flowtable_lock);
	list_del(&flow_table->list);
	mutex_unlock(&flowtable_lock);

	cancel_delayed_work_sync(&flow_table->gc_work);
	nf_flow_table_offload_flush(flow_table);
	/* ... no more pending work after this stage ... */
	nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
	nf_flow_table_gc_run(flow_table);
	nf_flow_table_offload_flush_cleanup(flow_table);
	rhashtable_destroy(&flow_table->rhashtable);
}
EXPORT_SYMBOL_GPL(nf_flow_table_free);
766 
767 static int nf_flow_table_init_net(struct net *net)
768 {
769 	net->ft.stat = alloc_percpu(struct nf_flow_table_stat);
770 	return net->ft.stat ? 0 : -ENOMEM;
771 }
772 
773 static void nf_flow_table_fini_net(struct net *net)
774 {
775 	free_percpu(net->ft.stat);
776 }
777 
/* Per-netns init: stats counters first, then the proc interface; the
 * stats allocation is unwound if proc setup fails.
 */
static int nf_flow_table_pernet_init(struct net *net)
{
	int ret;

	ret = nf_flow_table_init_net(net);
	if (ret < 0)
		return ret;

	ret = nf_flow_table_init_proc(net);
	if (ret < 0) {
		nf_flow_table_fini_net(net);
		return ret;
	}

	return 0;
}
796 
/* Batched netns exit: tear down proc entries and stats for each netns,
 * in reverse order of nf_flow_table_pernet_init().
 */
static void nf_flow_table_pernet_exit(struct list_head *net_exit_list)
{
	struct net *net;

	list_for_each_entry(net, net_exit_list, exit_list) {
		nf_flow_table_fini_proc(net);
		nf_flow_table_fini_net(net);
	}
}
806 
/* Per-netns lifecycle hooks (stats + procfs) for the flowtable core. */
static struct pernet_operations nf_flow_table_net_ops = {
	.init = nf_flow_table_pernet_init,
	.exit_batch = nf_flow_table_pernet_exit,
};
811 
812 static int __init nf_flow_table_module_init(void)
813 {
814 	int ret;
815 
816 	flow_offload_cachep = KMEM_CACHE(flow_offload, SLAB_HWCACHE_ALIGN);
817 	if (!flow_offload_cachep)
818 		return -ENOMEM;
819 
820 	ret = register_pernet_subsys(&nf_flow_table_net_ops);
821 	if (ret < 0)
822 		goto out_pernet;
823 
824 	ret = nf_flow_table_offload_init();
825 	if (ret)
826 		goto out_offload;
827 
828 	ret = nf_flow_register_bpf();
829 	if (ret)
830 		goto out_bpf;
831 
832 	return 0;
833 
834 out_bpf:
835 	nf_flow_table_offload_exit();
836 out_offload:
837 	unregister_pernet_subsys(&nf_flow_table_net_ops);
838 out_pernet:
839 	kmem_cache_destroy(flow_offload_cachep);
840 	return ret;
841 }
842 
/* Module exit: unwind in reverse order of nf_flow_table_module_init().
 * NOTE(review): nf_flow_register_bpf() has no counterpart here —
 * presumably bpf kfunc registration cannot be undone; verify.
 */
static void __exit nf_flow_table_module_exit(void)
{
	nf_flow_table_offload_exit();
	unregister_pernet_subsys(&nf_flow_table_net_ops);
	kmem_cache_destroy(flow_offload_cachep);
}
849 
/* Module registration boilerplate. */
module_init(nf_flow_table_module_init);
module_exit(nf_flow_table_module_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_DESCRIPTION("Netfilter flow table module");
856