xref: /linux/net/netfilter/nf_flow_table_core.c (revision 8f7aa3d3c7323f4ca2768a9e74ebbe359c4f8f88)
1 // SPDX-License-Identifier: GPL-2.0-only
2 #include <linux/kernel.h>
3 #include <linux/init.h>
4 #include <linux/module.h>
5 #include <linux/netfilter.h>
6 #include <linux/rhashtable.h>
7 #include <linux/netdevice.h>
8 #include <net/ip.h>
9 #include <net/ip6_route.h>
10 #include <net/netfilter/nf_tables.h>
11 #include <net/netfilter/nf_flow_table.h>
12 #include <net/netfilter/nf_conntrack.h>
13 #include <net/netfilter/nf_conntrack_core.h>
14 #include <net/netfilter/nf_conntrack_l4proto.h>
15 #include <net/netfilter/nf_conntrack_tuple.h>
16 
17 static DEFINE_MUTEX(flowtable_lock);
18 static LIST_HEAD(flowtables);
19 
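/* Populate the flow tuple for @dir from the conntrack tuple of the same
 * direction: L3 addresses, L3/L4 protocol numbers and, for TCP and UDP,
 * the source and destination ports.
 */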
20 static void
21 flow_offload_fill_dir(struct flow_offload *flow,
22 		      enum flow_offload_tuple_dir dir)
23 {
24 	struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple;
25 	struct nf_conntrack_tuple *ctt = &flow->ct->tuplehash[dir].tuple;
26 
27 	ft->dir = dir;
28 
29 	switch (ctt->src.l3num) {
30 	case NFPROTO_IPV4:
31 		ft->src_v4 = ctt->src.u3.in;
32 		ft->dst_v4 = ctt->dst.u3.in;
33 		break;
34 	case NFPROTO_IPV6:
35 		ft->src_v6 = ctt->src.u3.in6;
36 		ft->dst_v6 = ctt->dst.u3.in6;
37 		break;
38 	}
39 
40 	ft->l3proto = ctt->src.l3num;
41 	ft->l4proto = ctt->dst.protonum;
42 
43 	switch (ctt->dst.protonum) {
44 	case IPPROTO_TCP:
45 	case IPPROTO_UDP:
46 		ft->src_port = ctt->src.u.tcp.port;
47 		ft->dst_port = ctt->dst.u.tcp.port;
48 		break;
49 	}
50 }
51 
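/* Allocate a flow entry for @ct and fill both tuple directions from the
 * conntrack entry. Takes a reference on @ct and copies the SNAT/DNAT
 * status bits into the flow flags. Returns NULL if @ct is dying or the
 * allocation fails.
 */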
52 struct flow_offload *flow_offload_alloc(struct nf_conn *ct)
53 {
54 	struct flow_offload *flow;
55 
56 	if (unlikely(nf_ct_is_dying(ct)))
57 		return NULL;
58 
59 	flow = kzalloc(sizeof(*flow), GFP_ATOMIC);
60 	if (!flow)
61 		return NULL;
62 
63 	refcount_inc(&ct->ct_general.use);
64 	flow->ct = ct;
65 
66 	flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
67 	flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_REPLY);
68 
69 	if (ct->status & IPS_SRC_NAT)
70 		__set_bit(NF_FLOW_SNAT, &flow->flags);
71 	if (ct->status & IPS_DST_NAT)
72 		__set_bit(NF_FLOW_DNAT, &flow->flags);
73 
74 	return flow;
75 }
76 EXPORT_SYMBOL_GPL(flow_offload_alloc);
77 
78 static u32 flow_offload_dst_cookie(struct flow_offload_tuple *flow_tuple)
79 {
80 	if (flow_tuple->l3proto == NFPROTO_IPV6)
81 		return rt6_get_cookie(dst_rt6_info(flow_tuple->dst_cache));
82 
83 	return 0;
84 }
85 
86 static struct dst_entry *nft_route_dst_fetch(struct nf_flow_route *route,
87 					     enum flow_offload_tuple_dir dir)
88 {
89 	struct dst_entry *dst = route->tuple[dir].dst;
90 
91 	route->tuple[dir].dst = NULL;
92 
93 	return dst;
94 }
95 
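/* Fill in the transmit path for @dir: MTU, input interface, encapsulation
 * and tunnel metadata, and either the outgoing Ethernet addresses (direct
 * xmit; the dst is released) or the cached dst entry and its cookie
 * (neigh/xfrm xmit).
 */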
96 static int flow_offload_fill_route(struct flow_offload *flow,
97 				   struct nf_flow_route *route,
98 				   enum flow_offload_tuple_dir dir)
99 {
100 	struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple;
101 	struct dst_entry *dst = nft_route_dst_fetch(route, dir);
102 	int i, j = 0;
103 
104 	switch (flow_tuple->l3proto) {
105 	case NFPROTO_IPV4:
106 		flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true);
107 		break;
108 	case NFPROTO_IPV6:
109 		flow_tuple->mtu = ip6_dst_mtu_maybe_forward(dst, true);
110 		break;
111 	}
112 
113 	flow_tuple->iifidx = route->tuple[dir].in.ifindex;
114 	for (i = route->tuple[dir].in.num_encaps - 1; i >= 0; i--) {
115 		flow_tuple->encap[j].id = route->tuple[dir].in.encap[i].id;
116 		flow_tuple->encap[j].proto = route->tuple[dir].in.encap[i].proto;
117 		if (route->tuple[dir].in.ingress_vlans & BIT(i))
118 			flow_tuple->in_vlan_ingress |= BIT(j);
119 		j++;
120 	}
121 
122 	flow_tuple->tun = route->tuple[dir].in.tun;
123 	flow_tuple->encap_num = route->tuple[dir].in.num_encaps;
124 	flow_tuple->tun_num = route->tuple[dir].in.num_tuns;
125 
126 	switch (route->tuple[dir].xmit_type) {
127 	case FLOW_OFFLOAD_XMIT_DIRECT:
128 		memcpy(flow_tuple->out.h_dest, route->tuple[dir].out.h_dest,
129 		       ETH_ALEN);
130 		memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source,
131 		       ETH_ALEN);
132 		flow_tuple->out.ifidx = route->tuple[dir].out.ifindex;
133 		dst_release(dst);
134 		break;
135 	case FLOW_OFFLOAD_XMIT_XFRM:
136 	case FLOW_OFFLOAD_XMIT_NEIGH:
137 		flow_tuple->ifidx = route->tuple[dir].out.ifindex;
138 		flow_tuple->dst_cache = dst;
139 		flow_tuple->dst_cookie = flow_offload_dst_cookie(flow_tuple);
140 		break;
141 	default:
142 		WARN_ON_ONCE(1);
143 		break;
144 	}
145 	flow_tuple->xmit_type = route->tuple[dir].xmit_type;
146 
147 	return 0;
148 }
149 
150 static void nft_flow_dst_release(struct flow_offload *flow,
151 				 enum flow_offload_tuple_dir dir)
152 {
153 	if (flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH ||
154 	    flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)
155 		dst_release(flow->tuplehash[dir].tuple.dst_cache);
156 }
157 
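/* Attach the route information to @flow: transfer the dst entries and
 * transmit metadata for both directions and mark the flow as
 * NF_FLOW_OFFLOAD_ROUTE.
 */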
158 void flow_offload_route_init(struct flow_offload *flow,
159 			     struct nf_flow_route *route)
160 {
161 	flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL);
162 	flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY);
163 	flow->type = NF_FLOW_OFFLOAD_ROUTE;
164 }
165 EXPORT_SYMBOL_GPL(flow_offload_route_init);
166 
167 static inline bool nf_flow_has_expired(const struct flow_offload *flow)
168 {
169 	return nf_flow_timeout_delta(flow->timeout) <= 0;
170 }
171 
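/* Force the conntrack TCP state to @tcp_state and invalidate the window
 * tracking data, which is stale after packets bypassed conntrack.
 */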
172 static void flow_offload_fixup_tcp(struct nf_conn *ct, u8 tcp_state)
173 {
174 	struct ip_ct_tcp *tcp = &ct->proto.tcp;
175 
176 	spin_lock_bh(&ct->lock);
177 	if (tcp->state != tcp_state)
178 		tcp->state = tcp_state;
179 
180 	/* A syn packet triggers the TCP reopen case from conntrack. */
181 	if (tcp->state == TCP_CONNTRACK_CLOSE)
182 		ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
183 
184 	/* Conntrack state is outdated due to offload bypass.
185 	 * Clear IP_CT_TCP_FLAG_MAXACK_SET, otherwise conntrack's
186 	 * TCP reset validation will fail.
187 	 */
188 	tcp->seen[0].td_maxwin = 0;
189 	tcp->seen[0].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET;
190 	tcp->seen[1].td_maxwin = 0;
191 	tcp->seen[1].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET;
192 	spin_unlock_bh(&ct->lock);
193 }
194 
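/* Hand the connection back to conntrack: resync the TCP state (forcing
 * CLOSE if a fin/rst was seen) and refresh the conntrack timeout to match
 * the protocol state instead of the large offload timeout. Only TCP and
 * UDP are handled here.
 */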
195 static void flow_offload_fixup_ct(struct flow_offload *flow)
196 {
197 	struct nf_conn *ct = flow->ct;
198 	struct net *net = nf_ct_net(ct);
199 	int l4num = nf_ct_protonum(ct);
200 	bool expired, closing = false;
201 	u32 offload_timeout = 0;
202 	s32 timeout;
203 
204 	if (l4num == IPPROTO_TCP) {
205 		const struct nf_tcp_net *tn = nf_tcp_pernet(net);
206 		u8 tcp_state;
207 
208 		/* Enter CLOSE state if a fin/rst packet has been seen; this
209 		 * allows TCP reopen from conntrack. Otherwise, pick up from
210 		 * the last seen TCP state.
211 		 */
212 		closing = test_bit(NF_FLOW_CLOSING, &flow->flags);
213 		if (closing) {
214 			flow_offload_fixup_tcp(ct, TCP_CONNTRACK_CLOSE);
215 			timeout = READ_ONCE(tn->timeouts[TCP_CONNTRACK_CLOSE]);
216 			expired = false;
217 		} else {
218 			tcp_state = READ_ONCE(ct->proto.tcp.state);
219 			flow_offload_fixup_tcp(ct, tcp_state);
220 			timeout = READ_ONCE(tn->timeouts[tcp_state]);
221 			expired = nf_flow_has_expired(flow);
222 		}
223 		offload_timeout = READ_ONCE(tn->offload_timeout);
224 
225 	} else if (l4num == IPPROTO_UDP) {
226 		const struct nf_udp_net *tn = nf_udp_pernet(net);
227 		enum udp_conntrack state =
228 			test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ?
229 			UDP_CT_REPLIED : UDP_CT_UNREPLIED;
230 
231 		timeout = READ_ONCE(tn->timeouts[state]);
232 		expired = nf_flow_has_expired(flow);
233 		offload_timeout = READ_ONCE(tn->offload_timeout);
234 	} else {
235 		return;
236 	}
237 
238 	if (expired)
239 		timeout -= offload_timeout;
240 
241 	if (timeout < 0)
242 		timeout = 0;
243 
244 	if (closing ||
245 	    nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout)
246 		nf_ct_refresh(ct, timeout);
247 }
248 
249 static void flow_offload_route_release(struct flow_offload *flow)
250 {
251 	nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
252 	nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_REPLY);
253 }
254 
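/* Release a flow entry: drop the cached route state for route type flows,
 * put the conntrack reference and free the entry after an RCU grace
 * period.
 */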
255 void flow_offload_free(struct flow_offload *flow)
256 {
257 	switch (flow->type) {
258 	case NF_FLOW_OFFLOAD_ROUTE:
259 		flow_offload_route_release(flow);
260 		break;
261 	default:
262 		break;
263 	}
264 	nf_ct_put(flow->ct);
265 	kfree_rcu(flow, rcu_head);
266 }
267 EXPORT_SYMBOL_GPL(flow_offload_free);
268 
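/* rhashtable callbacks: flows are keyed on the tuple fields that precede
 * the __hash marker, so only the lookup-relevant part of the tuple is
 * hashed and compared.
 */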
269 static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
270 {
271 	const struct flow_offload_tuple *tuple = data;
272 
273 	return jhash(tuple, offsetof(struct flow_offload_tuple, __hash), seed);
274 }
275 
276 static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
277 {
278 	const struct flow_offload_tuple_rhash *tuplehash = data;
279 
280 	return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, __hash), seed);
281 }
282 
283 static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
284 					const void *ptr)
285 {
286 	const struct flow_offload_tuple *tuple = arg->key;
287 	const struct flow_offload_tuple_rhash *x = ptr;
288 
289 	if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, __hash)))
290 		return 1;
291 
292 	return 0;
293 }
294 
295 static const struct rhashtable_params nf_flow_offload_rhash_params = {
296 	.head_offset		= offsetof(struct flow_offload_tuple_rhash, node),
297 	.hashfn			= flow_offload_hash,
298 	.obj_hashfn		= flow_offload_hash_obj,
299 	.obj_cmpfn		= flow_offload_hash_cmp,
300 	.automatic_shrinking	= true,
301 };
302 
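/* Flowtable entry timeout: the per-netns TCP or UDP offload timeout, or
 * NF_FLOW_TIMEOUT for any other protocol.
 */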
303 unsigned long flow_offload_get_timeout(struct flow_offload *flow)
304 {
305 	unsigned long timeout = NF_FLOW_TIMEOUT;
306 	struct net *net = nf_ct_net(flow->ct);
307 	int l4num = nf_ct_protonum(flow->ct);
308 
309 	if (l4num == IPPROTO_TCP) {
310 		struct nf_tcp_net *tn = nf_tcp_pernet(net);
311 
312 		timeout = tn->offload_timeout;
313 	} else if (l4num == IPPROTO_UDP) {
314 		struct nf_udp_net *tn = nf_udp_pernet(net);
315 
316 		timeout = tn->offload_timeout;
317 	}
318 
319 	return timeout;
320 }
321 
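/* Insert both directions of @flow into the flowtable hash. The conntrack
 * timeout is bumped to NF_CT_DAY so the entry does not expire while
 * packets bypass conntrack, and a hardware offload request is issued if
 * the flowtable supports it.
 */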
322 int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
323 {
324 	int err;
325 
326 	flow->timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow);
327 
328 	err = rhashtable_insert_fast(&flow_table->rhashtable,
329 				     &flow->tuplehash[0].node,
330 				     nf_flow_offload_rhash_params);
331 	if (err < 0)
332 		return err;
333 
334 	err = rhashtable_insert_fast(&flow_table->rhashtable,
335 				     &flow->tuplehash[1].node,
336 				     nf_flow_offload_rhash_params);
337 	if (err < 0) {
338 		rhashtable_remove_fast(&flow_table->rhashtable,
339 				       &flow->tuplehash[0].node,
340 				       nf_flow_offload_rhash_params);
341 		return err;
342 	}
343 
344 	nf_ct_refresh(flow->ct, NF_CT_DAY);
345 
346 	if (nf_flowtable_hw_offload(flow_table)) {
347 		__set_bit(NF_FLOW_HW, &flow->flags);
348 		nf_flow_offload_add(flow_table, flow);
349 	}
350 
351 	return 0;
352 }
353 EXPORT_SYMBOL_GPL(flow_offload_add);
354 
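/* Refresh the timeout of a flow seen in the software path. The timestamp
 * is only rewritten when forced or when it has aged by more than one
 * second, so the entry is not dirtied on every packet. Hardware offload
 * is (re)requested unless it is unsupported or the flow is closing.
 */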
355 void flow_offload_refresh(struct nf_flowtable *flow_table,
356 			  struct flow_offload *flow, bool force)
357 {
358 	u32 timeout;
359 
360 	timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow);
361 	if (force || timeout - READ_ONCE(flow->timeout) > HZ)
362 		WRITE_ONCE(flow->timeout, timeout);
363 	else
364 		return;
365 
366 	if (likely(!nf_flowtable_hw_offload(flow_table)) ||
367 	    test_bit(NF_FLOW_CLOSING, &flow->flags))
368 		return;
369 
370 	nf_flow_offload_add(flow_table, flow);
371 }
372 EXPORT_SYMBOL_GPL(flow_offload_refresh);
373 
374 static void flow_offload_del(struct nf_flowtable *flow_table,
375 			     struct flow_offload *flow)
376 {
377 	rhashtable_remove_fast(&flow_table->rhashtable,
378 			       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
379 			       nf_flow_offload_rhash_params);
380 	rhashtable_remove_fast(&flow_table->rhashtable,
381 			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
382 			       nf_flow_offload_rhash_params);
383 	flow_offload_free(flow);
384 }
385 
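/* Stop offloading @flow: clear the conntrack offload bit and, on the
 * first teardown request, hand the connection state and timeout back to
 * conntrack. The garbage collector frees the entry later.
 */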
386 void flow_offload_teardown(struct flow_offload *flow)
387 {
388 	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
389 	if (!test_and_set_bit(NF_FLOW_TEARDOWN, &flow->flags))
390 		flow_offload_fixup_ct(flow);
391 }
392 EXPORT_SYMBOL_GPL(flow_offload_teardown);
393 
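/* Look up the flowtable entry matching @tuple. Returns NULL if no entry
 * exists, if the flow is being torn down or if its conntrack entry is
 * dying.
 */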
394 struct flow_offload_tuple_rhash *
395 flow_offload_lookup(struct nf_flowtable *flow_table,
396 		    struct flow_offload_tuple *tuple)
397 {
398 	struct flow_offload_tuple_rhash *tuplehash;
399 	struct flow_offload *flow;
400 	int dir;
401 
402 	tuplehash = rhashtable_lookup(&flow_table->rhashtable, tuple,
403 				      nf_flow_offload_rhash_params);
404 	if (!tuplehash)
405 		return NULL;
406 
407 	dir = tuplehash->tuple.dir;
408 	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
409 	if (test_bit(NF_FLOW_TEARDOWN, &flow->flags))
410 		return NULL;
411 
412 	if (unlikely(nf_ct_is_dying(flow->ct)))
413 		return NULL;
414 
415 	return tuplehash;
416 }
417 EXPORT_SYMBOL_GPL(flow_offload_lookup);
418 
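/* Walk all flows in the table and invoke @iter once per flow; only the
 * ORIGINAL direction entry is passed so each flow is visited once.
 */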
419 static int
420 nf_flow_table_iterate(struct nf_flowtable *flow_table,
421 		      void (*iter)(struct nf_flowtable *flowtable,
422 				   struct flow_offload *flow, void *data),
423 		      void *data)
424 {
425 	struct flow_offload_tuple_rhash *tuplehash;
426 	struct rhashtable_iter hti;
427 	struct flow_offload *flow;
428 	int err = 0;
429 
430 	rhashtable_walk_enter(&flow_table->rhashtable, &hti);
431 	rhashtable_walk_start(&hti);
432 
433 	while ((tuplehash = rhashtable_walk_next(&hti))) {
434 		if (IS_ERR(tuplehash)) {
435 			if (PTR_ERR(tuplehash) != -EAGAIN) {
436 				err = PTR_ERR(tuplehash);
437 				break;
438 			}
439 			continue;
440 		}
441 		if (tuplehash->tuple.dir)
442 			continue;
443 
444 		flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);
445 
446 		iter(flow_table, flow, data);
447 	}
448 	rhashtable_walk_stop(&hti);
449 	rhashtable_walk_exit(&hti);
450 
451 	return err;
452 }
453 
454 static bool nf_flow_custom_gc(struct nf_flowtable *flow_table,
455 			      const struct flow_offload *flow)
456 {
457 	return flow_table->type->gc && flow_table->type->gc(flow);
458 }
459 
460 /**
461  * nf_flow_table_tcp_timeout() - new timeout of offloaded tcp entry
462  * @ct:		Flowtable offloaded tcp ct
463  *
464  * Return: timeout in jiffies after which the ct entry should expire.
465  */
466 static u32 nf_flow_table_tcp_timeout(const struct nf_conn *ct)
467 {
468 	u8 state = READ_ONCE(ct->proto.tcp.state);
469 
470 	switch (state) {
471 	case TCP_CONNTRACK_SYN_SENT:
472 	case TCP_CONNTRACK_SYN_RECV:
473 		return 0;
474 	case TCP_CONNTRACK_ESTABLISHED:
475 		return NF_CT_DAY;
476 	case TCP_CONNTRACK_FIN_WAIT:
477 	case TCP_CONNTRACK_CLOSE_WAIT:
478 	case TCP_CONNTRACK_LAST_ACK:
479 	case TCP_CONNTRACK_TIME_WAIT:
480 		return 5 * 60 * HZ;
481 	case TCP_CONNTRACK_CLOSE:
482 		return 0;
483 	}
484 
485 	return 0;
486 }
487 
488 /**
489  * nf_flow_table_extend_ct_timeout() - Extend ct timeout of offloaded conntrack entry
490  * @ct:		Flowtable offloaded ct
491  *
492  * Datapath lookups in the conntrack table will evict nf_conn entries
493  * if they have expired.
494  *
495  * Once nf_conn entries have been offloaded, nf_conntrack might not see any
496  * packets anymore.  Thus ct->timeout is no longer refreshed and ct can
497  * be evicted.
498  *
499  * To avoid the need for an additional check on the offload bit for every
500  * packet processed via nf_conntrack_in(), set an arbitrary timeout large
501  * enough to never expire; this saves us a check for the IPS_OFFLOAD_BIT
502  * from the packet path via nf_ct_is_expired().
503  */
504 static void nf_flow_table_extend_ct_timeout(struct nf_conn *ct)
505 {
506 	static const u32 min_timeout = 5 * 60 * HZ;
507 	u32 expires = nf_ct_expires(ct);
508 
509 	/* normal case: large enough timeout, nothing to do. */
510 	if (likely(expires >= min_timeout))
511 		return;
512 
513 	/* Must check the offload bit after this; we do not hold any locks.
514 	 * flowtable and ct entries could have been removed on another CPU.
515 	 */
516 	if (!refcount_inc_not_zero(&ct->ct_general.use))
517 		return;
518 
519 	/* load ct->status after refcount increase */
520 	smp_acquire__after_ctrl_dep();
521 
522 	if (nf_ct_is_confirmed(ct) &&
523 	    test_bit(IPS_OFFLOAD_BIT, &ct->status)) {
524 		u8 l4proto = nf_ct_protonum(ct);
525 		u32 new_timeout = true;
526 
527 		switch (l4proto) {
528 		case IPPROTO_UDP:
529 			new_timeout = NF_CT_DAY;
530 			break;
531 		case IPPROTO_TCP:
532 			new_timeout = nf_flow_table_tcp_timeout(ct);
533 			break;
534 		default:
535 			WARN_ON_ONCE(1);
536 			break;
537 		}
538 
539 		/* Updates to ct->timeout from nf_conntrack happen
540 		 * without holding ct->lock.
541 		 *
542 		 * Use cmpxchg to ensure timeout extension doesn't
543 		 * happen when we race with conntrack datapath.
544 		 *
545 		 * The inverse -- datapath updating ->timeout right
546 		 * after this -- is fine; the datapath is authoritative.
547 		 */
548 		if (new_timeout) {
549 			new_timeout += nfct_time_stamp;
550 			cmpxchg(&ct->timeout, expires, new_timeout);
551 		}
552 	}
553 
554 	nf_ct_put(ct);
555 }
556 
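/* Per-flow garbage collection step: flows that have expired, whose
 * conntrack entry is dying or that the flowtable type's custom gc flags
 * are torn down; torn-down flows are freed once any hardware state has
 * been removed. Flows that remain offloaded get their conntrack timeout
 * extended and, if hardware-offloaded, their stats synced.
 */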
557 static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table,
558 				    struct flow_offload *flow, void *data)
559 {
560 	bool teardown = test_bit(NF_FLOW_TEARDOWN, &flow->flags);
561 
562 	if (nf_flow_has_expired(flow) ||
563 	    nf_ct_is_dying(flow->ct) ||
564 	    nf_flow_custom_gc(flow_table, flow)) {
565 		flow_offload_teardown(flow);
566 		teardown = true;
567 	} else if (!teardown) {
568 		nf_flow_table_extend_ct_timeout(flow->ct);
569 	}
570 
571 	if (teardown) {
572 		if (test_bit(NF_FLOW_HW, &flow->flags)) {
573 			if (!test_bit(NF_FLOW_HW_DYING, &flow->flags))
574 				nf_flow_offload_del(flow_table, flow);
575 			else if (test_bit(NF_FLOW_HW_DEAD, &flow->flags))
576 				flow_offload_del(flow_table, flow);
577 		} else {
578 			flow_offload_del(flow_table, flow);
579 		}
580 	} else if (test_bit(NF_FLOW_CLOSING, &flow->flags) &&
581 		   test_bit(NF_FLOW_HW, &flow->flags) &&
582 		   !test_bit(NF_FLOW_HW_DYING, &flow->flags)) {
583 		nf_flow_offload_del(flow_table, flow);
584 	} else if (test_bit(NF_FLOW_HW, &flow->flags)) {
585 		nf_flow_offload_stats(flow_table, flow);
586 	}
587 }
588 
589 void nf_flow_table_gc_run(struct nf_flowtable *flow_table)
590 {
591 	nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, NULL);
592 }
593 
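/* Periodic GC worker: run one garbage collection pass, then re-arm the
 * delayed work with a one second period.
 */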
594 static void nf_flow_offload_work_gc(struct work_struct *work)
595 {
596 	struct nf_flowtable *flow_table;
597 
598 	flow_table = container_of(work, struct nf_flowtable, gc_work.work);
599 	nf_flow_table_gc_run(flow_table);
600 	queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
601 }
602 
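/* Checksum fixups for port NAT: the caller has already rewritten the port
 * in the packet; these helpers update the TCP/UDP checksum accordingly
 * (the UDP checksum is only touched when it is in use).
 */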
603 static void nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
604 				 __be16 port, __be16 new_port)
605 {
606 	struct tcphdr *tcph;
607 
608 	tcph = (void *)(skb_network_header(skb) + thoff);
609 	inet_proto_csum_replace2(&tcph->check, skb, port, new_port, false);
610 }
611 
612 static void nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
613 				 __be16 port, __be16 new_port)
614 {
615 	struct udphdr *udph;
616 
617 	udph = (void *)(skb_network_header(skb) + thoff);
618 	if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
619 		inet_proto_csum_replace2(&udph->check, skb, port,
620 					 new_port, false);
621 		if (!udph->check)
622 			udph->check = CSUM_MANGLED_0;
623 	}
624 }
625 
626 static void nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
627 			     u8 protocol, __be16 port, __be16 new_port)
628 {
629 	switch (protocol) {
630 	case IPPROTO_TCP:
631 		nf_flow_nat_port_tcp(skb, thoff, port, new_port);
632 		break;
633 	case IPPROTO_UDP:
634 		nf_flow_nat_port_udp(skb, thoff, port, new_port);
635 		break;
636 	}
637 }
638 
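/* Source port NAT for @dir: rewrite the source port in the original
 * direction and the destination port in the reply direction with the
 * value recorded in the opposite tuple, then fix up the checksum.
 */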
639 void nf_flow_snat_port(const struct flow_offload *flow,
640 		       struct sk_buff *skb, unsigned int thoff,
641 		       u8 protocol, enum flow_offload_tuple_dir dir)
642 {
643 	struct flow_ports *hdr;
644 	__be16 port, new_port;
645 
646 	hdr = (void *)(skb_network_header(skb) + thoff);
647 
648 	switch (dir) {
649 	case FLOW_OFFLOAD_DIR_ORIGINAL:
650 		port = hdr->source;
651 		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port;
652 		hdr->source = new_port;
653 		break;
654 	case FLOW_OFFLOAD_DIR_REPLY:
655 		port = hdr->dest;
656 		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port;
657 		hdr->dest = new_port;
658 		break;
659 	}
660 
661 	nf_flow_nat_port(skb, thoff, protocol, port, new_port);
662 }
663 EXPORT_SYMBOL_GPL(nf_flow_snat_port);
664 
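/* Destination port NAT for @dir: counterpart of nf_flow_snat_port(),
 * rewriting the destination port in the original direction and the
 * source port in the reply direction.
 */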
665 void nf_flow_dnat_port(const struct flow_offload *flow, struct sk_buff *skb,
666 		       unsigned int thoff, u8 protocol,
667 		       enum flow_offload_tuple_dir dir)
668 {
669 	struct flow_ports *hdr;
670 	__be16 port, new_port;
671 
672 	hdr = (void *)(skb_network_header(skb) + thoff);
673 
674 	switch (dir) {
675 	case FLOW_OFFLOAD_DIR_ORIGINAL:
676 		port = hdr->dest;
677 		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port;
678 		hdr->dest = new_port;
679 		break;
680 	case FLOW_OFFLOAD_DIR_REPLY:
681 		port = hdr->source;
682 		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port;
683 		hdr->source = new_port;
684 		break;
685 	}
686 
687 	nf_flow_nat_port(skb, thoff, protocol, port, new_port);
688 }
689 EXPORT_SYMBOL_GPL(nf_flow_dnat_port);
690 
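/* Initialize a flowtable: set up the hash table, flow block and periodic
 * garbage collection work, then add the table to the global flowtable
 * list.
 */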
691 int nf_flow_table_init(struct nf_flowtable *flowtable)
692 {
693 	int err;
694 
695 	INIT_DELAYED_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
696 	flow_block_init(&flowtable->flow_block);
697 	init_rwsem(&flowtable->flow_block_lock);
698 
699 	err = rhashtable_init(&flowtable->rhashtable,
700 			      &nf_flow_offload_rhash_params);
701 	if (err < 0)
702 		return err;
703 
704 	queue_delayed_work(system_power_efficient_wq,
705 			   &flowtable->gc_work, HZ);
706 
707 	mutex_lock(&flowtable_lock);
708 	list_add(&flowtable->list, &flowtables);
709 	mutex_unlock(&flowtable_lock);
710 
711 	return 0;
712 }
713 EXPORT_SYMBOL_GPL(nf_flow_table_init);
714 
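/* GC iterator callback: tear down flows whose input interface is the
 * given net_device (in the matching netns), or all flows when no device
 * is given.
 */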
715 static void nf_flow_table_do_cleanup(struct nf_flowtable *flow_table,
716 				     struct flow_offload *flow, void *data)
717 {
718 	struct net_device *dev = data;
719 
720 	if (!dev) {
721 		flow_offload_teardown(flow);
722 		return;
723 	}
724 
725 	if (net_eq(nf_ct_net(flow->ct), dev_net(dev)) &&
726 	    (flow->tuplehash[0].tuple.iifidx == dev->ifindex ||
727 	     flow->tuplehash[1].tuple.iifidx == dev->ifindex))
728 		flow_offload_teardown(flow);
729 }
730 
731 void nf_flow_table_gc_cleanup(struct nf_flowtable *flowtable,
732 			      struct net_device *dev)
733 {
734 	nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev);
735 	flush_delayed_work(&flowtable->gc_work);
736 	nf_flow_table_offload_flush(flowtable);
737 }
738 
739 void nf_flow_table_cleanup(struct net_device *dev)
740 {
741 	struct nf_flowtable *flowtable;
742 
743 	mutex_lock(&flowtable_lock);
744 	list_for_each_entry(flowtable, &flowtables, list)
745 		nf_flow_table_gc_cleanup(flowtable, dev);
746 	mutex_unlock(&flowtable_lock);
747 }
748 EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);
749 
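/* Destroy a flowtable: unlink it from the global list, stop the garbage
 * collection work, flush pending hardware offload work, tear down and
 * free the remaining flows, then release the hash table.
 */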
750 void nf_flow_table_free(struct nf_flowtable *flow_table)
751 {
752 	mutex_lock(&flowtable_lock);
753 	list_del(&flow_table->list);
754 	mutex_unlock(&flowtable_lock);
755 
756 	cancel_delayed_work_sync(&flow_table->gc_work);
757 	nf_flow_table_offload_flush(flow_table);
758 	/* ... no more pending work after this stage ... */
759 	nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
760 	nf_flow_table_gc_run(flow_table);
761 	nf_flow_table_offload_flush_cleanup(flow_table);
762 	rhashtable_destroy(&flow_table->rhashtable);
763 }
764 EXPORT_SYMBOL_GPL(nf_flow_table_free);
765 
766 static int nf_flow_table_init_net(struct net *net)
767 {
768 	net->ft.stat = alloc_percpu(struct nf_flow_table_stat);
769 	return net->ft.stat ? 0 : -ENOMEM;
770 }
771 
772 static void nf_flow_table_fini_net(struct net *net)
773 {
774 	free_percpu(net->ft.stat);
775 }
776 
777 static int nf_flow_table_pernet_init(struct net *net)
778 {
779 	int ret;
780 
781 	ret = nf_flow_table_init_net(net);
782 	if (ret < 0)
783 		return ret;
784 
785 	ret = nf_flow_table_init_proc(net);
786 	if (ret < 0)
787 		goto out_proc;
788 
789 	return 0;
790 
791 out_proc:
792 	nf_flow_table_fini_net(net);
793 	return ret;
794 }
795 
796 static void nf_flow_table_pernet_exit(struct list_head *net_exit_list)
797 {
798 	struct net *net;
799 
800 	list_for_each_entry(net, net_exit_list, exit_list) {
801 		nf_flow_table_fini_proc(net);
802 		nf_flow_table_fini_net(net);
803 	}
804 }
805 
806 static struct pernet_operations nf_flow_table_net_ops = {
807 	.init = nf_flow_table_pernet_init,
808 	.exit_batch = nf_flow_table_pernet_exit,
809 };
810 
811 static int __init nf_flow_table_module_init(void)
812 {
813 	int ret;
814 
815 	ret = register_pernet_subsys(&nf_flow_table_net_ops);
816 	if (ret < 0)
817 		return ret;
818 
819 	ret = nf_flow_table_offload_init();
820 	if (ret)
821 		goto out_offload;
822 
823 	ret = nf_flow_register_bpf();
824 	if (ret)
825 		goto out_bpf;
826 
827 	return 0;
828 
829 out_bpf:
830 	nf_flow_table_offload_exit();
831 out_offload:
832 	unregister_pernet_subsys(&nf_flow_table_net_ops);
833 	return ret;
834 }
835 
836 static void __exit nf_flow_table_module_exit(void)
837 {
838 	nf_flow_table_offload_exit();
839 	unregister_pernet_subsys(&nf_flow_table_net_ops);
840 }
841 
842 module_init(nf_flow_table_module_init);
843 module_exit(nf_flow_table_module_exit);
844 
845 MODULE_LICENSE("GPL");
846 MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
847 MODULE_DESCRIPTION("Netfilter flow table module");
848