// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netfilter.h>
#include <linux/rhashtable.h>
#include <linux/netdevice.h>
#include <net/ip.h>
#include <net/ip6_route.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_flow_table.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_l4proto.h>
#include <net/netfilter/nf_conntrack_tuple.h>

static DEFINE_MUTEX(flowtable_lock);
static LIST_HEAD(flowtables);
static __read_mostly struct kmem_cache *flow_offload_cachep;

static void
flow_offload_fill_dir(struct flow_offload *flow,
		      enum flow_offload_tuple_dir dir)
{
	struct flow_offload_tuple *ft = &flow->tuplehash[dir].tuple;
	struct nf_conntrack_tuple *ctt = &flow->ct->tuplehash[dir].tuple;

	ft->dir = dir;

	switch (ctt->src.l3num) {
	case NFPROTO_IPV4:
		ft->src_v4 = ctt->src.u3.in;
		ft->dst_v4 = ctt->dst.u3.in;
		break;
	case NFPROTO_IPV6:
		ft->src_v6 = ctt->src.u3.in6;
		ft->dst_v6 = ctt->dst.u3.in6;
		break;
	}

	ft->l3proto = ctt->src.l3num;
	ft->l4proto = ctt->dst.protonum;

	switch (ctt->dst.protonum) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
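		/* TCP and UDP ports sit at the same offset inside the
		 * l4 protocol union, so reading u.tcp.port below is
		 * valid for UDP tuples as well.
		 */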
		ft->src_port = ctt->src.u.tcp.port;
		ft->dst_port = ctt->dst.u.tcp.port;
		break;
	}
}

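/* Illustrative caller sequence (a sketch, not verbatim from any caller;
 * e.g. the nft_flow_offload expression follows roughly this pattern):
 *
 *	flow = flow_offload_alloc(ct);
 *	if (!flow)
 *		goto err;
 *	flow_offload_route_init(flow, &route);
 *	if (flow_offload_add(flow_table, flow) < 0)
 *		flow_offload_free(flow);
 */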
struct flow_offload *flow_offload_alloc(struct nf_conn *ct)
{
	struct flow_offload *flow;

	if (unlikely(nf_ct_is_dying(ct)))
		return NULL;

	flow = kmem_cache_zalloc(flow_offload_cachep, GFP_ATOMIC);
	if (!flow)
		return NULL;

	refcount_inc(&ct->ct_general.use);
	flow->ct = ct;

	flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
	flow_offload_fill_dir(flow, FLOW_OFFLOAD_DIR_REPLY);

	if (ct->status & IPS_SRC_NAT)
		__set_bit(NF_FLOW_SNAT, &flow->flags);
	if (ct->status & IPS_DST_NAT)
		__set_bit(NF_FLOW_DNAT, &flow->flags);

	return flow;
}
EXPORT_SYMBOL_GPL(flow_offload_alloc);

static u32 flow_offload_dst_cookie(struct flow_offload_tuple *flow_tuple)
{
	if (flow_tuple->l3proto == NFPROTO_IPV6)
		return rt6_get_cookie(dst_rt6_info(flow_tuple->dst_cache));

	return 0;
}

static struct dst_entry *nft_route_dst_fetch(struct nf_flow_route *route,
					     enum flow_offload_tuple_dir dir)
{
	struct dst_entry *dst = route->tuple[dir].dst;

	route->tuple[dir].dst = NULL;

	return dst;
}

static int flow_offload_fill_route(struct flow_offload *flow,
				   struct nf_flow_route *route,
				   enum flow_offload_tuple_dir dir)
{
	struct flow_offload_tuple *flow_tuple = &flow->tuplehash[dir].tuple;
	struct dst_entry *dst = nft_route_dst_fetch(route, dir);
	int i, j = 0;

	switch (flow_tuple->l3proto) {
	case NFPROTO_IPV4:
		flow_tuple->mtu = ip_dst_mtu_maybe_forward(dst, true);
		break;
	case NFPROTO_IPV6:
		flow_tuple->mtu = ip6_dst_mtu_maybe_forward(dst, true);
		break;
	}

	flow_tuple->iifidx = route->tuple[dir].in.ifindex;
	for (i = route->tuple[dir].in.num_encaps - 1; i >= 0; i--) {
		flow_tuple->encap[j].id = route->tuple[dir].in.encap[i].id;
		flow_tuple->encap[j].proto = route->tuple[dir].in.encap[i].proto;
		if (route->tuple[dir].in.ingress_vlans & BIT(i))
			flow_tuple->in_vlan_ingress |= BIT(j);
		j++;
	}

	flow_tuple->tun = route->tuple[dir].in.tun;
	flow_tuple->encap_num = route->tuple[dir].in.num_encaps;
	flow_tuple->needs_gso_segment = route->tuple[dir].out.needs_gso_segment;
	flow_tuple->tun_num = route->tuple[dir].in.num_tuns;

	switch (route->tuple[dir].xmit_type) {
	case FLOW_OFFLOAD_XMIT_DIRECT:
		memcpy(flow_tuple->out.h_dest, route->tuple[dir].out.h_dest,
		       ETH_ALEN);
		memcpy(flow_tuple->out.h_source, route->tuple[dir].out.h_source,
		       ETH_ALEN);
		flow_tuple->out.ifidx = route->tuple[dir].out.ifindex;
		dst_release(dst);
		break;
	case FLOW_OFFLOAD_XMIT_XFRM:
	case FLOW_OFFLOAD_XMIT_NEIGH:
		flow_tuple->ifidx = route->tuple[dir].out.ifindex;
		flow_tuple->dst_cache = dst;
		flow_tuple->dst_cookie = flow_offload_dst_cookie(flow_tuple);
		break;
	default:
		WARN_ON_ONCE(1);
		break;
	}
	flow_tuple->xmit_type = route->tuple[dir].xmit_type;

	return 0;
}

static void nft_flow_dst_release(struct flow_offload *flow,
				 enum flow_offload_tuple_dir dir)
{
	if (flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_NEIGH ||
	    flow->tuplehash[dir].tuple.xmit_type == FLOW_OFFLOAD_XMIT_XFRM)
		dst_release(flow->tuplehash[dir].tuple.dst_cache);
}

void flow_offload_route_init(struct flow_offload *flow,
			     struct nf_flow_route *route)
{
	flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_ORIGINAL);
	flow_offload_fill_route(flow, route, FLOW_OFFLOAD_DIR_REPLY);
	flow->type = NF_FLOW_OFFLOAD_ROUTE;
}
EXPORT_SYMBOL_GPL(flow_offload_route_init);

static inline bool nf_flow_has_expired(const struct flow_offload *flow)
{
	return nf_flow_timeout_delta(flow->timeout) <= 0;
}

static void flow_offload_fixup_tcp(struct nf_conn *ct, u8 tcp_state)
{
	struct ip_ct_tcp *tcp = &ct->proto.tcp;

	spin_lock_bh(&ct->lock);
	if (tcp->state != tcp_state)
		tcp->state = tcp_state;

	/* A SYN packet triggers the TCP reopen case from conntrack. */
	if (tcp->state == TCP_CONNTRACK_CLOSE)
		ct->proto.tcp.seen[0].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;

	/* Conntrack state is outdated due to offload bypass.
	 * Clear IP_CT_TCP_FLAG_MAXACK_SET, otherwise conntrack's
	 * TCP reset validation will fail.
	 */
	tcp->seen[0].td_maxwin = 0;
	tcp->seen[0].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET;
	tcp->seen[1].td_maxwin = 0;
	tcp->seen[1].flags &= ~IP_CT_TCP_FLAG_MAXACK_SET;
	spin_unlock_bh(&ct->lock);
}

static void flow_offload_fixup_ct(struct flow_offload *flow)
{
	struct nf_conn *ct = flow->ct;
	struct net *net = nf_ct_net(ct);
	int l4num = nf_ct_protonum(ct);
	bool expired, closing = false;
	u32 offload_timeout = 0;
	s32 timeout;

	if (l4num == IPPROTO_TCP) {
		const struct nf_tcp_net *tn = nf_tcp_pernet(net);
		u8 tcp_state;

		/* Enter CLOSE state if a fin/rst packet has been seen; this
		 * allows TCP reopen from conntrack. Otherwise, pick up from
		 * the last seen TCP state.
		 */
		closing = test_bit(NF_FLOW_CLOSING, &flow->flags);
		if (closing) {
			flow_offload_fixup_tcp(ct, TCP_CONNTRACK_CLOSE);
			timeout = READ_ONCE(tn->timeouts[TCP_CONNTRACK_CLOSE]);
			expired = false;
		} else {
			tcp_state = READ_ONCE(ct->proto.tcp.state);
			flow_offload_fixup_tcp(ct, tcp_state);
			timeout = READ_ONCE(tn->timeouts[tcp_state]);
			expired = nf_flow_has_expired(flow);
		}
		offload_timeout = READ_ONCE(tn->offload_timeout);

	} else if (l4num == IPPROTO_UDP) {
		const struct nf_udp_net *tn = nf_udp_pernet(net);
		enum udp_conntrack state =
			test_bit(IPS_SEEN_REPLY_BIT, &ct->status) ?
			UDP_CT_REPLIED : UDP_CT_UNREPLIED;

		timeout = READ_ONCE(tn->timeouts[state]);
		expired = nf_flow_has_expired(flow);
		offload_timeout = READ_ONCE(tn->offload_timeout);
	} else {
		return;
	}

	if (expired)
		timeout -= offload_timeout;

	if (timeout < 0)
		timeout = 0;

	if (closing ||
	    nf_flow_timeout_delta(READ_ONCE(ct->timeout)) > (__s32)timeout)
		nf_ct_refresh(ct, timeout);
}

static void flow_offload_route_release(struct flow_offload *flow)
{
	nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_ORIGINAL);
	nft_flow_dst_release(flow, FLOW_OFFLOAD_DIR_REPLY);
}

void flow_offload_free(struct flow_offload *flow)
{
	switch (flow->type) {
	case NF_FLOW_OFFLOAD_ROUTE:
		flow_offload_route_release(flow);
		break;
	default:
		break;
	}
	nf_ct_put(flow->ct);
	kfree_rcu(flow, rcu_head);
}
EXPORT_SYMBOL_GPL(flow_offload_free);

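/* Hashing and comparison only cover the tuple fields in front of the
 * __hash marker. Each flow is inserted twice, once per direction (see
 * flow_offload_add()), so one lookup resolves packets from either side.
 */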
static u32 flow_offload_hash(const void *data, u32 len, u32 seed)
{
	const struct flow_offload_tuple *tuple = data;

	return jhash(tuple, offsetof(struct flow_offload_tuple, __hash), seed);
}

static u32 flow_offload_hash_obj(const void *data, u32 len, u32 seed)
{
	const struct flow_offload_tuple_rhash *tuplehash = data;

	return jhash(&tuplehash->tuple, offsetof(struct flow_offload_tuple, __hash), seed);
}

static int flow_offload_hash_cmp(struct rhashtable_compare_arg *arg,
				 const void *ptr)
{
	const struct flow_offload_tuple *tuple = arg->key;
	const struct flow_offload_tuple_rhash *x = ptr;

	if (memcmp(&x->tuple, tuple, offsetof(struct flow_offload_tuple, __hash)))
		return 1;

	return 0;
}

static const struct rhashtable_params nf_flow_offload_rhash_params = {
	.head_offset		= offsetof(struct flow_offload_tuple_rhash, node),
	.hashfn			= flow_offload_hash,
	.obj_hashfn		= flow_offload_hash_obj,
	.obj_cmpfn		= flow_offload_hash_cmp,
	.automatic_shrinking	= true,
};

unsigned long flow_offload_get_timeout(struct flow_offload *flow)
{
	unsigned long timeout = NF_FLOW_TIMEOUT;
	struct net *net = nf_ct_net(flow->ct);
	int l4num = nf_ct_protonum(flow->ct);

	if (l4num == IPPROTO_TCP) {
		struct nf_tcp_net *tn = nf_tcp_pernet(net);

		timeout = tn->offload_timeout;
	} else if (l4num == IPPROTO_UDP) {
		struct nf_udp_net *tn = nf_udp_pernet(net);

		timeout = tn->offload_timeout;
	}

	return timeout;
}

int flow_offload_add(struct nf_flowtable *flow_table, struct flow_offload *flow)
{
	int err;

	flow->timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow);

	err = rhashtable_insert_fast(&flow_table->rhashtable,
				     &flow->tuplehash[0].node,
				     nf_flow_offload_rhash_params);
	if (err < 0)
		return err;

	err = rhashtable_insert_fast(&flow_table->rhashtable,
				     &flow->tuplehash[1].node,
				     nf_flow_offload_rhash_params);
	if (err < 0) {
		rhashtable_remove_fast(&flow_table->rhashtable,
				       &flow->tuplehash[0].node,
				       nf_flow_offload_rhash_params);
		return err;
	}

	nf_ct_refresh(flow->ct, NF_CT_DAY);

	if (nf_flowtable_hw_offload(flow_table)) {
		__set_bit(NF_FLOW_HW, &flow->flags);
		nf_flow_offload_add(flow_table, flow);
	}

	return 0;
}
EXPORT_SYMBOL_GPL(flow_offload_add);

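/* Called from the packet path: push the flow timeout forward. Unless
 * forced, the write is throttled to roughly once per second to limit
 * cacheline bouncing; closing flows are not re-added to hardware.
 */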
void flow_offload_refresh(struct nf_flowtable *flow_table,
			  struct flow_offload *flow, bool force)
{
	u32 timeout;

	timeout = nf_flowtable_time_stamp + flow_offload_get_timeout(flow);
	if (force || timeout - READ_ONCE(flow->timeout) > HZ)
		WRITE_ONCE(flow->timeout, timeout);
	else
		return;

	if (likely(!nf_flowtable_hw_offload(flow_table)) ||
	    test_bit(NF_FLOW_CLOSING, &flow->flags))
		return;

	nf_flow_offload_add(flow_table, flow);
}
EXPORT_SYMBOL_GPL(flow_offload_refresh);

static void flow_offload_del(struct nf_flowtable *flow_table,
			     struct flow_offload *flow)
{
	rhashtable_remove_fast(&flow_table->rhashtable,
			       &flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].node,
			       nf_flow_offload_rhash_params);
	rhashtable_remove_fast(&flow_table->rhashtable,
			       &flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].node,
			       nf_flow_offload_rhash_params);
	flow_offload_free(flow);
}

void flow_offload_teardown(struct flow_offload *flow)
{
	clear_bit(IPS_OFFLOAD_BIT, &flow->ct->status);
	if (!test_and_set_bit(NF_FLOW_TEARDOWN, &flow->flags))
		flow_offload_fixup_ct(flow);
}
EXPORT_SYMBOL_GPL(flow_offload_teardown);

struct flow_offload_tuple_rhash *
flow_offload_lookup(struct nf_flowtable *flow_table,
		    struct flow_offload_tuple *tuple)
{
	struct flow_offload_tuple_rhash *tuplehash;
	struct flow_offload *flow;
	int dir;

	tuplehash = rhashtable_lookup(&flow_table->rhashtable, tuple,
				      nf_flow_offload_rhash_params);
	if (!tuplehash)
		return NULL;

	dir = tuplehash->tuple.dir;
	flow = container_of(tuplehash, struct flow_offload, tuplehash[dir]);
	if (test_bit(NF_FLOW_TEARDOWN, &flow->flags))
		return NULL;

	if (unlikely(nf_ct_is_dying(flow->ct)))
		return NULL;

	return tuplehash;
}
EXPORT_SYMBOL_GPL(flow_offload_lookup);

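/* Walk the flow table and invoke @iter on each flow. -EAGAIN from the
 * rhashtable walker only signals a concurrent resize and the walk keeps
 * going; any other error aborts it. Reply-direction entries are skipped,
 * so the callback runs once per flow rather than once per direction.
 */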
static int
nf_flow_table_iterate(struct nf_flowtable *flow_table,
		      void (*iter)(struct nf_flowtable *flowtable,
				   struct flow_offload *flow, void *data),
		      void *data)
{
	struct flow_offload_tuple_rhash *tuplehash;
	struct rhashtable_iter hti;
	struct flow_offload *flow;
	int err = 0;

	rhashtable_walk_enter(&flow_table->rhashtable, &hti);
	rhashtable_walk_start(&hti);

	while ((tuplehash = rhashtable_walk_next(&hti))) {
		if (IS_ERR(tuplehash)) {
			if (PTR_ERR(tuplehash) != -EAGAIN) {
				err = PTR_ERR(tuplehash);
				break;
			}
			continue;
		}
		if (tuplehash->tuple.dir)
			continue;

		flow = container_of(tuplehash, struct flow_offload, tuplehash[0]);

		iter(flow_table, flow, data);
	}
	rhashtable_walk_stop(&hti);
	rhashtable_walk_exit(&hti);

	return err;
}

static bool nf_flow_custom_gc(struct nf_flowtable *flow_table,
			      const struct flow_offload *flow)
{
	return flow_table->type->gc && flow_table->type->gc(flow);
}

/**
 * nf_flow_table_tcp_timeout() - new timeout of offloaded tcp entry
 * @ct: Flowtable offloaded tcp ct
 *
 * Return: timeout in jiffies after which the ct entry should expire.
 */
static u32 nf_flow_table_tcp_timeout(const struct nf_conn *ct)
{
	u8 state = READ_ONCE(ct->proto.tcp.state);

	switch (state) {
	case TCP_CONNTRACK_SYN_SENT:
	case TCP_CONNTRACK_SYN_RECV:
		return 0;
	case TCP_CONNTRACK_ESTABLISHED:
		return NF_CT_DAY;
	case TCP_CONNTRACK_FIN_WAIT:
	case TCP_CONNTRACK_CLOSE_WAIT:
	case TCP_CONNTRACK_LAST_ACK:
	case TCP_CONNTRACK_TIME_WAIT:
		return 5 * 60 * HZ;
	case TCP_CONNTRACK_CLOSE:
		return 0;
	}

	return 0;
}

/**
 * nf_flow_table_extend_ct_timeout() - Extend ct timeout of offloaded conntrack entry
 * @ct: Flowtable offloaded ct
 *
 * Datapath lookups in the conntrack table will evict nf_conn entries
 * if they have expired.
 *
 * Once nf_conn entries have been offloaded, nf_conntrack might not see any
 * packets anymore. Thus ct->timeout is no longer refreshed and ct can
 * be evicted.
 *
 * To avoid the need for an additional check on the offload bit for every
 * packet processed via nf_conntrack_in(), set a timeout large enough that
 * the entry never expires; this saves us a check for the IPS_OFFLOAD_BIT
 * from the packet path via nf_ct_is_expired().
 */
static void nf_flow_table_extend_ct_timeout(struct nf_conn *ct)
{
	static const u32 min_timeout = 5 * 60 * HZ;
	u32 expires = nf_ct_expires(ct);

	/* normal case: large enough timeout, nothing to do. */
	if (likely(expires >= min_timeout))
		return;

	/* must check offload bit after this, we do not hold any locks.
	 * flowtable and ct entries could have been removed on another CPU.
	 */
	if (!refcount_inc_not_zero(&ct->ct_general.use))
		return;

	/* load ct->status after refcount increase */
	smp_acquire__after_ctrl_dep();

	if (nf_ct_is_confirmed(ct) &&
	    test_bit(IPS_OFFLOAD_BIT, &ct->status)) {
		u8 l4proto = nf_ct_protonum(ct);
		u32 new_timeout = true;

		switch (l4proto) {
		case IPPROTO_UDP:
			new_timeout = NF_CT_DAY;
			break;
		case IPPROTO_TCP:
			new_timeout = nf_flow_table_tcp_timeout(ct);
			break;
		default:
			WARN_ON_ONCE(1);
			break;
		}

		/* Update to ct->timeout from nf_conntrack happens
		 * without holding ct->lock.
		 *
		 * Use cmpxchg to ensure timeout extension doesn't
		 * happen when we race with conntrack datapath.
		 *
		 * The inverse -- datapath updating ->timeout right
		 * after this -- is fine, datapath is authoritative.
		 */
		if (new_timeout) {
			new_timeout += nfct_time_stamp;
			cmpxchg(&ct->timeout, expires, new_timeout);
		}
	}

	nf_ct_put(ct);
}

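/* One GC pass over a flow: expired, dying or custom-GC'd flows are torn
 * down. A torn-down hardware flow transitions through NF_FLOW_HW_DYING
 * and NF_FLOW_HW_DEAD before its software entry is removed; live flows
 * get their conntrack timeout extended and their hardware stats synced.
 */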
static void nf_flow_offload_gc_step(struct nf_flowtable *flow_table,
				    struct flow_offload *flow, void *data)
{
	bool teardown = test_bit(NF_FLOW_TEARDOWN, &flow->flags);

	if (nf_flow_has_expired(flow) ||
	    nf_ct_is_dying(flow->ct) ||
	    nf_flow_custom_gc(flow_table, flow)) {
		flow_offload_teardown(flow);
		teardown = true;
	} else if (!teardown) {
		nf_flow_table_extend_ct_timeout(flow->ct);
	}

	if (teardown) {
		if (test_bit(NF_FLOW_HW, &flow->flags)) {
			if (!test_bit(NF_FLOW_HW_DYING, &flow->flags))
				nf_flow_offload_del(flow_table, flow);
			else if (test_bit(NF_FLOW_HW_DEAD, &flow->flags))
				flow_offload_del(flow_table, flow);
		} else {
			flow_offload_del(flow_table, flow);
		}
	} else if (test_bit(NF_FLOW_CLOSING, &flow->flags) &&
		   test_bit(NF_FLOW_HW, &flow->flags) &&
		   !test_bit(NF_FLOW_HW_DYING, &flow->flags)) {
		nf_flow_offload_del(flow_table, flow);
	} else if (test_bit(NF_FLOW_HW, &flow->flags)) {
		nf_flow_offload_stats(flow_table, flow);
	}
}

void nf_flow_table_gc_run(struct nf_flowtable *flow_table)
{
	nf_flow_table_iterate(flow_table, nf_flow_offload_gc_step, NULL);
}

static void nf_flow_offload_work_gc(struct work_struct *work)
{
	struct nf_flowtable *flow_table;

	flow_table = container_of(work, struct nf_flowtable, gc_work.work);
	nf_flow_table_gc_run(flow_table);
	queue_delayed_work(system_power_efficient_wq, &flow_table->gc_work, HZ);
}

static void nf_flow_nat_port_tcp(struct sk_buff *skb, unsigned int thoff,
				 __be16 port, __be16 new_port)
{
	struct tcphdr *tcph;

	tcph = (void *)(skb_network_header(skb) + thoff);
	inet_proto_csum_replace2(&tcph->check, skb, port, new_port, false);
}

static void nf_flow_nat_port_udp(struct sk_buff *skb, unsigned int thoff,
				 __be16 port, __be16 new_port)
{
	struct udphdr *udph;

	udph = (void *)(skb_network_header(skb) + thoff);
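	/* A zero UDP checksum means "no checksum" on IPv4, so only adjust
	 * a checksum that is present (or about to be computed); fold a
	 * zero result to CSUM_MANGLED_0 so it still reads as "present".
	 */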
	if (udph->check || skb->ip_summed == CHECKSUM_PARTIAL) {
		inet_proto_csum_replace2(&udph->check, skb, port,
					 new_port, false);
		if (!udph->check)
			udph->check = CSUM_MANGLED_0;
	}
}

static void nf_flow_nat_port(struct sk_buff *skb, unsigned int thoff,
			     u8 protocol, __be16 port, __be16 new_port)
{
	switch (protocol) {
	case IPPROTO_TCP:
		nf_flow_nat_port_tcp(skb, thoff, port, new_port);
		break;
	case IPPROTO_UDP:
		nf_flow_nat_port_udp(skb, thoff, port, new_port);
		break;
	}
}

void nf_flow_snat_port(const struct flow_offload *flow,
		       struct sk_buff *skb, unsigned int thoff,
		       u8 protocol, enum flow_offload_tuple_dir dir)
{
	struct flow_ports *hdr;
	__be16 port, new_port;

	hdr = (void *)(skb_network_header(skb) + thoff);

	switch (dir) {
	case FLOW_OFFLOAD_DIR_ORIGINAL:
		port = hdr->source;
		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.dst_port;
		hdr->source = new_port;
		break;
	case FLOW_OFFLOAD_DIR_REPLY:
		port = hdr->dest;
		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.src_port;
		hdr->dest = new_port;
		break;
	}

	nf_flow_nat_port(skb, thoff, protocol, port, new_port);
}
EXPORT_SYMBOL_GPL(nf_flow_snat_port);

void nf_flow_dnat_port(const struct flow_offload *flow, struct sk_buff *skb,
		       unsigned int thoff, u8 protocol,
		       enum flow_offload_tuple_dir dir)
{
	struct flow_ports *hdr;
	__be16 port, new_port;

	hdr = (void *)(skb_network_header(skb) + thoff);

	switch (dir) {
	case FLOW_OFFLOAD_DIR_ORIGINAL:
		port = hdr->dest;
		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_REPLY].tuple.src_port;
		hdr->dest = new_port;
		break;
	case FLOW_OFFLOAD_DIR_REPLY:
		port = hdr->source;
		new_port = flow->tuplehash[FLOW_OFFLOAD_DIR_ORIGINAL].tuple.dst_port;
		hdr->source = new_port;
		break;
	}

	nf_flow_nat_port(skb, thoff, protocol, port, new_port);
}
EXPORT_SYMBOL_GPL(nf_flow_dnat_port);

int nf_flow_table_init(struct nf_flowtable *flowtable)
{
	int err;

	INIT_DELAYED_WORK(&flowtable->gc_work, nf_flow_offload_work_gc);
	flow_block_init(&flowtable->flow_block);
	init_rwsem(&flowtable->flow_block_lock);

	err = rhashtable_init(&flowtable->rhashtable,
			      &nf_flow_offload_rhash_params);
	if (err < 0)
		return err;

	queue_delayed_work(system_power_efficient_wq,
			   &flowtable->gc_work, HZ);

	mutex_lock(&flowtable_lock);
	list_add(&flowtable->list, &flowtables);
	mutex_unlock(&flowtable_lock);

	return 0;
}
EXPORT_SYMBOL_GPL(nf_flow_table_init);

static void nf_flow_table_do_cleanup(struct nf_flowtable *flow_table,
				     struct flow_offload *flow, void *data)
{
	struct net_device *dev = data;

	if (!dev) {
		flow_offload_teardown(flow);
		return;
	}

	if (net_eq(nf_ct_net(flow->ct), dev_net(dev)) &&
	    (flow->tuplehash[0].tuple.iifidx == dev->ifindex ||
	     flow->tuplehash[1].tuple.iifidx == dev->ifindex))
		flow_offload_teardown(flow);
}

void nf_flow_table_gc_cleanup(struct nf_flowtable *flowtable,
			      struct net_device *dev)
{
	nf_flow_table_iterate(flowtable, nf_flow_table_do_cleanup, dev);
	flush_delayed_work(&flowtable->gc_work);
	nf_flow_table_offload_flush(flowtable);
}

void nf_flow_table_cleanup(struct net_device *dev)
{
	struct nf_flowtable *flowtable;

	mutex_lock(&flowtable_lock);
	list_for_each_entry(flowtable, &flowtables, list)
		nf_flow_table_gc_cleanup(flowtable, dev);
	mutex_unlock(&flowtable_lock);
}
EXPORT_SYMBOL_GPL(nf_flow_table_cleanup);

void nf_flow_table_free(struct nf_flowtable *flow_table)
{
	mutex_lock(&flowtable_lock);
	list_del(&flow_table->list);
	mutex_unlock(&flowtable_lock);

	cancel_delayed_work_sync(&flow_table->gc_work);
	nf_flow_table_offload_flush(flow_table);
	/* ... no more pending work after this stage ... */
	nf_flow_table_iterate(flow_table, nf_flow_table_do_cleanup, NULL);
	nf_flow_table_gc_run(flow_table);
	nf_flow_table_offload_flush_cleanup(flow_table);
	rhashtable_destroy(&flow_table->rhashtable);
}
EXPORT_SYMBOL_GPL(nf_flow_table_free);

static int nf_flow_table_init_net(struct net *net)
{
	net->ft.stat = alloc_percpu(struct nf_flow_table_stat);
	return net->ft.stat ? 0 : -ENOMEM;
}

static void nf_flow_table_fini_net(struct net *net)
{
	free_percpu(net->ft.stat);
}

static int nf_flow_table_pernet_init(struct net *net)
{
	int ret;

	ret = nf_flow_table_init_net(net);
	if (ret < 0)
		return ret;

	ret = nf_flow_table_init_proc(net);
	if (ret < 0)
		goto out_proc;

	return 0;

out_proc:
	nf_flow_table_fini_net(net);
	return ret;
}

static void nf_flow_table_pernet_exit(struct list_head *net_exit_list)
{
	struct net *net;

	list_for_each_entry(net, net_exit_list, exit_list) {
		nf_flow_table_fini_proc(net);
		nf_flow_table_fini_net(net);
	}
}

static struct pernet_operations nf_flow_table_net_ops = {
	.init		= nf_flow_table_pernet_init,
	.exit_batch	= nf_flow_table_pernet_exit,
};

static int __init nf_flow_table_module_init(void)
{
	int ret;

	flow_offload_cachep = KMEM_CACHE(flow_offload, SLAB_HWCACHE_ALIGN);
	if (!flow_offload_cachep)
		return -ENOMEM;

	ret = register_pernet_subsys(&nf_flow_table_net_ops);
	if (ret < 0)
		goto out_pernet;

	ret = nf_flow_table_offload_init();
	if (ret)
		goto out_offload;

	ret = nf_flow_register_bpf();
	if (ret)
		goto out_bpf;

	return 0;

out_bpf:
	nf_flow_table_offload_exit();
out_offload:
	unregister_pernet_subsys(&nf_flow_table_net_ops);
out_pernet:
	kmem_cache_destroy(flow_offload_cachep);
	return ret;
}

static void __exit nf_flow_table_module_exit(void)
{
	nf_flow_table_offload_exit();
	unregister_pernet_subsys(&nf_flow_table_net_ops);
	kmem_cache_destroy(flow_offload_cachep);
}

module_init(nf_flow_table_module_init);
module_exit(nf_flow_table_module_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_DESCRIPTION("Netfilter flow table module");