xref: /linux/net/sched/sch_teql.c (revision 87320be9f0d24fce67631b7eef919f0b79c3e45c)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /* net/sched/sch_teql.c	"True" (or "trivial") link equalizer.
3  *
4  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
5  */
6 
7 #include <linux/module.h>
8 #include <linux/types.h>
9 #include <linux/kernel.h>
10 #include <linux/slab.h>
11 #include <linux/string.h>
12 #include <linux/errno.h>
13 #include <linux/if_arp.h>
14 #include <linux/netdevice.h>
15 #include <linux/init.h>
16 #include <linux/skbuff.h>
17 #include <linux/moduleparam.h>
18 #include <net/dst.h>
19 #include <net/neighbour.h>
20 #include <net/pkt_sched.h>
21 
22 /*
23    How to setup it.
24    ----------------
25 
26    After loading this module you will find a new device teqlN
27    and new qdisc with the same name. To join a slave to the equalizer
28    you should just set this qdisc on a device f.e.
29 
30    # tc qdisc add dev eth0 root teql0
31    # tc qdisc add dev eth1 root teql0
32 
33    That's all. Full PnP 8)
34 
35    Applicability.
36    --------------
37 
38    1. Slave devices MUST be active devices, i.e., they must raise the tbusy
39       signal and generate EOI events. If you want to equalize virtual devices
40       like tunnels, use a normal eql device.
41    2. This device puts no limitations on physical slave characteristics
42       f.e. it will equalize 9600baud line and 100Mb ethernet perfectly :-)
43       Certainly, large difference in link speeds will make the resulting
44       equalized link unusable, because of huge packet reordering.
45       I estimate an upper useful difference as ~10 times.
46    3. If the slave requires address resolution, only protocols using
47       neighbour cache (IPv4/IPv6) will work over the equalized link.
48       Other protocols are still allowed to use the slave device directly,
49       which will not break load balancing, though native slave
50       traffic will have the highest priority.  */
51 
52 struct teql_master {
53 	struct Qdisc_ops qops;
54 	struct net_device *dev;
55 	struct Qdisc __rcu	*slaves;
56 	spinlock_t		slaves_lock; /* serializes writes to ->slaves */
57 	struct list_head master_list;
58 	unsigned long	tx_bytes;
59 	unsigned long	tx_packets;
60 	unsigned long	tx_errors;
61 	unsigned long	tx_dropped;
62 };
63 
64 struct teql_sched_data {
65 	struct Qdisc __rcu	*next;
66 	struct teql_master *m;
67 	struct sk_buff_head q;
68 };
69 
/*
 * ->next of the teql qdisc @q. Expands to an lvalue: it is used both as the
 * source of rcu_dereference*() and as the target of rcu_assign_pointer().
 */
#define NEXT_SLAVE(q) \
	(((struct teql_sched_data *)qdisc_priv(q))->next)

/* Link-type flags the master device copies from / intersects over its slaves. */
#define FMASK	(IFF_BROADCAST | IFF_POINTOPOINT)
73 
74 /* "teql*" qdisc routines */
75 
76 static int
teql_enqueue(struct sk_buff * skb,struct Qdisc * sch,struct sk_buff ** to_free)77 teql_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)
78 {
79 	struct net_device *dev = qdisc_dev(sch);
80 	struct teql_sched_data *q = qdisc_priv(sch);
81 
82 	if (q->q.qlen < READ_ONCE(dev->tx_queue_len)) {
83 		__skb_queue_tail(&q->q, skb);
84 		return NET_XMIT_SUCCESS;
85 	}
86 
87 	return qdisc_drop(skb, sch, to_free);
88 }
89 
90 static struct sk_buff *
teql_dequeue(struct Qdisc * sch)91 teql_dequeue(struct Qdisc *sch)
92 {
93 	struct teql_sched_data *dat = qdisc_priv(sch);
94 	struct netdev_queue *dat_queue;
95 	struct sk_buff *skb;
96 	struct Qdisc *q;
97 
98 	skb = __skb_dequeue(&dat->q);
99 	dat_queue = netdev_get_tx_queue(dat->m->dev, 0);
100 	q = rcu_dereference_bh(dat_queue->qdisc);
101 
102 	if (skb == NULL) {
103 		struct net_device *m = qdisc_dev(q);
104 		if (m) {
105 			spin_lock_bh(&dat->m->slaves_lock);
106 			rcu_assign_pointer(dat->m->slaves, sch);
107 			spin_unlock_bh(&dat->m->slaves_lock);
108 			netif_wake_queue(m);
109 		}
110 	} else {
111 		qdisc_bstats_update(sch, skb);
112 	}
113 	WRITE_ONCE(sch->q.qlen, dat->q.qlen + READ_ONCE(q->q.qlen));
114 	return skb;
115 }
116 
117 static struct sk_buff *
teql_peek(struct Qdisc * sch)118 teql_peek(struct Qdisc *sch)
119 {
120 	/* teql is meant to be used as root qdisc */
121 	return NULL;
122 }
123 
124 static void
teql_reset(struct Qdisc * sch)125 teql_reset(struct Qdisc *sch)
126 {
127 	struct teql_sched_data *dat = qdisc_priv(sch);
128 
129 	skb_queue_purge(&dat->q);
130 }
131 
132 static void
teql_destroy(struct Qdisc * sch)133 teql_destroy(struct Qdisc *sch)
134 {
135 	struct Qdisc *q, *prev;
136 	struct teql_sched_data *dat = qdisc_priv(sch);
137 	struct teql_master *master = dat->m;
138 	struct netdev_queue *txq = NULL;
139 	bool reset_master_queue = false;
140 
141 	if (!master)
142 		return;
143 
144 	spin_lock_bh(&master->slaves_lock);
145 	prev = rcu_dereference_protected(master->slaves,
146 					 lockdep_is_held(&master->slaves_lock));
147 	if (prev) {
148 		do {
149 			struct Qdisc *head, *next;
150 
151 			q = rcu_dereference_protected(NEXT_SLAVE(prev),
152 						      lockdep_is_held(&master->slaves_lock));
153 			if (q != sch) {
154 				prev = q;
155 				continue;
156 			}
157 
158 			next = rcu_dereference_protected(NEXT_SLAVE(q),
159 							 lockdep_is_held(&master->slaves_lock));
160 			rcu_assign_pointer(NEXT_SLAVE(prev), next);
161 
162 			head = rcu_dereference_protected(master->slaves,
163 							 lockdep_is_held(&master->slaves_lock));
164 			if (q == head) {
165 				rcu_assign_pointer(master->slaves, next);
166 				if (q == next) {
167 					txq = netdev_get_tx_queue(master->dev, 0);
168 					rcu_assign_pointer(master->slaves, NULL);
169 					reset_master_queue = true;
170 				}
171 			}
172 			skb_queue_purge(&dat->q);
173 			break;
174 		} while (prev != rcu_dereference_protected(master->slaves,
175 							   lockdep_is_held(&master->slaves_lock)));
176 	}
177 	spin_unlock_bh(&master->slaves_lock);
178 
179 	if (reset_master_queue)
180 		dev_reset_queue(master->dev, txq, NULL);
181 }
182 
teql_qdisc_init(struct Qdisc * sch,struct nlattr * opt,struct netlink_ext_ack * extack)183 static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt,
184 			   struct netlink_ext_ack *extack)
185 {
186 	struct net_device *dev = qdisc_dev(sch);
187 	struct teql_master *m = (struct teql_master *)sch->ops;
188 	struct teql_sched_data *q = qdisc_priv(sch);
189 	struct Qdisc *first;
190 
191 	if (dev->hard_header_len > m->dev->hard_header_len)
192 		return -EINVAL;
193 
194 	if (m->dev == dev)
195 		return -ELOOP;
196 
197 	if (sch->parent != TC_H_ROOT) {
198 		NL_SET_ERR_MSG_MOD(extack, "teql can only be used as root");
199 		return -EOPNOTSUPP;
200 	}
201 
202 	q->m = m;
203 
204 	skb_queue_head_init(&q->q);
205 
206 	spin_lock_bh(&m->slaves_lock);
207 	first = rcu_dereference_protected(m->slaves, lockdep_is_held(&m->slaves_lock));
208 	if (first) {
209 		if (m->dev->flags & IFF_UP) {
210 			if ((m->dev->flags & IFF_POINTOPOINT &&
211 			     !(dev->flags & IFF_POINTOPOINT)) ||
212 			    (m->dev->flags & IFF_BROADCAST &&
213 			     !(dev->flags & IFF_BROADCAST)) ||
214 			    (m->dev->flags & IFF_MULTICAST &&
215 			     !(dev->flags & IFF_MULTICAST)) ||
216 			    dev->mtu < m->dev->mtu) {
217 				spin_unlock_bh(&m->slaves_lock);
218 				return -EINVAL;
219 			}
220 		} else {
221 			if (!(dev->flags&IFF_POINTOPOINT))
222 				m->dev->flags &= ~IFF_POINTOPOINT;
223 			if (!(dev->flags&IFF_BROADCAST))
224 				m->dev->flags &= ~IFF_BROADCAST;
225 			if (!(dev->flags&IFF_MULTICAST))
226 				m->dev->flags &= ~IFF_MULTICAST;
227 			if (dev->mtu < m->dev->mtu)
228 				m->dev->mtu = dev->mtu;
229 		}
230 		rcu_assign_pointer(q->next,
231 				   rcu_dereference_protected(NEXT_SLAVE(first),
232 							     lockdep_is_held(&m->slaves_lock)));
233 		rcu_assign_pointer(NEXT_SLAVE(first), sch);
234 	} else {
235 		rcu_assign_pointer(q->next, sch);
236 		rcu_assign_pointer(m->slaves, sch);
237 		m->dev->mtu = dev->mtu;
238 		m->dev->flags = (m->dev->flags&~FMASK)|(dev->flags&FMASK);
239 	}
240 	spin_unlock_bh(&m->slaves_lock);
241 	return 0;
242 }
243 
244 
245 static int
__teql_resolve(struct sk_buff * skb,struct sk_buff * skb_res,struct net_device * dev,struct netdev_queue * txq,struct dst_entry * dst)246 __teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res,
247 	       struct net_device *dev, struct netdev_queue *txq,
248 	       struct dst_entry *dst)
249 {
250 	struct neighbour *n;
251 	int err = 0;
252 
253 	n = dst_neigh_lookup_skb(dst, skb);
254 	if (!n)
255 		return -ENOENT;
256 
257 	if (dst->dev != dev) {
258 		struct neighbour *mn;
259 
260 		mn = __neigh_lookup_errno(n->tbl, n->primary_key, dev);
261 		neigh_release(n);
262 		if (IS_ERR(mn))
263 			return PTR_ERR(mn);
264 		n = mn;
265 	}
266 
267 	if (neigh_event_send(n, skb_res) == 0) {
268 		int err;
269 		char haddr[MAX_ADDR_LEN];
270 
271 		neigh_ha_snapshot(haddr, n, dev);
272 		err = dev_hard_header(skb, dev, ntohs(skb_protocol(skb, false)),
273 				      haddr, NULL, skb->len);
274 
275 		if (err < 0)
276 			err = -EINVAL;
277 	} else {
278 		err = (skb_res == NULL) ? -EAGAIN : 1;
279 	}
280 	neigh_release(n);
281 	return err;
282 }
283 
teql_resolve(struct sk_buff * skb,struct sk_buff * skb_res,struct net_device * dev,struct netdev_queue * txq)284 static inline int teql_resolve(struct sk_buff *skb,
285 			       struct sk_buff *skb_res,
286 			       struct net_device *dev,
287 			       struct netdev_queue *txq)
288 {
289 	struct dst_entry *dst = skb_dst(skb);
290 	int res;
291 
292 	if (rcu_access_pointer(txq->qdisc) == &noop_qdisc)
293 		return -ENODEV;
294 
295 	if (!dev->header_ops || !dst)
296 		return 0;
297 
298 	rcu_read_lock();
299 	res = __teql_resolve(skb, skb_res, dev, txq, dst);
300 	rcu_read_unlock();
301 
302 	return res;
303 }
304 
teql_master_xmit(struct sk_buff * skb,struct net_device * dev)305 static netdev_tx_t teql_master_xmit(struct sk_buff *skb, struct net_device *dev)
306 {
307 	struct teql_master *master = netdev_priv(dev);
308 	struct Qdisc *start, *q;
309 	int busy;
310 	int nores;
311 	int subq = skb_get_queue_mapping(skb);
312 	struct sk_buff *skb_res = NULL;
313 
314 	rcu_read_lock_bh();
315 
316 	start = rcu_dereference_bh(master->slaves);
317 
318 restart:
319 	nores = 0;
320 	busy = 0;
321 
322 	q = start;
323 	if (!q)
324 		goto drop;
325 
326 	do {
327 		struct net_device *slave = qdisc_dev(q);
328 		struct netdev_queue *slave_txq = netdev_get_tx_queue(slave, 0);
329 
330 		if (rcu_access_pointer(slave_txq->qdisc_sleeping) != q)
331 			continue;
332 		if (netif_xmit_stopped(netdev_get_tx_queue(slave, subq)) ||
333 		    !netif_running(slave)) {
334 			busy = 1;
335 			continue;
336 		}
337 
338 		switch (teql_resolve(skb, skb_res, slave, slave_txq)) {
339 		case 0:
340 			if (__netif_tx_trylock(slave_txq)) {
341 				unsigned int length = qdisc_pkt_len(skb);
342 
343 				skb->dev = slave;
344 				if (!netif_xmit_frozen_or_stopped(slave_txq) &&
345 				    netdev_start_xmit(skb, slave, slave_txq, false) ==
346 				    NETDEV_TX_OK) {
347 					__netif_tx_unlock(slave_txq);
348 					spin_lock_bh(&master->slaves_lock);
349 					if (rcu_dereference_protected(master->slaves,
350 								      lockdep_is_held(&master->slaves_lock)) == q)
351 						rcu_assign_pointer(master->slaves,
352 								   rcu_dereference_protected(NEXT_SLAVE(q),
353 											     lockdep_is_held(&master->slaves_lock)));
354 					spin_unlock_bh(&master->slaves_lock);
355 					netif_wake_queue(dev);
356 					master->tx_packets++;
357 					master->tx_bytes += length;
358 					rcu_read_unlock_bh();
359 					return NETDEV_TX_OK;
360 				}
361 				__netif_tx_unlock(slave_txq);
362 			}
363 			if (netif_xmit_stopped(netdev_get_tx_queue(dev, 0)))
364 				busy = 1;
365 			break;
366 		case 1:
367 			spin_lock_bh(&master->slaves_lock);
368 			if (rcu_dereference_protected(master->slaves,
369 						      lockdep_is_held(&master->slaves_lock)) == q)
370 				rcu_assign_pointer(master->slaves,
371 						   rcu_dereference_protected(NEXT_SLAVE(q),
372 									     lockdep_is_held(&master->slaves_lock)));
373 			spin_unlock_bh(&master->slaves_lock);
374 			rcu_read_unlock_bh();
375 			return NETDEV_TX_OK;
376 		default:
377 			nores = 1;
378 			break;
379 		}
380 		__skb_pull(skb, skb_network_offset(skb));
381 	} while ((q = rcu_dereference_bh(NEXT_SLAVE(q))) != start);
382 
383 	if (nores && skb_res == NULL) {
384 		skb_res = skb;
385 		goto restart;
386 	}
387 
388 	if (busy) {
389 		netif_stop_queue(dev);
390 		rcu_read_unlock_bh();
391 		return NETDEV_TX_BUSY;
392 	}
393 	master->tx_errors++;
394 
395 drop:
396 	master->tx_dropped++;
397 	rcu_read_unlock_bh();
398 	dev_kfree_skb(skb);
399 	return NETDEV_TX_OK;
400 }
401 
teql_master_open(struct net_device * dev)402 static int teql_master_open(struct net_device *dev)
403 {
404 	struct Qdisc *q, *first;
405 	struct teql_master *m = netdev_priv(dev);
406 	int mtu = 0xFFFE;
407 	unsigned int flags = IFF_NOARP | IFF_MULTICAST;
408 
409 	first = rtnl_dereference(m->slaves);
410 	if (!first)
411 		return -EUNATCH;
412 
413 	flags = FMASK;
414 
415 	q = first;
416 	do {
417 		struct net_device *slave = qdisc_dev(q);
418 
419 		if (slave == NULL)
420 			return -EUNATCH;
421 
422 		if (slave->mtu < mtu)
423 			mtu = slave->mtu;
424 		if (slave->hard_header_len > LL_MAX_HEADER)
425 			return -EINVAL;
426 
427 		/* If all the slaves are BROADCAST, master is BROADCAST
428 		   If all the slaves are PtP, master is PtP
429 		   Otherwise, master is NBMA.
430 		 */
431 		if (!(slave->flags&IFF_POINTOPOINT))
432 			flags &= ~IFF_POINTOPOINT;
433 		if (!(slave->flags&IFF_BROADCAST))
434 			flags &= ~IFF_BROADCAST;
435 		if (!(slave->flags&IFF_MULTICAST))
436 			flags &= ~IFF_MULTICAST;
437 	} while ((q = rtnl_dereference(NEXT_SLAVE(q))) != first);
438 
439 	m->dev->mtu = mtu;
440 	m->dev->flags = (m->dev->flags&~FMASK) | flags;
441 	netif_start_queue(m->dev);
442 	return 0;
443 }
444 
/* Take the master device down: just stop its transmit queue. Always 0. */
static int teql_master_close(struct net_device *dev)
{
	netif_stop_queue(dev);

	return 0;
}
450 
teql_master_stats64(struct net_device * dev,struct rtnl_link_stats64 * stats)451 static void teql_master_stats64(struct net_device *dev,
452 				struct rtnl_link_stats64 *stats)
453 {
454 	struct teql_master *m = netdev_priv(dev);
455 
456 	stats->tx_packets	= m->tx_packets;
457 	stats->tx_bytes		= m->tx_bytes;
458 	stats->tx_errors	= m->tx_errors;
459 	stats->tx_dropped	= m->tx_dropped;
460 }
461 
teql_master_mtu(struct net_device * dev,int new_mtu)462 static int teql_master_mtu(struct net_device *dev, int new_mtu)
463 {
464 	struct teql_master *m = netdev_priv(dev);
465 	struct Qdisc *q, *first;
466 
467 	first = rtnl_dereference(m->slaves);
468 	q = first;
469 	if (q) {
470 		do {
471 			if (new_mtu > qdisc_dev(q)->mtu)
472 				return -EINVAL;
473 		} while ((q = rtnl_dereference(NEXT_SLAVE(q))) != first);
474 	}
475 
476 	WRITE_ONCE(dev->mtu, new_mtu);
477 	return 0;
478 }
479 
480 static const struct net_device_ops teql_netdev_ops = {
481 	.ndo_open	= teql_master_open,
482 	.ndo_stop	= teql_master_close,
483 	.ndo_start_xmit	= teql_master_xmit,
484 	.ndo_get_stats64 = teql_master_stats64,
485 	.ndo_change_mtu	= teql_master_mtu,
486 };
487 
teql_master_setup(struct net_device * dev)488 static __init void teql_master_setup(struct net_device *dev)
489 {
490 	struct teql_master *master = netdev_priv(dev);
491 	struct Qdisc_ops *ops = &master->qops;
492 
493 	spin_lock_init(&master->slaves_lock);
494 	master->dev	= dev;
495 	ops->priv_size  = sizeof(struct teql_sched_data);
496 
497 	ops->enqueue	=	teql_enqueue;
498 	ops->dequeue	=	teql_dequeue;
499 	ops->peek	=	teql_peek;
500 	ops->init	=	teql_qdisc_init;
501 	ops->reset	=	teql_reset;
502 	ops->destroy	=	teql_destroy;
503 	ops->owner	=	THIS_MODULE;
504 
505 	dev->netdev_ops =       &teql_netdev_ops;
506 	dev->type		= ARPHRD_VOID;
507 	dev->mtu		= 1500;
508 	dev->min_mtu		= 68;
509 	dev->max_mtu		= 65535;
510 	dev->tx_queue_len	= 100;
511 	dev->flags		= IFF_NOARP;
512 	dev->hard_header_len	= LL_MAX_HEADER;
513 	netif_keep_dst(dev);
514 }
515 
516 static LIST_HEAD(master_dev_list);
517 static int max_equalizers = 1;
518 module_param(max_equalizers, int, 0);
519 MODULE_PARM_DESC(max_equalizers, "Max number of link equalizers");
520 
teql_init(void)521 static int __init teql_init(void)
522 {
523 	int i;
524 	int err = -ENODEV;
525 
526 	for (i = 0; i < max_equalizers; i++) {
527 		struct net_device *dev;
528 		struct teql_master *master;
529 
530 		dev = alloc_netdev(sizeof(struct teql_master), "teql%d",
531 				   NET_NAME_UNKNOWN, teql_master_setup);
532 		if (!dev) {
533 			err = -ENOMEM;
534 			break;
535 		}
536 
537 		if ((err = register_netdev(dev))) {
538 			free_netdev(dev);
539 			break;
540 		}
541 
542 		master = netdev_priv(dev);
543 
544 		strscpy(master->qops.id, dev->name, IFNAMSIZ);
545 		err = register_qdisc(&master->qops);
546 
547 		if (err) {
548 			unregister_netdev(dev);
549 			free_netdev(dev);
550 			break;
551 		}
552 
553 		list_add_tail(&master->master_list, &master_dev_list);
554 	}
555 	return i ? 0 : err;
556 }
557 
teql_exit(void)558 static void __exit teql_exit(void)
559 {
560 	struct teql_master *master, *nxt;
561 
562 	list_for_each_entry_safe(master, nxt, &master_dev_list, master_list) {
563 
564 		list_del(&master->master_list);
565 
566 		unregister_qdisc(&master->qops);
567 		unregister_netdev(master->dev);
568 		free_netdev(master->dev);
569 	}
570 }
571 
572 module_init(teql_init);
573 module_exit(teql_exit);
574 
575 MODULE_LICENSE("GPL");
576 MODULE_DESCRIPTION("True (or trivial) link equalizer qdisc");
577