// SPDX-License-Identifier: GPL-2.0
/*
 * Management Component Transport Protocol (MCTP) - routing
 * implementation.
 *
 * This is currently based on a simple routing table, with no dst cache. The
 * number of routes should stay fairly small, so the lookup cost is small.
 *
 * Copyright (c) 2021 Code Construct
 * Copyright (c) 2021 Google
 */

#include <linux/idr.h>
#include <linux/kconfig.h>
#include <linux/mctp.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff.h>

#include <uapi/linux/if_arp.h>

#include <net/mctp.h>
#include <net/mctpdevice.h>
#include <net/netlink.h>
#include <net/sock.h>

#include <trace/events/mctp.h>

static const unsigned int mctp_message_maxlen = 64 * 1024;
static const unsigned long mctp_key_lifetime = 6 * CONFIG_HZ;

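/* For reference when reading the flag/tag handling below: the final byte of
 * struct mctp_hdr (flags_seq_tag) packs several fields. An illustrative
 * sketch of the layout, matching the MCTP_HDR_* masks in <net/mctp.h>:
 *
 *   bit 7    SOM (MCTP_HDR_FLAG_SOM): first packet of a message
 *   bit 6    EOM (MCTP_HDR_FLAG_EOM): last packet of a message
 *   bits 5:4 packet sequence (MCTP_HDR_SEQ_MASK << MCTP_HDR_SEQ_SHIFT)
 *   bit 3    TO (MCTP_HDR_FLAG_TO): tag-owner
 *   bits 2:0 message tag (MCTP_HDR_TAG_MASK)
 */
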
static void mctp_flow_prepare_output(struct sk_buff *skb, struct mctp_dev *dev);

/* route output callbacks */
static int mctp_route_discard(struct mctp_route *route, struct sk_buff *skb)
{
	kfree_skb(skb);
	return 0;
}

static struct mctp_sock *mctp_lookup_bind(struct net *net, struct sk_buff *skb)
{
	struct mctp_skb_cb *cb = mctp_cb(skb);
	struct mctp_hdr *mh;
	struct sock *sk;
	u8 type;

	WARN_ON(!rcu_read_lock_held());

	/* TODO: look up in skb->cb? */
	mh = mctp_hdr(skb);

	if (!skb_headlen(skb))
		return NULL;

	type = (*(u8 *)skb->data) & 0x7f;

	sk_for_each_rcu(sk, &net->mctp.binds) {
		struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk);

		if (msk->bind_net != MCTP_NET_ANY && msk->bind_net != cb->net)
			continue;

		if (msk->bind_type != type)
			continue;

		if (!mctp_address_matches(msk->bind_addr, mh->dest))
			continue;

		return msk;
	}

	return NULL;
}

/* A note on the key allocations.
 *
 * struct net->mctp.keys contains our set of currently-allocated keys for
 * MCTP tag management. The lookup tuple for these is the peer EID,
 * local EID and MCTP tag.
 *
 * In some cases, the peer EID may be MCTP_ADDR_ANY: for example, when a
 * broadcast message is sent, we may receive responses from any peer EID.
 * Because the broadcast dest address is equivalent to ANY, we create
 * a key with (local = local-eid, peer = ANY). This allows a match on the
 * incoming broadcast responses from any peer.
 *
 * We perform lookups when packets are received, and when tags are allocated
 * in two scenarios:
 *
 *  - when a packet is sent, with a locally-owned tag: we need to find an
 *    unused tag value for the (local, peer) EID pair.
 *
 *  - when a tag is manually allocated: we need to find an unused tag value
 *    for the peer EID, but don't have a specific local EID at that stage.
 *
 * In the latter case, on successful allocation, we end up with a tag with
 * (local = ANY, peer = peer-eid).
 *
 * So, the key set allows both a local EID of ANY and a peer EID of ANY in
 * the lookup tuple. Both may be ANY if we prealloc for a broadcast.
 * The matching (in mctp_key_match()) during lookup allows the match value to
 * be ANY in either the dest or source addresses.
 *
 * When allocating (+ inserting) a tag, we need to check for conflicts amongst
 * the existing tag set. This requires matching either exactly on the local
 * and peer addresses, or either being ANY.
 */
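
/* A worked example of the above (EID and tag values illustrative only):
 * broadcasting a request from local EID 8 with tag 1 creates the key
 * (local = 8, peer = ANY, tag = 1). A response then arrives from peer EID 9
 * as (dest = 8, src = 9, tag = 1, TO clear), and matches that key through
 * the ANY peer address in mctp_key_match().
 */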

static bool mctp_key_match(struct mctp_sk_key *key, mctp_eid_t local,
			   mctp_eid_t peer, u8 tag)
{
	if (!mctp_address_matches(key->local_addr, local))
		return false;

	if (!mctp_address_matches(key->peer_addr, peer))
		return false;

	if (key->tag != tag)
		return false;

	return true;
}

/* returns a key (with key->lock held, and refcounted), or NULL if no such
 * key exists.
 */
static struct mctp_sk_key *mctp_lookup_key(struct net *net, struct sk_buff *skb,
					   mctp_eid_t peer,
					   unsigned long *irqflags)
	__acquires(&key->lock)
{
	struct mctp_sk_key *key, *ret;
	unsigned long flags;
	struct mctp_hdr *mh;
	u8 tag;

	mh = mctp_hdr(skb);
	tag = mh->flags_seq_tag & (MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO);

	ret = NULL;
	spin_lock_irqsave(&net->mctp.keys_lock, flags);

	hlist_for_each_entry(key, &net->mctp.keys, hlist) {
		if (!mctp_key_match(key, mh->dest, peer, tag))
			continue;

		spin_lock(&key->lock);
		if (key->valid) {
			refcount_inc(&key->refs);
			ret = key;
			break;
		}
		spin_unlock(&key->lock);
	}

	if (ret) {
		spin_unlock(&net->mctp.keys_lock);
		*irqflags = flags;
	} else {
		spin_unlock_irqrestore(&net->mctp.keys_lock, flags);
	}

	return ret;
}
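
/* A sketch of the expected caller pattern for mctp_lookup_key(), mirroring
 * its use in mctp_route_input() below:
 *
 *	key = mctp_lookup_key(net, skb, mh->src, &f);
 *	if (key) {
 *		... use the key, with key->lock held ...
 *		spin_unlock_irqrestore(&key->lock, f);
 *		mctp_key_unref(key);
 *	}
 */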

static struct mctp_sk_key *mctp_key_alloc(struct mctp_sock *msk,
					  mctp_eid_t local, mctp_eid_t peer,
					  u8 tag, gfp_t gfp)
{
	struct mctp_sk_key *key;

	key = kzalloc(sizeof(*key), gfp);
	if (!key)
		return NULL;

	key->peer_addr = peer;
	key->local_addr = local;
	key->tag = tag;
	key->sk = &msk->sk;
	key->valid = true;
	spin_lock_init(&key->lock);
	refcount_set(&key->refs, 1);
	sock_hold(key->sk);

	return key;
}

void mctp_key_unref(struct mctp_sk_key *key)
{
	unsigned long flags;

	if (!refcount_dec_and_test(&key->refs))
		return;

	/* even though no refs exist here, the lock allows us to stay
	 * consistent with the locking requirement of mctp_dev_release_key
	 */
	spin_lock_irqsave(&key->lock, flags);
	mctp_dev_release_key(key->dev, key);
	spin_unlock_irqrestore(&key->lock, flags);

	sock_put(key->sk);
	kfree(key);
}

static int mctp_key_add(struct mctp_sk_key *key, struct mctp_sock *msk)
{
	struct net *net = sock_net(&msk->sk);
	struct mctp_sk_key *tmp;
	unsigned long flags;
	int rc = 0;

	spin_lock_irqsave(&net->mctp.keys_lock, flags);

	if (sock_flag(&msk->sk, SOCK_DEAD)) {
		rc = -EINVAL;
		goto out_unlock;
	}

	hlist_for_each_entry(tmp, &net->mctp.keys, hlist) {
		if (mctp_key_match(tmp, key->local_addr, key->peer_addr,
				   key->tag)) {
			spin_lock(&tmp->lock);
			if (tmp->valid)
				rc = -EEXIST;
			spin_unlock(&tmp->lock);
			if (rc)
				break;
		}
	}

	if (!rc) {
		refcount_inc(&key->refs);
		key->expiry = jiffies + mctp_key_lifetime;
		timer_reduce(&msk->key_expiry, key->expiry);

		hlist_add_head(&key->hlist, &net->mctp.keys);
		hlist_add_head(&key->sklist, &msk->keys);
	}

out_unlock:
	spin_unlock_irqrestore(&net->mctp.keys_lock, flags);

	return rc;
}

/* Helper for mctp_route_input().
 * We're done with the key; unlock and unref the key.
 * For the usual case of automatic expiry we remove the key from lists.
 * In the case that manual allocation is set on a key we release the lock
 * and local ref, reset reassembly, but don't remove from lists.
 */
static void __mctp_key_done_in(struct mctp_sk_key *key, struct net *net,
			       unsigned long flags, unsigned long reason)
__releases(&key->lock)
{
	struct sk_buff *skb;

	trace_mctp_key_release(key, reason);
	skb = key->reasm_head;
	key->reasm_head = NULL;

	if (!key->manual_alloc) {
		key->reasm_dead = true;
		key->valid = false;
		mctp_dev_release_key(key->dev, key);
	}
	spin_unlock_irqrestore(&key->lock, flags);

	if (!key->manual_alloc) {
		spin_lock_irqsave(&net->mctp.keys_lock, flags);
		if (!hlist_unhashed(&key->hlist)) {
			hlist_del_init(&key->hlist);
			hlist_del_init(&key->sklist);
			mctp_key_unref(key);
		}
		spin_unlock_irqrestore(&net->mctp.keys_lock, flags);
	}

	/* and one for the local reference */
	mctp_key_unref(key);

	kfree_skb(skb);
}

#ifdef CONFIG_MCTP_FLOWS
static void mctp_skb_set_flow(struct sk_buff *skb, struct mctp_sk_key *key)
{
	struct mctp_flow *flow;

	flow = skb_ext_add(skb, SKB_EXT_MCTP);
	if (!flow)
		return;

	refcount_inc(&key->refs);
	flow->key = key;
}

static void mctp_flow_prepare_output(struct sk_buff *skb, struct mctp_dev *dev)
{
	struct mctp_sk_key *key;
	struct mctp_flow *flow;

	flow = skb_ext_find(skb, SKB_EXT_MCTP);
	if (!flow)
		return;

	key = flow->key;

	if (WARN_ON(key->dev && key->dev != dev))
		return;

	mctp_dev_set_key(dev, key);
}
#else
static void mctp_skb_set_flow(struct sk_buff *skb, struct mctp_sk_key *key) {}
static void mctp_flow_prepare_output(struct sk_buff *skb, struct mctp_dev *dev) {}
#endif

static int mctp_frag_queue(struct mctp_sk_key *key, struct sk_buff *skb)
{
	struct mctp_hdr *hdr = mctp_hdr(skb);
	u8 exp_seq, this_seq;

	this_seq = (hdr->flags_seq_tag >> MCTP_HDR_SEQ_SHIFT)
		& MCTP_HDR_SEQ_MASK;

	if (!key->reasm_head) {
		key->reasm_head = skb;
		key->reasm_tailp = &(skb_shinfo(skb)->frag_list);
		key->last_seq = this_seq;
		return 0;
	}

	exp_seq = (key->last_seq + 1) & MCTP_HDR_SEQ_MASK;

	if (this_seq != exp_seq)
		return -EINVAL;

	if (key->reasm_head->len + skb->len > mctp_message_maxlen)
		return -EINVAL;

	skb->next = NULL;
	skb->sk = NULL;
	*key->reasm_tailp = skb;
	key->reasm_tailp = &skb->next;

	key->last_seq = this_seq;

	key->reasm_head->data_len += skb->len;
	key->reasm_head->len += skb->len;
	key->reasm_head->truesize += skb->truesize;

	return 0;
}
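
/* An example of the sequence check above: packet sequence numbers are two
 * bits wide, so a message fragmented into five packets carries the sequence
 * 0, 1, 2, 3, 0; once last_seq is 3, exp_seq wraps back to
 * (3 + 1) & MCTP_HDR_SEQ_MASK == 0.
 */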

static int mctp_route_input(struct mctp_route *route, struct sk_buff *skb)
{
	struct mctp_sk_key *key, *any_key = NULL;
	struct net *net = dev_net(skb->dev);
	struct mctp_sock *msk;
	struct mctp_hdr *mh;
	unsigned long f;
	u8 tag, flags;
	int rc;

	msk = NULL;
	rc = -EINVAL;

	/* we may be receiving a locally-routed packet; drop source sk
	 * accounting
	 */
	skb_orphan(skb);

	/* ensure we have enough data for a header and a type */
	if (skb->len < sizeof(struct mctp_hdr) + 1)
		goto out;

	/* grab header, advance data ptr */
	mh = mctp_hdr(skb);
	skb_pull(skb, sizeof(struct mctp_hdr));

	if (mh->ver != 1)
		goto out;

	flags = mh->flags_seq_tag & (MCTP_HDR_FLAG_SOM | MCTP_HDR_FLAG_EOM);
	tag = mh->flags_seq_tag & (MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO);

	rcu_read_lock();

	/* lookup socket / reasm context, exactly matching (src, dest, tag).
	 * On success, we hold a ref on the key, with key->lock held.
	 */
	key = mctp_lookup_key(net, skb, mh->src, &f);

	if (flags & MCTP_HDR_FLAG_SOM) {
		if (key) {
			msk = container_of(key->sk, struct mctp_sock, sk);
		} else {
			/* first response to a broadcast? do a more general
			 * key lookup to find the socket, but don't use this
			 * key for reassembly - we'll create a more specific
			 * one for future packets if required (ie, !EOM).
			 *
			 * this lookup requires key->peer to be MCTP_ADDR_ANY;
			 * it doesn't match just any key->peer.
			 */
			any_key = mctp_lookup_key(net, skb, MCTP_ADDR_ANY, &f);
			if (any_key) {
				msk = container_of(any_key->sk,
						   struct mctp_sock, sk);
				spin_unlock_irqrestore(&any_key->lock, f);
			}
		}

		if (!key && !msk && (tag & MCTP_HDR_FLAG_TO))
			msk = mctp_lookup_bind(net, skb);

		if (!msk) {
			rc = -ENOENT;
			goto out_unlock;
		}

		/* single-packet message? deliver to socket, clean up any
		 * pending key.
		 */
		if (flags & MCTP_HDR_FLAG_EOM) {
			sock_queue_rcv_skb(&msk->sk, skb);
			if (key) {
				/* we've hit a pending reassembly; not much we
				 * can do but drop it
				 */
				__mctp_key_done_in(key, net, f,
						   MCTP_TRACE_KEY_REPLIED);
				key = NULL;
			}
			rc = 0;
			goto out_unlock;
		}

		/* broadcast response or a bind() - create a key for further
		 * packets for this message
		 */
		if (!key) {
			key = mctp_key_alloc(msk, mh->dest, mh->src,
					     tag, GFP_ATOMIC);
			if (!key) {
				rc = -ENOMEM;
				goto out_unlock;
			}

			/* we can queue without the key lock here, as the
			 * key isn't observable yet
			 */
			mctp_frag_queue(key, skb);

			/* if the key_add fails, we've raced with another
			 * SOM packet with the same src, dest and tag. There's
			 * no way to distinguish future packets, so all we
			 * can do is drop; we'll free the skb on exit from
			 * this function.
			 */
			rc = mctp_key_add(key, msk);
			if (!rc)
				trace_mctp_key_acquire(key);

			/* we don't need to release key->lock on exit, so
			 * clean up here and suppress the unlock via
			 * setting to NULL
			 */
			mctp_key_unref(key);
			key = NULL;

		} else {
			if (key->reasm_head || key->reasm_dead) {
				/* duplicate start? drop everything */
				__mctp_key_done_in(key, net, f,
						   MCTP_TRACE_KEY_INVALIDATED);
				rc = -EEXIST;
				key = NULL;
			} else {
				rc = mctp_frag_queue(key, skb);
			}
		}

	} else if (key) {
		/* this packet continues a previous message; reassemble
		 * using the message-specific key
		 */

		/* we need to be continuing an existing reassembly... */
		if (!key->reasm_head)
			rc = -EINVAL;
		else
			rc = mctp_frag_queue(key, skb);

		/* end of message? deliver to socket, and we're done with
		 * the reassembly/response key
		 */
		if (!rc && flags & MCTP_HDR_FLAG_EOM) {
			sock_queue_rcv_skb(key->sk, key->reasm_head);
			key->reasm_head = NULL;
			__mctp_key_done_in(key, net, f, MCTP_TRACE_KEY_REPLIED);
			key = NULL;
		}

	} else {
		/* not a start, no matching key */
		rc = -ENOENT;
	}

out_unlock:
	rcu_read_unlock();
	if (key) {
		spin_unlock_irqrestore(&key->lock, f);
		mctp_key_unref(key);
	}
	if (any_key)
		mctp_key_unref(any_key);
out:
	if (rc)
		kfree_skb(skb);
	return rc;
}

static unsigned int mctp_route_mtu(struct mctp_route *rt)
{
	return rt->mtu ?: READ_ONCE(rt->dev->dev->mtu);
}

static int mctp_route_output(struct mctp_route *route, struct sk_buff *skb)
{
	struct mctp_skb_cb *cb = mctp_cb(skb);
	struct mctp_hdr *hdr = mctp_hdr(skb);
	char daddr_buf[MAX_ADDR_LEN];
	char *daddr = NULL;
	unsigned int mtu;
	int rc;

	skb->protocol = htons(ETH_P_MCTP);

	mtu = READ_ONCE(skb->dev->mtu);
	if (skb->len > mtu) {
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	if (cb->ifindex) {
		/* direct route; use the hwaddr we stashed in sendmsg */
		if (cb->halen != skb->dev->addr_len) {
			/* sanity check, sendmsg should have already caught this */
			kfree_skb(skb);
			return -EMSGSIZE;
		}
		daddr = cb->haddr;
	} else {
		/* If lookup fails let the device handle daddr==NULL */
		if (mctp_neigh_lookup(route->dev, hdr->dest, daddr_buf) == 0)
			daddr = daddr_buf;
	}

	rc = dev_hard_header(skb, skb->dev, ntohs(skb->protocol),
			     daddr, skb->dev->dev_addr, skb->len);
	if (rc < 0) {
		kfree_skb(skb);
		return -EHOSTUNREACH;
	}

	mctp_flow_prepare_output(skb, route->dev);

	rc = dev_queue_xmit(skb);
	if (rc)
		rc = net_xmit_errno(rc);

	return rc;
}

/* route alloc/release */
static void mctp_route_release(struct mctp_route *rt)
{
	if (refcount_dec_and_test(&rt->refs)) {
		mctp_dev_put(rt->dev);
		kfree_rcu(rt, rcu);
	}
}

/* returns a route with the refcount at 1 */
static struct mctp_route *mctp_route_alloc(void)
{
	struct mctp_route *rt;

	rt = kzalloc(sizeof(*rt), GFP_KERNEL);
	if (!rt)
		return NULL;

	INIT_LIST_HEAD(&rt->list);
	refcount_set(&rt->refs, 1);
	rt->output = mctp_route_discard;

	return rt;
}

unsigned int mctp_default_net(struct net *net)
{
	return READ_ONCE(net->mctp.default_net);
}

int mctp_default_net_set(struct net *net, unsigned int index)
{
	if (index == 0)
		return -EINVAL;
	WRITE_ONCE(net->mctp.default_net, index);
	return 0;
}

/* tag management */
static void mctp_reserve_tag(struct net *net, struct mctp_sk_key *key,
			     struct mctp_sock *msk)
{
	struct netns_mctp *mns = &net->mctp;

	lockdep_assert_held(&mns->keys_lock);

	key->expiry = jiffies + mctp_key_lifetime;
	timer_reduce(&msk->key_expiry, key->expiry);

	/* we hold the net's keys_lock here, allowing updates to both
	 * the net and sk lists
	 */
	hlist_add_head_rcu(&key->hlist, &mns->keys);
	hlist_add_head_rcu(&key->sklist, &msk->keys);
	refcount_inc(&key->refs);
}

/* Allocate a locally-owned tag value for (local, peer), and reserve
 * it for the socket msk
 */
struct mctp_sk_key *mctp_alloc_local_tag(struct mctp_sock *msk,
					 mctp_eid_t local, mctp_eid_t peer,
					 bool manual, u8 *tagp)
{
	struct net *net = sock_net(&msk->sk);
	struct netns_mctp *mns = &net->mctp;
	struct mctp_sk_key *key, *tmp;
	unsigned long flags;
	u8 tagbits;

	/* for NULL destination EIDs, we may get a response from any peer */
	if (peer == MCTP_ADDR_NULL)
		peer = MCTP_ADDR_ANY;

	/* be optimistic, alloc now */
	key = mctp_key_alloc(msk, local, peer, 0, GFP_KERNEL);
	if (!key)
		return ERR_PTR(-ENOMEM);

	/* 8 possible tag values */
	tagbits = 0xff;

	spin_lock_irqsave(&mns->keys_lock, flags);

	/* Walk through the existing keys, looking for potential conflicting
	 * tags. If we find a conflict, clear that bit from tagbits
	 */
	hlist_for_each_entry(tmp, &mns->keys, hlist) {
		/* We can check the lookup fields (*_addr, tag) without the
		 * lock held, they don't change over the lifetime of the key.
		 */

		/* if we don't own the tag, it can't conflict */
		if (tmp->tag & MCTP_HDR_FLAG_TO)
			continue;

		/* Since we're avoiding conflicting entries, match peer and
		 * local addresses, including with a wildcard on ANY. See
		 * 'A note on key allocations' for background.
		 */
		if (peer != MCTP_ADDR_ANY &&
		    !mctp_address_matches(tmp->peer_addr, peer))
			continue;

		if (local != MCTP_ADDR_ANY &&
		    !mctp_address_matches(tmp->local_addr, local))
			continue;

		spin_lock(&tmp->lock);
		/* key must still be valid. If we find a match, clear the
		 * potential tag value
		 */
		if (tmp->valid)
			tagbits &= ~(1 << tmp->tag);
		spin_unlock(&tmp->lock);

		if (!tagbits)
			break;
	}

	if (tagbits) {
		key->tag = __ffs(tagbits);
		mctp_reserve_tag(net, key, msk);
		trace_mctp_key_acquire(key);

		key->manual_alloc = manual;
		*tagp = key->tag;
	}

	spin_unlock_irqrestore(&mns->keys_lock, flags);

	if (!tagbits) {
		kfree(key);
		return ERR_PTR(-EBUSY);
	}

	return key;
}
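
/* An illustrative run of the conflict scan above: if valid locally-owned
 * keys already hold tags 0 and 3 for this (local, peer) pair, we end up
 * with tagbits == 0xff & ~((1 << 0) | (1 << 3)) == 0xf6, and __ffs(0xf6)
 * allocates tag 1, the lowest free value.
 */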

static struct mctp_sk_key *mctp_lookup_prealloc_tag(struct mctp_sock *msk,
						    mctp_eid_t daddr,
						    u8 req_tag, u8 *tagp)
{
	struct net *net = sock_net(&msk->sk);
	struct netns_mctp *mns = &net->mctp;
	struct mctp_sk_key *key, *tmp;
	unsigned long flags;

	req_tag &= ~(MCTP_TAG_PREALLOC | MCTP_TAG_OWNER);
	key = NULL;

	spin_lock_irqsave(&mns->keys_lock, flags);

	hlist_for_each_entry(tmp, &mns->keys, hlist) {
		if (tmp->tag != req_tag)
			continue;

		if (!mctp_address_matches(tmp->peer_addr, daddr))
			continue;

		if (!tmp->manual_alloc)
			continue;

		spin_lock(&tmp->lock);
		if (tmp->valid) {
			key = tmp;
			refcount_inc(&key->refs);
			spin_unlock(&tmp->lock);
			break;
		}
		spin_unlock(&tmp->lock);
	}
	spin_unlock_irqrestore(&mns->keys_lock, flags);

	if (!key)
		return ERR_PTR(-ENOENT);

	if (tagp)
		*tagp = key->tag;

	return key;
}
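
/* Keys with manual_alloc set are created outside the send path (via the
 * SIOCMCTPALLOCTAG ioctl, handled in af_mctp.c); the lookup above is what
 * ties a later sendmsg() using MCTP_TAG_PREALLOC back to that reservation.
 */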

/* routing lookups */
static bool mctp_rt_match_eid(struct mctp_route *rt,
			      unsigned int net, mctp_eid_t eid)
{
	return READ_ONCE(rt->dev->net) == net &&
		rt->min <= eid && rt->max >= eid;
}

/* compares match, used for duplicate prevention */
static bool mctp_rt_compare_exact(struct mctp_route *rt1,
				  struct mctp_route *rt2)
{
	ASSERT_RTNL();
	return rt1->dev->net == rt2->dev->net &&
		rt1->min == rt2->min &&
		rt1->max == rt2->max;
}

struct mctp_route *mctp_route_lookup(struct net *net, unsigned int dnet,
				     mctp_eid_t daddr)
{
	struct mctp_route *tmp, *rt = NULL;

	rcu_read_lock();

	list_for_each_entry_rcu(tmp, &net->mctp.routes, list) {
		/* TODO: add metrics */
		if (mctp_rt_match_eid(tmp, dnet, daddr)) {
			if (refcount_inc_not_zero(&tmp->refs)) {
				rt = tmp;
				break;
			}
		}
	}

	rcu_read_unlock();

	return rt;
}

static struct mctp_route *mctp_route_lookup_null(struct net *net,
						 struct net_device *dev)
{
	struct mctp_route *tmp, *rt = NULL;

	rcu_read_lock();

	list_for_each_entry_rcu(tmp, &net->mctp.routes, list) {
		if (tmp->dev->dev == dev && tmp->type == RTN_LOCAL &&
		    refcount_inc_not_zero(&tmp->refs)) {
			rt = tmp;
			break;
		}
	}

	rcu_read_unlock();

	return rt;
}

static int mctp_do_fragment_route(struct mctp_route *rt, struct sk_buff *skb,
				  unsigned int mtu, u8 tag)
{
	const unsigned int hlen = sizeof(struct mctp_hdr);
	struct mctp_hdr *hdr, *hdr2;
	unsigned int pos, size, headroom;
	struct sk_buff *skb2;
	int rc;
	u8 seq;

	hdr = mctp_hdr(skb);
	seq = 0;
	rc = 0;

	if (mtu < hlen + 1) {
		kfree_skb(skb);
		return -EMSGSIZE;
	}

	/* keep same headroom as the original skb */
	headroom = skb_headroom(skb);

	/* we've got the header */
	skb_pull(skb, hlen);

	for (pos = 0; pos < skb->len;) {
		/* size of message payload */
		size = min(mtu - hlen, skb->len - pos);

		skb2 = alloc_skb(headroom + hlen + size, GFP_KERNEL);
		if (!skb2) {
			rc = -ENOMEM;
			break;
		}

		/* generic skb copy */
		skb2->protocol = skb->protocol;
		skb2->priority = skb->priority;
		skb2->dev = skb->dev;
		memcpy(skb2->cb, skb->cb, sizeof(skb2->cb));

		if (skb->sk)
			skb_set_owner_w(skb2, skb->sk);

		/* establish packet */
		skb_reserve(skb2, headroom);
		skb_reset_network_header(skb2);
		skb_put(skb2, hlen + size);
		skb2->transport_header = skb2->network_header + hlen;

		/* copy header fields, calculate SOM/EOM flags & seq */
		hdr2 = mctp_hdr(skb2);
		hdr2->ver = hdr->ver;
		hdr2->dest = hdr->dest;
		hdr2->src = hdr->src;
		hdr2->flags_seq_tag = tag &
			(MCTP_HDR_TAG_MASK | MCTP_HDR_FLAG_TO);

		if (pos == 0)
			hdr2->flags_seq_tag |= MCTP_HDR_FLAG_SOM;

		if (pos + size == skb->len)
			hdr2->flags_seq_tag |= MCTP_HDR_FLAG_EOM;

		hdr2->flags_seq_tag |= seq << MCTP_HDR_SEQ_SHIFT;

		/* copy message payload */
		skb_copy_bits(skb, pos, skb_transport_header(skb2), size);

		/* do route */
		rc = rt->output(rt, skb2);
		if (rc)
			break;

		seq = (seq + 1) & MCTP_HDR_SEQ_MASK;
		pos += size;
	}

	consume_skb(skb);
	return rc;
}
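
/* A worked example of the fragmentation above, with illustrative numbers:
 * for a 200-byte payload and an MTU of 68, hlen is 4 (sizeof(struct
 * mctp_hdr)), so each fragment carries up to 64 bytes of payload. That
 * yields four packets - three full fragments (SOM set on the first) and a
 * final 8-byte fragment with EOM set - with sequence numbers 0, 1, 2, 3.
 */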

int mctp_local_output(struct sock *sk, struct mctp_route *rt,
		      struct sk_buff *skb, mctp_eid_t daddr, u8 req_tag)
{
	struct mctp_sock *msk = container_of(sk, struct mctp_sock, sk);
	struct mctp_skb_cb *cb = mctp_cb(skb);
	struct mctp_route tmp_rt = {0};
	struct mctp_sk_key *key;
	struct mctp_hdr *hdr;
	unsigned long flags;
	unsigned int mtu;
	mctp_eid_t saddr;
	bool ext_rt;
	int rc;
	u8 tag;

	rc = -ENODEV;

	if (rt) {
		ext_rt = false;
		if (WARN_ON(!rt->dev))
			goto out_release;

	} else if (cb->ifindex) {
		struct net_device *dev;

		ext_rt = true;
		rt = &tmp_rt;

		rcu_read_lock();
		dev = dev_get_by_index_rcu(sock_net(sk), cb->ifindex);
		if (!dev) {
			rcu_read_unlock();
			return rc;
		}
		rt->dev = __mctp_dev_get(dev);
		rcu_read_unlock();

		if (!rt->dev)
			goto out_release;

		/* establish temporary route - we set up enough to keep
		 * mctp_route_output happy
		 */
		rt->output = mctp_route_output;
		rt->mtu = 0;

	} else {
		return -EINVAL;
	}

	spin_lock_irqsave(&rt->dev->addrs_lock, flags);
	if (rt->dev->num_addrs == 0) {
		rc = -EHOSTUNREACH;
	} else {
		/* use the outbound interface's first address as our source */
		saddr = rt->dev->addrs[0];
		rc = 0;
	}
	spin_unlock_irqrestore(&rt->dev->addrs_lock, flags);

	if (rc)
		goto out_release;

	if (req_tag & MCTP_TAG_OWNER) {
		if (req_tag & MCTP_TAG_PREALLOC)
			key = mctp_lookup_prealloc_tag(msk, daddr,
						       req_tag, &tag);
		else
			key = mctp_alloc_local_tag(msk, saddr, daddr,
						   false, &tag);

		if (IS_ERR(key)) {
			rc = PTR_ERR(key);
			goto out_release;
		}
		mctp_skb_set_flow(skb, key);
		/* done with the key in this scope */
		mctp_key_unref(key);
		tag |= MCTP_HDR_FLAG_TO;
	} else {
		key = NULL;
		tag = req_tag & MCTP_TAG_MASK;
	}

	skb->protocol = htons(ETH_P_MCTP);
	skb->priority = 0;
	skb_reset_transport_header(skb);
	skb_push(skb, sizeof(struct mctp_hdr));
	skb_reset_network_header(skb);
	skb->dev = rt->dev->dev;

	/* cb->net will have been set on initial ingress */
	cb->src = saddr;

	/* set up common header fields */
	hdr = mctp_hdr(skb);
	hdr->ver = 1;
	hdr->dest = daddr;
	hdr->src = saddr;

	mtu = mctp_route_mtu(rt);

	if (skb->len + sizeof(struct mctp_hdr) <= mtu) {
		hdr->flags_seq_tag = MCTP_HDR_FLAG_SOM |
			MCTP_HDR_FLAG_EOM | tag;
		rc = rt->output(rt, skb);
	} else {
		rc = mctp_do_fragment_route(rt, skb, mtu, tag);
	}

out_release:
	if (!ext_rt)
		mctp_route_release(rt);

	mctp_dev_put(tmp_rt.dev);

	return rc;
}

/* route management */
static int mctp_route_add(struct mctp_dev *mdev, mctp_eid_t daddr_start,
			  unsigned int daddr_extent, unsigned int mtu,
			  unsigned char type)
{
	int (*rtfn)(struct mctp_route *rt, struct sk_buff *skb);
	struct net *net = dev_net(mdev->dev);
	struct mctp_route *rt, *ert;

	if (!mctp_address_unicast(daddr_start))
		return -EINVAL;

	if (daddr_extent > 0xff || daddr_start + daddr_extent >= 255)
		return -EINVAL;

	switch (type) {
	case RTN_LOCAL:
		rtfn = mctp_route_input;
		break;
	case RTN_UNICAST:
		rtfn = mctp_route_output;
		break;
	default:
		return -EINVAL;
	}

	rt = mctp_route_alloc();
	if (!rt)
		return -ENOMEM;

	rt->min = daddr_start;
	rt->max = daddr_start + daddr_extent;
	rt->mtu = mtu;
	rt->dev = mdev;
	mctp_dev_hold(rt->dev);
	rt->type = type;
	rt->output = rtfn;

	ASSERT_RTNL();
	/* Prevent duplicate identical routes. */
	list_for_each_entry(ert, &net->mctp.routes, list) {
		if (mctp_rt_compare_exact(rt, ert)) {
			mctp_route_release(rt);
			return -EEXIST;
		}
	}

	list_add_rcu(&rt->list, &net->mctp.routes);

	return 0;
}

static int mctp_route_remove(struct mctp_dev *mdev, mctp_eid_t daddr_start,
			     unsigned int daddr_extent, unsigned char type)
{
	struct net *net = dev_net(mdev->dev);
	struct mctp_route *rt, *tmp;
	mctp_eid_t daddr_end;
	bool dropped;

	if (daddr_extent > 0xff || daddr_start + daddr_extent >= 255)
		return -EINVAL;

	daddr_end = daddr_start + daddr_extent;
	dropped = false;

	ASSERT_RTNL();

	list_for_each_entry_safe(rt, tmp, &net->mctp.routes, list) {
		if (rt->dev == mdev &&
		    rt->min == daddr_start && rt->max == daddr_end &&
		    rt->type == type) {
			list_del_rcu(&rt->list);
			/* TODO: immediate RTM_DELROUTE */
			mctp_route_release(rt);
			dropped = true;
		}
	}

	return dropped ? 0 : -ENOENT;
}

int mctp_route_add_local(struct mctp_dev *mdev, mctp_eid_t addr)
{
	return mctp_route_add(mdev, addr, 0, 0, RTN_LOCAL);
}

int mctp_route_remove_local(struct mctp_dev *mdev, mctp_eid_t addr)
{
	return mctp_route_remove(mdev, addr, 0, RTN_LOCAL);
}

/* removes all entries for a given device */
void mctp_route_remove_dev(struct mctp_dev *mdev)
{
	struct net *net = dev_net(mdev->dev);
	struct mctp_route *rt, *tmp;

	ASSERT_RTNL();
	list_for_each_entry_safe(rt, tmp, &net->mctp.routes, list) {
		if (rt->dev == mdev) {
			list_del_rcu(&rt->list);
			/* TODO: immediate RTM_DELROUTE */
			mctp_route_release(rt);
		}
	}
}

/* Incoming packet-handling */

static int mctp_pkttype_receive(struct sk_buff *skb, struct net_device *dev,
				struct packet_type *pt,
				struct net_device *orig_dev)
{
	struct net *net = dev_net(dev);
	struct mctp_dev *mdev;
	struct mctp_skb_cb *cb;
	struct mctp_route *rt;
	struct mctp_hdr *mh;

	rcu_read_lock();
	mdev = __mctp_dev_get(dev);
	rcu_read_unlock();

	/* basic non-data sanity checks */
	if (!mdev)
		goto err_drop;

	if (!pskb_may_pull(skb, sizeof(struct mctp_hdr)))
		goto err_drop;

	skb_reset_transport_header(skb);
	skb_reset_network_header(skb);

	/* We have enough for a header; decode and route */
	mh = mctp_hdr(skb);
	if (mh->ver < MCTP_VER_MIN || mh->ver > MCTP_VER_MAX)
		goto err_drop;

	/* source must be valid unicast or null; drop reserved ranges and
	 * broadcast
	 */
	if (!(mctp_address_unicast(mh->src) || mctp_address_null(mh->src)))
		goto err_drop;

	/* dest address: as above, but allow broadcast */
	if (!(mctp_address_unicast(mh->dest) || mctp_address_null(mh->dest) ||
	      mctp_address_broadcast(mh->dest)))
		goto err_drop;

	/* MCTP drivers must populate halen/haddr */
	if (dev->type == ARPHRD_MCTP) {
		cb = mctp_cb(skb);
	} else {
		cb = __mctp_cb(skb);
		cb->halen = 0;
	}
	cb->net = READ_ONCE(mdev->net);
	cb->ifindex = dev->ifindex;

	rt = mctp_route_lookup(net, cb->net, mh->dest);

	/* NULL EID, but addressed to our physical address */
	if (!rt && mh->dest == MCTP_ADDR_NULL && skb->pkt_type == PACKET_HOST)
		rt = mctp_route_lookup_null(net, dev);

	if (!rt)
		goto err_drop;

	rt->output(rt, skb);
	mctp_route_release(rt);
	mctp_dev_put(mdev);

	return NET_RX_SUCCESS;

err_drop:
	kfree_skb(skb);
	mctp_dev_put(mdev);
	return NET_RX_DROP;
}

static struct packet_type mctp_packet_type = {
	.type = cpu_to_be16(ETH_P_MCTP),
	.func = mctp_pkttype_receive,
};

/* netlink interface */

static const struct nla_policy rta_mctp_policy[RTA_MAX + 1] = {
	[RTA_DST]		= { .type = NLA_U8 },
	[RTA_METRICS]		= { .type = NLA_NESTED },
	[RTA_OIF]		= { .type = NLA_U32 },
};

/* Common part for RTM_NEWROUTE and RTM_DELROUTE parsing.
 * tb must hold RTA_MAX+1 elements.
 */
static int mctp_route_nlparse(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack,
			      struct nlattr **tb, struct rtmsg **rtm,
			      struct mctp_dev **mdev, mctp_eid_t *daddr_start)
{
	struct net *net = sock_net(skb->sk);
	struct net_device *dev;
	unsigned int ifindex;
	int rc;

	rc = nlmsg_parse(nlh, sizeof(struct rtmsg), tb, RTA_MAX,
			 rta_mctp_policy, extack);
	if (rc < 0) {
		NL_SET_ERR_MSG(extack, "incorrect format");
		return rc;
	}

	if (!tb[RTA_DST]) {
		NL_SET_ERR_MSG(extack, "dst EID missing");
		return -EINVAL;
	}
	*daddr_start = nla_get_u8(tb[RTA_DST]);

	if (!tb[RTA_OIF]) {
		NL_SET_ERR_MSG(extack, "ifindex missing");
		return -EINVAL;
	}
	ifindex = nla_get_u32(tb[RTA_OIF]);

	*rtm = nlmsg_data(nlh);
	if ((*rtm)->rtm_family != AF_MCTP) {
		NL_SET_ERR_MSG(extack, "route family must be AF_MCTP");
		return -EINVAL;
	}

	dev = __dev_get_by_index(net, ifindex);
	if (!dev) {
		NL_SET_ERR_MSG(extack, "bad ifindex");
		return -ENODEV;
	}
	*mdev = mctp_dev_get_rtnl(dev);
	if (!*mdev)
		return -ENODEV;

	if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack, "no routes to loopback");
		return -EINVAL;
	}

	return 0;
}

static const struct nla_policy rta_metrics_policy[RTAX_MAX + 1] = {
	[RTAX_MTU]		= { .type = NLA_U32 },
};

static int mctp_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			 struct netlink_ext_ack *extack)
{
	struct nlattr *tb[RTA_MAX + 1];
	struct nlattr *tbx[RTAX_MAX + 1];
	mctp_eid_t daddr_start;
	struct mctp_dev *mdev;
	struct rtmsg *rtm;
	unsigned int mtu;
	int rc;

	rc = mctp_route_nlparse(skb, nlh, extack, tb,
				&rtm, &mdev, &daddr_start);
	if (rc < 0)
		return rc;

	if (rtm->rtm_type != RTN_UNICAST) {
		NL_SET_ERR_MSG(extack, "rtm_type must be RTN_UNICAST");
		return -EINVAL;
	}

	mtu = 0;
	if (tb[RTA_METRICS]) {
		rc = nla_parse_nested(tbx, RTAX_MAX, tb[RTA_METRICS],
				      rta_metrics_policy, NULL);
		if (rc < 0)
			return rc;
		if (tbx[RTAX_MTU])
			mtu = nla_get_u32(tbx[RTAX_MTU]);
	}

	rc = mctp_route_add(mdev, daddr_start, rtm->rtm_dst_len, mtu,
			    rtm->rtm_type);
	return rc;
}
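
/* Note the interpretation of rtm_dst_len above: it is an EID range extent
 * rather than a prefix length (see the comment in mctp_fill_rtinfo() below).
 * For example, a request with RTA_DST = 8 and rtm_dst_len = 3 creates a
 * route covering EIDs 8 through 11.
 */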

static int mctp_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			 struct netlink_ext_ack *extack)
{
	struct nlattr *tb[RTA_MAX + 1];
	mctp_eid_t daddr_start;
	struct mctp_dev *mdev;
	struct rtmsg *rtm;
	int rc;

	rc = mctp_route_nlparse(skb, nlh, extack, tb,
				&rtm, &mdev, &daddr_start);
	if (rc < 0)
		return rc;

	/* we only have unicast routes */
	if (rtm->rtm_type != RTN_UNICAST)
		return -EINVAL;

	rc = mctp_route_remove(mdev, daddr_start, rtm->rtm_dst_len, RTN_UNICAST);
	return rc;
}

static int mctp_fill_rtinfo(struct sk_buff *skb, struct mctp_route *rt,
			    u32 portid, u32 seq, int event, unsigned int flags)
{
	struct nlmsghdr *nlh;
	struct rtmsg *hdr;
	void *metrics;

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*hdr), flags);
	if (!nlh)
		return -EMSGSIZE;

	hdr = nlmsg_data(nlh);
	hdr->rtm_family = AF_MCTP;

	/* we use the _len fields as a number of EIDs, rather than
	 * a number of bits in the address
	 */
	hdr->rtm_dst_len = rt->max - rt->min;
	hdr->rtm_src_len = 0;
	hdr->rtm_tos = 0;
	hdr->rtm_table = RT_TABLE_DEFAULT;
	hdr->rtm_protocol = RTPROT_STATIC; /* everything is user-defined */
	hdr->rtm_scope = RT_SCOPE_LINK; /* TODO: scope in mctp_route? */
	hdr->rtm_type = rt->type;

	if (nla_put_u8(skb, RTA_DST, rt->min))
		goto cancel;

	metrics = nla_nest_start_noflag(skb, RTA_METRICS);
	if (!metrics)
		goto cancel;

	if (rt->mtu) {
		if (nla_put_u32(skb, RTAX_MTU, rt->mtu))
			goto cancel;
	}

	nla_nest_end(skb, metrics);

	if (rt->dev) {
		if (nla_put_u32(skb, RTA_OIF, rt->dev->dev->ifindex))
			goto cancel;
	}

	/* TODO: conditional neighbour physaddr? */

	nlmsg_end(skb, nlh);

	return 0;

cancel:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

static int mctp_dump_rtinfo(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	struct mctp_route *rt;
	int s_idx, idx;

	/* TODO: allow filtering on route data, possibly under
	 * cb->strict_check
	 */

	/* TODO: change to struct overlay */
	s_idx = cb->args[0];
	idx = 0;

	rcu_read_lock();
	list_for_each_entry_rcu(rt, &net->mctp.routes, list) {
		if (idx++ < s_idx)
			continue;
		if (mctp_fill_rtinfo(skb, rt,
				     NETLINK_CB(cb->skb).portid,
				     cb->nlh->nlmsg_seq,
				     RTM_NEWROUTE, NLM_F_MULTI) < 0)
			break;
	}

	rcu_read_unlock();
	cb->args[0] = idx;

	return skb->len;
}

/* net namespace implementation */
static int __net_init mctp_routes_net_init(struct net *net)
{
	struct netns_mctp *ns = &net->mctp;

	INIT_LIST_HEAD(&ns->routes);
	INIT_HLIST_HEAD(&ns->binds);
	mutex_init(&ns->bind_lock);
	INIT_HLIST_HEAD(&ns->keys);
	spin_lock_init(&ns->keys_lock);
	WARN_ON(mctp_default_net_set(net, MCTP_INITIAL_DEFAULT_NET));
	return 0;
}

static void __net_exit mctp_routes_net_exit(struct net *net)
{
	struct mctp_route *rt;

	rcu_read_lock();
	list_for_each_entry_rcu(rt, &net->mctp.routes, list)
		mctp_route_release(rt);
	rcu_read_unlock();
}

static struct pernet_operations mctp_net_ops = {
	.init = mctp_routes_net_init,
	.exit = mctp_routes_net_exit,
};

int __init mctp_routes_init(void)
{
	dev_add_pack(&mctp_packet_type);

	rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_GETROUTE,
			     NULL, mctp_dump_rtinfo, 0);
	rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_NEWROUTE,
			     mctp_newroute, NULL, 0);
	rtnl_register_module(THIS_MODULE, PF_MCTP, RTM_DELROUTE,
			     mctp_delroute, NULL, 0);

	return register_pernet_subsys(&mctp_net_ops);
}

void mctp_routes_exit(void)
{
	unregister_pernet_subsys(&mctp_net_ops);
	rtnl_unregister(PF_MCTP, RTM_DELROUTE);
	rtnl_unregister(PF_MCTP, RTM_NEWROUTE);
	rtnl_unregister(PF_MCTP, RTM_GETROUTE);
	dev_remove_pack(&mctp_packet_type);
}

#if IS_ENABLED(CONFIG_MCTP_TEST)
#include "test/route-test.c"
#endif