xref: /freebsd/sys/netlink/netlink_message_writer.c (revision ba3c1f5972d7b90feb6e6da47905ff2757e0fe57)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2022 Alexander V. Chernikov <melifaro@FreeBSD.org>
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
17  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
21  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 #include "opt_netlink.h"
29 
30 #include <sys/cdefs.h>
31 __FBSDID("$FreeBSD$");
32 #include <sys/param.h>
33 #include <sys/malloc.h>
34 #include <sys/lock.h>
35 #include <sys/rmlock.h>
36 #include <sys/mbuf.h>
37 #include <sys/ck.h>
38 #include <sys/socket.h>
39 #include <sys/socketvar.h>
40 #include <sys/syslog.h>
41 
42 #include <netlink/netlink.h>
43 #include <netlink/netlink_ctl.h>
44 #include <netlink/netlink_linux.h>
45 #include <netlink/netlink_var.h>
46 
47 #define	DEBUG_MOD_NAME	nl_writer
48 #define	DEBUG_MAX_LEVEL	LOG_DEBUG3
49 #include <netlink/netlink_debug.h>
50 _DECLARE_DEBUG(LOG_INFO);
51 
52 /*
53  * The goal of this file is to provide convenient message writing KPI on top of
54  * different storage methods (mbufs, uio, temporary memory chunks).
55  *
 * The main KPI guarantee is that the (last) message always resides in a contiguous
 *  memory buffer, so one is able to update the header after writing the entire message.
 *
 * This guarantee comes with a side effect of potentially reallocating the underlying
 *  buffer, so one needs to refresh any saved pointers into the message after
 *  something is added to it.
62  *
63  * Messaging layer contains hooks performing transparent Linux translation for the messages.
64  *
65  * There are 3 types of supported targets:
66  *  * socket (adds mbufs to the socket buffer, used for message replies)
67  *  * group (sends mbuf/chain to the specified groups, used for the notifications)
68  *  * chain (returns mbuf chain, used in Linux message translation code)
69  *
70  * There are 3 types of storage:
71  * * NS_WRITER_TYPE_MBUF (mbuf-based, most efficient, used when a single message
72  *    fits in NLMBUFSIZE)
73  * * NS_WRITER_TYPE_BUF (fallback, malloc-based, used when a single message needs
74  *    to be larger than one supported by NS_WRITER_TYPE_MBUF)
75  * * NS_WRITER_TYPE_LBUF (malloc-based, similar to NS_WRITER_TYPE_BUF, used for
76  *    Linux sockets, calls translation hook prior to sending messages to the socket).
77  *
78  * Internally, KPI switches between different types of storage when memory requirements
79  *  change. It happens transparently to the caller.
80  */
81 
/*
 * UMA zone for the mbuf-based Netlink storage.
 * Created by nl_init_msg_zone() with NLMBUFSIZE-byte items and destroyed by
 * nl_destroy_msg_zone().
 */
static uma_zone_t	nlmsg_zone;
86 
87 static void
88 nl_free_mbuf_storage(struct mbuf *m)
89 {
90 	uma_zfree(nlmsg_zone, m->m_ext.ext_buf);
91 }
92 
93 static int
94 nl_setup_mbuf_storage(void *mem, int size, void *arg, int how __unused)
95 {
96 	struct mbuf *m = (struct mbuf *)arg;
97 
98 	if (m != NULL)
99 		m_extadd(m, mem, size, nl_free_mbuf_storage, NULL, NULL, 0, EXT_MOD_TYPE);
100 
101 	return (0);
102 }
103 
104 static struct mbuf *
105 nl_get_mbuf_flags(int size, int malloc_flags, int mbuf_flags)
106 {
107 	struct mbuf *m, *m_storage;
108 
109 	if (size <= MHLEN)
110 		return (m_get2(size, malloc_flags, MT_DATA, mbuf_flags));
111 
112 	if (__predict_false(size > NLMBUFSIZE))
113 		return (NULL);
114 
115 	m = m_gethdr(malloc_flags, MT_DATA);
116 	if (m == NULL)
117 		return (NULL);
118 
119 	m_storage = uma_zalloc_arg(nlmsg_zone, m, malloc_flags);
120 	if (m_storage == NULL) {
121 		m_free_raw(m);
122 		return (NULL);
123 	}
124 
125 	return (m);
126 }
127 
128 static struct mbuf *
129 nl_get_mbuf(int size, int malloc_flags)
130 {
131 	return (nl_get_mbuf_flags(size, malloc_flags, M_PKTHDR));
132 }
133 
134 /*
135  * Gets a chain of Netlink mbufs.
136  * This is strip-down version of m_getm2()
137  */
138 static struct mbuf *
139 nl_get_mbuf_chain(int len, int malloc_flags)
140 {
141 	struct mbuf *m_chain = NULL, *m_tail = NULL;
142 	int mbuf_flags = M_PKTHDR;
143 
144 	while (len > 0) {
145 		int sz = len > NLMBUFSIZE ? NLMBUFSIZE: len;
146 		struct mbuf *m = nl_get_mbuf_flags(sz, malloc_flags, mbuf_flags);
147 
148 		if (m == NULL) {
149 			m_freem(m_chain);
150 			return (NULL);
151 		}
152 
153 		/* Book keeping. */
154 		len -= M_SIZE(m);
155 		if (m_tail != NULL)
156 			m_tail->m_next = m;
157 		else
158 			m_chain = m;
159 		m_tail = m;
160 		mbuf_flags &= ~M_PKTHDR;	/* Only valid on the first mbuf. */
161 	}
162 
163 	return (m_chain);
164 }
165 
166 void
167 nl_init_msg_zone(void)
168 {
169 	nlmsg_zone = uma_zcreate("netlink", NLMBUFSIZE, nl_setup_mbuf_storage,
170 	    NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
171 }
172 
/* Destroys the UMA zone created by nl_init_msg_zone(). */
void
nl_destroy_msg_zone(void)
{
	uma_zdestroy(nlmsg_zone);
}
178 
179 
/*
 * Storage backend operations.
 * init:  allocates storage for @size bytes and resets the writer state.
 * write: hands the filled storage @buf (@buflen bytes, @cnt messages)
 *        over to the writer target.
 */
typedef bool nlwriter_op_init(struct nl_writer *nw, int size, bool waitok);
typedef bool nlwriter_op_write(struct nl_writer *nw, void *buf, int buflen, int cnt);

/* Per-storage-type operations: one init routine and one write routine per target. */
struct nlwriter_ops {
	nlwriter_op_init	*init;
	nlwriter_op_write	*write_socket;
	nlwriter_op_write	*write_group;
	nlwriter_op_write	*write_chain;
};
189 
190 /*
191  * NS_WRITER_TYPE_BUF
192  * Writes message to a temporary memory buffer,
193  * flushing to the socket/group when buffer size limit is reached
194  */
195 static bool
196 nlmsg_get_ns_buf(struct nl_writer *nw, int size, bool waitok)
197 {
198 	int mflag = waitok ? M_WAITOK : M_NOWAIT;
199 	nw->_storage = malloc(size, M_NETLINK, mflag | M_ZERO);
200 	if (__predict_false(nw->_storage == NULL))
201 		return (false);
202 	nw->alloc_len = size;
203 	nw->offset = 0;
204 	nw->hdr = NULL;
205 	nw->data = nw->_storage;
206 	nw->writer_type = NS_WRITER_TYPE_BUF;
207 	nw->malloc_flag = mflag;
208 	nw->num_messages = 0;
209 	nw->enomem = false;
210 	return (true);
211 }
212 
213 static bool
214 nlmsg_write_socket_buf(struct nl_writer *nw, void *buf, int datalen, int cnt)
215 {
216 	NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr);
217 	if (__predict_false(datalen == 0)) {
218 		free(buf, M_NETLINK);
219 		return (true);
220 	}
221 
222 	struct mbuf *m = nl_get_mbuf_chain(datalen, nw->malloc_flag);
223 	if (__predict_false(m == NULL)) {
224 		/* XXX: should we set sorcverr? */
225 		free(buf, M_NETLINK);
226 		return (false);
227 	}
228 	m_append(m, datalen, buf);
229 	free(buf, M_NETLINK);
230 
231 	int io_flags = (nw->ignore_limit) ? NL_IOF_IGNORE_LIMIT : 0;
232 	return (nl_send_one(m, (struct nlpcb *)(nw->arg.ptr), cnt, io_flags));
233 }
234 
235 static bool
236 nlmsg_write_group_buf(struct nl_writer *nw, void *buf, int datalen, int cnt)
237 {
238 	NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d proto: %d id: %d", buf, datalen,
239 	    nw->arg.group.proto, nw->arg.group.id);
240 	if (__predict_false(datalen == 0)) {
241 		free(buf, M_NETLINK);
242 		return (true);
243 	}
244 
245 	struct mbuf *m = nl_get_mbuf_chain(datalen, nw->malloc_flag);
246 	if (__predict_false(m == NULL)) {
247 		free(buf, M_NETLINK);
248 		return (false);
249 	}
250 	bool success = m_append(m, datalen, buf) != 0;
251 	free(buf, M_NETLINK);
252 
253 	if (!success)
254 		return (false);
255 
256 	nl_send_group(m, cnt, nw->arg.group.proto, nw->arg.group.id);
257 	return (true);
258 }
259 
260 static bool
261 nlmsg_write_chain_buf(struct nl_writer *nw, void *buf, int datalen, int cnt)
262 {
263 	struct mbuf **m0 = (struct mbuf **)(nw->arg.ptr);
264 	NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr);
265 
266 	if (__predict_false(datalen == 0)) {
267 		free(buf, M_NETLINK);
268 		return (true);
269 	}
270 
271 	if (*m0 == NULL) {
272 		struct mbuf *m = nl_get_mbuf_chain(datalen, nw->malloc_flag);
273 
274 		if (__predict_false(m == NULL)) {
275 			free(buf, M_NETLINK);
276 			return (false);
277 		}
278 		*m0 = m;
279 	}
280 	if (__predict_false(m_append(*m0, datalen, buf) == 0)) {
281 		free(buf, M_NETLINK);
282 		return (false);
283 	}
284 	return (true);
285 }
286 
287 
288 /*
289  * NS_WRITER_TYPE_MBUF
290  * Writes message to the allocated mbuf,
291  * flushing to socket/group when mbuf size limit is reached.
292  * This is the most efficient mechanism as it avoids double-copying.
293  *
294  * Allocates a single mbuf suitable to store up to @size bytes of data.
295  * If size < MHLEN (around 160 bytes), allocates mbuf with pkghdr.
296  * If the size <= NLMBUFSIZE (2k), allocate mbuf+storage out of nlmsg_zone.
297  * Returns NULL on greater size or the allocation failure.
298  */
299 static bool
300 nlmsg_get_ns_mbuf(struct nl_writer *nw, int size, bool waitok)
301 {
302 	int mflag = waitok ? M_WAITOK : M_NOWAIT;
303 	struct mbuf *m = nl_get_mbuf(size, mflag);
304 
305 	if (__predict_false(m == NULL))
306 		return (false);
307 	nw->alloc_len = M_TRAILINGSPACE(m);
308 	nw->offset = 0;
309 	nw->hdr = NULL;
310 	nw->_storage = (void *)m;
311 	nw->data = mtod(m, void *);
312 	nw->writer_type = NS_WRITER_TYPE_MBUF;
313 	nw->malloc_flag = mflag;
314 	nw->num_messages = 0;
315 	nw->enomem = false;
316 	memset(nw->data, 0, size);
317 	NL_LOG(LOG_DEBUG2, "alloc mbuf %p req_len %d alloc_len %d data_ptr %p",
318 	    m, size, nw->alloc_len, nw->data);
319 	return (true);
320 }
321 
322 static bool
323 nlmsg_write_socket_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
324 {
325 	struct mbuf *m = (struct mbuf *)buf;
326 	NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr);
327 
328 	if (__predict_false(datalen == 0)) {
329 		m_freem(m);
330 		return (true);
331 	}
332 
333 	m->m_pkthdr.len = datalen;
334 	m->m_len = datalen;
335 	int io_flags = (nw->ignore_limit) ? NL_IOF_IGNORE_LIMIT : 0;
336 	return (nl_send_one(m, (struct nlpcb *)(nw->arg.ptr), cnt, io_flags));
337 }
338 
339 static bool
340 nlmsg_write_group_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
341 {
342 	struct mbuf *m = (struct mbuf *)buf;
343 	NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d proto: %d id: %d", buf, datalen,
344 	    nw->arg.group.proto, nw->arg.group.id);
345 
346 	if (__predict_false(datalen == 0)) {
347 		m_freem(m);
348 		return (true);
349 	}
350 
351 	m->m_pkthdr.len = datalen;
352 	m->m_len = datalen;
353 	nl_send_group(m, cnt, nw->arg.group.proto, nw->arg.group.id);
354 	return (true);
355 }
356 
357 static bool
358 nlmsg_write_chain_mbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
359 {
360 	struct mbuf *m_new = (struct mbuf *)buf;
361 	struct mbuf **m0 = (struct mbuf **)(nw->arg.ptr);
362 
363 	NL_LOG(LOG_DEBUG2, "IN: ptr: %p len: %d arg: %p", buf, datalen, nw->arg.ptr);
364 
365 	if (__predict_false(datalen == 0)) {
366 		m_freem(m_new);
367 		return (true);
368 	}
369 
370 	m_new->m_pkthdr.len = datalen;
371 	m_new->m_len = datalen;
372 
373 	if (*m0 == NULL) {
374 		*m0 = m_new;
375 	} else {
376 		struct mbuf *m_last;
377 		for (m_last = *m0; m_last->m_next != NULL; m_last = m_last->m_next)
378 			;
379 		m_last->m_next = m_new;
380 		(*m0)->m_pkthdr.len += datalen;
381 	}
382 
383 	return (true);
384 }
385 
386 /*
387  * NS_WRITER_TYPE_LBUF
388  * Writes message to the allocated memory buffer,
389  * flushing to socket/group when mbuf size limit is reached.
390  * Calls linux handler to rewrite messages before sending to the socket.
391  */
392 static bool
393 nlmsg_get_ns_lbuf(struct nl_writer *nw, int size, bool waitok)
394 {
395 	int mflag = waitok ? M_WAITOK : M_NOWAIT;
396 	size = roundup2(size, sizeof(void *));
397 	int add_size = sizeof(struct linear_buffer) + SCRATCH_BUFFER_SIZE;
398 	char *buf = malloc(add_size + size * 2, M_NETLINK, mflag | M_ZERO);
399 	if (__predict_false(buf == NULL))
400 		return (false);
401 
402 	/* Fill buffer header first */
403 	struct linear_buffer *lb = (struct linear_buffer *)buf;
404 	lb->base = &buf[sizeof(struct linear_buffer) + size];
405 	lb->size = size + SCRATCH_BUFFER_SIZE;
406 
407 	nw->alloc_len = size;
408 	nw->offset = 0;
409 	nw->hdr = NULL;
410 	nw->_storage = buf;
411 	nw->data = (char *)(lb + 1);
412 	nw->malloc_flag = mflag;
413 	nw->writer_type = NS_WRITER_TYPE_LBUF;
414 	nw->num_messages = 0;
415 	nw->enomem = false;
416 	return (true);
417 }
418 
419 static bool
420 nlmsg_write_socket_lbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
421 {
422 	struct linear_buffer *lb = (struct linear_buffer *)buf;
423 	char *data = (char *)(lb + 1);
424 	struct nlpcb *nlp = (struct nlpcb *)(nw->arg.ptr);
425 
426 	if (__predict_false(datalen == 0)) {
427 		free(buf, M_NETLINK);
428 		return (true);
429 	}
430 
431 	struct mbuf *m = NULL;
432 	if (linux_netlink_p != NULL)
433 		m = linux_netlink_p->msgs_to_linux(nlp->nl_proto, data, datalen, nlp);
434 	free(buf, M_NETLINK);
435 
436 	if (__predict_false(m == NULL)) {
437 		/* XXX: should we set sorcverr? */
438 		return (false);
439 	}
440 
441 	int io_flags = (nw->ignore_limit) ? NL_IOF_IGNORE_LIMIT : 0;
442 	return (nl_send_one(m, nlp, cnt, io_flags));
443 }
444 
445 /* Shouldn't be called (maybe except Linux code originating message) */
446 static bool
447 nlmsg_write_group_lbuf(struct nl_writer *nw, void *buf, int datalen, int cnt)
448 {
449 	struct linear_buffer *lb = (struct linear_buffer *)buf;
450 	char *data = (char *)(lb + 1);
451 
452 	if (__predict_false(datalen == 0)) {
453 		free(buf, M_NETLINK);
454 		return (true);
455 	}
456 
457 	struct mbuf *m = nl_get_mbuf_chain(datalen, nw->malloc_flag);
458 	if (__predict_false(m == NULL)) {
459 		free(buf, M_NETLINK);
460 		return (false);
461 	}
462 	m_append(m, datalen, data);
463 	free(buf, M_NETLINK);
464 
465 	nl_send_group(m, cnt, nw->arg.group.proto, nw->arg.group.id);
466 	return (true);
467 }
468 
/*
 * Storage backend table, indexed by the writer type
 * (see nlmsg_get_buf_type() and nlmsg_set_callback()).
 * The NS_WRITER_TYPE_LBUF entry provides no write_chain handler,
 * so that member is left NULL.
 */
static const struct nlwriter_ops nlmsg_writers[] = {
	/* NS_WRITER_TYPE_MBUF */
	{
		.init = nlmsg_get_ns_mbuf,
		.write_socket = nlmsg_write_socket_mbuf,
		.write_group = nlmsg_write_group_mbuf,
		.write_chain = nlmsg_write_chain_mbuf,
	},
	/* NS_WRITER_TYPE_BUF */
	{
		.init = nlmsg_get_ns_buf,
		.write_socket = nlmsg_write_socket_buf,
		.write_group = nlmsg_write_group_buf,
		.write_chain = nlmsg_write_chain_buf,
	},
	/* NS_WRITER_TYPE_LBUF */
	{
		.init = nlmsg_get_ns_lbuf,
		.write_socket = nlmsg_write_socket_lbuf,
		.write_group = nlmsg_write_group_lbuf,
	},
};
491 
492 static void
493 nlmsg_set_callback(struct nl_writer *nw)
494 {
495 	const struct nlwriter_ops *pops = &nlmsg_writers[nw->writer_type];
496 
497 	switch (nw->writer_target) {
498 	case NS_WRITER_TARGET_SOCKET:
499 		nw->cb = pops->write_socket;
500 		break;
501 	case NS_WRITER_TARGET_GROUP:
502 		nw->cb = pops->write_group;
503 		break;
504 	case NS_WRITER_TARGET_CHAIN:
505 		nw->cb = pops->write_chain;
506 		break;
507 	default:
508 		panic("not implemented");
509 	}
510 }
511 
512 static bool
513 nlmsg_get_buf_type(struct nl_writer *nw, int size, int type, bool waitok)
514 {
515 	MPASS(type + 1 <= sizeof(nlmsg_writers) / sizeof(nlmsg_writers[0]));
516 	NL_LOG(LOG_DEBUG3, "Setting up nw %p size %d type %d", nw, size, type);
517 	return (nlmsg_writers[type].init(nw, size, waitok));
518 }
519 
520 static bool
521 nlmsg_get_buf(struct nl_writer *nw, int size, bool waitok, bool is_linux)
522 {
523 	int type;
524 
525 	if (!is_linux) {
526 		if (__predict_true(size <= NLMBUFSIZE))
527 			type = NS_WRITER_TYPE_MBUF;
528 		else
529 			type = NS_WRITER_TYPE_BUF;
530 	} else
531 		type = NS_WRITER_TYPE_LBUF;
532 	return (nlmsg_get_buf_type(nw, size, type, waitok));
533 }
534 
535 bool
536 _nlmsg_get_unicast_writer(struct nl_writer *nw, int size, struct nlpcb *nlp)
537 {
538 	if (!nlmsg_get_buf(nw, size, false, nlp->nl_linux))
539 		return (false);
540 	nw->arg.ptr = (void *)nlp;
541 	nw->writer_target = NS_WRITER_TARGET_SOCKET;
542 	nlmsg_set_callback(nw);
543 	return (true);
544 }
545 
546 bool
547 _nlmsg_get_group_writer(struct nl_writer *nw, int size, int protocol, int group_id)
548 {
549 	if (!nlmsg_get_buf(nw, size, false, false))
550 		return (false);
551 	nw->arg.group.proto = protocol;
552 	nw->arg.group.id = group_id;
553 	nw->writer_target = NS_WRITER_TARGET_GROUP;
554 	nlmsg_set_callback(nw);
555 	return (true);
556 }
557 
558 bool
559 _nlmsg_get_chain_writer(struct nl_writer *nw, int size, struct mbuf **pm)
560 {
561 	if (!nlmsg_get_buf(nw, size, false, false))
562 		return (false);
563 	*pm = NULL;
564 	nw->arg.ptr = (void *)pm;
565 	nw->writer_target = NS_WRITER_TARGET_CHAIN;
566 	nlmsg_set_callback(nw);
567 	NL_LOG(LOG_DEBUG3, "setup cb %p (need %p)", nw->cb, &nlmsg_write_chain_mbuf);
568 	return (true);
569 }
570 
/*
 * Marks the writer so that the socket write callbacks pass
 * NL_IOF_IGNORE_LIMIT to nl_send_one().
 */
void
_nlmsg_ignore_limit(struct nl_writer *nw)
{
	nw->ignore_limit = true;
}
576 
577 bool
578 _nlmsg_flush(struct nl_writer *nw)
579 {
580 
581 	if (__predict_false(nw->hdr != NULL)) {
582 		/* Last message has not been completed, skip it. */
583 		int completed_len = (char *)nw->hdr - nw->data;
584 		/* Send completed messages */
585 		nw->offset -= nw->offset - completed_len;
586 		nw->hdr = NULL;
587 	}
588 
589 	NL_LOG(LOG_DEBUG2, "OUT");
590 	bool result = nw->cb(nw, nw->_storage, nw->offset, nw->num_messages);
591 	nw->_storage = NULL;
592 
593 	if (!result) {
594 		NL_LOG(LOG_DEBUG, "nw %p offset %d: flush with %p() failed", nw, nw->offset, nw->cb);
595 	}
596 
597 	return (result);
598 }
599 
600 /*
601  * Flushes previous data and allocates new underlying storage
602  *  sufficient for holding at least @required_len bytes.
603  * Return true on success.
604  */
605 bool
606 _nlmsg_refill_buffer(struct nl_writer *nw, int required_len)
607 {
608 	struct nl_writer ns_new = {};
609 	int completed_len, new_len;
610 
611 	if (nw->enomem)
612 		return (false);
613 
614 	NL_LOG(LOG_DEBUG3, "no space at offset %d/%d (want %d), trying to reclaim",
615 	    nw->offset, nw->alloc_len, required_len);
616 
617 	/* Calculated new buffer size and allocate it s*/
618 	completed_len = (nw->hdr != NULL) ? (char *)nw->hdr - nw->data : nw->offset;
619 	if (completed_len > 0 && required_len < NLMBUFSIZE) {
620 		/* We already ran out of space, use the largest effective size */
621 		new_len = max(nw->alloc_len, NLMBUFSIZE);
622 	} else {
623 		if (nw->alloc_len < NLMBUFSIZE)
624 			new_len = NLMBUFSIZE;
625 		else
626 			new_len = nw->alloc_len * 2;
627 		while (new_len < required_len)
628 			new_len *= 2;
629 	}
630 	bool waitok = (nw->malloc_flag == M_WAITOK);
631 	bool is_linux = (nw->writer_type == NS_WRITER_TYPE_LBUF);
632 	if (!nlmsg_get_buf(&ns_new, new_len, waitok, is_linux)) {
633 		nw->enomem = true;
634 		NL_LOG(LOG_DEBUG, "getting new buf failed, setting ENOMEM");
635 		return (false);
636 	}
637 	if (nw->ignore_limit)
638 		nlmsg_ignore_limit(&ns_new);
639 
640 	/* Update callback data */
641 	ns_new.writer_target = nw->writer_target;
642 	nlmsg_set_callback(&ns_new);
643 	ns_new.arg = nw->arg;
644 
645 	/* Copy last (unfinished) header to the new storage */
646 	int last_len = nw->offset - completed_len;
647 	if (last_len > 0) {
648 		memcpy(ns_new.data, nw->hdr, last_len);
649 		ns_new.hdr = (struct nlmsghdr *)ns_new.data;
650 		ns_new.offset = last_len;
651 	}
652 
653 	NL_LOG(LOG_DEBUG2, "completed: %d bytes, copied: %d bytes", completed_len, last_len);
654 
655 	/* Flush completed headers & switch to the new nw */
656 	nlmsg_flush(nw);
657 	memcpy(nw, &ns_new, sizeof(struct nl_writer));
658 	NL_LOG(LOG_DEBUG2, "switched buffer: used %d/%d bytes", nw->offset, nw->alloc_len);
659 
660 	return (true);
661 }
662 
663 bool
664 _nlmsg_add(struct nl_writer *nw, uint32_t portid, uint32_t seq, uint16_t type,
665     uint16_t flags, uint32_t len)
666 {
667 	struct nlmsghdr *hdr;
668 
669 	MPASS(nw->hdr == NULL);
670 
671 	int required_len = NETLINK_ALIGN(len + sizeof(struct nlmsghdr));
672 	if (__predict_false(nw->offset + required_len > nw->alloc_len)) {
673 		if (!nlmsg_refill_buffer(nw, required_len))
674 			return (false);
675 	}
676 
677 	hdr = (struct nlmsghdr *)(&nw->data[nw->offset]);
678 
679 	hdr->nlmsg_len = len;
680 	hdr->nlmsg_type = type;
681 	hdr->nlmsg_flags = flags;
682 	hdr->nlmsg_seq = seq;
683 	hdr->nlmsg_pid = portid;
684 
685 	nw->hdr = hdr;
686 	nw->offset += sizeof(struct nlmsghdr);
687 
688 	return (true);
689 }
690 
691 bool
692 _nlmsg_end(struct nl_writer *nw)
693 {
694 	MPASS(nw->hdr != NULL);
695 
696 	if (nw->enomem) {
697 		NL_LOG(LOG_DEBUG, "ENOMEM when dumping message");
698 		nlmsg_abort(nw);
699 		return (false);
700 	}
701 
702 	nw->hdr->nlmsg_len = (uint32_t)(nw->data + nw->offset - (char *)nw->hdr);
703 	NL_LOG(LOG_DEBUG2, "wrote msg len: %u type: %d: flags: 0x%X seq: %u pid: %u",
704 	    nw->hdr->nlmsg_len, nw->hdr->nlmsg_type, nw->hdr->nlmsg_flags,
705 	    nw->hdr->nlmsg_seq, nw->hdr->nlmsg_pid);
706 	nw->hdr = NULL;
707 	nw->num_messages++;
708 	return (true);
709 }
710 
711 void
712 _nlmsg_abort(struct nl_writer *nw)
713 {
714 	if (nw->hdr != NULL) {
715 		nw->offset = (uint32_t)((char *)nw->hdr - nw->data);
716 		nw->hdr = NULL;
717 	}
718 }
719 
720 void
721 nlmsg_ack(struct nlpcb *nlp, int error, struct nlmsghdr *hdr,
722     struct nl_pstate *npt)
723 {
724 	struct nlmsgerr *errmsg;
725 	int payload_len;
726 	uint32_t flags = nlp->nl_flags;
727 	struct nl_writer *nw = npt->nw;
728 	bool cap_ack;
729 
730 	payload_len = sizeof(struct nlmsgerr);
731 
732 	/*
733 	 * The only case when we send the full message in the
734 	 * reply is when there is an error and NETLINK_CAP_ACK
735 	 * is not set.
736 	 */
737 	cap_ack = (error == 0) || (flags & NLF_CAP_ACK);
738 	if (!cap_ack)
739 		payload_len += hdr->nlmsg_len - sizeof(struct nlmsghdr);
740 	payload_len = NETLINK_ALIGN(payload_len);
741 
742 	uint16_t nl_flags = cap_ack ? NLM_F_CAPPED : 0;
743 	if ((npt->err_msg || npt->err_off) && nlp->nl_flags & NLF_EXT_ACK)
744 		nl_flags |= NLM_F_ACK_TLVS;
745 
746 	NL_LOG(LOG_DEBUG3, "acknowledging message type %d seq %d",
747 	    hdr->nlmsg_type, hdr->nlmsg_seq);
748 
749 	if (!nlmsg_add(nw, nlp->nl_port, hdr->nlmsg_seq, NLMSG_ERROR, nl_flags, payload_len))
750 		goto enomem;
751 
752 	errmsg = nlmsg_reserve_data(nw, payload_len, struct nlmsgerr);
753 	errmsg->error = error;
754 	/* In case of error copy the whole message, else just the header */
755 	memcpy(&errmsg->msg, hdr, cap_ack ? sizeof(*hdr) : hdr->nlmsg_len);
756 
757 	if (npt->err_msg != NULL && nlp->nl_flags & NLF_EXT_ACK)
758 		nlattr_add_string(nw, NLMSGERR_ATTR_MSG, npt->err_msg);
759 	if (npt->err_off != 0 && nlp->nl_flags & NLF_EXT_ACK)
760 		nlattr_add_u32(nw, NLMSGERR_ATTR_OFFS, npt->err_off);
761 	if (npt->cookie != NULL)
762 		nlattr_add_raw(nw, npt->cookie);
763 
764 	if (nlmsg_end(nw))
765 		return;
766 enomem:
767 	NLP_LOG(LOG_DEBUG, nlp, "error allocating ack data for message %d seq %u",
768 	    hdr->nlmsg_type, hdr->nlmsg_seq);
769 	nlmsg_abort(nw);
770 }
771 
772 bool
773 _nlmsg_end_dump(struct nl_writer *nw, int error, struct nlmsghdr *hdr)
774 {
775 	if (!nlmsg_add(nw, hdr->nlmsg_pid, hdr->nlmsg_seq, NLMSG_DONE, 0, sizeof(int))) {
776 		NL_LOG(LOG_DEBUG, "Error finalizing table dump");
777 		return (false);
778 	}
779 	/* Save operation result */
780 	int *perror = nlmsg_reserve_object(nw, int);
781 	NL_LOG(LOG_DEBUG2, "record error=%d at off %d (%p)", error,
782 	    nw->offset, perror);
783 	*perror = error;
784 	nlmsg_end(nw);
785 	nw->suppress_ack = true;
786 
787 	return (true);
788 }
789 
790 #include <netlink/ktest_netlink_message_writer.h>
791