From 32682ba02d588cebedf5cd42a8074a9830cf9041 Mon Sep 17 00:00:00 2001 From: Yu Watanabe Date: Tue, 2 Dec 2025 22:57:53 +0900 Subject: [PATCH 1/5] sd-netlink: introduce netlink_now() and move timespan_to_timestamp() When sd-event is attached to the sd-netlink object, use the timestamp of the current event, otherwise call now(). This mostly changes nothing. Preparation for a later change. --- src/libsystemd/sd-netlink/sd-netlink.c | 76 +++++++++++++++----------- 1 file changed, 43 insertions(+), 33 deletions(-) diff --git a/src/libsystemd/sd-netlink/sd-netlink.c b/src/libsystemd/sd-netlink/sd-netlink.c index cd66b2e832..c8bee25863 100644 --- a/src/libsystemd/sd-netlink/sd-netlink.c +++ b/src/libsystemd/sd-netlink/sd-netlink.c @@ -152,6 +152,44 @@ static sd_netlink *netlink_free(sd_netlink *nl) { DEFINE_TRIVIAL_REF_UNREF_FUNC(sd_netlink, sd_netlink, netlink_free); +static usec_t netlink_now(sd_netlink *nl, clockid_t clock) { + assert(nl); + + usec_t now_usec; /* sd_event_now() returns 0 for the cached loop timestamp and > 0 for a fresh now(); both are usable. */ + if (nl->event && sd_event_now(nl->event, clock, &now_usec) >= 0) + return now_usec; + + return now(clock); +} + +static usec_t timespan_to_timestamp(sd_netlink *nl, usec_t usec) { + static bool default_timeout_set = false; + static usec_t default_timeout; + int r; + + assert(nl); + + if (usec == 0) { + if (!default_timeout_set) { + const char *e; + + default_timeout_set = true; + default_timeout = NETLINK_DEFAULT_TIMEOUT_USEC; + + e = secure_getenv("SYSTEMD_NETLINK_DEFAULT_TIMEOUT"); + if (e) { + r = parse_sec(e, &default_timeout); + if (r < 0) + log_debug_errno(r, "sd-netlink: Failed to parse $SYSTEMD_NETLINK_DEFAULT_TIMEOUT environment variable, ignoring: %m"); + } + } + + usec = default_timeout; + } + + return usec_add(netlink_now(nl, CLOCK_MONOTONIC), usec); +} + int sd_netlink_send( sd_netlink *nl, sd_netlink_message *message, @@ -204,7 +242,6 @@ static int process_timeout(sd_netlink *nl) { _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL; struct reply_callback *c; sd_netlink_slot *slot; - 
usec_t n; int r; assert(nl); @@ -213,8 +250,7 @@ static int process_timeout(sd_netlink *nl) { if (!c) return 0; - n = now(CLOCK_MONOTONIC); - if (c->timeout > n) + if (c->timeout > netlink_now(nl, CLOCK_MONOTONIC)) return 0; r = message_new_synthetic_error(nl, -ETIMEDOUT, c->serial, &m); @@ -384,32 +420,6 @@ int sd_netlink_process(sd_netlink *nl, sd_netlink_message **ret) { return r; } -static usec_t timespan_to_timestamp(usec_t usec) { - static bool default_timeout_set = false; - static usec_t default_timeout; - int r; - - if (usec == 0) { - if (!default_timeout_set) { - const char *e; - - default_timeout_set = true; - default_timeout = NETLINK_DEFAULT_TIMEOUT_USEC; - - e = secure_getenv("SYSTEMD_NETLINK_DEFAULT_TIMEOUT"); - if (e) { - r = parse_sec(e, &default_timeout); - if (r < 0) - log_debug_errno(r, "sd-netlink: Failed to parse $SYSTEMD_NETLINK_DEFAULT_TIMEOUT environment variable, ignoring: %m"); - } - } - - usec = default_timeout; - } - - return usec_add(now(CLOCK_MONOTONIC), usec); -} - static int netlink_poll(sd_netlink *nl, bool need_more, usec_t timeout_usec) { usec_t m = USEC_INFINITY; int r, e; @@ -434,7 +444,7 @@ static int netlink_poll(sd_netlink *nl, bool need_more, usec_t timeout_usec) { if (r < 0) return r; - m = usec_sub_unsigned(until, now(CLOCK_MONOTONIC)); + m = usec_sub_unsigned(until, netlink_now(nl, CLOCK_MONOTONIC)); } r = fd_wait_for_event(nl->fd, e, MIN(m, timeout_usec)); @@ -508,7 +518,7 @@ int sd_netlink_call_async( return r; slot->reply_callback.callback = callback; - slot->reply_callback.timeout = timespan_to_timestamp(usec); + slot->reply_callback.timeout = timespan_to_timestamp(nl, usec); k = sd_netlink_send(nl, m, &slot->reply_callback.serial); if (k < 0) @@ -549,7 +559,7 @@ int sd_netlink_read( assert_return(nl, -EINVAL); assert_return(!netlink_pid_changed(nl), -ECHILD); - usec = timespan_to_timestamp(timeout); + usec = timespan_to_timestamp(nl, timeout); for (;;) { _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = 
NULL; @@ -591,7 +601,7 @@ int sd_netlink_read( if (usec != USEC_INFINITY) { usec_t n; - n = now(CLOCK_MONOTONIC); + n = netlink_now(nl, CLOCK_MONOTONIC); if (n >= usec) return -ETIMEDOUT; From 991703009ebe994d9513416398d0c6ea3d14bfef Mon Sep 17 00:00:00 2001 From: Yu Watanabe Date: Tue, 2 Dec 2025 23:02:50 +0900 Subject: [PATCH 2/5] sd-netlink: introduce sd_netlink_ignore_serial() When we send a message with NLM_F_ACK but are later no longer interested in the reply and do not want to call sd_netlink_read(), the reply will be stored in the rqueue forever. Let's introduce a way to ignore a received message without waiting for the reply. --- src/libsystemd/sd-netlink/netlink-internal.h | 6 +++ src/libsystemd/sd-netlink/netlink-socket.c | 11 +++- src/libsystemd/sd-netlink/sd-netlink.c | 56 ++++++++++++++++++++ src/systemd/sd-netlink.h | 2 + 4 files changed, 74 insertions(+), 1 deletion(-) diff --git a/src/libsystemd/sd-netlink/netlink-internal.h b/src/libsystemd/sd-netlink/netlink-internal.h index 1b8f4afcb0..158e2e5577 100644 --- a/src/libsystemd/sd-netlink/netlink-internal.h +++ b/src/libsystemd/sd-netlink/netlink-internal.h @@ -55,6 +55,11 @@ typedef struct sd_netlink_slot { }; } sd_netlink_slot; +typedef struct NetlinkIgnoredSerial { + uint32_t serial; + usec_t timeout_usec; /* timestamp in CLOCK_MONOTONIC */ +} NetlinkIgnoredSerial; + typedef struct sd_netlink { unsigned n_ref; @@ -78,6 +83,7 @@ typedef struct sd_netlink { bool processing:1; uint32_t serial; + Hashmap *ignored_serials; struct Prioq *reply_callbacks_prioq; Hashmap *reply_callbacks; diff --git a/src/libsystemd/sd-netlink/netlink-socket.c b/src/libsystemd/sd-netlink/netlink-socket.c index 28fd2bc3d9..385d92a64f 100644 --- a/src/libsystemd/sd-netlink/netlink-socket.c +++ b/src/libsystemd/sd-netlink/netlink-socket.c @@ -222,6 +222,16 @@ static int netlink_queue_received_message(sd_netlink *nl, sd_netlink_message *m) assert(nl); assert(m); + serial = message_get_serial(m); + if (serial != 0) { + 
NetlinkIgnoredSerial *s = hashmap_remove(nl->ignored_serials, UINT32_TO_PTR(serial)); + if (s) { + /* We are not interested in the message anymore. */ + free(s); + return 0; + } + } + if (ordered_set_size(nl->rqueue) >= NETLINK_RQUEUE_MAX) return log_debug_errno(SYNTHETIC_ERRNO(ENOBUFS), "sd-netlink: exhausted the read queue size (%d)", NETLINK_RQUEUE_MAX); @@ -235,7 +245,6 @@ static int netlink_queue_received_message(sd_netlink *nl, sd_netlink_message *m) if (sd_netlink_message_is_broadcast(m)) return 0; - serial = message_get_serial(m); if (serial == 0) return 0; diff --git a/src/libsystemd/sd-netlink/sd-netlink.c b/src/libsystemd/sd-netlink/sd-netlink.c index c8bee25863..2deb49557d 100644 --- a/src/libsystemd/sd-netlink/sd-netlink.c +++ b/src/libsystemd/sd-netlink/sd-netlink.c @@ -126,6 +126,8 @@ static sd_netlink *netlink_free(sd_netlink *nl) { assert(nl); + hashmap_free(nl->ignored_serials); + ordered_set_free(nl->rqueue); hashmap_free(nl->rqueue_by_serial); hashmap_free(nl->rqueue_partial_by_serial); @@ -190,6 +192,58 @@ static usec_t timespan_to_timestamp(sd_netlink *nl, usec_t usec) { return usec_add(netlink_now(nl, CLOCK_MONOTONIC), usec); } +static void netlink_trim_ignored_serials(sd_netlink *nl) { + NetlinkIgnoredSerial *s; + usec_t now_usec = 0; + + assert(nl); + + HASHMAP_FOREACH(s, nl->ignored_serials) { + if (s->timeout_usec == USEC_INFINITY) + continue; + + if (now_usec == 0) + now_usec = netlink_now(nl, CLOCK_MONOTONIC); + + if (s->timeout_usec < now_usec) + free(hashmap_remove(nl->ignored_serials, UINT32_TO_PTR(s->serial))); + } +} + +int sd_netlink_ignore_serial(sd_netlink *nl, uint32_t serial, uint64_t timeout_usec) { + int r; + + assert_return(nl, -EINVAL); + assert_return(!netlink_pid_changed(nl), -ECHILD); + assert_return(serial != 0, -EINVAL); + + timeout_usec = timespan_to_timestamp(nl, timeout_usec); + + NetlinkIgnoredSerial *existing = hashmap_get(nl->ignored_serials, UINT32_TO_PTR(serial)); + if (existing) { + existing->timeout_usec = 
timeout_usec; + return 0; + } + + netlink_trim_ignored_serials(nl); + + _cleanup_free_ NetlinkIgnoredSerial *s = new(NetlinkIgnoredSerial, 1); + if (!s) + return -ENOMEM; + + *s = (NetlinkIgnoredSerial) { + .serial = serial, + .timeout_usec = timeout_usec, + }; + + r = hashmap_ensure_put(&nl->ignored_serials, &trivial_hash_ops_value_free, UINT32_TO_PTR(s->serial), s); + if (r < 0) + return r; + + TAKE_PTR(s); + return 0; +} + int sd_netlink_send( sd_netlink *nl, sd_netlink_message *message, @@ -373,6 +427,8 @@ static int process_running(sd_netlink *nl, sd_netlink_message **ret) { assert(nl); + netlink_trim_ignored_serials(nl); + r = process_timeout(nl); if (r != 0) goto null_message; diff --git a/src/systemd/sd-netlink.h b/src/systemd/sd-netlink.h index b8f0481d49..10705d3fe2 100644 --- a/src/systemd/sd-netlink.h +++ b/src/systemd/sd-netlink.h @@ -56,6 +56,8 @@ int sd_netlink_call_async(sd_netlink *nl, sd_netlink_slot **ret_slot, sd_netlink int sd_netlink_call(sd_netlink *nl, sd_netlink_message *message, uint64_t timeout, sd_netlink_message **ret); int sd_netlink_read(sd_netlink *nl, uint32_t serial, uint64_t timeout, sd_netlink_message **ret); +int sd_netlink_ignore_serial(sd_netlink *nl, uint32_t serial, uint64_t timeout_usec); + int sd_netlink_get_events(sd_netlink *nl); int sd_netlink_get_timeout(sd_netlink *nl, uint64_t *ret); int sd_netlink_process(sd_netlink *nl, sd_netlink_message **ret); From ea97ca9a06c8c718ae09c601eb092d0d34e975f9 Mon Sep 17 00:00:00 2001 From: Yu Watanabe Date: Wed, 3 Dec 2025 00:28:55 +0900 Subject: [PATCH 3/5] sd-netlink: drop unused ret_messages argument Not only is this unused, but the kernel also never provides any meaningful reply for batch messages. Let's drop it. 
--- src/libsystemd/sd-netlink/netlink-internal.h | 3 +-- src/libsystemd/sd-netlink/netlink-message-nfnl.c | 16 ++-------------- src/shared/firewall-util.c | 10 +++++----- 3 files changed, 8 insertions(+), 21 deletions(-) diff --git a/src/libsystemd/sd-netlink/netlink-internal.h b/src/libsystemd/sd-netlink/netlink-internal.h index 158e2e5577..51e523c1ba 100644 --- a/src/libsystemd/sd-netlink/netlink-internal.h +++ b/src/libsystemd/sd-netlink/netlink-internal.h @@ -187,8 +187,7 @@ int sd_nfnl_call_batch( sd_netlink *nfnl, sd_netlink_message **messages, size_t n_messages, - uint64_t usec, - sd_netlink_message ***ret_messages); + uint64_t usec); int sd_nfnl_message_new( sd_netlink *nfnl, sd_netlink_message **ret, diff --git a/src/libsystemd/sd-netlink/netlink-message-nfnl.c b/src/libsystemd/sd-netlink/netlink-message-nfnl.c index 2b6e1dd82b..8708aac102 100644 --- a/src/libsystemd/sd-netlink/netlink-message-nfnl.c +++ b/src/libsystemd/sd-netlink/netlink-message-nfnl.c @@ -178,10 +178,8 @@ int sd_nfnl_call_batch( sd_netlink *nfnl, sd_netlink_message **messages, size_t n_messages, - uint64_t usec, - sd_netlink_message ***ret_messages) { + uint64_t usec) { - _cleanup_free_ sd_netlink_message **replies = NULL; _cleanup_free_ uint32_t *serials = NULL; int r; @@ -190,25 +188,15 @@ int sd_nfnl_call_batch( assert_return(messages, -EINVAL); assert_return(n_messages > 0, -EINVAL); - if (ret_messages) { - replies = new0(sd_netlink_message*, n_messages); - if (!replies) - return -ENOMEM; - } - r = sd_nfnl_send_batch(nfnl, messages, n_messages, &serials); if (r < 0) return r; for (size_t i = 0; i < n_messages; i++) - RET_GATHER(r, - sd_netlink_read(nfnl, serials[i], usec, ret_messages ? 
replies + i : NULL)); + RET_GATHER(r, sd_netlink_read(nfnl, serials[i], usec, /* ret= */ NULL)); if (r < 0) return r; - if (ret_messages) - *ret_messages = TAKE_PTR(replies); - return 0; } diff --git a/src/shared/firewall-util.c b/src/shared/firewall-util.c index b59740166e..0f4073a085 100644 --- a/src/shared/firewall-util.c +++ b/src/shared/firewall-util.c @@ -807,7 +807,7 @@ static int fw_nftables_init_family(sd_netlink *nfnl, int family) { return r; assert(msgcnt < ELEMENTSOF(messages)); - r = sd_nfnl_call_batch(nfnl, messages, msgcnt, NFNL_DEFAULT_TIMEOUT_USECS, NULL); + r = sd_nfnl_call_batch(nfnl, messages, msgcnt, NFNL_DEFAULT_TIMEOUT_USECS); if (r < 0 && r != -EEXIST) return r; @@ -919,7 +919,7 @@ int nft_set_element_modify_iprange( if (r < 0) return r; - return sd_nfnl_call_batch(nfnl, &m, 1, NFNL_DEFAULT_TIMEOUT_USECS, NULL); + return sd_nfnl_call_batch(nfnl, &m, 1, NFNL_DEFAULT_TIMEOUT_USECS); } int nft_set_element_modify_ip( @@ -959,7 +959,7 @@ int nft_set_element_modify_ip( if (r < 0) return r; - return sd_nfnl_call_batch(nfnl, &m, 1, NFNL_DEFAULT_TIMEOUT_USECS, NULL); + return sd_nfnl_call_batch(nfnl, &m, 1, NFNL_DEFAULT_TIMEOUT_USECS); } int nft_set_element_modify_any( @@ -987,7 +987,7 @@ int nft_set_element_modify_any( if (r < 0) return r; - return sd_nfnl_call_batch(nfnl, &m, 1, NFNL_DEFAULT_TIMEOUT_USECS, NULL); + return sd_nfnl_call_batch(nfnl, &m, 1, NFNL_DEFAULT_TIMEOUT_USECS); } static int af_to_nfproto(int af) { @@ -1124,7 +1124,7 @@ static int fw_nftables_add_local_dnat_internal( return r; assert(msgcnt < ELEMENTSOF(messages)); - r = sd_nfnl_call_batch(nfnl, messages, msgcnt, NFNL_DEFAULT_TIMEOUT_USECS, NULL); + r = sd_nfnl_call_batch(nfnl, messages, msgcnt, NFNL_DEFAULT_TIMEOUT_USECS); if (r == -EOVERFLOW && af == AF_INET6) { /* The current implementation of DNAT in systemd requires kernel's * fdb9c405e35bdc6e305b9b4e20ebc141ed14fc81 (v5.8), and the older kernel returns From 691d63dbddcb6db4379c58e94325b8115a7bf124 Mon Sep 17 00:00:00 2001 
From: Yu Watanabe Date: Tue, 2 Dec 2025 19:27:56 +0900 Subject: [PATCH 4/5] sd-netlink: also read the reply for NFNL_MSG_BATCH_BEGIN message When we send a batch of nfnl messages without, e.g., sufficient privileges, the kernel may return an error only for NFNL_MSG_BATCH_BEGIN and ignore all later messages. So, we need to read the response for NFNL_MSG_BATCH_BEGIN, and if it is an error, ignore the replies for the rest. --- .../sd-netlink/netlink-message-nfnl.c | 45 +++++++++++++++---- 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/src/libsystemd/sd-netlink/netlink-message-nfnl.c b/src/libsystemd/sd-netlink/netlink-message-nfnl.c index 8708aac102..a485fd096f 100644 --- a/src/libsystemd/sd-netlink/netlink-message-nfnl.c +++ b/src/libsystemd/sd-netlink/netlink-message-nfnl.c @@ -7,7 +7,6 @@ #include "sd-netlink.h" #include "alloc-util.h" -#include "errno-util.h" #include "iovec-util.h" #include "netlink-internal.h" #include "netlink-util.h" @@ -119,7 +118,7 @@ int sd_nfnl_send_batch( return -ENOMEM; if (ret_serials) { - serials = new(uint32_t, n_messages); + serials = new(uint32_t, n_messages + 2); if (!serials) return -ENOMEM; } @@ -133,6 +132,9 @@ int sd_nfnl_send_batch( return r; netlink_seal_message(nfnl, batch_begin); + if (serials) + serials[c] = message_get_serial(batch_begin); + iovs[c++] = IOVEC_MAKE(batch_begin->hdr, batch_begin->hdr->nlmsg_len); for (size_t i = 0; i < n_messages; i++) { @@ -147,7 +149,7 @@ int sd_nfnl_send_batch( netlink_seal_message(nfnl, messages[i]); if (serials) - serials[i] = message_get_serial(messages[i]); + serials[c] = message_get_serial(messages[i]); /* It seems that the kernel accepts an arbitrary number. Let's set the lower 16 bits of the * serial of the first message. 
*/ @@ -161,6 +163,9 @@ int sd_nfnl_send_batch( return r; netlink_seal_message(nfnl, batch_end); + if (serials) + serials[c] = message_get_serial(batch_end); + iovs[c++] = IOVEC_MAKE(batch_end->hdr, batch_end->hdr->nlmsg_len); assert(c == n_messages + 2); @@ -192,12 +197,36 @@ int sd_nfnl_call_batch( if (r < 0) return r; - for (size_t i = 0; i < n_messages; i++) - RET_GATHER(r, sd_netlink_read(nfnl, serials[i], usec, /* ret= */ NULL)); - if (r < 0) - return r; + for (size_t i = 1; i <= n_messages; i++) { + /* If we have received an error, kernel may not send replies for later messages. Let's ignore + * remaining replies. */ + if (r < 0) { + (void) sd_netlink_ignore_serial(nfnl, serials[i], usec); + continue; + } - return 0; + r = sd_netlink_read(nfnl, serials[i], usec, /* ret= */ NULL); + if (r != -ETIMEDOUT) + continue; + + /* The kernel returns some errors, e.g. unprivileged, to the BATCH_BEGIN. Hence, if we have + * not received any replies for the batch body, try to read an error in the reply for the + * batch begin. Note, since v6.10 (bf2ac490d28c21a349e9eef81edc45320fca4a3c), we can expect + * that the kernel always replies the batch begin and end. When we bump the kernel baseline, + * we can read the reply for the batch begin at first. */ + int k = sd_netlink_read(nfnl, serials[0], usec, /* ret= */ NULL); + if (k < 0) + r = k; + + serials[0] = 0; /* indicates that we have read the reply. */ + } + + /* Ignore replies for batch begin and end if we have not read them. */ + if (serials[0] != 0) + (void) sd_netlink_ignore_serial(nfnl, serials[0], usec); + (void) sd_netlink_ignore_serial(nfnl, serials[n_messages + 1], usec); + + return r; } int sd_nfnl_nft_message_new_basechain( From 84990e08e5d1dceedb2525e977b07ecc0b172628 Mon Sep 17 00:00:00 2001 From: Yu Watanabe Date: Wed, 3 Dec 2025 01:35:34 +0900 Subject: [PATCH 5/5] test-firewall-util: allow to run test-firewall-util Now fw_nftables_add_masquerade() should return EOPNOTSUPP or so if unprivileged. 
It is no longer necessary to skip the whole test up front. --- src/test/test-firewall-util.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/test/test-firewall-util.c b/src/test/test-firewall-util.c index d7080fa349..aea6cded5a 100644 --- a/src/test/test-firewall-util.c +++ b/src/test/test-firewall-util.c @@ -1,8 +1,5 @@ /* SPDX-License-Identifier: LGPL-2.1-or-later */ -#include -#include - #include "sd-netlink.h" #include "firewall-util.h" @@ -79,9 +76,6 @@ TEST(v4) { static int intro(void) { int r; - if (getuid() != 0) - return log_tests_skipped("not root"); - ASSERT_OK_ERRNO(setenv("SYSTEMD_FIREWALL_UTIL_NFT_TABLE_NAME", "io.systemd-test.nat", /* overwrite = */ true)); ASSERT_OK_ERRNO(setenv("SYSTEMD_FIREWALL_UTIL_DNAT_MAP_NAME", "test_map_port_ipport", /* overwrite = */ true));