Using NFQUEUE to delegate the decision on TCP packets to userspace processes will cause a memory leak.
The symptom is that TCP slab objects accumulate and eventually cause an OOM.
[Fix]
There is a discrepancy between backport and upstream commit.
Upstream commit (nf_queue_entry_get_refs):

 /* Bump dev refs so they don't vanish while packet is out */
-void nf_queue_entry_get_refs(struct nf_queue_entry *entry)
+bool nf_queue_entry_get_refs(struct nf_queue_entry *entry)
 {
 	struct nf_hook_state *state = &entry->state;

+	if (state->sk && !refcount_inc_not_zero(&state->sk->sk_refcnt))
+		return false;
+
 	dev_hold(state->in);
 	dev_hold(state->out);
-	if (state->sk)
-		sock_hold(state->sk);

Backport commit (nf_queue_entry_get_refs):

 /* Bump dev refs so they don't vanish while packet is out */
-void nf_queue_entry_get_refs(struct nf_queue_entry *entry)
+bool nf_queue_entry_get_refs(struct nf_queue_entry *entry)
 {
 	struct nf_hook_state *state = &entry->state;

+	if (state->sk && !refcount_inc_not_zero(&state->sk->sk_refcnt))
+		return false;
+
 	if (state->in)
 		dev_hold(state->in);
 	if (state->out)
@@ -102,6 +105,7 @@ void nf_queue_entry_get_refs(struct nf_queue_entry *entry)
 			dev_hold(physdev);
 	}
 #endif
+	return true;
 }
 EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs);
The sock_hold() logic still remains in the backport commit, so the socket reference count is incremented twice (once by refcount_inc_not_zero() and once by sock_hold()), which results in a memory leak.
The fix aligns the logic with the upstream commit by removing the redundant sock_hold() call.
[Test Plan]
1. Prepare a VM and run a TCP server on host.
2. Enter the VM and set up an iptables rule.
iptables -I OUTPUT -p tcp --dst <-TCP_SERVER_IP-> -j NFQUEUE --queue-num=1 --queue-bypass
3. Run an nfnetlink client (it should listen on NF queue 1) on the VM.
4. Keep connecting to the TCP server from the VM.
while true; do netcat <-TCP_SERVER_IP-> 8080; done
5. The VM's TCP slab objects will accumulate until the VM eventually encounters an OOM situation.
cat /proc/slabinfo | grep TCP
[Where problems could occur]
The fix just aligns the logic with the upstream commit, so the regression can be considered as low.
[Impact]
Environment: v4.15.0-177-generic Xenial ESM
Using NFQUEUE to delegate the decision on TCP packets to userspace processes will cause a memory leak.
The symptom is that TCP slab objects accumulate and eventually cause an OOM.
[Fix]
There is a discrepancy between backport and upstream commit.
[Upstream Commit c3873070247d9e3c7a6b0cf9bf9b45e8018427b1]

diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h
index 9eed51e920e8..980daa6e1e3a 100644
--- a/include/net/netfilter/nf_queue.h
+++ b/include/net/netfilter/nf_queue.h
@@ -37,7 +37,7 @@ void nf_register_queue_handler(const struct nf_queue_handler *qh);
 void nf_unregister_queue_handler(void);
 void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict);

-void nf_queue_entry_get_refs(struct nf_queue_entry *entry);
+bool nf_queue_entry_get_refs(struct nf_queue_entry *entry);
 void nf_queue_entry_free(struct nf_queue_entry *entry);

 static inline void init_hashrandom(u32 *jhash_initval)
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 5ab0680db445..e39549c55945 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -96,19 +96,21 @@ static void __nf_queue_entry_init_physdevs(struct nf_queue_entry *entry)
 }

 /* Bump dev refs so they don't vanish while packet is out */
-void nf_queue_entry_get_refs(struct nf_queue_entry *entry)
+bool nf_queue_entry_get_refs(struct nf_queue_entry *entry)
 {
 	struct nf_hook_state *state = &entry->state;

+	if (state->sk && !refcount_inc_not_zero(&state->sk->sk_refcnt))
+		return false;
+
 	dev_hold(state->in);
 	dev_hold(state->out);
-	if (state->sk)
-		sock_hold(state->sk);

 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 	dev_hold(entry->physin);
 	dev_hold(entry->physout);
 #endif
+	return true;
 }
 EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs);

@@ -196,7 +198,10 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,

 	__nf_queue_entry_init_physdevs(entry);

-	nf_queue_entry_get_refs(entry);
+	if (!nf_queue_entry_get_refs(entry)) {
+		kfree(entry);
+		return -ENOTCONN;
+	}

 	switch (entry->state.pf) {
 	case AF_INET:
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index ea2d9c2a44cf..64a6acb6aeae 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -710,9 +710,15 @@ static struct nf_queue_entry *
 nf_queue_entry_dup(struct nf_queue_entry *e)
 {
 	struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC);
-	if (entry)
-		nf_queue_entry_get_refs(entry);
-	return entry;
+
+	if (!entry)
+		return NULL;
+
+	if (nf_queue_entry_get_refs(entry))
+		return entry;
+
+	kfree(entry);
+	return NULL;
 }

 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
[Backport Commit 4d032d60432327f068f03b516f5b6c51ceb17d15]

diff --git a/include/net/netfilter/nf_queue.h b/include/net/netfilter/nf_queue.h
index 814058d0f167..f38cc6092c5a 100644
--- a/include/net/netfilter/nf_queue.h
+++ b/include/net/netfilter/nf_queue.h
@@ -32,7 +32,7 @@ void nf_register_queue_handler(struct net *net, const struct nf_queue_handler *qh);
 void nf_unregister_queue_handler(struct net *net);
 void nf_reinject(struct nf_queue_entry *entry, unsigned int verdict);

-void nf_queue_entry_get_refs(struct nf_queue_entry *entry);
+bool nf_queue_entry_get_refs(struct nf_queue_entry *entry);
 void nf_queue_entry_release_refs(struct nf_queue_entry *entry);

 static inline void init_hashrandom(u32 *jhash_initval)
diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c
index 59340b3ef7ef..dbc45165c533 100644
--- a/net/netfilter/nf_queue.c
+++ b/net/netfilter/nf_queue.c
@@ -80,10 +80,13 @@ void nf_queue_entry_release_refs(struct nf_queue_entry *entry)
 EXPORT_SYMBOL_GPL(nf_queue_entry_release_refs);

 /* Bump dev refs so they don't vanish while packet is out */
-void nf_queue_entry_get_refs(struct nf_queue_entry *entry)
+bool nf_queue_entry_get_refs(struct nf_queue_entry *entry)
 {
 	struct nf_hook_state *state = &entry->state;

+	if (state->sk && !refcount_inc_not_zero(&state->sk->sk_refcnt))
+		return false;
+
 	if (state->in)
 		dev_hold(state->in);
 	if (state->out)
@@ -102,6 +105,7 @@ void nf_queue_entry_get_refs(struct nf_queue_entry *entry)
 			dev_hold(physdev);
 	}
 #endif
+	return true;
 }
 EXPORT_SYMBOL_GPL(nf_queue_entry_get_refs);

@@ -159,7 +163,11 @@ static int __nf_queue(struct sk_buff *skb, const struct nf_hook_state *state,
 		.size	= sizeof(*entry) + afinfo->route_key_size,
 	};

-	nf_queue_entry_get_refs(entry);
+	if (!nf_queue_entry_get_refs(entry)) {
+		kfree(entry);
+		return -ENOTCONN;
+	}
+
 	afinfo->saveroute(skb, entry);
 	status = qh->outfn(entry, queuenum);
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 48ed30e1b405..abad8bf6fa38 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -693,9 +693,15 @@ static struct nf_queue_entry *
 nf_queue_entry_dup(struct nf_queue_entry *e)
 {
 	struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC);
-	if (entry)
-		nf_queue_entry_get_refs(entry);
-	return entry;
+
+	if (!entry)
+		return NULL;
+
+	if (nf_queue_entry_get_refs(entry))
+		return entry;
+
+	kfree(entry);
+	return NULL;
 }

 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
[Difference between Commits]

50,57c57,62
< 	dev_hold(state->in);
< 	dev_hold(state->out);
< -	if (state->sk)
< -		sock_hold(state->sk);
<
< #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
< 	dev_hold(entry->physin);
< 	dev_hold(entry->physout);
---
> 	if (state->in)
> 		dev_hold(state->in);
> 	if (state->out)
> @@ -102,6 +105,7 @@ void nf_queue_entry_get_refs(struct nf_queue_entry *entry)
> 			dev_hold(physdev);
> 	}
The sock_hold() logic still remains in the backport commit, so the socket reference count is incremented twice (once by refcount_inc_not_zero() and once by sock_hold()), which results in a memory leak.
The fix aligns the logic with the upstream commit by removing the redundant sock_hold() call.
[Test Plan]
1. Prepare a VM and run a TCP server on host.
2. Enter the VM and set up an iptables rule.
iptables -I OUTPUT -p tcp --dst <-TCP_SERVER_IP-> -j NFQUEUE --queue-num=1 --queue-bypass
3. Run an nfnetlink client (it should listen on NF queue 1) on the VM.
4. Keep connecting to the TCP server from the VM.
while true; do netcat <-TCP_SERVER_IP-> 8080; done
5. The VM's TCP slab objects will accumulate until the VM eventually encounters an OOM situation.
cat /proc/slabinfo | grep TCP
[Where problems could occur]
The fix just aligns the logic with the upstream commit, so the regression can be considered as low.