All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH net-next v2 0/3] Add support for netnamespace filtering in drop monitor
@ 2022-11-23 14:28 Nikolay Borisov
  2022-11-23 14:28 ` [PATCH net-next v2 1/3] drop_monitor: Implement namespace filtering/reporting for software drops Nikolay Borisov
                   ` (2 more replies)
  0 siblings, 3 replies; 13+ messages in thread
From: Nikolay Borisov @ 2022-11-23 14:28 UTC (permalink / raw)
  To: nhorman; +Cc: davem, kuba, pabeni, netdev, kernel, Nikolay Borisov

This series adds support for conveying as well as filtering based on the the
id of the net namespace where a particular event originated. This is especially
useful when dealing with systems hosting 10s or 100s of containers.

Currently software as well as devlink-originated drops are supported. The output
would look like the following:

11 drops at location 0xffffffffad8cd0c3 [software] [ns: 4026532485]
11 drops at location 0xffffffffad8cd0c3 [software] [ns: 4026532513]

Changes in v2:
 * Fixed the inadvertent uapi breakage by appending the newly added netlink
 attributes. All tests are now passing.


Nikolay Borisov (3):
  drop_monitor: Implement namespace filtering/reporting for software
    drops
  drop_monitor: Add namespace filtering/reporting for hardware drops
  selftests: net: Add drop monitor tests for namespace filtering
    functionality

 include/uapi/linux/net_dropmon.h              |   3 +
 net/core/drop_monitor.c                       |  64 ++++++++-
 .../selftests/net/drop_monitor_tests.sh       | 127 +++++++++++++++---
 3 files changed, 171 insertions(+), 23 deletions(-)

--
2.34.1


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH net-next v2 1/3] drop_monitor: Implement namespace filtering/reporting for software drops
  2022-11-23 14:28 [PATCH net-next v2 0/3] Add support for netnamespace filtering in drop monitor Nikolay Borisov
@ 2022-11-23 14:28 ` Nikolay Borisov
  2022-11-23 15:16   ` Ido Schimmel
  2022-11-23 15:33   ` Alexander Lobakin
  2022-11-23 14:28 ` [PATCH net-next v2 2/3] drop_monitor: Add namespace filtering/reporting for hardware drops Nikolay Borisov
  2022-11-23 14:28 ` [PATCH net-next v2 3/3] selftests: net: Add drop monitor tests for namespace filtering functionality Nikolay Borisov
  2 siblings, 2 replies; 13+ messages in thread
From: Nikolay Borisov @ 2022-11-23 14:28 UTC (permalink / raw)
  To: nhorman; +Cc: davem, kuba, pabeni, netdev, kernel, Nikolay Borisov

On hosts running multiple containers it's helpful to be able to see
in which net namespace a particular drop occured. Additionally, it's
also useful to limit drop point filtering to a single namespace,
especially for hosts which are dropping skb's at a high rate.

Signed-off-by: Nikolay Borisov <nikolay.borisov@virtuozzo.com>
---
 include/uapi/linux/net_dropmon.h |  2 ++
 net/core/drop_monitor.c          | 36 ++++++++++++++++++++++++++++++--
 2 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/net_dropmon.h b/include/uapi/linux/net_dropmon.h
index 84f622a66a7a..81eb2bd184e8 100644
--- a/include/uapi/linux/net_dropmon.h
+++ b/include/uapi/linux/net_dropmon.h
@@ -8,6 +8,7 @@
 struct net_dm_drop_point {
 	__u8 pc[8];
 	__u32 count;
+	__u32 ns_id;
 };
 
 #define is_drop_point_hw(x) do {\
@@ -94,6 +95,7 @@ enum net_dm_attr {
 	NET_DM_ATTR_HW_DROPS,			/* flag */
 	NET_DM_ATTR_FLOW_ACTION_COOKIE,		/* binary */
 	NET_DM_ATTR_REASON,			/* string */
+	NET_DM_ATTR_NS,				/* u32 */
 
 	__NET_DM_ATTR_MAX,
 	NET_DM_ATTR_MAX = __NET_DM_ATTR_MAX - 1
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index f084a4a6b7ab..680f54d21f38 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -103,6 +103,7 @@ static unsigned long dm_hw_check_delta = 2*HZ;
 static enum net_dm_alert_mode net_dm_alert_mode = NET_DM_ALERT_MODE_SUMMARY;
 static u32 net_dm_trunc_len;
 static u32 net_dm_queue_len = 1000;
+static u32 net_dm_ns;
 
 struct net_dm_alert_ops {
 	void (*kfree_skb_probe)(void *ignore, struct sk_buff *skb,
@@ -210,6 +211,19 @@ static void sched_send_work(struct timer_list *t)
 	schedule_work(&data->dm_alert_work);
 }
 
+static bool drop_point_matches(struct net_dm_drop_point *point, void *location,
+			       unsigned long ns_id)
+{
+	if (net_dm_ns && point->ns_id == net_dm_ns &&
+	    !memcmp(&location, &point->pc, sizeof(void *)))
+		return true;
+	else if (net_dm_ns == 0 && point->ns_id == ns_id &&
+		 !memcmp(&location, &point->pc, sizeof(void *)))
+		return true;
+	else
+		return false;
+}
+
 static void trace_drop_common(struct sk_buff *skb, void *location)
 {
 	struct net_dm_alert_msg *msg;
@@ -219,7 +233,11 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
 	int i;
 	struct sk_buff *dskb;
 	struct per_cpu_dm_data *data;
-	unsigned long flags;
+	unsigned long flags, ns_id = 0;
+
+	if (skb->dev && net_dm_ns &&
+	    dev_net(skb->dev)->ns.inum != net_dm_ns)
+		return;
 
 	local_irq_save(flags);
 	data = this_cpu_ptr(&dm_cpu_data);
@@ -233,8 +251,10 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
 	nla = genlmsg_data(nlmsg_data(nlh));
 	msg = nla_data(nla);
 	point = msg->points;
+	if (skb->dev)
+		ns_id = dev_net(skb->dev)->ns.inum;
 	for (i = 0; i < msg->entries; i++) {
-		if (!memcmp(&location, &point->pc, sizeof(void *))) {
+		if (drop_point_matches(point, location, ns_id)) {
 			point->count++;
 			goto out;
 		}
@@ -249,6 +269,7 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
 	nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point));
 	memcpy(point->pc, &location, sizeof(void *));
 	point->count = 1;
+	point->ns_id = ns_id;
 	msg->entries++;
 
 	if (!timer_pending(&data->send_timer)) {
@@ -1283,6 +1304,14 @@ static void net_dm_trunc_len_set(struct genl_info *info)
 	net_dm_trunc_len = nla_get_u32(info->attrs[NET_DM_ATTR_TRUNC_LEN]);
 }
 
+static void net_dm_ns_set(struct genl_info *info)
+{
+	if (!info->attrs[NET_DM_ATTR_NS])
+		return;
+
+	net_dm_ns = nla_get_u32(info->attrs[NET_DM_ATTR_NS]);
+}
+
 static void net_dm_queue_len_set(struct genl_info *info)
 {
 	if (!info->attrs[NET_DM_ATTR_QUEUE_LEN])
@@ -1310,6 +1339,8 @@ static int net_dm_cmd_config(struct sk_buff *skb,
 
 	net_dm_queue_len_set(info);
 
+	net_dm_ns_set(info);
+
 	return 0;
 }
 
@@ -1589,6 +1620,7 @@ static const struct nla_policy net_dm_nl_policy[NET_DM_ATTR_MAX + 1] = {
 	[NET_DM_ATTR_ALERT_MODE] = { .type = NLA_U8 },
 	[NET_DM_ATTR_TRUNC_LEN] = { .type = NLA_U32 },
 	[NET_DM_ATTR_QUEUE_LEN] = { .type = NLA_U32 },
+	[NET_DM_ATTR_NS]	= { .type = NLA_U32 },
 	[NET_DM_ATTR_SW_DROPS]	= {. type = NLA_FLAG },
 	[NET_DM_ATTR_HW_DROPS]	= {. type = NLA_FLAG },
 };
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH net-next v2 2/3] drop_monitor: Add namespace filtering/reporting for hardware drops
  2022-11-23 14:28 [PATCH net-next v2 0/3] Add support for netnamespace filtering in drop monitor Nikolay Borisov
  2022-11-23 14:28 ` [PATCH net-next v2 1/3] drop_monitor: Implement namespace filtering/reporting for software drops Nikolay Borisov
@ 2022-11-23 14:28 ` Nikolay Borisov
  2022-11-23 15:37   ` Alexander Lobakin
  2022-11-23 14:28 ` [PATCH net-next v2 3/3] selftests: net: Add drop monitor tests for namespace filtering functionality Nikolay Borisov
  2 siblings, 1 reply; 13+ messages in thread
From: Nikolay Borisov @ 2022-11-23 14:28 UTC (permalink / raw)
  To: nhorman; +Cc: davem, kuba, pabeni, netdev, kernel, Nikolay Borisov

Add support for filtering and conveying the netnamespace where a
particular drop event occured. This is counterpart to the software
drop events support that was added earlier.

Signed-off-by: Nikolay Borisov <nikolay.borisov@virtuozzo.com>
---
 include/uapi/linux/net_dropmon.h |  1 +
 net/core/drop_monitor.c          | 28 ++++++++++++++++++++++++++--
 2 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/include/uapi/linux/net_dropmon.h b/include/uapi/linux/net_dropmon.h
index 81eb2bd184e8..42d82bc424dc 100644
--- a/include/uapi/linux/net_dropmon.h
+++ b/include/uapi/linux/net_dropmon.h
@@ -96,6 +96,7 @@ enum net_dm_attr {
 	NET_DM_ATTR_FLOW_ACTION_COOKIE,		/* binary */
 	NET_DM_ATTR_REASON,			/* string */
 	NET_DM_ATTR_NS,				/* u32 */
+	NET_DM_ATTR_HW_NS,			/* u32 */
 
 	__NET_DM_ATTR_MAX,
 	NET_DM_ATTR_MAX = __NET_DM_ATTR_MAX - 1
diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
index 680f54d21f38..8e5daa6fef56 100644
--- a/net/core/drop_monitor.c
+++ b/net/core/drop_monitor.c
@@ -64,6 +64,7 @@ struct net_dm_stats {
 struct net_dm_hw_entry {
 	char trap_name[NET_DM_MAX_HW_TRAP_NAME_LEN];
 	u32 count;
+	u32 ns_id;
 };
 
 struct net_dm_hw_entries {
@@ -355,6 +356,9 @@ static int net_dm_hw_entry_put(struct sk_buff *msg,
 	if (nla_put_u32(msg, NET_DM_ATTR_HW_TRAP_COUNT, hw_entry->count))
 		goto nla_put_failure;
 
+	if (nla_put_u32(msg, NET_DM_ATTR_HW_NS, hw_entry->ns_id))
+		goto nla_put_failure;
+
 	nla_nest_end(msg, attr);
 
 	return 0;
@@ -452,6 +456,21 @@ static void net_dm_hw_summary_work(struct work_struct *work)
 	kfree(hw_entries);
 }
 
+static bool hw_entry_matches(struct net_dm_hw_entry *entry,
+			     const char *trap_name, unsigned long ns_id)
+{
+	if (net_dm_ns && entry->ns_id == net_dm_ns &&
+	    !strncmp(entry->trap_name, trap_name,
+		     NET_DM_MAX_HW_TRAP_NAME_LEN - 1))
+		return true;
+	else if (net_dm_ns == 0 && entry->ns_id == ns_id &&
+		 !strncmp(entry->trap_name, trap_name,
+			  NET_DM_MAX_HW_TRAP_NAME_LEN - 1))
+		return true;
+	else
+		return false;
+}
+
 static void
 net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink,
 			     struct sk_buff *skb,
@@ -461,11 +480,15 @@ net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink,
 	struct net_dm_hw_entry *hw_entry;
 	struct per_cpu_dm_data *hw_data;
 	unsigned long flags;
+	unsigned long ns_id;
 	int i;
 
 	if (metadata->trap_type == DEVLINK_TRAP_TYPE_CONTROL)
 		return;
 
+	if (net_dm_ns && dev_net(skb->dev)->ns.inum != net_dm_ns)
+		return;
+
 	hw_data = this_cpu_ptr(&dm_hw_cpu_data);
 	spin_lock_irqsave(&hw_data->lock, flags);
 	hw_entries = hw_data->hw_entries;
@@ -473,10 +496,10 @@ net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink,
 	if (!hw_entries)
 		goto out;
 
+	ns_id = dev_net(skb->dev)->ns.inum;
 	for (i = 0; i < hw_entries->num_entries; i++) {
 		hw_entry = &hw_entries->entries[i];
-		if (!strncmp(hw_entry->trap_name, metadata->trap_name,
-			     NET_DM_MAX_HW_TRAP_NAME_LEN - 1)) {
+		if (hw_entry_matches(hw_entry, metadata->trap_name, ns_id)) {
 			hw_entry->count++;
 			goto out;
 		}
@@ -489,6 +512,7 @@ net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink,
 		NET_DM_MAX_HW_TRAP_NAME_LEN - 1);
 	hw_entry->count = 1;
 	hw_entries->num_entries++;
+	hw_entry->ns_id = ns_id;
 
 	if (!timer_pending(&hw_data->send_timer)) {
 		hw_data->send_timer.expires = jiffies + dm_delay * HZ;
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* [PATCH net-next v2 3/3] selftests: net: Add drop monitor tests for namespace filtering functionality
  2022-11-23 14:28 [PATCH net-next v2 0/3] Add support for netnamespace filtering in drop monitor Nikolay Borisov
  2022-11-23 14:28 ` [PATCH net-next v2 1/3] drop_monitor: Implement namespace filtering/reporting for software drops Nikolay Borisov
  2022-11-23 14:28 ` [PATCH net-next v2 2/3] drop_monitor: Add namespace filtering/reporting for hardware drops Nikolay Borisov
@ 2022-11-23 14:28 ` Nikolay Borisov
  2 siblings, 0 replies; 13+ messages in thread
From: Nikolay Borisov @ 2022-11-23 14:28 UTC (permalink / raw)
  To: nhorman; +Cc: davem, kuba, pabeni, netdev, kernel, Nikolay Borisov

Extend the current set of tests with new ones covering the updated
functionality allowing to filter events based on the net namespace they
originated from. The new set of tests:

Software drops test
    TEST: No filtering                                                  [ OK ]
    TEST: Filter everything                                             [ OK ]
    TEST: NS2 packet drop filtered                                      [ OK ]
    TEST: Filtering reset                                               [ OK ]
    TEST: Filtering disabled                                            [ OK ]

Hardware drops test
    TEST: No filtering                                                  [ OK ]
    TEST: Filter everything                                             [ OK ]
    TEST: NS2 packet drop filtered                                      [ OK ]
    TEST: Filtering reset                                               [ OK ]
    TEST: Filtering disabled                                            [ OK ]

Signed-off-by: Nikolay Borisov <nikolay.borisov@virtuozzo.com>
---
 .../selftests/net/drop_monitor_tests.sh       | 127 +++++++++++++++---
 1 file changed, 108 insertions(+), 19 deletions(-)

diff --git a/tools/testing/selftests/net/drop_monitor_tests.sh b/tools/testing/selftests/net/drop_monitor_tests.sh
index b7650e30d18b..776aabc036f1 100755
--- a/tools/testing/selftests/net/drop_monitor_tests.sh
+++ b/tools/testing/selftests/net/drop_monitor_tests.sh
@@ -13,14 +13,13 @@ TESTS="
 	hw_drops
 "
 
-IP="ip -netns ns1"
-TC="tc -netns ns1"
-DEVLINK="devlink -N ns1"
-NS_EXEC="ip netns exec ns1"
 NETDEVSIM_PATH=/sys/bus/netdevsim/
-DEV_ADDR=1337
-DEV=netdevsim${DEV_ADDR}
-DEVLINK_DEV=netdevsim/${DEV}
+DEV1_ADDR=1336
+DEV2_ADDR=1337
+DEV1=netdevsim${DEV1_ADDR}
+DEV2=netdevsim${DEV2_ADDR}
+DEVLINK_DEV1=netdevsim/${DEV1}
+DEVLINK_DEV2=netdevsim/${DEV2}
 
 log_test()
 {
@@ -44,20 +43,29 @@ setup()
 
 	set -e
 	ip netns add ns1
-	$IP link add dummy10 up type dummy
-
-	$NS_EXEC echo "$DEV_ADDR 1" > ${NETDEVSIM_PATH}/new_device
+	ip netns add ns2
+	NS1INUM=$(findmnt -t nsfs | grep -m1 ns1 | sed -rn 's/.*net:\[([[:digit:]]+)\].*/\1/p')
+	NS2INUM=$(findmnt -t nsfs | grep -m1 ns2 | sed -rn 's/.*net:\[([[:digit:]]+)\].*/\1/p')
+	ip -netns ns1 link add dummy10 up type dummy
+	ip -netns ns2 link add dummy10 up type dummy
+
+	ip netns exec ns1 echo "$DEV1_ADDR 1" > ${NETDEVSIM_PATH}/new_device
+	ip netns exec ns2 echo "$DEV2_ADDR 1" > ${NETDEVSIM_PATH}/new_device
 	udevadm settle
-	local netdev=$($NS_EXEC ls ${NETDEVSIM_PATH}/devices/${DEV}/net/)
-	$IP link set dev $netdev up
+	local netdev=$(ip netns exec ns1 ls ${NETDEVSIM_PATH}/devices/${DEV1}/net/)
+	ip -netns ns1 link set dev $netdev up
+	netdev=$(ip netns exec ns2 ls ${NETDEVSIM_PATH}/devices/${DEV2}/net/)
+	ip -netns ns2 link set dev $netdev up
 
 	set +e
 }
 
 cleanup()
 {
-	$NS_EXEC echo "$DEV_ADDR" > ${NETDEVSIM_PATH}/del_device
+	ip netns exec ns1 echo "$DEV1_ADDR" > ${NETDEVSIM_PATH}/del_device
+	ip netns exec ns2 echo "$DEV2_ADDR" > ${NETDEVSIM_PATH}/del_device
 	ip netns del ns1
+	ip netns del ns2
 }
 
 sw_drops_test()
@@ -69,13 +77,53 @@ sw_drops_test()
 
 	local dir=$(mktemp -d)
 
-	$TC qdisc add dev dummy10 clsact
-	$TC filter add dev dummy10 egress pref 1 handle 101 proto ip \
+	tc -netns ns1 qdisc add dev dummy10 clsact
+	tc -netns ns2 qdisc add dev dummy10 clsact
+	tc -netns ns1 filter add dev dummy10 egress pref 1 handle 101 proto ip \
+		flower dst_ip 192.0.2.10 action drop
+	tc -netns ns2 filter add dev dummy10 egress pref 1 handle 101 proto ip \
 		flower dst_ip 192.0.2.10 action drop
 
-	$NS_EXEC mausezahn dummy10 -a 00:11:22:33:44:55 -b 00:aa:bb:cc:dd:ee \
+	ip netns exec ns1 mausezahn dummy10 -a 00:11:22:33:44:55 -b 00:aa:bb:cc:dd:ee \
 		-A 192.0.2.1 -B 192.0.2.10 -t udp sp=12345,dp=54321 -c 0 -q \
 		-d 100msec &
+	ip netns exec ns2 mausezahn dummy10 -a 00:11:22:33:44:55 -b 00:aa:bb:cc:dd:ee \
+		-A 192.0.2.1 -B 192.0.2.10 -t udp sp=12345,dp=54321 -c 0 -q \
+		-d 100msec &
+
+	# Test that if we set to 0 we get all packets
+	echo -e  "set alertmode summary\nset ns 0\nstart" | timeout -s 2 5 dropwatch &> $dir/output.txt
+	grep -q $NS1INUM $dir/output.txt
+	local ret1=$?
+	grep -q $NS2INUM $dir/output.txt
+	local ret2=$?
+	(( ret1 == 0 && ret2 == 0 ))
+	log_test $? 0 "No filtering"
+
+	# Set filter to a non-existant ns and we should see nothing
+	echo -e  "set alertmode summary\nset ns -1\nstart" | timeout -s 2 5 dropwatch &> $dir/output.txt
+	grep -q drops $dir/output.txt
+	log_test $? 1 "Filter everything"
+
+	# Set filter to NS1 so we shouldn't see NS2
+	echo -e  "set ns $NS1INUM\nstart" | timeout -s 2 5 dropwatch &> $dir/output.txt
+	grep -q $NS2INUM $dir/output.txt
+	log_test $? 1 "NS2 packet drop filtered"
+
+	# Return filter to 0 and ensure everything is fine
+	echo -e  "set ns 0\nstart" | timeout -s 2 5 dropwatch &> $dir/output.txt
+	grep -q $NS1INUM $dir/output.txt
+	ret1=$?
+	grep -q $NS2INUM $dir/output.txt
+	ret2=$?
+	(( ret1 == 0 && ret2 == 0 ))
+	log_test $? 0 "Filtering reset"
+
+	# disable ns capability at all
+	echo -e  "set ns off\nstart" | timeout -s 2 5 dropwatch &> $dir/output.txt
+	grep -q ns: $dir/output.txt
+	log_test $? 1 "Filtering disabled"
+
 	timeout 5 dwdump -o sw -w ${dir}/packets.pcap
 	(( $(tshark -r ${dir}/packets.pcap \
 		-Y 'ip.dst == 192.0.2.10' 2> /dev/null | wc -l) != 0))
@@ -83,7 +131,8 @@ sw_drops_test()
 
 	rm ${dir}/packets.pcap
 
-	{ kill %% && wait %%; } 2>/dev/null
+	{ kill $(jobs -p) && wait $(jobs -p); } 2> /dev/null
+
 	timeout 5 dwdump -o sw -w ${dir}/packets.pcap
 	(( $(tshark -r ${dir}/packets.pcap \
 		-Y 'ip.dst == 192.0.2.10' 2> /dev/null | wc -l) == 0))
@@ -103,16 +152,56 @@ hw_drops_test()
 
 	local dir=$(mktemp -d)
 
-	$DEVLINK trap set $DEVLINK_DEV trap blackhole_route action trap
+	devlink -N ns1 trap set $DEVLINK_DEV1 trap blackhole_route action trap
+	devlink -N ns2 trap set $DEVLINK_DEV2 trap blackhole_route action trap
+
+	# Test that if we set to 0 we get all packets
+	echo -e  "set alertmode summary\nset ns 0\nset hw true\nstart" \
+		| timeout -s 2 5 dropwatch &> $dir/output.txt
+	#echo -e  "set hw true\nstart" | timeout -s 2 5 dropwatch &> $dir/output.txt
+	grep -Eq ".*blackhole_route \[hardware\] \[ns: $NS1INUM\]" $dir/output.txt
+	local ret1=$?
+	grep -Eq ".*blackhole_route \[hardware\] \[ns: $NS2INUM\]" $dir/output.txt
+	local ret2=$?
+	(( ret1 == 0 && ret2 == 0 ))
+	log_test $? 0 "No filtering"
+
+	# Set filter to a non-existant ns and we should see nothing
+	echo -e  "set ns -1\nset hw true\nstart" | timeout -s 2 5 dropwatch &> $dir/output.txt
+	grep -q "\[hardware\]" $dir/output.txt
+	log_test $? 1 "Filter everything"
+
+	# Set filter to NS1 so we shouldn't see NS2
+	echo -e  "set ns $NS1INUM\nset hw true\nstart" | timeout -s 2 5 dropwatch &> $dir/output.txt
+	grep -q $NS2INUM $dir/output.txt
+	log_test $? 1 "NS2 packet drop filtered"
+
+	# Return filter to 0 and ensure everything is fine
+	echo -e  "set ns 0\nset hw true\nstart" | timeout -s 2 5 dropwatch &> $dir/output.txt
+	grep -Eq ".*blackhole_route \[hardware\] \[ns: $NS1INUM\]" $dir/output.txt
+	local ret1=$?
+	grep -Eq ".*blackhole_route \[hardware\] \[ns: $NS2INUM\]" $dir/output.txt
+	local ret2=$?
+	(( ret1 == 0 && ret2 == 0 ))
+	log_test $? 0 "Filtering reset"
+
+	# disable ns capability at all
+	echo -e  "set ns off\nset hw true\nstart" | timeout -s 2 5 dropwatch &> $dir/output.txt
+	grep -q ns: $dir/output.txt
+	log_test $? 1 "Filtering disabled"
+
 	timeout 5 dwdump -o hw -w ${dir}/packets.pcap
 	(( $(tshark -r ${dir}/packets.pcap \
 		-Y 'net_dm.hw_trap_name== blackhole_route' 2> /dev/null \
 		| wc -l) != 0))
 	log_test $? 0 "Capturing active hardware drops"
 
+	cp ${dir}/packets.pcap /root/host/
 	rm ${dir}/packets.pcap
 
-	$DEVLINK trap set $DEVLINK_DEV trap blackhole_route action drop
+	devlink -N ns1 trap set $DEVLINK_DEV1 trap blackhole_route action drop
+	devlink -N ns2 trap set $DEVLINK_DEV2 trap blackhole_route action drop
+
 	timeout 5 dwdump -o hw -w ${dir}/packets.pcap
 	(( $(tshark -r ${dir}/packets.pcap \
 		-Y 'net_dm.hw_trap_name== blackhole_route' 2> /dev/null \
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 13+ messages in thread

* Re: [PATCH net-next v2 1/3] drop_monitor: Implement namespace filtering/reporting for software drops
  2022-11-23 14:28 ` [PATCH net-next v2 1/3] drop_monitor: Implement namespace filtering/reporting for software drops Nikolay Borisov
@ 2022-11-23 15:16   ` Ido Schimmel
  2022-11-23 15:21     ` nb
  2022-11-23 15:33   ` Alexander Lobakin
  1 sibling, 1 reply; 13+ messages in thread
From: Ido Schimmel @ 2022-11-23 15:16 UTC (permalink / raw)
  To: Nikolay Borisov; +Cc: nhorman, davem, kuba, pabeni, netdev, kernel

On Wed, Nov 23, 2022 at 04:28:15PM +0200, Nikolay Borisov wrote:
>  static void trace_drop_common(struct sk_buff *skb, void *location)
>  {
>  	struct net_dm_alert_msg *msg;
> @@ -219,7 +233,11 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
>  	int i;
>  	struct sk_buff *dskb;
>  	struct per_cpu_dm_data *data;
> -	unsigned long flags;
> +	unsigned long flags, ns_id = 0;
> +
> +	if (skb->dev && net_dm_ns &&
> +	    dev_net(skb->dev)->ns.inum != net_dm_ns)

I don't think this is going to work, unfortunately. 'skb->dev' is in a
union with 'dev_scratch' so 'skb->dev' does not necessarily point to a
valid netdev at all times. It can explode when dev_net() tries to
dereference it.

__skb_flow_dissect() is doing something similar, but I believe there the
code paths were audited to make sure it is safe.

Did you consider achieving this functionality with a BPF program
attached to skb::kfree_skb tracepoint? I believe BPF programs are run
with page faults disabled, so it should be safe to attempt this there.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net-next v2 1/3] drop_monitor: Implement namespace filtering/reporting for software drops
  2022-11-23 15:16   ` Ido Schimmel
@ 2022-11-23 15:21     ` nb
  2022-11-23 18:10       ` Ido Schimmel
  0 siblings, 1 reply; 13+ messages in thread
From: nb @ 2022-11-23 15:21 UTC (permalink / raw)
  To: Ido Schimmel; +Cc: nhorman, davem, kuba, pabeni, netdev, kernel



On 23.11.22 г. 17:16 ч., Ido Schimmel wrote:
> On Wed, Nov 23, 2022 at 04:28:15PM +0200, Nikolay Borisov wrote:
>>   static void trace_drop_common(struct sk_buff *skb, void *location)
>>   {
>>   	struct net_dm_alert_msg *msg;
>> @@ -219,7 +233,11 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
>>   	int i;
>>   	struct sk_buff *dskb;
>>   	struct per_cpu_dm_data *data;
>> -	unsigned long flags;
>> +	unsigned long flags, ns_id = 0;
>> +
>> +	if (skb->dev && net_dm_ns &&
>> +	    dev_net(skb->dev)->ns.inum != net_dm_ns)
> 
> I don't think this is going to work, unfortunately. 'skb->dev' is in a
> union with 'dev_scratch' so 'skb->dev' does not necessarily point to a
> valid netdev at all times. It can explode when dev_net() tries to
> dereference it.
> 
> __skb_flow_dissect() is doing something similar, but I believe there the
> code paths were audited to make sure it is safe.
> 
> Did you consider achieving this functionality with a BPF program
> attached to skb::kfree_skb tracepoint? I believe BPF programs are run
> with page faults disabled, so it should be safe to attempt this there.

How would that be different than the trace_drop_common which is called 
as part of the trace_kfree_skb, as it's really passed as trace point 
probe via:

net_dm_trace_on_set->register_trace_kfree_skb(trace_drop_common)




^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net-next v2 1/3] drop_monitor: Implement namespace filtering/reporting for software drops
  2022-11-23 14:28 ` [PATCH net-next v2 1/3] drop_monitor: Implement namespace filtering/reporting for software drops Nikolay Borisov
  2022-11-23 15:16   ` Ido Schimmel
@ 2022-11-23 15:33   ` Alexander Lobakin
  2022-11-23 16:04     ` nb
  1 sibling, 1 reply; 13+ messages in thread
From: Alexander Lobakin @ 2022-11-23 15:33 UTC (permalink / raw)
  To: Nikolay Borisov
  Cc: Alexander Lobakin, nhorman, davem, kuba, pabeni, netdev, kernel

From: Nikolay Borisov <nikolay.borisov@virtuozzo.com>
Date: Wed, 23 Nov 2022 16:28:15 +0200

> On hosts running multiple containers it's helpful to be able to see
> in which net namespace a particular drop occured. Additionally, it's
> also useful to limit drop point filtering to a single namespace,
> especially for hosts which are dropping skb's at a high rate.
> 
> Signed-off-by: Nikolay Borisov <nikolay.borisov@virtuozzo.com>
> ---
>  include/uapi/linux/net_dropmon.h |  2 ++
>  net/core/drop_monitor.c          | 36 ++++++++++++++++++++++++++++++--
>  2 files changed, 36 insertions(+), 2 deletions(-)
> 
> diff --git a/include/uapi/linux/net_dropmon.h b/include/uapi/linux/net_dropmon.h
> index 84f622a66a7a..81eb2bd184e8 100644
> --- a/include/uapi/linux/net_dropmon.h
> +++ b/include/uapi/linux/net_dropmon.h
> @@ -8,6 +8,7 @@
>  struct net_dm_drop_point {
>  	__u8 pc[8];
>  	__u32 count;
> +	__u32 ns_id;
>  };
>  
>  #define is_drop_point_hw(x) do {\
> @@ -94,6 +95,7 @@ enum net_dm_attr {
>  	NET_DM_ATTR_HW_DROPS,			/* flag */
>  	NET_DM_ATTR_FLOW_ACTION_COOKIE,		/* binary */
>  	NET_DM_ATTR_REASON,			/* string */
> +	NET_DM_ATTR_NS,				/* u32 */
>  
>  	__NET_DM_ATTR_MAX,
>  	NET_DM_ATTR_MAX = __NET_DM_ATTR_MAX - 1
> diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c
> index f084a4a6b7ab..680f54d21f38 100644
> --- a/net/core/drop_monitor.c
> +++ b/net/core/drop_monitor.c
> @@ -103,6 +103,7 @@ static unsigned long dm_hw_check_delta = 2*HZ;
>  static enum net_dm_alert_mode net_dm_alert_mode = NET_DM_ALERT_MODE_SUMMARY;
>  static u32 net_dm_trunc_len;
>  static u32 net_dm_queue_len = 1000;
> +static u32 net_dm_ns;
>  
>  struct net_dm_alert_ops {
>  	void (*kfree_skb_probe)(void *ignore, struct sk_buff *skb,
> @@ -210,6 +211,19 @@ static void sched_send_work(struct timer_list *t)
>  	schedule_work(&data->dm_alert_work);
>  }
>  
> +static bool drop_point_matches(struct net_dm_drop_point *point, void *location,
> +			       unsigned long ns_id)
> +{
> +	if (net_dm_ns && point->ns_id == net_dm_ns &&
> +	    !memcmp(&location, &point->pc, sizeof(void *)))

                                           ^^^^^^^^^^^^^^

Nit: sizeof(location)?

> +		return true;
> +	else if (net_dm_ns == 0 && point->ns_id == ns_id &&

                 ^^^^^^^^^^^^^^

Just `!net_dm_ns` is preferred.

> +		 !memcmp(&location, &point->pc, sizeof(void *)))
> +		return true;
> +	else
> +		return false;

I think the dup of the last condition can be avoided with oring
`(net_dm_ns && ...) || (!net_dm_ns && ...)`. Then, true/false
becomes redundant:

	return ((net_dm_ns && point->ns_id == net_dm_ns) ||
		(!net_dm_ns && point->ns_id == ns_id)) &&
	       !memcmp(&location, &point->pc, sizeof(location));

> +}
> +
>  static void trace_drop_common(struct sk_buff *skb, void *location)
>  {
>  	struct net_dm_alert_msg *msg;
> @@ -219,7 +233,11 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
>  	int i;
>  	struct sk_buff *dskb;
>  	struct per_cpu_dm_data *data;
> -	unsigned long flags;
> +	unsigned long flags, ns_id = 0;

With that line extension, it breaks RCT style. Yeah, it's already
broken a line above, but let's not introduce more style issues %)

> +
> +	if (skb->dev && net_dm_ns &&

It's faster to test net_dm_ns at first and then skb->dev. The former
is static on the BSS and the latter is dynamic. Plus the former will
be zeroed much more often than the latter.

> +	    dev_net(skb->dev)->ns.inum != net_dm_ns)
> +		return;
>  
>  	local_irq_save(flags);
>  	data = this_cpu_ptr(&dm_cpu_data);
> @@ -233,8 +251,10 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
>  	nla = genlmsg_data(nlmsg_data(nlh));
>  	msg = nla_data(nla);
>  	point = msg->points;
> +	if (skb->dev)
> +		ns_id = dev_net(skb->dev)->ns.inum;
>  	for (i = 0; i < msg->entries; i++) {
> -		if (!memcmp(&location, &point->pc, sizeof(void *))) {
> +		if (drop_point_matches(point, location, ns_id)) {
>  			point->count++;
>  			goto out;
>  		}
> @@ -249,6 +269,7 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
>  	nla->nla_len += NLA_ALIGN(sizeof(struct net_dm_drop_point));
>  	memcpy(point->pc, &location, sizeof(void *));
>  	point->count = 1;
> +	point->ns_id = ns_id;
>  	msg->entries++;
>  
>  	if (!timer_pending(&data->send_timer)) {
> @@ -1283,6 +1304,14 @@ static void net_dm_trunc_len_set(struct genl_info *info)
>  	net_dm_trunc_len = nla_get_u32(info->attrs[NET_DM_ATTR_TRUNC_LEN]);
>  }
>  
> +static void net_dm_ns_set(struct genl_info *info)
> +{
> +	if (!info->attrs[NET_DM_ATTR_NS])
> +		return;
> +
> +	net_dm_ns = nla_get_u32(info->attrs[NET_DM_ATTR_NS]);

So, if I got it correctly, it can limit the scope to only one netns.
Isn't that not flexible enough? What about a white- or black- list
of NSes to filter or filter-out?

> +}
> +
>  static void net_dm_queue_len_set(struct genl_info *info)
>  {
>  	if (!info->attrs[NET_DM_ATTR_QUEUE_LEN])
> @@ -1310,6 +1339,8 @@ static int net_dm_cmd_config(struct sk_buff *skb,
>  
>  	net_dm_queue_len_set(info);
>  
> +	net_dm_ns_set(info);
> +
>  	return 0;
>  }
>  
> @@ -1589,6 +1620,7 @@ static const struct nla_policy net_dm_nl_policy[NET_DM_ATTR_MAX + 1] = {
>  	[NET_DM_ATTR_ALERT_MODE] = { .type = NLA_U8 },
>  	[NET_DM_ATTR_TRUNC_LEN] = { .type = NLA_U32 },
>  	[NET_DM_ATTR_QUEUE_LEN] = { .type = NLA_U32 },
> +	[NET_DM_ATTR_NS]	= { .type = NLA_U32 },
>  	[NET_DM_ATTR_SW_DROPS]	= {. type = NLA_FLAG },
>  	[NET_DM_ATTR_HW_DROPS]	= {. type = NLA_FLAG },
>  };
> -- 
> 2.34.1

Thanks,
Olek

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net-next v2 2/3] drop_monitor: Add namespace filtering/reporting for hardware drops
  2022-11-23 14:28 ` [PATCH net-next v2 2/3] drop_monitor: Add namespace filtering/reporting for hardware drops Nikolay Borisov
@ 2022-11-23 15:37   ` Alexander Lobakin
  0 siblings, 0 replies; 13+ messages in thread
From: Alexander Lobakin @ 2022-11-23 15:37 UTC (permalink / raw)
  To: Nikolay Borisov; +Cc: Alexander Lobakin, nhorman, davem, kuba, pabeni, netdev

From: Nikolay Borisov <nikolay.borisov@virtuozzo.com>
Date: Wed, 23 Nov 2022 16:28:16 +0200

> Add support for filtering and conveying the netnamespace where a
> particular drop event occured. This is counterpart to the software
> drop events support that was added earlier.
> 
> Signed-off-by: Nikolay Borisov <nikolay.borisov@virtuozzo.com>
> ---
>  include/uapi/linux/net_dropmon.h |  1 +
>  net/core/drop_monitor.c          | 28 ++++++++++++++++++++++++++--
>  2 files changed, 27 insertions(+), 2 deletions(-)

[...]

> @@ -452,6 +456,21 @@ static void net_dm_hw_summary_work(struct work_struct *work)
>  	kfree(hw_entries);
>  }
>  
> +static bool hw_entry_matches(struct net_dm_hw_entry *entry,
> +			     const char *trap_name, unsigned long ns_id)
> +{
> +	if (net_dm_ns && entry->ns_id == net_dm_ns &&
> +	    !strncmp(entry->trap_name, trap_name,
> +		     NET_DM_MAX_HW_TRAP_NAME_LEN - 1))
> +		return true;
> +	else if (net_dm_ns == 0 && entry->ns_id == ns_id &&
> +		 !strncmp(entry->trap_name, trap_name,
> +			  NET_DM_MAX_HW_TRAP_NAME_LEN - 1))
> +		return true;
> +	else
> +		return false;

Same as in my previous mail.

> +}
> +
>  static void
>  net_dm_hw_trap_summary_probe(void *ignore, const struct devlink *devlink,
>  			     struct sk_buff *skb,

[...]

> -- 
> 2.34.1

Thanks,
Olek

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net-next v2 1/3] drop_monitor: Implement namespace filtering/reporting for software drops
  2022-11-23 15:33   ` Alexander Lobakin
@ 2022-11-23 16:04     ` nb
  2022-11-23 17:16       ` Alexander Lobakin
  0 siblings, 1 reply; 13+ messages in thread
From: nb @ 2022-11-23 16:04 UTC (permalink / raw)
  To: Alexander Lobakin; +Cc: nhorman, davem, kuba, pabeni, netdev, kernel



On 23.11.22 г. 17:33 ч., Alexander Lobakin wrote:
> From: Nikolay Borisov <nikolay.borisov@virtuozzo.com>
> Date: Wed, 23 Nov 2022 16:28:15 +0200
> 

<snip>

>> @@ -1283,6 +1304,14 @@ static void net_dm_trunc_len_set(struct genl_info *info)
>>   	net_dm_trunc_len = nla_get_u32(info->attrs[NET_DM_ATTR_TRUNC_LEN]);
>>   }
>>   
>> +static void net_dm_ns_set(struct genl_info *info)
>> +{
>> +	if (!info->attrs[NET_DM_ATTR_NS])
>> +		return;
>> +
>> +	net_dm_ns = nla_get_u32(info->attrs[NET_DM_ATTR_NS]);
> 
> So, if I got it correctly, it can limit the scope to only one netns.
> Isn't that not flexible enough? What about a white- or black- list
> of NSes to filter or filter-out?

Can do, however my current use case is to really pin-point a single 
offending container, but yeah, you are right that a list would be more 
flexible. I would consider doing this provided there are no blockers in 
the code overall. Do you have any idea whether a black/white list would 
be better? This also begs the question whether we'll support some fixed 
amount of ns i.e an array or a list and allow an "infinite" amount of ns 
filtering ...

> 
>> +}
>> +
>>   static void net_dm_queue_len_set(struct genl_info *info)
>>   {
>>   	if (!info->attrs[NET_DM_ATTR_QUEUE_LEN])
>> @@ -1310,6 +1339,8 @@ static int net_dm_cmd_config(struct sk_buff *skb,
>>   
>>   	net_dm_queue_len_set(info);
>>   
>> +	net_dm_ns_set(info);
>> +
>>   	return 0;
>>   }
>>   
>> @@ -1589,6 +1620,7 @@ static const struct nla_policy net_dm_nl_policy[NET_DM_ATTR_MAX + 1] = {
>>   	[NET_DM_ATTR_ALERT_MODE] = { .type = NLA_U8 },
>>   	[NET_DM_ATTR_TRUNC_LEN] = { .type = NLA_U32 },
>>   	[NET_DM_ATTR_QUEUE_LEN] = { .type = NLA_U32 },
>> +	[NET_DM_ATTR_NS]	= { .type = NLA_U32 },
>>   	[NET_DM_ATTR_SW_DROPS]	= {. type = NLA_FLAG },
>>   	[NET_DM_ATTR_HW_DROPS]	= {. type = NLA_FLAG },
>>   };
>> -- 
>> 2.34.1
> 
> Thanks,
> Olek

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net-next v2 1/3] drop_monitor: Implement namespace filtering/reporting for software drops
  2022-11-23 16:04     ` nb
@ 2022-11-23 17:16       ` Alexander Lobakin
  0 siblings, 0 replies; 13+ messages in thread
From: Alexander Lobakin @ 2022-11-23 17:16 UTC (permalink / raw)
  To: nb; +Cc: Alexander Lobakin, nhorman, davem, kuba, pabeni, netdev, kernel

From: nb <nikolay.borisov@virtuozzo.com>
Date: Wed, 23 Nov 2022 18:04:25 +0200

> On 23.11.22 г. 17:33 ч., Alexander Lobakin wrote:
> > From: Nikolay Borisov <nikolay.borisov@virtuozzo.com>
> > Date: Wed, 23 Nov 2022 16:28:15 +0200
> > 
> 
> <snip>
> 
> >> @@ -1283,6 +1304,14 @@ static void net_dm_trunc_len_set(struct genl_info *info)
> >>   	net_dm_trunc_len = nla_get_u32(info->attrs[NET_DM_ATTR_TRUNC_LEN]);
> >>   }
> >>   
> >> +static void net_dm_ns_set(struct genl_info *info)
> >> +{
> >> +	if (!info->attrs[NET_DM_ATTR_NS])
> >> +		return;
> >> +
> >> +	net_dm_ns = nla_get_u32(info->attrs[NET_DM_ATTR_NS]);
> > 
> > So, if I got it correctly, it can limit the scope to only one netns.
> > Isn't that not flexible enough? What about a white- or black- list
> > of NSes to filter or filter-out?
> 
> Can do, however my current use case is to really pin-point a single 
> offending container, but yeah, you are right that a list would be more 
> flexible. I would consider doing this provided there are no blockers in 
> the code overall. Do you have any idea whether a black/white list would 
> be better? This also begs the question whether we'll support some fixed 
> amount of ns i.e an array or a list and allow an "infinite" amount of ns 
> filtering ...

I'd go with list_head to not make it limited or consume a fixed
amount of memory regardless of the actual amount of rules.

You can make it work as both white/black by having a switch
"inverse", which makes the list filtering or filtering out.

> 
> > 
> >> +}
> >> +
> >>   static void net_dm_queue_len_set(struct genl_info *info)
> >>   {
> >>   	if (!info->attrs[NET_DM_ATTR_QUEUE_LEN])
> >> @@ -1310,6 +1339,8 @@ static int net_dm_cmd_config(struct sk_buff *skb,
> >>   
> >>   	net_dm_queue_len_set(info);
> >>   
> >> +	net_dm_ns_set(info);
> >> +
> >>   	return 0;
> >>   }
> >>   
> >> @@ -1589,6 +1620,7 @@ static const struct nla_policy net_dm_nl_policy[NET_DM_ATTR_MAX + 1] = {
> >>   	[NET_DM_ATTR_ALERT_MODE] = { .type = NLA_U8 },
> >>   	[NET_DM_ATTR_TRUNC_LEN] = { .type = NLA_U32 },
> >>   	[NET_DM_ATTR_QUEUE_LEN] = { .type = NLA_U32 },
> >> +	[NET_DM_ATTR_NS]	= { .type = NLA_U32 },
> >>   	[NET_DM_ATTR_SW_DROPS]	= {. type = NLA_FLAG },
> >>   	[NET_DM_ATTR_HW_DROPS]	= {. type = NLA_FLAG },
> >>   };
> >> -- 
> >> 2.34.1
> > 
> > Thanks,
> > Olek

Thanks,
Olek

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net-next v2 1/3] drop_monitor: Implement namespace filtering/reporting for software drops
  2022-11-23 15:21     ` nb
@ 2022-11-23 18:10       ` Ido Schimmel
  2022-11-24 11:41         ` nb
  0 siblings, 1 reply; 13+ messages in thread
From: Ido Schimmel @ 2022-11-23 18:10 UTC (permalink / raw)
  To: nb; +Cc: nhorman, davem, kuba, pabeni, netdev, kernel

On Wed, Nov 23, 2022 at 05:21:23PM +0200, nb wrote:
> 
> 
> On 23.11.22 г. 17:16 ч., Ido Schimmel wrote:
> > On Wed, Nov 23, 2022 at 04:28:15PM +0200, Nikolay Borisov wrote:
> > >   static void trace_drop_common(struct sk_buff *skb, void *location)
> > >   {
> > >   	struct net_dm_alert_msg *msg;
> > > @@ -219,7 +233,11 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
> > >   	int i;
> > >   	struct sk_buff *dskb;
> > >   	struct per_cpu_dm_data *data;
> > > -	unsigned long flags;
> > > +	unsigned long flags, ns_id = 0;
> > > +
> > > +	if (skb->dev && net_dm_ns &&
> > > +	    dev_net(skb->dev)->ns.inum != net_dm_ns)
> > 
> > I don't think this is going to work, unfortunately. 'skb->dev' is in a
> > union with 'dev_scratch' so 'skb->dev' does not necessarily point to a
> > valid netdev at all times. It can explode when dev_net() tries to
> > dereference it.
> > 
> > __skb_flow_dissect() is doing something similar, but I believe there the
> > code paths were audited to make sure it is safe.
> > 
> > Did you consider achieving this functionality with a BPF program
> > attached to skb::kfree_skb tracepoint? I believe BPF programs are run
> > with page faults disabled, so it should be safe to attempt this there.
> 
> How would that be different than the trace_drop_common which is called as
> part of the trace_kfree_skb, as it's really passed as trace point probe via:

Consider this call path:

__udp_queue_rcv_skb()
    __udp_enqueue_schedule_skb()
        udp_set_dev_scratch() // skb->dev is not NULL, but not a pointer to a netdev either
	// error is returned
    kfree_skb_reason() // probe is called

dev_net(skb->dev) in the probe will try to dereference skb->dev and
crash.

On the other hand, a BPF program that is registered as another probe on
the tracepoint will access the memory via bpf_probe_read_kernel(), which
will try to safely read the memory and return an error if it can't. You
can do that today without any kernel changes.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net-next v2 1/3] drop_monitor: Implement namespace filtering/reporting for software drops
  2022-11-23 18:10       ` Ido Schimmel
@ 2022-11-24 11:41         ` nb
  2022-11-28  7:54           ` Ido Schimmel
  0 siblings, 1 reply; 13+ messages in thread
From: nb @ 2022-11-24 11:41 UTC (permalink / raw)
  To: Ido Schimmel; +Cc: nhorman, davem, kuba, pabeni, netdev, kernel



On 23.11.22 г. 20:10 ч., Ido Schimmel wrote:
> On Wed, Nov 23, 2022 at 05:21:23PM +0200, nb wrote:
>>
>>
>> On 23.11.22 г. 17:16 ч., Ido Schimmel wrote:
>>> On Wed, Nov 23, 2022 at 04:28:15PM +0200, Nikolay Borisov wrote:
>>>>    static void trace_drop_common(struct sk_buff *skb, void *location)
>>>>    {
>>>>    	struct net_dm_alert_msg *msg;
>>>> @@ -219,7 +233,11 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
>>>>    	int i;
>>>>    	struct sk_buff *dskb;
>>>>    	struct per_cpu_dm_data *data;
>>>> -	unsigned long flags;
>>>> +	unsigned long flags, ns_id = 0;
>>>> +
>>>> +	if (skb->dev && net_dm_ns &&
>>>> +	    dev_net(skb->dev)->ns.inum != net_dm_ns)
>>>
>>> I don't think this is going to work, unfortunately. 'skb->dev' is in a
>>> union with 'dev_scratch' so 'skb->dev' does not necessarily point to a
>>> valid netdev at all times. It can explode when dev_net() tries to
>>> dereference it.
>>>
>>> __skb_flow_dissect() is doing something similar, but I believe there the
>>> code paths were audited to make sure it is safe.
>>>
>>> Did you consider achieving this functionality with a BPF program
>>> attached to skb::kfree_skb tracepoint? I believe BPF programs are run
>>> with page faults disabled, so it should be safe to attempt this there.
>>
>> How would that be different than the trace_drop_common which is called as
>> part of the trace_kfree_skb, as it's really passed as trace point probe via:
> 
> Consider this call path:
> 
> __udp_queue_rcv_skb()
>      __udp_enqueue_schedule_skb()
>          udp_set_dev_scratch() // skb->dev is not NULL, but not a pointer to a netdev either
> 	// error is returned
>      kfree_skb_reason() // probe is called
> 
> dev_net(skb->dev) in the probe will try to dereference skb->dev and
> crash.

This can easily be rectified by using is_kernel() .

> 
> On the other hand, a BPF program that is registered as another probe on
> the tracepoint will access the memory via bpf_probe_read_kernel(), which
> will try to safely read the memory and return an error if it can't. You
> can do that today without any kernel changes.

I did a PoC for this and indeed it works, however I'd still like to 
pursue this code provided there is upstream interest.

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH net-next v2 1/3] drop_monitor: Implement namespace filtering/reporting for software drops
  2022-11-24 11:41         ` nb
@ 2022-11-28  7:54           ` Ido Schimmel
  0 siblings, 0 replies; 13+ messages in thread
From: Ido Schimmel @ 2022-11-28  7:54 UTC (permalink / raw)
  To: nb; +Cc: nhorman, davem, kuba, pabeni, netdev, kernel

On Thu, Nov 24, 2022 at 01:41:38PM +0200, nb wrote:
> 
> 
> On 23.11.22 г. 20:10 ч., Ido Schimmel wrote:
> > On Wed, Nov 23, 2022 at 05:21:23PM +0200, nb wrote:
> > > 
> > > 
> > > On 23.11.22 г. 17:16 ч., Ido Schimmel wrote:
> > > > On Wed, Nov 23, 2022 at 04:28:15PM +0200, Nikolay Borisov wrote:
> > > > >    static void trace_drop_common(struct sk_buff *skb, void *location)
> > > > >    {
> > > > >    	struct net_dm_alert_msg *msg;
> > > > > @@ -219,7 +233,11 @@ static void trace_drop_common(struct sk_buff *skb, void *location)
> > > > >    	int i;
> > > > >    	struct sk_buff *dskb;
> > > > >    	struct per_cpu_dm_data *data;
> > > > > -	unsigned long flags;
> > > > > +	unsigned long flags, ns_id = 0;
> > > > > +
> > > > > +	if (skb->dev && net_dm_ns &&
> > > > > +	    dev_net(skb->dev)->ns.inum != net_dm_ns)
> > > > 
> > > > I don't think this is going to work, unfortunately. 'skb->dev' is in a
> > > > union with 'dev_scratch' so 'skb->dev' does not necessarily point to a
> > > > valid netdev at all times. It can explode when dev_net() tries to
> > > > dereference it.
> > > > 
> > > > __skb_flow_dissect() is doing something similar, but I believe there the
> > > > code paths were audited to make sure it is safe.
> > > > 
> > > > Did you consider achieving this functionality with a BPF program
> > > > attached to skb::kfree_skb tracepoint? I believe BPF programs are run
> > > > with page faults disabled, so it should be safe to attempt this there.
> > > 
> > > How would that be different than the trace_drop_common which is called as
> > > part of the trace_kfree_skb, as it's really passed as trace point probe via:
> > 
> > Consider this call path:
> > 
> > __udp_queue_rcv_skb()
> >      __udp_enqueue_schedule_skb()
> >          udp_set_dev_scratch() // skb->dev is not NULL, but not a pointer to a netdev either
> > 	// error is returned
> >      kfree_skb_reason() // probe is called
> > 
> > dev_net(skb->dev) in the probe will try to dereference skb->dev and
> > crash.
> 
> This can easily be rectified by using is_kernel() .

The layout of 'struct udp_dev_scratch' is not fixed and it can be
arranged to contain values that make it seem like a valid kernel
address, but does not actually point to a 'struct net_device'.

> 
> > 
> > On the other hand, a BPF program that is registered as another probe on
> > the tracepoint will access the memory via bpf_probe_read_kernel(), which
> > will try to safely read the memory and return an error if it can't. You
> > can do that today without any kernel changes.
> 
> I did a PoC for this and indeed it works, however I'd still like to pursue
> this code provided there is upstream interest.

^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2022-11-28  7:55 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-11-23 14:28 [PATCH net-next v2 0/3] Add support for netnamespace filtering in drop monitor Nikolay Borisov
2022-11-23 14:28 ` [PATCH net-next v2 1/3] drop_monitor: Implement namespace filtering/reporting for software drops Nikolay Borisov
2022-11-23 15:16   ` Ido Schimmel
2022-11-23 15:21     ` nb
2022-11-23 18:10       ` Ido Schimmel
2022-11-24 11:41         ` nb
2022-11-28  7:54           ` Ido Schimmel
2022-11-23 15:33   ` Alexander Lobakin
2022-11-23 16:04     ` nb
2022-11-23 17:16       ` Alexander Lobakin
2022-11-23 14:28 ` [PATCH net-next v2 2/3] drop_monitor: Add namespace filtering/reporting for hardware drops Nikolay Borisov
2022-11-23 15:37   ` Alexander Lobakin
2022-11-23 14:28 ` [PATCH net-next v2 3/3] selftests: net: Add drop monitor tests for namespace filtering functionality Nikolay Borisov

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.