[PATCH] openvswitch: perform refragmentation for packets which pass through conntrack

* [PATCH] openvswitch: perform refragmentation for packets which pass through conntrack
@ 2021-03-19 20:43 Aaron Conole
  2021-03-19 22:13 ` Aaron Conole
                   ` (2 more replies)
  0 siblings, 3 replies; 10+ messages in thread
From: Aaron Conole @ 2021-03-19 20:43 UTC (permalink / raw)
  To: netdev; +Cc: dev, Pravin B Shelar, Joe Stringer, Eelco Chaudron, Dan Williams

When a user instructs a flow pipeline to perform connection tracking,
there is an implicit L3 operation that occurs - namely the IP fragments
are reassembled and then processed as a single unit.  After this, new
fragments are generated and then transmitted, with the hint that they
should be fragmented along the max rx unit boundary.  In general, this
behavior works well to forward packets along when the MTUs are congruent
across the datapath.

However, if using a protocol such as UDP on a network with mismatching
MTUs, it is possible that the refragmentation will still produce an
invalid fragment, and that fragmented packet will not be delivered.
Such a case shouldn't happen because the user explicitly requested a
layer 3+4 function (conntrack), and that function generates new fragments,
so we should perform the needed actions in that case (namely, refragment
IPv4 along a correct boundary, or send a packet too big in the IPv6 case).

Additionally, introduce a test suite for openvswitch with a test case
that ensures this MTU behavior, with the expectation that new tests are
added when needed.

Fixes: 7f8a436eaa2c ("openvswitch: Add conntrack action")
Signed-off-by: Aaron Conole <aconole@redhat.com>
---
NOTE: checkpatch reports a whitespace error with the openvswitch.sh
      script - this is due to using tab as the IFS value.  This part
      of the script was copied from
      tools/testing/selftests/net/pmtu.sh so I think should be
      permissible.

 net/openvswitch/actions.c                  |   2 +-
 tools/testing/selftests/net/.gitignore     |   1 +
 tools/testing/selftests/net/Makefile       |   1 +
 tools/testing/selftests/net/openvswitch.sh | 394 +++++++++++++++++++++
 4 files changed, 397 insertions(+), 1 deletion(-)
 create mode 100755 tools/testing/selftests/net/openvswitch.sh

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 92a0b67b2728..d858ea580e43 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -890,7 +890,7 @@ static void do_output(struct datapath *dp, struct sk_buff *skb, int out_port,
 		if (likely(!mru ||
 		           (skb->len <= mru + vport->dev->hard_header_len))) {
 			ovs_vport_send(vport, skb, ovs_key_mac_proto(key));
-		} else if (mru <= vport->dev->mtu) {
+		} else if (mru) {
 			struct net *net = read_pnet(&dp->net);
 
 			ovs_fragment(net, vport, skb, mru, key);
diff --git a/tools/testing/selftests/net/.gitignore b/tools/testing/selftests/net/.gitignore
index 61ae899cfc17..d4d7487833be 100644
--- a/tools/testing/selftests/net/.gitignore
+++ b/tools/testing/selftests/net/.gitignore
@@ -30,3 +30,4 @@ hwtstamp_config
 rxtimestamp
 timestamping
 txtimestamp
+test_mismatched_mtu_with_conntrack
diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile
index 25f198bec0b2..dc9b556f86fd 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -23,6 +23,7 @@ TEST_PROGS += drop_monitor_tests.sh
 TEST_PROGS += vrf_route_leaking.sh
 TEST_PROGS += bareudp.sh
 TEST_PROGS += unicast_extensions.sh
+TEST_PROGS += openvswitch.sh
 TEST_PROGS_EXTENDED := in_netns.sh
 TEST_GEN_FILES =  socket nettest
 TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any
diff --git a/tools/testing/selftests/net/openvswitch.sh b/tools/testing/selftests/net/openvswitch.sh
new file mode 100755
index 000000000000..7b6341688cc3
--- /dev/null
+++ b/tools/testing/selftests/net/openvswitch.sh
@@ -0,0 +1,394 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+#
+# OVS kernel module self tests
+#
+# Tests currently implemented:
+#
+# - mismatched_mtu_with_conntrack
+#	Set up two namespaces (client and server) which each have devices specifying
+#	incongruent MTUs (1450 vs 1500 in the test).  Transmit a UDP packet of 1901 bytes
+#	from client to server, and back.  Ensure that the ct() action
+#	uses the mru as a hint, but not a forced check.
+
+
+# Kselftest framework requirement - SKIP code is 4.
+ksft_skip=4
+
+PAUSE_ON_FAIL=no
+VERBOSE=0
+TRACING=0
+
+tests="
+	mismatched_mtu_with_conntrack		ipv4: IP Fragmentation with conntrack"
+
+
+usage() {
+	echo
+	echo "$0 [OPTIONS] [TEST]..."
+	echo "If no TEST argument is given, all tests will be run."
+	echo
+	echo "Options"
+	echo "  -t: capture traffic via tcpdump"
+	echo "  -v: verbose"
+	echo "  -p: pause on failure"
+	echo
+	echo "Available tests${tests}"
+	exit 1
+}
+
+on_exit() {
+	echo "$1" > ${ovs_dir}/cleanup.tmp
+	cat ${ovs_dir}/cleanup >> ${ovs_dir}/cleanup.tmp
+	mv ${ovs_dir}/cleanup.tmp ${ovs_dir}/cleanup
+}
+
+ovs_setenv() {
+	sandbox=$1
+
+	ovs_dir=$ovs_base${1:+/$1}; export ovs_dir
+
+	test -e ${ovs_dir}/cleanup || : > ${ovs_dir}/cleanup
+
+	OVS_RUNDIR=$ovs_dir; export OVS_RUNDIR
+	OVS_LOGDIR=$ovs_dir; export OVS_LOGDIR
+	OVS_DBDIR=$ovs_dir; export OVS_DBDIR
+	OVS_SYSCONFDIR=$ovs_dir; export OVS_SYSCONFDIR
+	OVS_PKGDATADIR=$ovs_dir; export OVS_PKGDATADIR
+}
+
+ovs_exit_sig() {
+	. "$ovs_dir/cleanup"
+	ovs-dpctl del-dp ovs-system
+}
+
+ovs_cleanup() {
+	ovs_exit_sig
+	[ $VERBOSE = 0 ] || echo "Error detected.  See $ovs_dir for more details."
+}
+
+ovs_normal_exit() {
+	ovs_exit_sig
+	rm -rf ${ovs_dir}
+}
+
+info() {
+    [ $VERBOSE = 0 ] || echo $*
+}
+
+kill_ovs_vswitchd () {
+	# Use provided PID or save the current PID if available.
+	TMPPID=$1
+	if test -z "$TMPPID"; then
+		TMPPID=$(cat $OVS_RUNDIR/ovs-vswitchd.pid 2>/dev/null)
+	fi
+
+	# Tell the daemon to terminate gracefully
+	ovs-appctl -t ovs-vswitchd exit --cleanup 2>/dev/null
+
+	# Nothing else to be done if there is no PID
+	test -z "$TMPPID" && return
+
+	for i in 1 2 3 4 5 6 7 8 9; do
+		# Check if the daemon is alive.
+		kill -0 $TMPPID 2>/dev/null || return
+
+		# Fallback to whole number since POSIX doesn't require
+		# fractional times to work.
+		sleep 0.1 || sleep 1
+	done
+
+	# Make sure it is terminated.
+	kill $TMPPID
+}
+
+start_daemon () {
+	info "exec: $@ -vconsole:off --detach --no-chdir --pidfile --log-file"
+	"$@" -vconsole:off --detach --no-chdir --pidfile --log-file >/dev/null
+	pidfile="$OVS_RUNDIR"/$1.pid
+
+	info "setting kill for $@..."
+	on_exit "test -e \"$pidfile\" && kill \`cat \"$pidfile\"\`"
+}
+
+if test "X$vswitchd_schema" = "X"; then
+	vswitchd_schema="/usr/share/openvswitch"
+fi
+
+ovs_sbx() {
+	if test "X$2" != X; then
+		(ovs_setenv $1; shift; "$@" >> ${ovs_dir}/debug.log)
+	else
+		ovs_setenv $1
+	fi
+}
+
+seq () {
+	if test $# = 1; then
+		set 1 $1
+	fi
+	while test $1 -le $2; do
+		echo $1
+		set `expr $1 + ${3-1}` $2 $3
+	done
+}
+
+ovs_wait () {
+	info "$1: waiting $2..."
+
+	# First try the condition without waiting.
+	if ovs_wait_cond; then info "$1: wait succeeded immediately"; return 0; fi
+
+	# Try a quick sleep, so that the test completes very quickly
+	# in the normal case.  POSIX doesn't require fractional times to
+	# work, so this might not work.
+	sleep 0.1
+	if ovs_wait_cond; then info "$1: wait succeeded quickly"; return 0; fi
+
+	# Then wait up to OVS_CTL_TIMEOUT seconds.
+	local d
+	for d in `seq 1 "$OVS_CTL_TIMEOUT"`; do
+		sleep 1
+		if ovs_wait_cond; then info "$1: wait succeeded after $d seconds"; return 0; fi
+	done
+
+	info "$1: wait failed after $d seconds"
+	ovs_wait_failed
+}
+
+sbxs=
+sbx_add () {
+	info "adding sandbox '$1'"
+
+	sbxs="$sbxs $1"
+
+	NO_BIN=0
+	which ovsdb-tool >/dev/null 2>&1 || NO_BIN=1
+	which ovsdb-server >/dev/null 2>&1 || NO_BIN=1
+	which ovs-vsctl >/dev/null 2>&1 || NO_BIN=1
+	which ovs-vswitchd >/dev/null 2>&1 || NO_BIN=1
+
+	if [ $NO_BIN = 1 ]; then
+		info "Missing required binaries..."
+		return 4
+	fi
+	# Create sandbox.
+	local d="$ovs_base"/$1
+	if [ -e $d ]; then
+		info "removing $d"
+		rm -rf "$d"
+	fi
+	mkdir "$d" || return 1
+	ovs_setenv $1
+
+	# Create database and start ovsdb-server.
+        info "$1: create db and start db-server"
+	: > "$d"/.conf.db.~lock~
+	ovs_sbx $1 ovsdb-tool create "$d"/conf.db "$vswitchd_schema"/vswitch.ovsschema || return 1
+	ovs_sbx $1 start_daemon ovsdb-server --detach --remote=punix:"$d"/db.sock || return 1
+
+	# Initialize database.
+	ovs_sbx $1 ovs-vsctl --no-wait -- init || return 1
+
+	# Start ovs-vswitchd
+        info "$1: start vswitchd"
+	ovs_sbx $1 start_daemon ovs-vswitchd -vvconn -vofproto_dpif -vunixctl
+
+	ovs_wait_cond () {
+		if ip link show ovs-netdev >/dev/null 2>&1; then return 1; else return 0; fi
+	}
+	ovs_wait_failed () {
+		:
+	}
+
+	ovs_wait "sandbox_add" "while ip link show ovs-netdev" || return 1
+}
+
+ovs_base=`pwd`
+
+# mismatched_mtu_with_conntrack test
+#  - client has 1450 byte MTU
+#  - server has 1500 byte MTU
+#  - use UDP to send 1901 bytes each direction for mismatched
+#    fragmentation.
+test_mismatched_mtu_with_conntrack() {
+
+	sbx_add "test_mismatched_mtu_with_conntrack" || return $?
+
+	info "create namespaces"
+	for ns in client server; do
+		ip netns add $ns || return 1
+		on_exit "ip netns del $ns"
+	done
+
+	# setup the base bridge
+	ovs_sbx "test_mismatched_mtu_with_conntrack" ovs-vsctl add-br br0 || return 1
+
+	# setup the client
+	info "setup client namespace"
+	ip link add c0 type veth peer name c1 || return 1
+	on_exit "ip link del c0 >/dev/null 2>&1"
+
+	ip link set c0 mtu 1450
+	ip link set c0 up
+
+	ip link set c1 netns client || return 1
+	ip netns exec client ip addr add 172.31.110.2/24 dev c1
+	ip netns exec client ip link set c1 mtu 1450
+	ip netns exec client ip link set c1 up
+	ovs_sbx "test_mismatched_mtu_with_conntrack" ovs-vsctl add-port br0 c0 || return 1
+
+	# setup the server
+	info "setup server namespace"
+	ip link add s0 type veth peer name s1 || return 1
+	on_exit "ip link del s0 >/dev/null 2>&1; ip netns exec server ip link del s0 >/dev/null 2>&1"
+	ip link set s0 up
+
+	ip link set s1 netns server || return 1
+	ip netns exec server ip addr add 172.31.110.1/24 dev s1 || return 1
+	ip netns exec server ip link set s1 up
+	ovs_sbx "test_mismatched_mtu_with_conntrack" ovs-vsctl add-port br0 s0 || return 1
+
+	info "setup flows"
+	ovs_sbx "test_mismatched_mtu_with_conntrack" ovs-ofctl del-flows br0
+
+	cat >${ovs_dir}/flows.txt <<EOF
+table=0,priority=100,arp action=normal
+table=0,priority=100,ip,udp action=ct(table=1)
+table=0,priority=10 action=drop
+
+table=1,priority=100,ct_state=+new+trk,in_port=c0,ip action=ct(commit),s0
+table=1,priority=100,ct_state=+est+trk,in_port=s0,ip action=c0
+
+EOF
+	ovs_sbx "test_mismatched_mtu_with_conntrack" ovs-ofctl --bundle add-flows br0 ${ovs_dir}/flows.txt || return 1
+	ovs_sbx "test_mismatched_mtu_with_conntrack" ovs-ofctl dump-flows br0
+
+	ovs_sbx "test_mismatched_mtu_with_conntrack" ovs-vsctl show
+
+	# setup echo
+	mknod -m 777 ${ovs_dir}/fifo p || return 1
+	# on_exit "rm ${ovs_dir}/fifo"
+
+	# test a udp connection
+	info "send udp data"
+	ip netns exec server sh -c 'cat ${ovs_dir}/fifo | nc -l -vv -u 8888 >${ovs_dir}/fifo 2>${ovs_dir}/s1-nc.log & echo $! > ${ovs_dir}/server.pid'
+	on_exit "test -e \"${ovs_dir}/server.pid\" && kill \`cat \"${ovs_dir}/server.pid\"\`"
+	if [ $TRACING = 1 ]; then
+		ip netns exec server sh -c "tcpdump -i s1 -l -n -U -xx >> ${ovs_dir}/s1-pkts.cap &"
+		ip netns exec client sh -c "tcpdump -i c1 -l -n -U -xx >> ${ovs_dir}/c1-pkts.cap &"
+	fi
+
+	ip netns exec client sh -c "python -c \"import time; print('a' * 1900); time.sleep(2)\" | nc -v -u 172.31.110.1 8888 2>${ovs_dir}/c1-nc.log" || return 1
+
+	ovs_sbx "test_mismatched_mtu_with_conntrack" ovs-appctl dpctl/dump-flows
+	ovs_sbx "test_mismatched_mtu_with_conntrack" ovs-ofctl dump-flows br0
+
+	info "check udp data was tx and rx"
+	grep "1901 bytes received" ${ovs_dir}/c1-nc.log || return 1
+	ovs_normal_exit
+}
+
+run_test() {
+	(
+	tname="$1"
+	tdesc="$2"
+
+	if ! lsmod | grep openvswitch >/dev/null 2>&1; then
+		printf "TEST: %-60s  [SKIP]\n" "${tdesc}"
+		return $ksft_skip
+	fi
+
+	unset IFS
+
+	eval test_${tname}
+	ret=$?
+
+	if [ $ret -eq 0 ]; then
+		printf "TEST: %-60s  [ OK ]\n" "${tdesc}"
+		ovs_normal_exit
+	elif [ $ret -eq 1 ]; then
+		printf "TEST: %-60s  [FAIL]\n" "${tdesc}"
+		if [ "${PAUSE_ON_FAIL}" = "yes" ]; then
+			echo
+			echo "Pausing. Hit enter to continue"
+			read a
+		fi
+		ovs_exit_sig
+		exit 1
+	elif [ $ret -eq $ksft_skip ]; then
+		printf "TEST: %-60s  [SKIP]\n" "${tdesc}"
+	elif [ $ret -eq 2 ]; then
+		rm -rf test_${tname}
+		run_test "$1" "$2"
+	fi
+
+	return $ret
+	)
+	ret=$?
+	case $ret in
+		0)
+			all_skipped=false
+			[ $exitcode=$ksft_skip ] && exitcode=0
+		;;
+		$ksft_skip)
+			[ $all_skipped = true ] && exitcode=$ksft_skip
+		;;
+		*)
+			all_skipped=false
+			exitcode=1
+		;;
+	esac
+
+	return $ret
+}
+
+
+exitcode=0
+desc=0
+all_skipped=true
+
+while getopts :pvt o
+do
+	case $o in
+	p) PAUSE_ON_FAIL=yes;;
+	v) VERBOSE=1;;
+	t) if which tcpdump > /dev/null 2>&1; then
+		TRACING=1
+	   else
+		echo "=== tcpdump not available, tracing disabled"
+	   fi
+	   ;;
+	*) usage;;
+	esac
+done
+shift $(($OPTIND-1))
+
+IFS="	
+"
+
+for arg do
+	# Check first that all requested tests are available before running any
+	command -v > /dev/null "test_${arg}" || { echo "=== Test ${arg} not found"; usage; }
+done
+
+name=""
+desc=""
+for t in ${tests}; do
+	[ "${name}" = "" ]	&& name="${t}"	&& continue
+	[ "${desc}" = "" ]	&& desc="${t}"
+
+	run_this=1
+	for arg do
+		[ "${arg}" != "${arg#--*}" ] && continue
+		[ "${arg}" = "${name}" ] && run_this=1 && break
+		run_this=0
+	done
+	if [ $run_this -eq 1 ]; then
+		run_test "${name}" "${desc}"
+	fi
+	name=""
+	desc=""
+done
+
+exit ${exitcode}
-- 
2.25.4


^ permalink raw reply related	[flat|nested] 10+ messages in thread