linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Mingbao Sun <sunmingbao@tom.com>
To: Keith Busch <kbusch@kernel.org>, Jens Axboe <axboe@fb.com>,
	Christoph Hellwig <hch@lst.de>, Sagi Grimberg <sagi@grimberg.me>,
	Chaitanya Kulkarni <kch@nvidia.com>,
	linux-nvme@lists.infradead.org, linux-kernel@vger.kernel.org
Cc: sunmingbao@tom.com, tyler.sun@dell.com, ping.gan@dell.com,
	yanxiu.cai@dell.com, libin.zhang@dell.com, ao.sun@dell.com
Subject: [PATCH 1/2] nvmet-tcp: support specifying the congestion-control
Date: Fri,  4 Mar 2022 17:27:53 +0800	[thread overview]
Message-ID: <20220304092754.2721-2-sunmingbao@tom.com> (raw)
In-Reply-To: <20220304092754.2721-1-sunmingbao@tom.com>

From: Mingbao Sun <tyler.sun@dell.com>

Congestion control can have a noticeable impact on the
performance of TCP-based communications. This is of course also
true for NVMe_over_TCP.

Different congestion controls (e.g., cubic, dctcp) are suitable for
different scenarios. Choosing a suitable congestion control benefits
performance; conversely, an unsuitable one can severely degrade it.

Although the congestion control used by NVMe_over_TCP can be set by
writing to '/proc/sys/net/ipv4/tcp_congestion_control', doing so also
changes the congestion control of every future TCP socket that has
not been explicitly assigned one, potentially impacting the
performance of those sockets.

So it makes sense to make NVMe_over_TCP support specifying the
congestion-control. And this commit addresses the target side.

Implementation approach:
The following new configfs entry is created so the user can specify
the congestion control of each nvmet port:
'/sys/kernel/config/nvmet/ports/X/tcp_congestion'
Later, in nvmet_tcp_add_port, the specified congestion control is
applied to the listening socket of the nvmet port.

Signed-off-by: Mingbao Sun <tyler.sun@dell.com>
---
 drivers/nvme/target/configfs.c | 52 ++++++++++++++++++++++++++++++++++
 drivers/nvme/target/nvmet.h    |  1 +
 drivers/nvme/target/tcp.c      | 27 ++++++++++++++++++
 3 files changed, 80 insertions(+)

diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index 091a0ca16361..fcf01f2b8045 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -11,6 +11,7 @@
 #include <linux/ctype.h>
 #include <linux/pci.h>
 #include <linux/pci-p2pdma.h>
+#include <net/tcp.h>
 
 #include "nvmet.h"
 
@@ -222,6 +223,55 @@ static ssize_t nvmet_addr_trsvcid_store(struct config_item *item,
 
 CONFIGFS_ATTR(nvmet_, addr_trsvcid);
 
+/* Show the port's TCP congestion control name, or an empty line if unset. */
+static ssize_t nvmet_tcp_congestion_show(struct config_item *item, char *page)
+{
+	struct nvmet_port *port = to_nvmet_port(item);
+	const char *ca_name = port->tcp_congestion ? port->tcp_congestion : "";
+
+	return snprintf(page, PAGE_SIZE, "%s\n", ca_name);
+}
+
+/*
+ * Set the port's TCP congestion control from the first line of @page.  The
+ * name is validated before it replaces the stored one, so a rejected write
+ * leaves the previous setting intact.  Returns @count or a negative errno.
+ */
+static ssize_t nvmet_tcp_congestion_store(struct config_item *item,
+		const char *page, size_t count)
+{
+	struct nvmet_port *port = to_nvmet_port(item);
+	size_t len = strcspn(page, "\n");
+	char *name;
+	bool ecn_ca;
+
+	if (!len)
+		return -EINVAL;
+	if (len >= TCP_CA_NAME_MAX) {
+		pr_err("name of TCP congestion control can not exceed %d bytes.\n",
+		       TCP_CA_NAME_MAX - 1);
+		return -EINVAL;
+	}
+	if (nvmet_is_port_enabled(port, __func__))
+		return -EACCES;
+
+	name = kmemdup_nul(page, len, GFP_KERNEL);
+	if (!name)
+		return -ENOMEM;
+
+	if (tcp_ca_get_key_by_name(NULL, name, &ecn_ca) == TCP_CA_UNSPEC) {
+		pr_err("congestion control %s not found.\n", name);
+		kfree(name);
+		return -EINVAL;
+	}
+
+	kfree(port->tcp_congestion);
+	port->tcp_congestion = name;
+	return count;
+}
+
+CONFIGFS_ATTR(nvmet_, tcp_congestion);
+
 static ssize_t nvmet_param_inline_data_size_show(struct config_item *item,
 		char *page)
 {
@@ -1597,6 +1647,7 @@ static void nvmet_port_release(struct config_item *item)
 	list_del(&port->global_entry);
 
 	kfree(port->ana_state);
+	kfree(port->tcp_congestion);
 	kfree(port);
 }
 
@@ -1605,6 +1656,7 @@ static struct configfs_attribute *nvmet_port_attrs[] = {
 	&nvmet_attr_addr_treq,
 	&nvmet_attr_addr_traddr,
 	&nvmet_attr_addr_trsvcid,
+	&nvmet_attr_tcp_congestion,
 	&nvmet_attr_addr_trtype,
 	&nvmet_attr_param_inline_data_size,
 #ifdef CONFIG_BLK_DEV_INTEGRITY
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 69637bf8f8e1..76a57c4c3456 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -145,6 +145,7 @@ struct nvmet_port {
 	struct config_group		ana_groups_group;
 	struct nvmet_ana_group		ana_default_group;
 	enum nvme_ana_state		*ana_state;
+	const char			*tcp_congestion;
 	void				*priv;
 	bool				enabled;
 	int				inline_data_size;
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 83ca577f72be..3b72e782c901 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -1657,8 +1657,10 @@ static void nvmet_tcp_accept_work(struct work_struct *w)
 	struct nvmet_tcp_port *port =
 		container_of(w, struct nvmet_tcp_port, accept_work);
 	struct socket *newsock;
+	struct inet_connection_sock *icsk, *icsk_new;
 	int ret;
 
+	icsk = inet_csk(port->sock->sk);
 	while (true) {
 		ret = kernel_accept(port->sock, &newsock, O_NONBLOCK);
 		if (ret < 0) {
@@ -1666,6 +1668,16 @@ static void nvmet_tcp_accept_work(struct work_struct *w)
 				pr_warn("failed to accept err=%d\n", ret);
 			return;
 		}
+
+		if (port->nport->tcp_congestion) {
+			icsk_new = inet_csk(newsock->sk);
+			if (icsk_new->icsk_ca_ops != icsk->icsk_ca_ops) {
+				pr_warn("congestion abnormal: expected %s, actual %s.\n",
+					icsk->icsk_ca_ops->name,
+					icsk_new->icsk_ca_ops->name);
+			}
+		}
+
 		ret = nvmet_tcp_alloc_queue(port, newsock);
 		if (ret) {
 			pr_err("failed to allocate queue\n");
@@ -1693,6 +1705,8 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport)
 {
 	struct nvmet_tcp_port *port;
 	__kernel_sa_family_t af;
+	char ca_name[TCP_CA_NAME_MAX];
+	sockptr_t optval;
 	int ret;
 
 	port = kzalloc(sizeof(*port), GFP_KERNEL);
@@ -1741,6 +1755,19 @@ static int nvmet_tcp_add_port(struct nvmet_port *nport)
 	if (so_priority > 0)
 		sock_set_priority(port->sock->sk, so_priority);
 
+	if (nport->tcp_congestion) {
+		strncpy(ca_name, nport->tcp_congestion, TCP_CA_NAME_MAX-1);
+		optval = KERNEL_SOCKPTR(ca_name);
+		ret = sock_common_setsockopt(port->sock, IPPROTO_TCP,
+					     TCP_CONGESTION, optval,
+					     strlen(ca_name));
+		if (ret) {
+			pr_err("failed to set port socket's congestion to %s: %d\n",
+			       ca_name, ret);
+			goto err_sock;
+		}
+	}
+
 	ret = kernel_bind(port->sock, (struct sockaddr *)&port->addr,
 			sizeof(port->addr));
 	if (ret) {
-- 
2.26.2


  reply	other threads:[~2022-03-04  9:41 UTC|newest]

Thread overview: 8+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-03-04  9:27 [PATCH 0/2] NVMe_over_TCP: support specifying the congestion-control Mingbao Sun
2022-03-04  9:27 ` Mingbao Sun [this message]
2022-03-04  9:27 ` [PATCH 2/2] nvme-tcp: " Mingbao Sun
2022-03-04 16:20   ` Christoph Hellwig
2022-03-05  7:09     ` Mingbao Sun
2022-03-08  7:12       ` Christoph Hellwig
2022-03-08  7:57         ` Mingbao Sun
2022-03-08 13:03 ` [PATCH 0/2] NVMe_over_TCP: " Mingbao Sun

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220304092754.2721-2-sunmingbao@tom.com \
    --to=sunmingbao@tom.com \
    --cc=ao.sun@dell.com \
    --cc=axboe@fb.com \
    --cc=hch@lst.de \
    --cc=kbusch@kernel.org \
    --cc=kch@nvidia.com \
    --cc=libin.zhang@dell.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-nvme@lists.infradead.org \
    --cc=ping.gan@dell.com \
    --cc=sagi@grimberg.me \
    --cc=tyler.sun@dell.com \
    --cc=yanxiu.cai@dell.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).