All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] lightnvm: pblk: set write thread affinity to particular cpu
@ 2019-03-14 14:17 Marcin Dziegielewski
  2019-03-14 14:22 ` Javier González
  0 siblings, 1 reply; 8+ messages in thread
From: Marcin Dziegielewski @ 2019-03-14 14:17 UTC (permalink / raw)
  To: mb, javier, hans.holmberg
  Cc: linux-block, marcin.dziegielewski, igor.j.konopko

In some cases, write thread migration between CPUs can cause
writes to be sent in an improper order and, in consequence, a lot
of errors from the device.

Setting the write thread's affinity to a particular CPU prevents this.

Signed-off-by: Marcin Dziegielewski <marcin.dziegielewski@intel.com>
---
 drivers/lightnvm/pblk-init.c | 30 +++++++++++++++++++++++++++++-
 drivers/lightnvm/pblk.h      |  1 +
 drivers/nvme/host/lightnvm.c |  1 +
 include/linux/lightnvm.h     |  2 ++
 4 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index 81e8ed4..bd25004 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -21,6 +21,8 @@
 
 #include "pblk.h"
 #include "pblk-trace.h"
+#include <linux/cpumask.h>
+#include <linux/numa.h>
 
 static unsigned int write_buffer_size;
 
@@ -47,6 +49,8 @@ struct pblk_global_caches {
 
 struct bio_set pblk_bio_set;
 
+cpumask_t free_cpumask;
+
 static int pblk_rw_io(struct request_queue *q, struct pblk *pblk,
 			  struct bio *bio)
 {
@@ -1098,6 +1102,9 @@ static int pblk_lines_init(struct pblk *pblk)
 
 static int pblk_writer_init(struct pblk *pblk)
 {
+	cpumask_t tmp_cpumask, cpumask;
+	int cpu;
+
 	pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t");
 	if (IS_ERR(pblk->writer_ts)) {
 		int err = PTR_ERR(pblk->writer_ts);
@@ -1108,6 +1115,22 @@ static int pblk_writer_init(struct pblk *pblk)
 		return err;
 	}
 
+	cpumask_and(&tmp_cpumask, cpumask_of_node(pblk->dev->parent->node),
+			cpu_online_mask);
+	cpumask_and(&cpumask, &tmp_cpumask, &free_cpumask);
+
+	if (!cpumask_weight(&free_cpumask)) {
+		free_cpumask = CPU_MASK_ALL;
+		cpumask_and(&cpumask, &tmp_cpumask, &free_cpumask);
+	}
+
+	cpu = cpumask_last(&cpumask);
+
+	kthread_bind(pblk->writer_ts, cpu);
+
+	cpumask_clear_cpu(cpu, &free_cpumask);
+	pblk->writer_cpu = cpu;
+
 	timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0);
 	mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));
 
@@ -1126,8 +1149,11 @@ static void pblk_writer_stop(struct pblk *pblk)
 			"Stopping not fully synced write buffer\n");
 
 	del_timer_sync(&pblk->wtimer);
-	if (pblk->writer_ts)
+	if (pblk->writer_ts) {
+		set_cpus_allowed_ptr(pblk->writer_ts, cpu_online_mask);
 		kthread_stop(pblk->writer_ts);
+		cpumask_set_cpu(pblk->writer_cpu, &free_cpumask);
+	}
 }
 
 static void pblk_free(struct pblk *pblk)
@@ -1328,6 +1354,8 @@ static int __init pblk_module_init(void)
 {
 	int ret;
 
+	free_cpumask = CPU_MASK_ALL;
+
 	ret = bioset_init(&pblk_bio_set, BIO_POOL_SIZE, 0, 0);
 	if (ret)
 		return ret;
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index 381f074..650f983 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -690,6 +690,7 @@ struct pblk {
 	atomic_t inflight_io;		/* General inflight I/O counter */
 
 	struct task_struct *writer_ts;
+	int writer_cpu;
 
 	/* Simple translation map of logical addresses to physical addresses.
 	 * The logical addresses is known by the host system, while the physical
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index 949e29e..971a19f 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -982,6 +982,7 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
 	memcpy(dev->name, disk_name, DISK_NAME_LEN);
 	dev->ops = &nvme_nvm_dev_ops;
 	dev->private_data = ns;
+	dev->node = node;
 	ns->ndev = dev;
 
 	return nvm_register(dev);
diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
index 5d865a5..312029e 100644
--- a/include/linux/lightnvm.h
+++ b/include/linux/lightnvm.h
@@ -427,6 +427,8 @@ struct nvm_dev {
 	char name[DISK_NAME_LEN];
 	void *private_data;
 
+	int node;
+
 	void *rmap;
 
 	struct mutex mlock;
-- 
1.8.3.1


^ permalink raw reply related	[flat|nested] 8+ messages in thread

* Re: [PATCH] lightnvm: pblk: set write thread affinity to particular cpu
  2019-03-14 14:17 [PATCH] lightnvm: pblk: set write thread affinity to particular cpu Marcin Dziegielewski
@ 2019-03-14 14:22 ` Javier González
  2019-03-14 14:22   ` Javier González
  0 siblings, 1 reply; 8+ messages in thread
From: Javier González @ 2019-03-14 14:22 UTC (permalink / raw)
  To: Marcin Dziegielewski
  Cc: Matias Bjørling, Hans Holmberg, linux-block, Konopko, Igor J

[-- Attachment #1: Type: text/plain, Size: 4372 bytes --]

> On 14 Mar 2019, at 07.17, Marcin Dziegielewski <marcin.dziegielewski@intel.com> wrote:
> 
> In some cases write thread migration between cpus can cause
> sending writes in improper order and in consequence a lot of
> errors from device.
> 
> Write thread affinity to particular cpu prevent before it.
> 
> Signed-off-by: Marcin Dziegielewski <marcin.dziegielewski@intel.com>
> ---
> drivers/lightnvm/pblk-init.c | 30 +++++++++++++++++++++++++++++-
> drivers/lightnvm/pblk.h      |  1 +
> drivers/nvme/host/lightnvm.c |  1 +
> include/linux/lightnvm.h     |  2 ++
> 4 files changed, 33 insertions(+), 1 deletion(-)
> 
> diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
> index 81e8ed4..bd25004 100644
> --- a/drivers/lightnvm/pblk-init.c
> +++ b/drivers/lightnvm/pblk-init.c
> @@ -21,6 +21,8 @@
> 
> #include "pblk.h"
> #include "pblk-trace.h"
> +#include <linux/cpumask.h>
> +#include <linux/numa.h>
> 
> static unsigned int write_buffer_size;
> 
> @@ -47,6 +49,8 @@ struct pblk_global_caches {
> 
> struct bio_set pblk_bio_set;
> 
> +cpumask_t free_cpumask;
> +
> static int pblk_rw_io(struct request_queue *q, struct pblk *pblk,
> 			  struct bio *bio)
> {
> @@ -1098,6 +1102,9 @@ static int pblk_lines_init(struct pblk *pblk)
> 
> static int pblk_writer_init(struct pblk *pblk)
> {
> +	cpumask_t tmp_cpumask, cpumask;
> +	int cpu;
> +
> 	pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t");
> 	if (IS_ERR(pblk->writer_ts)) {
> 		int err = PTR_ERR(pblk->writer_ts);
> @@ -1108,6 +1115,22 @@ static int pblk_writer_init(struct pblk *pblk)
> 		return err;
> 	}
> 
> +	cpumask_and(&tmp_cpumask, cpumask_of_node(pblk->dev->parent->node),
> +			cpu_online_mask);
> +	cpumask_and(&cpumask, &tmp_cpumask, &free_cpumask);
> +
> +	if (!cpumask_weight(&free_cpumask)) {
> +		free_cpumask = CPU_MASK_ALL;
> +		cpumask_and(&cpumask, &tmp_cpumask, &free_cpumask);
> +	}
> +
> +	cpu = cpumask_last(&cpumask);
> +
> +	kthread_bind(pblk->writer_ts, cpu);
> +
> +	cpumask_clear_cpu(cpu, &free_cpumask);
> +	pblk->writer_cpu = cpu;
> +
> 	timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0);
> 	mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));
> 
> @@ -1126,8 +1149,11 @@ static void pblk_writer_stop(struct pblk *pblk)
> 			"Stopping not fully synced write buffer\n");
> 
> 	del_timer_sync(&pblk->wtimer);
> -	if (pblk->writer_ts)
> +	if (pblk->writer_ts) {
> +		set_cpus_allowed_ptr(pblk->writer_ts, cpu_online_mask);
> 		kthread_stop(pblk->writer_ts);
> +		cpumask_set_cpu(pblk->writer_cpu, &free_cpumask);
> +	}
> }
> 
> static void pblk_free(struct pblk *pblk)
> @@ -1328,6 +1354,8 @@ static int __init pblk_module_init(void)
> {
> 	int ret;
> 
> +	free_cpumask = CPU_MASK_ALL;
> +
> 	ret = bioset_init(&pblk_bio_set, BIO_POOL_SIZE, 0, 0);
> 	if (ret)
> 		return ret;
> diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
> index 381f074..650f983 100644
> --- a/drivers/lightnvm/pblk.h
> +++ b/drivers/lightnvm/pblk.h
> @@ -690,6 +690,7 @@ struct pblk {
> 	atomic_t inflight_io;		/* General inflight I/O counter */
> 
> 	struct task_struct *writer_ts;
> +	int writer_cpu;
> 
> 	/* Simple translation map of logical addresses to physical addresses.
> 	 * The logical addresses is known by the host system, while the physical
> diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
> index 949e29e..971a19f 100644
> --- a/drivers/nvme/host/lightnvm.c
> +++ b/drivers/nvme/host/lightnvm.c
> @@ -982,6 +982,7 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
> 	memcpy(dev->name, disk_name, DISK_NAME_LEN);
> 	dev->ops = &nvme_nvm_dev_ops;
> 	dev->private_data = ns;
> +	dev->node = node;
> 	ns->ndev = dev;
> 
> 	return nvm_register(dev);
> diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
> index 5d865a5..312029e 100644
> --- a/include/linux/lightnvm.h
> +++ b/include/linux/lightnvm.h
> @@ -427,6 +427,8 @@ struct nvm_dev {
> 	char name[DISK_NAME_LEN];
> 	void *private_data;
> 
> +	int node;
> +
> 	void *rmap;
> 
> 	struct mutex mlock;
> --
> 1.8.3.1

We have a per-CPU semaphore that only allows to send a single I/O in
order to prevent write pointer violations. Are you seeing this error, or
is it theoretical?

Javier

[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] lightnvm: pblk: set write thread affinity to particular cpu
  2019-03-14 14:22 ` Javier González
@ 2019-03-14 14:22   ` Javier González
  2019-03-18  9:44     ` Marcin Dziegielewski
  0 siblings, 1 reply; 8+ messages in thread
From: Javier González @ 2019-03-14 14:22 UTC (permalink / raw)
  To: Marcin Dziegielewski
  Cc: Matias Bjørling, Hans Holmberg, linux-block, Konopko, Igor J

[-- Attachment #1: Type: text/plain, Size: 4635 bytes --]


> On 14 Mar 2019, at 07.22, Javier González <javier@javigon.com> wrote:
> 
>> On 14 Mar 2019, at 07.17, Marcin Dziegielewski <marcin.dziegielewski@intel.com> wrote:
>> 
>> In some cases write thread migration between cpus can cause
>> sending writes in improper order and in consequence a lot of
>> errors from device.
>> 
>> Write thread affinity to particular cpu prevent before it.
>> 
>> Signed-off-by: Marcin Dziegielewski <marcin.dziegielewski@intel.com>
>> ---
>> drivers/lightnvm/pblk-init.c | 30 +++++++++++++++++++++++++++++-
>> drivers/lightnvm/pblk.h      |  1 +
>> drivers/nvme/host/lightnvm.c |  1 +
>> include/linux/lightnvm.h     |  2 ++
>> 4 files changed, 33 insertions(+), 1 deletion(-)
>> 
>> diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
>> index 81e8ed4..bd25004 100644
>> --- a/drivers/lightnvm/pblk-init.c
>> +++ b/drivers/lightnvm/pblk-init.c
>> @@ -21,6 +21,8 @@
>> 
>> #include "pblk.h"
>> #include "pblk-trace.h"
>> +#include <linux/cpumask.h>
>> +#include <linux/numa.h>
>> 
>> static unsigned int write_buffer_size;
>> 
>> @@ -47,6 +49,8 @@ struct pblk_global_caches {
>> 
>> struct bio_set pblk_bio_set;
>> 
>> +cpumask_t free_cpumask;
>> +
>> static int pblk_rw_io(struct request_queue *q, struct pblk *pblk,
>> 			  struct bio *bio)
>> {
>> @@ -1098,6 +1102,9 @@ static int pblk_lines_init(struct pblk *pblk)
>> 
>> static int pblk_writer_init(struct pblk *pblk)
>> {
>> +	cpumask_t tmp_cpumask, cpumask;
>> +	int cpu;
>> +
>> 	pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t");
>> 	if (IS_ERR(pblk->writer_ts)) {
>> 		int err = PTR_ERR(pblk->writer_ts);
>> @@ -1108,6 +1115,22 @@ static int pblk_writer_init(struct pblk *pblk)
>> 		return err;
>> 	}
>> 
>> +	cpumask_and(&tmp_cpumask, cpumask_of_node(pblk->dev->parent->node),
>> +			cpu_online_mask);
>> +	cpumask_and(&cpumask, &tmp_cpumask, &free_cpumask);
>> +
>> +	if (!cpumask_weight(&free_cpumask)) {
>> +		free_cpumask = CPU_MASK_ALL;
>> +		cpumask_and(&cpumask, &tmp_cpumask, &free_cpumask);
>> +	}
>> +
>> +	cpu = cpumask_last(&cpumask);
>> +
>> +	kthread_bind(pblk->writer_ts, cpu);
>> +
>> +	cpumask_clear_cpu(cpu, &free_cpumask);
>> +	pblk->writer_cpu = cpu;
>> +
>> 	timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0);
>> 	mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));
>> 
>> @@ -1126,8 +1149,11 @@ static void pblk_writer_stop(struct pblk *pblk)
>> 			"Stopping not fully synced write buffer\n");
>> 
>> 	del_timer_sync(&pblk->wtimer);
>> -	if (pblk->writer_ts)
>> +	if (pblk->writer_ts) {
>> +		set_cpus_allowed_ptr(pblk->writer_ts, cpu_online_mask);
>> 		kthread_stop(pblk->writer_ts);
>> +		cpumask_set_cpu(pblk->writer_cpu, &free_cpumask);
>> +	}
>> }
>> 
>> static void pblk_free(struct pblk *pblk)
>> @@ -1328,6 +1354,8 @@ static int __init pblk_module_init(void)
>> {
>> 	int ret;
>> 
>> +	free_cpumask = CPU_MASK_ALL;
>> +
>> 	ret = bioset_init(&pblk_bio_set, BIO_POOL_SIZE, 0, 0);
>> 	if (ret)
>> 		return ret;
>> diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
>> index 381f074..650f983 100644
>> --- a/drivers/lightnvm/pblk.h
>> +++ b/drivers/lightnvm/pblk.h
>> @@ -690,6 +690,7 @@ struct pblk {
>> 	atomic_t inflight_io;		/* General inflight I/O counter */
>> 
>> 	struct task_struct *writer_ts;
>> +	int writer_cpu;
>> 
>> 	/* Simple translation map of logical addresses to physical addresses.
>> 	 * The logical addresses is known by the host system, while the physical
>> diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
>> index 949e29e..971a19f 100644
>> --- a/drivers/nvme/host/lightnvm.c
>> +++ b/drivers/nvme/host/lightnvm.c
>> @@ -982,6 +982,7 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
>> 	memcpy(dev->name, disk_name, DISK_NAME_LEN);
>> 	dev->ops = &nvme_nvm_dev_ops;
>> 	dev->private_data = ns;
>> +	dev->node = node;
>> 	ns->ndev = dev;
>> 
>> 	return nvm_register(dev);
>> diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
>> index 5d865a5..312029e 100644
>> --- a/include/linux/lightnvm.h
>> +++ b/include/linux/lightnvm.h
>> @@ -427,6 +427,8 @@ struct nvm_dev {
>> 	char name[DISK_NAME_LEN];
>> 	void *private_data;
>> 
>> +	int node;
>> +
>> 	void *rmap;
>> 
>> 	struct mutex mlock;
>> --
>> 1.8.3.1
> 
> We have a per-CPU semaphore that only allows to send a single I/O in
> order to prevent write pointer violations. Are you seeing this error, or
> is it theoretical?
> 
> Javier

I meant per PU (parallel unit)…


[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] lightnvm: pblk: set write thread affinity to particular cpu
  2019-03-14 14:22   ` Javier González
@ 2019-03-18  9:44     ` Marcin Dziegielewski
  2019-03-18 12:22       ` Hans Holmberg
  0 siblings, 1 reply; 8+ messages in thread
From: Marcin Dziegielewski @ 2019-03-18  9:44 UTC (permalink / raw)
  To: Javier González
  Cc: Matias Bjørling, Hans Holmberg, linux-block, Konopko, Igor J

On 3/14/19 3:22 PM, Javier González wrote:
> 
>> On 14 Mar 2019, at 07.22, Javier González <javier@javigon.com> wrote:
>>
>>> On 14 Mar 2019, at 07.17, Marcin Dziegielewski <marcin.dziegielewski@intel.com> wrote:
>>>
>>> In some cases write thread migration between cpus can cause
>>> sending writes in improper order and in consequence a lot of
>>> errors from device.
>>>
>>> Write thread affinity to particular cpu prevent before it.
>>>
>>> Signed-off-by: Marcin Dziegielewski <marcin.dziegielewski@intel.com>
>>> ---
>>> drivers/lightnvm/pblk-init.c | 30 +++++++++++++++++++++++++++++-
>>> drivers/lightnvm/pblk.h      |  1 +
>>> drivers/nvme/host/lightnvm.c |  1 +
>>> include/linux/lightnvm.h     |  2 ++
>>> 4 files changed, 33 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
>>> index 81e8ed4..bd25004 100644
>>> --- a/drivers/lightnvm/pblk-init.c
>>> +++ b/drivers/lightnvm/pblk-init.c
>>> @@ -21,6 +21,8 @@
>>>
>>> #include "pblk.h"
>>> #include "pblk-trace.h"
>>> +#include <linux/cpumask.h>
>>> +#include <linux/numa.h>
>>>
>>> static unsigned int write_buffer_size;
>>>
>>> @@ -47,6 +49,8 @@ struct pblk_global_caches {
>>>
>>> struct bio_set pblk_bio_set;
>>>
>>> +cpumask_t free_cpumask;
>>> +
>>> static int pblk_rw_io(struct request_queue *q, struct pblk *pblk,
>>> 			  struct bio *bio)
>>> {
>>> @@ -1098,6 +1102,9 @@ static int pblk_lines_init(struct pblk *pblk)
>>>
>>> static int pblk_writer_init(struct pblk *pblk)
>>> {
>>> +	cpumask_t tmp_cpumask, cpumask;
>>> +	int cpu;
>>> +
>>> 	pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t");
>>> 	if (IS_ERR(pblk->writer_ts)) {
>>> 		int err = PTR_ERR(pblk->writer_ts);
>>> @@ -1108,6 +1115,22 @@ static int pblk_writer_init(struct pblk *pblk)
>>> 		return err;
>>> 	}
>>>
>>> +	cpumask_and(&tmp_cpumask, cpumask_of_node(pblk->dev->parent->node),
>>> +			cpu_online_mask);
>>> +	cpumask_and(&cpumask, &tmp_cpumask, &free_cpumask);
>>> +
>>> +	if (!cpumask_weight(&free_cpumask)) {
>>> +		free_cpumask = CPU_MASK_ALL;
>>> +		cpumask_and(&cpumask, &tmp_cpumask, &free_cpumask);
>>> +	}
>>> +
>>> +	cpu = cpumask_last(&cpumask);
>>> +
>>> +	kthread_bind(pblk->writer_ts, cpu);
>>> +
>>> +	cpumask_clear_cpu(cpu, &free_cpumask);
>>> +	pblk->writer_cpu = cpu;
>>> +
>>> 	timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0);
>>> 	mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));
>>>
>>> @@ -1126,8 +1149,11 @@ static void pblk_writer_stop(struct pblk *pblk)
>>> 			"Stopping not fully synced write buffer\n");
>>>
>>> 	del_timer_sync(&pblk->wtimer);
>>> -	if (pblk->writer_ts)
>>> +	if (pblk->writer_ts) {
>>> +		set_cpus_allowed_ptr(pblk->writer_ts, cpu_online_mask);
>>> 		kthread_stop(pblk->writer_ts);
>>> +		cpumask_set_cpu(pblk->writer_cpu, &free_cpumask);
>>> +	}
>>> }
>>>
>>> static void pblk_free(struct pblk *pblk)
>>> @@ -1328,6 +1354,8 @@ static int __init pblk_module_init(void)
>>> {
>>> 	int ret;
>>>
>>> +	free_cpumask = CPU_MASK_ALL;
>>> +
>>> 	ret = bioset_init(&pblk_bio_set, BIO_POOL_SIZE, 0, 0);
>>> 	if (ret)
>>> 		return ret;
>>> diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
>>> index 381f074..650f983 100644
>>> --- a/drivers/lightnvm/pblk.h
>>> +++ b/drivers/lightnvm/pblk.h
>>> @@ -690,6 +690,7 @@ struct pblk {
>>> 	atomic_t inflight_io;		/* General inflight I/O counter */
>>>
>>> 	struct task_struct *writer_ts;
>>> +	int writer_cpu;
>>>
>>> 	/* Simple translation map of logical addresses to physical addresses.
>>> 	 * The logical addresses is known by the host system, while the physical
>>> diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
>>> index 949e29e..971a19f 100644
>>> --- a/drivers/nvme/host/lightnvm.c
>>> +++ b/drivers/nvme/host/lightnvm.c
>>> @@ -982,6 +982,7 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
>>> 	memcpy(dev->name, disk_name, DISK_NAME_LEN);
>>> 	dev->ops = &nvme_nvm_dev_ops;
>>> 	dev->private_data = ns;
>>> +	dev->node = node;
>>> 	ns->ndev = dev;
>>>
>>> 	return nvm_register(dev);
>>> diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
>>> index 5d865a5..312029e 100644
>>> --- a/include/linux/lightnvm.h
>>> +++ b/include/linux/lightnvm.h
>>> @@ -427,6 +427,8 @@ struct nvm_dev {
>>> 	char name[DISK_NAME_LEN];
>>> 	void *private_data;
>>>
>>> +	int node;
>>> +
>>> 	void *rmap;
>>>
>>> 	struct mutex mlock;
>>> --
>>> 1.8.3.1
>>
>> We have a per-CPU semaphore that only allows to send a single I/O in
>> order to prevent write pointer violations. Are you seeing this error, or
>> is it theoretical?
>>
>> Javier
> 
> I meant per PU (parallel unit)…
> 

You are right; for the current upstream implementation it's theoretical. I
saw it after some experimental changes in the lightnvm layer.

But I see one more potential benefit from this patch (correct me if I'm
wrong): after this patch we can be sure that the write thread is running
on a CPU from the right NUMA node for the device. Now I think we could
also consider binding to the whole set of CPUs from the right NUMA node
instead of a particular one.

Marcin

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] lightnvm: pblk: set write thread affinity to particular cpu
  2019-03-18  9:44     ` Marcin Dziegielewski
@ 2019-03-18 12:22       ` Hans Holmberg
  2019-03-18 18:22         ` Javier González
  0 siblings, 1 reply; 8+ messages in thread
From: Hans Holmberg @ 2019-03-18 12:22 UTC (permalink / raw)
  To: Marcin Dziegielewski
  Cc: Javier González, Matias Bjørling, Hans Holmberg,
	linux-block, Konopko, Igor J

On Mon, Mar 18, 2019 at 10:44 AM Marcin Dziegielewski
<marcin.dziegielewski@intel.com> wrote:
>
> On 3/14/19 3:22 PM, Javier González wrote:
> >
> >> On 14 Mar 2019, at 07.22, Javier González <javier@javigon.com> wrote:
> >>
> >>> On 14 Mar 2019, at 07.17, Marcin Dziegielewski <marcin.dziegielewski@intel.com> wrote:
> >>>
> >>> In some cases write thread migration between cpus can cause
> >>> sending writes in improper order and in consequence a lot of
> >>> errors from device.
> >>>
> >>> Write thread affinity to particular cpu prevent before it.
> >>>
> >>> Signed-off-by: Marcin Dziegielewski <marcin.dziegielewski@intel.com>
> >>> ---
> >>> drivers/lightnvm/pblk-init.c | 30 +++++++++++++++++++++++++++++-
> >>> drivers/lightnvm/pblk.h      |  1 +
> >>> drivers/nvme/host/lightnvm.c |  1 +
> >>> include/linux/lightnvm.h     |  2 ++
> >>> 4 files changed, 33 insertions(+), 1 deletion(-)
> >>>
> >>> diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
> >>> index 81e8ed4..bd25004 100644
> >>> --- a/drivers/lightnvm/pblk-init.c
> >>> +++ b/drivers/lightnvm/pblk-init.c
> >>> @@ -21,6 +21,8 @@
> >>>
> >>> #include "pblk.h"
> >>> #include "pblk-trace.h"
> >>> +#include <linux/cpumask.h>
> >>> +#include <linux/numa.h>
> >>>
> >>> static unsigned int write_buffer_size;
> >>>
> >>> @@ -47,6 +49,8 @@ struct pblk_global_caches {
> >>>
> >>> struct bio_set pblk_bio_set;
> >>>
> >>> +cpumask_t free_cpumask;
> >>> +
> >>> static int pblk_rw_io(struct request_queue *q, struct pblk *pblk,
> >>>                       struct bio *bio)
> >>> {
> >>> @@ -1098,6 +1102,9 @@ static int pblk_lines_init(struct pblk *pblk)
> >>>
> >>> static int pblk_writer_init(struct pblk *pblk)
> >>> {
> >>> +   cpumask_t tmp_cpumask, cpumask;
> >>> +   int cpu;
> >>> +
> >>>     pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t");
> >>>     if (IS_ERR(pblk->writer_ts)) {
> >>>             int err = PTR_ERR(pblk->writer_ts);
> >>> @@ -1108,6 +1115,22 @@ static int pblk_writer_init(struct pblk *pblk)
> >>>             return err;
> >>>     }
> >>>
> >>> +   cpumask_and(&tmp_cpumask, cpumask_of_node(pblk->dev->parent->node),
> >>> +                   cpu_online_mask);
> >>> +   cpumask_and(&cpumask, &tmp_cpumask, &free_cpumask);
> >>> +
> >>> +   if (!cpumask_weight(&free_cpumask)) {
> >>> +           free_cpumask = CPU_MASK_ALL;
> >>> +           cpumask_and(&cpumask, &tmp_cpumask, &free_cpumask);
> >>> +   }
> >>> +
> >>> +   cpu = cpumask_last(&cpumask);
> >>> +
> >>> +   kthread_bind(pblk->writer_ts, cpu);
> >>> +
> >>> +   cpumask_clear_cpu(cpu, &free_cpumask);
> >>> +   pblk->writer_cpu = cpu;
> >>> +
> >>>     timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0);
> >>>     mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));
> >>>
> >>> @@ -1126,8 +1149,11 @@ static void pblk_writer_stop(struct pblk *pblk)
> >>>                     "Stopping not fully synced write buffer\n");
> >>>
> >>>     del_timer_sync(&pblk->wtimer);
> >>> -   if (pblk->writer_ts)
> >>> +   if (pblk->writer_ts) {
> >>> +           set_cpus_allowed_ptr(pblk->writer_ts, cpu_online_mask);
> >>>             kthread_stop(pblk->writer_ts);
> >>> +           cpumask_set_cpu(pblk->writer_cpu, &free_cpumask);
> >>> +   }
> >>> }
> >>>
> >>> static void pblk_free(struct pblk *pblk)
> >>> @@ -1328,6 +1354,8 @@ static int __init pblk_module_init(void)
> >>> {
> >>>     int ret;
> >>>
> >>> +   free_cpumask = CPU_MASK_ALL;
> >>> +
> >>>     ret = bioset_init(&pblk_bio_set, BIO_POOL_SIZE, 0, 0);
> >>>     if (ret)
> >>>             return ret;
> >>> diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
> >>> index 381f074..650f983 100644
> >>> --- a/drivers/lightnvm/pblk.h
> >>> +++ b/drivers/lightnvm/pblk.h
> >>> @@ -690,6 +690,7 @@ struct pblk {
> >>>     atomic_t inflight_io;           /* General inflight I/O counter */
> >>>
> >>>     struct task_struct *writer_ts;
> >>> +   int writer_cpu;
> >>>
> >>>     /* Simple translation map of logical addresses to physical addresses.
> >>>      * The logical addresses is known by the host system, while the physical
> >>> diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
> >>> index 949e29e..971a19f 100644
> >>> --- a/drivers/nvme/host/lightnvm.c
> >>> +++ b/drivers/nvme/host/lightnvm.c
> >>> @@ -982,6 +982,7 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
> >>>     memcpy(dev->name, disk_name, DISK_NAME_LEN);
> >>>     dev->ops = &nvme_nvm_dev_ops;
> >>>     dev->private_data = ns;
> >>> +   dev->node = node;
> >>>     ns->ndev = dev;
> >>>
> >>>     return nvm_register(dev);
> >>> diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
> >>> index 5d865a5..312029e 100644
> >>> --- a/include/linux/lightnvm.h
> >>> +++ b/include/linux/lightnvm.h
> >>> @@ -427,6 +427,8 @@ struct nvm_dev {
> >>>     char name[DISK_NAME_LEN];
> >>>     void *private_data;
> >>>
> >>> +   int node;
> >>> +
> >>>     void *rmap;
> >>>
> >>>     struct mutex mlock;
> >>> --
> >>> 1.8.3.1
> >>
> >> We have a per-CPU semaphore that only allows to send a single I/O in
> >> order to prevent write pointer violations. Are you seeing this error, or
> >> is it theoretical?
> >>
> >> Javier
> >
> > I meant per PU (parallel unit)…
> >
>
> You are right, for current upstream implementation it's theoretical. I
> saw it after some experimental changes in lightnvm layer.
>
> But I see one more potential benefit from this patch (correct me if I'm
> wrong), after this patch we will be sure that write thread is working on
> cpu from right numa node for device. Now I think, we can also take under
> consideration binding to whole set of cpus from right numa node instead
> of particular one.

If this is just a potential benefit, I'd rather not do this, at least
not until we have numbers on any performance benefits and an analysis
of any trade offs.

Thanks,
Hans
>
> Marcin

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] lightnvm: pblk: set write thread affinity to particular cpu
  2019-03-18 12:22       ` Hans Holmberg
@ 2019-03-18 18:22         ` Javier González
  2019-03-19 13:33           ` Marcin Dziegielewski
  0 siblings, 1 reply; 8+ messages in thread
From: Javier González @ 2019-03-18 18:22 UTC (permalink / raw)
  To: Hans Holmberg
  Cc: Marcin Dziegielewski, Matias Bjørling, Hans Holmberg,
	linux-block, Konopko, Igor J

[-- Attachment #1: Type: text/plain, Size: 6393 bytes --]


> On 18 Mar 2019, at 13.22, Hans Holmberg <hans@owltronix.com> wrote:
> 
> On Mon, Mar 18, 2019 at 10:44 AM Marcin Dziegielewski
> <marcin.dziegielewski@intel.com> wrote:
>> On 3/14/19 3:22 PM, Javier González wrote:
>>>> On 14 Mar 2019, at 07.22, Javier González <javier@javigon.com> wrote:
>>>> 
>>>>> On 14 Mar 2019, at 07.17, Marcin Dziegielewski <marcin.dziegielewski@intel.com> wrote:
>>>>> 
>>>>> In some cases write thread migration between cpus can cause
>>>>> sending writes in improper order and in consequence a lot of
>>>>> errors from device.
>>>>> 
>>>>> Write thread affinity to particular cpu prevent before it.
>>>>> 
>>>>> Signed-off-by: Marcin Dziegielewski <marcin.dziegielewski@intel.com>
>>>>> ---
>>>>> drivers/lightnvm/pblk-init.c | 30 +++++++++++++++++++++++++++++-
>>>>> drivers/lightnvm/pblk.h      |  1 +
>>>>> drivers/nvme/host/lightnvm.c |  1 +
>>>>> include/linux/lightnvm.h     |  2 ++
>>>>> 4 files changed, 33 insertions(+), 1 deletion(-)
>>>>> 
>>>>> diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
>>>>> index 81e8ed4..bd25004 100644
>>>>> --- a/drivers/lightnvm/pblk-init.c
>>>>> +++ b/drivers/lightnvm/pblk-init.c
>>>>> @@ -21,6 +21,8 @@
>>>>> 
>>>>> #include "pblk.h"
>>>>> #include "pblk-trace.h"
>>>>> +#include <linux/cpumask.h>
>>>>> +#include <linux/numa.h>
>>>>> 
>>>>> static unsigned int write_buffer_size;
>>>>> 
>>>>> @@ -47,6 +49,8 @@ struct pblk_global_caches {
>>>>> 
>>>>> struct bio_set pblk_bio_set;
>>>>> 
>>>>> +cpumask_t free_cpumask;
>>>>> +
>>>>> static int pblk_rw_io(struct request_queue *q, struct pblk *pblk,
>>>>>                      struct bio *bio)
>>>>> {
>>>>> @@ -1098,6 +1102,9 @@ static int pblk_lines_init(struct pblk *pblk)
>>>>> 
>>>>> static int pblk_writer_init(struct pblk *pblk)
>>>>> {
>>>>> +   cpumask_t tmp_cpumask, cpumask;
>>>>> +   int cpu;
>>>>> +
>>>>>    pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t");
>>>>>    if (IS_ERR(pblk->writer_ts)) {
>>>>>            int err = PTR_ERR(pblk->writer_ts);
>>>>> @@ -1108,6 +1115,22 @@ static int pblk_writer_init(struct pblk *pblk)
>>>>>            return err;
>>>>>    }
>>>>> 
>>>>> +   cpumask_and(&tmp_cpumask, cpumask_of_node(pblk->dev->parent->node),
>>>>> +                   cpu_online_mask);
>>>>> +   cpumask_and(&cpumask, &tmp_cpumask, &free_cpumask);
>>>>> +
>>>>> +   if (!cpumask_weight(&free_cpumask)) {
>>>>> +           free_cpumask = CPU_MASK_ALL;
>>>>> +           cpumask_and(&cpumask, &tmp_cpumask, &free_cpumask);
>>>>> +   }
>>>>> +
>>>>> +   cpu = cpumask_last(&cpumask);
>>>>> +
>>>>> +   kthread_bind(pblk->writer_ts, cpu);
>>>>> +
>>>>> +   cpumask_clear_cpu(cpu, &free_cpumask);
>>>>> +   pblk->writer_cpu = cpu;
>>>>> +
>>>>>    timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0);
>>>>>    mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));
>>>>> 
>>>>> @@ -1126,8 +1149,11 @@ static void pblk_writer_stop(struct pblk *pblk)
>>>>>                    "Stopping not fully synced write buffer\n");
>>>>> 
>>>>>    del_timer_sync(&pblk->wtimer);
>>>>> -   if (pblk->writer_ts)
>>>>> +   if (pblk->writer_ts) {
>>>>> +           set_cpus_allowed_ptr(pblk->writer_ts, cpu_online_mask);
>>>>>            kthread_stop(pblk->writer_ts);
>>>>> +           cpumask_set_cpu(pblk->writer_cpu, &free_cpumask);
>>>>> +   }
>>>>> }
>>>>> 
>>>>> static void pblk_free(struct pblk *pblk)
>>>>> @@ -1328,6 +1354,8 @@ static int __init pblk_module_init(void)
>>>>> {
>>>>>    int ret;
>>>>> 
>>>>> +   free_cpumask = CPU_MASK_ALL;
>>>>> +
>>>>>    ret = bioset_init(&pblk_bio_set, BIO_POOL_SIZE, 0, 0);
>>>>>    if (ret)
>>>>>            return ret;
>>>>> diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
>>>>> index 381f074..650f983 100644
>>>>> --- a/drivers/lightnvm/pblk.h
>>>>> +++ b/drivers/lightnvm/pblk.h
>>>>> @@ -690,6 +690,7 @@ struct pblk {
>>>>>    atomic_t inflight_io;           /* General inflight I/O counter */
>>>>> 
>>>>>    struct task_struct *writer_ts;
>>>>> +   int writer_cpu;
>>>>> 
>>>>>    /* Simple translation map of logical addresses to physical addresses.
>>>>>     * The logical addresses is known by the host system, while the physical
>>>>> diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
>>>>> index 949e29e..971a19f 100644
>>>>> --- a/drivers/nvme/host/lightnvm.c
>>>>> +++ b/drivers/nvme/host/lightnvm.c
>>>>> @@ -982,6 +982,7 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
>>>>>    memcpy(dev->name, disk_name, DISK_NAME_LEN);
>>>>>    dev->ops = &nvme_nvm_dev_ops;
>>>>>    dev->private_data = ns;
>>>>> +   dev->node = node;
>>>>>    ns->ndev = dev;
>>>>> 
>>>>>    return nvm_register(dev);
>>>>> diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
>>>>> index 5d865a5..312029e 100644
>>>>> --- a/include/linux/lightnvm.h
>>>>> +++ b/include/linux/lightnvm.h
>>>>> @@ -427,6 +427,8 @@ struct nvm_dev {
>>>>>    char name[DISK_NAME_LEN];
>>>>>    void *private_data;
>>>>> 
>>>>> +   int node;
>>>>> +
>>>>>    void *rmap;
>>>>> 
>>>>>    struct mutex mlock;
>>>>> --
>>>>> 1.8.3.1
>>>> 
>>>> We have a per-CPU semaphore that only allows to send a single I/O in
>>>> order to prevent write pointer violations. Are you seeing this error, or
>>>> is it theoretical?
>>>> 
>>>> Javier
>>> 
>>> I meant per PU (parallel unit)…
>> 
>> You are right, for current upstream implementation it's theoretical. I
>> saw it after some experimental changes in lightnvm layer.
>> 
>> But I see one more potential benefit from this patch (correct me if I'm
>> wrong), after this patch we will be sure that write thread is working on
>> cpu from right numa node for device. Now I think, we can also take under
>> consideration binding to whole set of cpus from right numa node instead
>> of particular one.

I can see what you mean, but I think we are better relying on the
scheduler balancing the NUMA nodes rather than pinning the write thread
to specific CPUs. Are we doing something like this in the writeback
path?

> 
> If this is just a potential benefit, I'd rather not do this, at least
> not until we have numbers on any performance benefits and an analysis
> of any trade offs.
> 
> Thanks,
> Hans
>> Marcin

[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] lightnvm: pblk: set write thread affinity to particular cpu
  2019-03-18 18:22         ` Javier González
@ 2019-03-19 13:33           ` Marcin Dziegielewski
  2019-03-19 14:25             ` Javier González
  0 siblings, 1 reply; 8+ messages in thread
From: Marcin Dziegielewski @ 2019-03-19 13:33 UTC (permalink / raw)
  To: Javier González, Hans Holmberg
  Cc: Matias Bjørling, Hans Holmberg, linux-block, Konopko, Igor J



On 3/18/19 7:22 PM, Javier González wrote:
> 
>> On 18 Mar 2019, at 13.22, Hans Holmberg <hans@owltronix.com> wrote:
>>
>> On Mon, Mar 18, 2019 at 10:44 AM Marcin Dziegielewski
>> <marcin.dziegielewski@intel.com> wrote:
>>> On 3/14/19 3:22 PM, Javier González wrote:
>>>>> On 14 Mar 2019, at 07.22, Javier González <javier@javigon.com> wrote:
>>>>>
>>>>>> On 14 Mar 2019, at 07.17, Marcin Dziegielewski <marcin.dziegielewski@intel.com> wrote:
>>>>>>
>>>>>> In some cases write thread migration between cpus can cause
>>>>>> sending writes in improper order and in consequence a lot of
>>>>>> errors from device.
>>>>>>
>>>>>> Write thread affinity to particular cpu prevent before it.
>>>>>>
>>>>>> Signed-off-by: Marcin Dziegielewski <marcin.dziegielewski@intel.com>
>>>>>> ---
>>>>>> drivers/lightnvm/pblk-init.c | 30 +++++++++++++++++++++++++++++-
>>>>>> drivers/lightnvm/pblk.h      |  1 +
>>>>>> drivers/nvme/host/lightnvm.c |  1 +
>>>>>> include/linux/lightnvm.h     |  2 ++
>>>>>> 4 files changed, 33 insertions(+), 1 deletion(-)
>>>>>>
>>>>>> diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
>>>>>> index 81e8ed4..bd25004 100644
>>>>>> --- a/drivers/lightnvm/pblk-init.c
>>>>>> +++ b/drivers/lightnvm/pblk-init.c
>>>>>> @@ -21,6 +21,8 @@
>>>>>>
>>>>>> #include "pblk.h"
>>>>>> #include "pblk-trace.h"
>>>>>> +#include <linux/cpumask.h>
>>>>>> +#include <linux/numa.h>
>>>>>>
>>>>>> static unsigned int write_buffer_size;
>>>>>>
>>>>>> @@ -47,6 +49,8 @@ struct pblk_global_caches {
>>>>>>
>>>>>> struct bio_set pblk_bio_set;
>>>>>>
>>>>>> +cpumask_t free_cpumask;
>>>>>> +
>>>>>> static int pblk_rw_io(struct request_queue *q, struct pblk *pblk,
>>>>>>                       struct bio *bio)
>>>>>> {
>>>>>> @@ -1098,6 +1102,9 @@ static int pblk_lines_init(struct pblk *pblk)
>>>>>>
>>>>>> static int pblk_writer_init(struct pblk *pblk)
>>>>>> {
>>>>>> +   cpumask_t tmp_cpumask, cpumask;
>>>>>> +   int cpu;
>>>>>> +
>>>>>>     pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t");
>>>>>>     if (IS_ERR(pblk->writer_ts)) {
>>>>>>             int err = PTR_ERR(pblk->writer_ts);
>>>>>> @@ -1108,6 +1115,22 @@ static int pblk_writer_init(struct pblk *pblk)
>>>>>>             return err;
>>>>>>     }
>>>>>>
>>>>>> +   cpumask_and(&tmp_cpumask, cpumask_of_node(pblk->dev->parent->node),
>>>>>> +                   cpu_online_mask);
>>>>>> +   cpumask_and(&cpumask, &tmp_cpumask, &free_cpumask);
>>>>>> +
>>>>>> +   if (!cpumask_weight(&free_cpumask)) {
>>>>>> +           free_cpumask = CPU_MASK_ALL;
>>>>>> +           cpumask_and(&cpumask, &tmp_cpumask, &free_cpumask);
>>>>>> +   }
>>>>>> +
>>>>>> +   cpu = cpumask_last(&cpumask);
>>>>>> +
>>>>>> +   kthread_bind(pblk->writer_ts, cpu);
>>>>>> +
>>>>>> +   cpumask_clear_cpu(cpu, &free_cpumask);
>>>>>> +   pblk->writer_cpu = cpu;
>>>>>> +
>>>>>>     timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0);
>>>>>>     mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));
>>>>>>
>>>>>> @@ -1126,8 +1149,11 @@ static void pblk_writer_stop(struct pblk *pblk)
>>>>>>                     "Stopping not fully synced write buffer\n");
>>>>>>
>>>>>>     del_timer_sync(&pblk->wtimer);
>>>>>> -   if (pblk->writer_ts)
>>>>>> +   if (pblk->writer_ts) {
>>>>>> +           set_cpus_allowed_ptr(pblk->writer_ts, cpu_online_mask);
>>>>>>             kthread_stop(pblk->writer_ts);
>>>>>> +           cpumask_set_cpu(pblk->writer_cpu, &free_cpumask);
>>>>>> +   }
>>>>>> }
>>>>>>
>>>>>> static void pblk_free(struct pblk *pblk)
>>>>>> @@ -1328,6 +1354,8 @@ static int __init pblk_module_init(void)
>>>>>> {
>>>>>>     int ret;
>>>>>>
>>>>>> +   free_cpumask = CPU_MASK_ALL;
>>>>>> +
>>>>>>     ret = bioset_init(&pblk_bio_set, BIO_POOL_SIZE, 0, 0);
>>>>>>     if (ret)
>>>>>>             return ret;
>>>>>> diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
>>>>>> index 381f074..650f983 100644
>>>>>> --- a/drivers/lightnvm/pblk.h
>>>>>> +++ b/drivers/lightnvm/pblk.h
>>>>>> @@ -690,6 +690,7 @@ struct pblk {
>>>>>>     atomic_t inflight_io;           /* General inflight I/O counter */
>>>>>>
>>>>>>     struct task_struct *writer_ts;
>>>>>> +   int writer_cpu;
>>>>>>
>>>>>>     /* Simple translation map of logical addresses to physical addresses.
>>>>>>      * The logical addresses is known by the host system, while the physical
>>>>>> diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
>>>>>> index 949e29e..971a19f 100644
>>>>>> --- a/drivers/nvme/host/lightnvm.c
>>>>>> +++ b/drivers/nvme/host/lightnvm.c
>>>>>> @@ -982,6 +982,7 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
>>>>>>     memcpy(dev->name, disk_name, DISK_NAME_LEN);
>>>>>>     dev->ops = &nvme_nvm_dev_ops;
>>>>>>     dev->private_data = ns;
>>>>>> +   dev->node = node;
>>>>>>     ns->ndev = dev;
>>>>>>
>>>>>>     return nvm_register(dev);
>>>>>> diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
>>>>>> index 5d865a5..312029e 100644
>>>>>> --- a/include/linux/lightnvm.h
>>>>>> +++ b/include/linux/lightnvm.h
>>>>>> @@ -427,6 +427,8 @@ struct nvm_dev {
>>>>>>     char name[DISK_NAME_LEN];
>>>>>>     void *private_data;
>>>>>>
>>>>>> +   int node;
>>>>>> +
>>>>>>     void *rmap;
>>>>>>
>>>>>>     struct mutex mlock;
>>>>>> --
>>>>>> 1.8.3.1
>>>>>
>>>>> We have a per-CPU semaphore that only allows to send a single I/O in
>>>>> order to prevent write pointer violations. Are you seeing this error, or
>>>>> is it theoretical?
>>>>>
>>>>> Javier
>>>>
>>>> I meant per PU (parallel unit)…
>>>
>>> You are right, for current upstream implementation it's theoretical. I
>>> saw it after some experimental changes in lightnvm layer.
>>>
>>> But I see one more potential benefit from this patch (correct me if I'm
>>> wrong), after this patch we will be sure that write thread is working on
>>> cpu from right numa node for device. Now I think, we can also take under
>>> consideration binding to whole set of cpus from right numa node instead
>>> of particular one.
> 
> I can see what you mean, but I think we are better relying on the
> scheduler balancing the NUMA nodes rather than pinning the write thread
> to specific CPUs. Are we doing something like this in the writeback
> path?
> 
I have checked the writeback path and I didn't find anything similar, so 
please disregard this patch.

Marcin

>>
>> If this is just a potential benefit, I'd rather not do this, at least
>> not until we have numbers on any performance benefits and an analysis
>> of any trade offs.
>>
>> Thanks,
>> Hans
>>> Marcin

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] lightnvm: pblk: set write thread affinity to particular cpu
  2019-03-19 13:33           ` Marcin Dziegielewski
@ 2019-03-19 14:25             ` Javier González
  0 siblings, 0 replies; 8+ messages in thread
From: Javier González @ 2019-03-19 14:25 UTC (permalink / raw)
  To: Marcin Dziegielewski
  Cc: Hans Holmberg, Matias Bjørling, Hans Holmberg, linux-block,
	Konopko, Igor J

[-- Attachment #1: Type: text/plain, Size: 6907 bytes --]

> On 19 Mar 2019, at 14.33, Marcin Dziegielewski <marcin.dziegielewski@intel.com> wrote:
> 
> 
> 
> On 3/18/19 7:22 PM, Javier González wrote:
>>> On 18 Mar 2019, at 13.22, Hans Holmberg <hans@owltronix.com> wrote:
>>> 
>>> On Mon, Mar 18, 2019 at 10:44 AM Marcin Dziegielewski
>>> <marcin.dziegielewski@intel.com> wrote:
>>>> On 3/14/19 3:22 PM, Javier González wrote:
>>>>>> On 14 Mar 2019, at 07.22, Javier González <javier@javigon.com> wrote:
>>>>>> 
>>>>>>> On 14 Mar 2019, at 07.17, Marcin Dziegielewski <marcin.dziegielewski@intel.com> wrote:
>>>>>>> 
>>>>>>> In some cases write thread migration between cpus can cause
>>>>>>> sending writes in improper order and in consequence a lot of
>>>>>>> errors from device.
>>>>>>> 
>>>>>>> Write thread affinity to particular cpu prevent before it.
>>>>>>> 
>>>>>>> Signed-off-by: Marcin Dziegielewski <marcin.dziegielewski@intel.com>
>>>>>>> ---
>>>>>>> drivers/lightnvm/pblk-init.c | 30 +++++++++++++++++++++++++++++-
>>>>>>> drivers/lightnvm/pblk.h      |  1 +
>>>>>>> drivers/nvme/host/lightnvm.c |  1 +
>>>>>>> include/linux/lightnvm.h     |  2 ++
>>>>>>> 4 files changed, 33 insertions(+), 1 deletion(-)
>>>>>>> 
>>>>>>> diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
>>>>>>> index 81e8ed4..bd25004 100644
>>>>>>> --- a/drivers/lightnvm/pblk-init.c
>>>>>>> +++ b/drivers/lightnvm/pblk-init.c
>>>>>>> @@ -21,6 +21,8 @@
>>>>>>> 
>>>>>>> #include "pblk.h"
>>>>>>> #include "pblk-trace.h"
>>>>>>> +#include <linux/cpumask.h>
>>>>>>> +#include <linux/numa.h>
>>>>>>> 
>>>>>>> static unsigned int write_buffer_size;
>>>>>>> 
>>>>>>> @@ -47,6 +49,8 @@ struct pblk_global_caches {
>>>>>>> 
>>>>>>> struct bio_set pblk_bio_set;
>>>>>>> 
>>>>>>> +cpumask_t free_cpumask;
>>>>>>> +
>>>>>>> static int pblk_rw_io(struct request_queue *q, struct pblk *pblk,
>>>>>>>                      struct bio *bio)
>>>>>>> {
>>>>>>> @@ -1098,6 +1102,9 @@ static int pblk_lines_init(struct pblk *pblk)
>>>>>>> 
>>>>>>> static int pblk_writer_init(struct pblk *pblk)
>>>>>>> {
>>>>>>> +   cpumask_t tmp_cpumask, cpumask;
>>>>>>> +   int cpu;
>>>>>>> +
>>>>>>>    pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t");
>>>>>>>    if (IS_ERR(pblk->writer_ts)) {
>>>>>>>            int err = PTR_ERR(pblk->writer_ts);
>>>>>>> @@ -1108,6 +1115,22 @@ static int pblk_writer_init(struct pblk *pblk)
>>>>>>>            return err;
>>>>>>>    }
>>>>>>> 
>>>>>>> +   cpumask_and(&tmp_cpumask, cpumask_of_node(pblk->dev->parent->node),
>>>>>>> +                   cpu_online_mask);
>>>>>>> +   cpumask_and(&cpumask, &tmp_cpumask, &free_cpumask);
>>>>>>> +
>>>>>>> +   if (!cpumask_weight(&free_cpumask)) {
>>>>>>> +           free_cpumask = CPU_MASK_ALL;
>>>>>>> +           cpumask_and(&cpumask, &tmp_cpumask, &free_cpumask);
>>>>>>> +   }
>>>>>>> +
>>>>>>> +   cpu = cpumask_last(&cpumask);
>>>>>>> +
>>>>>>> +   kthread_bind(pblk->writer_ts, cpu);
>>>>>>> +
>>>>>>> +   cpumask_clear_cpu(cpu, &free_cpumask);
>>>>>>> +   pblk->writer_cpu = cpu;
>>>>>>> +
>>>>>>>    timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0);
>>>>>>>    mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));
>>>>>>> 
>>>>>>> @@ -1126,8 +1149,11 @@ static void pblk_writer_stop(struct pblk *pblk)
>>>>>>>                    "Stopping not fully synced write buffer\n");
>>>>>>> 
>>>>>>>    del_timer_sync(&pblk->wtimer);
>>>>>>> -   if (pblk->writer_ts)
>>>>>>> +   if (pblk->writer_ts) {
>>>>>>> +           set_cpus_allowed_ptr(pblk->writer_ts, cpu_online_mask);
>>>>>>>            kthread_stop(pblk->writer_ts);
>>>>>>> +           cpumask_set_cpu(pblk->writer_cpu, &free_cpumask);
>>>>>>> +   }
>>>>>>> }
>>>>>>> 
>>>>>>> static void pblk_free(struct pblk *pblk)
>>>>>>> @@ -1328,6 +1354,8 @@ static int __init pblk_module_init(void)
>>>>>>> {
>>>>>>>    int ret;
>>>>>>> 
>>>>>>> +   free_cpumask = CPU_MASK_ALL;
>>>>>>> +
>>>>>>>    ret = bioset_init(&pblk_bio_set, BIO_POOL_SIZE, 0, 0);
>>>>>>>    if (ret)
>>>>>>>            return ret;
>>>>>>> diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
>>>>>>> index 381f074..650f983 100644
>>>>>>> --- a/drivers/lightnvm/pblk.h
>>>>>>> +++ b/drivers/lightnvm/pblk.h
>>>>>>> @@ -690,6 +690,7 @@ struct pblk {
>>>>>>>    atomic_t inflight_io;           /* General inflight I/O counter */
>>>>>>> 
>>>>>>>    struct task_struct *writer_ts;
>>>>>>> +   int writer_cpu;
>>>>>>> 
>>>>>>>    /* Simple translation map of logical addresses to physical addresses.
>>>>>>>     * The logical addresses is known by the host system, while the physical
>>>>>>> diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
>>>>>>> index 949e29e..971a19f 100644
>>>>>>> --- a/drivers/nvme/host/lightnvm.c
>>>>>>> +++ b/drivers/nvme/host/lightnvm.c
>>>>>>> @@ -982,6 +982,7 @@ int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node)
>>>>>>>    memcpy(dev->name, disk_name, DISK_NAME_LEN);
>>>>>>>    dev->ops = &nvme_nvm_dev_ops;
>>>>>>>    dev->private_data = ns;
>>>>>>> +   dev->node = node;
>>>>>>>    ns->ndev = dev;
>>>>>>> 
>>>>>>>    return nvm_register(dev);
>>>>>>> diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h
>>>>>>> index 5d865a5..312029e 100644
>>>>>>> --- a/include/linux/lightnvm.h
>>>>>>> +++ b/include/linux/lightnvm.h
>>>>>>> @@ -427,6 +427,8 @@ struct nvm_dev {
>>>>>>>    char name[DISK_NAME_LEN];
>>>>>>>    void *private_data;
>>>>>>> 
>>>>>>> +   int node;
>>>>>>> +
>>>>>>>    void *rmap;
>>>>>>> 
>>>>>>>    struct mutex mlock;
>>>>>>> --
>>>>>>> 1.8.3.1
>>>>>> 
>>>>>> We have a per-CPU semaphore that only allows to send a single I/O in
>>>>>> order to prevent write pointer violations. Are you seeing this error, or
>>>>>> is it theoretical?
>>>>>> 
>>>>>> Javier
>>>>> 
>>>>> I meant per PU (parallel unit)…
>>>> 
>>>> You are right, for current upstream implementation it's theoretical. I
>>>> saw it after some experimental changes in lightnvm layer.
>>>> 
>>>> But I see one more potential benefit from this patch (correct me if I'm
>>>> wrong), after this patch we will be sure that write thread is working on
>>>> cpu from right numa node for device. Now I think, we can also take under
>>>> consideration binding to whole set of cpus from right numa node instead
>>>> of particular one.
>> I can see what you mean, but I think we are better relying on the
>> scheduler balancing the NUMA nodes rather than pinning the write thread
>> to specific CPUs. Are we doing something like this in the writeback
>> path?
> I have checked the writeback path and I didn't find anything similar, so please disregard this patch.
> 
> Marcin
> 

Don’t get me wrong. I think it is a good motivation, but maybe it should
be implemented at a different level.

Thanks,
Javier

[-- Attachment #2: Message signed with OpenPGP --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2019-03-19 14:25 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-03-14 14:17 [PATCH] lightnvm: pblk: set write thread affinity to particular cpu Marcin Dziegielewski
2019-03-14 14:22 ` Javier González
2019-03-14 14:22   ` Javier González
2019-03-18  9:44     ` Marcin Dziegielewski
2019-03-18 12:22       ` Hans Holmberg
2019-03-18 18:22         ` Javier González
2019-03-19 13:33           ` Marcin Dziegielewski
2019-03-19 14:25             ` Javier González

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.