From: Leon Romanovsky <leon@kernel.org>
To: Jack Wang <jinpuwang@gmail.com>
Cc: linux-block@vger.kernel.org, linux-rdma@vger.kernel.org,
	axboe@kernel.dk, hch@infradead.org, sagi@grimberg.me,
	bvanassche@acm.org, dledford@redhat.com, jgg@ziepe.ca,
	danil.kipnis@cloud.ionos.com, jinpu.wang@cloud.ionos.com,
	rpenyaev@suse.de
Subject: Re: [PATCH v7 17/25] block/rnbd: client: main functionality
Date: Mon, 20 Jan 2020 15:48:15 +0200	[thread overview]
Message-ID: <20200120134815.GH51881@unreal> (raw)
In-Reply-To: <20200116125915.14815-18-jinpuwang@gmail.com>

On Thu, Jan 16, 2020 at 01:59:07PM +0100, Jack Wang wrote:
> From: Jack Wang <jinpu.wang@cloud.ionos.com>
>
> This is the main functionality of the rnbd-client module, which
> provides an interface to map a remote device as a local block device
> /dev/rnbd<N> and feeds RTRS with IO requests.
>
> Signed-off-by: Danil Kipnis <danil.kipnis@cloud.ionos.com>
> Signed-off-by: Jack Wang <jinpu.wang@cloud.ionos.com>
> ---
>  drivers/block/rnbd/rnbd-clt.c | 1730 +++++++++++++++++++++++++++++++++
>  1 file changed, 1730 insertions(+)
>  create mode 100644 drivers/block/rnbd/rnbd-clt.c
>
> diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c
> new file mode 100644
> index 000000000000..7d8cb38d3969
> --- /dev/null
> +++ b/drivers/block/rnbd/rnbd-clt.c
> @@ -0,0 +1,1730 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * RDMA Network Block Driver
> + *
> + * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved.
> + *
> + * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
> + *
> + * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved.
> + */
> +
> +#undef pr_fmt
> +#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
> +
> +#include <linux/module.h>
> +#include <linux/blkdev.h>
> +#include <linux/hdreg.h>
> +#include <linux/scatterlist.h>
> +#include <linux/idr.h>
> +
> +#include "rnbd-clt.h"
> +
> +MODULE_DESCRIPTION("RDMA Network Block Device Client");
> +MODULE_LICENSE("GPL");
> +
> +static int rnbd_client_major;
> +static DEFINE_IDA(index_ida);
> +static DEFINE_MUTEX(ida_lock);
> +static DEFINE_MUTEX(sess_lock);
> +static LIST_HEAD(sess_list);
> +
> +/*
> + * Maximum number of partitions an instance can have.
> + * 6 bits = 64 minors = 63 partitions (one minor is used for the device itself)
> + */
> +#define RNBD_PART_BITS		6
> +
> +static inline bool rnbd_clt_get_sess(struct rnbd_clt_session *sess)
> +{
> +	return refcount_inc_not_zero(&sess->refcount);
> +}
> +
> +static void free_sess(struct rnbd_clt_session *sess);
> +
> +static void rnbd_clt_put_sess(struct rnbd_clt_session *sess)
> +{
> +	might_sleep();
> +
> +	if (refcount_dec_and_test(&sess->refcount))
> +		free_sess(sess);
> +}

I see that this code is for drivers/block and maybe that is the way to do
it there, but in RDMA we don't like abstractions of general and well-known
kernel APIs. This looks like an open-coded kref to me.
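
An untested sketch of what I mean, assuming the refcount_t in struct
rnbd_clt_session is replaced with a struct kref and free_sess() takes the
kref release signature:

	#include <linux/kref.h>

	static void free_sess(struct kref *kref)
	{
		struct rnbd_clt_session *sess =
			container_of(kref, struct rnbd_clt_session, refcount);

		/* same teardown as the current free_sess() body */
	}

	static bool rnbd_clt_get_sess(struct rnbd_clt_session *sess)
	{
		return kref_get_unless_zero(&sess->refcount);
	}

	static void rnbd_clt_put_sess(struct rnbd_clt_session *sess)
	{
		might_sleep();
		kref_put(&sess->refcount, free_sess);
	}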

> +
> +static inline bool rnbd_clt_dev_is_mapped(struct rnbd_clt_dev *dev)
> +{
> +	return dev->dev_state == DEV_STATE_MAPPED;
> +}
> +
> +static void rnbd_clt_put_dev(struct rnbd_clt_dev *dev)
> +{
> +	might_sleep();
> +
> +	if (refcount_dec_and_test(&dev->refcount)) {
> +		mutex_lock(&ida_lock);
> +		ida_simple_remove(&index_ida, dev->clt_device_id);
> +		mutex_unlock(&ida_lock);
> +		kfree(dev->hw_queues);
> +		rnbd_clt_put_sess(dev->sess);
> +		kfree(dev);
> +	}
> +}
> +
> +static inline bool rnbd_clt_get_dev(struct rnbd_clt_dev *dev)
> +{
> +	return refcount_inc_not_zero(&dev->refcount);
> +}
> +
> +static int rnbd_clt_set_dev_attr(struct rnbd_clt_dev *dev,
> +				 const struct rnbd_msg_open_rsp *rsp)
> +{
> +	struct rnbd_clt_session *sess = dev->sess;
> +
> +	if (unlikely(!rsp->logical_block_size))
> +		return -EINVAL;

unlikely() again.
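
This runs once per open response, nowhere near a hot path, so the
annotation buys nothing and only hurts readability; the plain form is
enough:

	if (!rsp->logical_block_size)
		return -EINVAL;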

> +
> +	dev->device_id		    = le32_to_cpu(rsp->device_id);
> +	dev->nsectors		    = le64_to_cpu(rsp->nsectors);
> +	dev->logical_block_size	    = le16_to_cpu(rsp->logical_block_size);
> +	dev->physical_block_size    = le16_to_cpu(rsp->physical_block_size);
> +	dev->max_write_same_sectors = le32_to_cpu(rsp->max_write_same_sectors);
> +	dev->max_discard_sectors    = le32_to_cpu(rsp->max_discard_sectors);
> +	dev->discard_granularity    = le32_to_cpu(rsp->discard_granularity);
> +	dev->discard_alignment	    = le32_to_cpu(rsp->discard_alignment);
> +	dev->secure_discard	    = le16_to_cpu(rsp->secure_discard);
> +	dev->rotational		    = rsp->rotational;
> +
> +	dev->max_hw_sectors = sess->max_io_size / SECTOR_SIZE;
> +	dev->max_segments = BMAX_SEGMENTS;
> +
> +	dev->max_hw_sectors = min_t(u32, dev->max_hw_sectors,
> +				    le32_to_cpu(rsp->max_hw_sectors));
> +	dev->max_segments = min_t(u16, dev->max_segments,
> +				  le16_to_cpu(rsp->max_segments));
> +
> +	return 0;
> +}
> +
> +static int rnbd_clt_change_capacity(struct rnbd_clt_dev *dev,
> +				    size_t new_nsectors)
> +{
> +	int err = 0;
> +
> +	rnbd_clt_info(dev, "Device size changed from %zu to %zu sectors\n",
> +		       dev->nsectors, new_nsectors);
> +	dev->nsectors = new_nsectors;
> +	set_capacity(dev->gd, dev->nsectors);
> +	err = revalidate_disk(dev->gd);
> +	if (err)
> +		rnbd_clt_err(dev,
> +			      "Failed to change device size from %zu to %zu, err: %d\n",
> +			      dev->nsectors, new_nsectors, err);
> +	return err;
> +}
> +
> +static int process_msg_open_rsp(struct rnbd_clt_dev *dev,
> +				struct rnbd_msg_open_rsp *rsp)
> +{
> +	int err = 0;
> +
> +	mutex_lock(&dev->lock);
> +	if (dev->dev_state == DEV_STATE_UNMAPPED) {
> +		rnbd_clt_info(dev,
> +			       "Ignoring Open-Response message from server for unmapped device\n");
> +		err = -ENOENT;
> +		goto out;
> +	}
> +	if (dev->dev_state == DEV_STATE_MAPPED_DISCONNECTED) {
> +		u64 nsectors = le64_to_cpu(rsp->nsectors);
> +
> +		/*
> +		 * If the device was remapped and the size changed in the
> +		 * meantime we need to revalidate it
> +		 */
> +		if (dev->nsectors != nsectors)
> +			rnbd_clt_change_capacity(dev, nsectors);
> +		rnbd_clt_info(dev, "Device online, device remapped successfully\n");
> +	}
> +	err = rnbd_clt_set_dev_attr(dev, rsp);
> +	if (unlikely(err))
> +		goto out;
> +	dev->dev_state = DEV_STATE_MAPPED;
> +
> +out:
> +	mutex_unlock(&dev->lock);
> +
> +	return err;
> +}
> +
> +int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, size_t newsize)
> +{
> +	int ret = 0;
> +
> +	mutex_lock(&dev->lock);
> +	if (dev->dev_state != DEV_STATE_MAPPED) {
> +		pr_err("Failed to set new size of the device, device is not opened\n");
> +		ret = -ENOENT;
> +		goto out;
> +	}
> +	ret = rnbd_clt_change_capacity(dev, newsize);
> +
> +out:
> +	mutex_unlock(&dev->lock);
> +
> +	return ret;
> +}
> +
> +static inline void rnbd_clt_dev_requeue(struct rnbd_queue *q)
> +{
> +	if (WARN_ON(!q->hctx))
> +		return;
> +
> +	/* We can come here from interrupt, thus async=true */
> +	blk_mq_run_hw_queue(q->hctx, true);
> +}
> +
> +enum {
> +	RNBD_DELAY_10ms   = 10,
> +	RNBD_DELAY_IFBUSY = -1,
> +};
> +
> +/**
> + * rnbd_get_cpu_qlist() - finds a list with HW queues to be rerun
> + * @sess:	Session to find a queue for
> + * @cpu:	CPU to start the search from
> + *
> + * Description:
> + *     Each CPU has a list of HW queues, which need to be rerun.  If a list
> + *     is not empty, it is marked with a bit.  This function finds the first
> + *     set bit in the bitmap and returns the corresponding CPU list.
> + */
> +static struct rnbd_cpu_qlist *
> +rnbd_get_cpu_qlist(struct rnbd_clt_session *sess, int cpu)
> +{
> +	int bit;
> +
> +	/* First half */
> +	bit = find_next_bit(sess->cpu_queues_bm, nr_cpu_ids, cpu);

Is it protected by any lock?
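
If the answer is that this is an intentionally lockless read that pairs
with the set_bit()/clear_bit() on the requeue paths, please say so in a
comment so the next reader doesn't trip over the same question.  My guess
at such a comment, assuming that really is the design:

	/*
	 * Lockless read: pairs with set_bit() on the requeue path and
	 * clear_bit() in rnbd_rerun_if_needed().  A stale result only
	 * delays a rerun of the queue, it never loses one.
	 */
	bit = find_next_bit(sess->cpu_queues_bm, nr_cpu_ids, cpu);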

> +	if (bit < nr_cpu_ids) {
> +		return per_cpu_ptr(sess->cpu_queues, bit);
> +	} else if (cpu != 0) {
> +		/* Second half */
> +		bit = find_next_bit(sess->cpu_queues_bm, cpu, 0);
> +		if (bit < cpu)
> +			return per_cpu_ptr(sess->cpu_queues, bit);
> +	}
> +
> +	return NULL;
> +}
> +
> +static inline int nxt_cpu(int cpu)
> +{
> +	return (cpu + 1) % nr_cpu_ids;
> +}
> +
> +/**
> + * rnbd_rerun_if_needed() - rerun next queue marked as stopped
> + * @sess:	Session to rerun a queue on
> + *
> + * Description:
> + *     Each CPU has its own list of HW queues, which should be rerun.
> + *     The function finds such a list of HW queues, takes the list lock,
> + *     picks up the first HW queue from the list and requeues it.
> + *
> + * Return:
> + *     True if the queue was requeued, false otherwise.
> + *
> + * Context:
> + *     Does not matter.
> + */
> +static inline bool rnbd_rerun_if_needed(struct rnbd_clt_session *sess)

No inline functions in C files.
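
The compiler is free to inline a static function in a .c file whenever it
decides that pays off, so simply dropping the keyword is enough:

	static bool rnbd_rerun_if_needed(struct rnbd_clt_session *sess)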

> +{
> +	struct rnbd_queue *q = NULL;
> +	struct rnbd_cpu_qlist *cpu_q;
> +	unsigned long flags;
> +	int *cpup;
> +
> +	/*
> +	 * To keep fairness and not let other queues starve, we always try
> +	 * to wake up someone else in a round-robin manner.  That of course
> +	 * increases latency, but queues always have a chance to be executed.
> +	 */
> +	cpup = get_cpu_ptr(sess->cpu_rr);
> +	for (cpu_q = rnbd_get_cpu_qlist(sess, nxt_cpu(*cpup)); cpu_q;
> +	     cpu_q = rnbd_get_cpu_qlist(sess, nxt_cpu(cpu_q->cpu))) {
> +		if (!spin_trylock_irqsave(&cpu_q->requeue_lock, flags))
> +			continue;
> +		if (likely(test_bit(cpu_q->cpu, sess->cpu_queues_bm))) {

Success-oriented approach, please.
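
I.e. handle the failure case first and keep the main flow unindented,
roughly like this (sketch only, with the labels adjusted accordingly):

	if (!test_bit(cpu_q->cpu, sess->cpu_queues_bm))
		goto unlock;

	q = list_first_entry_or_null(&cpu_q->requeue_list,
				     typeof(*q), requeue_list);
	if (WARN_ON(!q))
		goto clear_bit;
	...
unlock:
	spin_unlock_irqrestore(&cpu_q->requeue_lock, flags);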

> +			q = list_first_entry_or_null(&cpu_q->requeue_list,
> +						     typeof(*q), requeue_list);
> +			if (WARN_ON(!q))
> +				goto clear_bit;
> +			list_del_init(&q->requeue_list);
> +			clear_bit_unlock(0, &q->in_list);
> +
> +			if (list_empty(&cpu_q->requeue_list)) {
> +				/* Clear bit if nothing is left */
> +clear_bit:
> +				clear_bit(cpu_q->cpu, sess->cpu_queues_bm);
> +			}
> +		}
> +		spin_unlock_irqrestore(&cpu_q->requeue_lock, flags);
> +
> +		if (q)
> +			break;
> +	}
> +
> +	/*
> +	 * Saves the CPU that is going to be requeued on the per-cpu var. Just
> +	 * incrementing it doesn't work because rnbd_get_cpu_qlist() will
> +	 * always return the first CPU with something on the queue list when the
> +	 * value stored on the var is greater than the last CPU with something
> +	 * on the list.
> +	 */
> +	if (cpu_q)
> +		*cpup = cpu_q->cpu;
> +	put_cpu_var(sess->cpu_rr);
> +
> +	if (q)
> +		rnbd_clt_dev_requeue(q);
> +
> +	return !!q;
> +}
> +
> +/**
> + * rnbd_rerun_all_if_idle() - rerun all queues left in the list if
> + *				 session is idling (there are no requests
> + *				 in-flight).
> + * @sess:	Session to rerun the queues on
> + *
> + * Description:
> + *     This function tries to rerun all stopped queues if there are no
> + *     requests in-flight anymore.  It solves an obvious problem: when the
> + *     number of tags is less than the number of queues (hctxs) that are
> + *     stopped and put to sleep, and the last permit that has just been put
> + *     does not wake up all remaining queues (hctxs), IO requests hang forever.
> + *
> + *     That can happen when all permits, say N, have been exhausted from
> + *     one CPU, and we have many block devices per session, say M.  Each
> + *     block device has its own queue (hctx) for each CPU, so eventually we
> + *     can put that many queues (hctxs) to sleep: M x nr_cpu_ids.  If the
> + *     number of permits N < M x nr_cpu_ids, we finally get an IO hang.
> + *
> + *     To avoid this hang, the last caller of rnbd_put_permit() (the one
> + *     who observes sess->busy == 0) must wake up all remaining queues.
> + *
> + * Context:
> + *     Does not matter.
> + */
> +static inline void rnbd_rerun_all_if_idle(struct rnbd_clt_session *sess)
> +{
> +	bool requeued;
> +
> +	do {
> +		requeued = rnbd_rerun_if_needed(sess);
> +	} while (atomic_read(&sess->busy) == 0 && requeued);
> +}
> +
> +static struct rtrs_permit *rnbd_get_permit(struct rnbd_clt_session *sess,
> +					     enum rtrs_clt_con_type con_type,
> +					     int wait)
> +{
> +	struct rtrs_permit *permit;
> +
> +	permit = rtrs_clt_get_permit(sess->rtrs, con_type,
> +				      wait ? RTRS_PERMIT_WAIT :
> +				      RTRS_PERMIT_NOWAIT);
> +	if (likely(permit))
> +		/* We have a subtle rare case here, when all permits can be
> +		 * consumed before the busy counter is increased.  This is
> +		 * safe, because the loser will get NULL as a permit, observe
> +		 * a busy counter of 0 and immediately restart the queue
> +		 * itself.
> +		 */
> +		atomic_inc(&sess->busy);
> +
> +	return permit;
> +}
> +
> +static void rnbd_put_permit(struct rnbd_clt_session *sess,
> +			     struct rtrs_permit *permit)
> +{
> +	rtrs_clt_put_permit(sess->rtrs, permit);
> +	atomic_dec(&sess->busy);
> +	/* Paired with rnbd_clt_dev_add_to_requeue().  Decrement first
> +	 * and then check queue bits.
> +	 */
> +	smp_mb__after_atomic();
> +	rnbd_rerun_all_if_idle(sess);
> +}
> +
> +static struct rnbd_iu *rnbd_get_iu(struct rnbd_clt_session *sess,
> +				     enum rtrs_clt_con_type con_type,
> +				     int wait)
> +{
> +	struct rnbd_iu *iu;
> +	struct rtrs_permit *permit;
> +
> +	permit = rnbd_get_permit(sess, con_type,
> +				  wait ? RTRS_PERMIT_WAIT :
> +				  RTRS_PERMIT_NOWAIT);
> +	if (unlikely(!permit))
> +		return NULL;
> +	iu = rtrs_permit_to_pdu(permit);
> +	iu->permit = permit;
> +	/* yes, rtrs_permit_from_pdu() can be nice here,
> +	 * but also we have to think about MQ mode
> +	 */
> +	/*
> +	 * The 1st reference is dropped after finishing sending a "user"
> +	 * message, the 2nd reference is dropped after the confirmation
> +	 * response is returned.
> +	 * 1st and 2nd can happen in any order, so the rnbd_iu should be
> +	 * released (rtrs_permit returned to rtrs) only after both are
> +	 * finished.
> +	 */
> +	atomic_set(&iu->refcount, 2);
> +	init_waitqueue_head(&iu->comp.wait);
> +	iu->comp.errno = INT_MAX;
> +
> +	return iu;
> +}
> +
> +static void rnbd_put_iu(struct rnbd_clt_session *sess, struct rnbd_iu *iu)
> +{
> +	if (atomic_dec_and_test(&iu->refcount))
> +		rnbd_put_permit(sess, iu->permit);
> +}
> +
> +static void rnbd_softirq_done_fn(struct request *rq)
> +{
> +	struct rnbd_clt_dev *dev	= rq->rq_disk->private_data;
> +	struct rnbd_clt_session *sess	= dev->sess;

Please no vertical alignment in new code; it adds a lot of churn if such
a line is changed later and creates difficulties for backports.
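
A single space survives refactoring without touching neighbouring lines:

	struct rnbd_clt_dev *dev = rq->rq_disk->private_data;
	struct rnbd_clt_session *sess = dev->sess;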

Thanks
