From: Coly Li <colyli@suse.de>
To: Qiaowei Ren <qiaowei.ren@intel.com>, Jianpeng Ma <jianpeng.ma@intel.com>
Cc: linux-bcache@vger.kernel.org
Subject: Re: [bch-nvm-pages v7 2/6] bcache: initialize the nvm pages allocator
Date: Mon, 29 Mar 2021 13:54:05 +0800	[thread overview]
Message-ID: <f1bab0fa-e5cf-a0ca-5e89-b9dfdcff7988@suse.de> (raw)
In-Reply-To: <20210317151029.40735-3-qiaowei.ren@intel.com>

On 3/17/21 11:10 PM, Qiaowei Ren wrote:
> From: Jianpeng Ma <jianpeng.ma@intel.com>
> 
> This patch defines the prototype in-memory data structures and
> initializes the nvm pages allocator.
> 
> The NVM address space managed by this allocator can consist of many
> nvm namespaces, and several namespaces can be composed into one nvm
> set, similar to a cache set. This initial implementation supports
> only one set.
> 
> The users of this nvm pages allocator need to call bch_register_namespace()
> to register an nvdimm device (like /dev/pmemX) with this allocator as
> an instance of struct bch_nvm_namespace.
> 
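
For context, a caller of this interface would look roughly like the
sketch below (illustrative only, not part of the patch; the /dev/pmem0
path is just an example):

	struct bch_nvm_namespace *ns;

	ns = bch_register_namespace("/dev/pmem0");	/* hypothetical path */
	if (IS_ERR(ns)) {
		pr_err("failed to register nvdimm namespace: %ld\n",
		       PTR_ERR(ns));
		return PTR_ERR(ns);
	}
	/* on success, ns->kaddr points at the DAX-mapped namespace */
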
> Signed-off-by: Jianpeng Ma <jianpeng.ma@intel.com>
> Co-authored-by: Qiaowei Ren <qiaowei.ren@intel.com>
> ---
>  drivers/md/bcache/Kconfig       |   6 +
>  drivers/md/bcache/Makefile      |   2 +-
>  drivers/md/bcache/nvm-pages.c   | 291 ++++++++++++++++++++++++++++++++
>  drivers/md/bcache/nvm-pages.h   |  71 ++++++++
>  drivers/md/bcache/super.c       |   3 +
>  include/uapi/linux/bcache-nvm.h |   7 -
>  6 files changed, 372 insertions(+), 8 deletions(-)
>  create mode 100644 drivers/md/bcache/nvm-pages.c
>  create mode 100644 drivers/md/bcache/nvm-pages.h
> 
> diff --git a/drivers/md/bcache/Kconfig b/drivers/md/bcache/Kconfig
> index d1ca4d059c20..fdec9905ef40 100644
> --- a/drivers/md/bcache/Kconfig
> +++ b/drivers/md/bcache/Kconfig
> @@ -35,3 +35,9 @@ config BCACHE_ASYNC_REGISTRATION
>  	device path into this file will returns immediately and the real
>  	registration work is handled in kernel work queue in asynchronous
>  	way.
> +
> +config BCACHE_NVM_PAGES
> +	bool "NVDIMM support for bcache (EXPERIMENTAL)"
> +	depends on BCACHE
> +	help
> +	  nvm pages allocator for bcache.
> diff --git a/drivers/md/bcache/Makefile b/drivers/md/bcache/Makefile
> index 5b87e59676b8..948e5ed2ca66 100644
> --- a/drivers/md/bcache/Makefile
> +++ b/drivers/md/bcache/Makefile
> @@ -4,4 +4,4 @@ obj-$(CONFIG_BCACHE)	+= bcache.o
>  
>  bcache-y		:= alloc.o bset.o btree.o closure.o debug.o extents.o\
>  	io.o journal.o movinggc.o request.o stats.o super.o sysfs.o trace.o\
> -	util.o writeback.o features.o
> +	util.o writeback.o features.o nvm-pages.o
> diff --git a/drivers/md/bcache/nvm-pages.c b/drivers/md/bcache/nvm-pages.c
> new file mode 100644
> index 000000000000..9335371c9d91
> --- /dev/null
> +++ b/drivers/md/bcache/nvm-pages.c
> @@ -0,0 +1,291 @@
> +// SPDX-License-Identifier: GPL-2.0-only
> +/*
> + * Nvdimm page-buddy allocator
> + *
> + * Copyright (c) 2021, Intel Corporation.
> + * Copyright (c) 2021, Qiaowei Ren <qiaowei.ren@intel.com>.
> + * Copyright (c) 2021, Jianpeng Ma <jianpeng.ma@intel.com>.
> + */
> +
> +#include "bcache.h"
> +#include "nvm-pages.h"
> +
> +#include <linux/slab.h>
> +#include <linux/list.h>
> +#include <linux/mutex.h>
> +#include <linux/dax.h>
> +#include <linux/pfn_t.h>
> +#include <linux/libnvdimm.h>
> +#include <linux/mm_types.h>
> +#include <linux/err.h>
> +#include <linux/pagemap.h>
> +#include <linux/bitmap.h>
> +#include <linux/blkdev.h>
> +
> +#ifdef CONFIG_BCACHE_NVM_PAGES
> +
> +static const char bch_nvm_pages_magic[] = {
> +	0x17, 0xbd, 0x53, 0x7f, 0x1b, 0x23, 0xd6, 0x83,
> +	0x46, 0xa4, 0xf8, 0x28, 0x17, 0xda, 0xec, 0xa9 };
> +static const char bch_nvm_pages_pgalloc_magic[] = {
> +	0x39, 0x25, 0x3f, 0xf7, 0x27, 0x17, 0xd0, 0xb9,
> +	0x10, 0xe6, 0xd2, 0xda, 0x38, 0x68, 0x26, 0xae };
> +

The above magic strings belong in include/uapi/linux/bcache-nvm.h; you
may add

#include <linux/bcache-nvm.h>

to include them from the uapi header.
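
With that change, the top of nvm-pages.c would simply read (a sketch,
not the posted patch):

	#include "bcache.h"
	#include "nvm-pages.h"
	#include <linux/bcache-nvm.h>	/* for the two magic arrays */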


> +struct bch_nvm_set *only_set;
> +
> +static void release_nvm_namespaces(struct bch_nvm_set *nvm_set)
> +{
> +	int i;
> +	struct bch_nvm_namespace *ns;
> +
> +	for (i = 0; i < nvm_set->total_namespaces_nr; i++) {
> +		ns = nvm_set->nss[i];
> +		if (ns) {
> +			blkdev_put(ns->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXEC);
> +			kfree(ns);
> +		}
> +	}
> +
> +	kfree(nvm_set->nss);
> +}
> +
> +static void release_nvm_set(struct bch_nvm_set *nvm_set)
> +{
> +	release_nvm_namespaces(nvm_set);
> +	kfree(nvm_set);
> +}
> +
> +static int init_owner_info(struct bch_nvm_namespace *ns)
> +{
> +	struct bch_owner_list_head *owner_list_head = ns->sb->owner_list_head;
> +
> +	mutex_lock(&only_set->lock);
> +	only_set->owner_list_head = owner_list_head;
> +	only_set->owner_list_size = owner_list_head->size;
> +	only_set->owner_list_used = owner_list_head->used;
> +	mutex_unlock(&only_set->lock);
> +
> +	return 0;
> +}
> +
> +static bool attach_nvm_set(struct bch_nvm_namespace *ns)
> +{
> +	bool rc = true;
> +
> +	mutex_lock(&only_set->lock);
> +	if (only_set->nss) {
> +		if (memcmp(ns->sb->set_uuid, only_set->set_uuid, 16)) {
> +			pr_info("namespace id doesn't match nvm set\n");
> +			rc = false;
> +			goto unlock;
> +		}
> +
> +		if (only_set->nss[ns->sb->this_namespace_nr]) {
> +			pr_info("already has the same position(%d) nvm\n",
> +					ns->sb->this_namespace_nr);
> +			rc = false;
> +			goto unlock;
> +		}
> +	} else {
> +		memcpy(only_set->set_uuid, ns->sb->set_uuid, 16);
> +		only_set->total_namespaces_nr = ns->sb->total_namespaces_nr;
> +		only_set->nss = kcalloc(only_set->total_namespaces_nr,
> +				sizeof(struct bch_nvm_namespace *), GFP_KERNEL);
> +		if (!only_set->nss) {
> +			rc = false;
> +			goto unlock;
> +		}
> +	}
> +
> +	only_set->nss[ns->sb->this_namespace_nr] = ns;
> +
> +unlock:
> +	mutex_unlock(&only_set->lock);
> +	return rc;
> +}
> +
> +static int read_nvdimm_meta_super(struct block_device *bdev,
> +			      struct bch_nvm_namespace *ns)
> +{
> +	struct page *page;
> +	struct bch_nvm_pages_sb *sb;
> +
> +	page = read_cache_page_gfp(bdev->bd_inode->i_mapping,
> +			BCH_NVM_PAGES_SB_OFFSET >> PAGE_SHIFT, GFP_KERNEL);
> +
> +	if (IS_ERR(page))
> +		return -EIO;
> +
> +	sb = page_address(page) + offset_in_page(BCH_NVM_PAGES_SB_OFFSET);
> +
> +	// temporary use for DAX API
> +	ns->page_size = sb->page_size;
> +	ns->pages_total = sb->pages_total;
> +
> +	put_page(page);
> +
> +	return 0;
> +}
> +
> +struct bch_nvm_namespace *bch_register_namespace(const char *dev_path)
> +{
> +	struct bch_nvm_namespace *ns;
> +	int err;
> +	pgoff_t pgoff;
> +	char buf[BDEVNAME_SIZE];
> +	struct block_device *bdev;
> +	uint64_t expected_csum;
> +	int id;
> +	char *path = NULL;
> +
> +	path = kstrndup(dev_path, 512, GFP_KERNEL);
> +	if (!path) {
> +		pr_err("kstrndup failed\n");
> +		return ERR_PTR(-ENOMEM);
> +	}
> +
> +	bdev = blkdev_get_by_path(strim(path),
> +				  FMODE_READ|FMODE_WRITE|FMODE_EXEC,
> +				  only_set);
> +	if (IS_ERR(bdev)) {
> +		pr_info("get %s error: %ld\n", dev_path, PTR_ERR(bdev));
> +		kfree(path);
> +		return ERR_PTR(PTR_ERR(bdev));
> +	}
> +
> +	ns = kzalloc(sizeof(struct bch_nvm_namespace), GFP_KERNEL);
> +	if (!ns)
> +		goto bdput;
> +
> +	err = -EIO;
> +	if (read_nvdimm_meta_super(bdev, ns)) {
> +		pr_info("%s read nvdimm meta super block failed.\n",
> +			bdevname(bdev, buf));
> +		goto free_ns;
> +	}
> +
> +	err = -EOPNOTSUPP;
> +	if (!bdev_dax_supported(bdev, ns->page_size)) {
> +		pr_info("%s doesn't support DAX\n", bdevname(bdev, buf));
> +		goto free_ns;
> +	}
> +
> +	err = -EINVAL;
> +	if (bdev_dax_pgoff(bdev, 0, ns->page_size, &pgoff)) {
> +		pr_info("invalid offset of %s\n", bdevname(bdev, buf));
> +		goto free_ns;
> +	}
> +
> +	err = -ENOMEM;
> +	ns->dax_dev = fs_dax_get_by_bdev(bdev);
> +	if (!ns->dax_dev) {
> +		pr_info("can't get dax device by %s\n", bdevname(bdev, buf));
> +		goto free_ns;
> +	}
> +
> +	err = -EINVAL;
> +	id = dax_read_lock();
> +	if (dax_direct_access(ns->dax_dev, pgoff, ns->pages_total,
> +			      &ns->kaddr, &ns->start_pfn) <= 0) {
> +		pr_info("dax_direct_access error\n");
> +		dax_read_unlock(id);
> +		goto free_ns;
> +	}
> +	dax_read_unlock(id);
> +
> +	ns->sb = ns->kaddr + BCH_NVM_PAGES_SB_OFFSET;
> +
> +	if (memcmp(ns->sb->magic, bch_nvm_pages_magic, 16)) {
> +		pr_info("invalid bch_nvm_pages_magic\n");
> +		goto free_ns;
> +	}
> +
> +	if (ns->sb->sb_offset != BCH_NVM_PAGES_SB_OFFSET) {
> +		pr_info("invalid superblock offset\n");
> +		goto free_ns;
> +	}
> +
> +	if (ns->sb->total_namespaces_nr != 1) {
> +		pr_info("only one nvm device is supported\n");
> +		goto free_ns;
> +	}
> +
> +	expected_csum = csum_set(ns->sb);
> +	if (expected_csum != ns->sb->csum) {
> +		pr_info("csum does not match the expected one\n");
> +		goto free_ns;
> +	}
> +
> +	err = -EEXIST;
> +	if (!attach_nvm_set(ns))
> +		goto free_ns;
> +
> +	// first-time attach: initialize owner list head in NVM
> +	if ((unsigned long)ns->sb->owner_list_head == BCH_NVM_PAGES_OWNER_LIST_HEAD_OFFSET) {
> +		struct bch_nvm_pages_owner_head *sys_owner_head;
> +		struct bch_nvm_pgalloc_recs *sys_pgalloc_recs;
> +
> +		ns->sb->owner_list_head = ns->kaddr + BCH_NVM_PAGES_OWNER_LIST_HEAD_OFFSET;
> +		sys_pgalloc_recs = ns->kaddr + BCH_NVM_PAGES_SYS_RECS_HEAD_OFFSET;
> +
> +		sys_owner_head = &(ns->sb->owner_list_head->heads[0]);
> +		sys_owner_head->recs[0] = sys_pgalloc_recs;
> +		ns->sb->csum = csum_set(ns->sb);
> +
> +		sys_pgalloc_recs->owner = sys_owner_head;
> +	} else
> +		BUG_ON(ns->sb->owner_list_head !=
> +			(ns->kaddr + BCH_NVM_PAGES_OWNER_LIST_HEAD_OFFSET));
> +
> +	ns->page_size = ns->sb->page_size;
> +	ns->pages_offset = ns->sb->pages_offset;
> +	ns->pages_total = ns->sb->pages_total;
> +	ns->free = 0;
> +	ns->bdev = bdev;
> +	ns->nvm_set = only_set;
> +	mutex_init(&ns->lock);
> +
> +	if (ns->sb->this_namespace_nr == 0) {
> +		pr_info("only the first namespace contains owner info\n");
> +		err = init_owner_info(ns);
> +		if (err < 0) {
> +			pr_info("init_owner_info met error %d\n", err);
> +			only_set->nss[ns->sb->this_namespace_nr] = NULL;
> +			goto free_ns;
> +		}
> +	}
> +
> +	kfree(path);
> +	return ns;
> +free_ns:
> +	kfree(ns);
> +bdput:
> +	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXEC);
> +	kfree(path);
> +	return ERR_PTR(err);
> +}
> +EXPORT_SYMBOL_GPL(bch_register_namespace);
> +
> +int __init bch_nvm_init(void)
> +{
> +	only_set = kzalloc(sizeof(*only_set), GFP_KERNEL);
> +	if (!only_set)
> +		return -ENOMEM;
> +
> +	only_set->total_namespaces_nr = 0;
> +	only_set->owner_list_head = NULL;
> +	only_set->nss = NULL;
> +
> +	mutex_init(&only_set->lock);
> +
> +	pr_info("bcache nvm init\n");
> +	return 0;
> +}
> +
> +void bch_nvm_exit(void)
> +{
> +	release_nvm_set(only_set);
> +	pr_info("bcache nvm exit\n");
> +}
> +
> +#endif
> diff --git a/drivers/md/bcache/nvm-pages.h b/drivers/md/bcache/nvm-pages.h
> new file mode 100644
> index 000000000000..3b723a775b7b
> --- /dev/null
> +++ b/drivers/md/bcache/nvm-pages.h
> @@ -0,0 +1,71 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#ifndef _BCACHE_NVM_PAGES_H
> +#define _BCACHE_NVM_PAGES_H
> +
> +#include <linux/bcache-nvm.h>
> +
> +/*
> + * Bcache NVDIMM in memory data structures
> + */
> +
> +/*
> + * The following three structures in memory records which page(s) allocated
> + * to which owner. After reboot from power failure, they will be initialized
> + * based on nvm pages superblock in NVDIMM device.
> + */
> +struct bch_nvm_namespace {
> +	struct bch_nvm_pages_sb *sb;
> +	void *kaddr;
> +
> +	u8 uuid[16];
> +	u64 free;
> +	u32 page_size;
> +	u64 pages_offset;
> +	u64 pages_total;
> +	pfn_t start_pfn;
> +
> +	struct dax_device *dax_dev;
> +	struct block_device *bdev;
> +	struct bch_nvm_set *nvm_set;
> +
> +	struct mutex lock;
> +};
> +
> +/*
> + * A set of namespaces. Currently only one set can be supported.
> + */
> +struct bch_nvm_set {
> +	u8 set_uuid[16];
> +	u32 total_namespaces_nr;
> +
> +	u32 owner_list_size;
> +	u32 owner_list_used;
> +	struct bch_owner_list_head *owner_list_head;
> +
> +	struct bch_nvm_namespace **nss;
> +
> +	struct mutex lock;
> +};
> +extern struct bch_nvm_set *only_set;
> +
> +#ifdef CONFIG_BCACHE_NVM_PAGES
> +
> +struct bch_nvm_namespace *bch_register_namespace(const char *dev_path);
> +int bch_nvm_init(void);
> +void bch_nvm_exit(void);
> +
> +#else
> +
> +static inline struct bch_nvm_namespace *bch_register_namespace(const char *dev_path)
> +{
> +	return NULL;
> +}
> +static inline int bch_nvm_init(void)
> +{
> +	return 0;
> +}
> +static inline void bch_nvm_exit(void) { }
> +
> +#endif /* CONFIG_BCACHE_NVM_PAGES */
> +
> +#endif /* _BCACHE_NVM_PAGES_H */
> diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
> index 2047a9cccdb5..7fffb6ccfb0c 100644
> --- a/drivers/md/bcache/super.c
> +++ b/drivers/md/bcache/super.c
> @@ -14,6 +14,7 @@
>  #include "request.h"
>  #include "writeback.h"
>  #include "features.h"
> +#include "nvm-pages.h"
>  
>  #include <linux/blkdev.h>
>  #include <linux/debugfs.h>
> @@ -2815,6 +2816,7 @@ static void bcache_exit(void)
>  {
>  	bch_debug_exit();
>  	bch_request_exit();
> +	bch_nvm_exit();
>  	if (bcache_kobj)
>  		kobject_put(bcache_kobj);
>  	if (bcache_wq)
> @@ -2894,6 +2896,7 @@ static int __init bcache_init(void)
>  
>  	bch_debug_init();
>  	closure_debug_init();
> +	bch_nvm_init();
>  
>  	bcache_is_reboot = false;
>  
> diff --git a/include/uapi/linux/bcache-nvm.h b/include/uapi/linux/bcache-nvm.h
> index 01370f4e12d4..8bd5e8f96cf5 100644
> --- a/include/uapi/linux/bcache-nvm.h
> +++ b/include/uapi/linux/bcache-nvm.h
> @@ -99,13 +99,6 @@
>  #define BCH_NVM_PAGES_SB_VERSION		0
>  #define BCH_NVM_PAGES_SB_VERSION_MAX		0
>  
> -static const char bch_nvm_pages_magic[] = {
> -	0x17, 0xbd, 0x53, 0x7f, 0x1b, 0x23, 0xd6, 0x83,
> -	0x46, 0xa4, 0xf8, 0x28, 0x17, 0xda, 0xec, 0xa9 };
> -static const char bch_nvm_pages_pgalloc_magic[] = {
> -	0x39, 0x25, 0x3f, 0xf7, 0x27, 0x17, 0xd0, 0xb9,
> -	0x10, 0xe6, 0xd2, 0xda, 0x38, 0x68, 0x26, 0xae };


The above magic strings will be needed by user-space tools to verify
the metadata; they should stay here and not be moved elsewhere.
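
For instance, a user-space checker could verify the on-device magic
straight from this uapi header. The following is only a sketch
(simplified error handling, hypothetical helper name; it assumes
struct bch_nvm_pages_sb and BCH_NVM_PAGES_SB_OFFSET from this header):

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>
	#include <linux/bcache-nvm.h>

	/* return 0 if the device carries a valid nvm-pages magic */
	int bch_check_nvm_magic(const char *dev)	/* e.g. "/dev/pmem0" */
	{
		struct bch_nvm_pages_sb sb;
		int fd = open(dev, O_RDONLY);

		if (fd < 0)
			return -1;
		/* the superblock sits at BCH_NVM_PAGES_SB_OFFSET */
		if (pread(fd, &sb, sizeof(sb), BCH_NVM_PAGES_SB_OFFSET) != sizeof(sb)) {
			close(fd);
			return -1;
		}
		close(fd);
		return memcmp(sb.magic, bch_nvm_pages_magic, 16) ? -1 : 0;
	}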

> -
>  struct bch_pgalloc_rec {
>  	__u64			pgoff:52;
>  	__u64			order:12;
> 

BTW, the above definition of the bit fields is improper; you won't get
what you want as a 64-bit sized record. But don't worry, I have fixed
it in other patches, and the joint series will be posted soon after I
integrate them together.
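
For illustration only (not the actual fix in the upcoming series), one
way to get a well-defined 64-bit record is to drop the bit fields and
pack the two values into a __u64 with explicit shifts and masks:

	/* 52-bit pgoff in the low bits, 12-bit order in the high bits */
	#define BCH_PGALLOC_PGOFF_BITS	52
	#define BCH_PGALLOC_PGOFF_MASK	((1ULL << BCH_PGALLOC_PGOFF_BITS) - 1)

	static inline __u64 bch_pgalloc_rec_encode(__u64 pgoff, __u64 order)
	{
		return (pgoff & BCH_PGALLOC_PGOFF_MASK) |
		       (order << BCH_PGALLOC_PGOFF_BITS);
	}

	static inline __u64 bch_pgalloc_rec_pgoff(__u64 rec)
	{
		return rec & BCH_PGALLOC_PGOFF_MASK;
	}

	static inline __u64 bch_pgalloc_rec_order(__u64 rec)
	{
		return rec >> BCH_PGALLOC_PGOFF_BITS;
	}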

Coly Li



Thread overview: 14+ messages
2021-03-17 15:10 [bch-nvm-pages v7 0/6] nvm page allocator for bcache Qiaowei Ren
2021-03-17 15:10 ` [bch-nvm-pages v7 1/6] bcache: add initial data structures for nvm pages Qiaowei Ren
2021-03-29  5:47   ` Coly Li
2021-03-17 15:10 ` [bch-nvm-pages v7 2/6] bcache: initialize the nvm pages allocator Qiaowei Ren
2021-03-29  5:54   ` Coly Li [this message]
2021-04-05 15:14     ` Coly Li
2021-03-17 15:10 ` [bch-nvm-pages v7 3/6] bcache: initialization of the buddy Qiaowei Ren
2021-03-29 11:10   ` Coly Li
2021-03-30  2:13     ` Ma, Jianpeng
2021-03-17 15:10 ` [bch-nvm-pages v7 4/6] bcache: bch_nvm_alloc_pages() " Qiaowei Ren
2021-03-29 10:01   ` Coly Li
2021-03-17 15:10 ` [bch-nvm-pages v7 5/6] bcache: bch_nvm_free_pages() " Qiaowei Ren
2021-03-17 15:10 ` [bch-nvm-pages v7 6/6] bcache: get allocated pages from specific owner Qiaowei Ren
2021-03-23 13:24 ` [bch-nvm-pages v7 0/6] nvm page allocator for bcache Coly Li
