* [PATCH 1/3] ceph: map snapid to anonymous bdev ID
@ 2017-12-15  9:24 Yan, Zheng
  2017-12-15  9:24 ` [PATCH 2/3] ceph: track read contexts in ceph_file_info Yan, Zheng
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: Yan, Zheng @ 2017-12-15  9:24 UTC (permalink / raw)
  To: ceph-devel; +Cc: jlayton, idryomov, Yan, Zheng

ceph_getattr() returns a zero dev ID for head inodes and sets the dev ID
to the snapid directly for snapshot inodes. This is not good because
userspace utilities may consider a device ID of 0 invalid, and the snapid
may conflict with another device's ID.

This patch introduces a "snapid to anonymous bdev ID" map. We create a
new mapping when we see a snapid for the first time, and trim unused
mappings after they have been idle for an hour.
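
For illustration only (not part of the patch): a minimal userspace sketch
of the visible effect. The mount point and snapshot name below are
hypothetical; after this change, stat(2) on a snapshot path should report
a non-zero st_dev that is unique to the snapid and distinct from the head
inode's st_dev.

	/* build: cc -o snapdev snapdev.c */
	#include <stdio.h>
	#include <sys/stat.h>

	int main(void)
	{
		struct stat head, snap;

		if (stat("/mnt/cephfs/dir/file", &head) ||
		    stat("/mnt/cephfs/dir/.snap/snap1/file", &snap)) {
			perror("stat");
			return 1;
		}
		/* head inode: st_dev of the mounted superblock */
		printf("head st_dev = %lx\n", (unsigned long)head.st_dev);
		/* snapshot inode: anonymous bdev ID mapped from the snapid */
		printf("snap st_dev = %lx\n", (unsigned long)snap.st_dev);
		return 0;
	}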

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
---
 fs/ceph/addr.c       |   4 +-
 fs/ceph/inode.c      |  31 +++++++----
 fs/ceph/mds_client.c |   8 +++
 fs/ceph/mds_client.h |  13 +++++
 fs/ceph/snap.c       | 148 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ceph/super.h      |  13 ++++-
 6 files changed, 203 insertions(+), 14 deletions(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index dbf07051aacd..6f7f00a35190 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -450,8 +450,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
 		goto out;
 
 	max = fsc->mount_options->rsize >> PAGE_SHIFT;
-	dout("readpages %p file %p nr_pages %d max %d\n",
-	     inode, file, nr_pages, max);
+	dout("readpages %p file %p ctx %p nr_pages %d max %d\n",
+	     inode, file, rw_ctx, nr_pages, max);
 	while (!list_empty(page_list)) {
 		rc = start_read(inode, page_list, max);
 		if (rc < 0)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index c6ec5aa46100..76a428749981 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -542,14 +542,19 @@ void ceph_destroy_inode(struct inode *inode)
 	 */
 	if (ci->i_snap_realm) {
 		struct ceph_mds_client *mdsc =
-			ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
-		struct ceph_snap_realm *realm = ci->i_snap_realm;
-
-		dout(" dropping residual ref to snap realm %p\n", realm);
-		spin_lock(&realm->inodes_with_caps_lock);
-		list_del_init(&ci->i_snap_realm_item);
-		spin_unlock(&realm->inodes_with_caps_lock);
-		ceph_put_snap_realm(mdsc, realm);
+					ceph_inode_to_client(inode)->mdsc;
+		if (ceph_snap(inode) == CEPH_NOSNAP) {
+			struct ceph_snap_realm *realm = ci->i_snap_realm;
+			dout(" dropping residual ref to snap realm %p\n",
+			     realm);
+			spin_lock(&realm->inodes_with_caps_lock);
+			list_del_init(&ci->i_snap_realm_item);
+			spin_unlock(&realm->inodes_with_caps_lock);
+			ceph_put_snap_realm(mdsc, realm);
+		} else {
+			ceph_put_snapid_map(mdsc, ci->i_snapid_map);
+		}
+		ci->i_snap_realm = NULL;
 	}
 
 	kfree(ci->i_symlink);
@@ -763,6 +768,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 		pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data,
 						     iinfo->pool_ns_len);
 
+	if (ceph_snap(inode) != CEPH_NOSNAP && !ci->i_snapid_map)
+		ci->i_snapid_map = ceph_get_snapid_map(mdsc, ceph_snap(inode));
+
 	spin_lock(&ci->i_ceph_lock);
 
 	/*
@@ -2241,10 +2249,11 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
 	if (!err) {
 		generic_fillattr(inode, stat);
 		stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
-		if (ceph_snap(inode) != CEPH_NOSNAP)
-			stat->dev = ceph_snap(inode);
+		if (ceph_snap(inode) == CEPH_NOSNAP)
+			stat->dev = inode->i_sb->s_dev;
 		else
-			stat->dev = 0;
+			stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;
+
 		if (S_ISDIR(inode->i_mode)) {
 			if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
 						RBYTES))
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 251dc44d84a0..b6e62f5d97c1 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -3504,6 +3504,8 @@ static void delayed_work(struct work_struct *work)
 	dout("mdsc delayed_work\n");
 	ceph_check_delayed_caps(mdsc);
 
+	ceph_trim_snapid_map(mdsc);
+
 	mutex_lock(&mdsc->mutex);
 	renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
 	renew_caps = time_after_eq(jiffies, HZ*renew_interval +
@@ -3604,6 +3606,10 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
 	ceph_caps_init(mdsc);
 	ceph_adjust_min_caps(mdsc, fsc->min_caps);
 
+	spin_lock_init(&mdsc->snapid_map_lock);
+	mdsc->snapid_map_tree = RB_ROOT;
+	INIT_LIST_HEAD(&mdsc->snapid_map_lru);
+
 	init_rwsem(&mdsc->pool_perm_rwsem);
 	mdsc->pool_perm_tree = RB_ROOT;
 
@@ -3797,6 +3803,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
 	WARN_ON(!list_empty(&mdsc->cap_delay_list));
 	mutex_unlock(&mdsc->mutex);
 
+	ceph_cleanup_snapid_map(mdsc);
+
 	ceph_cleanup_empty_realms(mdsc);
 
 	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 837ac4b087a0..87506a8fc1c3 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -294,6 +294,15 @@ struct ceph_pool_perm {
 	char pool_ns[];
 };
 
+struct ceph_snapid_map {
+	struct rb_node node;
+	struct list_head lru;
+	atomic_t ref;
+	u64 snap;
+	dev_t dev;
+	unsigned long last_used;
+};
+
 /*
  * mds client state
  */
@@ -368,6 +377,10 @@ struct ceph_mds_client {
 	struct list_head  dentry_lru;
 	int		  num_dentry;
 
+	spinlock_t		snapid_map_lock;
+	struct rb_root		snapid_map_tree;
+	struct list_head	snapid_map_lru;
+
 	struct rw_semaphore     pool_perm_rwsem;
 	struct rb_root		pool_perm_tree;
 
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 8a2ca41e4b97..0af9ea832f03 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -979,3 +979,151 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
 		up_write(&mdsc->snap_rwsem);
 	return;
 }
+
+struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
+					    u64 snap)
+{
+	struct ceph_snapid_map *sm, *exist;
+	struct rb_node **p, *parent;
+	int ret;
+
+	exist = NULL;
+	spin_lock(&mdsc->snapid_map_lock);
+	p = &mdsc->snapid_map_tree.rb_node;
+	while (*p) {
+		exist = rb_entry(*p, struct ceph_snapid_map, node);
+		if (snap > exist->snap) {
+			p = &(*p)->rb_left;
+		} else if (snap < exist->snap) {
+			p = &(*p)->rb_right;
+		} else {
+			atomic_inc(&exist->ref);
+			list_del_init(&exist->lru);
+			break;
+		}
+		exist = NULL;
+	}
+	spin_unlock(&mdsc->snapid_map_lock);
+	if (exist) {
+		dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
+		return exist;
+	}
+
+	sm = kmalloc(sizeof(*sm), GFP_NOFS);
+	if (!sm)
+		return NULL;
+
+	ret = get_anon_bdev(&sm->dev);
+	if (ret < 0) {
+		kfree(sm);
+		return NULL;
+	}	
+
+	INIT_LIST_HEAD(&sm->lru);
+	atomic_set(&sm->ref, 1);
+	sm->snap = snap;
+
+	exist = NULL;
+	parent = NULL;
+	p = &mdsc->snapid_map_tree.rb_node;
+	spin_lock(&mdsc->snapid_map_lock);
+	while (*p) {
+		parent = *p;
+		exist = rb_entry(*p, struct ceph_snapid_map, node);
+		if (snap > exist->snap)
+			p = &(*p)->rb_left;
+		else if (snap < exist->snap)
+			p = &(*p)->rb_right;
+		else
+			break;
+		exist = NULL;
+	}
+	if (exist) {
+		atomic_inc(&exist->ref);
+		list_del_init(&exist->lru);
+	} else {
+		rb_link_node(&sm->node, parent, p);
+		rb_insert_color(&sm->node, &mdsc->snapid_map_tree);
+	}
+	spin_unlock(&mdsc->snapid_map_lock);
+	if (exist) {
+		free_anon_bdev(sm->dev);
+		kfree(sm);
+		dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
+		return exist;
+	}
+		
+	dout("create snapid map %llx -> %x\n", sm->snap, sm->dev);
+	return sm;
+}
+
+void ceph_put_snapid_map(struct ceph_mds_client* mdsc,
+			 struct ceph_snapid_map *sm)
+{
+	if (!sm)
+		return;
+	if (atomic_dec_and_lock(&sm->ref, &mdsc->snapid_map_lock)) {
+		sm->last_used = jiffies;
+		list_add_tail(&sm->lru, &mdsc->snapid_map_lru);
+		spin_unlock(&mdsc->snapid_map_lock);
+	}
+}
+
+void ceph_trim_snapid_map(struct ceph_mds_client *mdsc)
+{
+	struct ceph_snapid_map *sm;
+	unsigned long now;
+	LIST_HEAD(to_free);
+
+	/* unused map expires after an hour */
+	static unsigned timeout = 60 * 60 * HZ;
+
+	spin_lock(&mdsc->snapid_map_lock);
+	now = jiffies;
+
+	while (!list_empty(&mdsc->snapid_map_lru)) {
+		sm = list_first_entry(&mdsc->snapid_map_lru,
+				      struct ceph_snapid_map, lru);
+		if (time_after(sm->last_used + timeout, now))
+			break;
+
+		rb_erase(&sm->node, &mdsc->snapid_map_tree);
+		list_move(&sm->lru, &to_free);
+	}
+	spin_unlock(&mdsc->snapid_map_lock);
+
+	while (!list_empty(&to_free)) {
+		sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
+		list_del(&sm->lru);
+		dout("trim snapid map %llx -> %x\n", sm->snap, sm->dev);
+		free_anon_bdev(sm->dev);
+		kfree(sm);
+	}
+}
+
+void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc)
+{
+	struct ceph_snapid_map *sm;
+	struct rb_node *p;
+	LIST_HEAD(to_free);
+
+	spin_lock(&mdsc->snapid_map_lock);
+	while ((p = rb_first(&mdsc->snapid_map_tree))) {
+		sm = rb_entry(p, struct ceph_snapid_map, node);
+		rb_erase(p, &mdsc->snapid_map_tree);
+		list_move(&sm->lru, &to_free);
+	}
+	spin_unlock(&mdsc->snapid_map_lock);
+
+	while (!list_empty(&to_free)) {
+		sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
+		list_del(&sm->lru);
+		free_anon_bdev(sm->dev);
+		if (atomic_read(&sm->ref)) {
+			pr_err("snapid map %llx -> %x still in use\n",
+			       sm->snap, sm->dev);
+			continue;
+		}
+		kfree(sm);
+	}
+}
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 75701c199b2b..453edca41a37 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -362,7 +362,10 @@ struct ceph_inode_info {
 	struct list_head i_unsafe_iops;   /* uncommitted mds inode ops */
 	spinlock_t i_unsafe_lock;
 
-	struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
+	union {
+		struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
+		struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */
+	};
 	int i_snap_realm_counter; /* snap realm (if caps) */
 	struct list_head i_snap_realm_item;
 	struct list_head i_snap_flush_item;
@@ -778,6 +781,14 @@ extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
 				  struct ceph_cap_snap *capsnap);
 extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
 
+extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc,
+						   u64 snap);
+extern void ceph_put_snapid_map(struct ceph_mds_client* mdsc,
+				struct ceph_snapid_map *sm);
+extern void ceph_trim_snapid_map(struct ceph_mds_client *mdsc);
+extern void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc);
+
+
 /*
  * a cap_snap is "pending" if it is still awaiting an in-progress
  * sync write (that may/may not still update size, mtime, etc.).
-- 
2.13.6



* [PATCH 2/3] ceph: track read contexts in ceph_file_info
  2017-12-15  9:24 [PATCH 1/3] ceph: map snapid to anonymous bdev ID Yan, Zheng
@ 2017-12-15  9:24 ` Yan, Zheng
  2017-12-15  9:24 ` [PATCH 3/3] ceph: fix un-balanced fsc->writeback_count update Yan, Zheng
  2017-12-15 13:22 ` [PATCH 1/3] ceph: map snapid to anonymous bdev ID Jeff Layton
  2 siblings, 0 replies; 6+ messages in thread
From: Yan, Zheng @ 2017-12-15  9:24 UTC (permalink / raw)
  To: ceph-devel; +Cc: jlayton, idryomov, Yan, Zheng

Previously, ceph_read_iter() used current->journal_info to pass context
info to ceph_readpages(), so that ceph_readpages() could distinguish
read(2) from readahead(2)/fadvise(2)/madvise(2). The problem is that a
page fault can happen while copying data to userspace memory. The page
fault may call another filesystem's page_mkwrite() if the userspace
memory is mapped to a file, and that filesystem may also want to use
current->journal_info.

The fix is to define an on-stack data structure in ceph_read_iter() and
add it to a context list in ceph_file_info. ceph_readpages() then
searches that list for a context belonging to the current thread.
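
For illustration (not part of the diff below): a sketch of the calling
pattern this patch introduces. The function name read_path_sketch is made
up; the macro and helpers are the ones added by this patch.

	static ssize_t read_path_sketch(struct kiocb *iocb, struct iov_iter *to,
					struct ceph_file_info *fi, int got)
	{
		ssize_t ret;
		CEPH_DEFINE_RW_CONTEXT(rw_ctx, got);	/* on-stack, no allocation */

		ceph_add_rw_context(fi, &rw_ctx);	/* register for this thread */
		ret = generic_file_read_iter(iocb, to);	/* may re-enter ceph_readpages() */
		ceph_del_rw_context(fi, &rw_ctx);	/* unlink before the stack frame goes away */
		return ret;
	}

	/*
	 * ceph_readpages() then calls ceph_find_rw_context(fi), which returns
	 * the context whose ->thread == current, or NULL for the
	 * readahead/fadvise/madvise paths that never registered one.
	 */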

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
---
 fs/ceph/addr.c  | 15 ++++++++++-----
 fs/ceph/file.c  | 10 ++++++++--
 fs/ceph/super.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 64 insertions(+), 7 deletions(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 6f7f00a35190..78a1208b878e 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -299,7 +299,8 @@ static void finish_read(struct ceph_osd_request *req)
  * start an async read(ahead) operation.  return nr_pages we submitted
  * a read for on success, or negative error code.
  */
-static int start_read(struct inode *inode, struct list_head *page_list, int max)
+static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx,
+		      struct list_head *page_list, int max)
 {
 	struct ceph_osd_client *osdc =
 		&ceph_inode_to_client(inode)->client->osdc;
@@ -316,7 +317,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
 	int got = 0;
 	int ret = 0;
 
-	if (!current->journal_info) {
+	if (!rw_ctx) {
 		/* caller of readpages does not hold buffer and read caps
 		 * (fadvise, madvise and readahead cases) */
 		int want = CEPH_CAP_FILE_CACHE;
@@ -437,6 +438,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
 {
 	struct inode *inode = file_inode(file);
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
+	struct ceph_file_info *ci = file->private_data;
+	struct ceph_rw_context *rw_ctx;
 	int rc = 0;
 	int max = 0;
 
@@ -449,11 +452,12 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
 	if (rc == 0)
 		goto out;
 
+	rw_ctx = ceph_find_rw_context(ci);
 	max = fsc->mount_options->rsize >> PAGE_SHIFT;
 	dout("readpages %p file %p ctx %p nr_pages %d max %d\n",
 	     inode, file, rw_ctx, nr_pages, max);
 	while (!list_empty(page_list)) {
-		rc = start_read(inode, page_list, max);
+		rc = start_read(inode, rw_ctx, page_list, max);
 		if (rc < 0)
 			goto out;
 	}
@@ -1450,9 +1454,10 @@ static int ceph_filemap_fault(struct vm_fault *vmf)
 
 	if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) ||
 	    ci->i_inline_version == CEPH_INLINE_NONE) {
-		current->journal_info = vma->vm_file;
+		CEPH_DEFINE_RW_CONTEXT(rw_ctx, got);
+		ceph_add_rw_context(fi, &rw_ctx);
 		ret = filemap_fault(vmf);
-		current->journal_info = NULL;
+		ceph_del_rw_context(fi, &rw_ctx);
 	} else
 		ret = -EAGAIN;
 
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 14b26c2537a9..3c8b5ec4991a 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -115,6 +115,10 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
 			return -ENOMEM;
 		}
 		cf->fmode = fmode;
+
+		spin_lock_init(&cf->rw_contexts_lock);
+		INIT_LIST_HEAD(&cf->rw_contexts);
+
 		cf->next_offset = 2;
 		cf->readdir_cache_idx = -1;
 		file->private_data = cf;
@@ -398,6 +402,7 @@ int ceph_release(struct inode *inode, struct file *file)
 		ceph_mdsc_put_request(cf->last_readdir);
 	kfree(cf->last_name);
 	kfree(cf->dir_info);
+	WARN_ON(!list_empty(&cf->rw_contexts));
 	kmem_cache_free(ceph_file_cachep, cf);
 
 	/* wake up anyone waiting for caps on this inode */
@@ -1133,12 +1138,13 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to)
 			retry_op = READ_INLINE;
 		}
 	} else {
+		CEPH_DEFINE_RW_CONTEXT(rw_ctx, got);
 		dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n",
 		     inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len,
 		     ceph_cap_string(got));
-		current->journal_info = filp;
+		ceph_add_rw_context(fi, &rw_ctx);
 		ret = generic_file_read_iter(iocb, to);
-		current->journal_info = NULL;
+		ceph_del_rw_context(fi, &rw_ctx);
 	}
 	dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n",
 	     inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 453edca41a37..5e134bc69d96 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -672,6 +672,9 @@ struct ceph_file_info {
 	short fmode;     /* initialized on open */
 	short flags;     /* CEPH_F_* */
 
+	spinlock_t rw_contexts_lock;
+	struct list_head rw_contexts;
+
 	/* readdir: position within the dir */
 	u32 frag;
 	struct ceph_mds_request *last_readdir;
@@ -688,6 +691,49 @@ struct ceph_file_info {
 	int dir_info_len;
 };
 
+struct ceph_rw_context {
+	struct list_head list;
+	struct task_struct *thread;
+	int caps;
+};
+
+#define CEPH_DEFINE_RW_CONTEXT(_name, _caps)	\
+	struct ceph_rw_context _name = {	\
+		.thread = current,		\
+		.caps = _caps,			\
+	}
+
+static inline void ceph_add_rw_context(struct ceph_file_info *cf,
+				       struct ceph_rw_context *ctx)
+{
+	spin_lock(&cf->rw_contexts_lock);
+	list_add(&ctx->list, &cf->rw_contexts);
+	spin_unlock(&cf->rw_contexts_lock);
+}
+
+static inline void ceph_del_rw_context(struct ceph_file_info *cf,
+				       struct ceph_rw_context *ctx)
+{
+	spin_lock(&cf->rw_contexts_lock);
+	list_del(&ctx->list);
+	spin_unlock(&cf->rw_contexts_lock);
+}
+
+static inline struct ceph_rw_context*
+ceph_find_rw_context(struct ceph_file_info *cf)
+{
+	struct ceph_rw_context *ctx, *found = NULL;
+	spin_lock(&cf->rw_contexts_lock);
+	list_for_each_entry(ctx, &cf->rw_contexts, list) {
+		if (ctx->thread == current) {
+			found = ctx;
+			break;
+		}
+	}
+	spin_unlock(&cf->rw_contexts_lock);
+	return found;
+}
+
 struct ceph_readdir_cache_control {
 	struct page  *page;
 	struct dentry **dentries;
-- 
2.13.6



* [PATCH 3/3] ceph: fix un-balanced fsc->writeback_count update
  2017-12-15  9:24 [PATCH 1/3] ceph: map snapid to anonymous bdev ID Yan, Zheng
  2017-12-15  9:24 ` [PATCH 2/3] ceph: track read contexts in ceph_file_info Yan, Zheng
@ 2017-12-15  9:24 ` Yan, Zheng
  2017-12-15 13:22 ` [PATCH 1/3] ceph: map snapid to anonymous bdev ID Jeff Layton
  2 siblings, 0 replies; 6+ messages in thread
From: Yan, Zheng @ 2017-12-15  9:24 UTC (permalink / raw)
  To: ceph-devel; +Cc: jlayton, idryomov, Yan, Zheng

Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
---
 fs/ceph/addr.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 78a1208b878e..b4336b42ce3b 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -578,7 +578,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	struct ceph_fs_client *fsc;
 	struct ceph_snap_context *snapc, *oldest;
 	loff_t page_off = page_offset(page);
-	long writeback_stat;
 	int err, len = PAGE_SIZE;
 	struct ceph_writeback_ctl ceph_wbc;
 
@@ -619,8 +618,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	dout("writepage %p page %p index %lu on %llu~%u snapc %p seq %lld\n",
 	     inode, page, page->index, page_off, len, snapc, snapc->seq);
 
-	writeback_stat = atomic_long_inc_return(&fsc->writeback_count);
-	if (writeback_stat >
+	if (atomic_long_inc_return(&fsc->writeback_count) >
 	    CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
 		set_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
 
@@ -655,6 +653,11 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
 	end_page_writeback(page);
 	ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
 	ceph_put_snap_context(snapc);  /* page's reference */
+
+	if (atomic_long_dec_return(&fsc->writeback_count) <
+	    CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
+		clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC);
+
 	return err;
 }
 
-- 
2.13.6



* Re: [PATCH 1/3] ceph: map snapid to anonymous bdev ID
  2017-12-15  9:24 [PATCH 1/3] ceph: map snapid to anonymous bdev ID Yan, Zheng
  2017-12-15  9:24 ` [PATCH 2/3] ceph: track read contexts in ceph_file_info Yan, Zheng
  2017-12-15  9:24 ` [PATCH 3/3] ceph: fix un-balanced fsc->writeback_count update Yan, Zheng
@ 2017-12-15 13:22 ` Jeff Layton
  2017-12-15 13:45   ` Yan, Zheng
  2 siblings, 1 reply; 6+ messages in thread
From: Jeff Layton @ 2017-12-15 13:22 UTC (permalink / raw)
  To: Yan, Zheng, ceph-devel; +Cc: idryomov

On Fri, 2017-12-15 at 17:24 +0800, Yan, Zheng wrote:
> ceph_getattr() returns a zero dev ID for head inodes and sets the dev ID
> to the snapid directly for snapshot inodes. This is not good because
> userspace utilities may consider a device ID of 0 invalid, and the snapid
> may conflict with another device's ID.
> 
> This patch introduces a "snapid to anonymous bdev ID" map. We create a
> new mapping when we see a snapid for the first time, and trim unused
> mappings after they have been idle for an hour.
> 
> Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
> ---
>  fs/ceph/addr.c       |   4 +-
>  fs/ceph/inode.c      |  31 +++++++----
>  fs/ceph/mds_client.c |   8 +++
>  fs/ceph/mds_client.h |  13 +++++
>  fs/ceph/snap.c       | 148 +++++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/ceph/super.h      |  13 ++++-
>  6 files changed, 203 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
> index dbf07051aacd..6f7f00a35190 100644
> --- a/fs/ceph/addr.c
> +++ b/fs/ceph/addr.c
> @@ -450,8 +450,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
>  		goto out;
>  
>  	max = fsc->mount_options->rsize >> PAGE_SHIFT;
> -	dout("readpages %p file %p nr_pages %d max %d\n",
> -	     inode, file, nr_pages, max);
> +	dout("readpages %p file %p ctx %p nr_pages %d max %d\n",
> +	     inode, file, rw_ctx, nr_pages, max);
>  	while (!list_empty(page_list)) {
>  		rc = start_read(inode, page_list, max);
>  		if (rc < 0)
> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> index c6ec5aa46100..76a428749981 100644
> --- a/fs/ceph/inode.c
> +++ b/fs/ceph/inode.c
> @@ -542,14 +542,19 @@ void ceph_destroy_inode(struct inode *inode)
>  	 */
>  	if (ci->i_snap_realm) {
>  		struct ceph_mds_client *mdsc =
> -			ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
> -		struct ceph_snap_realm *realm = ci->i_snap_realm;
> -
> -		dout(" dropping residual ref to snap realm %p\n", realm);
> -		spin_lock(&realm->inodes_with_caps_lock);
> -		list_del_init(&ci->i_snap_realm_item);
> -		spin_unlock(&realm->inodes_with_caps_lock);
> -		ceph_put_snap_realm(mdsc, realm);
> +					ceph_inode_to_client(inode)->mdsc;
> +		if (ceph_snap(inode) == CEPH_NOSNAP) {
> +			struct ceph_snap_realm *realm = ci->i_snap_realm;
> +			dout(" dropping residual ref to snap realm %p\n",
> +			     realm);
> +			spin_lock(&realm->inodes_with_caps_lock);
> +			list_del_init(&ci->i_snap_realm_item);
> +			spin_unlock(&realm->inodes_with_caps_lock);
> +			ceph_put_snap_realm(mdsc, realm);
> +		} else {
> +			ceph_put_snapid_map(mdsc, ci->i_snapid_map);
> +		}
> +		ci->i_snap_realm = NULL;
>  	}
>  
>  	kfree(ci->i_symlink);
> @@ -763,6 +768,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
>  		pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data,
>  						     iinfo->pool_ns_len);
>  
> +	if (ceph_snap(inode) != CEPH_NOSNAP && !ci->i_snapid_map)
> +		ci->i_snapid_map = ceph_get_snapid_map(mdsc, ceph_snap(inode));
> +
>  	spin_lock(&ci->i_ceph_lock);
>  
>  	/*
> @@ -2241,10 +2249,11 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
>  	if (!err) {
>  		generic_fillattr(inode, stat);
>  		stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
> -		if (ceph_snap(inode) != CEPH_NOSNAP)
> -			stat->dev = ceph_snap(inode);
> +		if (ceph_snap(inode) == CEPH_NOSNAP)
> +			stat->dev = inode->i_sb->s_dev;
>  		else
> -			stat->dev = 0;
> +			stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;
> +
>  		if (S_ISDIR(inode->i_mode)) {
>  			if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
>  						RBYTES))
> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> index 251dc44d84a0..b6e62f5d97c1 100644
> --- a/fs/ceph/mds_client.c
> +++ b/fs/ceph/mds_client.c
> @@ -3504,6 +3504,8 @@ static void delayed_work(struct work_struct *work)
>  	dout("mdsc delayed_work\n");
>  	ceph_check_delayed_caps(mdsc);
>  
> +	ceph_trim_snapid_map(mdsc);
> +
>  	mutex_lock(&mdsc->mutex);
>  	renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
>  	renew_caps = time_after_eq(jiffies, HZ*renew_interval +
> @@ -3604,6 +3606,10 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
>  	ceph_caps_init(mdsc);
>  	ceph_adjust_min_caps(mdsc, fsc->min_caps);
>  
> +	spin_lock_init(&mdsc->snapid_map_lock);
> +	mdsc->snapid_map_tree = RB_ROOT;
> +	INIT_LIST_HEAD(&mdsc->snapid_map_lru);
> +
>  	init_rwsem(&mdsc->pool_perm_rwsem);
>  	mdsc->pool_perm_tree = RB_ROOT;
>  
> @@ -3797,6 +3803,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
>  	WARN_ON(!list_empty(&mdsc->cap_delay_list));
>  	mutex_unlock(&mdsc->mutex);
>  
> +	ceph_cleanup_snapid_map(mdsc);
> +
>  	ceph_cleanup_empty_realms(mdsc);
>  
>  	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> index 837ac4b087a0..87506a8fc1c3 100644
> --- a/fs/ceph/mds_client.h
> +++ b/fs/ceph/mds_client.h
> @@ -294,6 +294,15 @@ struct ceph_pool_perm {
>  	char pool_ns[];
>  };
>  
> +struct ceph_snapid_map {
> +	struct rb_node node;
> +	struct list_head lru;
> +	atomic_t ref;
> +	u64 snap;
> +	dev_t dev;
> +	unsigned long last_used;
> +};
> +
>  /*
>   * mds client state
>   */
> @@ -368,6 +377,10 @@ struct ceph_mds_client {
>  	struct list_head  dentry_lru;
>  	int		  num_dentry;
>  
> +	spinlock_t		snapid_map_lock;
> +	struct rb_root		snapid_map_tree;
> +	struct list_head	snapid_map_lru;
> +
>  	struct rw_semaphore     pool_perm_rwsem;
>  	struct rb_root		pool_perm_tree;
>  
> diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
> index 8a2ca41e4b97..0af9ea832f03 100644
> --- a/fs/ceph/snap.c
> +++ b/fs/ceph/snap.c
> @@ -979,3 +979,151 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
>  		up_write(&mdsc->snap_rwsem);
>  	return;
>  }
> +
> +struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
> +					    u64 snap)
> +{
> +	struct ceph_snapid_map *sm, *exist;
> +	struct rb_node **p, *parent;
> +	int ret;
> +
> +	exist = NULL;
> +	spin_lock(&mdsc->snapid_map_lock);
> +	p = &mdsc->snapid_map_tree.rb_node;
> +	while (*p) {
> +		exist = rb_entry(*p, struct ceph_snapid_map, node);
> +		if (snap > exist->snap) {
> +			p = &(*p)->rb_left;
> +		} else if (snap < exist->snap) {
> +			p = &(*p)->rb_right;
> +		} else {
> +			atomic_inc(&exist->ref);
> +			list_del_init(&exist->lru);
> +			break;
> +		}
> +		exist = NULL;
> +	}
> +	spin_unlock(&mdsc->snapid_map_lock);
> +	if (exist) {
> +		dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
> +		return exist;
> +	}
> +
> +	sm = kmalloc(sizeof(*sm), GFP_NOFS);
> +	if (!sm)
> +		return NULL;
> +
> +	ret = get_anon_bdev(&sm->dev);
> +	if (ret < 0) {
> +		kfree(sm);
> +		return NULL;
> +	}	
> +
> +	INIT_LIST_HEAD(&sm->lru);
> +	atomic_set(&sm->ref, 1);
> +	sm->snap = snap;
> +
> +	exist = NULL;
> +	parent = NULL;
> +	p = &mdsc->snapid_map_tree.rb_node;
> +	spin_lock(&mdsc->snapid_map_lock);
> +	while (*p) {
> +		parent = *p;
> +		exist = rb_entry(*p, struct ceph_snapid_map, node);
> +		if (snap > exist->snap)
> +			p = &(*p)->rb_left;
> +		else if (snap < exist->snap)
> +			p = &(*p)->rb_right;
> +		else
> +			break;
> +		exist = NULL;
> +	}
> +	if (exist) {
> +		atomic_inc(&exist->ref);
> +		list_del_init(&exist->lru);
> +	} else {
> +		rb_link_node(&sm->node, parent, p);
> +		rb_insert_color(&sm->node, &mdsc->snapid_map_tree);
> +	}
> +	spin_unlock(&mdsc->snapid_map_lock);
> +	if (exist) {
> +		free_anon_bdev(sm->dev);
> +		kfree(sm);
> +		dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
> +		return exist;
> +	}
> +		
> +	dout("create snapid map %llx -> %x\n", sm->snap, sm->dev);
> +	return sm;
> +}
> +
> +void ceph_put_snapid_map(struct ceph_mds_client* mdsc,
> +			 struct ceph_snapid_map *sm)
> +{
> +	if (!sm)
> +		return;
> +	if (atomic_dec_and_lock(&sm->ref, &mdsc->snapid_map_lock)) {
> +		sm->last_used = jiffies;
> +		list_add_tail(&sm->lru, &mdsc->snapid_map_lru);
> +		spin_unlock(&mdsc->snapid_map_lock);
> +	}
> +}
> +
> +void ceph_trim_snapid_map(struct ceph_mds_client *mdsc)
> +{
> +	struct ceph_snapid_map *sm;
> +	unsigned long now;
> +	LIST_HEAD(to_free);
> +
> +	/* unused map expires after an hour */
> +	static unsigned timeout = 60 * 60 * HZ;
> +
> +	spin_lock(&mdsc->snapid_map_lock);
> +	now = jiffies;
> +
> +	while (!list_empty(&mdsc->snapid_map_lru)) {
> +		sm = list_first_entry(&mdsc->snapid_map_lru,
> +				      struct ceph_snapid_map, lru);
> +		if (time_after(sm->last_used + timeout, now))
> +			break;
> +
> +		rb_erase(&sm->node, &mdsc->snapid_map_tree);
> +		list_move(&sm->lru, &to_free);
> +	}
> +	spin_unlock(&mdsc->snapid_map_lock);
> +
> +	while (!list_empty(&to_free)) {
> +		sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
> +		list_del(&sm->lru);
> +		dout("trim snapid map %llx -> %x\n", sm->snap, sm->dev);
> +		free_anon_bdev(sm->dev);
> +		kfree(sm);
> +	}
> +}
> +
> +void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc)
> +{
> +	struct ceph_snapid_map *sm;
> +	struct rb_node *p;
> +	LIST_HEAD(to_free);
> +
> +	spin_lock(&mdsc->snapid_map_lock);
> +	while ((p = rb_first(&mdsc->snapid_map_tree))) {
> +		sm = rb_entry(p, struct ceph_snapid_map, node);
> +		rb_erase(p, &mdsc->snapid_map_tree);
> +		list_move(&sm->lru, &to_free);
> +	}
> +	spin_unlock(&mdsc->snapid_map_lock);
> +
> +	while (!list_empty(&to_free)) {
> +		sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
> +		list_del(&sm->lru);
> +		free_anon_bdev(sm->dev);
> +		if (atomic_read(&sm->ref)) {
> +			pr_err("snapid map %llx -> %x still in use\n",
> +			       sm->snap, sm->dev);
> +			continue;
> +		}

This looks problematic. If it's still in use, then you'll call list_del
and free_anon_bdev on it, but not kfree it. What will eventually clean
it up, and how will it know not to do the list_del and free_anon_bdev a
second time?

> +		kfree(sm);
> +	}
> +}
> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> index 75701c199b2b..453edca41a37 100644
> --- a/fs/ceph/super.h
> +++ b/fs/ceph/super.h
> @@ -362,7 +362,10 @@ struct ceph_inode_info {
>  	struct list_head i_unsafe_iops;   /* uncommitted mds inode ops */
>  	spinlock_t i_unsafe_lock;
>  
> -	struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
> +	union {
> +		struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
> +		struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */
> +	};
>  	int i_snap_realm_counter; /* snap realm (if caps) */
>  	struct list_head i_snap_realm_item;
>  	struct list_head i_snap_flush_item;
> @@ -778,6 +781,14 @@ extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
>  				  struct ceph_cap_snap *capsnap);
>  extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
>  
> +extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc,
> +						   u64 snap);
> +extern void ceph_put_snapid_map(struct ceph_mds_client* mdsc,
> +				struct ceph_snapid_map *sm);
> +extern void ceph_trim_snapid_map(struct ceph_mds_client *mdsc);
> +extern void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc);
> +
> +
>  /*
>   * a cap_snap is "pending" if it is still awaiting an in-progress
>   * sync write (that may/may not still update size, mtime, etc.).

-- 
Jeff Layton <jlayton@redhat.com>


* Re: [PATCH 1/3] ceph: map snapid to anonymous bdev ID
  2017-12-15 13:22 ` [PATCH 1/3] ceph: map snapid to anonymous bdev ID Jeff Layton
@ 2017-12-15 13:45   ` Yan, Zheng
  2017-12-15 13:48     ` Jeff Layton
  0 siblings, 1 reply; 6+ messages in thread
From: Yan, Zheng @ 2017-12-15 13:45 UTC (permalink / raw)
  To: Jeff Layton; +Cc: ceph-devel, idryomov



> On 15 Dec 2017, at 21:22, Jeff Layton <jlayton@redhat.com> wrote:
> 
> On Fri, 2017-12-15 at 17:24 +0800, Yan, Zheng wrote:
>> ceph_getattr() returns a zero dev ID for head inodes and sets the dev ID
>> to the snapid directly for snapshot inodes. This is not good because
>> userspace utilities may consider a device ID of 0 invalid, and the snapid
>> may conflict with another device's ID.
>> 
>> This patch introduces a "snapid to anonymous bdev ID" map. We create a
>> new mapping when we see a snapid for the first time, and trim unused
>> mappings after they have been idle for an hour.
>> 
>> Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
>> ---
>> fs/ceph/addr.c       |   4 +-
>> fs/ceph/inode.c      |  31 +++++++----
>> fs/ceph/mds_client.c |   8 +++
>> fs/ceph/mds_client.h |  13 +++++
>> fs/ceph/snap.c       | 148 +++++++++++++++++++++++++++++++++++++++++++++++++++
>> fs/ceph/super.h      |  13 ++++-
>> 6 files changed, 203 insertions(+), 14 deletions(-)
>> 
>> diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
>> index dbf07051aacd..6f7f00a35190 100644
>> --- a/fs/ceph/addr.c
>> +++ b/fs/ceph/addr.c
>> @@ -450,8 +450,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
>> 		goto out;
>> 
>> 	max = fsc->mount_options->rsize >> PAGE_SHIFT;
>> -	dout("readpages %p file %p nr_pages %d max %d\n",
>> -	     inode, file, nr_pages, max);
>> +	dout("readpages %p file %p ctx %p nr_pages %d max %d\n",
>> +	     inode, file, rw_ctx, nr_pages, max);
>> 	while (!list_empty(page_list)) {
>> 		rc = start_read(inode, page_list, max);
>> 		if (rc < 0)
>> diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
>> index c6ec5aa46100..76a428749981 100644
>> --- a/fs/ceph/inode.c
>> +++ b/fs/ceph/inode.c
>> @@ -542,14 +542,19 @@ void ceph_destroy_inode(struct inode *inode)
>> 	 */
>> 	if (ci->i_snap_realm) {
>> 		struct ceph_mds_client *mdsc =
>> -			ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
>> -		struct ceph_snap_realm *realm = ci->i_snap_realm;
>> -
>> -		dout(" dropping residual ref to snap realm %p\n", realm);
>> -		spin_lock(&realm->inodes_with_caps_lock);
>> -		list_del_init(&ci->i_snap_realm_item);
>> -		spin_unlock(&realm->inodes_with_caps_lock);
>> -		ceph_put_snap_realm(mdsc, realm);
>> +					ceph_inode_to_client(inode)->mdsc;
>> +		if (ceph_snap(inode) == CEPH_NOSNAP) {
>> +			struct ceph_snap_realm *realm = ci->i_snap_realm;
>> +			dout(" dropping residual ref to snap realm %p\n",
>> +			     realm);
>> +			spin_lock(&realm->inodes_with_caps_lock);
>> +			list_del_init(&ci->i_snap_realm_item);
>> +			spin_unlock(&realm->inodes_with_caps_lock);
>> +			ceph_put_snap_realm(mdsc, realm);
>> +		} else {
>> +			ceph_put_snapid_map(mdsc, ci->i_snapid_map);
>> +		}
>> +		ci->i_snap_realm = NULL;
>> 	}
>> 
>> 	kfree(ci->i_symlink);
>> @@ -763,6 +768,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
>> 		pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data,
>> 						     iinfo->pool_ns_len);
>> 
>> +	if (ceph_snap(inode) != CEPH_NOSNAP && !ci->i_snapid_map)
>> +		ci->i_snapid_map = ceph_get_snapid_map(mdsc, ceph_snap(inode));
>> +
>> 	spin_lock(&ci->i_ceph_lock);
>> 
>> 	/*
>> @@ -2241,10 +2249,11 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
>> 	if (!err) {
>> 		generic_fillattr(inode, stat);
>> 		stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
>> -		if (ceph_snap(inode) != CEPH_NOSNAP)
>> -			stat->dev = ceph_snap(inode);
>> +		if (ceph_snap(inode) == CEPH_NOSNAP)
>> +			stat->dev = inode->i_sb->s_dev;
>> 		else
>> -			stat->dev = 0;
>> +			stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;
>> +
>> 		if (S_ISDIR(inode->i_mode)) {
>> 			if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
>> 						RBYTES))
>> diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
>> index 251dc44d84a0..b6e62f5d97c1 100644
>> --- a/fs/ceph/mds_client.c
>> +++ b/fs/ceph/mds_client.c
>> @@ -3504,6 +3504,8 @@ static void delayed_work(struct work_struct *work)
>> 	dout("mdsc delayed_work\n");
>> 	ceph_check_delayed_caps(mdsc);
>> 
>> +	ceph_trim_snapid_map(mdsc);
>> +
>> 	mutex_lock(&mdsc->mutex);
>> 	renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
>> 	renew_caps = time_after_eq(jiffies, HZ*renew_interval +
>> @@ -3604,6 +3606,10 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
>> 	ceph_caps_init(mdsc);
>> 	ceph_adjust_min_caps(mdsc, fsc->min_caps);
>> 
>> +	spin_lock_init(&mdsc->snapid_map_lock);
>> +	mdsc->snapid_map_tree = RB_ROOT;
>> +	INIT_LIST_HEAD(&mdsc->snapid_map_lru);
>> +
>> 	init_rwsem(&mdsc->pool_perm_rwsem);
>> 	mdsc->pool_perm_tree = RB_ROOT;
>> 
>> @@ -3797,6 +3803,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
>> 	WARN_ON(!list_empty(&mdsc->cap_delay_list));
>> 	mutex_unlock(&mdsc->mutex);
>> 
>> +	ceph_cleanup_snapid_map(mdsc);
>> +
>> 	ceph_cleanup_empty_realms(mdsc);
>> 
>> 	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
>> diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
>> index 837ac4b087a0..87506a8fc1c3 100644
>> --- a/fs/ceph/mds_client.h
>> +++ b/fs/ceph/mds_client.h
>> @@ -294,6 +294,15 @@ struct ceph_pool_perm {
>> 	char pool_ns[];
>> };
>> 
>> +struct ceph_snapid_map {
>> +	struct rb_node node;
>> +	struct list_head lru;
>> +	atomic_t ref;
>> +	u64 snap;
>> +	dev_t dev;
>> +	unsigned long last_used;
>> +};
>> +
>> /*
>>  * mds client state
>>  */
>> @@ -368,6 +377,10 @@ struct ceph_mds_client {
>> 	struct list_head  dentry_lru;
>> 	int		  num_dentry;
>> 
>> +	spinlock_t		snapid_map_lock;
>> +	struct rb_root		snapid_map_tree;
>> +	struct list_head	snapid_map_lru;
>> +
>> 	struct rw_semaphore     pool_perm_rwsem;
>> 	struct rb_root		pool_perm_tree;
>> 
>> diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
>> index 8a2ca41e4b97..0af9ea832f03 100644
>> --- a/fs/ceph/snap.c
>> +++ b/fs/ceph/snap.c
>> @@ -979,3 +979,151 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
>> 		up_write(&mdsc->snap_rwsem);
>> 	return;
>> }
>> +
>> +struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
>> +					    u64 snap)
>> +{
>> +	struct ceph_snapid_map *sm, *exist;
>> +	struct rb_node **p, *parent;
>> +	int ret;
>> +
>> +	exist = NULL;
>> +	spin_lock(&mdsc->snapid_map_lock);
>> +	p = &mdsc->snapid_map_tree.rb_node;
>> +	while (*p) {
>> +		exist = rb_entry(*p, struct ceph_snapid_map, node);
>> +		if (snap > exist->snap) {
>> +			p = &(*p)->rb_left;
>> +		} else if (snap < exist->snap) {
>> +			p = &(*p)->rb_right;
>> +		} else {
>> +			atomic_inc(&exist->ref);
>> +			list_del_init(&exist->lru);
>> +			break;
>> +		}
>> +		exist = NULL;
>> +	}
>> +	spin_unlock(&mdsc->snapid_map_lock);
>> +	if (exist) {
>> +		dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
>> +		return exist;
>> +	}
>> +
>> +	sm = kmalloc(sizeof(*sm), GFP_NOFS);
>> +	if (!sm)
>> +		return NULL;
>> +
>> +	ret = get_anon_bdev(&sm->dev);
>> +	if (ret < 0) {
>> +		kfree(sm);
>> +		return NULL;
>> +	}	
>> +
>> +	INIT_LIST_HEAD(&sm->lru);
>> +	atomic_set(&sm->ref, 1);
>> +	sm->snap = snap;
>> +
>> +	exist = NULL;
>> +	parent = NULL;
>> +	p = &mdsc->snapid_map_tree.rb_node;
>> +	spin_lock(&mdsc->snapid_map_lock);
>> +	while (*p) {
>> +		parent = *p;
>> +		exist = rb_entry(*p, struct ceph_snapid_map, node);
>> +		if (snap > exist->snap)
>> +			p = &(*p)->rb_left;
>> +		else if (snap < exist->snap)
>> +			p = &(*p)->rb_right;
>> +		else
>> +			break;
>> +		exist = NULL;
>> +	}
>> +	if (exist) {
>> +		atomic_inc(&exist->ref);
>> +		list_del_init(&exist->lru);
>> +	} else {
>> +		rb_link_node(&sm->node, parent, p);
>> +		rb_insert_color(&sm->node, &mdsc->snapid_map_tree);
>> +	}
>> +	spin_unlock(&mdsc->snapid_map_lock);
>> +	if (exist) {
>> +		free_anon_bdev(sm->dev);
>> +		kfree(sm);
>> +		dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
>> +		return exist;
>> +	}
>> +		
>> +	dout("create snapid map %llx -> %x\n", sm->snap, sm->dev);
>> +	return sm;
>> +}
>> +
>> +void ceph_put_snapid_map(struct ceph_mds_client* mdsc,
>> +			 struct ceph_snapid_map *sm)
>> +{
>> +	if (!sm)
>> +		return;
>> +	if (atomic_dec_and_lock(&sm->ref, &mdsc->snapid_map_lock)) {
>> +		sm->last_used = jiffies;
>> +		list_add_tail(&sm->lru, &mdsc->snapid_map_lru);
>> +		spin_unlock(&mdsc->snapid_map_lock);
>> +	}
>> +}
>> +
>> +void ceph_trim_snapid_map(struct ceph_mds_client *mdsc)
>> +{
>> +	struct ceph_snapid_map *sm;
>> +	unsigned long now;
>> +	LIST_HEAD(to_free);
>> +
>> +	/* unused map expires after an hour */
>> +	static unsigned timeout = 60 * 60 * HZ;
>> +
>> +	spin_lock(&mdsc->snapid_map_lock);
>> +	now = jiffies;
>> +
>> +	while (!list_empty(&mdsc->snapid_map_lru)) {
>> +		sm = list_first_entry(&mdsc->snapid_map_lru,
>> +				      struct ceph_snapid_map, lru);
>> +		if (time_after(sm->last_used + timeout, now))
>> +			break;
>> +
>> +		rb_erase(&sm->node, &mdsc->snapid_map_tree);
>> +		list_move(&sm->lru, &to_free);
>> +	}
>> +	spin_unlock(&mdsc->snapid_map_lock);
>> +
>> +	while (!list_empty(&to_free)) {
>> +		sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
>> +		list_del(&sm->lru);
>> +		dout("trim snapid map %llx -> %x\n", sm->snap, sm->dev);
>> +		free_anon_bdev(sm->dev);
>> +		kfree(sm);
>> +	}
>> +}
>> +
>> +void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc)
>> +{
>> +	struct ceph_snapid_map *sm;
>> +	struct rb_node *p;
>> +	LIST_HEAD(to_free);
>> +
>> +	spin_lock(&mdsc->snapid_map_lock);
>> +	while ((p = rb_first(&mdsc->snapid_map_tree))) {
>> +		sm = rb_entry(p, struct ceph_snapid_map, node);
>> +		rb_erase(p, &mdsc->snapid_map_tree);
>> +		list_move(&sm->lru, &to_free);
>> +	}
>> +	spin_unlock(&mdsc->snapid_map_lock);
>> +
>> +	while (!list_empty(&to_free)) {
>> +		sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
>> +		list_del(&sm->lru);
>> +		free_anon_bdev(sm->dev);
>> +		if (atomic_read(&sm->ref)) {
>> +			pr_err("snapid map %llx -> %x still in use\n",
>> +			       sm->snap, sm->dev);
>> +			continue;
>> +		}
> 
> This looks problematic. If it's still in use, then you'll call list_del
> and free_anon_bdev on it, but not kfree it. What will eventually clean
> it up, and how will it know not to do the list_del and free_anon_bdev a
> second time?

this is for an unexpected condition. I don't have an elegant way to handle it. How about:
        
while (!list_empty(&to_free)) {
                sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
                list_del(&sm->lru);
                free_anon_bdev(sm->dev);
                if (WARN_ON_ONCE(atomic_read(&sm->ref))) {
                        pr_err("snapid map %llx -> %x still in use\n",
                               sm->snap, sm->dev);
                }
                kfree(sm);
}

> 
>> +		kfree(sm);
>> +	}
>> +}
>> diff --git a/fs/ceph/super.h b/fs/ceph/super.h
>> index 75701c199b2b..453edca41a37 100644
>> --- a/fs/ceph/super.h
>> +++ b/fs/ceph/super.h
>> @@ -362,7 +362,10 @@ struct ceph_inode_info {
>> 	struct list_head i_unsafe_iops;   /* uncommitted mds inode ops */
>> 	spinlock_t i_unsafe_lock;
>> 
>> -	struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
>> +	union {
>> +		struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
>> +		struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */
>> +	};
>> 	int i_snap_realm_counter; /* snap realm (if caps) */
>> 	struct list_head i_snap_realm_item;
>> 	struct list_head i_snap_flush_item;
>> @@ -778,6 +781,14 @@ extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
>> 				  struct ceph_cap_snap *capsnap);
>> extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
>> 
>> +extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc,
>> +						   u64 snap);
>> +extern void ceph_put_snapid_map(struct ceph_mds_client* mdsc,
>> +				struct ceph_snapid_map *sm);
>> +extern void ceph_trim_snapid_map(struct ceph_mds_client *mdsc);
>> +extern void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc);
>> +
>> +
>> /*
>>  * a cap_snap is "pending" if it is still awaiting an in-progress
>>  * sync write (that may/may not still update size, mtime, etc.).
> 
> -- 
> Jeff Layton <jlayton@redhat.com>



* Re: [PATCH 1/3] ceph: map snapid to anonymous bdev ID
  2017-12-15 13:45   ` Yan, Zheng
@ 2017-12-15 13:48     ` Jeff Layton
  0 siblings, 0 replies; 6+ messages in thread
From: Jeff Layton @ 2017-12-15 13:48 UTC (permalink / raw)
  To: Yan, Zheng; +Cc: ceph-devel, idryomov

On Fri, 2017-12-15 at 21:45 +0800, Yan, Zheng wrote:
> > On 15 Dec 2017, at 21:22, Jeff Layton <jlayton@redhat.com> wrote:
> > 
> > On Fri, 2017-12-15 at 17:24 +0800, Yan, Zheng wrote:
> > > ceph_getattr() returns a zero dev ID for head inodes and sets the dev ID
> > > to the snapid directly for snapshot inodes. This is not good because
> > > userspace utilities may consider a device ID of 0 invalid, and the snapid
> > > may conflict with another device's ID.
> > > 
> > > This patch introduces a "snapid to anonymous bdev ID" map. We create a
> > > new mapping when we see a snapid for the first time, and trim unused
> > > mappings after they have been idle for an hour.
> > > 
> > > Signed-off-by: "Yan, Zheng" <zyan@redhat.com>
> > > ---
> > > fs/ceph/addr.c       |   4 +-
> > > fs/ceph/inode.c      |  31 +++++++----
> > > fs/ceph/mds_client.c |   8 +++
> > > fs/ceph/mds_client.h |  13 +++++
> > > fs/ceph/snap.c       | 148 +++++++++++++++++++++++++++++++++++++++++++++++++++
> > > fs/ceph/super.h      |  13 ++++-
> > > 6 files changed, 203 insertions(+), 14 deletions(-)
> > > 
> > > diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
> > > index dbf07051aacd..6f7f00a35190 100644
> > > --- a/fs/ceph/addr.c
> > > +++ b/fs/ceph/addr.c
> > > @@ -450,8 +450,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
> > > 		goto out;
> > > 
> > > 	max = fsc->mount_options->rsize >> PAGE_SHIFT;
> > > -	dout("readpages %p file %p nr_pages %d max %d\n",
> > > -	     inode, file, nr_pages, max);
> > > +	dout("readpages %p file %p ctx %p nr_pages %d max %d\n",
> > > +	     inode, file, rw_ctx, nr_pages, max);
> > > 	while (!list_empty(page_list)) {
> > > 		rc = start_read(inode, page_list, max);
> > > 		if (rc < 0)
> > > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> > > index c6ec5aa46100..76a428749981 100644
> > > --- a/fs/ceph/inode.c
> > > +++ b/fs/ceph/inode.c
> > > @@ -542,14 +542,19 @@ void ceph_destroy_inode(struct inode *inode)
> > > 	 */
> > > 	if (ci->i_snap_realm) {
> > > 		struct ceph_mds_client *mdsc =
> > > -			ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
> > > -		struct ceph_snap_realm *realm = ci->i_snap_realm;
> > > -
> > > -		dout(" dropping residual ref to snap realm %p\n", realm);
> > > -		spin_lock(&realm->inodes_with_caps_lock);
> > > -		list_del_init(&ci->i_snap_realm_item);
> > > -		spin_unlock(&realm->inodes_with_caps_lock);
> > > -		ceph_put_snap_realm(mdsc, realm);
> > > +					ceph_inode_to_client(inode)->mdsc;
> > > +		if (ceph_snap(inode) == CEPH_NOSNAP) {
> > > +			struct ceph_snap_realm *realm = ci->i_snap_realm;
> > > +			dout(" dropping residual ref to snap realm %p\n",
> > > +			     realm);
> > > +			spin_lock(&realm->inodes_with_caps_lock);
> > > +			list_del_init(&ci->i_snap_realm_item);
> > > +			spin_unlock(&realm->inodes_with_caps_lock);
> > > +			ceph_put_snap_realm(mdsc, realm);
> > > +		} else {
> > > +			ceph_put_snapid_map(mdsc, ci->i_snapid_map);
> > > +		}
> > > +		ci->i_snap_realm = NULL;
> > > 	}
> > > 
> > > 	kfree(ci->i_symlink);
> > > @@ -763,6 +768,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
> > > 		pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data,
> > > 						     iinfo->pool_ns_len);
> > > 
> > > +	if (ceph_snap(inode) != CEPH_NOSNAP && !ci->i_snapid_map)
> > > +		ci->i_snapid_map = ceph_get_snapid_map(mdsc, ceph_snap(inode));
> > > +
> > > 	spin_lock(&ci->i_ceph_lock);
> > > 
> > > 	/*
> > > @@ -2241,10 +2249,11 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
> > > 	if (!err) {
> > > 		generic_fillattr(inode, stat);
> > > 		stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
> > > -		if (ceph_snap(inode) != CEPH_NOSNAP)
> > > -			stat->dev = ceph_snap(inode);
> > > +		if (ceph_snap(inode) == CEPH_NOSNAP)
> > > +			stat->dev = inode->i_sb->s_dev;
> > > 		else
> > > -			stat->dev = 0;
> > > +			stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;
> > > +
> > > 		if (S_ISDIR(inode->i_mode)) {
> > > 			if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
> > > 						RBYTES))
> > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > index 251dc44d84a0..b6e62f5d97c1 100644
> > > --- a/fs/ceph/mds_client.c
> > > +++ b/fs/ceph/mds_client.c
> > > @@ -3504,6 +3504,8 @@ static void delayed_work(struct work_struct *work)
> > > 	dout("mdsc delayed_work\n");
> > > 	ceph_check_delayed_caps(mdsc);
> > > 
> > > +	ceph_trim_snapid_map(mdsc);
> > > +
> > > 	mutex_lock(&mdsc->mutex);
> > > 	renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
> > > 	renew_caps = time_after_eq(jiffies, HZ*renew_interval +
> > > @@ -3604,6 +3606,10 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
> > > 	ceph_caps_init(mdsc);
> > > 	ceph_adjust_min_caps(mdsc, fsc->min_caps);
> > > 
> > > +	spin_lock_init(&mdsc->snapid_map_lock);
> > > +	mdsc->snapid_map_tree = RB_ROOT;
> > > +	INIT_LIST_HEAD(&mdsc->snapid_map_lru);
> > > +
> > > 	init_rwsem(&mdsc->pool_perm_rwsem);
> > > 	mdsc->pool_perm_tree = RB_ROOT;
> > > 
> > > @@ -3797,6 +3803,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
> > > 	WARN_ON(!list_empty(&mdsc->cap_delay_list));
> > > 	mutex_unlock(&mdsc->mutex);
> > > 
> > > +	ceph_cleanup_snapid_map(mdsc);
> > > +
> > > 	ceph_cleanup_empty_realms(mdsc);
> > > 
> > > 	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
> > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > > index 837ac4b087a0..87506a8fc1c3 100644
> > > --- a/fs/ceph/mds_client.h
> > > +++ b/fs/ceph/mds_client.h
> > > @@ -294,6 +294,15 @@ struct ceph_pool_perm {
> > > 	char pool_ns[];
> > > };
> > > 
> > > +struct ceph_snapid_map {
> > > +	struct rb_node node;
> > > +	struct list_head lru;
> > > +	atomic_t ref;
> > > +	u64 snap;
> > > +	dev_t dev;
> > > +	unsigned long last_used;
> > > +};
> > > +
> > > /*
> > >  * mds client state
> > >  */
> > > @@ -368,6 +377,10 @@ struct ceph_mds_client {
> > > 	struct list_head  dentry_lru;
> > > 	int		  num_dentry;
> > > 
> > > +	spinlock_t		snapid_map_lock;
> > > +	struct rb_root		snapid_map_tree;
> > > +	struct list_head	snapid_map_lru;
> > > +
> > > 	struct rw_semaphore     pool_perm_rwsem;
> > > 	struct rb_root		pool_perm_tree;
> > > 
> > > diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
> > > index 8a2ca41e4b97..0af9ea832f03 100644
> > > --- a/fs/ceph/snap.c
> > > +++ b/fs/ceph/snap.c
> > > @@ -979,3 +979,151 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
> > > 		up_write(&mdsc->snap_rwsem);
> > > 	return;
> > > }
> > > +
> > > +struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
> > > +					    u64 snap)
> > > +{
> > > +	struct ceph_snapid_map *sm, *exist;
> > > +	struct rb_node **p, *parent;
> > > +	int ret;
> > > +
> > > +	exist = NULL;
> > > +	spin_lock(&mdsc->snapid_map_lock);
> > > +	p = &mdsc->snapid_map_tree.rb_node;
> > > +	while (*p) {
> > > +		exist = rb_entry(*p, struct ceph_snapid_map, node);
> > > +		if (snap > exist->snap) {
> > > +			p = &(*p)->rb_left;
> > > +		} else if (snap < exist->snap) {
> > > +			p = &(*p)->rb_right;
> > > +		} else {
> > > +			atomic_inc(&exist->ref);
> > > +			list_del_init(&exist->lru);
> > > +			break;
> > > +		}
> > > +		exist = NULL;
> > > +	}
> > > +	spin_unlock(&mdsc->snapid_map_lock);
> > > +	if (exist) {
> > > +		dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
> > > +		return exist;
> > > +	}
> > > +
> > > +	sm = kmalloc(sizeof(*sm), GFP_NOFS);
> > > +	if (!sm)
> > > +		return NULL;
> > > +
> > > +	ret = get_anon_bdev(&sm->dev);
> > > +	if (ret < 0) {
> > > +		kfree(sm);
> > > +		return NULL;
> > > +	}	
> > > +
> > > +	INIT_LIST_HEAD(&sm->lru);
> > > +	atomic_set(&sm->ref, 1);
> > > +	sm->snap = snap;
> > > +
> > > +	exist = NULL;
> > > +	parent = NULL;
> > > +	p = &mdsc->snapid_map_tree.rb_node;
> > > +	spin_lock(&mdsc->snapid_map_lock);
> > > +	while (*p) {
> > > +		parent = *p;
> > > +		exist = rb_entry(*p, struct ceph_snapid_map, node);
> > > +		if (snap > exist->snap)
> > > +			p = &(*p)->rb_left;
> > > +		else if (snap < exist->snap)
> > > +			p = &(*p)->rb_right;
> > > +		else
> > > +			break;
> > > +		exist = NULL;
> > > +	}
> > > +	if (exist) {
> > > +		atomic_inc(&exist->ref);
> > > +		list_del_init(&exist->lru);
> > > +	} else {
> > > +		rb_link_node(&sm->node, parent, p);
> > > +		rb_insert_color(&sm->node, &mdsc->snapid_map_tree);
> > > +	}
> > > +	spin_unlock(&mdsc->snapid_map_lock);
> > > +	if (exist) {
> > > +		free_anon_bdev(sm->dev);
> > > +		kfree(sm);
> > > +		dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
> > > +		return exist;
> > > +	}
> > > +		
> > > +	dout("create snapid map %llx -> %x\n", sm->snap, sm->dev);
> > > +	return sm;
> > > +}
> > > +
> > > +void ceph_put_snapid_map(struct ceph_mds_client* mdsc,
> > > +			 struct ceph_snapid_map *sm)
> > > +{
> > > +	if (!sm)
> > > +		return;
> > > +	if (atomic_dec_and_lock(&sm->ref, &mdsc->snapid_map_lock)) {
> > > +		sm->last_used = jiffies;
> > > +		list_add_tail(&sm->lru, &mdsc->snapid_map_lru);
> > > +		spin_unlock(&mdsc->snapid_map_lock);
> > > +	}
> > > +}
> > > +
> > > +void ceph_trim_snapid_map(struct ceph_mds_client *mdsc)
> > > +{
> > > +	struct ceph_snapid_map *sm;
> > > +	unsigned long now;
> > > +	LIST_HEAD(to_free);
> > > +
> > > +	/* unused map expires after an hour */
> > > +	static unsigned timeout = 60 * 60 * HZ;
> > > +
> > > +	spin_lock(&mdsc->snapid_map_lock);
> > > +	now = jiffies;
> > > +
> > > +	while (!list_empty(&mdsc->snapid_map_lru)) {
> > > +		sm = list_first_entry(&mdsc->snapid_map_lru,
> > > +				      struct ceph_snapid_map, lru);
> > > +		if (time_after(sm->last_used + timeout, now))
> > > +			break;
> > > +
> > > +		rb_erase(&sm->node, &mdsc->snapid_map_tree);
> > > +		list_move(&sm->lru, &to_free);
> > > +	}
> > > +	spin_unlock(&mdsc->snapid_map_lock);
> > > +
> > > +	while (!list_empty(&to_free)) {
> > > +		sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
> > > +		list_del(&sm->lru);
> > > +		dout("trim snapid map %llx -> %x\n", sm->snap, sm->dev);
> > > +		free_anon_bdev(sm->dev);
> > > +		kfree(sm);
> > > +	}
> > > +}
> > > +
> > > +void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc)
> > > +{
> > > +	struct ceph_snapid_map *sm;
> > > +	struct rb_node *p;
> > > +	LIST_HEAD(to_free);
> > > +
> > > +	spin_lock(&mdsc->snapid_map_lock);
> > > +	while ((p = rb_first(&mdsc->snapid_map_tree))) {
> > > +		sm = rb_entry(p, struct ceph_snapid_map, node);
> > > +		rb_erase(p, &mdsc->snapid_map_tree);
> > > +		list_move(&sm->lru, &to_free);
> > > +	}
> > > +	spin_unlock(&mdsc->snapid_map_lock);
> > > +
> > > +	while (!list_empty(&to_free)) {
> > > +		sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
> > > +		list_del(&sm->lru);
> > > +		free_anon_bdev(sm->dev);
> > > +		if (atomic_read(&sm->ref)) {
> > > +			pr_err("snapid map %llx -> %x still in use\n",
> > > +			       sm->snap, sm->dev);
> > > +			continue;
> > > +		}
> > 
> > This looks problematic. If it's still in use, then you'll call list_del
> > and free_anon_bdev on it, but not kfree it. What will eventually clean
> > it up, and how will it know not to do the list_del and free_anon_bdev a
> > second time?
> 
> this is for unexpected condition. don’t have a elegant way to handle. how about:
>         
> while (list_empty(&to_free)) {
>                 sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
>                 list_del(&sm->lru);
>                 free_anon_bdev(sm->dev);
>                 if (WARN_ON_ONCE(atomic_read(&sm->ref))) {
>                         pr_err("snapid map %llx -> %x still in use\n",
>                                sm->snap, sm->dev);
>                 }
>                 kfree(sm);
> }
> 

That might work. Another thing you could do is ensure you
list_del_init(&sm->lru), and then when you go to put the last reference
to the thing check whether that list_head is empty before you do the
cleanup.

IOW, use the list_head being empty as a mark that the resources held by
this object are already cleaned up and you needn't do anything but kfree
it.
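
A rough, untested sketch of that idea (using the rb-tree linkage rather
than the lru list as the "already torn down" marker, since the lru is
also empty while an entry is actively referenced): cleanup would detach
still-referenced entries from the tree (rb_erase() plus RB_CLEAR_NODE())
but leave the free_anon_bdev()/kfree() to the final put:

	void ceph_put_snapid_map(struct ceph_mds_client *mdsc,
				 struct ceph_snapid_map *sm)
	{
		if (!sm)
			return;
		if (atomic_dec_and_lock(&sm->ref, &mdsc->snapid_map_lock)) {
			if (!RB_EMPTY_NODE(&sm->node)) {
				/* still in the tree: park on the LRU as before */
				sm->last_used = jiffies;
				list_add_tail(&sm->lru, &mdsc->snapid_map_lru);
				spin_unlock(&mdsc->snapid_map_lock);
			} else {
				/* cleanup already detached it; we hold the last ref */
				spin_unlock(&mdsc->snapid_map_lock);
				free_anon_bdev(sm->dev);
				kfree(sm);
			}
		}
	}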

> > 
> > > +		kfree(sm);
> > > +	}
> > > +}
> > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > > index 75701c199b2b..453edca41a37 100644
> > > --- a/fs/ceph/super.h
> > > +++ b/fs/ceph/super.h
> > > @@ -362,7 +362,10 @@ struct ceph_inode_info {
> > > 	struct list_head i_unsafe_iops;   /* uncommitted mds inode ops */
> > > 	spinlock_t i_unsafe_lock;
> > > 
> > > -	struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
> > > +	union {
> > > +		struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
> > > +		struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */
> > > +	};
> > > 	int i_snap_realm_counter; /* snap realm (if caps) */
> > > 	struct list_head i_snap_realm_item;
> > > 	struct list_head i_snap_flush_item;
> > > @@ -778,6 +781,14 @@ extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
> > > 				  struct ceph_cap_snap *capsnap);
> > > extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
> > > 
> > > +extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc,
> > > +						   u64 snap);
> > > +extern void ceph_put_snapid_map(struct ceph_mds_client* mdsc,
> > > +				struct ceph_snapid_map *sm);
> > > +extern void ceph_trim_snapid_map(struct ceph_mds_client *mdsc);
> > > +extern void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc);
> > > +
> > > +
> > > /*
> > >  * a cap_snap is "pending" if it is still awaiting an in-progress
> > >  * sync write (that may/may not still update size, mtime, etc.).
> > 
> > -- 
> > Jeff Layton <jlayton@redhat.com>
> 
> 

-- 
Jeff Layton <jlayton@redhat.com>


Thread overview: 6 messages
2017-12-15  9:24 [PATCH 1/3] ceph: map snapid to anonymous bdev ID Yan, Zheng
2017-12-15  9:24 ` [PATCH 2/3] ceph: track read contexts in ceph_file_info Yan, Zheng
2017-12-15  9:24 ` [PATCH 3/3] ceph: fix un-balanced fsc->writeback_count update Yan, Zheng
2017-12-15 13:22 ` [PATCH 1/3] ceph: map snapid to anonymous bdev ID Jeff Layton
2017-12-15 13:45   ` Yan, Zheng
2017-12-15 13:48     ` Jeff Layton
