stgt.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 1/4] sheepdog: prevent double locking during inode reload
@ 2015-07-20  6:59 Hitoshi Mitake
  2015-07-20  6:59 ` [PATCH 2/4] sheepdog: serialize overwrapping request Hitoshi Mitake
                   ` (2 more replies)
  0 siblings, 3 replies; 4+ messages in thread
From: Hitoshi Mitake @ 2015-07-20  6:59 UTC (permalink / raw)
  To: stgt; +Cc: sheepdog, Hitoshi Mitake, Teruaki Ishizaki

The commit 0c531dfe18f9b2af made the lock protecting the in-memory
inode object finer grained and improved performance, but it also
introduced a possibility of double locking a single VDI.

The problem can arise in this sequence:
1. both threads A and B issue write requests to a VDI
2. the user makes a snapshot of the VDI
3. the VDI is now a readonly snapshot, so sheep returns SD_RES_READONLY
4. both A and B call find_vdi_name() to obtain a new ID of the
   working VDI, which results in double locking

This patch resolves the problem. Each thread keeps a version number of
the inode object in its TLS. The access info struct also has such a
number. When a thread needs to reload the inode object, it first
checks its own number against the one in the access info. If they
differ, another thread has already reloaded the object, so this thread
doesn't call find_vdi_name() and double locking is avoided. If they
are equal, no other thread has reloaded the object yet, so this thread
performs the reload itself.

Cc: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
Signed-off-by: Hitoshi Mitake <mitake.hitoshi@lab.ntt.co.jp>
---
 usr/bs_sheepdog.c | 41 +++++++++++++++++++++++++++++++++--------
 1 file changed, 33 insertions(+), 8 deletions(-)

diff --git a/usr/bs_sheepdog.c b/usr/bs_sheepdog.c
index cfaf48b..21d7dac 100644
--- a/usr/bs_sheepdog.c
+++ b/usr/bs_sheepdog.c
@@ -291,6 +291,9 @@ struct sheepdog_access_info {
 
 	struct sheepdog_inode inode;
 	pthread_rwlock_t inode_lock;
+
+	pthread_mutex_t inode_version_mutex;
+	uint64_t inode_version;
 };
 
 static inline int is_data_obj_writeable(struct sheepdog_inode *inode,
@@ -656,37 +659,58 @@ static int read_object(struct sheepdog_access_info *ai, char *buf, uint64_t oid,
 
 static int reload_inode(struct sheepdog_access_info *ai, int is_snapshot)
 {
-	int ret, need_reload = 0;
+	int ret = 0, need_reload = 0;
 	char tag[SD_MAX_VDI_TAG_LEN];
 	uint32_t vid;
 
+	static __thread uint64_t inode_version;
+
+	pthread_mutex_lock(&ai->inode_version_mutex);
+
+	if (inode_version != ai->inode_version) {
+		/* some other threads reloaded inode */
+		inode_version = ai->inode_version;
+		goto ret;
+	}
+
 	if (is_snapshot) {
 		memset(tag, 0, sizeof(tag));
 
 		ret = find_vdi_name(ai, ai->inode.name, CURRENT_VDI_ID, tag,
 				    &vid, 0);
-		if (ret)
-			return -1;
+		if (ret) {
+			ret = -1;
+			goto ret;
+		}
 
 		ret = read_object(ai, (char *)&ai->inode, vid_to_vdi_oid(vid),
 				  ai->inode.nr_copies,
 				  offsetof(struct sheepdog_inode, data_vdi_id),
 				  0, &need_reload);
-		if (ret)
-			return -1;
+		if (ret) {
+			ret = -1;
+			goto ret;
+		}
 	} else {
 		ret = read_object(ai, (char *)&ai->inode,
 				  vid_to_vdi_oid(ai->inode.vdi_id),
 				  ai->inode.nr_copies, SD_INODE_SIZE, 0,
 				  &need_reload);
-		if (ret)
-			return -1;
+		if (ret) {
+			ret = -1;
+			goto ret;
+		}
 	}
 
 	ai->min_dirty_data_idx = UINT32_MAX;
 	ai->max_dirty_data_idx = 0;
 
-	return 0;
+	inode_version++;
+	ai->inode_version = inode_version;
+
+ret:
+	pthread_mutex_unlock(&ai->inode_version_mutex);
+	return ret;
 }
 
 static int read_write_object(struct sheepdog_access_info *ai, char *buf,
@@ -1426,6 +1450,7 @@ static tgtadm_err bs_sheepdog_init(struct scsi_lu *lu, char *bsopts)
 	INIT_LIST_HEAD(&ai->fd_list_head);
 	pthread_rwlock_init(&ai->fd_list_lock, NULL);
 	pthread_rwlock_init(&ai->inode_lock, NULL);
+	pthread_mutex_init(&ai->inode_version_mutex, NULL);
 
 	return bs_thread_open(info, bs_sheepdog_request, nr_iothreads);
 }
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH 2/4] sheepdog: serialize overwrapping request
  2015-07-20  6:59 [PATCH 1/4] sheepdog: prevent double locking during inode reload Hitoshi Mitake
@ 2015-07-20  6:59 ` Hitoshi Mitake
  2015-07-20  6:59 ` [PATCH 3/4] sheepdog: pass a correct flag to find_vdi_name() Hitoshi Mitake
  2015-07-20  6:59 ` [PATCH 4/4] sheepdog: handle a case of snapshot -> failover Hitoshi Mitake
  2 siblings, 0 replies; 4+ messages in thread
From: Hitoshi Mitake @ 2015-07-20  6:59 UTC (permalink / raw)
  To: stgt; +Cc: sheepdog, Hitoshi Mitake, Teruaki Ishizaki

iSCSI initiators can issue read and write requests that target
overlapping areas. In such a case, the current sheepdog driver cannot
serialize these requests, so the semantics of the virtual drive can be
corrupted. This patch implements a mechanism for serializing such
requests.

Cc: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
Signed-off-by: Hitoshi Mitake <mitake.hitoshi@lab.ntt.co.jp>
---
 usr/bs_sheepdog.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

diff --git a/usr/bs_sheepdog.c b/usr/bs_sheepdog.c
index 21d7dac..be6d321 100644
--- a/usr/bs_sheepdog.c
+++ b/usr/bs_sheepdog.c
@@ -294,6 +294,10 @@ struct sheepdog_access_info {
 
 	pthread_mutex_t inode_version_mutex;
 	uint64_t inode_version;
+
+	struct list_head inflight_list_head;
+	pthread_mutex_t inflight_list_mutex;
+	pthread_cond_t inflight_list_cond;
 };
 
 static inline int is_data_obj_writeable(struct sheepdog_inode *inode,
@@ -1252,6 +1256,11 @@ trans_to_expect_nothing:
 		goto out;
 
 	ret = 0;
+
+	INIT_LIST_HEAD(&ai->inflight_list_head);
+	pthread_mutex_init(&ai->inflight_list_mutex, NULL);
+	pthread_cond_init(&ai->inflight_list_cond, NULL);
+
 out:
 	strcpy(filename, orig_filename);
 	free(orig_filename);
@@ -1349,6 +1358,41 @@ out:
 	return ret;
 }
 
+struct inflight_thread {
+	unsigned long min_idx, max_idx;
+	struct list_head list;
+};
+
+static void inflight_block(struct sheepdog_access_info *ai,
+			   struct inflight_thread *myself)
+{
+	struct inflight_thread *inflight;
+
+	pthread_mutex_lock(&ai->inflight_list_mutex);
+
+retry:
+	list_for_each_entry(inflight, &ai->inflight_list_head, list) {
+		if (!(myself->max_idx < inflight->min_idx ||
+		      inflight->max_idx < myself->min_idx)) {
+			pthread_cond_wait(&ai->inflight_list_cond,
+					  &ai->inflight_list_mutex);
+			goto retry;
+		}
+	}
+
+	list_add_tail(&myself->list, &ai->inflight_list_head);
+	pthread_mutex_unlock(&ai->inflight_list_mutex);
+}
+ void inflight_release(struct sheepdog_access_info *ai,
+			     struct inflight_thread *myself)
+{
+	pthread_mutex_lock(&ai->inflight_list_mutex);
+	list_del(&myself->list);
+	pthread_mutex_unlock(&ai->inflight_list_mutex);
+
+	pthread_cond_signal(&ai->inflight_list_cond);
+}
+
 static void bs_sheepdog_request(struct scsi_cmd *cmd)
 {
 	int ret = 0;
@@ -1360,6 +1404,13 @@ static void bs_sheepdog_request(struct scsi_cmd *cmd)
 	struct sheepdog_access_info *ai =
 		(struct sheepdog_access_info *)(info + 1);
 
+	uint32_t object_size = (UINT32_C(1) << ai->inode.block_size_shift);
+	struct inflight_thread myself;
+	int inflight = 0;
+
+	memset(&myself, 0, sizeof(myself));
+	INIT_LIST_HEAD(&myself.list);
+
 	switch (cmd->scb[0]) {
 	case SYNCHRONIZE_CACHE:
 	case SYNCHRONIZE_CACHE_16:
@@ -1383,6 +1434,13 @@ static void bs_sheepdog_request(struct scsi_cmd *cmd)
 		}
 
 		length = scsi_get_out_length(cmd);
+
+		myself.min_idx = cmd->offset / object_size;
+		myself.max_idx = (cmd->offset + length + (object_size - 1))
+			/ object_size;
+		inflight_block(ai, &myself);
+		inflight = 1;
+
 		ret = sd_io(ai, 1, scsi_get_out_buffer(cmd),
 			    length, cmd->offset);
 
@@ -1394,6 +1452,13 @@ static void bs_sheepdog_request(struct scsi_cmd *cmd)
 	case READ_12:
 	case READ_16:
 		length = scsi_get_in_length(cmd);
+
+		myself.min_idx = cmd->offset / object_size;
+		myself.max_idx = (cmd->offset + length + (object_size - 1))
+			/ object_size;
+		inflight_block(ai, &myself);
+		inflight = 1;
+
 		ret = sd_io(ai, 0, scsi_get_in_buffer(cmd),
 			    length, cmd->offset);
 		if (ret)
@@ -1413,6 +1478,9 @@ static void bs_sheepdog_request(struct scsi_cmd *cmd)
 			cmd, cmd->scb[0], ret, length, cmd->offset);
 		sense_data_build(cmd, key, asc);
 	}
+
+	if (inflight)
+		inflight_release(ai, &myself);
 }
 
 static int bs_sheepdog_open(struct scsi_lu *lu, char *path,
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH 3/4] sheepdog: pass a correct flag to find_vdi_name()
  2015-07-20  6:59 [PATCH 1/4] sheepdog: prevent double locking during inode reload Hitoshi Mitake
  2015-07-20  6:59 ` [PATCH 2/4] sheepdog: serialize overwrapping request Hitoshi Mitake
@ 2015-07-20  6:59 ` Hitoshi Mitake
  2015-07-20  6:59 ` [PATCH 4/4] sheepdog: handle a case of snapshot -> failover Hitoshi Mitake
  2 siblings, 0 replies; 4+ messages in thread
From: Hitoshi Mitake @ 2015-07-20  6:59 UTC (permalink / raw)
  To: stgt; +Cc: sheepdog, Hitoshi Mitake, Teruaki Ishizaki

The current usage of find_vdi_name() is invalid because it is called
with for_snapshot == 0 even in the readonly case. This patch fixes the
problem.

Cc: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
Signed-off-by: Hitoshi Mitake <mitake.hitoshi@lab.ntt.co.jp>
---
 usr/bs_sheepdog.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/usr/bs_sheepdog.c b/usr/bs_sheepdog.c
index be6d321..56630ac 100644
--- a/usr/bs_sheepdog.c
+++ b/usr/bs_sheepdog.c
@@ -681,7 +681,7 @@ static int reload_inode(struct sheepdog_access_info *ai, int is_snapshot)
 		memset(tag, 0, sizeof(tag));
 
 		ret = find_vdi_name(ai, ai->inode.name, CURRENT_VDI_ID, tag,
-				    &vid, 0);
+				    &vid, 1);
 		if (ret) {
 			ret = -1;
 			goto ret;
@@ -905,6 +905,7 @@ static int sd_io(struct sheepdog_access_info *ai, int write, char *buf, int len,
 	int need_update_inode = 0, need_reload_inode;
 	int nr_copies = ai->inode.nr_copies;
 	int need_write_lock, check_idx;
+	int read_reload_snap = 0;
 
 	goto do_req;
 
@@ -912,7 +913,7 @@ reload_in_read_path:
 	pthread_rwlock_unlock(&ai->inode_lock); /* unlock current read lock */
 
 	pthread_rwlock_wrlock(&ai->inode_lock);
-	ret = reload_inode(ai, 0);
+	ret = reload_inode(ai, read_reload_snap);
 	if (ret) {
 		eprintf("failed to reload in read path\n");
 		goto out;
@@ -1008,6 +1009,8 @@ retry:
 					dprintf("reload in read path for not"\
 						" written area\n");
 					size = orig_size;
+					read_reload_snap =
+						need_reload_inode == 1;
 					goto reload_in_read_path;
 				}
 			}
@@ -1019,6 +1022,7 @@ retry:
 			if (need_reload_inode) {
 				dprintf("reload in ordinal read path\n");
 				size = orig_size;
+				read_reload_snap = need_reload_inode == 1;
 				goto reload_in_read_path;
 			}
 		}
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* [PATCH 4/4] sheepdog: handle a case of snapshot -> failover
  2015-07-20  6:59 [PATCH 1/4] sheepdog: prevent double locking during inode reload Hitoshi Mitake
  2015-07-20  6:59 ` [PATCH 2/4] sheepdog: serialize overwrapping request Hitoshi Mitake
  2015-07-20  6:59 ` [PATCH 3/4] sheepdog: pass a correct flag to find_vdi_name() Hitoshi Mitake
@ 2015-07-20  6:59 ` Hitoshi Mitake
  2 siblings, 0 replies; 4+ messages in thread
From: Hitoshi Mitake @ 2015-07-20  6:59 UTC (permalink / raw)
  To: stgt; +Cc: sheepdog, Hitoshi Mitake, Teruaki Ishizaki

If the refreshed inode is a snapshot, we can reload the entire inode
and switch to the working VDI immediately. This is required by the new
refreshing condition introduced in commit 8f781f02ab of sheepdog.

Cc: Teruaki Ishizaki <ishizaki.teruaki@lab.ntt.co.jp>
Signed-off-by: Hitoshi Mitake <mitake.hitoshi@lab.ntt.co.jp>
---
 usr/bs_sheepdog.c | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/usr/bs_sheepdog.c b/usr/bs_sheepdog.c
index 56630ac..99519ef 100644
--- a/usr/bs_sheepdog.c
+++ b/usr/bs_sheepdog.c
@@ -704,6 +704,34 @@ static int reload_inode(struct sheepdog_access_info *ai, int is_snapshot)
 			ret = -1;
 			goto ret;
 		}
+
+		if (!!ai->inode.snap_ctime) {
+			/*
+			 * This is a case like below:
+			 * take snapshot -> write something -> failover
+			 *
+			 * Because invalidated inode is readonly and latest
+			 * working VDI can have COWed objects, we need to
+			 * resolve VID and reload its entire inode object.
+			 */
+			memset(tag, 0, sizeof(tag));
+
+			ret = find_vdi_name(ai, ai->inode.name, CURRENT_VDI_ID,
+					    tag, &vid, 1);
+			if (ret) {
+				ret = -1;
+				goto ret;
+			}
+
+			ret = read_object(ai, (char *)&ai->inode,
+					  vid_to_vdi_oid(vid),
+					  ai->inode.nr_copies, SD_INODE_SIZE, 0,
+					  &need_reload);
+			if (ret) {
+				ret = -1;
+				goto ret;
+			}
+		}
 	}
 
 	ai->min_dirty_data_idx = UINT32_MAX;
-- 
1.9.1

^ permalink raw reply related	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2015-07-20  6:59 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-07-20  6:59 [PATCH 1/4] sheepdog: prevent double locking during inode reload Hitoshi Mitake
2015-07-20  6:59 ` [PATCH 2/4] sheepdog: serialize overwrapping request Hitoshi Mitake
2015-07-20  6:59 ` [PATCH 3/4] sheepdog: pass a correct flag to find_vdi_name() Hitoshi Mitake
2015-07-20  6:59 ` [PATCH 4/4] sheepdog: handle a case of snapshot -> failover Hitoshi Mitake

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).