* [PATCH] lightnvm: pblk: Introduce hot-cold data separation
From: Heiner Litz @ 2019-04-25  5:21 UTC
  To: mb
  Cc: javier, hans.holmberg, igor.j.konopko, linux-block, linux-kernel,
	Heiner Litz

Introduce the capability to manage multiple open lines. Maintain one line
for user writes (hot) and a second line for gc writes (cold). As user and
gc writes still utilize a shared ring buffer, in rare cases a multi-sector
write will contain both gc and user data. This is acceptable, as on a
tested SSD with minimum write size of 64KB, less than 1% of all writes
contain both hot and cold sectors.
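
As a rough illustration of the routing (a stand-alone sketch; the flag
bit and struct layout are placeholders rather than the pblk
definitions), the write thread picks the target line type from the
ring buffer entry's context flags:

#include <stdio.h>

enum { PBLK_LINETYPE_DATA = 2, PBLK_LINETYPE_GC = 3 };

#define PBLK_IOTYPE_GC (1 << 1)	/* placeholder bit for illustration */

struct w_ctx { unsigned int flags; };

/* user writes go to the hot (data) line, gc writes to the cold line */
static int line_type_for(const struct w_ctx *w_ctx)
{
	return (w_ctx->flags & PBLK_IOTYPE_GC) ?
			PBLK_LINETYPE_GC : PBLK_LINETYPE_DATA;
}

int main(void)
{
	struct w_ctx user = { .flags = 0 };
	struct w_ctx gc = { .flags = PBLK_IOTYPE_GC };

	printf("user write -> line type %d\n", line_type_for(&user));
	printf("gc write   -> line type %d\n", line_type_for(&gc));
	return 0;
}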

For a zipfian random distribution of LBA writes with theta-zipf of 1.2,
this patch reduces write amplification from 2.5 to 1.3 and increases user
write IOPS by 2.1x.
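
Back-of-the-envelope, the two results are consistent: with write
amplification WA = (user + gc writes) / user writes, gc traffic per
user sector drops from 1.5 (WA 2.5) to 0.3 (WA 1.3), and with device
write bandwidth as the bottleneck, user throughput scales roughly with
WA_old / WA_new = 2.5 / 1.3 ~= 1.9x, in line with the measured 2.1x.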

Signed-off-by: Heiner Litz <hlitz@ucsc.edu>
---
 drivers/lightnvm/pblk-core.c     | 157 +++++++++++++++++++++++--------
 drivers/lightnvm/pblk-init.c     |   9 +-
 drivers/lightnvm/pblk-map.c      |  29 +++---
 drivers/lightnvm/pblk-recovery.c |  43 ++++++---
 drivers/lightnvm/pblk-write.c    |  37 +++++---
 drivers/lightnvm/pblk.h          |  28 ++++--
 6 files changed, 213 insertions(+), 90 deletions(-)

diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index 73be3a0311ff..bbb216788bc8 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -1294,7 +1294,7 @@ int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line)
 	int ret;
 
 	spin_lock(&l_mg->free_lock);
-	l_mg->data_line = line;
+	pblk_line_set_data_line(pblk, PBLK_LINETYPE_DATA, line);
 	list_del(&line->list);
 
 	ret = pblk_line_prepare(pblk, line);
@@ -1410,7 +1410,7 @@ struct pblk_line *pblk_line_get(struct pblk *pblk)
 }
 
 static struct pblk_line *pblk_line_retry(struct pblk *pblk,
-					 struct pblk_line *line)
+					 struct pblk_line *line, int type)
 {
 	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
 	struct pblk_line *retry_line;
@@ -1419,7 +1419,7 @@ static struct pblk_line *pblk_line_retry(struct pblk *pblk,
 	spin_lock(&l_mg->free_lock);
 	retry_line = pblk_line_get(pblk);
 	if (!retry_line) {
-		l_mg->data_line = NULL;
+		pblk_line_set_data_line(pblk, type, NULL);
 		spin_unlock(&l_mg->free_lock);
 		return NULL;
 	}
@@ -1432,7 +1432,7 @@ static struct pblk_line *pblk_line_retry(struct pblk *pblk,
 
 	pblk_line_reinit(line);
 
-	l_mg->data_line = retry_line;
+	pblk_line_set_data_line(pblk, type, retry_line);
 	spin_unlock(&l_mg->free_lock);
 
 	pblk_rl_free_lines_dec(&pblk->rl, line, false);
@@ -1450,37 +1450,29 @@ static void pblk_set_space_limit(struct pblk *pblk)
 	atomic_set(&rl->rb_space, 0);
 }
 
-struct pblk_line *pblk_line_get_first_data(struct pblk *pblk)
+struct pblk_line *pblk_line_get_first_data(struct pblk *pblk, int type)
 {
 	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
 	struct pblk_line *line;
 
 	spin_lock(&l_mg->free_lock);
-	line = pblk_line_get(pblk);
+	line = pblk_line_init_data_line(pblk, type);
 	if (!line) {
 		spin_unlock(&l_mg->free_lock);
 		return NULL;
 	}
 
-	line->seq_nr = l_mg->d_seq_nr++;
-	line->type = PBLK_LINETYPE_DATA;
-	l_mg->data_line = line;
-
 	pblk_line_setup_metadata(line, l_mg, &pblk->lm);
 
 	/* Allocate next line for preparation */
-	l_mg->data_next = pblk_line_get(pblk);
-	if (!l_mg->data_next) {
+	if (!pblk_line_init_next_line(pblk, type)) {
 		/* If we cannot get a new line, we need to stop the pipeline.
 		 * Only allow as many writes in as we can store safely and then
 		 * fail gracefully
 		 */
 		pblk_set_space_limit(pblk);
 
-		l_mg->data_next = NULL;
-	} else {
-		l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
-		l_mg->data_next->type = PBLK_LINETYPE_DATA;
+		pblk_line_set_next_line(pblk, type, NULL);
 	}
 	spin_unlock(&l_mg->free_lock);
 
@@ -1488,14 +1480,14 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk)
 		return NULL;
 
 	if (pblk_line_erase(pblk, line)) {
-		line = pblk_line_retry(pblk, line);
+		line = pblk_line_retry(pblk, line, type);
 		if (!line)
 			return NULL;
 	}
 
 retry_setup:
 	if (!pblk_line_init_metadata(pblk, line, NULL)) {
-		line = pblk_line_retry(pblk, line);
+		line = pblk_line_retry(pblk, line, type);
 		if (!line)
 			return NULL;
 
@@ -1503,7 +1495,7 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk)
 	}
 
 	if (!pblk_line_init_bb(pblk, line, 1)) {
-		line = pblk_line_retry(pblk, line);
+		line = pblk_line_retry(pblk, line, type);
 		if (!line)
 			return NULL;
 
@@ -1596,12 +1588,18 @@ void __pblk_pipeline_flush(struct pblk *pblk)
 	pblk_flush_writer(pblk);
 	pblk_wait_for_meta(pblk);
 
-	ret = pblk_recov_pad(pblk);
+	ret = pblk_recov_pad(pblk, PBLK_LINETYPE_DATA);
 	if (ret) {
 		pblk_err(pblk, "could not close data on teardown(%d)\n", ret);
 		return;
 	}
 
+	ret = pblk_recov_pad(pblk, PBLK_LINETYPE_GC);
+	if (ret) {
+		pblk_err(pblk, "could not close gc on teardown(%d)\n", ret);
+		return;
+	}
+
 	flush_workqueue(pblk->bb_wq);
 	pblk_line_close_meta_sync(pblk);
 }
@@ -1613,8 +1611,10 @@ void __pblk_pipeline_stop(struct pblk *pblk)
 	spin_lock(&l_mg->free_lock);
 	pblk->state = PBLK_STATE_STOPPED;
 	trace_pblk_state(pblk_disk_name(pblk), pblk->state);
-	l_mg->data_line = NULL;
-	l_mg->data_next = NULL;
+	pblk_line_set_data_line(pblk, PBLK_LINETYPE_DATA, NULL);
+	pblk_line_set_data_line(pblk, PBLK_LINETYPE_GC, NULL);
+	pblk_line_set_next_line(pblk, PBLK_LINETYPE_DATA, NULL);
+	pblk_line_set_next_line(pblk, PBLK_LINETYPE_GC, NULL);
 	spin_unlock(&l_mg->free_lock);
 }
 
@@ -1624,19 +1624,20 @@ void pblk_pipeline_stop(struct pblk *pblk)
 	__pblk_pipeline_stop(pblk);
 }
 
-struct pblk_line *pblk_line_replace_data(struct pblk *pblk)
+struct pblk_line *pblk_line_replace_data(struct pblk *pblk, int type)
 {
 	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
 	struct pblk_line *cur, *new = NULL;
 	unsigned int left_seblks;
 
-	new = l_mg->data_next;
+	new = pblk_line_get_next_line(pblk, type);
 	if (!new)
 		goto out;
 
 	spin_lock(&l_mg->free_lock);
-	cur = l_mg->data_line;
-	l_mg->data_line = new;
+
+	cur = pblk_line_get_data_line(pblk, type);
+	pblk_line_set_data_line(pblk, type, new);
 
 	pblk_line_setup_metadata(new, l_mg, &pblk->lm);
 	spin_unlock(&l_mg->free_lock);
@@ -1659,7 +1660,7 @@ struct pblk_line *pblk_line_replace_data(struct pblk *pblk)
 
 retry_setup:
 	if (!pblk_line_init_metadata(pblk, new, cur)) {
-		new = pblk_line_retry(pblk, new);
+		new = pblk_line_retry(pblk, new, type);
 		if (!new)
 			goto out;
 
@@ -1667,7 +1668,7 @@ struct pblk_line *pblk_line_replace_data(struct pblk *pblk)
 	}
 
 	if (!pblk_line_init_bb(pblk, new, 1)) {
-		new = pblk_line_retry(pblk, new);
+		new = pblk_line_retry(pblk, new, type);
 		if (!new)
 			goto out;
 
@@ -1678,17 +1679,12 @@ struct pblk_line *pblk_line_replace_data(struct pblk *pblk)
 
 	/* Allocate next line for preparation */
 	spin_lock(&l_mg->free_lock);
-	l_mg->data_next = pblk_line_get(pblk);
-	if (!l_mg->data_next) {
+	if (!pblk_line_init_next_line(pblk, type)) {
 		/* If we cannot get a new line, we need to stop the pipeline.
 		 * Only allow as many writes in as we can store safely and then
 		 * fail gracefully
 		 */
 		pblk_stop_writes(pblk, new);
-		l_mg->data_next = NULL;
-	} else {
-		l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
-		l_mg->data_next->type = PBLK_LINETYPE_DATA;
 	}
 	spin_unlock(&l_mg->free_lock);
 
@@ -1801,15 +1797,100 @@ int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa)
 	return err;
 }
 
-struct pblk_line *pblk_line_get_data(struct pblk *pblk)
+struct pblk_line *pblk_line_get_data_line(struct pblk *pblk, int type)
 {
-	return pblk->l_mg.data_line;
+	switch (type) {
+	case PBLK_LINETYPE_DATA:
+		return pblk->l_mg.data_line;
+	case PBLK_LINETYPE_GC:
+		return pblk->l_mg.gc_line;
+	case PBLK_LINETYPE_LOG:
+		return pblk->l_mg.log_line;
+	default:
+		WARN(1, "Unsupported line type\n");
+		return NULL;
+	}
 }
 
 /* For now, always erase next line */
-struct pblk_line *pblk_line_get_erase(struct pblk *pblk)
+struct pblk_line *pblk_line_get_next_line(struct pblk *pblk, int type)
 {
-	return pblk->l_mg.data_next;
+	switch (type) {
+	case PBLK_LINETYPE_DATA:
+		return pblk->l_mg.data_next;
+	case PBLK_LINETYPE_GC:
+		return pblk->l_mg.gc_next;
+	case PBLK_LINETYPE_LOG:
+		return pblk->l_mg.log_next;
+	default:
+		WARN(1, "Unsupported line type\n");
+		return NULL;
+	}
+}
+
+void pblk_line_set_data_line(struct pblk *pblk, int type, struct pblk_line *line)
+{
+	switch (type) {
+	case PBLK_LINETYPE_DATA:
+		pblk->l_mg.data_line = line;
+		break;
+	case PBLK_LINETYPE_GC:
+		pblk->l_mg.gc_line = line;
+		break;
+	case PBLK_LINETYPE_LOG:
+		pblk->l_mg.log_line = line;
+		break;
+	default:
+		WARN(1, "Unsupported line type\n");
+	}
+}
+
+/* For now, always erase next line */
+void pblk_line_set_next_line(struct pblk *pblk, int type, struct pblk_line *line)
+{
+	switch (type) {
+	case PBLK_LINETYPE_DATA:
+		pblk->l_mg.data_next = line;
+		break;
+	case PBLK_LINETYPE_GC:
+		pblk->l_mg.gc_next = line;
+		break;
+	case PBLK_LINETYPE_LOG:
+		pblk->l_mg.log_next = line;
+		break;
+	default:
+		WARN(1, "Unsupported line type\n");
+	}
+}
+
+struct pblk_line *pblk_line_init_data_line(struct pblk *pblk, int type)
+{
+	struct pblk_line *line = pblk_line_get(pblk);
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+
+	lockdep_assert_held(&l_mg->free_lock);
+
+	if (line) {
+		line->seq_nr = l_mg->d_seq_nr++;
+		line->type = type;
+		pblk_line_set_data_line(pblk, type, line);
+	}
+	return line;
+}
+
+struct pblk_line *pblk_line_init_next_line(struct pblk *pblk, int type)
+{
+	struct pblk_line *line = pblk_line_get(pblk);
+	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+
+	lockdep_assert_held(&l_mg->free_lock);
+
+	if (line) {
+		line->seq_nr = l_mg->d_seq_nr++;
+		line->type = type;
+		pblk_line_set_next_line(pblk, type, line);
+	}
+	return line;
 }
 
 int pblk_line_is_full(struct pblk_line *line)
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index 1e227a08e54a..0d127e32d556 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -139,11 +139,16 @@ static int pblk_l2p_recover(struct pblk *pblk, bool factory_init)
 
 	if (!line) {
 		/* Configure next line for user data */
-		line = pblk_line_get_first_data(pblk);
+		line = pblk_line_get_first_data(pblk, PBLK_LINETYPE_DATA);
 		if (!line)
 			return -EFAULT;
 	}
 
+	/* Configure next line for gc data */
+	line = pblk_line_get_first_data(pblk, PBLK_LINETYPE_GC);
+	if (!line)
+		return -EFAULT;
+
 	return 0;
 }
 
@@ -832,7 +837,7 @@ static int pblk_line_mg_init(struct pblk *pblk)
 	int i, bb_distance;
 
 	l_mg->nr_lines = geo->num_chk;
-	l_mg->log_line = l_mg->data_line = NULL;
+	l_mg->log_line = l_mg->data_line = l_mg->gc_line = NULL;
 	l_mg->l_seq_nr = l_mg->d_seq_nr = 0;
 	l_mg->nr_free_lines = 0;
 	bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES);
diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c
index 5408e32b2f13..9b0539137df0 100644
--- a/drivers/lightnvm/pblk-map.c
+++ b/drivers/lightnvm/pblk-map.c
@@ -23,9 +23,9 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
 			      struct ppa_addr *ppa_list,
 			      unsigned long *lun_bitmap,
 			      void *meta_list,
-			      unsigned int valid_secs)
+			      unsigned int valid_secs, int type)
 {
-	struct pblk_line *line = pblk_line_get_data(pblk);
+	struct pblk_line *line = pblk_line_get_data_line(pblk, type);
 	struct pblk_emeta *emeta;
 	struct pblk_w_ctx *w_ctx;
 	__le64 *lba_list;
@@ -42,7 +42,7 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
 		/* If we cannot allocate a new line, make sure to store metadata
 		 * on current line and then fail
 		 */
-		line = pblk_line_replace_data(pblk);
+		line = pblk_line_replace_data(pblk, type);
 		pblk_line_close_meta(pblk, prev_line);
 
 		if (!line) {
@@ -94,8 +94,8 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
 }
 
 int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
-		 unsigned long *lun_bitmap, unsigned int valid_secs,
-		 unsigned int off)
+		unsigned long *lun_bitmap, unsigned int valid_secs,
+		unsigned int off, int type)
 {
 	void *meta_list = pblk_get_meta_for_writes(pblk, rqd);
 	void *meta_buffer;
@@ -110,7 +110,7 @@ int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
 		meta_buffer = pblk_get_meta(pblk, meta_list, i);
 
 		ret = pblk_map_page_data(pblk, sentry + i, &ppa_list[i],
-					lun_bitmap, meta_buffer, map_secs);
+					 lun_bitmap, meta_buffer, map_secs, type);
 		if (ret)
 			return ret;
 	}
@@ -120,8 +120,9 @@ int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
 
 /* only if erase_ppa is set, acquire erase semaphore */
 int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
-		       unsigned int sentry, unsigned long *lun_bitmap,
-		       unsigned int valid_secs, struct ppa_addr *erase_ppa)
+		      unsigned int sentry, unsigned long *lun_bitmap,
+		      unsigned int valid_secs, struct ppa_addr *erase_ppa,
+		      int type)
 {
 	struct nvm_tgt_dev *dev = pblk->dev;
 	struct nvm_geo *geo = &dev->geo;
@@ -141,7 +142,7 @@ int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
 		meta_buffer = pblk_get_meta(pblk, meta_list, i);
 
 		ret = pblk_map_page_data(pblk, sentry + i, &ppa_list[i],
-					lun_bitmap, meta_buffer, map_secs);
+					 lun_bitmap, meta_buffer, map_secs, type);
 		if (ret)
 			return ret;
 
@@ -150,10 +151,10 @@ int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
 		/* line can change after page map. We might also be writing the
 		 * last line.
 		 */
-		e_line = pblk_line_get_erase(pblk);
+		e_line = pblk_line_get_next_line(pblk, type);
 		if (!e_line)
 			return pblk_map_rq(pblk, rqd, sentry, lun_bitmap,
-							valid_secs, i + min);
+					   valid_secs, i + min, type);
 
 		spin_lock(&e_line->lock);
 		if (!test_bit(erase_lun, e_line->erase_bitmap)) {
@@ -168,17 +169,17 @@ int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
 
 			/* Avoid evaluating e_line->left_eblks */
 			return pblk_map_rq(pblk, rqd, sentry, lun_bitmap,
-							valid_secs, i + min);
+					   valid_secs, i + min, type);
 		}
 		spin_unlock(&e_line->lock);
 	}
 
-	d_line = pblk_line_get_data(pblk);
+	d_line = pblk_line_get_data_line(pblk, type);
 
 	/* line can change after page map. We might also be writing the
 	 * last line.
 	 */
-	e_line = pblk_line_get_erase(pblk);
+	e_line = pblk_line_get_next_line(pblk, type);
 	if (!e_line)
 		return -ENOSPC;
 
diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
index 865fe310cab4..38bf7b28e73f 100644
--- a/drivers/lightnvm/pblk-recovery.c
+++ b/drivers/lightnvm/pblk-recovery.c
@@ -677,12 +677,12 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
 {
 	struct pblk_line_meta *lm = &pblk->lm;
 	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
-	struct pblk_line *line, *tline, *data_line = NULL;
+	struct pblk_line *line, *tline, *data_line = NULL, *gc_line = NULL;
 	struct pblk_smeta *smeta;
 	struct pblk_emeta *emeta;
 	struct line_smeta *smeta_buf;
 	int found_lines = 0, recovered_lines = 0, open_lines = 0;
-	int is_next = 0;
+	int is_data_next = 0, is_gc_next = 0;
 	int meta_line;
 	int i, valid_uuid = 0;
 	LIST_HEAD(recov_list);
@@ -838,7 +838,11 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
 			trace_pblk_line_state(pblk_disk_name(pblk), line->id,
 					line->state);
 
-			data_line = line;
+			if (line->type == PBLK_LINETYPE_DATA)
+				data_line = line;
+			else if (line->type == PBLK_LINETYPE_GC)
+				gc_line = line;
+
 			line->meta_line = meta_line;
 
 			open_lines++;
@@ -852,19 +856,30 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
 		spin_unlock(&l_mg->free_lock);
 	} else {
 		spin_lock(&l_mg->free_lock);
-		l_mg->data_line = data_line;
-		/* Allocate next line for preparation */
-		l_mg->data_next = pblk_line_get(pblk);
-		if (l_mg->data_next) {
-			l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
-			l_mg->data_next->type = PBLK_LINETYPE_DATA;
-			is_next = 1;
+		if (data_line) {
+			pblk_line_set_data_line(pblk, PBLK_LINETYPE_DATA,
+						data_line);
+			/* Allocate next line for preparation */
+			if (pblk_line_init_next_line(pblk, PBLK_LINETYPE_DATA))
+				is_data_next = 1;
+		}
+		if (gc_line) {
+			pblk_line_set_data_line(pblk, PBLK_LINETYPE_GC,
+						gc_line);
+			/* Allocate next line for preparation */
+			if (pblk_line_init_next_line(pblk, PBLK_LINETYPE_GC))
+				is_gc_next = 1;
 		}
+
 		spin_unlock(&l_mg->free_lock);
 	}
 
-	if (is_next)
-		pblk_line_erase(pblk, l_mg->data_next);
+	if (is_data_next)
+		pblk_line_erase(pblk, pblk_line_get_next_line(pblk,
+							   PBLK_LINETYPE_DATA));
+	if (is_gc_next)
+		pblk_line_erase(pblk, pblk_line_get_next_line(pblk,
+							   PBLK_LINETYPE_GC));
 
 out:
 	if (found_lines != recovered_lines)
@@ -877,7 +892,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
 /*
  * Pad current line
  */
-int pblk_recov_pad(struct pblk *pblk)
+int pblk_recov_pad(struct pblk *pblk, int type)
 {
 	struct pblk_line *line;
 	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
@@ -885,7 +900,7 @@ int pblk_recov_pad(struct pblk *pblk)
 	int ret = 0;
 
 	spin_lock(&l_mg->free_lock);
-	line = l_mg->data_line;
+	line = pblk_line_get_data_line(pblk, type);
 	left_msecs = line->left_msecs;
 	spin_unlock(&l_mg->free_lock);
 
diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
index 4e63f9b5954c..1e38adc63800 100644
--- a/drivers/lightnvm/pblk-write.c
+++ b/drivers/lightnvm/pblk-write.c
@@ -313,10 +313,10 @@ static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
 }
 
 static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
-			   struct ppa_addr *erase_ppa)
+			   struct ppa_addr *erase_ppa, int type)
 {
 	struct pblk_line_meta *lm = &pblk->lm;
-	struct pblk_line *e_line = pblk_line_get_erase(pblk);
+	struct pblk_line *e_line = pblk_line_get_next_line(pblk, type);
 	struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
 	unsigned int valid = c_ctx->nr_valid;
 	unsigned int padded = c_ctx->nr_padded;
@@ -337,10 +337,10 @@ static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
 
 	if (likely(!e_line || !atomic_read(&e_line->left_eblks)))
 		ret = pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap,
-							valid, 0);
+				  valid, 0, type);
 	else
 		ret = pblk_map_erase_rq(pblk, rqd, c_ctx->sentry, lun_bitmap,
-							valid, erase_ppa);
+					valid, erase_ppa, type);
 
 	return ret;
 }
@@ -446,12 +446,13 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
 
 static inline bool pblk_valid_meta_ppa(struct pblk *pblk,
 				       struct pblk_line *meta_line,
-				       struct nvm_rq *data_rqd)
+				       struct nvm_rq *data_rqd,
+				       int type)
 {
 	struct nvm_tgt_dev *dev = pblk->dev;
 	struct nvm_geo *geo = &dev->geo;
 	struct pblk_c_ctx *data_c_ctx = nvm_rq_to_pdu(data_rqd);
-	struct pblk_line *data_line = pblk_line_get_data(pblk);
+	struct pblk_line *data_line = pblk_line_get_data_line(pblk, type);
 	struct ppa_addr ppa, ppa_opt;
 	u64 paddr;
 	int pos_opt;
@@ -481,7 +482,8 @@ static inline bool pblk_valid_meta_ppa(struct pblk *pblk,
 }
 
 static struct pblk_line *pblk_should_submit_meta_io(struct pblk *pblk,
-						    struct nvm_rq *data_rqd)
+						    struct nvm_rq *data_rqd,
+						    int type)
 {
 	struct pblk_line_meta *lm = &pblk->lm;
 	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
@@ -499,13 +501,13 @@ static struct pblk_line *pblk_should_submit_meta_io(struct pblk *pblk,
 	}
 	spin_unlock(&l_mg->close_lock);
 
-	if (!pblk_valid_meta_ppa(pblk, meta_line, data_rqd))
+	if (!pblk_valid_meta_ppa(pblk, meta_line, data_rqd, type))
 		return NULL;
 
 	return meta_line;
 }
 
-static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
+static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd, int type)
 {
 	struct ppa_addr erase_ppa;
 	struct pblk_line *meta_line;
@@ -514,13 +516,13 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
 	pblk_ppa_set_empty(&erase_ppa);
 
 	/* Assign lbas to ppas and populate request structure */
-	err = pblk_setup_w_rq(pblk, rqd, &erase_ppa);
+	err = pblk_setup_w_rq(pblk, rqd, &erase_ppa, type);
 	if (err) {
 		pblk_err(pblk, "could not setup write request: %d\n", err);
 		return NVM_IO_ERR;
 	}
 
-	meta_line = pblk_should_submit_meta_io(pblk, rqd);
+	meta_line = pblk_should_submit_meta_io(pblk, rqd, type);
 
 	/* Submit data write for current data line */
 	err = pblk_submit_io(pblk, rqd);
@@ -532,11 +534,12 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
 	if (!pblk_ppa_empty(erase_ppa)) {
 		/* Submit erase for next data line */
 		if (pblk_blk_erase_async(pblk, erase_ppa)) {
-			struct pblk_line *e_line = pblk_line_get_erase(pblk);
+			struct pblk_line *e_line;
 			struct nvm_tgt_dev *dev = pblk->dev;
 			struct nvm_geo *geo = &dev->geo;
 			int bit;
 
+			e_line = pblk_line_get_next_line(pblk, type);
 			atomic_inc(&e_line->left_eblks);
 			bit = pblk_ppa_to_pos(geo, erase_ppa);
 			WARN_ON(!test_and_clear_bit(bit, e_line->erase_bitmap));
@@ -574,6 +577,9 @@ static int pblk_submit_write(struct pblk *pblk, int *secs_left)
 	unsigned int secs_to_flush, packed_meta_pgs;
 	unsigned long pos;
 	unsigned int resubmit;
+	int type;
+	struct pblk_w_ctx *w_ctx;
+	struct pblk_rb_entry *entry;
 
 	*secs_left = 0;
 
@@ -633,13 +639,18 @@ static int pblk_submit_write(struct pblk *pblk, int *secs_left)
 	rqd = pblk_alloc_rqd(pblk, PBLK_WRITE);
 	rqd->bio = bio;
 
+	entry = &pblk->rwb.entries[pos];
+	w_ctx = &entry->w_ctx;
+	type = w_ctx->flags & PBLK_IOTYPE_GC ?
+		PBLK_LINETYPE_GC : PBLK_LINETYPE_DATA;
+
 	if (pblk_rb_read_to_bio(&pblk->rwb, rqd, pos, secs_to_sync,
 								secs_avail)) {
 		pblk_err(pblk, "corrupted write bio\n");
 		goto fail_put_bio;
 	}
 
-	if (pblk_submit_io_set(pblk, rqd))
+	if (pblk_submit_io_set(pblk, rqd, type))
 		goto fail_free_bio;
 
 #ifdef CONFIG_NVM_PBLK_DEBUG
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index e304754aaa3c..93ef5a2bffe6 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -316,6 +316,7 @@ enum {
 	PBLK_LINETYPE_FREE = 0,
 	PBLK_LINETYPE_LOG = 1,
 	PBLK_LINETYPE_DATA = 2,
+	PBLK_LINETYPE_GC = 3,
 
 	/* Line state */
 	PBLK_LINESTATE_NEW = 9,
@@ -526,8 +527,10 @@ struct pblk_line_mgmt {
 
 	struct pblk_line *log_line;	/* Current FTL log line */
 	struct pblk_line *data_line;	/* Current data line */
+	struct pblk_line *gc_line;	/* Current gc line */
 	struct pblk_line *log_next;	/* Next FTL log line */
 	struct pblk_line *data_next;	/* Next data line */
+	struct pblk_line *gc_next;	/* Next gc line */
 
 	struct list_head emeta_list;	/* Lines queued to schedule emeta */
 
@@ -804,14 +807,20 @@ struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
 			      unsigned int nr_secs, unsigned int len,
 			      int alloc_type, gfp_t gfp_mask);
 struct pblk_line *pblk_line_get(struct pblk *pblk);
-struct pblk_line *pblk_line_get_first_data(struct pblk *pblk);
-struct pblk_line *pblk_line_replace_data(struct pblk *pblk);
+struct pblk_line *pblk_line_get_first_data(struct pblk *pblk, int type);
+struct pblk_line *pblk_line_replace_data(struct pblk *pblk, int type);
 void pblk_ppa_to_line_put(struct pblk *pblk, struct ppa_addr ppa);
 void pblk_rq_to_line_put(struct pblk *pblk, struct nvm_rq *rqd);
 int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line);
 void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line);
-struct pblk_line *pblk_line_get_data(struct pblk *pblk);
-struct pblk_line *pblk_line_get_erase(struct pblk *pblk);
+struct pblk_line *pblk_line_get_data_line(struct pblk *pblk, int type);
+struct pblk_line *pblk_line_get_next_line(struct pblk *pblk, int type);
+void pblk_line_set_data_line(struct pblk *pblk, int type,
+			     struct pblk_line *line);
+void pblk_line_set_next_line(struct pblk *pblk, int type,
+			     struct pblk_line *line);
+struct pblk_line *pblk_line_init_next_line(struct pblk *pblk, int type);
+struct pblk_line *pblk_line_init_data_line(struct pblk *pblk, int type);
 int pblk_line_erase(struct pblk *pblk, struct pblk_line *line);
 int pblk_line_is_full(struct pblk_line *line);
 void pblk_line_free(struct pblk_line *line);
@@ -875,11 +884,12 @@ int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq);
  * pblk map
  */
 int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
-		       unsigned int sentry, unsigned long *lun_bitmap,
-		       unsigned int valid_secs, struct ppa_addr *erase_ppa);
+		      unsigned int sentry, unsigned long *lun_bitmap,
+		      unsigned int valid_secs, struct ppa_addr *erase_ppa,
+		      int type);
 int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
-		 unsigned long *lun_bitmap, unsigned int valid_secs,
-		 unsigned int off);
+		unsigned long *lun_bitmap, unsigned int valid_secs,
+		unsigned int off, int type);
 
 /*
  * pblk write thread
@@ -899,7 +909,7 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq);
  * pblk recovery
  */
 struct pblk_line *pblk_recov_l2p(struct pblk *pblk);
-int pblk_recov_pad(struct pblk *pblk);
+int pblk_recov_pad(struct pblk *pblk, int type);
 int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta);
 
 /*
-- 
2.17.1

* Re: [PATCH] lightnvm: pblk: Introduce hot-cold data separation
From: Javier González @ 2019-04-25  5:49 UTC
  To: Heiner Litz
  Cc: Matias Bjørling, Hans Holmberg, Konopko, Igor J,
	linux-block, linux-kernel

> On 25 Apr 2019, at 07.21, Heiner Litz <hlitz@ucsc.edu> wrote:
> 
> [...]
> 
> @@ -526,8 +527,10 @@ struct pblk_line_mgmt {
> 
> 	struct pblk_line *log_line;	/* Current FTL log line */
> 	struct pblk_line *data_line;	/* Current data line */
> +	struct pblk_line *gc_line;	/* Current gc line */
> 	struct pblk_line *log_next;	/* Next FTL log line */
> 	struct pblk_line *data_next;	/* Next data line */
> +	struct pblk_line *gc_next;	/* Next gc line */

Would it make sense to generalize this now and just use an array of
lines, with positions for user and GC? Support for several streams is
around the corner and would use this same strategy.
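
Something along these lines (just a sketch with placeholder bounds,
not a drop-in for pblk):

#define PBLK_NR_LINETYPES 4	/* FREE, LOG, DATA, GC; grows with streams */

struct pblk_line;

struct pblk_line_streams {
	/* current and next open line per stream/type */
	struct pblk_line *cur[PBLK_NR_LINETYPES];
	struct pblk_line *next[PBLK_NR_LINETYPES];
};

static inline struct pblk_line *
pblk_stream_get_line(struct pblk_line_streams *s, int type)
{
	if (type <= 0 || type >= PBLK_NR_LINETYPES)
		return NULL;	/* FREE has no open line */
	return s->cur[type];
}

static inline void
pblk_stream_set_line(struct pblk_line_streams *s, int type,
		     struct pblk_line *line)
{
	if (type > 0 && type < PBLK_NR_LINETYPES)
		s->cur[type] = line;
}

The switch statements in the getters/setters would then collapse into
a bounds-checked array access.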

> 
> [...]
> 
> @@ -875,11 +884,12 @@ int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq);
>  * pblk map
>  */
> int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
> -		       unsigned int sentry, unsigned long *lun_bitmap,
> -		       unsigned int valid_secs, struct ppa_addr *erase_ppa);
> +		      unsigned int sentry, unsigned long *lun_bitmap,
> +		      unsigned int valid_secs, struct ppa_addr *erase_ppa,
> +		      int type);
> int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
> -		 unsigned long *lun_bitmap, unsigned int valid_secs,
> -		 unsigned int off);
> +		unsigned long *lun_bitmap, unsigned int valid_secs,
> +		unsigned int off, int type);

It would be good to clean up the pblk_map_* functions; they have grown
a lot of parameters.
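
One option (a sketch only, reusing the existing type names) would be
to fold the arguments into a mapping context, so that new fields such
as the line type stop widening every prototype:

struct pblk;
struct nvm_rq;
struct ppa_addr;

struct pblk_map_ctx {
	unsigned int sentry;
	unsigned long *lun_bitmap;
	unsigned int valid_secs;
	unsigned int off;
	int type;			/* PBLK_LINETYPE_* */
	struct ppa_addr *erase_ppa;	/* only for the erase variant */
};

int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd,
		struct pblk_map_ctx *ctx);
int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
		      struct pblk_map_ctx *ctx);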

> 
> [...]


I like this patch a lot. Thanks Heiner!

I started some tests on my side; I’ll send a Tested-by tag once they
complete. For now you can add my review - the comments above are small
suggestions.

Reviewed-by: Javier González <javier@javigon.com>


* Re: [PATCH] lightnvm: pblk: Introduce hot-cold data separation
From: Igor Konopko @ 2019-04-26  9:11 UTC
  To: Heiner Litz, mb; +Cc: javier, hans.holmberg, linux-block, linux-kernel

On 25.04.2019 07:21, Heiner Litz wrote:
> Introduce the capability to manage multiple open lines. Maintain one line
> for user writes (hot) and a second line for gc writes (cold). As user and
> gc writes still utilize a shared ring buffer, in rare cases a multi-sector
> write will contain both gc and user data. This is acceptable, as on a
> tested SSD with minimum write size of 64KB, less than 1% of all writes
> contain both hot and cold sectors.
> 

Hi Heiner,

Generally I really like these changes - I have been thinking about
something similar for a while, so it is very good to see this patch.

I have one question related to this patch, since it is not very clear
to me how you ensure data integrity in the following scenario:
- we have open line X for user data and line Y for GC
- GC writes LBA=N to line Y
- user writes LBA=N to line X
- we have a power failure while neither line X nor line Y is
  completely written
- during pblk creation we execute OOB metadata recovery
And here is the question: how do we distinguish whether LBA=N from
line Y or LBA=N from line X is the valid one? Lines X and Y might have
seq_ids in either descending or ascending order, which creates two
possible scenarios as well.
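
To make the concern concrete, here is a small user-space sketch of an
L2P replay that trusts line seq_nr alone (illustrative only, not the
pblk recovery code). With a single open line, ascending seq_nr totally
orders writes; with a user and a gc line open concurrently it does
not, so the stale gc copy can win:

#include <stdio.h>

struct recovered_sector {
	unsigned long long lba;
	unsigned int line_seq_nr;	/* seq_nr of the line holding this copy */
	unsigned long long ppa;
};

static void replay(const struct recovered_sector *secs, int n,
		   unsigned long long *l2p, unsigned int *l2p_seq)
{
	for (int i = 0; i < n; i++) {
		/* newer line wins - unsound once two lines are open */
		if (secs[i].line_seq_nr >= l2p_seq[secs[i].lba]) {
			l2p[secs[i].lba] = secs[i].ppa;
			l2p_seq[secs[i].lba] = secs[i].line_seq_nr;
		}
	}
}

int main(void)
{
	/* gc line Y (seq 7) copied LBA 5 before the user rewrote it on
	 * line X (seq 6): replay by line seq_nr keeps the stale gc copy
	 */
	struct recovered_sector secs[] = {
		{ .lba = 5, .line_seq_nr = 6, .ppa = 0x100 },	/* fresh user */
		{ .lba = 5, .line_seq_nr = 7, .ppa = 0x200 },	/* stale gc */
	};
	unsigned long long l2p[8] = { 0 };
	unsigned int l2p_seq[8] = { 0 };

	replay(secs, 2, l2p, l2p_seq);
	printf("LBA 5 -> PPA 0x%llx (stale gc copy if 0x200)\n", l2p[5]);
	return 0;
}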

Thanks
Igor

> [...]
>   	if (!new)
>   		goto out;
>   
>   	spin_lock(&l_mg->free_lock);
> -	cur = l_mg->data_line;
> -	l_mg->data_line = new;
> +
> +	cur = pblk_line_get_data_line(pblk, type);
> +	pblk_line_set_data_line(pblk, type, new);
>   
>   	pblk_line_setup_metadata(new, l_mg, &pblk->lm);
>   	spin_unlock(&l_mg->free_lock);
> @@ -1659,7 +1660,7 @@ struct pblk_line *pblk_line_replace_data(struct pblk *pblk)
>   
>   retry_setup:
>   	if (!pblk_line_init_metadata(pblk, new, cur)) {
> -		new = pblk_line_retry(pblk, new);
> +		new = pblk_line_retry(pblk, new, type);
>   		if (!new)
>   			goto out;
>   
> @@ -1667,7 +1668,7 @@ struct pblk_line *pblk_line_replace_data(struct pblk *pblk)
>   	}
>   
>   	if (!pblk_line_init_bb(pblk, new, 1)) {
> -		new = pblk_line_retry(pblk, new);
> +		new = pblk_line_retry(pblk, new, type);
>   		if (!new)
>   			goto out;
>   
> @@ -1678,17 +1679,12 @@ struct pblk_line *pblk_line_replace_data(struct pblk *pblk)
>   
>   	/* Allocate next line for preparation */
>   	spin_lock(&l_mg->free_lock);
> -	l_mg->data_next = pblk_line_get(pblk);
> -	if (!l_mg->data_next) {
> +	if (!pblk_line_init_next_line(pblk, type)) {
>   		/* If we cannot get a new line, we need to stop the pipeline.
>   		 * Only allow as many writes in as we can store safely and then
>   		 * fail gracefully
>   		 */
>   		pblk_stop_writes(pblk, new);
> -		l_mg->data_next = NULL;
> -	} else {
> -		l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
> -		l_mg->data_next->type = PBLK_LINETYPE_DATA;
>   	}
>   	spin_unlock(&l_mg->free_lock);
>   
> @@ -1801,15 +1797,100 @@ int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa)
>   	return err;
>   }
>   
> -struct pblk_line *pblk_line_get_data(struct pblk *pblk)
> +struct pblk_line *pblk_line_get_data_line(struct pblk *pblk, int type)
>   {
> -	return pblk->l_mg.data_line;
> +	switch (type) {
> +	case PBLK_LINETYPE_DATA:
> +		return pblk->l_mg.data_line;
> +	case PBLK_LINETYPE_GC:
> +		return pblk->l_mg.gc_line;
> +	case PBLK_LINETYPE_LOG:
> +		return pblk->l_mg.log_line;
> +	default:
> +		WARN(1, "Unsupported line type\n");
> +		return NULL;
> +	}
>   }
>   
>   /* For now, always erase next line */
> -struct pblk_line *pblk_line_get_erase(struct pblk *pblk)
> +struct pblk_line *pblk_line_get_next_line(struct pblk *pblk, int type)
>   {
> -	return pblk->l_mg.data_next;
> +	switch (type) {
> +	case PBLK_LINETYPE_DATA:
> +		return pblk->l_mg.data_next;
> +	case PBLK_LINETYPE_GC:
> +		return pblk->l_mg.gc_next;
> +	case PBLK_LINETYPE_LOG:
> +		return pblk->l_mg.log_next;
> +	default:
> +		WARN(1, "Unsupported line type\n");
> +		return NULL;
> +	}
> +}
> +
> +void pblk_line_set_data_line(struct pblk *pblk, int type, struct pblk_line *line)
> +{
> +	switch (type) {
> +	case PBLK_LINETYPE_DATA:
> +		pblk->l_mg.data_line = line;
> +		break;
> +	case PBLK_LINETYPE_GC:
> +		pblk->l_mg.gc_line = line;
> +		break;
> +	case PBLK_LINETYPE_LOG:
> +		pblk->l_mg.log_line = line;
> +		break;
> +	default:
> +		WARN(1, "Unsupported line type\n");
> +	}
> +}
> +
> +/* For now, always erase next line */
> +void pblk_line_set_next_line(struct pblk *pblk, int type, struct pblk_line *line)
> +{
> +	switch (type) {
> +	case PBLK_LINETYPE_DATA:
> +		pblk->l_mg.data_next = line;
> +		break;
> +	case PBLK_LINETYPE_GC:
> +		pblk->l_mg.gc_next = line;
> +		break;
> +	case PBLK_LINETYPE_LOG:
> +		pblk->l_mg.log_next = line;
> +		break;
> +	default:
> +		WARN(1, "Unsupported line type\n");
> +	}
> +}
> +
> +struct pblk_line *pblk_line_init_data_line(struct pblk *pblk, int type)
> +{
> +	struct pblk_line *line = pblk_line_get(pblk);
> +	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
> +
> +	lockdep_assert_held(&l_mg->free_lock);
> +
> +	if (line) {
> +		line->seq_nr = l_mg->d_seq_nr++;
> +		line->type = type;
> +		pblk_line_set_data_line(pblk, type, line);
> +	}
> +	return line;
> +}
> +
> +struct pblk_line *pblk_line_init_next_line(struct pblk *pblk, int type)
> +{
> +	struct pblk_line *line = pblk_line_get(pblk);
> +	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
> +
> +	lockdep_assert_held(&l_mg->free_lock);
> +
> +	if (line) {
> +		line->seq_nr = l_mg->d_seq_nr++;
> +		line->type = type;
> +		pblk_line_set_next_line(pblk, type, line);
> +	}
> +	return line;
>   }
>   
>   int pblk_line_is_full(struct pblk_line *line)
> diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
> index 1e227a08e54a..0d127e32d556 100644
> --- a/drivers/lightnvm/pblk-init.c
> +++ b/drivers/lightnvm/pblk-init.c
> @@ -139,11 +139,16 @@ static int pblk_l2p_recover(struct pblk *pblk, bool factory_init)
>   
>   	if (!line) {
>   		/* Configure next line for user data */
> -		line = pblk_line_get_first_data(pblk);
> +		line = pblk_line_get_first_data(pblk, PBLK_LINETYPE_DATA);
>   		if (!line)
>   			return -EFAULT;
>   	}
>   
> +	/* Configure next line for gc data */
> +	line = pblk_line_get_first_data(pblk, PBLK_LINETYPE_GC);
> +	if (!line)
> +		return -EFAULT;
> +
>   	return 0;
>   }
>   
> @@ -832,7 +837,7 @@ static int pblk_line_mg_init(struct pblk *pblk)
>   	int i, bb_distance;
>   
>   	l_mg->nr_lines = geo->num_chk;
> -	l_mg->log_line = l_mg->data_line = NULL;
> +	l_mg->log_line = l_mg->data_line = l_mg->gc_line = NULL;
>   	l_mg->l_seq_nr = l_mg->d_seq_nr = 0;
>   	l_mg->nr_free_lines = 0;
>   	bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES);
> diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c
> index 5408e32b2f13..9b0539137df0 100644
> --- a/drivers/lightnvm/pblk-map.c
> +++ b/drivers/lightnvm/pblk-map.c
> @@ -23,9 +23,9 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
>   			      struct ppa_addr *ppa_list,
>   			      unsigned long *lun_bitmap,
>   			      void *meta_list,
> -			      unsigned int valid_secs)
> +			      unsigned int valid_secs, int type)
>   {
> -	struct pblk_line *line = pblk_line_get_data(pblk);
> +	struct pblk_line *line = pblk_line_get_data_line(pblk, type);
>   	struct pblk_emeta *emeta;
>   	struct pblk_w_ctx *w_ctx;
>   	__le64 *lba_list;
> @@ -42,7 +42,7 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
>   		/* If we cannot allocate a new line, make sure to store metadata
>   		 * on current line and then fail
>   		 */
> -		line = pblk_line_replace_data(pblk);
> +		line = pblk_line_replace_data(pblk, type);
>   		pblk_line_close_meta(pblk, prev_line);
>   
>   		if (!line) {
> @@ -94,8 +94,8 @@ static int pblk_map_page_data(struct pblk *pblk, unsigned int sentry,
>   }
>   
>   int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
> -		 unsigned long *lun_bitmap, unsigned int valid_secs,
> -		 unsigned int off)
> +		unsigned long *lun_bitmap, unsigned int valid_secs,
> +		unsigned int off, int type)
>   {
>   	void *meta_list = pblk_get_meta_for_writes(pblk, rqd);
>   	void *meta_buffer;
> @@ -110,7 +110,7 @@ int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
>   		meta_buffer = pblk_get_meta(pblk, meta_list, i);
>   
>   		ret = pblk_map_page_data(pblk, sentry + i, &ppa_list[i],
> -					lun_bitmap, meta_buffer, map_secs);
> +					 lun_bitmap, meta_buffer, map_secs, type);
>   		if (ret)
>   			return ret;
>   	}
> @@ -120,8 +120,9 @@ int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
>   
>   /* only if erase_ppa is set, acquire erase semaphore */
>   int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
> -		       unsigned int sentry, unsigned long *lun_bitmap,
> -		       unsigned int valid_secs, struct ppa_addr *erase_ppa)
> +		      unsigned int sentry, unsigned long *lun_bitmap,
> +		      unsigned int valid_secs, struct ppa_addr *erase_ppa,
> +		      int type)
>   {
>   	struct nvm_tgt_dev *dev = pblk->dev;
>   	struct nvm_geo *geo = &dev->geo;
> @@ -141,7 +142,7 @@ int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
>   		meta_buffer = pblk_get_meta(pblk, meta_list, i);
>   
>   		ret = pblk_map_page_data(pblk, sentry + i, &ppa_list[i],
> -					lun_bitmap, meta_buffer, map_secs);
> +					 lun_bitmap, meta_buffer, map_secs, type);
>   		if (ret)
>   			return ret;
>   
> @@ -150,10 +151,10 @@ int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
>   		/* line can change after page map. We might also be writing the
>   		 * last line.
>   		 */
> -		e_line = pblk_line_get_erase(pblk);
> +		e_line = pblk_line_get_next_line(pblk, type);
>   		if (!e_line)
>   			return pblk_map_rq(pblk, rqd, sentry, lun_bitmap,
> -							valid_secs, i + min);
> +					   valid_secs, i + min, type);
>   
>   		spin_lock(&e_line->lock);
>   		if (!test_bit(erase_lun, e_line->erase_bitmap)) {
> @@ -168,17 +169,17 @@ int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
>   
>   			/* Avoid evaluating e_line->left_eblks */
>   			return pblk_map_rq(pblk, rqd, sentry, lun_bitmap,
> -							valid_secs, i + min);
> +					   valid_secs, i + min, type);
>   		}
>   		spin_unlock(&e_line->lock);
>   	}
>   
> -	d_line = pblk_line_get_data(pblk);
> +	d_line = pblk_line_get_data_line(pblk, type);
>   
>   	/* line can change after page map. We might also be writing the
>   	 * last line.
>   	 */
> -	e_line = pblk_line_get_erase(pblk);
> +	e_line = pblk_line_get_next_line(pblk, type);
>   	if (!e_line)
>   		return -ENOSPC;
>   
> diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
> index 865fe310cab4..38bf7b28e73f 100644
> --- a/drivers/lightnvm/pblk-recovery.c
> +++ b/drivers/lightnvm/pblk-recovery.c
> @@ -677,12 +677,12 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
>   {
>   	struct pblk_line_meta *lm = &pblk->lm;
>   	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
> -	struct pblk_line *line, *tline, *data_line = NULL;
> +	struct pblk_line *line, *tline, *data_line = NULL, *gc_line = NULL;
>   	struct pblk_smeta *smeta;
>   	struct pblk_emeta *emeta;
>   	struct line_smeta *smeta_buf;
>   	int found_lines = 0, recovered_lines = 0, open_lines = 0;
> -	int is_next = 0;
> +	int is_data_next = 0, is_gc_next = 0;
>   	int meta_line;
>   	int i, valid_uuid = 0;
>   	LIST_HEAD(recov_list);
> @@ -838,7 +838,11 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
>   			trace_pblk_line_state(pblk_disk_name(pblk), line->id,
>   					line->state);
>   
> -			data_line = line;
> +			if (line->type == PBLK_LINETYPE_DATA)
> +				data_line = line;
> +			else if (line->type == PBLK_LINETYPE_GC)
> +				gc_line = line;
> +
>   			line->meta_line = meta_line;
>   
>   			open_lines++;
> @@ -852,19 +856,30 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
>   		spin_unlock(&l_mg->free_lock);
>   	} else {
>   		spin_lock(&l_mg->free_lock);
> -		l_mg->data_line = data_line;
> -		/* Allocate next line for preparation */
> -		l_mg->data_next = pblk_line_get(pblk);
> -		if (l_mg->data_next) {
> -			l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
> -			l_mg->data_next->type = PBLK_LINETYPE_DATA;
> -			is_next = 1;
> +		if (data_line) {
> +			pblk_line_set_data_line(pblk, PBLK_LINETYPE_DATA,
> +						data_line);
> +			/* Allocate next line for preparation */
> +			if (pblk_line_init_next_line(pblk, PBLK_LINETYPE_DATA))
> +				is_data_next = 1;
> +		}
> +		if (gc_line) {
> +			pblk_line_set_data_line(pblk, PBLK_LINETYPE_GC,
> +						gc_line);
> +			/* Allocate next line for preparation */
> +			if (pblk_line_init_next_line(pblk, PBLK_LINETYPE_GC))
> +				is_gc_next = 1;
>   		}
> +
>   		spin_unlock(&l_mg->free_lock);
>   	}
>   
> -	if (is_next)
> -		pblk_line_erase(pblk, l_mg->data_next);
> +	if (is_data_next)
> +		pblk_line_erase(pblk, pblk_line_get_next_line(pblk,
> +							   PBLK_LINETYPE_DATA));
> +	if (is_gc_next)
> +		pblk_line_erase(pblk, pblk_line_get_next_line(pblk,
> +							   PBLK_LINETYPE_GC));
>   
>   out:
>   	if (found_lines != recovered_lines)
> @@ -877,7 +892,7 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
>   /*
>    * Pad current line
>    */
> -int pblk_recov_pad(struct pblk *pblk)
> +int pblk_recov_pad(struct pblk *pblk, int type)
>   {
>   	struct pblk_line *line;
>   	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
> @@ -885,7 +900,7 @@ int pblk_recov_pad(struct pblk *pblk)
>   	int ret = 0;
>   
>   	spin_lock(&l_mg->free_lock);
> -	line = l_mg->data_line;
> +	line = pblk_line_get_data_line(pblk, type);
>   	left_msecs = line->left_msecs;
>   	spin_unlock(&l_mg->free_lock);
>   
> diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
> index 4e63f9b5954c..1e38adc63800 100644
> --- a/drivers/lightnvm/pblk-write.c
> +++ b/drivers/lightnvm/pblk-write.c
> @@ -313,10 +313,10 @@ static int pblk_alloc_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
>   }
>   
>   static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
> -			   struct ppa_addr *erase_ppa)
> +			   struct ppa_addr *erase_ppa, int type)
>   {
>   	struct pblk_line_meta *lm = &pblk->lm;
> -	struct pblk_line *e_line = pblk_line_get_erase(pblk);
> +	struct pblk_line *e_line = pblk_line_get_next_line(pblk, type);
>   	struct pblk_c_ctx *c_ctx = nvm_rq_to_pdu(rqd);
>   	unsigned int valid = c_ctx->nr_valid;
>   	unsigned int padded = c_ctx->nr_padded;
> @@ -337,10 +337,10 @@ static int pblk_setup_w_rq(struct pblk *pblk, struct nvm_rq *rqd,
>   
>   	if (likely(!e_line || !atomic_read(&e_line->left_eblks)))
>   		ret = pblk_map_rq(pblk, rqd, c_ctx->sentry, lun_bitmap,
> -							valid, 0);
> +				  valid, 0, type);
>   	else
>   		ret = pblk_map_erase_rq(pblk, rqd, c_ctx->sentry, lun_bitmap,
> -							valid, erase_ppa);
> +					valid, erase_ppa, type);
>   
>   	return ret;
>   }
> @@ -446,12 +446,13 @@ int pblk_submit_meta_io(struct pblk *pblk, struct pblk_line *meta_line)
>   
>   static inline bool pblk_valid_meta_ppa(struct pblk *pblk,
>   				       struct pblk_line *meta_line,
> -				       struct nvm_rq *data_rqd)
> +				       struct nvm_rq *data_rqd,
> +				       int type)
>   {
>   	struct nvm_tgt_dev *dev = pblk->dev;
>   	struct nvm_geo *geo = &dev->geo;
>   	struct pblk_c_ctx *data_c_ctx = nvm_rq_to_pdu(data_rqd);
> -	struct pblk_line *data_line = pblk_line_get_data(pblk);
> +	struct pblk_line *data_line = pblk_line_get_data_line(pblk, type);
>   	struct ppa_addr ppa, ppa_opt;
>   	u64 paddr;
>   	int pos_opt;
> @@ -481,7 +482,8 @@ static inline bool pblk_valid_meta_ppa(struct pblk *pblk,
>   }
>   
>   static struct pblk_line *pblk_should_submit_meta_io(struct pblk *pblk,
> -						    struct nvm_rq *data_rqd)
> +						    struct nvm_rq *data_rqd,
> +						    int type)
>   {
>   	struct pblk_line_meta *lm = &pblk->lm;
>   	struct pblk_line_mgmt *l_mg = &pblk->l_mg;
> @@ -499,13 +501,13 @@ static struct pblk_line *pblk_should_submit_meta_io(struct pblk *pblk,
>   	}
>   	spin_unlock(&l_mg->close_lock);
>   
> -	if (!pblk_valid_meta_ppa(pblk, meta_line, data_rqd))
> +	if (!pblk_valid_meta_ppa(pblk, meta_line, data_rqd, type))
>   		return NULL;
>   
>   	return meta_line;
>   }
>   
> -static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
> +static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd, int type)
>   {
>   	struct ppa_addr erase_ppa;
>   	struct pblk_line *meta_line;
> @@ -514,13 +516,13 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
>   	pblk_ppa_set_empty(&erase_ppa);
>   
>   	/* Assign lbas to ppas and populate request structure */
> -	err = pblk_setup_w_rq(pblk, rqd, &erase_ppa);
> +	err = pblk_setup_w_rq(pblk, rqd, &erase_ppa, type);
>   	if (err) {
>   		pblk_err(pblk, "could not setup write request: %d\n", err);
>   		return NVM_IO_ERR;
>   	}
>   
> -	meta_line = pblk_should_submit_meta_io(pblk, rqd);
> +	meta_line = pblk_should_submit_meta_io(pblk, rqd, type);
>   
>   	/* Submit data write for current data line */
>   	err = pblk_submit_io(pblk, rqd);
> @@ -532,11 +534,12 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
>   	if (!pblk_ppa_empty(erase_ppa)) {
>   		/* Submit erase for next data line */
>   		if (pblk_blk_erase_async(pblk, erase_ppa)) {
> -			struct pblk_line *e_line = pblk_line_get_erase(pblk);
> +			struct pblk_line *e_line;
>   			struct nvm_tgt_dev *dev = pblk->dev;
>   			struct nvm_geo *geo = &dev->geo;
>   			int bit;
>   
> +			e_line = pblk_line_get_next_line(pblk, type);
>   			atomic_inc(&e_line->left_eblks);
>   			bit = pblk_ppa_to_pos(geo, erase_ppa);
>   			WARN_ON(!test_and_clear_bit(bit, e_line->erase_bitmap));
> @@ -574,6 +577,9 @@ static int pblk_submit_write(struct pblk *pblk, int *secs_left)
>   	unsigned int secs_to_flush, packed_meta_pgs;
>   	unsigned long pos;
>   	unsigned int resubmit;
> +	int type;
> +	struct pblk_w_ctx *w_ctx;
> +	struct pblk_rb_entry *entry;
>   
>   	*secs_left = 0;
>   
> @@ -633,13 +639,18 @@ static int pblk_submit_write(struct pblk *pblk, int *secs_left)
>   	rqd = pblk_alloc_rqd(pblk, PBLK_WRITE);
>   	rqd->bio = bio;
>   
> +	entry = &pblk->rwb.entries[pos];
> +	w_ctx = &entry->w_ctx;
> +	type = w_ctx->flags & PBLK_IOTYPE_GC ?
> +		PBLK_LINETYPE_GC : PBLK_LINETYPE_DATA;
> +
>   	if (pblk_rb_read_to_bio(&pblk->rwb, rqd, pos, secs_to_sync,
>   								secs_avail)) {
>   		pblk_err(pblk, "corrupted write bio\n");
>   		goto fail_put_bio;
>   	}
>   
> -	if (pblk_submit_io_set(pblk, rqd))
> +	if (pblk_submit_io_set(pblk, rqd, type))
>   		goto fail_free_bio;
>   
>   #ifdef CONFIG_NVM_PBLK_DEBUG
> diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
> index e304754aaa3c..93ef5a2bffe6 100644
> --- a/drivers/lightnvm/pblk.h
> +++ b/drivers/lightnvm/pblk.h
> @@ -316,6 +316,7 @@ enum {
>   	PBLK_LINETYPE_FREE = 0,
>   	PBLK_LINETYPE_LOG = 1,
>   	PBLK_LINETYPE_DATA = 2,
> +	PBLK_LINETYPE_GC = 3,
>   
>   	/* Line state */
>   	PBLK_LINESTATE_NEW = 9,
> @@ -526,8 +527,10 @@ struct pblk_line_mgmt {
>   
>   	struct pblk_line *log_line;	/* Current FTL log line */
>   	struct pblk_line *data_line;	/* Current data line */
> +	struct pblk_line *gc_line;	/* Current gc line */
>   	struct pblk_line *log_next;	/* Next FTL log line */
>   	struct pblk_line *data_next;	/* Next data line */
> +	struct pblk_line *gc_next;	/* Next gc line */
>   
>   	struct list_head emeta_list;	/* Lines queued to schedule emeta */
>   
> @@ -804,14 +807,20 @@ struct bio *pblk_bio_map_addr(struct pblk *pblk, void *data,
>   			      unsigned int nr_secs, unsigned int len,
>   			      int alloc_type, gfp_t gfp_mask);
>   struct pblk_line *pblk_line_get(struct pblk *pblk);
> -struct pblk_line *pblk_line_get_first_data(struct pblk *pblk);
> -struct pblk_line *pblk_line_replace_data(struct pblk *pblk);
> +struct pblk_line *pblk_line_get_first_data(struct pblk *pblk, int type);
> +struct pblk_line *pblk_line_replace_data(struct pblk *pblk, int type);
>   void pblk_ppa_to_line_put(struct pblk *pblk, struct ppa_addr ppa);
>   void pblk_rq_to_line_put(struct pblk *pblk, struct nvm_rq *rqd);
>   int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line);
>   void pblk_line_recov_close(struct pblk *pblk, struct pblk_line *line);
> -struct pblk_line *pblk_line_get_data(struct pblk *pblk);
> -struct pblk_line *pblk_line_get_erase(struct pblk *pblk);
> +struct pblk_line *pblk_line_get_data_line(struct pblk *pblk, int type);
> +struct pblk_line *pblk_line_get_next_line(struct pblk *pblk, int type);
> +void pblk_line_set_data_line(struct pblk *pblk, int type,
> +			     struct pblk_line *line);
> +void pblk_line_set_next_line(struct pblk *pblk, int type,
> +			     struct pblk_line *line);
> +struct pblk_line *pblk_line_init_next_line(struct pblk *pblk, int type);
> +struct pblk_line *pblk_line_init_data_line(struct pblk *pblk, int type);
>   int pblk_line_erase(struct pblk *pblk, struct pblk_line *line);
>   int pblk_line_is_full(struct pblk_line *line);
>   void pblk_line_free(struct pblk_line *line);
> @@ -875,11 +884,12 @@ int pblk_write_gc_to_cache(struct pblk *pblk, struct pblk_gc_rq *gc_rq);
>    * pblk map
>    */
>   int pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
> -		       unsigned int sentry, unsigned long *lun_bitmap,
> -		       unsigned int valid_secs, struct ppa_addr *erase_ppa);
> +		      unsigned int sentry, unsigned long *lun_bitmap,
> +		      unsigned int valid_secs, struct ppa_addr *erase_ppa,
> +		      int type);
>   int pblk_map_rq(struct pblk *pblk, struct nvm_rq *rqd, unsigned int sentry,
> -		 unsigned long *lun_bitmap, unsigned int valid_secs,
> -		 unsigned int off);
> +		unsigned long *lun_bitmap, unsigned int valid_secs,
> +		unsigned int off, int type);
>   
>   /*
>    * pblk write thread
> @@ -899,7 +909,7 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq);
>    * pblk recovery
>    */
>   struct pblk_line *pblk_recov_l2p(struct pblk *pblk);
> -int pblk_recov_pad(struct pblk *pblk);
> +int pblk_recov_pad(struct pblk *pblk, int type);
>   int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta);
>   
>   /*
> 


* Re: [PATCH] lightnvm: pblk: Introduce hot-cold data separation
  2019-04-26  9:11 ` Igor Konopko
@ 2019-04-26 10:04   ` Javier González
  2019-04-26 13:46     ` Igor Konopko
  0 siblings, 1 reply; 12+ messages in thread
From: Javier González @ 2019-04-26 10:04 UTC (permalink / raw)
  To: Konopko, Igor J
  Cc: Heiner Litz, Matias Bjørling, Hans Holmberg, linux-block,
	linux-kernel



> On 26 Apr 2019, at 11.11, Igor Konopko <igor.j.konopko@intel.com> wrote:
> 
> On 25.04.2019 07:21, Heiner Litz wrote:
>> Introduce the capability to manage multiple open lines. Maintain one line
>> for user writes (hot) and a second line for gc writes (cold). As user and
>> gc writes still utilize a shared ring buffer, in rare cases a multi-sector
>> write will contain both gc and user data. This is acceptable, as on a
>> tested SSD with minimum write size of 64KB, less than 1% of all writes
>> contain both hot and cold sectors.
> 
> Hi Heiner
> 
> Generally I really like these changes; I have been thinking about something similar for a while, so it is very good to see this patch.
> 
> I have one question related to this patch, since it is not very clear to me - how do you ensure data integrity in the following scenario:
> -we have open line X for user data and line Y for GC
> -GC writes LBA=N to line Y
> -user writes LBA=N to line X
> -we have power failure when both line X and Y were not written completely
> -during pblk creation we are executing OOB metadata recovery
> And here is the question: how do we distinguish whether LBA=N from line Y or LBA=N from line X is the valid one?
> Lines X and Y might have their seq_ids in either descending or ascending order - this creates two possible scenarios as well.
> 
> Thanks
> Igor
> 

You are right, I think this is possible in the current implementation.

We need an extra constraint so that we only GC lines above the GC line
ID. This way, when we order lines on recovery, we can guarantee
consistency. This potentially means that we would need several open
lines for GC, to avoid padding in case this constraint forces us to
choose a line with an ID higher than the GC line ID.

What do you think?

Thanks,
Javier



* Re: [PATCH] lightnvm: pblk: Introduce hot-cold data separation
  2019-04-26 10:04   ` Javier González
@ 2019-04-26 13:46     ` Igor Konopko
  2019-04-26 16:23       ` Heiner Litz
  0 siblings, 1 reply; 12+ messages in thread
From: Igor Konopko @ 2019-04-26 13:46 UTC (permalink / raw)
  To: Javier González
  Cc: Heiner Litz, Matias Bjørling, Hans Holmberg, linux-block,
	linux-kernel



On 26.04.2019 12:04, Javier González wrote:
> 
>> On 26 Apr 2019, at 11.11, Igor Konopko <igor.j.konopko@intel.com> wrote:
>>
>> On 25.04.2019 07:21, Heiner Litz wrote:
>>> Introduce the capability to manage multiple open lines. Maintain one line
>>> for user writes (hot) and a second line for gc writes (cold). As user and
>>> gc writes still utilize a shared ring buffer, in rare cases a multi-sector
>>> write will contain both gc and user data. This is acceptable, as on a
>>> tested SSD with minimum write size of 64KB, less than 1% of all writes
>>> contain both hot and cold sectors.
>>
>> Hi Heiner
>>
>> Generally I really like these changes; I have been thinking about something similar for a while, so it is very good to see this patch.
>>
>> I have one question related to this patch, since it is not very clear to me - how do you ensure data integrity in the following scenario:
>> -we have open line X for user data and line Y for GC
>> -GC writes LBA=N to line Y
>> -user writes LBA=N to line X
>> -we have power failure when both line X and Y were not written completely
>> -during pblk creation we are executing OOB metadata recovery
>> And here is the question: how do we distinguish whether LBA=N from line Y or LBA=N from line X is the valid one?
>> Lines X and Y might have their seq_ids in either descending or ascending order - this creates two possible scenarios as well.
>>
>> Thanks
>> Igor
>>
> 
> You are right, I think this is possible in the current implementation.
> 
> We need an extra constraint so that we only GC lines above the GC line
> ID. This way, when we order lines on recovery, we can guarantee
> consistency. This potentially means that we would need several open
> lines for GC, to avoid padding in case this constraint forces us to
> choose a line with an ID higher than the GC line ID.
> 
> What do you think?

I'm not sure yet about your approach; I need to think about this and
analyze it a little more.

I also believe that we probably need to ensure that the current user data
line seq_id is always above the current GC line seq_id, or something like
that. We also then cannot GC any data from lines which are still open, but
I believe that this is the case even right now.

> 
> Thanks,
> Javier
> 


* Re: [PATCH] lightnvm: pblk: Introduce hot-cold data separation
  2019-04-26 13:46     ` Igor Konopko
@ 2019-04-26 16:23       ` Heiner Litz
  2019-05-01  6:19         ` Javier González
  0 siblings, 1 reply; 12+ messages in thread
From: Heiner Litz @ 2019-04-26 16:23 UTC (permalink / raw)
  To: Igor Konopko
  Cc: Javier González, Heiner Litz, Matias Bjørling,
	Hans Holmberg, linux-block, Linux Kernel Mailing List

Nice catch Igor, I hadn't thought of that.

Nevertheless, here is what I think: In the absence of a flush we don't
need to enforce ordering so we don't care about recovering the older
gc'ed write. If we completed a flush after the user write, we should
have already invalidated the gc mapping and hence will not recover it.
Let me know if I am missing something.

On Fri, Apr 26, 2019 at 6:46 AM Igor Konopko <igor.j.konopko@intel.com> wrote:
>
>
>
> On 26.04.2019 12:04, Javier González wrote:
> >
> >> On 26 Apr 2019, at 11.11, Igor Konopko <igor.j.konopko@intel.com> wrote:
> >>
> >> On 25.04.2019 07:21, Heiner Litz wrote:
> >>> Introduce the capability to manage multiple open lines. Maintain one line
> >>> for user writes (hot) and a second line for gc writes (cold). As user and
> >>> gc writes still utilize a shared ring buffer, in rare cases a multi-sector
> >>> write will contain both gc and user data. This is acceptable, as on a
> >>> tested SSD with minimum write size of 64KB, less than 1% of all writes
> >>> contain both hot and cold sectors.
> >>
> >> Hi Heiner
> >>
> >> Generally I really like these changes; I have been thinking about something similar for a while, so it is very good to see this patch.
> >>
> >> I have one question related to this patch, since it is not very clear to me - how do you ensure data integrity in the following scenario:
> >> -we have open line X for user data and line Y for GC
> >> -GC writes LBA=N to line Y
> >> -user writes LBA=N to line X
> >> -we have power failure when both line X and Y were not written completely
> >> -during pblk creation we are executing OOB metadata recovery
> >> And here is the question: how do we distinguish whether LBA=N from line Y or LBA=N from line X is the valid one?
> >> Lines X and Y might have their seq_ids in either descending or ascending order - this creates two possible scenarios as well.
> >>
> >> Thanks
> >> Igor
> >>
> >
> > You are right, I think this is possible in the current implementation.
> >
> > We need an extra constraint so that we only GC lines above the GC line
> > ID. This way, when we order lines on recovery, we can guarantee
> > consistency. This potentially means that we would need several open
> > lines for GC, to avoid padding in case this constraint forces us to
> > choose a line with an ID higher than the GC line ID.
> >
> > What do you think?
>
> I'm not sure yet about your approach; I need to think about this and
> analyze it a little more.
>
> I also believe that we probably need to ensure that the current user data
> line seq_id is always above the current GC line seq_id, or something like
> that. We also then cannot GC any data from lines which are still open, but
> I believe that this is the case even right now.
>
> >
> > Thanks,
> > Javier
> >


* Re: [PATCH] lightnvm: pblk: Introduce hot-cold data separation
  2019-04-26 16:23       ` Heiner Litz
@ 2019-05-01  6:19         ` Javier González
  2019-05-01 20:20           ` Heiner Litz
  0 siblings, 1 reply; 12+ messages in thread
From: Javier González @ 2019-05-01  6:19 UTC (permalink / raw)
  To: Heiner Litz
  Cc: Konopko, Igor J, Matias Bjørling, Hans Holmberg,
	linux-block, Linux Kernel Mailing List


> On 26 Apr 2019, at 18.23, Heiner Litz <hlitz@ucsc.edu> wrote:
> 
> Nice catch Igor, I hadn't thought of that.
> 
> Nevertheless, here is what I think: In the absence of a flush we don't
> need to enforce ordering so we don't care about recovering the older
> gc'ed write. If we completed a flush after the user write, we should
> have already invalidated the gc mapping and hence will not recover it.
> Let me know if I am missing something.

I think that this problem is orthogonal to a flush on the user path. For example

   - Write to LBA0 + completion to host
   - […]
   - GC LBA0
   - Write to LBA0 + completion to host
   - fsync() + completion
   - Power Failure

When we power up and do recovery in the current implementation, you
might end up with the old LBA0 mapping in the L2P table.

If we enforce ID ordering for GC lines this problem goes away as we can
continue ordering lines based on ID and then recovering sequentially.

Thoughts?

Thanks,
Javier

> 
> On Fri, Apr 26, 2019 at 6:46 AM Igor Konopko <igor.j.konopko@intel.com> wrote:
>> On 26.04.2019 12:04, Javier González wrote:
>>>> On 26 Apr 2019, at 11.11, Igor Konopko <igor.j.konopko@intel.com> wrote:
>>>> 
>>>> On 25.04.2019 07:21, Heiner Litz wrote:
>>>>> Introduce the capability to manage multiple open lines. Maintain one line
>>>>> for user writes (hot) and a second line for gc writes (cold). As user and
>>>>> gc writes still utilize a shared ring buffer, in rare cases a multi-sector
>>>>> write will contain both gc and user data. This is acceptable, as on a
>>>>> tested SSD with minimum write size of 64KB, less than 1% of all writes
>>>>> contain both hot and cold sectors.
>>>> 
>>>> Hi Heiner
>>>> 
>>>> Generally I really like these changes; I have been thinking about something similar for a while, so it is very good to see this patch.
>>>> 
>>>> I have one question related to this patch, since it is not very clear to me - how do you ensure data integrity in the following scenario:
>>>> -we have open line X for user data and line Y for GC
>>>> -GC writes LBA=N to line Y
>>>> -user writes LBA=N to line X
>>>> -we have power failure when both line X and Y were not written completely
>>>> -during pblk creation we are executing OOB metadata recovery
>>>> And here is the question: how do we distinguish whether LBA=N from line Y or LBA=N from line X is the valid one?
>>>> Lines X and Y might have their seq_ids in either descending or ascending order - this creates two possible scenarios as well.
>>>> 
>>>> Thanks
>>>> Igor
>>> 
>>> You are right, I think this is possible in the current implementation.
>>> 
>>> We need an extra constraint so that we only GC lines above the GC line
>>> ID. This way, when we order lines on recovery, we can guarantee
>>> consistency. This potentially means that we would need several open
>>> lines for GC, to avoid padding in case this constraint forces us to
>>> choose a line with an ID higher than the GC line ID.
>>> 
>>> What do you think?
>> 
>> I'm not sure yet about your approach; I need to think about this and
>> analyze it a little more.
>>
>> I also believe that we probably need to ensure that the current user data
>> line seq_id is always above the current GC line seq_id, or something like
>> that. We also then cannot GC any data from lines which are still open, but
>> I believe that this is the case even right now.
>> 
>>> Thanks,
>>> Javier



* Re: [PATCH] lightnvm: pblk: Introduce hot-cold data separation
  2019-05-01  6:19         ` Javier González
@ 2019-05-01 20:20           ` Heiner Litz
  2019-05-02  8:27             ` Javier González
  2019-05-02  9:08             ` Igor Konopko
  0 siblings, 2 replies; 12+ messages in thread
From: Heiner Litz @ 2019-05-01 20:20 UTC (permalink / raw)
  To: Javier González
  Cc: Konopko, Igor J, Matias Bjørling, Hans Holmberg,
	linux-block, Linux Kernel Mailing List

Javier, Igor,
you are correct. The problem exists if we have a power loss and we
have an open gc and an open user line and both contain the same LBA.
In that case, I think we need to care about the 4 scenarios:

1. user_seq_id > gc_seq_id and user_write after gc_write: No issue
2. user_seq_id > gc_seq_id and gc_write after user_write: Cannot happen,
open user lines are not gc'ed
3. gc_seq_id > user_seq_id and user_write after gc_write: RACE
4. gc_seq_id > user_seq_id and gc_write after user_write: No issue

To address 3.) we can do the following:
Whenever a gc line is opened, determine all open user lines and store
them in a field of pblk_line. When choosing a victim for GC, ignore
those lines.
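
A minimal sketch of that idea (the gc_blacklist bitmap, the
open_user_lines list and both helpers are hypothetical names used only
to illustrate the mechanism, not tested pblk code):

/* On opening a GC line, snapshot the currently open user lines;
 * victim selection then skips anything in the snapshot.
 */
static void pblk_gc_line_open(struct pblk *pblk, struct pblk_line *gc_line)
{
	struct pblk_line *user;

	bitmap_zero(gc_line->gc_blacklist, pblk->l_mg.nr_lines);
	list_for_each_entry(user, &pblk->l_mg.open_user_lines, list)
		set_bit(user->id, gc_line->gc_blacklist);
}

static bool pblk_gc_victim_ok(struct pblk_line *gc_line,
			      struct pblk_line *victim)
{
	return !test_bit(victim->id, gc_line->gc_blacklist);
}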

Let me know if that sounds good and I will send a v2
Heiner

On Tue, Apr 30, 2019 at 11:19 PM Javier González <javier@javigon.com> wrote:
>
> > On 26 Apr 2019, at 18.23, Heiner Litz <hlitz@ucsc.edu> wrote:
> >
> > Nice catch Igor, I hadn't thought of that.
> >
> > Nevertheless, here is what I think: In the absence of a flush we don't
> > need to enforce ordering so we don't care about recovering the older
> > gc'ed write. If we completed a flush after the user write, we should
> > have already invalidated the gc mapping and hence will not recover it.
> > Let me know if I am missing something.
>
> I think that this problem is orthogonal to a flush on the user path. For example
>
>    - Write to LBA0 + completion to host
>    - […]
>    - GC LBA0
>    - Write to LBA0 + completion to host
>    - fsync() + completion
>    - Power Failure
>
> When we power up and do recovery in the current implementation, you
> might end up with the old LBA0 mapping in the L2P table.
>
> If we enforce ID ordering for GC lines this problem goes away as we can
> continue ordering lines based on ID and then recovering sequentially.
>
> Thoughts?
>
> Thanks,
> Javier
>
> >
> > On Fri, Apr 26, 2019 at 6:46 AM Igor Konopko <igor.j.konopko@intel.com> wrote:
> >> On 26.04.2019 12:04, Javier González wrote:
> >>>> On 26 Apr 2019, at 11.11, Igor Konopko <igor.j.konopko@intel.com> wrote:
> >>>>
> >>>> On 25.04.2019 07:21, Heiner Litz wrote:
> >>>>> Introduce the capability to manage multiple open lines. Maintain one line
> >>>>> for user writes (hot) and a second line for gc writes (cold). As user and
> >>>>> gc writes still utilize a shared ring buffer, in rare cases a multi-sector
> >>>>> write will contain both gc and user data. This is acceptable, as on a
> >>>>> tested SSD with minimum write size of 64KB, less than 1% of all writes
> >>>>> contain both hot and cold sectors.
> >>>>
> >>>> Hi Heiner
> >>>>
> >>>> Generally I really like these changes; I have been thinking about something similar for a while, so it is very good to see this patch.
> >>>>
> >>>> I have one question related to this patch, since it is not very clear to me - how do you ensure data integrity in the following scenario:
> >>>> -we have open line X for user data and line Y for GC
> >>>> -GC writes LBA=N to line Y
> >>>> -user writes LBA=N to line X
> >>>> -we have power failure when both line X and Y were not written completely
> >>>> -during pblk creation we are executing OOB metadata recovery
> >>>> And here is the question: how do we distinguish whether LBA=N from line Y or LBA=N from line X is the valid one?
> >>>> Lines X and Y might have their seq_ids in either descending or ascending order - this creates two possible scenarios as well.
> >>>>
> >>>> Thanks
> >>>> Igor
> >>>
> >>> You are right, I think this is possible in the current implementation.
> >>>
> >>> We need an extra constraint so that we only GC lines above the GC line
> >>> ID. This way, when we order lines on recovery, we can guarantee
> >>> consistency. This potentially means that we would need several open
> >>> lines for GC, to avoid padding in case this constraint forces us to
> >>> choose a line with an ID higher than the GC line ID.
> >>>
> >>> What do you think?
> >>
> >> I'm not sure yet about your approach; I need to think about this and
> >> analyze it a little more.
> >>
> >> I also believe that we probably need to ensure that the current user data
> >> line seq_id is always above the current GC line seq_id, or something like
> >> that. We also then cannot GC any data from lines which are still open, but
> >> I believe that this is the case even right now.
> >>
> >>> Thanks,
> >>> Javier


* Re: [PATCH] lightnvm: pblk: Introduce hot-cold data separation
  2019-05-01 20:20           ` Heiner Litz
@ 2019-05-02  8:27             ` Javier González
  2019-05-02  9:08             ` Igor Konopko
  1 sibling, 0 replies; 12+ messages in thread
From: Javier González @ 2019-05-02  8:27 UTC (permalink / raw)
  To: Heiner Litz
  Cc: Konopko, Igor J, Matias Bjørling, Hans Holmberg,
	linux-block, Linux Kernel Mailing List


> On 1 May 2019, at 22.20, Heiner Litz <hlitz@ucsc.edu> wrote:
> 
> Javier, Igor,
> you are correct. The problem exists if we have a power loss and we
> have an open gc and an open user line and both contain the same LBA.
> In that case, I think we need to care about the 4 scenarios:
> 
> 1. user_seq_id > gc_seq_id and user_write after gc_write: No issue
> 2. user_seq_id > gc_seq_id and gc_write after user_write: Cannot happen,
> open user lines are not gc'ed
> 3. gc_seq_id > user_seq_id and user_write after gc_write: RACE
> 4. gc_seq_id > user_seq_id and gc_write after user_write: No issue
> 
> To address 3.) we can do the following:
> Whenever a gc line is opened, determine all open user lines and store
> them in a field of pblk_line. When choosing a victim for GC, ignore
> those lines.

What if the following happens:
  - LBA0 is mapped to line 3
  - GC kicks in
  - Open user line 5
  - Open GC line 6
  - Choose line 3 for GC
  - GC LBA0
  - LBA 0 updated and mapped to line 5
  - Power loss

In this case, recovering in order means that the last recovered mapping
for LBA0 is the one on the GC line. Note that even when the mapping has
been invalidated, scan recovery does not know this and will just update
the L2P as new lines are being recovered.

I think we need to enforce that no user line is opened prior to a new GC
line. That is, when creating a GC line, we wait until the next user line
is to be allocated, and then we assign the GC line first and the user
line second. This can be extended to several open user and GC lines, and
this way case (3) above cannot occur; a sketch of the allocation order
follows the example below. In the example above we would have:

  - LBA0 is mapped to line 3
  - GC kicks in
  - Open GC line 5.        \\ enforced
  - Open user line 6.      \\ enforced
  - Choose line 3 for GC
  - GC LBA0
  - LBA 0 updated and mapped to line 6
  - Power loss
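
As a sketch, the enforced allocation order could look like this
(a code fragment only; gc_line_needed is a hypothetical condition,
while pblk_line_init_data_line() is the helper added by this patch,
which assigns seq_nr while holding l_mg->free_lock):

	spin_lock(&l_mg->free_lock);
	/* GC line first, so gc_line->seq_nr < user_line->seq_nr */
	if (gc_line_needed)
		gc_line = pblk_line_init_data_line(pblk, PBLK_LINETYPE_GC);
	user_line = pblk_line_init_data_line(pblk, PBLK_LINETYPE_DATA);
	spin_unlock(&l_mg->free_lock);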

Javier



* Re: [PATCH] lightnvm: pblk: Introduce hot-cold data separation
  2019-05-01 20:20           ` Heiner Litz
  2019-05-02  8:27             ` Javier González
@ 2019-05-02  9:08             ` Igor Konopko
  2019-05-06  5:16               ` Heiner Litz
  1 sibling, 1 reply; 12+ messages in thread
From: Igor Konopko @ 2019-05-02  9:08 UTC (permalink / raw)
  To: Heiner Litz, Javier González
  Cc: Matias Bjørling, Hans Holmberg, linux-block,
	Linux Kernel Mailing List



On 01.05.2019 22:20, Heiner Litz wrote:
> Javier, Igor,
> you are correct. The problem exists if we have a power loss and we
> have an open gc and an open user line and both contain the same LBA.
> In that case, I think we need to care about the 4 scenarios:
> 
> 1. user_seq_id > gc_seq_id and user_write after gc_write: No issue
> 2. user_seq_id > gc_seq_id and gc_write after user_write: Cannot happen,
> open user lines are not gc'ed

Maybe it is just a theoretical scenario, but I'm not seeing any
reason why this cannot happen in the pblk implementation:
Let's assume that user line X+1 is opened while GC line X is already open
and the user line is closed while GC line X is still in use. Then GC
quickly chooses user line X+1 as a GC victim and we hit the 2nd case.

> 3. gc_seq_id > user_seq_id and user_write after gc_write: RACE
> 4. gc_seq_id > user_seq_id and gc_write after user_write: No issue
> 
> To address 3.) we can do the following:
> Whenever a gc line is opened, determine all open user lines and store
> them in a field of pblk_line. When choosing a victim for GC, ignore
> those lines.

Your solution sounds right, but based on my previous comment about the
2nd case I would extend it with something like: when opening a new user
data line, also add its line ID to this "blacklist" for the GC selection.

Igor

> 
> Let me know if that sounds good and I will send a v2
> Heiner
> 
> On Tue, Apr 30, 2019 at 11:19 PM Javier González <javier@javigon.com> wrote:
>>
>>> On 26 Apr 2019, at 18.23, Heiner Litz <hlitz@ucsc.edu> wrote:
>>>
>>> Nice catch Igor, I hadn't thought of that.
>>>
>>> Nevertheless, here is what I think: In the absence of a flush we don't
>>> need to enforce ordering so we don't care about recovering the older
>>> gc'ed write. If we completed a flush after the user write, we should
>>> have already invalidated the gc mapping and hence will not recover it.
>>> Let me know if I am missing something.
>>
>> I think that this problem is orthogonal to a flush on the user path. For example
>>
>>     - Write to LBA0 + completion to host
>>     - […]
>>     - GC LBA0
>>     - Write to LBA0 + completion to host
>>     - fsync() + completion
>>     - Power Failure
>>
>> When we power up and do recovery in the current implementation, you
>> might end up with the old LBA0 mapping in the L2P table.
>>
>> If we enforce ID ordering for GC lines this problem goes away as we can
>> continue ordering lines based on ID and then recovering sequentially.
>>
>> Thoughts?
>>
>> Thanks,
>> Javier
>>
>>>
>>> On Fri, Apr 26, 2019 at 6:46 AM Igor Konopko <igor.j.konopko@intel.com> wrote:
>>>> On 26.04.2019 12:04, Javier González wrote:
>>>>>> On 26 Apr 2019, at 11.11, Igor Konopko <igor.j.konopko@intel.com> wrote:
>>>>>>
>>>>>> On 25.04.2019 07:21, Heiner Litz wrote:
>>>>>>> Introduce the capability to manage multiple open lines. Maintain one line
>>>>>>> for user writes (hot) and a second line for gc writes (cold). As user and
>>>>>>> gc writes still utilize a shared ring buffer, in rare cases a multi-sector
>>>>>>> write will contain both gc and user data. This is acceptable, as on a
>>>>>>> tested SSD with minimum write size of 64KB, less than 1% of all writes
>>>>>>> contain both hot and cold sectors.
>>>>>>
>>>>>> Hi Heiner
>>>>>>
>>>>>> Generally I really like these changes; I have been thinking about something similar for a while, so it is very good to see this patch.
>>>>>>
>>>>>> I have one question related to this patch, since it is not very clear to me - how do you ensure data integrity in the following scenario:
>>>>>> -we have open line X for user data and line Y for GC
>>>>>> -GC writes LBA=N to line Y
>>>>>> -user writes LBA=N to line X
>>>>>> -we have power failure when both line X and Y were not written completely
>>>>>> -during pblk creation we are executing OOB metadata recovery
>>>>>> And here is the question: how do we distinguish whether LBA=N from line Y or LBA=N from line X is the valid one?
>>>>>> Lines X and Y might have their seq_ids in either descending or ascending order - this creates two possible scenarios as well.
>>>>>>
>>>>>> Thanks
>>>>>> Igor
>>>>>
>>>>> You are right, I think this is possible in the current implementation.
>>>>>
>>>>> We need an extra constraint so that we only GC lines above the GC line
>>>>> ID. This way, when we order lines on recovery, we can guarantee
>>>>> consistency. This potentially means that we would need several open
>>>>> lines for GC, to avoid padding in case this constraint forces us to
>>>>> choose a line with an ID higher than the GC line ID.
>>>>>
>>>>> What do you think?
>>>>
>>>> I'm not sure yet about your approach; I need to think about this and
>>>> analyze it a little more.
>>>>
>>>> I also believe that we probably need to ensure that the current user data
>>>> line seq_id is always above the current GC line seq_id, or something like
>>>> that. We also then cannot GC any data from lines which are still open, but
>>>> I believe that this is the case even right now.
>>>>
>>>>> Thanks,
>>>>> Javier


* Re: [PATCH] lightnvm: pblk: Introduce hot-cold data separation
  2019-05-02  9:08             ` Igor Konopko
@ 2019-05-06  5:16               ` Heiner Litz
  2019-05-07  5:27                 ` Javier González
  0 siblings, 1 reply; 12+ messages in thread
From: Heiner Litz @ 2019-05-06  5:16 UTC (permalink / raw)
  To: Igor Konopko
  Cc: Javier González, Matias Bjørling, Hans Holmberg,
	linux-block, Linux Kernel Mailing List

Igor, Javier,

both of you are right. Here is what I came up with after some more thinking.

We can avoid the races in 2. and 3. with the following two invariants:
I1: If we have a GC line with seq_id X, only garbage collect from
lines older than X (this addresses 2.)
I2: Guarantee that the open GC line always has a smaller seq_id than
all open user lines (this addresses 3)

We can enforce I2 by adding a minor seq_id. The major sequence id is
only incremented when allocating a user line. Whenever a GC line is
allocated we read the current major seq_id (open user line) and
increment the minor seq_id. This allows us to order all GC lines
before the open user line during recovery.
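
One way to read this as a recovery-time comparator (types and names
invented for illustration, not part of the patch):

/* Order lines by (major, minor): user lines take (major = N, minor = 0)
 * with N bumped per user line; a GC line snapshots the major of the
 * then-open user line and bumps a minor counter.  Sorting GC lines
 * before the user line with the same major makes the user write win
 * for a duplicate LBA during replay.
 */
struct line_seq {
	u64 major;	/* bumped only on user line allocation */
	u64 minor;	/* bumped on GC line allocation, 0 for user lines */
	bool is_gc;
};

static int line_seq_cmp(const struct line_seq *a, const struct line_seq *b)
{
	if (a->major != b->major)
		return a->major < b->major ? -1 : 1;
	if (a->is_gc != b->is_gc)
		return a->is_gc ? -1 : 1;	/* GC sorts first within a major */
	if (a->minor != b->minor)
		return a->minor < b->minor ? -1 : 1;
	return 0;
}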

Problem with this approach:
Consider the following example: There exist user lines U0, U1, U2
(where 0,1,2 are seq_ids) and a non-empty GC5 line (with seq_id 5). If
we now do only sequential writes, all user lines will be overwritten
without GC being required. As a result, data will now reside on U6,
U7, U8. If we now need to GC, we cannot because of I1.
Solution: We cannot fast-forward the GC line's seq_id because it
contains old data, so pad the GC line with zeros, close it and open a
new GC9 line.

Generality:
This approach extends to schemes that use e.g. hot, warm, cold open
lines (adding a minor_minor_seq_id)

Heiner


On Thu, May 2, 2019 at 2:08 AM Igor Konopko <igor.j.konopko@intel.com> wrote:
>
>
>
> On 01.05.2019 22:20, Heiner Litz wrote:
> > Javier, Igor,
> > you are correct. The problem exists if we have a power loss and we
> > have an open gc and an open user line and both contain the same LBA.
> > In that case, I think we need to care about the 4 scenarios:
> >
> > 1. user_seq_id > gc_seq_id and user_write after gc_write: No issue
> > 2. user_seq_id > gc_seq_id and gc_write after user_write: Cannot happen,
> > open user lines are not gc'ed
>
> Maybe it is just a theoretical scenario, but I'm not seeing any
> reason why this cannot happen in the pblk implementation:
> Let's assume that user line X+1 is opened while GC line X is already open
> and the user line is closed while GC line X is still in use. Then GC
> quickly chooses user line X+1 as a GC victim and we hit the 2nd case.
>
> > 3. gc_seq_id > user_seq_id and user_write after gc_write: RACE
> > 4. gc_seq_id > user_seq_id and gc_write after user_write: No issue
> >
> > To address 3.) we can do the following:
> > Whenever a gc line is opened, determine all open user lines and store
> > them in a field of pblk_line. When choosing a victim for GC, ignore
> > those lines.
>
> Your solution sounds right, but based on my previous comment about the
> 2nd case I would extend it with something like: when opening a new user
> data line, also add its line ID to this "blacklist" for the GC selection.
>
> Igor
>
> >
> > Let me know if that sounds good and I will send a v2
> > Heiner
> >
> > On Tue, Apr 30, 2019 at 11:19 PM Javier González <javier@javigon.com> wrote:
> >>
> >>> On 26 Apr 2019, at 18.23, Heiner Litz <hlitz@ucsc.edu> wrote:
> >>>
> >>> Nice catch Igor, I hadn't thought of that.
> >>>
> >>> Nevertheless, here is what I think: In the absence of a flush we don't
> >>> need to enforce ordering so we don't care about recovering the older
> >>> gc'ed write. If we completed a flush after the user write, we should
> >>> have already invalidated the gc mapping and hence will not recover it.
> >>> Let me know if I am missing something.
> >>
> >> I think that this problem is orthogonal to a flush on the user path. For example
> >>
> >>     - Write to LBA0 + completion to host
> >>     - […]
> >>     - GC LBA0
> >>     - Write to LBA0 + completion to host
> >>     - fsync() + completion
> >>     - Power Failure
> >>
> >> When we power up and do recovery in the current implementation, you
> >> might end up with the old LBA0 mapping in the L2P table.
> >>
> >> If we enforce ID ordering for GC lines this problem goes away as we can
> >> continue ordering lines based on ID and then recovering sequentially.
> >>
> >> Thoughts?
> >>
> >> Thanks,
> >> Javier
> >>
> >>>
> >>> On Fri, Apr 26, 2019 at 6:46 AM Igor Konopko <igor.j.konopko@intel.com> wrote:
> >>>> On 26.04.2019 12:04, Javier González wrote:
> >>>>>> On 26 Apr 2019, at 11.11, Igor Konopko <igor.j.konopko@intel.com> wrote:
> >>>>>>
> >>>>>> On 25.04.2019 07:21, Heiner Litz wrote:
> >>>>>>> Introduce the capability to manage multiple open lines. Maintain one line
> >>>>>>> for user writes (hot) and a second line for gc writes (cold). As user and
> >>>>>>> gc writes still utilize a shared ring buffer, in rare cases a multi-sector
> >>>>>>> write will contain both gc and user data. This is acceptable, as on a
> >>>>>>> tested SSD with minimum write size of 64KB, less than 1% of all writes
> >>>>>>> contain both hot and cold sectors.
> >>>>>>
> >>>>>> Hi Heiner
> >>>>>>
> >>>>>> Generally I really like these changes; I have been thinking about something similar for a while, so it is very good to see this patch.
> >>>>>>
> >>>>>> I have one question related to this patch, since it is not very clear to me - how do you ensure data integrity in the following scenario:
> >>>>>> -we have open line X for user data and line Y for GC
> >>>>>> -GC writes LBA=N to line Y
> >>>>>> -user writes LBA=N to line X
> >>>>>> -we have power failure when both line X and Y were not written completely
> >>>>>> -during pblk creation we are executing OOB metadata recovery
> >>>>>> And here is the question: how do we distinguish whether LBA=N from line Y or LBA=N from line X is the valid one?
> >>>>>> Lines X and Y might have their seq_ids in either descending or ascending order - this creates two possible scenarios as well.
> >>>>>>
> >>>>>> Thanks
> >>>>>> Igor
> >>>>>
> >>>>> You are right, I think this is possible in the current implementation.
> >>>>>
> >>>>> We need an extra constraint so that we only GC lines above the GC line
> >>>>> ID. This way, when we order lines on recovery, we can guarantee
> >>>>> consistency. This potentially means that we would need several open
> >>>>> lines for GC, to avoid padding in case this constraint forces us to
> >>>>> choose a line with an ID higher than the GC line ID.
> >>>>>
> >>>>> What do you think?
> >>>>
> >>>> I'm not sure yet about your approach; I need to think about this and
> >>>> analyze it a little more.
> >>>>
> >>>> I also believe that we probably need to ensure that the current user data
> >>>> line seq_id is always above the current GC line seq_id, or something like
> >>>> that. We also then cannot GC any data from lines which are still open, but
> >>>> I believe that this is the case even right now.
> >>>>
> >>>>> Thanks,
> >>>>> Javier


* Re: [PATCH] lightnvm: pblk: Introduce hot-cold data separation
  2019-05-06  5:16               ` Heiner Litz
@ 2019-05-07  5:27                 ` Javier González
  0 siblings, 0 replies; 12+ messages in thread
From: Javier González @ 2019-05-07  5:27 UTC (permalink / raw)
  To: Heiner Litz
  Cc: Konopko, Igor J, Matias Bjørling, Hans Holmberg,
	linux-block, Linux Kernel Mailing List


> On 6 May 2019, at 07.16, Heiner Litz <hlitz@ucsc.edu> wrote:
> 
> Igor, Javier,
> 
> both of you are right. Here is what I came up with after some more thinking.
> 
> We can avoid the races in 2. and 3. with the following two invariants:
> I1: If we have a GC line with seq_id X, only garbage collect from
> lines older than X (this addresses 2.)
> I2: Guarantee that the open GC line always has a smaller seq_id than
> all open user lines (this addresses 3)
> 
> We can enforce I2 by adding a minor seq_id. The major sequence id is
> only incremented when allocating a user line. Whenever a GC line is
> allocated we read the current major seq_id (open user line) and
> increment the minor seq_id. This allows us to order all GC lines
> before the open user line during recovery.
> 
> Problem with this approach:
> Consider the following example: There exist user lines U0, U1, U2
> (where 0,1,2 are seq_ids) and a non-empty GC5 line (with seq_id 5). If
> we now do only sequential writes, all user lines will be overwritten
> without GC being required. As a result, data will now reside on U6,
> U7, U8. If we now need to GC, we cannot because of I1.
> Solution: We cannot fast-forward the GC line's seq_id because it
> contains old data, so pad the GC line with zeros, close it and open a
> new GC9 line.
> 
> Generality:
> This approach extends to schemes that use e.g. hot, warm, cold open
> lines (adding a minor_minor_seq_id)
> 
> Heiner


Looks like a good solution that allows us to maintain the current mapping model.

Javier



end of thread

Thread overview: 12+ messages
2019-04-25  5:21 [PATCH] lightnvm: pblk: Introduce hot-cold data separation Heiner Litz
2019-04-25  5:49 ` Javier González
2019-04-26  9:11 ` Igor Konopko
2019-04-26 10:04   ` Javier González
2019-04-26 13:46     ` Igor Konopko
2019-04-26 16:23       ` Heiner Litz
2019-05-01  6:19         ` Javier González
2019-05-01 20:20           ` Heiner Litz
2019-05-02  8:27             ` Javier González
2019-05-02  9:08             ` Igor Konopko
2019-05-06  5:16               ` Heiner Litz
2019-05-07  5:27                 ` Javier González
