linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* Re: [PATCH 2/4] mtd: nand: implement two pairing scheme
@ 2016-06-11 22:30 George Spelvin
  2016-06-12  7:20 ` Boris Brezillon
  0 siblings, 1 reply; 14+ messages in thread
From: George Spelvin @ 2016-06-11 22:30 UTC (permalink / raw)
  To: boris.brezillon; +Cc: linux, linux-kernel, richard

I was just browsing LKML history and wanted to understand this
concept, but while reading I think I spotted an error.


+static void nand_pairing_dist3_get_info(struct mtd_info *mtd, int page,
+					struct mtd_pairing_info *info)
+{
+	int lastpage = (mtd->erasesize / mtd->writesize) - 1;
+	int dist = 3;
+
+	if (page == lastpage)
+		dist = 2;
+
+	if (!page || (page & 1)) {
+		info->group = 0;
+		info->pair = (page + 1) / 2;
+	} else {
+		info->group = 1;
+		info->pair = (page + 1 - dist) / 2;
+	}
+}

As I understand this, the general rule is that odd pages i are paired with
even pages i+3, and group = !(i & 1).

If there are N pages total (numbered 0..N-1), this leaves four exceptions
to deal with:

-3 would be apired with 0
-1 would be paired with 2
N-3 would be paired with N
N-1 would be paired with N+2

This is solved by pairing 0 with 2, and N-1 with N-3.

In terms of group addresses, there are only two exceptions:
* 0 is pair 0, group 0 (not pair -1, group 1)
* N-1 is pair N/2/-1, group 1 (not pair N/2, group 0)

You have the first exception correct, but not the second; the condition
detects it, but the change to "dist" doesn't matter since the first half
of the second if() will be taken.


I think the easiest way to get this right is the following:

	page = page + (page != 0) + (page == lastpage);

	info->group = page & 1;
	if (page & 1)
		page -= dist;
	info->pair = page >> 1;

Rather than make up too many rules, this just maps 0 -> -1 and N-1 -> N,
then applies the general rule.

It also applies an offset of +1, to avoid negative numbers and the
problems of signed divides.


Also, a very minor note: divides are expensive.
A cheaper way to evaluate the "page == lastpage" condition is

	if ((page + 1) * mtd->writesize == mtd->erasesize)

(or you could add an mtd->write_per_erase field).


Applying all of this, the corrected code looks like the following:

(Note the addition of "const" and "__pure" annotations, which should
get copied to the "struct mtd_pairing_scheme" declaration.)

Signed-off-by: George Spelvin <linux@sciencehorizons.net>

/*
 * In distance-3 pairing, odd pages i are group 0, and are paired
 * with even pages i+3.  The exceptions are the first page (page 0)
 * and last page (page N-1) in an erase group.  These pair as if
 * they were pages -1 and N, respectively.
 */
static void nand_pairing_dist3_get_info(const struct mtd_info *mtd, int page,
					struct mtd_pairing_info *info)
{
	page += (page != 0) + ((page + 1) * mtd->writesize == mtd->erasesize);

	info->group = page & 1;
	if (page & 1)
		page -= 3;
	info->pair = page >> 1;
}

static int __pure nand_pairing_dist3_get_wunit(const struct mtd_info *mtd,
					const struct mtd_pairing_info *info)
{
	int page = 2 * info->pair + 3 * info->group;

	page -= (page != 0) + (page * mtd->writesize > mtd->erasesize);
	return page;
}

const struct mtd_pairing_scheme dist3_pairing_scheme = {
	.ngroups = 2,
	.get_info = nand_pairing_dist3_get_info,
	.get_wunit = nand_pairing_dist3_get_wunit,
};
EXPORT_SYMBOL_GPL(dist3_pairing_scheme);

/*
 * Distance-6 pairing works like distance-3 pairing, except that pages
 * are taken two at a time.  The lsbit of the page number is chopped off
 * and later re-added as the lsbit of the pair number.
 */
static void nand_pairing_dist6_get_info(const struct mtd_info *mtd, int page,
					struct mtd_pairing_info *info)
{
	bool lsbit = page & 1;

	page >>= 1;
	page += (page != 0) + ((page+1) * 2*mtd->writesize == mtd->erasesize);

	info->group = page & 1;
	if (page & 1)
		page -= 3;
	info->pair = page | lsbit;
}

static int __pure nand_pairing_dist6_get_wunit(const struct mtd_info *mtd,
				       const struct mtd_pairing_info *info)
{
	int page = (info->pair & ~1) + 3 * info->group;

	page -= (page != 0) + (page * 2 * mtd->writesize > mtd->erasesize);
	return 2 * page + (info->pair & 1);
}

const struct mtd_pairing_scheme dist6_pairing_scheme = {
	.ngroups = 2,
	.get_info = nand_pairing_dist6_get_info,
	.get_wunit = nand_pairing_dist6_get_wunit,
};
EXPORT_SYMBOL_GPL(dist6_pairing_scheme);

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/4] mtd: nand: implement two pairing scheme
  2016-06-11 22:30 [PATCH 2/4] mtd: nand: implement two pairing scheme George Spelvin
@ 2016-06-12  7:20 ` Boris Brezillon
  2016-06-12  9:23   ` George Spelvin
  0 siblings, 1 reply; 14+ messages in thread
From: Boris Brezillon @ 2016-06-12  7:20 UTC (permalink / raw)
  To: George Spelvin; +Cc: linux-kernel, richard, linux-mtd, Brian Norris

+ Brian and the MTD ML

Hi George,

On 11 Jun 2016 18:30:04 -0400
"George Spelvin" <linux@sciencehorizons.net> wrote:

> I was just browsing LKML history and wanted to understand this
> concept, but while reading I think I spotted an error.
> 
> 
> +static void nand_pairing_dist3_get_info(struct mtd_info *mtd, int page,
> +					struct mtd_pairing_info *info)
> +{
> +	int lastpage = (mtd->erasesize / mtd->writesize) - 1;
> +	int dist = 3;
> +
> +	if (page == lastpage)
> +		dist = 2;
> +
> +	if (!page || (page & 1)) {
> +		info->group = 0;
> +		info->pair = (page + 1) / 2;
> +	} else {
> +		info->group = 1;
> +		info->pair = (page + 1 - dist) / 2;
> +	}
> +}
> 
> As I understand this, the general rule is that odd pages i are paired with
> even pages i+3, and group = !(i & 1).
> 
> If there are N pages total (numbered 0..N-1), this leaves four exceptions
> to deal with:
> 
> -3 would be apired with 0
> -1 would be paired with 2
> N-3 would be paired with N
> N-1 would be paired with N+2
> 
> This is solved by pairing 0 with 2, and N-1 with N-3.
> 
> In terms of group addresses, there are only two exceptions:
> * 0 is pair 0, group 0 (not pair -1, group 1)
> * N-1 is pair N/2/-1, group 1 (not pair N/2, group 0)
> 
> You have the first exception correct, but not the second; the condition
> detects it, but the change to "dist" doesn't matter since the first half
> of the second if() will be taken.

Indeed. Nice catch!

> 
> 
> I think the easiest way to get this right is the following:
> 
> 	page = page + (page != 0) + (page == lastpage);
> 
> 	info->group = page & 1;
> 	if (page & 1)
> 		page -= dist;
> 	info->pair = page >> 1;
> 
> Rather than make up too many rules, this just maps 0 -> -1 and N-1 -> N,
> then applies the general rule.
> 
> It also applies an offset of +1, to avoid negative numbers and the
> problems of signed divides.

It seems to cover all cases.

> 
> 
> Also, a very minor note: divides are expensive.
> A cheaper way to evaluate the "page == lastpage" condition is
> 
> 	if ((page + 1) * mtd->writesize == mtd->erasesize)
> 
> (or you could add an mtd->write_per_erase field).

Okay. Actually I'd like to avoid adding new 'conversion' fields to the
mtd_info struct. Not sure we are really improving perfs when doing that,
since what takes long is the I/O ops between the flash and the
controller not the conversion operations.

> 
> 
> Applying all of this, the corrected code looks like the following:
> 
> (Note the addition of "const" and "__pure" annotations, which should
> get copied to the "struct mtd_pairing_scheme" declaration.)

I didn't know about the __pure attribute, thanks for mentioning it.
I agree on the const specifier.

> 
> Signed-off-by: George Spelvin <linux@sciencehorizons.net>
> 
> /*
>  * In distance-3 pairing, odd pages i are group 0, and are paired
>  * with even pages i+3.  The exceptions are the first page (page 0)
>  * and last page (page N-1) in an erase group.  These pair as if
>  * they were pages -1 and N, respectively.
>  */
> static void nand_pairing_dist3_get_info(const struct mtd_info *mtd, int page,
> 					struct mtd_pairing_info *info)
> {
> 	page += (page != 0) + ((page + 1) * mtd->writesize == mtd->erasesize);

Can we have a boolean to make it clearer?

	bool lastpage = ((page + 1) * mtd->writesize) == mtd->erasesize;

Also, the page update is quite obscure for people that did not have the
explanation you gave above. Can we make it

	/*
	 * The first and last pages are not surrounded by other pages,
	 * and are thus less sensitive to read/write disturbance.
	 * That's why NAND vendors decided to use a different distance
	 * for these 2 specific case, which complicates a bit the
	 * pairing scheme logic.
	 *
	 * To simplify the code, and keep the same distance for
	 * everyone, we'll increment all pages by 1 except the first
	 * one (page 0).
	 * The last page receives an extra +1 for the same reason.
	 */
	page += (page != 0) + lastpage;

> 

	/*
	 * The datasheets state that odd pages should be part of group
	 * 0 and even ones part of group 1 (except for the last and
	 * first pages) but this has changed when we added + 1 to the
	 * page variable.
	 */
> 	info->group = page & 1;

	/*
	 * We're trying to extract the pair id, which is always equal
	 * to first_page_of_a_pair / 2. Subtract the distance to the
	 * page if it's not part of group 0.
	 */
> 	if (page & 1)
> 		page -= 3;
> 	info->pair = page >> 1;
> }
> 
> static int __pure nand_pairing_dist3_get_wunit(const struct mtd_info *mtd,
> 					const struct mtd_pairing_info *info)
> {

I'll add the same kind of comment here.

> 	int page = 2 * info->pair + 3 * info->group;
> 
> 	page -= (page != 0) + (page * mtd->writesize > mtd->erasesize);
> 	return page;
> }
> 
> const struct mtd_pairing_scheme dist3_pairing_scheme = {
> 	.ngroups = 2,
> 	.get_info = nand_pairing_dist3_get_info,
> 	.get_wunit = nand_pairing_dist3_get_wunit,
> };
> EXPORT_SYMBOL_GPL(dist3_pairing_scheme);
> 
> /*
>  * Distance-6 pairing works like distance-3 pairing, except that pages
>  * are taken two at a time.  The lsbit of the page number is chopped off
>  * and later re-added as the lsbit of the pair number.
>  */
> static void nand_pairing_dist6_get_info(const struct mtd_info *mtd, int page,
> 					struct mtd_pairing_info *info)
> {
> 	bool lsbit = page & 1;
> 
> 	page >>= 1;
> 	page += (page != 0) + ((page+1) * 2*mtd->writesize == mtd->erasesize);
> 
> 	info->group = page & 1;
> 	if (page & 1)
> 		page -= 3;
> 	info->pair = page | lsbit;
> }
> 
> static int __pure nand_pairing_dist6_get_wunit(const struct mtd_info *mtd,
> 				       const struct mtd_pairing_info *info)
> {
> 	int page = (info->pair & ~1) + 3 * info->group;
> 
> 	page -= (page != 0) + (page * 2 * mtd->writesize > mtd->erasesize);
> 	return 2 * page + (info->pair & 1);
> }
> 
> const struct mtd_pairing_scheme dist6_pairing_scheme = {
> 	.ngroups = 2,
> 	.get_info = nand_pairing_dist6_get_info,
> 	.get_wunit = nand_pairing_dist6_get_wunit,
> };
> EXPORT_SYMBOL_GPL(dist6_pairing_scheme);
> 
> 

Thanks for your valuable review/suggestions.

Just out of curiosity, why are you interested in the pairing scheme
concept? Are you working with NANDs?

Best Regards,

Boris

-- 
Boris Brezillon, Free Electrons
Embedded Linux and Kernel engineering
http://free-electrons.com

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/4] mtd: nand: implement two pairing scheme
  2016-06-12  7:20 ` Boris Brezillon
@ 2016-06-12  9:23   ` George Spelvin
  2016-06-12 11:11     ` Boris Brezillon
  0 siblings, 1 reply; 14+ messages in thread
From: George Spelvin @ 2016-06-12  9:23 UTC (permalink / raw)
  To: boris.brezillon, linux
  Cc: computersforpeace, linux-kernel, linux-mtd, richard

>> It also applies an offset of +1, to avoid negative numbers and the
>> problems of signed divides.

> It seems to cover all cases.

I wasn't sure why you used a signed int for the interface.

(Another thing I thought of, but am less sure of, is packing the group
and pair numbers into a register-passable int rather than a structure.
Even 2 bits for the group is probably the most that will ever be needed,
but it's easy to say the low 4 bits are the group and the high 28 are
the pair.  Just create a few access macros to pull them apart.

This was inspired by Linus's hash_len abstraction, recently moved to
<linux/stringhash.h>)

>> (or you could add an mtd->write_per_erase field).

> Okay. Actually I'd like to avoid adding new 'conversion' fields to the
> mtd_info struct. Not sure we are really improving perfs when doing that,
> since what takes long is the I/O ops between the flash and the
> controller not the conversion operations.

Well, yes, but you may need to do conversion ops for in-memory cache
lookups or searching for free blocks, or wear-levelling computations,
all of which may involve a great many conversions per actual I/O.

(In hindsight, I'd wish for writesize and write_per_erase, and not
store erasesize explicitly.  Not only is the multiply more efficient,
but it abolishes the error of an erase size which is not a multiple of
the write size by making it impossible.)

> Can we have a boolean to make it clearer?
>
>	bool lastpage = ((page + 1) * mtd->writesize) == mtd->erasesize;

An improvement IMHO.  You can use the same name in all four functions
to make the equivalence clear.

> Also, the page update is quite obscure for people that did not have the
> explanation you gave above. Can we make it

>	/*
>	 * The first and last pages are not surrounded by other pages,
>	 * and are thus less sensitive to read/write disturbance.
>	 * That's why NAND vendors decided to use a different distance
>	 * for these 2 specific case, which complicates a bit the
>	 * pairing scheme logic.

Um... this is, as far as I can tell, complete nonsense.

I realize you know this about a thousand times better than I do, so
I'm hesitant to make such a strong statement, but one thing that I do
know is that paired pages are stored in the exact same transistors.
The pairing is purely a logical addressing distance.  The physical
distance is exactly zero.

The qustion is why they chose this particular *logival* addressing
scheme, and I believe the reason is write bandwidth for the common case
of streaming writes to consecutive pages.

The obvious thing to do is pair consecutive even and odd pages (pages 0 and 1,
then 2 and 3, then...), but that makes it hard to pipeline programming of the
two pages.  You can't start programming page 1 until page 0 is finished.

The next obvious thing is stride-2: 0<->2, 1<->3, 4<->6, 5<->7, etc.

This is done in some MLC chips.  See p. 98 of this Micron data sheet:
http://pdf.datasheet.directory/datasheets-0/micron_technology/MT29F32G08CBACAWP_C.pdf
which has a stride-4 pairing.  0..3 pair with 4..8, then 9..11 with
12..15, and so on.

However, it's desirable to alternate group-0 and group-1 pages, since
the write operations are rather different and even take different amounts
of time.  Alternating them makes it possible to:
1) Possibly overlap parts of the writes that use different on-chip resources,
2) Average the non-overlapping times for minimum jitter.

This leads naturally to the stride-3 solution.  You want to minimize the
stride because you can read both pages in a pair with one read disturbance,
and the shorter the distance, the more likely you'll want both pages
(and the less buffering you'll need to make both available).

Stride-3 does have those two awkward edge cases, and changing the
stride is simply the simplest way to special-case them.

> Thanks for your valuable review/suggestions.
>
> Just out of curiosity, why are you interested in the pairing scheme
> concept? Are you working with NANDs?

Not at present, but I do embedded hardware and might some day.

Also, the data sheets are a real PITA to find.  I have yet to
see an actual data sheet that documents the stride-3 pairing scheme.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/4] mtd: nand: implement two pairing scheme
  2016-06-12  9:23   ` George Spelvin
@ 2016-06-12 11:11     ` Boris Brezillon
  2016-06-12 12:25       ` George Spelvin
  0 siblings, 1 reply; 14+ messages in thread
From: Boris Brezillon @ 2016-06-12 11:11 UTC (permalink / raw)
  To: George Spelvin
  Cc: computersforpeace, linux-kernel, linux-mtd, richard,
	Bean Huo 霍斌斌 (beanhuo)

On 12 Jun 2016 05:23:13 -0400
"George Spelvin" <linux@sciencehorizons.net> wrote:

> >> It also applies an offset of +1, to avoid negative numbers and the
> >> problems of signed divides.  
> 
> > It seems to cover all cases.  
> 
> I wasn't sure why you used a signed int for the interface.

No real reason other than consistency with other prototypes where page
is always expressed as an integer.

> 
> (Another thing I thought of, but am less sure of, is packing the group
> and pair numbers into a register-passable int rather than a structure.
> Even 2 bits for the group is probably the most that will ever be needed,
> but it's easy to say the low 4 bits are the group and the high 28 are
> the pair.  Just create a few access macros to pull them apart.

We could indeed do that, but again, do we really need to optimize
things like that?

> 
> This was inspired by Linus's hash_len abstraction, recently moved to
> <linux/stringhash.h>)
> 
> >> (or you could add an mtd->write_per_erase field).  
> 
> > Okay. Actually I'd like to avoid adding new 'conversion' fields to the
> > mtd_info struct. Not sure we are really improving perfs when doing that,
> > since what takes long is the I/O ops between the flash and the
> > controller not the conversion operations.  
> 
> Well, yes, but you may need to do conversion ops for in-memory cache
> lookups or searching for free blocks, or wear-levelling computations,
> all of which may involve a great many conversions per actual I/O.

That's true, even if I don't think it makes such a big difference (you
don't have that much paired pages manipulation that are not followed by
read/write accesses, and this is where the contention is).

> 
> (In hindsight, I'd wish for writesize and write_per_erase, and not
> store erasesize explicitly.  Not only is the multiply more efficient,
> but it abolishes the error of an erase size which is not a multiple of
> the write size by making it impossible.)

That's also true. Actually I was thinking about adding inline functions
to retrieve the eraseblock and page size instead of letting people
manipulate the ->writesize/erasesize fields. This way we would be able
to rework the internal representation.

> 
> > Can we have a boolean to make it clearer?
> >
> >	bool lastpage = ((page + 1) * mtd->writesize) == mtd->erasesize;  
> 
> An improvement IMHO.  You can use the same name in all four functions
> to make the equivalence clear.
> 
> > Also, the page update is quite obscure for people that did not have the
> > explanation you gave above. Can we make it  
> 
> >	/*
> >	 * The first and last pages are not surrounded by other pages,
> >	 * and are thus less sensitive to read/write disturbance.
> >	 * That's why NAND vendors decided to use a different distance
> >	 * for these 2 specific case, which complicates a bit the
> >	 * pairing scheme logic.  
> 
> Um... this is, as far as I can tell, complete nonsense.

Actually this was pure guessing, cause I never had a real explanation
for these weird pairing scheme.

> 
> I realize you know this about a thousand times better than I do, so
> I'm hesitant to make such a strong statement, but one thing that I do
> know is that paired pages are stored in the exact same transistors.
> The pairing is purely a logical addressing distance.  The physical
> distance is exactly zero.
> 
> The qustion is why they chose this particular *logival* addressing
> scheme, and I believe the reason is write bandwidth for the common case
> of streaming writes to consecutive pages.
> 
> The obvious thing to do is pair consecutive even and odd pages (pages 0 and 1,
> then 2 and 3, then...), but that makes it hard to pipeline programming of the
> two pages.  You can't start programming page 1 until page 0 is finished.
> 
> The next obvious thing is stride-2: 0<->2, 1<->3, 4<->6, 5<->7, etc.

Yes I understand that one.

> 
> This is done in some MLC chips.  See p. 98 of this Micron data sheet:
> http://pdf.datasheet.directory/datasheets-0/micron_technology/MT29F32G08CBACAWP_C.pdf
> which has a stride-4 pairing.  0..3 pair with 4..8, then 9..11 with
> 12..15, and so on.
> 
> However, it's desirable to alternate group-0 and group-1 pages, since
> the write operations are rather different and even take different amounts
> of time.  Alternating them makes it possible to:
> 1) Possibly overlap parts of the writes that use different on-chip resources,
> 2) Average the non-overlapping times for minimum jitter.

Okay, that's actually a good reason, and probably the part I was
missing to explain these non-log2 distance scheme leading to
heterogeneous distance (the first and last set of pages don't have
the same stride).

> 
> This leads naturally to the stride-3 solution.  You want to minimize the
> stride because you can read both pages in a pair with one read disturbance,
> and the shorter the distance, the more likely you'll want both pages
> (and the less buffering you'll need to make both available).
> 
> Stride-3 does have those two awkward edge cases, and changing the
> stride is simply the simplest way to special-case them.

Yep.

Still, I've seen weird things while working on modern MLC NANDs which
makes me think the pairing scheme is also here to help mitigate the
write-disturb effect, but I might be wrong. The behavior I'm
describing here has been observed on Hynix (H27QCG8T2E5R‐BCF) and
Toshiba (TC58TEG5DCLTA00) NANDs so far. When I write the 2 pages in a
pair, but not the following page, I see a high number of bitflips in
the last programmed page until the next page is programmed.

Let's take a real example. My NAND is exposing a stride-3 pairing
scheme, when I only program page 0, 1, 2, page 2 is showing a high
number of bitflips until page 3 is programmed. Actually, I don't
remember if the number decrease after programming page 3 or 4, but my
guess is that the NAND is accounting for future write-disturb when
programming a page in group 1, which makes this page un-reliable until
the subsequent page(s) have been programmed.

What's your opinion on that?

> 
> > Thanks for your valuable review/suggestions.
> >
> > Just out of curiosity, why are you interested in the pairing scheme
> > concept? Are you working with NANDs?  
> 
> Not at present, but I do embedded hardware and might some day.

Okay. You seem pretty well aware of MLC/TLC NAND constraints, and you
already have good idea of how things work.
Good to have someone like you reviewing this stuff.

> 
> Also, the data sheets are a real PITA to find.  I have yet to
> see an actual data sheet that documents the stride-3 pairing scheme.

Yes, that's a real problem. Here is a Samsung NAND data sheet
describing stride-3 [1], and an Hynix one describing stride-6 [2].

[1]http://dl.btc.pl/kamami_wa/k9gbg08u0a_ds.pdf
[2]http://www.szyuda88.com/uploadfile/cfile/201061714220663.pdf

-- 
Boris Brezillon, Free Electrons
Embedded Linux and Kernel engineering
http://free-electrons.com

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/4] mtd: nand: implement two pairing scheme
  2016-06-12 11:11     ` Boris Brezillon
@ 2016-06-12 12:25       ` George Spelvin
  2016-06-12 12:42         ` Boris Brezillon
  0 siblings, 1 reply; 14+ messages in thread
From: George Spelvin @ 2016-06-12 12:25 UTC (permalink / raw)
  To: boris.brezillon, linux
  Cc: beanhuo, computersforpeace, linux-kernel, linux-mtd, richard

>> (Another thing I thought of, but am less sure of, is packing the group
>> and pair numbers into a register-passable int rather than a structure.
>> Even 2 bits for the group is probably the most that will ever be needed,
>> but it's easy to say the low 4 bits are the group and the high 28 are
>> the pair.  Just create a few access macros to pull them apart.

> We could indeed do that, but again, do we really need to optimize
> things like that?

I don't have a good mental model of what the code calling these
translation functions looks like.  I was actually thinking that
if the results were returned by value, then the page to pair/group
translation function could be __pure, too, which might allow
for more optimization of the caller.

In fact, if (and only if!) the struct mtd_info structures are all
statically initialized, it would be legal to declare the functions
__attribute__const__.

Normally, an attribute((const)) function isn't allowed to dereference
pointers, but it *is* allowed to use information known at compile time,
and if the pointer is to a structure known at compile-time, then it's
okay.

All __attribute_const__ says is that the return value doesn't depend on
any *mutable* state.

>> Well, yes, but you may need to do conversion ops for in-memory cache
>> lookups or searching for free blocks, or wear-levelling computations,
>> all of which may involve a great many conversions per actual I/O.
>
> That's true, even if I don't think it makes such a big difference (you
> don't have that much paired pages manipulation that are not followed by
> read/write accesses, and this is where the contention is).

In that case, there's not much to worry about.  As I said, I don't have a
good idea what this information is used for.

>> However, it's desirable to alternate group-0 and group-1 pages, since
>> the write operations are rather different and even take different amounts
>> of time.  Alternating them makes it possible to:
>> 1) Possibly overlap parts of the writes that use different on-chip
>>    resources, and
>> 2) Average the non-overlapping times for minimum jitter.

> Okay, that's actually a good reason, and probably the part I was
> missing to explain these non-log2 distance scheme leading to
> heterogeneous distance (the first and last set of pages don't have
> the same stride).

Please note that I'm guessing, too; I don't actually *know*.

But the idea seems to hold together.

> Still, I've seen weird things while working on modern MLC NANDs which
> makes me think the pairing scheme is also here to help mitigate the
> write-disturb effect, but I might be wrong. The behavior I'm
> describing here has been observed on Hynix (H27QCG8T2E5R=E2=80=90BCF) and
> Toshiba (TC58TEG5DCLTA00) NANDs so far. When I write the 2 pages in a
> pair, but not the following page, I see a high number of bitflips in
> the last programmed page until the next page is programmed.
> 
> Let's take a real example. My NAND is exposing a stride-3 pairing
> scheme, when I only program page 0, 1, 2, page 2 is showing a high
> number of bitflips until page 3 is programmed. Actually, I don't
> remember if the number decrease after programming page 3 or 4, but my
> guess is that the NAND is accounting for future write-disturb when
> programming a page in group 1, which makes this page un-reliable until
> the subsequent page(s) have been programmed.
> 
> What's your opinion on that?

I'm a bit confused, too, but that actually seems plausible.  The Samsung
data sheet you pointed me to explicitly says that the pages in a block
must be programmed in order, no exceptions.  (In fact, an interesting
question is whether bad pages should be skipped or not!)

Given that, very predictable writer ordering, it would make sense to
precompensate for write disturb.

>> Also, the data sheets are a real PITA to find.  I have yet to
>> see an actual data sheet that documents the stride-3 pairing scheme.

> Yes, that's a real problem. Here is a Samsung NAND data sheet
> describing stride-3 [1], and an Hynix one describing stride-6 [2].
> 
> [1]http://dl.btc.pl/kamami_wa/k9gbg08u0a_ds.pdf
> [2]http://www.szyuda88.com/uploadfile/cfile/201061714220663.pdf

Thank you very much!

Did you see the footnote at the bottom of p. 64 of the latter?
Does that affect your pair/group addressing scheme?

It seems they are grouping not just 8K pages into even/odd double-pages,
and those 16K double-pages are being addressed with stride of 3.

But in particular, an interrupted write is likely to corrupt both
double-pages, 32K of data!

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/4] mtd: nand: implement two pairing scheme
  2016-06-12 12:25       ` George Spelvin
@ 2016-06-12 12:42         ` Boris Brezillon
  2016-06-12 15:10           ` Boris Brezillon
  2016-06-12 20:24           ` George Spelvin
  0 siblings, 2 replies; 14+ messages in thread
From: Boris Brezillon @ 2016-06-12 12:42 UTC (permalink / raw)
  To: George Spelvin
  Cc: beanhuo, computersforpeace, linux-kernel, linux-mtd, richard

On 12 Jun 2016 08:25:49 -0400
"George Spelvin" <linux@sciencehorizons.net> wrote:

> >> (Another thing I thought of, but am less sure of, is packing the group
> >> and pair numbers into a register-passable int rather than a structure.
> >> Even 2 bits for the group is probably the most that will ever be needed,
> >> but it's easy to say the low 4 bits are the group and the high 28 are
> >> the pair.  Just create a few access macros to pull them apart.  
> 
> > We could indeed do that, but again, do we really need to optimize
> > things like that?  
> 
> I don't have a good mental model of what the code calling these
> translation functions looks like.  I was actually thinking that
> if the results were returned by value, then the page to pair/group
> translation function could be __pure, too, which might allow
> for more optimization of the caller.
> 
> In fact, if (and only if!) the struct mtd_info structures are all
> statically initialized, it would be legal to declare the functions
> __attribute__const__.
> 
> Normally, an attribute((const)) function isn't allowed to dereference
> pointers, but it *is* allowed to use information known at compile time,
> and if the pointer is to a structure known at compile-time, then it's
> okay.
> 
> All __attribute_const__ says is that the return value doesn't depend on
> any *mutable* state.
> 
> >> Well, yes, but you may need to do conversion ops for in-memory cache
> >> lookups or searching for free blocks, or wear-levelling computations,
> >> all of which may involve a great many conversions per actual I/O.  
> >
> > That's true, even if I don't think it makes such a big difference (you
> > don't have that much paired pages manipulation that are not followed by
> > read/write accesses, and this is where the contention is).  
> 
> In that case, there's not much to worry about.  As I said, I don't have a
> good idea what this information is used for.
> 
> >> However, it's desirable to alternate group-0 and group-1 pages, since
> >> the write operations are rather different and even take different amounts
> >> of time.  Alternating them makes it possible to:
> >> 1) Possibly overlap parts of the writes that use different on-chip
> >>    resources, and
> >> 2) Average the non-overlapping times for minimum jitter.  
> 
> > Okay, that's actually a good reason, and probably the part I was
> > missing to explain these non-log2 distance scheme leading to
> > heterogeneous distance (the first and last set of pages don't have
> > the same stride).  
> 
> Please note that I'm guessing, too; I don't actually *know*.
> 
> But the idea seems to hold together.
> 
> > Still, I've seen weird things while working on modern MLC NANDs which
> > makes me think the pairing scheme is also here to help mitigate the
> > write-disturb effect, but I might be wrong. The behavior I'm
> > describing here has been observed on Hynix (H27QCG8T2E5R=E2=80=90BCF) and
> > Toshiba (TC58TEG5DCLTA00) NANDs so far. When I write the 2 pages in a
> > pair, but not the following page, I see a high number of bitflips in
> > the last programmed page until the next page is programmed.
> > 
> > Let's take a real example. My NAND is exposing a stride-3 pairing
> > scheme, when I only program page 0, 1, 2, page 2 is showing a high
> > number of bitflips until page 3 is programmed. Actually, I don't
> > remember if the number decrease after programming page 3 or 4, but my
> > guess is that the NAND is accounting for future write-disturb when
> > programming a page in group 1, which makes this page un-reliable until
> > the subsequent page(s) have been programmed.
> > 
> > What's your opinion on that?  
> 
> I'm a bit confused, too, but that actually seems plausible.  The Samsung
> data sheet you pointed me to explicitly says that the pages in a block
> must be programmed in order, no exceptions.

Yep, that is mandatory.

> (In fact, an interesting
> question is whether bad pages should be skipped or not!)

There's no such thing. We have bad blocks, but when a block is bad all
the pages inside this block are considered bad. If one of the page in a
valid block shows uncorrectable errors, UBI/UBIFS will just refuse to
attach the partition/mount the FS.

> 
> Given that, very predictable writer ordering, it would make sense to
> precompensate for write disturb.

Yes, that's what I assumed, but this is not clearly documented.
Actually, I discovered that while trying to solve the paired pages
problem (when I was partially programming a block, it was showing
uncorrectable errors sooner than the fully written ones).

> 
> >> Also, the data sheets are a real PITA to find.  I have yet to
> >> see an actual data sheet that documents the stride-3 pairing scheme.  
> 
> > Yes, that's a real problem. Here is a Samsung NAND data sheet
> > describing stride-3 [1], and an Hynix one describing stride-6 [2].
> > 
> > [1]http://dl.btc.pl/kamami_wa/k9gbg08u0a_ds.pdf
> > [2]http://www.szyuda88.com/uploadfile/cfile/201061714220663.pdf  
> 
> Thank you very much!
> 
> Did you see the footnote at the bottom of p. 64 of the latter?
> Does that affect your pair/group addressing scheme?
> 
> It seems they are grouping not just 8K pages into even/odd double-pages,
> and those 16K double-pages are being addressed with stride of 3.
> 
> But in particular, an interrupted write is likely to corrupt both
> double-pages, 32K of data!

Yes, that's yet another problem I decided to ignore for now :).

I guess a solution would be to consider that all 4 pages are 'paired'
together, but this also implies considering that the NAND is a 4-level
cells, which will make us loose even more space when operating in 'SLC
mode' where we only write the lower page (page attached to group 0) of
each pair.

-- 
Boris Brezillon, Free Electrons
Embedded Linux and Kernel engineering
http://free-electrons.com

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/4] mtd: nand: implement two pairing scheme
  2016-06-12 12:42         ` Boris Brezillon
@ 2016-06-12 15:10           ` Boris Brezillon
  2016-06-12 20:24           ` George Spelvin
  1 sibling, 0 replies; 14+ messages in thread
From: Boris Brezillon @ 2016-06-12 15:10 UTC (permalink / raw)
  To: George Spelvin
  Cc: beanhuo, computersforpeace, linux-kernel, linux-mtd, richard

On Sun, 12 Jun 2016 14:42:15 +0200
Boris Brezillon <boris.brezillon@free-electrons.com> wrote:

> >   
> > >> Also, the data sheets are a real PITA to find.  I have yet to
> > >> see an actual data sheet that documents the stride-3 pairing scheme.    
> >   
> > > Yes, that's a real problem. Here is a Samsung NAND data sheet
> > > describing stride-3 [1], and an Hynix one describing stride-6 [2].
> > > 
> > > [1]http://dl.btc.pl/kamami_wa/k9gbg08u0a_ds.pdf
> > > [2]http://www.szyuda88.com/uploadfile/cfile/201061714220663.pdf    
> > 
> > Thank you very much!
> > 
> > Did you see the footnote at the bottom of p. 64 of the latter?
> > Does that affect your pair/group addressing scheme?
> > 
> > It seems they are grouping not just 8K pages into even/odd double-pages,
> > and those 16K double-pages are being addressed with stride of 3.
> > 
> > But in particular, an interrupted write is likely to corrupt both
> > double-pages, 32K of data!  
> 
> Yes, that's yet another problem I decided to ignore for now :).

Now I remember why I decided to ignore this. If you look at this other
Hynix data sheet [1] exposing the same pairing scheme you see that the
description as slightly changed. I don't know if it's a fix from the
previous description or if the pairing scheme are really different, but
until someone has tested it on a real device, I'll assume the Hynix
case is an exception which should be handled separately.

> 
> I guess a solution would be to consider that all 4 pages are 'paired'
> together, but this also implies considering that the NAND is a 4-level
> cells, which will make us loose even more space when operating in 'SLC
> mode' where we only write the lower page (page attached to group 0) of
> each pair.
> 

[1]http://minipcsale.ru/images/joomlakassa/TV_BOX/MINIX_NEO_X7/SK%20Hynix%20H27UCG8T2BTR.pdf

-- 
Boris Brezillon, Free Electrons
Embedded Linux and Kernel engineering
http://free-electrons.com

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/4] mtd: nand: implement two pairing scheme
  2016-06-12 12:42         ` Boris Brezillon
  2016-06-12 15:10           ` Boris Brezillon
@ 2016-06-12 20:24           ` George Spelvin
  2016-06-12 21:13             ` Boris Brezillon
  1 sibling, 1 reply; 14+ messages in thread
From: George Spelvin @ 2016-06-12 20:24 UTC (permalink / raw)
  To: boris.brezillon, linux
  Cc: beanhuo, computersforpeace, linux-kernel, linux-mtd, richard

Boris Brezillon wrote:
> On 12 Jun 2016 08:25:49 -0400
> "George Spelvin" <linux@sciencehorizons.net> wrote:
>> (In fact, an interesting
>> question is whether bad pages should be skipped or not!)
> 
> There's no such thing. We have bad blocks, but when a block is bad all
> the pages inside this block are considered bad. If one of the page in a
> valid block shows uncorrectable errors, UBI/UBIFS will just refuse to
> attach the partition/mount the FS.

Ah, okay.  I guess dealing with inconsistently-sized blocks is too much
hassle.  And a block has a single program/erase cycle count, so if one
part is close to wearing out, the rest is, too.

P.S. interesting NASA study of (SLC) flash disturb effects:
http://nepp.nasa.gov/DocUploads/9CCA546D-E7E6-4D96-880459A831EEA852/07-100%20Sheldon_JPL%20Distrub%20Testing%20in%20Flash%20Mem.pdf?q=disturb-testing-in-flash-memories

One thing they noted was that manufacturers' bad-blocck testing sucked,
and quite a few "bad" blocks became good and stayed good over time.

>> Given that, very predictable writer ordering, it would make sense to
>> precompensate for write disturb.
> 
> Yes, that's what I assumed, but this is not clearly documented.
> Actually, I discovered that while trying to solve the paired pages
> problem (when I was partially programming a block, it was showing
> uncorrectable errors sooner than the fully written ones).

Were the errors in a predictable direction?  My understanding is that
write disturb tends to add a little extra charge to the disturbed
floating gates (i.e. write them more toward 0), so you'd expect
to see extra 1s if the chip was underprogramming in antiipation.

I'm also having a hard time figuring out the bit assignment.
In general, "1" means uncharged floating gate and "0" means charged,
but different sources show different encodings for MLC.

Some (e.g. the NASA report above) show the progression from erased to
programmed as

11 - 10 - 01 - 00

so the msbit is a "big jump" and the lsbit is a "small jump", and to
program it in SLC mode you'd program both pages identically, then read
back the msbit.


Others, e.g.
http://users.ece.cmu.edu/~omutlu/pub/flash-programming-interference_iccd13.pdf
suggest the order is

11 - 10 - 00 - 01

This has the advantage that a 1-level mis-read only produces a 1-bit
error.

But in this case, to get SLC programming, you program the lsbit as
all-ones.

My problem is that I don't really understand MLC programming.


>>> [2]http://www.szyuda88.com/uploadfile/cfile/201061714220663.pdf  
>> 
>> Did you see the footnote at the bottom of p. 64 of the latter?
>> Does that affect your pair/group addressing scheme?
>> 
>> It seems they are grouping not just 8K pages into even/odd double-pages,
>> and those 16K double-pages are being addressed with stride of 3.
>> 
>> But in particular, an interrupted write is likely to corrupt both
>> double-pages, 32K of data!
> 
> Yes, that's yet another problem I decided to ignore for now :).
> 
> I guess a solution would be to consider that all 4 pages are 'paired'
> together, but this also implies considering that the NAND is a 4-level
> cells, which will make us loose even more space when operating in 'SLC
> mode' where we only write the lower page (page attached to group 0) of
> each pair.

It's more considering it to have 16K pages that can be accessed in half-pages.

> Now I remember why I decided to ignore this. If you look at this other
> Hynix data sheet [1] exposing the same pairing scheme you see that the
> description as slightly changed. I don't know if it's a fix from the
> previous description or if the pairing scheme are really different, but
> until someone has tested it on a real device, I'll assume the Hynix
> case is an exception which should be handled separately.

This chip has 16K pages.  But yes, it also has 256 pages/block.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/4] mtd: nand: implement two pairing scheme
  2016-06-12 20:24           ` George Spelvin
@ 2016-06-12 21:13             ` Boris Brezillon
  2016-06-12 21:37               ` Boris Brezillon
  2016-06-14  9:07               ` George Spelvin
  0 siblings, 2 replies; 14+ messages in thread
From: Boris Brezillon @ 2016-06-12 21:13 UTC (permalink / raw)
  To: George Spelvin
  Cc: beanhuo, computersforpeace, linux-kernel, linux-mtd, richard

On 12 Jun 2016 16:24:53 -0400
"George Spelvin" <linux@sciencehorizons.net> wrote:

> Boris Brezillon wrote:
> > On 12 Jun 2016 08:25:49 -0400
> > "George Spelvin" <linux@sciencehorizons.net> wrote:  
> >> (In fact, an interesting
> >> question is whether bad pages should be skipped or not!)  
> > 
> > There's no such thing. We have bad blocks, but when a block is bad all
> > the pages inside this block are considered bad. If one of the page in a
> > valid block shows uncorrectable errors, UBI/UBIFS will just refuse to
> > attach the partition/mount the FS.  
> 
> Ah, okay.  I guess dealing with inconsistently-sized blocks is too much
> hassle.  And a block has a single program/erase cycle count, so if one
> part is close to wearing out, the rest is, too.
> 
> P.S. interesting NASA study of (SLC) flash disturb effects:
> http://nepp.nasa.gov/DocUploads/9CCA546D-E7E6-4D96-880459A831EEA852/07-100%20Sheldon_JPL%20Distrub%20Testing%20in%20Flash%20Mem.pdf?q=disturb-testing-in-flash-memories

Thanks for the link.

> 
> One thing they noted was that manufacturers' bad-blocck testing sucked,
> and quite a few "bad" blocks became good and stayed good over time.
> 
> >> Given that, very predictable writer ordering, it would make sense to
> >> precompensate for write disturb.  
> > 
> > Yes, that's what I assumed, but this is not clearly documented.
> > Actually, I discovered that while trying to solve the paired pages
> > problem (when I was partially programming a block, it was showing
> > uncorrectable errors sooner than the fully written ones).  
> 
> Were the errors in a predictable direction?  My understanding is that
> write disturb tends to add a little extra charge to the disturbed
> floating gates (i.e. write them more toward 0), so you'd expect
> to see extra 1s if the chip was underprogramming in antiipation.
> 
> I'm also having a hard time figuring out the bit assignment.
> In general, "1" means uncharged floating gate and "0" means charged,
> but different sources show different encodings for MLC.
> 
> Some (e.g. the NASA report above) show the progression from erased to
> programmed as
> 
> 11 - 10 - 01 - 00
> 
> so the msbit is a "big jump" and the lsbit is a "small jump", and to
> program it in SLC mode you'd program both pages identically, then read
> back the msbit.
> 
> 
> Others, e.g.
> http://users.ece.cmu.edu/~omutlu/pub/flash-programming-interference_iccd13.pdf
> suggest the order is
> 
> 11 - 10 - 00 - 01
> 
> This has the advantage that a 1-level mis-read only produces a 1-bit
> error.
> 
> But in this case, to get SLC programming, you program the lsbit as
> all-ones.
> 
> My problem is that I don't really understand MLC programming.

I came to the same conclusion: we really have these 2 cases in the
wild, which makes it even more complicated to define a standard
behavior.

> 
> 
> >>> [2]http://www.szyuda88.com/uploadfile/cfile/201061714220663.pdf    
> >> 
> >> Did you see the footnote at the bottom of p. 64 of the latter?
> >> Does that affect your pair/group addressing scheme?
> >> 
> >> It seems they are grouping not just 8K pages into even/odd double-pages,
> >> and those 16K double-pages are being addressed with stride of 3.
> >> 
> >> But in particular, an interrupted write is likely to corrupt both
> >> double-pages, 32K of data!  
> > 
> > Yes, that's yet another problem I decided to ignore for now :).
> > 
> > I guess a solution would be to consider that all 4 pages are 'paired'
> > together, but this also implies considering that the NAND is a 4-level
> > cells, which will make us loose even more space when operating in 'SLC
> > mode' where we only write the lower page (page attached to group 0) of
> > each pair.  
> 
> It's more considering it to have 16K pages that can be accessed in half-pages.

Yes, I know, but it's not really easy to fake that at the NAND level,
because programming 2 pages still requires 2 page program operation.
The MTD user could detect that the pairing scheme always exposes 2
consecutive non-paired pages, but as you've seen, this condition does
not necessarily imply the 'pair coupling' constraint, and we don't want
to increase the min_io_size value if it's not really necessary.


-- 
Boris Brezillon, Free Electrons
Embedded Linux and Kernel engineering
http://free-electrons.com

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/4] mtd: nand: implement two pairing scheme
  2016-06-12 21:13             ` Boris Brezillon
@ 2016-06-12 21:37               ` Boris Brezillon
  2016-06-14  9:07               ` George Spelvin
  1 sibling, 0 replies; 14+ messages in thread
From: Boris Brezillon @ 2016-06-12 21:37 UTC (permalink / raw)
  To: George Spelvin
  Cc: beanhuo, computersforpeace, linux-kernel, linux-mtd, richard

On Sun, 12 Jun 2016 23:13:14 +0200
Boris Brezillon <boris.brezillon@free-electrons.com> wrote:

> On 12 Jun 2016 16:24:53 -0400
> "George Spelvin" <linux@sciencehorizons.net> wrote:
> 
> > Boris Brezillon wrote:  
> > > On 12 Jun 2016 08:25:49 -0400
> > > "George Spelvin" <linux@sciencehorizons.net> wrote:    
> > >> (In fact, an interesting
> > >> question is whether bad pages should be skipped or not!)    
> > > 
> > > There's no such thing. We have bad blocks, but when a block is bad all
> > > the pages inside this block are considered bad. If one of the page in a
> > > valid block shows uncorrectable errors, UBI/UBIFS will just refuse to
> > > attach the partition/mount the FS.    
> > 
> > Ah, okay.  I guess dealing with inconsistently-sized blocks is too much
> > hassle.  And a block has a single program/erase cycle count, so if one
> > part is close to wearing out, the rest is, too.
> > 
> > P.S. interesting NASA study of (SLC) flash disturb effects:
> > http://nepp.nasa.gov/DocUploads/9CCA546D-E7E6-4D96-880459A831EEA852/07-100%20Sheldon_JPL%20Distrub%20Testing%20in%20Flash%20Mem.pdf?q=disturb-testing-in-flash-memories  
> 
> Thanks for the link.
> 
> > 
> > One thing they noted was that manufacturers' bad-blocck testing sucked,
> > and quite a few "bad" blocks became good and stayed good over time.
> >   
> > >> Given that, very predictable writer ordering, it would make sense to
> > >> precompensate for write disturb.    
> > > 
> > > Yes, that's what I assumed, but this is not clearly documented.
> > > Actually, I discovered that while trying to solve the paired pages
> > > problem (when I was partially programming a block, it was showing
> > > uncorrectable errors sooner than the fully written ones).    
> > 
> > Were the errors in a predictable direction?  My understanding is that
> > write disturb tends to add a little extra charge to the disturbed
> > floating gates (i.e. write them more toward 0), so you'd expect
> > to see extra 1s if the chip was underprogramming in antiipation.
> > 
> > I'm also having a hard time figuring out the bit assignment.
> > In general, "1" means uncharged floating gate and "0" means charged,
> > but different sources show different encodings for MLC.
> > 
> > Some (e.g. the NASA report above) show the progression from erased to
> > programmed as
> > 
> > 11 - 10 - 01 - 00
> > 
> > so the msbit is a "big jump" and the lsbit is a "small jump", and to
> > program it in SLC mode you'd program both pages identically, then read
> > back the msbit.
> > 
> > 
> > Others, e.g.
> > http://users.ece.cmu.edu/~omutlu/pub/flash-programming-interference_iccd13.pdf
> > suggest the order is
> > 
> > 11 - 10 - 00 - 01
> > 
> > This has the advantage that a 1-level mis-read only produces a 1-bit
> > error.
> > 
> > But in this case, to get SLC programming, you program the lsbit as
> > all-ones.
> > 
> > My problem is that I don't really understand MLC programming.  
> 
> I came to the same conclusion: we really have these 2 cases in the
> wild, which makes it even more complicated to define a standard
> behavior.
> 
> > 
> >   
> > >>> [2]http://www.szyuda88.com/uploadfile/cfile/201061714220663.pdf      
> > >> 
> > >> Did you see the footnote at the bottom of p. 64 of the latter?
> > >> Does that affect your pair/group addressing scheme?
> > >> 
> > >> It seems they are grouping not just 8K pages into even/odd double-pages,
> > >> and those 16K double-pages are being addressed with stride of 3.
> > >> 
> > >> But in particular, an interrupted write is likely to corrupt both
> > >> double-pages, 32K of data!    
> > > 
> > > Yes, that's yet another problem I decided to ignore for now :).
> > > 
> > > I guess a solution would be to consider that all 4 pages are 'paired'
> > > together, but this also implies considering that the NAND is a 4-level
> > > cells, which will make us loose even more space when operating in 'SLC
> > > mode' where we only write the lower page (page attached to group 0) of
> > > each pair.    
> > 
> > It's more considering it to have 16K pages that can be accessed in half-pages.  
> 
> Yes, I know, but it's not really easy to fake that at the NAND level,
> because programming 2 pages still requires 2 page program operation.
> The MTD user could detect that the pairing scheme always exposes 2
> consecutive non-paired pages, but as you've seen, this condition does
> not necessarily imply the 'pair coupling' constraint, and we don't want
> to increase the min_io_size value if it's not really necessary.
> 
> 

I'm just realizing this is actually a non-issue for the solution we
developed with Ricard. As I said, it's unsafe to partially write a
block in MLC mode, so the only sane way is either to write a block in
SLC mode, or atomically write a block in MLC mode, and that's what
we're doing with our 'UBI LEB consolidation' approach. I'm pretty sure
the problem described in the Hynix datasheet does not happen when only
writing in SLC mode. So, even if the pairing scheme does not account
for this extra 'coupling' constraint, we should be safe.

BTW, I think I should drop the mtd_wunit_to_pairing_info() function
until someone really needs it.

-- 
Boris Brezillon, Free Electrons
Embedded Linux and Kernel engineering
http://free-electrons.com

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/4] mtd: nand: implement two pairing scheme
  2016-06-12 21:13             ` Boris Brezillon
  2016-06-12 21:37               ` Boris Brezillon
@ 2016-06-14  9:07               ` George Spelvin
  2016-06-14  9:34                 ` Boris Brezillon
  2016-06-14 20:29                 ` Boris Brezillon
  1 sibling, 2 replies; 14+ messages in thread
From: George Spelvin @ 2016-06-14  9:07 UTC (permalink / raw)
  To: boris.brezillon, linux
  Cc: beanhuo, computersforpeace, linux-kernel, linux-mtd, richard

Boris Brezillon wrote:
> On 12 Jun 2016 16:24:53 George Spelvin wrote:
>> Boris Brezillon wrote:
>> My problem is that I don't really understand MLC programming.

> I came to the same conclusion: we really have these 2 cases in the
> wild, which makes it even more complicated to define a standard
> behavior.

I did find a useful stuy of the issue: "Program Interference in MLC NAND
Flash Memory: Characterization, Modeling, and Mitigation"

https://users.ece.cmu.edu/~omutlu/pub/flash-programming-interference_iccd13.pdf

It describes the write-disturb-precompensation technique, and also
shows how the two-stage programming works.  (Although the fact that the
"least significant bit" is the *largest* voltage difference and is shown
on the *left* makes no sense at all.)

Looking at the demonstrated programming sequence, it looks like
it should be possible to probe for the bit assignment.  If you have
a half-programmed page, then any bits programmed to "0" are actually
sitting close to the threshold between the two middle voltage levels.

So you'll get a lot of errors reading them as "1", but the interesting
part is the read-back of the unprogrammed bit.

If the chip is using the binary sequence, you'll read either 10 or 01.
If the chip us ising the Gray-code sequence, you'll read 10 or 00.

Basically, you read both pages and see which bit combination never
appears.  That is the combination that corresponds to the highest voltage
level.

Another interesting paper is "Read Disturb Errors in MLC NAND Flash
Memory: Characterization, Mitigation, and Recovery"
https://users.ece.cmu.edu/~omutlu/pub/flash-read-disturb-errors_dsn15.pdf

That talks about tricks that do as you observe: increase read error to start.
(In order to decreaease read disturb, and thus read errors later.)

>> It's more considering it to have 16K pages that can be accessed in half-pages.

> Yes, I know, but it's not really easy to fake that at the NAND level,
> because programming 2 pages still requires 2 page program operation.
> The MTD user could detect that the pairing scheme always exposes 2
> consecutive non-paired pages, but as you've seen, this condition does
> not necessarily imply the 'pair coupling' constraint, and we don't want
> to increase the min_io_size value if it's not really necessary.

Ideally, it would be nice to separate the "SLC hack" from the "later
write failures can corrupt earlier data" workaround.

First, you get the latter working on SLC flash.  Then you add MLC, and
make MLC another reason why it can happen.

But I'm not certain this is actually necessary.  Could listing 4 pages
rather than 2 as in other data sheets just be an editing or translation
error?  Maybe someoe got confused about "in the same row" when they
wrote that clarifying example.

> I'm just realizing this is actually a non-issue for the solution we
> developed with Ricard.  As I said, it's unsafe to partially write a
> block in MLC mode, so the only sane way is either to write a block in
> SLC mode, or atomically write a block in MLC mode, and that's what
> we're doing with our 'UBI LEB consolidation' approach.  I'm pretty sure
> the problem described in the Hynix datasheet does not happen when only
> writing in SLC mode.  So, even if the pairing scheme does not account
> for this extra 'coupling' constraint, we should be safe.

I can't see any reason why it would affect MLC and not SLC.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/4] mtd: nand: implement two pairing scheme
  2016-06-14  9:07               ` George Spelvin
@ 2016-06-14  9:34                 ` Boris Brezillon
  2016-06-14 20:29                 ` Boris Brezillon
  1 sibling, 0 replies; 14+ messages in thread
From: Boris Brezillon @ 2016-06-14  9:34 UTC (permalink / raw)
  To: George Spelvin
  Cc: beanhuo, computersforpeace, linux-kernel, linux-mtd, richard

On 14 Jun 2016 05:07:26 -0400
"George Spelvin" <linux@sciencehorizons.net> wrote:

> Boris Brezillon wrote:
> > On 12 Jun 2016 16:24:53 George Spelvin wrote:  
> >> Boris Brezillon wrote:
> >> My problem is that I don't really understand MLC programming.  
> 
> > I came to the same conclusion: we really have these 2 cases in the
> > wild, which makes it even more complicated to define a standard
> > behavior.  
> 
> I did find a useful stuy of the issue: "Program Interference in MLC NAND
> Flash Memory: Characterization, Modeling, and Mitigation"
> 
> https://users.ece.cmu.edu/~omutlu/pub/flash-programming-interference_iccd13.pdf
> 
> It describes the write-disturb-precompensation technique, and also
> shows how the two-stage programming works.  (Although the fact that the
> "least significant bit" is the *largest* voltage difference and is shown
> on the *left* makes no sense at all.)
> 
> Looking at the demonstrated programming sequence, it looks like
> it should be possible to probe for the bit assignment.  If you have
> a half-programmed page, then any bits programmed to "0" are actually
> sitting close to the threshold between the two middle voltage levels.
> 
> So you'll get a lot of errors reading them as "1", but the interesting
> part is the read-back of the unprogrammed bit.
> 
> If the chip is using the binary sequence, you'll read either 10 or 01.
> If the chip us ising the Gray-code sequence, you'll read 10 or 00.
> 
> Basically, you read both pages and see which bit combination never
> appears.  That is the combination that corresponds to the highest voltage
> level.
> 
> Another interesting paper is "Read Disturb Errors in MLC NAND Flash
> Memory: Characterization, Mitigation, and Recovery"
> https://users.ece.cmu.edu/~omutlu/pub/flash-read-disturb-errors_dsn15.pdf
> 
> That talks about tricks that do as you observe: increase read error to start.
> (In order to decreaease read disturb, and thus read errors later.)

Thanks a lot for sharing your thoughts along with all these references,
that's really useful. I'll carefully read all of them.

> 
> >> It's more considering it to have 16K pages that can be accessed in half-pages.  
> 
> > Yes, I know, but it's not really easy to fake that at the NAND level,
> > because programming 2 pages still requires 2 page program operation.
> > The MTD user could detect that the pairing scheme always exposes 2
> > consecutive non-paired pages, but as you've seen, this condition does
> > not necessarily imply the 'pair coupling' constraint, and we don't want
> > to increase the min_io_size value if it's not really necessary.  
> 
> Ideally, it would be nice to separate the "SLC hack" from the "later
> write failures can corrupt earlier data" workaround.
> 
> First, you get the latter working on SLC flash.

When you say SLC flash, you're talking about MLC NANDs operating in SLC
mode, right?

> Then you add MLC, and
> make MLC another reason why it can happen.
> 
> But I'm not certain this is actually necessary.  Could listing 4 pages
> rather than 2 as in other data sheets just be an editing or translation
> error?  Maybe someoe got confused about "in the same row" when they
> wrote that clarifying example.

Yes, that's what I supposed. I'll try to test that on a real device.

> 
> > I'm just realizing this is actually a non-issue for the solution we
> > developed with Ricard.  As I said, it's unsafe to partially write a
> > block in MLC mode, so the only sane way is either to write a block in
> > SLC mode, or atomically write a block in MLC mode, and that's what
> > we're doing with our 'UBI LEB consolidation' approach.  I'm pretty sure
> > the problem described in the Hynix datasheet does not happen when only
> > writing in SLC mode.  So, even if the pairing scheme does not account
> > for this extra 'coupling' constraint, we should be safe.  
> 
> I can't see any reason why it would affect MLC and not SLC.

That's something we'll have to check on a real NAND exposing this
constraint (I'll try to find a board with one of these NANDs), but if
that's really the case, and programming page 1 can really spoil page 0
even if they're not sharing the same cells, then that's a big problem.

-- 
Boris Brezillon, Free Electrons
Embedded Linux and Kernel engineering
http://free-electrons.com

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 2/4] mtd: nand: implement two pairing scheme
  2016-06-14  9:07               ` George Spelvin
  2016-06-14  9:34                 ` Boris Brezillon
@ 2016-06-14 20:29                 ` Boris Brezillon
  1 sibling, 0 replies; 14+ messages in thread
From: Boris Brezillon @ 2016-06-14 20:29 UTC (permalink / raw)
  To: George Spelvin
  Cc: beanhuo, computersforpeace, linux-kernel, linux-mtd, richard

On 14 Jun 2016 05:07:26 -0400
"George Spelvin" <linux@sciencehorizons.net> wrote:

> Boris Brezillon wrote:
> > On 12 Jun 2016 16:24:53 George Spelvin wrote:  
> >> Boris Brezillon wrote:
> >> My problem is that I don't really understand MLC programming.  
> 
> > I came to the same conclusion: we really have these 2 cases in the
> > wild, which makes it even more complicated to define a standard
> > behavior.  
> 
> I did find a useful stuy of the issue: "Program Interference in MLC NAND
> Flash Memory: Characterization, Modeling, and Mitigation"
> 
> https://users.ece.cmu.edu/~omutlu/pub/flash-programming-interference_iccd13.pdf
> 
> It describes the write-disturb-precompensation technique, and also
> shows how the two-stage programming works.  (Although the fact that the
> "least significant bit" is the *largest* voltage difference and is shown
> on the *left* makes no sense at all.)

I think I read this document back when I started to look at how MLC
NANDs were working, but I didn't have the background to understand what
all this meant.

Reading it again makes a lot more sense, and actually I now understand
why those NANDs require data scrambling/randomization (their program
disturb modeling was done with random data, which makes it irrelevant
when repeated pattern are programmed).

> 
> Looking at the demonstrated programming sequence, it looks like
> it should be possible to probe for the bit assignment.  If you have
> a half-programmed page, then any bits programmed to "0" are actually
> sitting close to the threshold between the two middle voltage levels.
> 
> So you'll get a lot of errors reading them as "1", but the interesting
> part is the read-back of the unprogrammed bit.
> 
> If the chip is using the binary sequence, you'll read either 10 or 01.
> If the chip us ising the Gray-code sequence, you'll read 10 or 00.
> 
> Basically, you read both pages and see which bit combination never
> appears.  That is the combination that corresponds to the highest voltage
> level.
> 
> Another interesting paper is "Read Disturb Errors in MLC NAND Flash
> Memory: Characterization, Mitigation, and Recovery"
> https://users.ece.cmu.edu/~omutlu/pub/flash-read-disturb-errors_dsn15.pdf
> 
> That talks about tricks that do as you observe: increase read error to start.
> (In order to decreaease read disturb, and thus read errors later.)
> 
> >> It's more considering it to have 16K pages that can be accessed in half-pages.  
> 
> > Yes, I know, but it's not really easy to fake that at the NAND level,
> > because programming 2 pages still requires 2 page program operation.
> > The MTD user could detect that the pairing scheme always exposes 2
> > consecutive non-paired pages, but as you've seen, this condition does
> > not necessarily imply the 'pair coupling' constraint, and we don't want
> > to increase the min_io_size value if it's not really necessary.  
> 
> Ideally, it would be nice to separate the "SLC hack" from the "later
> write failures can corrupt earlier data" workaround.
> 
> First, you get the latter working on SLC flash.  Then you add MLC, and
> make MLC another reason why it can happen.
> 
> But I'm not certain this is actually necessary.  Could listing 4 pages
> rather than 2 as in other data sheets just be an editing or translation
> error?  Maybe someoe got confused about "in the same row" when they
> wrote that clarifying example.
> 
> > I'm just realizing this is actually a non-issue for the solution we
> > developed with Ricard.  As I said, it's unsafe to partially write a
> > block in MLC mode, so the only sane way is either to write a block in
> > SLC mode, or atomically write a block in MLC mode, and that's what
> > we're doing with our 'UBI LEB consolidation' approach.  I'm pretty sure
> > the problem described in the Hynix datasheet does not happen when only
> > writing in SLC mode.  So, even if the pairing scheme does not account
> > for this extra 'coupling' constraint, we should be safe.  
> 
> I can't see any reason why it would affect MLC and not SLC.



-- 
Boris Brezillon, Free Electrons
Embedded Linux and Kernel engineering
http://free-electrons.com

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [PATCH 2/4] mtd: nand: implement two pairing scheme
  2016-04-25 10:01 [PATCH 0/4] mtd: add support for pairing scheme description Boris Brezillon
@ 2016-04-25 10:01 ` Boris Brezillon
  0 siblings, 0 replies; 14+ messages in thread
From: Boris Brezillon @ 2016-04-25 10:01 UTC (permalink / raw)
  To: Boris Brezillon, Richard Weinberger, linux-mtd, David Woodhouse,
	Brian Norris
  Cc: linux-kernel

Implement two common pairing scheme (found on many MLC devices), and name
them in reference to the paired pages distance (3 or 6 pages).

Signed-off-by: Boris Brezillon <boris.brezillon@free-electrons.com>
Signed-off-by: Richard Weinberger <richard@nod.at>
---
 drivers/mtd/nand/nand_base.c | 96 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/mtd/nand.h     |  3 ++
 2 files changed, 99 insertions(+)

diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 7bc37b4..a64e0d5 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -4274,6 +4274,102 @@ static bool nand_ecc_strength_good(struct mtd_info *mtd)
 	return corr >= ds_corr && ecc->strength >= chip->ecc_strength_ds;
 }
 
+static void nand_pairing_dist3_get_info(struct mtd_info *mtd, int page,
+					struct mtd_pairing_info *info)
+{
+	int lastpage = (mtd->erasesize / mtd->writesize) - 1;
+	int dist = 3;
+
+	if (page == lastpage)
+		dist = 2;
+
+	if (!page || (page & 1)) {
+		info->group = 0;
+		info->pair = (page + 1) / 2;
+	} else {
+		info->group = 1;
+		info->pair = (page + 1 - dist) / 2;
+	}
+}
+
+static int nand_pairing_dist3_get_wunit(struct mtd_info *mtd,
+					const struct mtd_pairing_info *info)
+{
+	int lastpair = ((mtd->erasesize / mtd->writesize) - 1) / 2;
+	int page = info->pair * 2;
+	int dist = 3;
+
+	if (!info->group && !info->pair)
+		return 0;
+
+	if (info->pair == lastpair && info->group)
+		dist = 2;
+
+	if (!info->group)
+		page--;
+	else if (info->pair)
+		page += dist - 1;
+
+	if (page >= mtd->erasesize / mtd->writesize)
+		return -EINVAL;
+
+	return page;
+}
+
+const struct mtd_pairing_scheme dist3_pairing_scheme = {
+	.ngroups = 2,
+	.get_info = nand_pairing_dist3_get_info,
+	.get_wunit = nand_pairing_dist3_get_wunit,
+};
+EXPORT_SYMBOL_GPL(dist3_pairing_scheme);
+
+static void nand_pairing_dist6_get_info(struct mtd_info *mtd, int page,
+					struct mtd_pairing_info *info)
+{
+	int lastpage = (mtd->erasesize / mtd->writesize) - 1;
+	int half = page / 2;
+	int dist = 6;
+
+	if (half == lastpage / 2)
+		dist = 4;
+
+	if (!half  || (half & 1)) {
+		info->group = 0;
+		info->pair = (page + 2) / 2;
+	} else {
+		info->group = 1;
+		info->pair = (page + 2 - dist) / 2;
+	}
+}
+
+static int nand_pairing_dist6_get_wunit(struct mtd_info *mtd,
+				       const struct mtd_pairing_info *info)
+{
+	int lastpair = ((mtd->erasesize / mtd->writesize) - 1) / 2;
+	int page = info->pair * 2;
+	int dist = 6;
+
+	if (!info->group && !info->pair)
+		return 0;
+
+	if (info->pair >= lastpair - 1 && info->group)
+		dist = 4;
+
+	if (!info->group)
+		page += 2;
+	else
+		page += dist - 2;
+
+	return page;
+}
+
+const struct mtd_pairing_scheme dist6_pairing_scheme = {
+	.ngroups = 2,
+	.get_info = nand_pairing_dist6_get_info,
+	.get_wunit = nand_pairing_dist6_get_wunit,
+};
+EXPORT_SYMBOL_GPL(dist6_pairing_scheme);
+
 /**
  * nand_scan_tail - [NAND Interface] Scan for the NAND device
  * @mtd: MTD device structure
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index fbe8e16..874b267 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -1092,4 +1092,7 @@ int nand_read_oob_std(struct mtd_info *mtd, struct nand_chip *chip, int page);
 /* Default read_oob syndrome implementation */
 int nand_read_oob_syndrome(struct mtd_info *mtd, struct nand_chip *chip,
 			   int page);
+
+extern const struct mtd_pairing_scheme dist3_pairing_scheme;
+extern const struct mtd_pairing_scheme dist6_pairing_scheme;
 #endif /* __LINUX_MTD_NAND_H */
-- 
2.7.4

^ permalink raw reply related	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2016-06-14 20:29 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2016-06-11 22:30 [PATCH 2/4] mtd: nand: implement two pairing scheme George Spelvin
2016-06-12  7:20 ` Boris Brezillon
2016-06-12  9:23   ` George Spelvin
2016-06-12 11:11     ` Boris Brezillon
2016-06-12 12:25       ` George Spelvin
2016-06-12 12:42         ` Boris Brezillon
2016-06-12 15:10           ` Boris Brezillon
2016-06-12 20:24           ` George Spelvin
2016-06-12 21:13             ` Boris Brezillon
2016-06-12 21:37               ` Boris Brezillon
2016-06-14  9:07               ` George Spelvin
2016-06-14  9:34                 ` Boris Brezillon
2016-06-14 20:29                 ` Boris Brezillon
  -- strict thread matches above, loose matches on Subject: below --
2016-04-25 10:01 [PATCH 0/4] mtd: add support for pairing scheme description Boris Brezillon
2016-04-25 10:01 ` [PATCH 2/4] mtd: nand: implement two pairing scheme Boris Brezillon

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).