linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] Support single byte reads from integers published in procfs by kernel/sysctl.c
@ 2012-01-22 18:46 Earl Chew
  2012-01-23 14:47 ` Eric W. Biederman
  0 siblings, 1 reply; 12+ messages in thread
From: Earl Chew @ 2012-01-22 18:46 UTC (permalink / raw)
  To: Ingo Molnar, Peter Zijlstra, Andrew Morton, Eric Paris, Serge E. Hallyn
  Cc: linux-kernel, adobriyan

The following illustrates the problem:

    cat /proc/sys/kernel/threads-max | od -bc
    0000000 065 062 071 071 062 012
              5   2   9   9   2  \n
    0000006

    dd bs=1 < /proc/sys/kernel/threads-max | od -bc
    1+0 records in
    1+0 records out
    1 byte (1 B) copied, 0.000102707 s, 9.7 kB/s
    0000000 065
              5
    0000001

The implementation in sysctl.c is inconsistent because support
for small reads is available for strings (eg /proc/sys/kernel/core_pattern),
but not integers.

This issue was also reported in:

    https://bugzilla.kernel.org/show_bug.cgi?id=19182

Signed-off-by: Earl Chew <echew@ixiacom.com>
---
 kernel/sysctl.c |   83 +++++++++++++++++++++++++++++++++++++-----------------
 1 files changed, 57 insertions(+), 26 deletions(-)

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ae27196..b13d47b 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -2215,31 +2215,41 @@ static int proc_get_long(char **buf, size_t *size,
  * @size: the size of the user buffer
  * @val: the integer to be converted
  * @neg: sign of the number, %TRUE for negative
+ * @skip: the number of characters to skip
  *
  * In case of success %0 is returned and @buf and @size are updated with
  * the amount of bytes written.
  */
 static int proc_put_long(void __user **buf, size_t *size, unsigned long val,
-			  bool neg)
+			  bool neg, loff_t *skip)
 {
 	int len;
 	char tmp[TMPBUFLEN], *p = tmp;
 
 	sprintf(p, "%s%lu", neg ? "-" : "", val);
-	len = strlen(tmp);
-	if (len > *size)
-		len = *size;
-	if (copy_to_user(*buf, tmp, len))
-		return -EFAULT;
-	*size -= len;
-	*buf += len;
+	len = strlen(p);
+	if (*skip >= len) {
+		*skip -= len;
+	} else {
+		p += *skip;
+		len -= *skip;
+		*skip = 0;
+		if (len > *size)
+			len = *size;
+		if (copy_to_user(*buf, p, len))
+			return -EFAULT;
+		*size -= len;
+		*buf += len;
+	}
 	return 0;
 }
 #undef TMPBUFLEN
 
-static int proc_put_char(void __user **buf, size_t *size, char c)
+static int proc_put_char(void __user **buf, size_t *size, char c, loff_t *skip)
 {
-	if (*size) {
+	if (*skip) {
+		(*skip)--;
+	} else if (*size) {
 		char __user **buffer = (char __user **)buf;
 		if (put_user(c, *buffer))
 			return -EFAULT;
@@ -2279,10 +2289,11 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
 {
 	int *i, vleft, first = 1, err = 0;
 	unsigned long page = 0;
+	loff_t skip = *ppos;
 	size_t left;
 	char *kbuf;
 	
-	if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
+	if (!tbl_data || !table->maxlen || !*lenp) {
 		*lenp = 0;
 		return 0;
 	}
@@ -2331,18 +2342,26 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
 				err = -EINVAL;
 				break;
 			}
-			if (!first)
-				err = proc_put_char(&buffer, &left, '\t');
+			if (!first) {
+				err = proc_put_char(
+					&buffer, &left, '\t', &skip);
+			}
 			if (err)
 				break;
-			err = proc_put_long(&buffer, &left, lval, neg);
+			err = proc_put_long(&buffer, &left, lval, neg, &skip);
 			if (err)
 				break;
 		}
 	}
 
-	if (!write && !first && left && !err)
-		err = proc_put_char(&buffer, &left, '\n');
+	if (!write) {
+		if (!first && !err)
+			err = proc_put_char(&buffer, &left, '\n', &skip);
+		if (!err && skip) {
+			*lenp = 0;
+			return 0;
+		}
+	}
 	if (write && !err && left)
 		left -= proc_skip_spaces(&kbuf);
 free:
@@ -2497,10 +2516,11 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
 	unsigned long *i, *min, *max;
 	int vleft, first = 1, err = 0;
 	unsigned long page = 0;
+	loff_t skip = *ppos;
 	size_t left;
 	char *kbuf;
 
-	if (!data || !table->maxlen || !*lenp || (*ppos && !write)) {
+	if (!data || !table->maxlen || !*lenp) {
 		*lenp = 0;
 		return 0;
 	}
@@ -2546,15 +2566,22 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
 		} else {
 			val = convdiv * (*i) / convmul;
 			if (!first)
-				err = proc_put_char(&buffer, &left, '\t');
-			err = proc_put_long(&buffer, &left, val, false);
+				err = proc_put_char(
+					&buffer, &left, '\t', &skip);
+			err = proc_put_long(&buffer, &left, val, false, &skip);
 			if (err)
 				break;
 		}
 	}
 
-	if (!write && !first && left && !err)
-		err = proc_put_char(&buffer, &left, '\n');
+	if (!write) {
+		if (!first && !err)
+			err = proc_put_char(&buffer, &left, '\n', &skip);
+		if (!err && skip) {
+			*lenp = 0;
+			return 0;
+		}
+	}
 	if (write && !err)
 		left -= proc_skip_spaces(&kbuf);
 free:
@@ -2805,6 +2832,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
 	int err = 0;
 	bool first = 1;
 	size_t left = *lenp;
+	loff_t skip = *ppos;
 	unsigned long bitmap_len = table->maxlen;
 	unsigned long *bitmap = (unsigned long *) table->data;
 	unsigned long *tmp_bitmap = NULL;
@@ -2893,18 +2921,21 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
 						   bit_a + 1) - 1;
 
 			if (!first) {
-				err = proc_put_char(&buffer, &left, ',');
+				err = proc_put_char(&buffer, &left, ',', &skip);
 				if (err)
 					break;
 			}
-			err = proc_put_long(&buffer, &left, bit_a, false);
+			err = proc_put_long(
+				&buffer, &left, bit_a, false, &skip);
 			if (err)
 				break;
 			if (bit_a != bit_b) {
-				err = proc_put_char(&buffer, &left, '-');
+				err = proc_put_char(
+					&buffer, &left, '-', &skip);
 				if (err)
 					break;
-				err = proc_put_long(&buffer, &left, bit_b, false);
+				err = proc_put_long(
+					&buffer, &left, bit_b, false, &skip);
 				if (err)
 					break;
 			}
@@ -2912,7 +2943,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
 			first = 0; bit_b++;
 		}
 		if (!err)
-			err = proc_put_char(&buffer, &left, '\n');
+			err = proc_put_char(&buffer, &left, '\n', &skip);
 	}
 
 	if (!err) {
-- 
1.6.5.2



^ permalink raw reply related	[flat|nested] 12+ messages in thread

* Re: [PATCH] Support single byte reads from integers published in procfs by kernel/sysctl.c
  2012-01-22 18:46 [PATCH] Support single byte reads from integers published in procfs by kernel/sysctl.c Earl Chew
@ 2012-01-23 14:47 ` Eric W. Biederman
  2012-01-23 15:49   ` Earl Chew
  0 siblings, 1 reply; 12+ messages in thread
From: Eric W. Biederman @ 2012-01-23 14:47 UTC (permalink / raw)
  To: Earl Chew
  Cc: Ingo Molnar, Peter Zijlstra, Andrew Morton, Eric Paris,
	Serge E. Hallyn, linux-kernel, adobriyan

Earl Chew <echew@ixiacom.com> writes:

> The following illustrates the problem:
>
>     cat /proc/sys/kernel/threads-max | od -bc
>     0000000 065 062 071 071 062 012
>               5   2   9   9   2  \n
>     0000006
>
>     dd bs=1 < /proc/sys/kernel/threads-max | od -bc
>     1+0 records in
>     1+0 records out
>     1 byte (1 B) copied, 0.000102707 s, 9.7 kB/s
>     0000000 065
>               5
>     0000001
>
> The implementation in sysctl.c is inconsistent because support
> for small reads is available for strings (eg /proc/sys/kernel/core_pattern),
> but not integers.

The difference between strings and integers is that with strings
single byte reads/writes actually make sense.

Single decimal digits reads don't make much sense at all for integers.

Is there any compelling reason to support this complete idiocy from user
space programs?

Eric

> This issue was also reported in:
>
>     https://bugzilla.kernel.org/show_bug.cgi?id=19182
>
> Signed-off-by: Earl Chew <echew@ixiacom.com>
> ---
>  kernel/sysctl.c |   83 +++++++++++++++++++++++++++++++++++++-----------------
>  1 files changed, 57 insertions(+), 26 deletions(-)
>
> diff --git a/kernel/sysctl.c b/kernel/sysctl.c
> index ae27196..b13d47b 100644
> --- a/kernel/sysctl.c
> +++ b/kernel/sysctl.c
> @@ -2215,31 +2215,41 @@ static int proc_get_long(char **buf, size_t *size,
>   * @size: the size of the user buffer
>   * @val: the integer to be converted
>   * @neg: sign of the number, %TRUE for negative
> + * @skip: the number of characters to skip
>   *
>   * In case of success %0 is returned and @buf and @size are updated with
>   * the amount of bytes written.
>   */
>  static int proc_put_long(void __user **buf, size_t *size, unsigned long val,
> -			  bool neg)
> +			  bool neg, loff_t *skip)
>  {
>  	int len;
>  	char tmp[TMPBUFLEN], *p = tmp;
>  
>  	sprintf(p, "%s%lu", neg ? "-" : "", val);
> -	len = strlen(tmp);
> -	if (len > *size)
> -		len = *size;
> -	if (copy_to_user(*buf, tmp, len))
> -		return -EFAULT;
> -	*size -= len;
> -	*buf += len;
> +	len = strlen(p);
> +	if (*skip >= len) {
> +		*skip -= len;
> +	} else {
> +		p += *skip;
> +		len -= *skip;
> +		*skip = 0;
> +		if (len > *size)
> +			len = *size;
> +		if (copy_to_user(*buf, p, len))
> +			return -EFAULT;
> +		*size -= len;
> +		*buf += len;
> +	}
>  	return 0;
>  }
>  #undef TMPBUFLEN
>  
> -static int proc_put_char(void __user **buf, size_t *size, char c)
> +static int proc_put_char(void __user **buf, size_t *size, char c, loff_t *skip)
>  {
> -	if (*size) {
> +	if (*skip) {
> +		(*skip)--;
> +	} else if (*size) {
>  		char __user **buffer = (char __user **)buf;
>  		if (put_user(c, *buffer))
>  			return -EFAULT;
> @@ -2279,10 +2289,11 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
>  {
>  	int *i, vleft, first = 1, err = 0;
>  	unsigned long page = 0;
> +	loff_t skip = *ppos;
>  	size_t left;
>  	char *kbuf;
>  	
> -	if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) {
> +	if (!tbl_data || !table->maxlen || !*lenp) {
>  		*lenp = 0;
>  		return 0;
>  	}
> @@ -2331,18 +2342,26 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table,
>  				err = -EINVAL;
>  				break;
>  			}
> -			if (!first)
> -				err = proc_put_char(&buffer, &left, '\t');
> +			if (!first) {
> +				err = proc_put_char(
> +					&buffer, &left, '\t', &skip);
> +			}
>  			if (err)
>  				break;
> -			err = proc_put_long(&buffer, &left, lval, neg);
> +			err = proc_put_long(&buffer, &left, lval, neg, &skip);
>  			if (err)
>  				break;
>  		}
>  	}
>  
> -	if (!write && !first && left && !err)
> -		err = proc_put_char(&buffer, &left, '\n');
> +	if (!write) {
> +		if (!first && !err)
> +			err = proc_put_char(&buffer, &left, '\n', &skip);
> +		if (!err && skip) {
> +			*lenp = 0;
> +			return 0;
> +		}
> +	}
>  	if (write && !err && left)
>  		left -= proc_skip_spaces(&kbuf);
>  free:
> @@ -2497,10 +2516,11 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
>  	unsigned long *i, *min, *max;
>  	int vleft, first = 1, err = 0;
>  	unsigned long page = 0;
> +	loff_t skip = *ppos;
>  	size_t left;
>  	char *kbuf;
>  
> -	if (!data || !table->maxlen || !*lenp || (*ppos && !write)) {
> +	if (!data || !table->maxlen || !*lenp) {
>  		*lenp = 0;
>  		return 0;
>  	}
> @@ -2546,15 +2566,22 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
>  		} else {
>  			val = convdiv * (*i) / convmul;
>  			if (!first)
> -				err = proc_put_char(&buffer, &left, '\t');
> -			err = proc_put_long(&buffer, &left, val, false);
> +				err = proc_put_char(
> +					&buffer, &left, '\t', &skip);
> +			err = proc_put_long(&buffer, &left, val, false, &skip);
>  			if (err)
>  				break;
>  		}
>  	}
>  
> -	if (!write && !first && left && !err)
> -		err = proc_put_char(&buffer, &left, '\n');
> +	if (!write) {
> +		if (!first && !err)
> +			err = proc_put_char(&buffer, &left, '\n', &skip);
> +		if (!err && skip) {
> +			*lenp = 0;
> +			return 0;
> +		}
> +	}
>  	if (write && !err)
>  		left -= proc_skip_spaces(&kbuf);
>  free:
> @@ -2805,6 +2832,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
>  	int err = 0;
>  	bool first = 1;
>  	size_t left = *lenp;
> +	loff_t skip = *ppos;
>  	unsigned long bitmap_len = table->maxlen;
>  	unsigned long *bitmap = (unsigned long *) table->data;
>  	unsigned long *tmp_bitmap = NULL;
> @@ -2893,18 +2921,21 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
>  						   bit_a + 1) - 1;
>  
>  			if (!first) {
> -				err = proc_put_char(&buffer, &left, ',');
> +				err = proc_put_char(&buffer, &left, ',', &skip);
>  				if (err)
>  					break;
>  			}
> -			err = proc_put_long(&buffer, &left, bit_a, false);
> +			err = proc_put_long(
> +				&buffer, &left, bit_a, false, &skip);
>  			if (err)
>  				break;
>  			if (bit_a != bit_b) {
> -				err = proc_put_char(&buffer, &left, '-');
> +				err = proc_put_char(
> +					&buffer, &left, '-', &skip);
>  				if (err)
>  					break;
> -				err = proc_put_long(&buffer, &left, bit_b, false);
> +				err = proc_put_long(
> +					&buffer, &left, bit_b, false, &skip);
>  				if (err)
>  					break;
>  			}
> @@ -2912,7 +2943,7 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
>  			first = 0; bit_b++;
>  		}
>  		if (!err)
> -			err = proc_put_char(&buffer, &left, '\n');
> +			err = proc_put_char(&buffer, &left, '\n', &skip);
>  	}
>  
>  	if (!err) {

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] Support single byte reads from integers published in procfs by kernel/sysctl.c
  2012-01-23 14:47 ` Eric W. Biederman
@ 2012-01-23 15:49   ` Earl Chew
  2012-01-23 16:35     ` Eric W. Biederman
  0 siblings, 1 reply; 12+ messages in thread
From: Earl Chew @ 2012-01-23 15:49 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Ingo Molnar, Peter Zijlstra, Andrew Morton, Eric Paris,
	Serge E. Hallyn, linux-kernel, adobriyan

On 23/01/2012 6:47 AM, Eric W. Biederman wrote:
> The difference between strings and integers is that with strings
> single byte reads/writes actually make sense.
> 
> Single decimal digits reads don't make much sense at all for integers.
> 
> Is there any compelling reason to support this complete idiocy from user
> space programs?

Eric,

[ I also note that the patch allows multiple short reads (not necessarily
  just a single byte at a time) from these entries in procfs. ]

I think there is a reasonable userspace expectation that entities
that present themselves as text files to produce results that are
consistent with the userspace model of a text file.

For a more concrete example, consider the following BusyBox ash(1) code:

read X < /proc/sys/kernel/threads-max
Y=$(cat /proc/sys/kernel/threads-max)

read A < /proc/sys/kernel/core_pattern
B=$(cat /proc/sys/kernel/core_pattern)

The fact that these yield different results is surprising given that
the user sees these as text files.

I think it is worthwhile asking the opposite question,

" Is there any compelling reason for the kernel to require userspace
  programs to access /proc/sys/kernel/threads-max any differently
  from, say, /etc/fstab ? "

If the answer is in the affirmative, then it requires some layer in
the userspace program (presumably the IO layer) to make some
special accommodation for some entries in /proc.

As can be seen in the case of BusyBox, this approach might be difficult
to swallow in something as generic as ash(1), so it is left to the user to
puzzle out why read fails, but cat works -- but only sometimes.

Earl


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] Support single byte reads from integers published in procfs by kernel/sysctl.c
  2012-01-23 15:49   ` Earl Chew
@ 2012-01-23 16:35     ` Eric W. Biederman
  2012-01-23 16:43       ` Eric W. Biederman
  2012-01-23 16:47       ` Earl Chew
  0 siblings, 2 replies; 12+ messages in thread
From: Eric W. Biederman @ 2012-01-23 16:35 UTC (permalink / raw)
  To: Earl Chew
  Cc: Ingo Molnar, Peter Zijlstra, Andrew Morton, Eric Paris,
	Serge E. Hallyn, linux-kernel, adobriyan

Earl Chew <echew@ixiacom.com> writes:

> On 23/01/2012 6:47 AM, Eric W. Biederman wrote:
>> The difference between strings and integers is that with strings
>> single byte reads/writes actually make sense.
>> 
>> Single decimal digits reads don't make much sense at all for integers.
>> 
>> Is there any compelling reason to support this complete idiocy from user
>> space programs?
>
> Eric,
>
> [ I also note that the patch allows multiple short reads (not necessarily
>   just a single byte at a time) from these entries in procfs. ]
>
> I think there is a reasonable userspace expectation that entities
> that present themselves as text files to produce results that are
> consistent with the userspace model of a text file.
>
> For a more concrete example, consider the following BusyBox ash(1) code:
>
> read X < /proc/sys/kernel/threads-max
> Y=$(cat /proc/sys/kernel/threads-max)
>
> read A < /proc/sys/kernel/core_pattern
> B=$(cat /proc/sys/kernel/core_pattern)


> The fact that these yield different results is surprising given that
> the user sees these as text files.
>
> I think it is worthwhile asking the opposite question,
>
> " Is there any compelling reason for the kernel to require userspace
>   programs to access /proc/sys/kernel/threads-max any differently
>   from, say, /etc/fstab ? "
>
> If the answer is in the affirmative, then it requires some layer in
> the userspace program (presumably the IO layer) to make some
> special accommodation for some entries in /proc.
>
> As can be seen in the case of BusyBox, this approach might be difficult
> to swallow in something as generic as ash(1), so it is left to the user to
> puzzle out why read fails, but cat works -- but only sometimes.

read X < file seems like a reasonable case to support.
Bash doesn't have that problem so presumably BusyBox is simply
inefficient.

If you are interested in fixing this properly with a tiny buffer
reachable from struct file I think this can be worth fixing.  I think
this is doable by using seq_file in proc_sys_read.

Rereading different bytes of the integer multiple times when the integer
may be changing does not seem like a reasonable implementation.

Eric

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] Support single byte reads from integers published in procfs by kernel/sysctl.c
  2012-01-23 16:35     ` Eric W. Biederman
@ 2012-01-23 16:43       ` Eric W. Biederman
  2012-01-23 16:47       ` Earl Chew
  1 sibling, 0 replies; 12+ messages in thread
From: Eric W. Biederman @ 2012-01-23 16:43 UTC (permalink / raw)
  To: Earl Chew
  Cc: Ingo Molnar, Peter Zijlstra, Andrew Morton, Eric Paris,
	Serge E. Hallyn, linux-kernel, adobriyan

ebiederm@xmission.com (Eric W. Biederman) writes:

>>
>> [ I also note that the patch allows multiple short reads (not necessarily
>>   just a single byte at a time) from these entries in procfs. ]
>>
>> I think there is a reasonable userspace expectation that entities
>> that present themselves as text files to produce results that are
>> consistent with the userspace model of a text file.
>>
>> For a more concrete example, consider the following BusyBox ash(1) code:
>>
>> read X < /proc/sys/kernel/threads-max
>> Y=$(cat /proc/sys/kernel/threads-max)
>>
>> read A < /proc/sys/kernel/core_pattern
>> B=$(cat /proc/sys/kernel/core_pattern)
>
>
>> The fact that these yield different results is surprising given that
>> the user sees these as text files.

To be precise /proc/sys/kernel/threads-max does not present itself as
an ordinary text file.  It reports that it's file length is 0 bytes
long.   That is different from an ordinary text file.

Eric

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] Support single byte reads from integers published in procfs by kernel/sysctl.c
  2012-01-23 16:35     ` Eric W. Biederman
  2012-01-23 16:43       ` Eric W. Biederman
@ 2012-01-23 16:47       ` Earl Chew
  2012-01-24 22:50         ` Andrew Morton
  2012-01-29 22:56         ` Earl Chew
  1 sibling, 2 replies; 12+ messages in thread
From: Earl Chew @ 2012-01-23 16:47 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Ingo Molnar, Peter Zijlstra, Andrew Morton, Eric Paris,
	Serge E. Hallyn, linux-kernel, adobriyan

On 23/01/2012 8:35 AM, Eric W. Biederman wrote:
> read X < file seems like a reasonable case to support.
> Bash doesn't have that problem so presumably BusyBox is simply
> inefficient.

I think you are correct. In BusyBox, I believe there is a conscious
design decision to favour minimum size over maximum speed where it makes sense.

> If you are interested in fixing this properly with a tiny buffer
> reachable from struct file I think this can be worth fixing.  I think
> this is doable by using seq_file in proc_sys_read.

I did think about using seq_file, but my initial thoughts were that it
would end up being a much bigger change and I was reluctant to take that
on without some indication that it would be a more acceptable approach.

> Rereading different bytes of the integer multiple times when the integer
> may be changing does not seem like a reasonable implementation.

Yes. I agree with you. I shall re-work the patch as per your suggestion.

Earl


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] Support single byte reads from integers published in procfs by kernel/sysctl.c
  2012-01-23 16:47       ` Earl Chew
@ 2012-01-24 22:50         ` Andrew Morton
  2012-01-25  6:28           ` Eric W. Biederman
  2012-01-25 15:27           ` Earl Chew
  2012-01-29 22:56         ` Earl Chew
  1 sibling, 2 replies; 12+ messages in thread
From: Andrew Morton @ 2012-01-24 22:50 UTC (permalink / raw)
  To: Earl Chew
  Cc: Eric W. Biederman, Ingo Molnar, Peter Zijlstra, Eric Paris,
	Serge E. Hallyn, linux-kernel, adobriyan

On Mon, 23 Jan 2012 08:47:51 -0800
Earl Chew <echew@ixiacom.com> wrote:

> > Rereading different bytes of the integer multiple times when the integer
> > may be changing does not seem like a reasonable implementation.
> 
> Yes. I agree with you. I shall re-work the patch as per your suggestion.

Yes, this is a bug and procfs should support byte-at-a-time reading of these
strings.  And yes, they are strings!  Of digits.

We fixed an instance of this in procfs a while ago (maybe a year ago?).
But I forget where it was.  It is surprising that a bug of this nature
survived so long.


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] Support single byte reads from integers published in procfs by kernel/sysctl.c
  2012-01-24 22:50         ` Andrew Morton
@ 2012-01-25  6:28           ` Eric W. Biederman
  2012-01-25 15:27           ` Earl Chew
  1 sibling, 0 replies; 12+ messages in thread
From: Eric W. Biederman @ 2012-01-25  6:28 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Earl Chew, Ingo Molnar, Peter Zijlstra, Eric Paris,
	Serge E. Hallyn, linux-kernel, adobriyan

Andrew Morton <akpm@linux-foundation.org> writes:

> On Mon, 23 Jan 2012 08:47:51 -0800
> Earl Chew <echew@ixiacom.com> wrote:
>
>> > Rereading different bytes of the integer multiple times when the integer
>> > may be changing does not seem like a reasonable implementation.
>> 
>> Yes. I agree with you. I shall re-work the patch as per your suggestion.
>
> Yes, this is a bug and procfs should support byte-at-a-time reading of these
> strings.  And yes, they are strings!  Of digits.
>
> We fixed an instance of this in procfs a while ago (maybe a year ago?).
> But I forget where it was.  It is surprising that a bug of this nature
> survived so long.

In the one value per file take on things where the expectation and
normal practice is to read or write the entire file not just a little
bit of it, I don't think it is that surprising.

At the same time I am more embarrassed that not long ago a bug was added
to sysctl where if someone makes a sysctl file pollable and then removes
the module we can oops the kernel merely by keeping that file open.  So
far we are safe because no one has used the polling support on anything
that is modular but... 

Eric

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] Support single byte reads from integers published in procfs by kernel/sysctl.c
  2012-01-24 22:50         ` Andrew Morton
  2012-01-25  6:28           ` Eric W. Biederman
@ 2012-01-25 15:27           ` Earl Chew
  1 sibling, 0 replies; 12+ messages in thread
From: Earl Chew @ 2012-01-25 15:27 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Eric W. Biederman, Ingo Molnar, Peter Zijlstra, Eric Paris,
	Serge E. Hallyn, linux-kernel, adobriyan

On 24/01/2012 2:50 PM, Andrew Morton wrote:
> We fixed an instance of this in procfs a while ago (maybe a year ago?).
> But I forget where it was.  It is surprising that a bug of this nature
> survived so long.

Also related to procfs, I posted the following fix for pread() and seq_file but
haven't received any feedback.

	https://lkml.org/lkml/2012/1/22/61

Would you take a look ?

Earl


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] Support single byte reads from integers published in procfs by kernel/sysctl.c
  2012-01-23 16:47       ` Earl Chew
  2012-01-24 22:50         ` Andrew Morton
@ 2012-01-29 22:56         ` Earl Chew
  2012-01-30  0:15           ` Eric W. Biederman
  1 sibling, 1 reply; 12+ messages in thread
From: Earl Chew @ 2012-01-29 22:56 UTC (permalink / raw)
  To: Eric W. Biederman, Andrew Morton
  Cc: Ingo Molnar, Peter Zijlstra, Eric Paris, Serge E. Hallyn,
	linux-kernel, adobriyan

> If you are interested in fixing this properly with a tiny buffer
> reachable from struct file I think this can be worth fixing.  I think
> this is doable by using seq_file in proc_sys_read.

I've looked into making proc_sys_read() use seq_file, but there are a few
issues to work through.

I'm assuming that the existing kernel modules must continue to use the
ctl_table/proc_dointvec/etc interface without requiring patching.

The two main consequences of this are:

a. The existing struct ctl_table interface should be preserved.
   Kernel modules use this to publish into procfs.

b. The existing proc_dointvec(), etc, functions must be preserved
   because they are exported via EXPORT_SYMBOL. Kernel modules
   may rely on these symbols in arbitrary ways.

I tried use seq_file in proc_sys_read, and it comes close to solving
the problem. The issue is that proc_dointvec() requires a __user pointer.

A seq_file has an internal buffer that could be filled by proc_dointvec() -- but that
seq_file buffer is in kernel space.

This is not a problem isolated to the seq_file implementation. I think that
any approach involving a small local buffer would mean that that buffer is
in kernel space, and that buffer cannot be passed to proc_dointvec() as it
stands now because proc_dointvec() requires a __user buffer.


One approach that might work is to add a new field to struct ctl_table :

struct ctl_table
{
	...
	proc_seq_handler *proc_seq_handler;
};

Existing modules can continue to use proc_dointvec() etc and fixed
code can use the new proc_seq_handler interface using the new
proc_seq_dointvec() etc.


Although this preserves the old interface, it seems to me that this
approach is flawed in that kernel modules using struct ctl_table and proc_dointvec()
continue to be broken -- they are not fixed.


So, perhaps breaking assumption (b) might be a reasonable thing to do ?

If we accept that proc_dointvec() etc are only or mainly used in the
context of filling out struct ctl_table, and that other kernel modules
don't use proc_dointvec() in other contexts, then we can change the
call signature of proc_dointvec() to stop using a __user pointer:

1. Change signature of proc_dointvec() etc to stop using __user pointer for reads
2. Change definition of typedef proc_call_handler to stop using __user pointer for reads
3. In kernel/sysctl.c don't use copy_to_user() for reads

Then proc_sys_read() can use proc_dointvec() etc to fill seq_file.


For writes, the existing behaviour needs to be preserved. One approach
that would solve this would be to add:

	union proc_buffer {
		void __user *uptr;
		void *kptr;
	};

	typedef int proc_handler (struct ctl_table *ctl, int write,
                          union proc_buffer buffer, size_t *lenp, loff_t *ppos);

I think this would be preferable to either casting away __user for reads, or
adding two pointers to the proc_handler signature (one with __user, one without).

If write is indicated, use buffer.uptr, and if not the handlers would use buffer.kptr.


But this is not perfect.  Kernel modules containing their _own_ proc_handler
definitions would now be broken severely.


And now I circle back to the proc_seq_handler approach.  Each ctl_table client
must be fixed individually (ie switch from proc_handler to proc_seq_handler).


What do you think ?


Earl


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] Support single byte reads from integers published in procfs by kernel/sysctl.c
  2012-01-29 22:56         ` Earl Chew
@ 2012-01-30  0:15           ` Eric W. Biederman
  2012-01-30  1:13             ` Earl Chew
  0 siblings, 1 reply; 12+ messages in thread
From: Eric W. Biederman @ 2012-01-30  0:15 UTC (permalink / raw)
  To: Earl Chew
  Cc: Andrew Morton, Ingo Molnar, Peter Zijlstra, Eric Paris,
	Serge E. Hallyn, linux-kernel, adobriyan

Earl Chew <echew@ixiacom.com> writes:

>> If you are interested in fixing this properly with a tiny buffer
>> reachable from struct file I think this can be worth fixing.  I think
>> this is doable by using seq_file in proc_sys_read.
>
> I've looked into making proc_sys_read() use seq_file, but there are a few
> issues to work through.
>
> I'm assuming that the existing kernel modules must continue to use the
> ctl_table/proc_dointvec/etc interface without requiring patching.
>
> The two main consequences of this are:
>
> a. The existing struct ctl_table interface should be preserved.
>    Kernel modules use this to publish into procfs.

Generally.  Changing all 200 or so users of the ctl_table interface is a
pain to audit and make certain you having done something silly.

> b. The existing proc_dointvec(), etc, functions must be preserved
>    because they are exported via EXPORT_SYMBOL. Kernel modules
>    may rely on these symbols in arbitrary ways.

You only have to worry about in kernel users, and a quick source audit
should allow to be certain that there are no arbitrary weird uses
of proc_dointvec.

> I tried use seq_file in proc_sys_read, and it comes close to solving
> the problem. The issue is that proc_dointvec() requires a __user pointer.
>
> A seq_file has an internal buffer that could be filled by proc_dointvec() -- but that
> seq_file buffer is in kernel space.
>
> This is not a problem isolated to the seq_file implementation. I think that
> any approach involving a small local buffer would mean that that buffer is
> in kernel space, and that buffer cannot be passed to proc_dointvec() as it
> stands now because proc_dointvec() requires a __user buffer.

We can definitely change the signature of proc_handler and thus
proc_dointvec and friends to be friendlier to an internal buffer.

For prototyping it probably makes sense to use set_fs so you can
use an internal buffer without needing to change the method.

> One approach that might work is to add a new field to struct ctl_table :
>
> struct ctl_table
> {
> 	...
> 	proc_seq_handler *proc_seq_handler;
> };
>
> Existing modules can continue to use proc_dointvec() etc and fixed
> code can use the new proc_seq_handler interface using the new
> proc_seq_dointvec() etc.
>
>
> Although this preserves the old interface, it seems to me that this
> approach is flawed in that kernel modules using struct ctl_table and proc_dointvec()
> continue to be broken -- they are not fixed.

It might make a good intermediate transition scheme while existing
tables are being updated.

> So, perhaps breaking assumption (b) might be a reasonable thing to do ?

Definitely.

> If we accept that proc_dointvec() etc are only or mainly used in the
> context of filling out struct ctl_table, and that other kernel modules
> don't use proc_dointvec() in other contexts, then we can change the
> call signature of proc_dointvec() to stop using a __user pointer:

Exactly.

> 1. Change signature of proc_dointvec() etc to stop using __user pointer for reads
> 2. Change definition of typedef proc_call_handler to stop using __user pointer for reads
> 3. In kernel/sysctl.c don't use copy_to_user() for reads

All of which sound like nice cleanups, and likely to reduce the number
of places where people can badly get things wrong.

> Then proc_sys_read() can use proc_dointvec() etc to fill seq_file.

Yes.  That sounds like a nice cleanup.

> For writes, the existing behaviour needs to be preserved. One approach
> that would solve this would be to add:
>
> 	union proc_buffer {
> 		void __user *uptr;
> 		void *kptr;
> 	};
>
> 	typedef int proc_handler (struct ctl_table *ctl, int write,
>                           union proc_buffer buffer, size_t *lenp, loff_t *ppos);


> I think this would be preferable to either casting away __user for reads, or
> adding two pointers to the proc_handler signature (one with __user, one without).
>
> If write is indicated, use buffer.uptr, and if not the handlers would use buffer.kptr.

For interface design I would look carefully at the bitmap interface that
comes out of sysctl.  I think that is the most data interface we have.

I suspect what we want for writes is to read the existing value into an
internal buffer, and then allow partial writes to the internal buffer.

I don't know if writable seq_files exist.

> But this is not perfect.  Kernel modules containing their _own_ proc_handler
> definitions would now be broken severely.
>
>
> And now I circle back to the proc_seq_handler approach.  Each ctl_table client
> must be fixed individually (ie switch from proc_handler to proc_seq_handler).
>
> What do you think ?

It sounds like you are thinking things through well.  

For a transition strategy we may want to change proc_handler into
operations structure instead of just a function pointer.

Only having a single method we can call sounds like a pain.

struct ctl_proc_operations {
	int (*proc_handler)(....);
        int (*proc_seq_handler)(...);
};

Or maybe:

struct ctl_proc_operations {
	int (*proc_handler)(....);
        int (*proc_seq_read_handler)(...);
        int (*proc_seq_write_handler)(...);
};

Having the option of different read/write methods is a plus.

More usefully this should allow us to update what the implementation
of things like proc_dointvec mean without having to update all of
the tables.  Which should make it much easier to preserve correctness,
because we don't a huge number of manual edits to ctl_table structures
all over the kernel.

Eric


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] Support single byte reads from integers published in procfs by kernel/sysctl.c
  2012-01-30  0:15           ` Eric W. Biederman
@ 2012-01-30  1:13             ` Earl Chew
  0 siblings, 0 replies; 12+ messages in thread
From: Earl Chew @ 2012-01-30  1:13 UTC (permalink / raw)
  To: Eric W. Biederman
  Cc: Andrew Morton, Ingo Molnar, Peter Zijlstra, Eric Paris,
	Serge E. Hallyn, linux-kernel, adobriyan

> I suspect what we want for writes is to read the existing value into an
> internal buffer, and then allow partial writes to the internal buffer.

Oh I see. Hmm ... I get the feeling that doubles the scope of the exercise ;-)

I'll give it some more thought.

> I don't know if writable seq_files exist.

No, they don't -- at least not out-of-the-box. From fs/seq_file.c :

>         /*
>          * seq_files support lseek() and pread().  They do not implement
>          * write() at all, but we clear FMODE_PWRITE here for historical
>          * reasons.
>          *
>          * If a client of seq_files a) implements file.write() and b) wishes to
>          * support pwrite() then that client will need to implement its own
>          * file.open() which calls seq_open() and then sets FMODE_PWRITE.
>          */
>         file->f_mode &= ~FMODE_PWRITE;

Earl


^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2012-01-30  1:13 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-01-22 18:46 [PATCH] Support single byte reads from integers published in procfs by kernel/sysctl.c Earl Chew
2012-01-23 14:47 ` Eric W. Biederman
2012-01-23 15:49   ` Earl Chew
2012-01-23 16:35     ` Eric W. Biederman
2012-01-23 16:43       ` Eric W. Biederman
2012-01-23 16:47       ` Earl Chew
2012-01-24 22:50         ` Andrew Morton
2012-01-25  6:28           ` Eric W. Biederman
2012-01-25 15:27           ` Earl Chew
2012-01-29 22:56         ` Earl Chew
2012-01-30  0:15           ` Eric W. Biederman
2012-01-30  1:13             ` Earl Chew

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).