linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* Re: [PATCH 1/3] readfile: implement readfile syscall
@ 2020-07-04 20:50 Alexey Dobriyan
  2020-07-04 21:15 ` Miklos Szeredi
  0 siblings, 1 reply; 8+ messages in thread
From: Alexey Dobriyan @ 2020-07-04 20:50 UTC (permalink / raw)
  To: viro; +Cc: linux-kernel, gregkh, geert, willy, miklos, linux-fsdevel

Al wrote:

> > On Sat, Jul 04, 2020 at 09:41:09PM +0200, Miklos Szeredi wrote:
> >  1) just leave the first explanation (it's an open + read + close
> > equivalent) and leave out the rest
> >
> >  2) add a loop around the vfs_read() in the code.
> 
> 3) don't bother with the entire thing, until somebody manages to demonstrate
> a setup where it does make a real difference (compared to the obvious
> sequence of syscalls, that is).

Ehh? System call overhead is trivially measurable.
https://lwn.net/Articles/814175/

> At which point we'll need to figure out
> what's going on and deal with the underlying problem of that setup.

Run top?

Teach userspace to read 1 page minimum?

	192803 read(4, "cpu  3718263 4417 342808 7127674"..., 1024) = 1024
	192803 read(4, " 0 21217 21617 21954 10201 15425"..., 1024) = 1024
	192803 read(4, " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0"..., 1024) = 1024
	192803 read(4, "", 1024)

Teach userspace to pread?

	192803 openat(AT_FDCWD, "/proc/uptime", O_RDONLY) = 5
	192803 lseek(5, 0, SEEK_SET)            = 0
	192803 read(5, "47198.56 713699.82\n", 8191) = 19

Rhetorical question: what is harder: ditch the main source of overhead
(VFS, seq_file, text) or teach userspace how to read files?

Here is open+read /proc/cpuinfo in python2 and python3.
Python2 case is terrifying.

BTW is there something broken with seqfiles and record keeping?
Why does it return only 2 records per read?

Python 3:

openat(AT_FDCWD, "/proc/cpuinfo", O_RDONLY|O_CLOEXEC) = 3
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
ioctl(3, TCGETS, 0x7ffe6f1f0850) = -1 ENOTTY (Inappropriate ioctl for device)
lseek(3, 0, SEEK_CUR)            = 0
ioctl(3, TCGETS, 0x7ffe6f1f0710) = -1 ENOTTY (Inappropriate ioctl for device)
lseek(3, 0, SEEK_CUR)            = 0
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
read(3, "processor\t: 0\nvendor_id\t: Genuin"..., 8192) = 3038
read(3, "processor\t: 2\nvendor_id\t: Genuin"..., 5154) = 3038
read(3, "processor\t: 4\nvendor_id\t: Genuin"..., 2116) = 2116
read(3, "clmulqdq dtes64 monitor ds_cpl v"..., 8448) = 3966
read(3, "processor\t: 8\nvendor_id\t: Genuin"..., 4482) = 3038
read(3, "processor\t: 10\nvendor_id\t: Genui"..., 1444) = 1444
read(3, "t\t: 64\naddress sizes\t: 46 bits p"..., 16896) = 3116
read(3, "processor\t: 13\nvendor_id\t: Genui"..., 13780) = 3044
read(3, "processor\t: 15\nvendor_id\t: Genui"..., 10736) = 1522
read(3, "", 9214)                = 0


Python 2

openat(AT_FDCWD, "/proc/cpuinfo", O_RDONLY) = 3
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 0
lseek(3, 0, SEEK_CUR)            = 0
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
read(3, "processor\t: 0\nvendor_id\t: Genuin"..., 1024) = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 1024
lseek(3, 0, SEEK_CUR)            = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 1024
lseek(3, 0, SEEK_CUR)            = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 1024
lseek(3, 0, SEEK_CUR)            = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 1024
lseek(3, 0, SEEK_CUR)            = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 1024
lseek(3, 0, SEEK_CUR)            = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 1024
lseek(3, 0, SEEK_CUR)            = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 1024
lseek(3, 0, SEEK_CUR)            = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 1024
lseek(3, 0, SEEK_CUR)            = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 1024
lseek(3, 0, SEEK_CUR)            = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 1024
lseek(3, 0, SEEK_CUR)            = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 1024
lseek(3, 0, SEEK_CUR)            = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 1024
lseek(3, 0, SEEK_CUR)            = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 1024
lseek(3, 0, SEEK_CUR)            = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 1024
lseek(3, 0, SEEK_CUR)            = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 1024
lseek(3, 0, SEEK_CUR)            = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 1024
lseek(3, 0, SEEK_CUR)            = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 1024
lseek(3, 0, SEEK_CUR)            = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 1024
lseek(3, 0, SEEK_CUR)            = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 1024
lseek(3, 0, SEEK_CUR)            = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 1024
lseek(3, 0, SEEK_CUR)            = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 1024
lseek(3, 0, SEEK_CUR)            = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 1024
lseek(3, 0, SEEK_CUR)            = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 1024
lseek(3, 0, SEEK_CUR)            = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 1024
lseek(3, 0, SEEK_CUR)            = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 1024
lseek(3, 0, SEEK_CUR)            = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 1024
lseek(3, 0, SEEK_CUR)            = 1024
read(3, " cqm_occup_llc cqm_mbm_total cqm"..., 1024) = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 2048
lseek(3, 0, SEEK_CUR)            = 2048
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 2048
lseek(3, 0, SEEK_CUR)            = 2048
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 2048
lseek(3, 0, SEEK_CUR)            = 2048
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 2048
lseek(3, 0, SEEK_CUR)            = 2048
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 2048
lseek(3, 0, SEEK_CUR)            = 2048
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 2048
lseek(3, 0, SEEK_CUR)            = 2048
read(3, "ebs bts rep_good nopl xtopology "..., 1024) = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 3072
lseek(3, 0, SEEK_CUR)            = 3072
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 3072
lseek(3, 0, SEEK_CUR)            = 3072
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 3072
lseek(3, 0, SEEK_CUR)            = 3072
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 3072
lseek(3, 0, SEEK_CUR)            = 3072
read(3, "ntel\ncpu family\t: 6\nmodel\t\t: 79\n"..., 1024) = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 4096
lseek(3, 0, SEEK_CUR)            = 4096
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 4096
lseek(3, 0, SEEK_CUR)            = 4096
read(3, "bm_local dtherm ida arat pln pts"..., 1024) = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 5120
lseek(3, 0, SEEK_CUR)            = 5120
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 5120
lseek(3, 0, SEEK_CUR)            = 5120
read(3, "nstop_tsc cpuid aperfmperf pni p"..., 1024) = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 6144
lseek(3, 0, SEEK_CUR)            = 6144
read(3, "del name\t: Intel(R) Xeon(R) CPU "..., 1024) = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 7168
lseek(3, 0, SEEK_CUR)            = 7168
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 7168
lseek(3, 0, SEEK_CUR)            = 7168
read(3, "d_clear flush_l1d\nvmx flags\t: vn"..., 1024) = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 8192
lseek(3, 0, SEEK_CUR)            = 8192
read(3, "clmulqdq dtes64 monitor ds_cpl v"..., 1024) = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 9216
lseek(3, 0, SEEK_CUR)            = 9216
read(3, "E5-2620 v4 @ 2.10GHz\nstepping\t: "..., 1024) = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 10240
lseek(3, 0, SEEK_CUR)            = 10240
read(3, "vnmi preemption_timer posted_int"..., 1024) = 1024
read(3, " vmx smx est tm2 ssse3 sdbg fma "..., 1024) = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 12288
lseek(3, 0, SEEK_CUR)            = 12288
read(3, ": 1\nmicrocode\t: 0xb000038\ncpu MH"..., 1024) = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 13312
lseek(3, 0, SEEK_CUR)            = 13312
read(3, "r invvpid ept_x_only ept_ad ept_"..., 1024) = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 14336
lseek(3, 0, SEEK_CUR)            = 14336
read(3, "16 xtpr pdcm pcid dca sse4_1 sse"..., 1024) = 1024
read(3, "\t\t: 1326.352\ncache size\t: 20480 "..., 1024) = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 16384
lseek(3, 0, SEEK_CUR)            = 16384
read(3, "gb flexpriority apicv tsc_offset"..., 1024) = 1024
read(3, "4_2 x2apic movbe popcnt tsc_dead"..., 1024) = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 18432
lseek(3, 0, SEEK_CUR)            = 18432
read(3, "KB\nphysical id\t: 0\nsiblings\t: 16"..., 1024) = 1024
read(3, " vtpr mtf vapic ept vpid unrestr"..., 1024) = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 20480
lseek(3, 0, SEEK_CUR)            = 20480
read(3, "adline_timer aes xsave avx f16c "..., 2048) = 2048
read(3, "estricted_guest vapic_reg vid pl"..., 1024) = 1024
fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
lseek(3, 0, SEEK_CUR)            = 23552
lseek(3, 0, SEEK_CUR)            = 23552
read(3, "16c rdrand lahf_lm abm 3dnowpref"..., 2048) = 770
read(3, "", 1024)                = 0
close(3)                         = 0

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/3] readfile: implement readfile syscall
  2020-07-04 20:50 [PATCH 1/3] readfile: implement readfile syscall Alexey Dobriyan
@ 2020-07-04 21:15 ` Miklos Szeredi
  0 siblings, 0 replies; 8+ messages in thread
From: Miklos Szeredi @ 2020-07-04 21:15 UTC (permalink / raw)
  To: Alexey Dobriyan
  Cc: Al Viro, linux-kernel, Greg Kroah-Hartman, Geert Uytterhoeven,
	Matthew Wilcox, linux-fsdevel

On Sat, Jul 4, 2020 at 10:50 PM Alexey Dobriyan <adobriyan@gmail.com> wrote:
>
> Al wrote:
>
> > > On Sat, Jul 04, 2020 at 09:41:09PM +0200, Miklos Szeredi wrote:
> > >  1) just leave the first explanation (it's an open + read + close
> > > equivalent) and leave out the rest
> > >
> > >  2) add a loop around the vfs_read() in the code.
> >
> > 3) don't bother with the entire thing, until somebody manages to demonstrate
> > a setup where it does make a real difference (compared to the obvious
> > sequence of syscalls, that is).
>
> Ehh? System call overhead is trivially measurable.
> https://lwn.net/Articles/814175/
>
> > At which point we'll need to figure out
> > what's going on and deal with the underlying problem of that setup.
>
> Run top?
>
> Teach userspace to read 1 page minimum?
>
>         192803 read(4, "cpu  3718263 4417 342808 7127674"..., 1024) = 1024
>         192803 read(4, " 0 21217 21617 21954 10201 15425"..., 1024) = 1024
>         192803 read(4, " 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0"..., 1024) = 1024
>         192803 read(4, "", 1024)
>
> Teach userspace to pread?
>
>         192803 openat(AT_FDCWD, "/proc/uptime", O_RDONLY) = 5
>         192803 lseek(5, 0, SEEK_SET)            = 0
>         192803 read(5, "47198.56 713699.82\n", 8191) = 19
>
> Rhetorical question: what is harder: ditch the main source of overhead
> (VFS, seq_file, text) or teach userspace how to read files?
>
> Here is open+read /proc/cpuinfo in python2 and python3.
> Python2 case is terrifying.
>
> BTW is there something broken with seqfiles and record keeping?
> Why does it return only 2 records per read?

seqfile is weird.  It uses an internal buffer (m->buf) that starts off
with a PAGE_SIZE size (m->size).  The internal buffer is filled with
whole records, if a single record is larger than the bufsize, then the
buffer is expanded until the record fits.  Then this internal buffer
is used to fill the user buffer supplied by read.  If the length of
the internal buffer (m->count) overflows the user buffer, then the
rest is set aside for the next read.  In this case the next read
starts with the tail (m->from) of the internal buffer, then, if
there's still space in the user buffer, it resets the internal buffer
(m->from = 0) and again fills it with at least one whole record and
copies as much of that to the user buffer as it can.

Note how this can lead to unfilled bytes at the end of the user
buffer.  Not sure what's the rationale, it could just as well loop
back to filling a new buf if there's space left in the user buffer.
Maybe it was: better return whole records whenever possible.  Anyway
that can't be changed now, since it's bound to break something out
there.

Thanks,
Miklos

>
> Python 3:
>
> openat(AT_FDCWD, "/proc/cpuinfo", O_RDONLY|O_CLOEXEC) = 3
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> ioctl(3, TCGETS, 0x7ffe6f1f0850) = -1 ENOTTY (Inappropriate ioctl for device)
> lseek(3, 0, SEEK_CUR)            = 0
> ioctl(3, TCGETS, 0x7ffe6f1f0710) = -1 ENOTTY (Inappropriate ioctl for device)
> lseek(3, 0, SEEK_CUR)            = 0
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> read(3, "processor\t: 0\nvendor_id\t: Genuin"..., 8192) = 3038
> read(3, "processor\t: 2\nvendor_id\t: Genuin"..., 5154) = 3038
> read(3, "processor\t: 4\nvendor_id\t: Genuin"..., 2116) = 2116
> read(3, "clmulqdq dtes64 monitor ds_cpl v"..., 8448) = 3966
> read(3, "processor\t: 8\nvendor_id\t: Genuin"..., 4482) = 3038
> read(3, "processor\t: 10\nvendor_id\t: Genui"..., 1444) = 1444
> read(3, "t\t: 64\naddress sizes\t: 46 bits p"..., 16896) = 3116
> read(3, "processor\t: 13\nvendor_id\t: Genui"..., 13780) = 3044
> read(3, "processor\t: 15\nvendor_id\t: Genui"..., 10736) = 1522
> read(3, "", 9214)                = 0
>
>
> Python 2
>
> openat(AT_FDCWD, "/proc/cpuinfo", O_RDONLY) = 3
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 0
> lseek(3, 0, SEEK_CUR)            = 0
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> read(3, "processor\t: 0\nvendor_id\t: Genuin"..., 1024) = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 1024
> lseek(3, 0, SEEK_CUR)            = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 1024
> lseek(3, 0, SEEK_CUR)            = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 1024
> lseek(3, 0, SEEK_CUR)            = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 1024
> lseek(3, 0, SEEK_CUR)            = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 1024
> lseek(3, 0, SEEK_CUR)            = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 1024
> lseek(3, 0, SEEK_CUR)            = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 1024
> lseek(3, 0, SEEK_CUR)            = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 1024
> lseek(3, 0, SEEK_CUR)            = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 1024
> lseek(3, 0, SEEK_CUR)            = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 1024
> lseek(3, 0, SEEK_CUR)            = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 1024
> lseek(3, 0, SEEK_CUR)            = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 1024
> lseek(3, 0, SEEK_CUR)            = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 1024
> lseek(3, 0, SEEK_CUR)            = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 1024
> lseek(3, 0, SEEK_CUR)            = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 1024
> lseek(3, 0, SEEK_CUR)            = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 1024
> lseek(3, 0, SEEK_CUR)            = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 1024
> lseek(3, 0, SEEK_CUR)            = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 1024
> lseek(3, 0, SEEK_CUR)            = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 1024
> lseek(3, 0, SEEK_CUR)            = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 1024
> lseek(3, 0, SEEK_CUR)            = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 1024
> lseek(3, 0, SEEK_CUR)            = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 1024
> lseek(3, 0, SEEK_CUR)            = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 1024
> lseek(3, 0, SEEK_CUR)            = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 1024
> lseek(3, 0, SEEK_CUR)            = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 1024
> lseek(3, 0, SEEK_CUR)            = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 1024
> lseek(3, 0, SEEK_CUR)            = 1024
> read(3, " cqm_occup_llc cqm_mbm_total cqm"..., 1024) = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 2048
> lseek(3, 0, SEEK_CUR)            = 2048
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 2048
> lseek(3, 0, SEEK_CUR)            = 2048
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 2048
> lseek(3, 0, SEEK_CUR)            = 2048
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 2048
> lseek(3, 0, SEEK_CUR)            = 2048
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 2048
> lseek(3, 0, SEEK_CUR)            = 2048
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 2048
> lseek(3, 0, SEEK_CUR)            = 2048
> read(3, "ebs bts rep_good nopl xtopology "..., 1024) = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 3072
> lseek(3, 0, SEEK_CUR)            = 3072
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 3072
> lseek(3, 0, SEEK_CUR)            = 3072
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 3072
> lseek(3, 0, SEEK_CUR)            = 3072
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 3072
> lseek(3, 0, SEEK_CUR)            = 3072
> read(3, "ntel\ncpu family\t: 6\nmodel\t\t: 79\n"..., 1024) = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 4096
> lseek(3, 0, SEEK_CUR)            = 4096
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 4096
> lseek(3, 0, SEEK_CUR)            = 4096
> read(3, "bm_local dtherm ida arat pln pts"..., 1024) = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 5120
> lseek(3, 0, SEEK_CUR)            = 5120
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 5120
> lseek(3, 0, SEEK_CUR)            = 5120
> read(3, "nstop_tsc cpuid aperfmperf pni p"..., 1024) = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 6144
> lseek(3, 0, SEEK_CUR)            = 6144
> read(3, "del name\t: Intel(R) Xeon(R) CPU "..., 1024) = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 7168
> lseek(3, 0, SEEK_CUR)            = 7168
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 7168
> lseek(3, 0, SEEK_CUR)            = 7168
> read(3, "d_clear flush_l1d\nvmx flags\t: vn"..., 1024) = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 8192
> lseek(3, 0, SEEK_CUR)            = 8192
> read(3, "clmulqdq dtes64 monitor ds_cpl v"..., 1024) = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 9216
> lseek(3, 0, SEEK_CUR)            = 9216
> read(3, "E5-2620 v4 @ 2.10GHz\nstepping\t: "..., 1024) = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 10240
> lseek(3, 0, SEEK_CUR)            = 10240
> read(3, "vnmi preemption_timer posted_int"..., 1024) = 1024
> read(3, " vmx smx est tm2 ssse3 sdbg fma "..., 1024) = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 12288
> lseek(3, 0, SEEK_CUR)            = 12288
> read(3, ": 1\nmicrocode\t: 0xb000038\ncpu MH"..., 1024) = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 13312
> lseek(3, 0, SEEK_CUR)            = 13312
> read(3, "r invvpid ept_x_only ept_ad ept_"..., 1024) = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 14336
> lseek(3, 0, SEEK_CUR)            = 14336
> read(3, "16 xtpr pdcm pcid dca sse4_1 sse"..., 1024) = 1024
> read(3, "\t\t: 1326.352\ncache size\t: 20480 "..., 1024) = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 16384
> lseek(3, 0, SEEK_CUR)            = 16384
> read(3, "gb flexpriority apicv tsc_offset"..., 1024) = 1024
> read(3, "4_2 x2apic movbe popcnt tsc_dead"..., 1024) = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 18432
> lseek(3, 0, SEEK_CUR)            = 18432
> read(3, "KB\nphysical id\t: 0\nsiblings\t: 16"..., 1024) = 1024
> read(3, " vtpr mtf vapic ept vpid unrestr"..., 1024) = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 20480
> lseek(3, 0, SEEK_CUR)            = 20480
> read(3, "adline_timer aes xsave avx f16c "..., 2048) = 2048
> read(3, "estricted_guest vapic_reg vid pl"..., 1024) = 1024
> fstat(3, {st_mode=S_IFREG|0444, st_size=0, ...}) = 0
> lseek(3, 0, SEEK_CUR)            = 23552
> lseek(3, 0, SEEK_CUR)            = 23552
> read(3, "16c rdrand lahf_lm abm 3dnowpref"..., 2048) = 770
> read(3, "", 1024)                = 0
> close(3)                         = 0

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/3] readfile: implement readfile syscall
  2020-07-04 20:12     ` Al Viro
@ 2020-07-04 20:16       ` Al Viro
  0 siblings, 0 replies; 8+ messages in thread
From: Al Viro @ 2020-07-04 20:16 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Greg Kroah-Hartman, Michael Kerrisk, shuah, Linux API,
	linux-fsdevel, linux-kernel, linux-man, linux-kselftest

On Sat, Jul 04, 2020 at 09:12:06PM +0100, Al Viro wrote:
> On Sat, Jul 04, 2020 at 09:41:09PM +0200, Miklos Szeredi wrote:
> > And "If the size of file is smaller than the value provided in count
> > then the whole file will be copied into buf", which is simply a lie;
> > for example seq_file will happily return a smaller-than-PAGE_SIZE
> > chunk if at least one record fits in there.  You'll have a very hard
> > time explaining that in the man page.  So I think there are two
> > possible ways forward:
> > 
> >  1) just leave the first explanation (it's an open + read + close
> > equivalent) and leave out the rest
> > 
> >  2) add a loop around the vfs_read() in the code.
> 
> 3) don't bother with the entire thing, until somebody manages to demonstrate
> a setup where it does make a real difference (compared to the obvious
> sequence of syscalls, that is).  At which point we'll need to figure out
> what's going on and deal with the underlying problem of that setup.

Incidentally, if that's intended for use on _sysfs_, I would like to see the
effects of that sucker being called by many processes in parallel, seeing that
sysfs has, er, certain scalability problems in its lookups.  And I would be
very surprised if they were not heavier than said overhead of two extra syscalls.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/3] readfile: implement readfile syscall
  2020-07-04 19:41   ` Miklos Szeredi
@ 2020-07-04 20:12     ` Al Viro
  2020-07-04 20:16       ` Al Viro
  0 siblings, 1 reply; 8+ messages in thread
From: Al Viro @ 2020-07-04 20:12 UTC (permalink / raw)
  To: Miklos Szeredi
  Cc: Greg Kroah-Hartman, Michael Kerrisk, shuah, Linux API,
	linux-fsdevel, linux-kernel, linux-man, linux-kselftest

On Sat, Jul 04, 2020 at 09:41:09PM +0200, Miklos Szeredi wrote:
> And "If the size of file is smaller than the value provided in count
> then the whole file will be copied into buf", which is simply a lie;
> for example seq_file will happily return a smaller-than-PAGE_SIZE
> chunk if at least one record fits in there.  You'll have a very hard
> time explaining that in the man page.  So I think there are two
> possible ways forward:
> 
>  1) just leave the first explanation (it's an open + read + close
> equivalent) and leave out the rest
> 
>  2) add a loop around the vfs_read() in the code.

3) don't bother with the entire thing, until somebody manages to demonstrate
a setup where it does make a real difference (compared to the obvious
sequence of syscalls, that is).  At which point we'll need to figure out
what's going on and deal with the underlying problem of that setup.

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/3] readfile: implement readfile syscall
  2020-07-04 14:02 ` [PATCH 1/3] readfile: implement readfile syscall Greg Kroah-Hartman
  2020-07-04 18:35   ` Geert Uytterhoeven
  2020-07-04 19:14   ` Matthew Wilcox
@ 2020-07-04 19:41   ` Miklos Szeredi
  2020-07-04 20:12     ` Al Viro
  2 siblings, 1 reply; 8+ messages in thread
From: Miklos Szeredi @ 2020-07-04 19:41 UTC (permalink / raw)
  To: Greg Kroah-Hartman
  Cc: Al Viro, Michael Kerrisk, shuah, Linux API, linux-fsdevel,
	linux-kernel, linux-man, linux-kselftest

On Sat, Jul 4, 2020 at 4:03 PM Greg Kroah-Hartman
<gregkh@linuxfoundation.org> wrote:
>
> It's a tiny syscall, meant to allow a user to do a single "open this
> file, read into this buffer, and close the file" all in a single shot.
>
> Should be good for reading "tiny" files like sysfs, procfs, and other
> "small" files.
>
> There is no restarting the syscall, this is a "simple" syscall, with the
> attempt to make reading "simple" files easier with less syscall
> overhead.
>
> Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
> ---
>  fs/open.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 50 insertions(+)
>
> diff --git a/fs/open.c b/fs/open.c
> index 6cd48a61cda3..4469faa9379c 100644
> --- a/fs/open.c
> +++ b/fs/open.c
> @@ -1370,3 +1370,53 @@ int stream_open(struct inode *inode, struct file *filp)
>  }
>
>  EXPORT_SYMBOL(stream_open);
> +
> +static struct file *readfile_open(int dfd, const char __user *filename,
> +                                 struct open_flags *op)
> +{
> +       struct filename *tmp;
> +       struct file *f;
> +
> +       tmp = getname(filename);
> +       if (IS_ERR(tmp))
> +               return (struct file *)tmp;
> +
> +       f = do_filp_open(dfd, tmp, op);
> +       if (!IS_ERR(f))
> +               fsnotify_open(f);
> +
> +       putname(tmp);
> +       return f;
> +}
> +
> +SYSCALL_DEFINE5(readfile, int, dfd, const char __user *, filename,
> +               char __user *, buffer, size_t, bufsize, int, flags)
> +{
> +       struct open_flags op;
> +       struct open_how how;
> +       struct file *file;
> +       loff_t pos = 0;
> +       int retval;
> +
> +       /* only accept a small subset of O_ flags that make sense */
> +       if ((flags & (O_NOFOLLOW | O_NOATIME)) != flags)
> +               return -EINVAL;
> +
> +       /* add some needed flags to be able to open the file properly */
> +       flags |= O_RDONLY | O_LARGEFILE;
> +
> +       how = build_open_how(flags, 0000);
> +       retval = build_open_flags(&how, &op);
> +       if (retval)
> +               return retval;
> +
> +       file = readfile_open(dfd, filename, &op);
> +       if (IS_ERR(file))
> +               return PTR_ERR(file);
> +
> +       retval = vfs_read(file, buffer, bufsize, &pos);
> +
> +       filp_close(file, NULL);
> +
> +       return retval;

Manpage says: "doing the sequence of open() and then read() and then
close()", which is exactly what it does.

But then it goes on to say: "If the file is larger than the value
provided in count then only count number of bytes will be copied into
buf", which is only half true, it should be: "If the file is larger
than the value provided in count then at most count number of bytes
will be copied into buf", which is not a lot of information.

And "If the size of file is smaller than the value provided in count
then the whole file will be copied into buf", which is simply a lie;
for example seq_file will happily return a smaller-than-PAGE_SIZE
chunk if at least one record fits in there.  You'll have a very hard
time explaining that in the man page.  So I think there are two
possible ways forward:

 1) just leave the first explanation (it's an open + read + close
equivalent) and leave out the rest

 2) add a loop around the vfs_read() in the code.

I'd strongly prefer #2 because with the non-looping read it's
impossible to detect whether the file was completely read or not, and
that's just going to lead to surprises and bugs in userspace code.

Thanks,
Miklos

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/3] readfile: implement readfile syscall
  2020-07-04 14:02 ` [PATCH 1/3] readfile: implement readfile syscall Greg Kroah-Hartman
  2020-07-04 18:35   ` Geert Uytterhoeven
@ 2020-07-04 19:14   ` Matthew Wilcox
  2020-07-04 19:41   ` Miklos Szeredi
  2 siblings, 0 replies; 8+ messages in thread
From: Matthew Wilcox @ 2020-07-04 19:14 UTC (permalink / raw)
  To: Greg Kroah-Hartman
  Cc: viro, mtk.manpages, shuah, linux-api, linux-fsdevel,
	linux-kernel, linux-man, linux-kselftest

On Sat, Jul 04, 2020 at 04:02:47PM +0200, Greg Kroah-Hartman wrote:
> +	/* only accept a small subset of O_ flags that make sense */
> +	if ((flags & (O_NOFOLLOW | O_NOATIME)) != flags)
> +		return -EINVAL;
> +
> +	/* add some needed flags to be able to open the file properly */
> +	flags |= O_RDONLY | O_LARGEFILE;

Should we also add O_NOCTTY to prevent any problems if this is called on
a tty?


^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH 1/3] readfile: implement readfile syscall
  2020-07-04 14:02 ` [PATCH 1/3] readfile: implement readfile syscall Greg Kroah-Hartman
@ 2020-07-04 18:35   ` Geert Uytterhoeven
  2020-07-04 19:14   ` Matthew Wilcox
  2020-07-04 19:41   ` Miklos Szeredi
  2 siblings, 0 replies; 8+ messages in thread
From: Geert Uytterhoeven @ 2020-07-04 18:35 UTC (permalink / raw)
  To: Greg Kroah-Hartman
  Cc: Al Viro, Michael Kerrisk (man-pages),
	Shuah Khan, Linux API, Linux FS Devel, Linux Kernel Mailing List,
	linux-man, open list:KERNEL SELFTEST FRAMEWORK

Hi Greg,

On Sat, Jul 4, 2020 at 4:05 PM Greg Kroah-Hartman
<gregkh@linuxfoundation.org> wrote:
> It's a tiny syscall, meant to allow a user to do a single "open this
> file, read into this buffer, and close the file" all in a single shot.
>
> Should be good for reading "tiny" files like sysfs, procfs, and other
> "small" files.
>
> There is no restarting the syscall, this is a "simple" syscall, with the
> attempt to make reading "simple" files easier with less syscall
> overhead.
>
> Cc: Alexander Viro <viro@zeniv.linux.org.uk>
> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Thanks for your patch!

> --- a/fs/open.c
> +++ b/fs/open.c

> +SYSCALL_DEFINE5(readfile, int, dfd, const char __user *, filename,
> +               char __user *, buffer, size_t, bufsize, int, flags)
> +{
> +       struct open_flags op;
> +       struct open_how how;
> +       struct file *file;
> +       loff_t pos = 0;
> +       int retval;
> +
> +       /* only accept a small subset of O_ flags that make sense */
> +       if ((flags & (O_NOFOLLOW | O_NOATIME)) != flags)
> +               return -EINVAL;
> +
> +       /* add some needed flags to be able to open the file properly */
> +       flags |= O_RDONLY | O_LARGEFILE;
> +
> +       how = build_open_how(flags, 0000);
> +       retval = build_open_flags(&how, &op);
> +       if (retval)
> +               return retval;
> +
> +       file = readfile_open(dfd, filename, &op);
> +       if (IS_ERR(file))
> +               return PTR_ERR(file);
> +
> +       retval = vfs_read(file, buffer, bufsize, &pos);

Should there be a way for the user to be informed that the file doesn't
fit in the provided buffer (e.g. -EFBIG)?

> +
> +       filp_close(file, NULL);
> +
> +       return retval;
> +}

Gr{oetje,eeting}s,

                        Geert

-- 
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH 1/3] readfile: implement readfile syscall
  2020-07-04 14:02 [PATCH 0/3] readfile(2): a new syscall to make open/read/close faster Greg Kroah-Hartman
@ 2020-07-04 14:02 ` Greg Kroah-Hartman
  2020-07-04 18:35   ` Geert Uytterhoeven
                     ` (2 more replies)
  0 siblings, 3 replies; 8+ messages in thread
From: Greg Kroah-Hartman @ 2020-07-04 14:02 UTC (permalink / raw)
  To: viro, mtk.manpages, shuah, linux-api
  Cc: linux-fsdevel, linux-kernel, linux-man, linux-kselftest,
	Greg Kroah-Hartman

It's a tiny syscall, meant to allow a user to do a single "open this
file, read into this buffer, and close the file" all in a single shot.

Should be good for reading "tiny" files like sysfs, procfs, and other
"small" files.

There is no restarting the syscall, this is a "simple" syscall, with the
attempt to make reading "simple" files easier with less syscall
overhead.

Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 fs/open.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 50 insertions(+)

diff --git a/fs/open.c b/fs/open.c
index 6cd48a61cda3..4469faa9379c 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -1370,3 +1370,53 @@ int stream_open(struct inode *inode, struct file *filp)
 }
 
 EXPORT_SYMBOL(stream_open);
+
+static struct file *readfile_open(int dfd, const char __user *filename,
+				  struct open_flags *op)
+{
+	struct filename *tmp;
+	struct file *f;
+
+	tmp = getname(filename);
+	if (IS_ERR(tmp))
+		return (struct file *)tmp;
+
+	f = do_filp_open(dfd, tmp, op);
+	if (!IS_ERR(f))
+		fsnotify_open(f);
+
+	putname(tmp);
+	return f;
+}
+
+SYSCALL_DEFINE5(readfile, int, dfd, const char __user *, filename,
+		char __user *, buffer, size_t, bufsize, int, flags)
+{
+	struct open_flags op;
+	struct open_how how;
+	struct file *file;
+	loff_t pos = 0;
+	int retval;
+
+	/* only accept a small subset of O_ flags that make sense */
+	if ((flags & (O_NOFOLLOW | O_NOATIME)) != flags)
+		return -EINVAL;
+
+	/* add some needed flags to be able to open the file properly */
+	flags |= O_RDONLY | O_LARGEFILE;
+
+	how = build_open_how(flags, 0000);
+	retval = build_open_flags(&how, &op);
+	if (retval)
+		return retval;
+
+	file = readfile_open(dfd, filename, &op);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	retval = vfs_read(file, buffer, bufsize, &pos);
+
+	filp_close(file, NULL);
+
+	return retval;
+}
-- 
2.27.0


^ permalink raw reply related	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2020-07-04 21:15 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-07-04 20:50 [PATCH 1/3] readfile: implement readfile syscall Alexey Dobriyan
2020-07-04 21:15 ` Miklos Szeredi
  -- strict thread matches above, loose matches on Subject: below --
2020-07-04 14:02 [PATCH 0/3] readfile(2): a new syscall to make open/read/close faster Greg Kroah-Hartman
2020-07-04 14:02 ` [PATCH 1/3] readfile: implement readfile syscall Greg Kroah-Hartman
2020-07-04 18:35   ` Geert Uytterhoeven
2020-07-04 19:14   ` Matthew Wilcox
2020-07-04 19:41   ` Miklos Szeredi
2020-07-04 20:12     ` Al Viro
2020-07-04 20:16       ` Al Viro

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).