* fs: Use non-const iov in aio_read/aio_write @ 2014-11-02 23:05 Herbert Xu 2014-11-03 0:16 ` Al Viro 0 siblings, 1 reply; 82+ messages in thread From: Herbert Xu @ 2014-11-02 23:05 UTC (permalink / raw) To: David S. Miller, netdev, Linux Kernel Mailing List; +Cc: Benjamin LaHaise Currently the functions aio_read/aio_write use a const iov as input. This is unnecessary as all their callers supply a stack-based or kmalloced iov which is never reused. Conceptually this is fine because iovs supplied to aio_read/aio_write ultimately come from user-space so we always have to make a copy of them for the kernel. This is also a joke because for as long (since 2.1.15) as we've had the const iov, the network stack (currently through do_sock_read and do_sock_write) has been casting the const away. IOW if anybody did supply a const iov they would crash and burn if they ever entered the network stack. The network stack needs a non-const iov because it iterates through the iov as it reads/writes data. So we have two alternatives, either change the network stack to not touch the iovs or make the iovs non-const. As there is no reason for the iovs to be const in the first place, I have taken the second choice and changed all aio_read/aio_write functions to use non-const iovs. 
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index b30753c..dfefc79 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -434,8 +434,8 @@ prototypes: loff_t (*llseek) (struct file *, loff_t, int); ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); - ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t); - ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t); + ssize_t (*aio_read) (struct kiocb *, struct iovec *, unsigned long, loff_t); + ssize_t (*aio_write) (struct kiocb *, struct iovec *, unsigned long, loff_t); ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); int (*iterate) (struct file *, struct dir_context *); diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 20bf204..a2ba142 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -811,8 +811,8 @@ struct file_operations { loff_t (*llseek) (struct file *, loff_t, int); ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); - ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t); - ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t); + ssize_t (*aio_read) (struct kiocb *, struct iovec *, unsigned long, loff_t); + ssize_t (*aio_write) (struct kiocb *, struct iovec *, unsigned long, loff_t); ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); int (*iterate) (struct file *, struct dir_context *); diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index c952b98..c7490bd 100644 --- 
a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -144,7 +144,7 @@ static int hypfs_open(struct inode *inode, struct file *filp) return nonseekable_open(inode, filp); } -static ssize_t hypfs_aio_read(struct kiocb *iocb, const struct iovec *iov, +static ssize_t hypfs_aio_read(struct kiocb *iocb, struct iovec *iov, unsigned long nr_segs, loff_t offset) { char *data; @@ -167,7 +167,7 @@ static ssize_t hypfs_aio_read(struct kiocb *iocb, const struct iovec *iov, return ret; } -static ssize_t hypfs_aio_write(struct kiocb *iocb, const struct iovec *iov, +static ssize_t hypfs_aio_write(struct kiocb *iocb, struct iovec *iov, unsigned long nr_segs, loff_t offset) { int rc; diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 524b707..d94e5b0 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -598,13 +598,13 @@ static ssize_t write_null(struct file *file, const char __user *buf, return count; } -static ssize_t aio_read_null(struct kiocb *iocb, const struct iovec *iov, +static ssize_t aio_read_null(struct kiocb *iocb, struct iovec *iov, unsigned long nr_segs, loff_t pos) { return 0; } -static ssize_t aio_write_null(struct kiocb *iocb, const struct iovec *iov, +static ssize_t aio_write_null(struct kiocb *iocb, struct iovec *iov, unsigned long nr_segs, loff_t pos) { return iov_length(iov, nr_segs); diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c index 6d7f453..8b75de4f 100644 --- a/drivers/infiniband/hw/ipath/ipath_file_ops.c +++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c @@ -53,7 +53,7 @@ static int ipath_open(struct inode *, struct file *); static int ipath_close(struct inode *, struct file *); static ssize_t ipath_write(struct file *, const char __user *, size_t, loff_t *); -static ssize_t ipath_writev(struct kiocb *, const struct iovec *, +static ssize_t ipath_writev(struct kiocb *, struct iovec *, unsigned long , loff_t); static unsigned int ipath_poll(struct file *, struct 
poll_table_struct *); static int ipath_mmap(struct file *, struct vm_area_struct *); @@ -2414,7 +2414,7 @@ bail: return ret; } -static ssize_t ipath_writev(struct kiocb *iocb, const struct iovec *iov, +static ssize_t ipath_writev(struct kiocb *iocb, struct iovec *iov, unsigned long dim, loff_t off) { struct file *filp = iocb->ki_filp; diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c index b15e34e..8872924 100644 --- a/drivers/infiniband/hw/qib/qib_file_ops.c +++ b/drivers/infiniband/hw/qib/qib_file_ops.c @@ -55,7 +55,7 @@ static int qib_open(struct inode *, struct file *); static int qib_close(struct inode *, struct file *); static ssize_t qib_write(struct file *, const char __user *, size_t, loff_t *); -static ssize_t qib_aio_write(struct kiocb *, const struct iovec *, +static ssize_t qib_aio_write(struct kiocb *, struct iovec *, unsigned long, loff_t); static unsigned int qib_poll(struct file *, struct poll_table_struct *); static int qib_mmapf(struct file *, struct vm_area_struct *); @@ -2245,7 +2245,7 @@ bail: return ret; } -static ssize_t qib_aio_write(struct kiocb *iocb, const struct iovec *iov, +static ssize_t qib_aio_write(struct kiocb *iocb, struct iovec *iov, unsigned long dim, loff_t off) { struct qib_filedata *fp = iocb->ki_filp->private_data; diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 6f226de..823522e 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -761,7 +761,7 @@ err: return err; } -static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv, +static ssize_t macvtap_aio_write(struct kiocb *iocb, struct iovec *iv, unsigned long count, loff_t pos) { struct file *file = iocb->ki_filp; @@ -871,7 +871,7 @@ static ssize_t macvtap_do_read(struct macvtap_queue *q, return ret; } -static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv, +static ssize_t macvtap_aio_read(struct kiocb *iocb, struct iovec *iv, unsigned long count, loff_t pos) 
{ struct file *file = iocb->ki_filp; diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 9dd3746..8d06816 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1206,7 +1206,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, return total_len; } -static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv, +static ssize_t tun_chr_aio_write(struct kiocb *iocb, struct iovec *iv, unsigned long count, loff_t pos) { struct file *file = iocb->ki_filp; @@ -1371,7 +1371,7 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, return ret; } -static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv, +static ssize_t tun_chr_aio_read(struct kiocb *iocb, struct iovec *iv, unsigned long count, loff_t pos) { struct file *file = iocb->ki_filp; diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 63314ed..47fec3fd 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -958,7 +958,7 @@ static int ffs_aio_cancel(struct kiocb *kiocb) } static ssize_t ffs_epfile_aio_write(struct kiocb *kiocb, - const struct iovec *iovec, + struct iovec *iovec, unsigned long nr_segs, loff_t loff) { struct ffs_io_data *io_data; @@ -985,7 +985,7 @@ static ssize_t ffs_epfile_aio_write(struct kiocb *kiocb, } static ssize_t ffs_epfile_aio_read(struct kiocb *kiocb, - const struct iovec *iovec, + struct iovec *iovec, unsigned long nr_segs, loff_t loff) { struct ffs_io_data *io_data; diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index c744e49..211ab83 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -695,7 +695,7 @@ fail: } static ssize_t -ep_aio_read(struct kiocb *iocb, const struct iovec *iov, +ep_aio_read(struct kiocb *iocb, struct iovec *iov, unsigned long nr_segs, loff_t o) { struct ep_data *epdata = iocb->ki_filp->private_data; @@ -712,7 +712,7 @@ 
ep_aio_read(struct kiocb *iocb, const struct iovec *iov, } static ssize_t -ep_aio_write(struct kiocb *iocb, const struct iovec *iov, +ep_aio_write(struct kiocb *iocb, struct iovec *iov, unsigned long nr_segs, loff_t o) { struct ep_data *epdata = iocb->ki_filp->private_data; diff --git a/fs/bad_inode.c b/fs/bad_inode.c index afd2b44..ca3db8d 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -33,13 +33,13 @@ static ssize_t bad_file_write(struct file *filp, const char __user *buf, return -EIO; } -static ssize_t bad_file_aio_read(struct kiocb *iocb, const struct iovec *iov, +static ssize_t bad_file_aio_read(struct kiocb *iocb, struct iovec *iov, unsigned long nr_segs, loff_t pos) { return -EIO; } -static ssize_t bad_file_aio_write(struct kiocb *iocb, const struct iovec *iov, +static ssize_t bad_file_aio_write(struct kiocb *iocb, struct iovec *iov, unsigned long nr_segs, loff_t pos) { return -EIO; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index ca88731..88ce708 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1277,7 +1277,7 @@ static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file, return err; } -static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, +static ssize_t fuse_dev_read(struct kiocb *iocb, struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct fuse_copy_state cs; @@ -1881,7 +1881,7 @@ static ssize_t fuse_dev_do_write(struct fuse_conn *fc, return err; } -static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, +static ssize_t fuse_dev_write(struct kiocb *iocb, struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct fuse_copy_state cs; diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 643faa4..2617860 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -2114,7 +2114,7 @@ out: /** * ntfs_file_aio_write - */ -static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, +static ssize_t ntfs_file_aio_write(struct kiocb *iocb, struct iovec *iov, unsigned long nr_segs, loff_t 
pos) { struct file *file = iocb->ki_filp; diff --git a/include/linux/fs.h b/include/linux/fs.h index 4e41a4a..2585428 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1485,8 +1485,8 @@ struct file_operations { loff_t (*llseek) (struct file *, loff_t, int); ssize_t (*read) (struct file *, char __user *, size_t, loff_t *); ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *); - ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t); - ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t); + ssize_t (*aio_read) (struct kiocb *, struct iovec *, unsigned long, loff_t); + ssize_t (*aio_write) (struct kiocb *, struct iovec *, unsigned long, loff_t); ssize_t (*read_iter) (struct kiocb *, struct iov_iter *); ssize_t (*write_iter) (struct kiocb *, struct iov_iter *); int (*iterate) (struct file *, struct dir_context *); diff --git a/net/socket.c b/net/socket.c index fe20c31..3c6fbab 100644 --- a/net/socket.c +++ b/net/socket.c @@ -114,9 +114,9 @@ unsigned int sysctl_net_busy_poll __read_mostly; #endif static int sock_no_open(struct inode *irrelevant, struct file *dontcare); -static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov, +static ssize_t sock_aio_read(struct kiocb *iocb, struct iovec *iov, unsigned long nr_segs, loff_t pos); -static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov, +static ssize_t sock_aio_write(struct kiocb *iocb, struct iovec *iov, unsigned long nr_segs, loff_t pos); static int sock_mmap(struct file *file, struct vm_area_struct *vma); @@ -901,7 +901,7 @@ static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb, } static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb, - struct file *file, const struct iovec *iov, + struct file *file, struct iovec *iov, unsigned long nr_segs) { struct socket *sock = file->private_data; @@ -915,14 +915,14 @@ static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb, 
msg->msg_namelen = 0; msg->msg_control = NULL; msg->msg_controllen = 0; - msg->msg_iov = (struct iovec *)iov; + msg->msg_iov = iov; msg->msg_iovlen = nr_segs; msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0; return __sock_recvmsg(iocb, sock, msg, size, msg->msg_flags); } -static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov, +static ssize_t sock_aio_read(struct kiocb *iocb, struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct sock_iocb siocb, *x; @@ -941,7 +941,7 @@ static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov, } static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb, - struct file *file, const struct iovec *iov, + struct file *file, struct iovec *iov, unsigned long nr_segs) { struct socket *sock = file->private_data; @@ -955,7 +955,7 @@ static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb, msg->msg_namelen = 0; msg->msg_control = NULL; msg->msg_controllen = 0; - msg->msg_iov = (struct iovec *)iov; + msg->msg_iov = iov; msg->msg_iovlen = nr_segs; msg->msg_flags = (file->f_flags & O_NONBLOCK) ? 
MSG_DONTWAIT : 0; if (sock->type == SOCK_SEQPACKET) @@ -964,7 +964,7 @@ static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb, return __sock_sendmsg(iocb, sock, msg, size); } -static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov, +static ssize_t sock_aio_write(struct kiocb *iocb, struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct sock_iocb siocb, *x; diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index 166d59c..229b5a9 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -2995,7 +2995,7 @@ static ssize_t snd_pcm_write(struct file *file, const char __user *buf, return result; } -static ssize_t snd_pcm_aio_read(struct kiocb *iocb, const struct iovec *iov, +static ssize_t snd_pcm_aio_read(struct kiocb *iocb, struct iovec *iov, unsigned long nr_segs, loff_t pos) { @@ -3031,7 +3031,7 @@ static ssize_t snd_pcm_aio_read(struct kiocb *iocb, const struct iovec *iov, return result; } -static ssize_t snd_pcm_aio_write(struct kiocb *iocb, const struct iovec *iov, +static ssize_t snd_pcm_aio_write(struct kiocb *iocb, struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct snd_pcm_file *pcm_file; Cheers, -- Email: Herbert Xu <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply related [flat|nested] 82+ messages in thread
* Re: fs: Use non-const iov in aio_read/aio_write 2014-11-02 23:05 fs: Use non-const iov in aio_read/aio_write Herbert Xu @ 2014-11-03 0:16 ` Al Viro 2014-11-03 0:21 ` Al Viro 2014-11-03 0:22 ` Herbert Xu 0 siblings, 2 replies; 82+ messages in thread From: Al Viro @ 2014-11-03 0:16 UTC (permalink / raw) To: Herbert Xu Cc: David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise On Mon, Nov 03, 2014 at 07:05:52AM +0800, Herbert Xu wrote: > Currently the functions aio_read/aio_write use a const iov as > input. This is unnecessary as all their callers supply a > stack-based or kmalloced iov which is never reused. Conceptually > this is fine because iovs supplied to aio_read/aio_write ultimately > come from user-space so we always have to make a copy of them for > the kernel. > > This is also a joke because for as long (since 2.1.15) as we've > had the const iov, the network stack (currently through do_sock_read > and do_sock_write) has been casting the const away. IOW if anybody > did supply a const iov they would crash and burn if they ever > entered the network stack. > > The network stack needs a non-const iov because it iterates through > the iov as it reads/writes data. > > So we have two alternatives, either change the network stack to > not touch the iovs or make the iovs non-const. > > As there is no reason for the iovs to be const in the first place, > I have taken the second choice and changed all aio_read/aio_write > functions to use non-const iovs. NAK with extreme prejudice. The right way to deal with that is to convert the socket side of things to iov_iter. And give it a consistent behaviour, while we are at it (some protocols do advance the damn thing, some do not). There are _very_ good reasons to have those iovecs unchanged - if you look at the callers on the socket side, you'll see a bunch that has to _copy_ iovec just to avoid it being buggered. 
And you get rather suboptimal behaviour in memcpy_fromiovec() and friends, exactly because you have to skip through the emptied elements. IOW, no way in hell. ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: fs: Use non-const iov in aio_read/aio_write 2014-11-03 0:16 ` Al Viro @ 2014-11-03 0:21 ` Al Viro 2014-11-03 0:22 ` Herbert Xu 1 sibling, 0 replies; 82+ messages in thread From: Al Viro @ 2014-11-03 0:21 UTC (permalink / raw) To: Herbert Xu Cc: David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise On Mon, Nov 03, 2014 at 12:16:34AM +0000, Al Viro wrote: > NAK with extreme prejudice. The right way to deal with that is > to convert the socket side of things to iov_iter. And give it a > consistent behaviour, while we are at it (some protocols do advance > the damn thing, so do not). There are _very_ good reasons to have those > iovecs unchanged - if you look at the callers on the socket side, you'll > see a bunch that has to _copy_ iovec just to avoid it being buggered. > And you get rather suboptimal behaviour in memcpy_fromiovec() and friends, > exactly because you have to skip through the emptied elements. > > IOW, no way in hell. PS: I do have the beginning of that stuff sitting in the local queue since April; see http://marc.info/?l=linux-xfs&m=139179304710494&w=2 for the beginning of the story. ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: fs: Use non-const iov in aio_read/aio_write 2014-11-03 0:16 ` Al Viro 2014-11-03 0:21 ` Al Viro @ 2014-11-03 0:22 ` Herbert Xu 2014-11-03 0:45 ` Al Viro 1 sibling, 1 reply; 82+ messages in thread From: Herbert Xu @ 2014-11-03 0:22 UTC (permalink / raw) To: Al Viro Cc: David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise On Mon, Nov 03, 2014 at 12:16:34AM +0000, Al Viro wrote: > > NAK with extreme prejudice. The right way to deal with that is > to convert the socket side of things to iov_iter. And give it a > consistent behaviour, while we are at it (some protocols do advance > the damn thing, so do not). There are _very_ good reasons to have those > iovecs unchanged - if you look at the callers on the socket side, you'll > see a bunch that has to _copy_ iovec just to avoid it being buggered. > And you get rather suboptimal behaviour in memcpy_fromiovec() and friends, > exactly because you have to skip through the emptied elements. > > IOW, no way in hell. You're welcome to send patches to fix every spot in the network stack that writes to the iovec. But until the network stack is all fixed up, having a const struct iovec in aio_read/aio_write is a delusion. Cheers, -- Email: Herbert Xu <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: fs: Use non-const iov in aio_read/aio_write 2014-11-03 0:22 ` Herbert Xu @ 2014-11-03 0:45 ` Al Viro 2014-11-03 5:37 ` [0/3] net: Kill skb_copy_datagram_const_iovec Herbert Xu 0 siblings, 1 reply; 82+ messages in thread From: Al Viro @ 2014-11-03 0:45 UTC (permalink / raw) To: Herbert Xu Cc: David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise On Mon, Nov 03, 2014 at 08:22:07AM +0800, Herbert Xu wrote: > On Mon, Nov 03, 2014 at 12:16:34AM +0000, Al Viro wrote: > > > > NAK with extreme prejudice. The right way to deal with that is > > to convert the socket side of things to iov_iter. And give it a > > consistent behaviour, while we are at it (some protocols do advance > > the damn thing, so do not). There are _very_ good reasons to have those > > iovecs unchanged - if you look at the callers on the socket side, you'll > > see a bunch that has to _copy_ iovec just to avoid it being buggered. > > And you get rather suboptimal behaviour in memcpy_fromiovec() and friends, > > exactly because you have to skip through the emptied elements. > > > > IOW, no way in hell. > > You're welcome to send patches fix every spot in the network stack > that writes to the iovec. But until the network stack is all fixed > up, having a const struct iovec in aio_read/aio_write is a delusion. Check how many ->aio_read() and ->aio_write() instances are left. If you are implying that dealing with the ones in net/* is not feasible, I invite you to check the situation in fs/*, where we used to have quite a few. Compare it with what used to be there in e.g. January. Note, BTW, that there's a damn good reason to convert the socket side of things to iov_iter - as it is, ->splice_write() there is basically done with page-by-page mapping and doing kernel_sendmsg(); being able to deal with "map and copy" stuff *inside* ->sendmsg() would not only reduce the overhead, it would allow to get rid of ->sendpage() completely. 
Basically, let ->sendmsg() instances check the iov_iter type and play zerocopy games if it's an "array of kernel pages" kind. Compare ->sendpage() and ->sendmsg() instances for the protocols that have nontrivial ->sendpage(); you'll see that there's a lot of duplication. Merging them looks very feasible, with divergence happening only very deep in the call chain. ^ permalink raw reply [flat|nested] 82+ messages in thread
* [0/3] net: Kill skb_copy_datagram_const_iovec 2014-11-03 0:45 ` Al Viro @ 2014-11-03 5:37 ` Herbert Xu 2014-11-03 5:44 ` [PATCH 1/3] tun: Modify const aio_read iovec per do_sock_read Herbert Xu ` (3 more replies) 0 siblings, 4 replies; 82+ messages in thread From: Herbert Xu @ 2014-11-03 5:37 UTC (permalink / raw) To: Al Viro Cc: David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise On Mon, Nov 03, 2014 at 12:45:03AM +0000, Al Viro wrote: > > Note, BTW, that there's a damn good reason to convert the socket side of > things to iov_iter - as it is, ->splice_write() there is basically done with > page-by-page mapping and doing kernel_sendmsg(); being able to deal with > "map and copy" stuff *inside* ->sendmsg() would not only reduce the overhead, > it would allow to get rid of ->sendpage() completely. Basically, let > ->sendmsg() instances check the iov_iter type and play zerocopy games if > it's an "array of kernel pages" kind. Compare ->sendpage() and ->sendmsg() > instances for the protocols that have nontrivial ->sendpage(); you'll see > that there's a lot of duplication. Merging them looks very feasible, with > divergence happening only very deep in the call chain. Honestly I don't really care which way we end up going as long as we pick one solution and stick with it. Right now we have an abomination in the form of skb_copy_datagram_const_iovec which is the worst of both worlds, plus it duplicates tons of code. So here's a few patches to kill this crap. Cheers, -- Email: Herbert Xu <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 82+ messages in thread
* [PATCH 1/3] tun: Modify const aio_read iovec per do_sock_read 2014-11-03 5:37 ` [0/3] net: Kill skb_copy_datagram_const_iovec Herbert Xu @ 2014-11-03 5:44 ` Herbert Xu 2014-11-03 5:44 ` [PATCH 3/3] net: Kill skb_copy_datagram_const_iovec Herbert Xu ` (2 subsequent siblings) 3 siblings, 0 replies; 82+ messages in thread From: Herbert Xu @ 2014-11-03 5:44 UTC (permalink / raw) To: Al Viro, David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise I started working on this patch after discovering the horror of skb_copy_datagram_iovec and skb_copy_datagram_const_iovec. It's ridiculous to have two versions of the same thing. Especially when the reason they exist is because of a stupid disagreement between fs and net on how we should iterate over iovecs. To reiterate, fs wants to keep the iovecs themselves constant and use iterators to keep state while net is used to keeping the state within the iovecs. Without judging the merits of either approach, we should stick to one of them. And regardless of which one we end up picking, we can always kill skb_copy_datagram_const_iovec which is plain wrong as it starts from the very beginning of the iovec every single time. This patch uses the do_sock_read approach of casting the const away for the time being. If we end up going the other way we can trivially convert this over to using iterators. In the meantime this would at least allow us to kill skb_copy_datagram_const_iovec. 
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> --- drivers/net/tun.c | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 9dd3746..657f811 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -1230,11 +1230,11 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv, static ssize_t tun_put_user(struct tun_struct *tun, struct tun_file *tfile, struct sk_buff *skb, - const struct iovec *iv, int len) + struct iovec *iv, int len) { struct tun_pi pi = { 0, skb->protocol }; ssize_t total = 0; - int vlan_offset = 0, copied; + int vlan_offset = 0; int vlan_hlen = 0; int vnet_hdr_sz = 0; @@ -1244,16 +1244,18 @@ static ssize_t tun_put_user(struct tun_struct *tun, if (tun->flags & TUN_VNET_HDR) vnet_hdr_sz = tun->vnet_hdr_sz; + total = skb->len + vlan_hlen + vnet_hdr_sz; + if (!(tun->flags & TUN_NO_PI)) { if ((len -= sizeof(pi)) < 0) return -EINVAL; - if (len < skb->len + vlan_hlen + vnet_hdr_sz) { + if (len < total) { /* Packet will be striped */ pi.flags |= TUN_PKT_STRIP; } - if (memcpy_toiovecend(iv, (void *) &pi, 0, sizeof(pi))) + if (memcpy_toiovec(iv, (void *)&pi, sizeof(pi))) return -EFAULT; total += sizeof(pi); } @@ -1299,15 +1301,11 @@ static ssize_t tun_put_user(struct tun_struct *tun, gso.flags = VIRTIO_NET_HDR_F_DATA_VALID; } /* else everything is zero */ - if (unlikely(memcpy_toiovecend(iv, (void *)&gso, total, - sizeof(gso)))) + if (unlikely(memcpy_toiovec(iv, (void *)&gso, sizeof(gso)))) return -EFAULT; - total += vnet_hdr_sz; } - copied = total; len = min_t(int, skb->len + vlan_hlen, len); - total += skb->len + vlan_hlen; if (vlan_hlen) { int copy, ret; struct { @@ -1321,21 +1319,19 @@ static ssize_t tun_put_user(struct tun_struct *tun, vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto); copy = min_t(int, vlan_offset, len); - ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy); + ret = skb_copy_datagram_iovec(skb, 0, iv, 
copy); len -= copy; - copied += copy; if (ret || !len) goto done; copy = min_t(int, sizeof(veth), len); - ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy); + ret = memcpy_toiovec(iv, (void *)&veth, copy); len -= copy; - copied += copy; if (ret || !len) goto done; } - skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len); + skb_copy_datagram_iovec(skb, vlan_offset, iv, len); done: tun->dev->stats.tx_packets++; @@ -1345,7 +1341,7 @@ done: } static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, - const struct iovec *iv, ssize_t len, int noblock) + struct iovec *iv, ssize_t len, int noblock) { struct sk_buff *skb; ssize_t ret = 0; @@ -1387,7 +1383,7 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv, goto out; } - ret = tun_do_read(tun, tfile, iv, len, + ret = tun_do_read(tun, tfile, (struct iovec *)iv, len, file->f_flags & O_NONBLOCK); ret = min_t(ssize_t, ret, len); if (ret > 0) ^ permalink raw reply related [flat|nested] 82+ messages in thread
* [PATCH 3/3] net: Kill skb_copy_datagram_const_iovec 2014-11-03 5:37 ` [0/3] net: Kill skb_copy_datagram_const_iovec Herbert Xu 2014-11-03 5:44 ` [PATCH 1/3] tun: Modify const aio_read iovec per do_sock_read Herbert Xu @ 2014-11-03 5:44 ` Herbert Xu 2014-11-03 5:44 ` [PATCH 2/3] macvtap: Modify const aio_read iovec per do_sock_read Herbert Xu 2014-11-03 20:05 ` [0/3] net: Kill skb_copy_datagram_const_iovec David Miller 3 siblings, 0 replies; 82+ messages in thread From: Herbert Xu @ 2014-11-03 5:44 UTC (permalink / raw) To: Al Viro, David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise Now that both macvtap and tun are using skb_copy_datagram_iovec, we can kill the abomination that is skb_copy_datagram_const_iovec. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> --- include/linux/skbuff.h | 3 - net/core/datagram.c | 89 ------------------------------------------------- 2 files changed, 92 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6c8b6f6..d12c81b 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2638,9 +2638,6 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, int len); int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *frm, int offset, size_t count); -int skb_copy_datagram_const_iovec(const struct sk_buff *from, int offset, - const struct iovec *to, int to_offset, - int size); void skb_free_datagram(struct sock *sk, struct sk_buff *skb); void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb); int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags); diff --git a/net/core/datagram.c b/net/core/datagram.c index fdbc9a8..30e2ebd 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -393,95 +393,6 @@ fault: EXPORT_SYMBOL(skb_copy_datagram_iovec); /** - * skb_copy_datagram_const_iovec - Copy a datagram to an iovec. 
- * @skb: buffer to copy - * @offset: offset in the buffer to start copying from - * @to: io vector to copy to - * @to_offset: offset in the io vector to start copying to - * @len: amount of data to copy from buffer to iovec - * - * Returns 0 or -EFAULT. - * Note: the iovec is not modified during the copy. - */ -int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset, - const struct iovec *to, int to_offset, - int len) -{ - int start = skb_headlen(skb); - int i, copy = start - offset; - struct sk_buff *frag_iter; - - /* Copy header. */ - if (copy > 0) { - if (copy > len) - copy = len; - if (memcpy_toiovecend(to, skb->data + offset, to_offset, copy)) - goto fault; - if ((len -= copy) == 0) - return 0; - offset += copy; - to_offset += copy; - } - - /* Copy paged appendix. Hmm... why does this look so complicated? */ - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int end; - const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - WARN_ON(start > offset + len); - - end = start + skb_frag_size(frag); - if ((copy = end - offset) > 0) { - int err; - u8 *vaddr; - struct page *page = skb_frag_page(frag); - - if (copy > len) - copy = len; - vaddr = kmap(page); - err = memcpy_toiovecend(to, vaddr + frag->page_offset + - offset - start, to_offset, copy); - kunmap(page); - if (err) - goto fault; - if (!(len -= copy)) - return 0; - offset += copy; - to_offset += copy; - } - start = end; - } - - skb_walk_frags(skb, frag_iter) { - int end; - - WARN_ON(start > offset + len); - - end = start + frag_iter->len; - if ((copy = end - offset) > 0) { - if (copy > len) - copy = len; - if (skb_copy_datagram_const_iovec(frag_iter, - offset - start, - to, to_offset, - copy)) - goto fault; - if ((len -= copy) == 0) - return 0; - offset += copy; - to_offset += copy; - } - start = end; - } - if (!len) - return 0; - -fault: - return -EFAULT; -} -EXPORT_SYMBOL(skb_copy_datagram_const_iovec); - -/** * skb_copy_datagram_from_iovec - Copy a datagram from an iovec. 
* @skb: buffer to copy * @offset: offset in the buffer to start copying to ^ permalink raw reply related [flat|nested] 82+ messages in thread
* [PATCH 2/3] macvtap: Modify const aio_read iovec per do_sock_read 2014-11-03 5:37 ` [0/3] net: Kill skb_copy_datagram_const_iovec Herbert Xu 2014-11-03 5:44 ` [PATCH 1/3] tun: Modify const aio_read iovec per do_sock_read Herbert Xu 2014-11-03 5:44 ` [PATCH 3/3] net: Kill skb_copy_datagram_const_iovec Herbert Xu @ 2014-11-03 5:44 ` Herbert Xu 2014-11-03 20:05 ` [0/3] net: Kill skb_copy_datagram_const_iovec David Miller 3 siblings, 0 replies; 82+ messages in thread From: Herbert Xu @ 2014-11-03 5:44 UTC (permalink / raw) To: Al Viro, David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise I started working on this patch after discovering the horror of skb_copy_datagram_iovec and skb_copy_datagram_const_iovec. It's ridiculous to have two versions of the same thing. Especially when the reason they exist is because of a stupid disagreement between fs and net on how we should iterate over iovecs. To reiterate, fs wants to keep the iovecs themselves constant and use iterators to keep state while net is used to keeping the state within the iovecs. Without judging the merits of either approach, we should stick to one of them. And regardless of which one we end up picking, we can always kill skb_copy_datagram_const_iovec which is plain wrong as it starts from the very beginning of the iovec every single time. This patch uses the do_sock_read approach of casting the const away for the time being. If we end up going the other way we can trivially convert this over to using iterators. In the meantime this would at least allow us to kill skb_copy_datagram_const_iovec. 
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> --- drivers/net/macvtap.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 6f226de..d830e25 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -776,12 +776,12 @@ static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv, /* Put packet to the user space buffer */ static ssize_t macvtap_put_user(struct macvtap_queue *q, const struct sk_buff *skb, - const struct iovec *iv, int len) + struct iovec *iv, int len) { int ret; int vnet_hdr_len = 0; int vlan_offset = 0; - int copied, total; + int total; if (q->flags & IFF_VNET_HDR) { struct virtio_net_hdr vnet_hdr; @@ -791,10 +791,10 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q, macvtap_skb_to_vnet_hdr(skb, &vnet_hdr); - if (memcpy_toiovecend(iv, (void *)&vnet_hdr, 0, sizeof(vnet_hdr))) + if (memcpy_toiovec(iv, (void *)&vnet_hdr, sizeof(vnet_hdr))) return -EFAULT; } - total = copied = vnet_hdr_len; + total = vnet_hdr_len; total += skb->len; if (!vlan_tx_tag_present(skb)) @@ -813,28 +813,26 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q, total += VLAN_HLEN; copy = min_t(int, vlan_offset, len); - ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy); + ret = skb_copy_datagram_iovec(skb, 0, iv, copy); len -= copy; - copied += copy; if (ret || !len) goto done; copy = min_t(int, sizeof(veth), len); - ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy); + ret = memcpy_toiovec(iv, (void *)&veth, copy); len -= copy; - copied += copy; if (ret || !len) goto done; } - ret = skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len); + ret = skb_copy_datagram_iovec(skb, vlan_offset, iv, len); done: return ret ? 
ret : total; } static ssize_t macvtap_do_read(struct macvtap_queue *q, - const struct iovec *iv, unsigned long len, + struct iovec *iv, unsigned long len, int noblock) { DEFINE_WAIT(wait); @@ -884,7 +882,8 @@ static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv, goto out; } - ret = macvtap_do_read(q, iv, len, file->f_flags & O_NONBLOCK); + ret = macvtap_do_read(q, (struct iovec *)iv, len, + file->f_flags & O_NONBLOCK); ret = min_t(ssize_t, ret, len); if (ret > 0) iocb->ki_pos = ret; ^ permalink raw reply related [flat|nested] 82+ messages in thread
* Re: [0/3] net: Kill skb_copy_datagram_const_iovec 2014-11-03 5:37 ` [0/3] net: Kill skb_copy_datagram_const_iovec Herbert Xu ` (2 preceding siblings ...) 2014-11-03 5:44 ` [PATCH 2/3] macvtap: Modify const aio_read iovec per do_sock_read Herbert Xu @ 2014-11-03 20:05 ` David Miller 2014-11-04 3:38 ` Herbert Xu 2014-11-04 5:45 ` [0/3] " Al Viro 3 siblings, 2 replies; 82+ messages in thread From: David Miller @ 2014-11-03 20:05 UTC (permalink / raw) To: herbert; +Cc: viro, netdev, linux-kernel, bcrl From: Herbert Xu <herbert@gondor.apana.org.au> Date: Mon, 3 Nov 2014 13:37:51 +0800 > On Mon, Nov 03, 2014 at 12:45:03AM +0000, Al Viro wrote: >> >> Note, BTW, that there's a damn good reason to convert the socket side of >> things to iov_iter - as it is, ->splice_write() there is basically done with >> page-by-page mapping and doing kernel_sendmsg(); being able to deal with >> "map and copy" stuff *inside* ->sendmsg() would not only reduce the overhead, >> it would allow to get rid of ->sendpage() completely. Basically, let >> ->sendmsg() instances check the iov_iter type and play zerocopy games if >> it's an "array of kernel pages" kind. Compare ->sendpage() and ->sendmsg() >> instances for the protocols that have nontrivial ->sendpage(); you'll see >> that there's a lot of duplication. Merging them looks very feasible, with >> divergence happening only very deep in the call chain. > > Honestly I don't really care which way we end up going as long as > we pick one solution and stick with it. Right now we have an > abomination in the form of skb_copy_datagram_const_iovec which is > the worst of both worlds, plus it duplicates tons of code. > > So here's a few patches to kill this crap. To pick one direction and go with it, I totally agree with. But a patch set like this as an interim solution, I am not so happy with. If the method says const, we have a contract with the caller to not modify the iovec. That caller can assume that we have not done so. 
So this patch set violated that contract and can result in real bugs either now or in the future. I'll see if I can make some progress converting the networking over to iov_iter. It can't be that difficult... albeit perhaps a little time-consuming. ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [0/3] net: Kill skb_copy_datagram_const_iovec 2014-11-03 20:05 ` [0/3] net: Kill skb_copy_datagram_const_iovec David Miller @ 2014-11-04 3:38 ` Herbert Xu 2014-11-04 8:31 ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Herbert Xu ` (3 more replies) 2014-11-04 5:45 ` [0/3] " Al Viro 1 sibling, 4 replies; 82+ messages in thread From: Herbert Xu @ 2014-11-04 3:38 UTC (permalink / raw) To: David Miller; +Cc: viro, netdev, linux-kernel, bcrl On Mon, Nov 03, 2014 at 03:05:53PM -0500, David Miller wrote: > > I'll see if I can make some progress converting the networking over > to iov_iter. It can't be that difficult... albeit perhaps a little > time consuming. OK great. I'll try to convert tun/macvtap over to iov_iter. Thanks, -- Email: Herbert Xu <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 82+ messages in thread
* [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-04 3:38 ` Herbert Xu @ 2014-11-04 8:31 ` Herbert Xu 2014-11-04 14:32 ` Al Viro 2014-11-05 20:24 ` David Miller 2014-11-04 8:31 ` [PATCH 2/4] tun: Use iovec iterators Herbert Xu ` (2 subsequent siblings) 3 siblings, 2 replies; 82+ messages in thread From: Herbert Xu @ 2014-11-04 8:31 UTC (permalink / raw) To: Al Viro, David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise This patch adds skb_copy_datagram_iter, which is identical to skb_copy_datagram_iovec except that it operates on iov_iter instead of iovec. Eventually all users of skb_copy_datagram_iovec should switch over to iov_iter and then we can remove skb_copy_datagram_iovec. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> --- include/linux/skbuff.h | 3 + net/core/datagram.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6c8b6f6..5ff7054 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -148,6 +148,7 @@ struct net_device; struct scatterlist; struct pipe_inode_info; +struct iov_iter; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct nf_conntrack { @@ -2641,6 +2642,8 @@ int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *frm, int skb_copy_datagram_const_iovec(const struct sk_buff *from, int offset, const struct iovec *to, int to_offset, int size); +int skb_copy_datagram_iter(const struct sk_buff *from, int offset, + struct iov_iter *to, int size); void skb_free_datagram(struct sock *sk, struct sk_buff *skb); void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb); int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags); diff --git a/net/core/datagram.c b/net/core/datagram.c index fdbc9a8..45a9d4d 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -49,6 +49,7 @@ #include <linux/spinlock.h> #include <linux/slab.h> 
#include <linux/pagemap.h> +#include <linux/uio.h> #include <net/protocol.h> #include <linux/skbuff.h> @@ -482,6 +483,87 @@ fault: EXPORT_SYMBOL(skb_copy_datagram_const_iovec); /** + * skb_copy_datagram_iter - Copy a datagram to an iovec iterator. + * @skb: buffer to copy + * @offset: offset in the buffer to start copying from + * @to: iovec iterator to copy to + * @len: amount of data to copy from buffer to iovec + */ +int skb_copy_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + struct sk_buff *frag_iter; + + trace_skb_copy_datagram_iovec(skb, len); + + /* Copy header. */ + if (copy > 0) { + if (copy > len) + copy = len; + if (copy_to_iter(skb->data + offset, copy, to)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + } + + /* Copy paged appendix. Hmm... why does this look so complicated? */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + WARN_ON(start > offset + len); + + end = start + skb_frag_size(frag); + if ((copy = end - offset) > 0) { + int err; + u8 *vaddr; + struct page *page = skb_frag_page(frag); + + if (copy > len) + copy = len; + vaddr = kmap(page); + err = copy_to_iter(vaddr + frag->page_offset + + offset - start, copy, to); + kunmap(page); + if (err) + goto fault; + if (!(len -= copy)) + return 0; + offset += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (skb_copy_datagram_iter(frag_iter, offset - start, + to, copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + } + start = end; + } + if (!len) + return 0; + +fault: + return -EFAULT; +} +EXPORT_SYMBOL(skb_copy_datagram_iter); + +/** * skb_copy_datagram_from_iovec - Copy a datagram from an iovec. 
* @skb: buffer to copy * @offset: offset in the buffer to start copying to ^ permalink raw reply related [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-04 8:31 ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Herbert Xu @ 2014-11-04 14:32 ` Al Viro 2014-11-04 14:35 ` Al Viro 2014-11-04 14:42 ` Herbert Xu 2014-11-05 20:24 ` David Miller 1 sibling, 2 replies; 82+ messages in thread From: Al Viro @ 2014-11-04 14:32 UTC (permalink / raw) To: Herbert Xu Cc: David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise On Tue, Nov 04, 2014 at 04:31:34PM +0800, Herbert Xu wrote: > This patch adds skb_copy_datagram_iter, which is identical to > skb_copy_datagram_iovec except that it operates on iov_iter > instead of iovec. > > Eventually all users of skb_copy_datagram_iovec should switch > over to iov_iter and then we can remove skb_copy_datagram_iovec. Too noisy, IMO. How about skb_copy_datagram_msg() first? The fewer places have to even think of iovec or iov_iter, the better... ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-04 14:32 ` Al Viro @ 2014-11-04 14:35 ` Al Viro 2014-11-04 14:44 ` Herbert Xu 2014-11-04 14:42 ` Herbert Xu 1 sibling, 1 reply; 82+ messages in thread From: Al Viro @ 2014-11-04 14:35 UTC (permalink / raw) To: Herbert Xu Cc: David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise On Tue, Nov 04, 2014 at 02:32:00PM +0000, Al Viro wrote: > On Tue, Nov 04, 2014 at 04:31:34PM +0800, Herbert Xu wrote: > > This patch adds skb_copy_datagram_iter, which is identical to > > skb_copy_datagram_iovec except that it operates on iov_iter > > instead of iovec. > > > > Eventually all users of skb_copy_datagram_iovec should switch > > over to iov_iter and then we can remove skb_copy_datagram_iovec. > > Too noisy, IMO. How about skb_copy_datagram_msg() first? The fewer > places have to even think of iovec or iov_iter, the better... PS: "too noisy" is about turning every callsite of skb_copy_datagram_iovec into that of skb_copy_datagram_iter; the helper itself is just fine. ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-04 14:35 ` Al Viro @ 2014-11-04 14:44 ` Herbert Xu 2014-11-04 14:52 ` Al Viro 0 siblings, 1 reply; 82+ messages in thread From: Herbert Xu @ 2014-11-04 14:44 UTC (permalink / raw) To: Al Viro Cc: David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise On Tue, Nov 04, 2014 at 02:35:36PM +0000, Al Viro wrote: > On Tue, Nov 04, 2014 at 02:32:00PM +0000, Al Viro wrote: > > > Too noisy, IMO. How about skb_copy_datagram_msg() first? The fewer > > places have to even think of iovec or iov_iter, the better... > > PS: "too noisy" is about turning every callsite of skb_copy_datagram_iovec > into that of skb_copy_datagram_iter; the helper itself is just fine. Hmm if that is your concern then I don't see how skb_copy_datagram_msg changes things as you'd still have to convert every existing caller of skb_copy_datagram_iovec. Colour me confused. Cheers, -- Email: Herbert Xu <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-04 14:44 ` Herbert Xu @ 2014-11-04 14:52 ` Al Viro 2014-11-04 14:55 ` Herbert Xu 0 siblings, 1 reply; 82+ messages in thread From: Al Viro @ 2014-11-04 14:52 UTC (permalink / raw) To: Herbert Xu Cc: David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise On Tue, Nov 04, 2014 at 10:44:16PM +0800, Herbert Xu wrote: > On Tue, Nov 04, 2014 at 02:35:36PM +0000, Al Viro wrote: > > On Tue, Nov 04, 2014 at 02:32:00PM +0000, Al Viro wrote: > > > > > Too noisy, IMO. How about skb_copy_datagram_msg() first? The fewer > > > places have to even think of iovec or iov_iter, the better... > > > > PS: "too noisy" is about turning every callsite of skb_copy_datagram_iovec > > into that of skb_copy_datagram_iter; the helper itself is just fine. > > Hmm if that is your concern then I don't see how skb_copy_datagram_msg > changes things as you'd still have to convert every existing caller > of skb_copy_datagram_iovec. Colour me confused. Fewer places having to even think of iovec/iov_iter... ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-04 14:52 ` Al Viro @ 2014-11-04 14:55 ` Herbert Xu 0 siblings, 0 replies; 82+ messages in thread From: Herbert Xu @ 2014-11-04 14:55 UTC (permalink / raw) To: Al Viro Cc: David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise On Tue, Nov 04, 2014 at 02:52:22PM +0000, Al Viro wrote: > > > Hmm if that is your concern then I don't see how skb_copy_datagram_msg > > changes things as you'd still have to convert every existing caller > > of skb_copy_datagram_iovec. Colour me confused. > > Fewer places having to even think of iovec/iov_iter... Well it's the difference between skb_copy_datagram_iter(..., &kmsghdr->iov_iter, ...) and skb_copy_datagram_msg(..., kmsghdr, ...) Heck we could even make skb_copy_datagram_msg an inline wrapper around skb_copy_datagram_iter if you like. Anyway, the point is that not everything comes with a kmsghdr. Cheers, -- Email: Herbert Xu <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-04 14:32 ` Al Viro 2014-11-04 14:35 ` Al Viro @ 2014-11-04 14:42 ` Herbert Xu 2014-11-04 15:13 ` Al Viro 1 sibling, 1 reply; 82+ messages in thread From: Herbert Xu @ 2014-11-04 14:42 UTC (permalink / raw) To: Al Viro Cc: David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise On Tue, Nov 04, 2014 at 02:32:00PM +0000, Al Viro wrote: > > Too noisy, IMO. How about skb_copy_datagram_msg() first? The fewer > places have to even think of iovec or iov_iter, the better... We have places like tcp ucopy and tun that do not have msghdr. So doing skb_copy_datagram_msg means that we'd have to create a fake msghdr wrapper around them. The point is not everything comes in via sendmsg/recvmsg. What is your motivation for hiding iov/iov_iter? Do you plan to change their API at some future point? Cheers, -- Email: Herbert Xu <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-04 14:42 ` Herbert Xu @ 2014-11-04 15:13 ` Al Viro 2014-11-05 2:22 ` Herbert Xu 0 siblings, 1 reply; 82+ messages in thread From: Al Viro @ 2014-11-04 15:13 UTC (permalink / raw) To: Herbert Xu Cc: David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise On Tue, Nov 04, 2014 at 10:42:58PM +0800, Herbert Xu wrote: > On Tue, Nov 04, 2014 at 02:32:00PM +0000, Al Viro wrote: > > > > Too noisy, IMO. How about skb_copy_datagram_msg() first? The fewer > > places have to even think of iovec or iov_iter, the better... > > We have places like tcp ucopy and tun that do not have msghdr. > So doing skb_copy_datagram_msg means that we'd have to create > a fake msghdr wrapper around them. The point is not everything > comes in via sendmsg/recvmsg. I'm certainly not suggesting it as a primitive. > What is your motivation for hiding iov/iov_iter? Do you plan to > change their API at some future point? Think of it that way: every sendmsg/recvmsg path leading to memcpy_fromiovec and its friends (including the open-coded ones) would need to be changed at some point. Assuming we do not end up passing struct iov_iter * as an extra argument through a fairly large part of net/* (and that would be prohibitively hard and messy, not to mention the effects on the stack footprint, etc.), the most obvious strategy is to have that thing passed where msg_iov/msg_iovlen are - in struct msghdr. *IF* we go that way, it makes a whole lot of sense to start with a bunch of cleanups that will make sense on their own (most of callers of skb_copy_datagram_iovec do look like skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); might as well give it an inlined helper) and will reduce the amount of places where ->msg_iov is used. With such cleanups standing on their own and being splittable from the rest of the queue. And leaving us with fewer places in code that deal with ->msg_iov and need to be dealt with. 
Please, look through my yesterday posting upthread. Outline of the proposed strategy is there... FWIW, this is from the beginning of April queue - rebased to current, but very likely incomplete. Variant taking iov_iter would come later and yes, it would replace the ..._iovec() one as primitive. With much fewer places to worry about. commit 8241142acab3451239029085286b717ca30aac33 Author: Al Viro <viro@zeniv.linux.org.uk> Date: Sun Apr 6 18:41:28 2014 -0400 new helper: skb_copy_datagram_msg() Absolute majority of skb_copy_datagram_iovec() callers (49 out of 56) are passing it msg->msg_iov as iovec. Provide a trivial wrapper that takes msg as argument instead of iovec. Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c index 1be8228..dcbd858 100644 --- a/drivers/isdn/mISDN/socket.c +++ b/drivers/isdn/mISDN/socket.c @@ -163,7 +163,7 @@ mISDN_sock_recvmsg(struct kiocb *iocb, struct socket *sock, memcpy(skb_push(skb, MISDN_HEADER_LEN), mISDN_HEAD_P(skb), MISDN_HEADER_LEN); - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); mISDN_sock_cmsg(sk, msg, skb); diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index 6c9c16d..443cbbf 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -981,7 +981,7 @@ static int pppoe_recvmsg(struct kiocb *iocb, struct socket *sock, if (skb) { total_len = min_t(size_t, total_len, skb->len); - error = skb_copy_datagram_iovec(skb, 0, m->msg_iov, total_len); + error = skb_copy_datagram_msg(skb, 0, m, total_len); if (error == 0) { consume_skb(skb); return total_len; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 6c8b6f6..379ab46 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2662,6 +2662,12 @@ unsigned int skb_gso_transport_seglen(const struct sk_buff *skb); struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features); struct 
sk_buff *skb_vlan_untag(struct sk_buff *skb); +static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset, + struct msghdr *msg, int size) +{ + return skb_copy_datagram_iovec(from, offset, msg->msg_iov, size); +} + struct skb_checksum_ops { __wsum (*update)(const void *mem, int len, __wsum wsum); __wsum (*combine)(__wsum csum, __wsum csum2, int offset, int len); diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index c00897f..425942d 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1758,7 +1758,7 @@ static int atalk_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr copied = size; msg->msg_flags |= MSG_TRUNC; } - err = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, offset, msg, copied); if (!err && msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_at *, sat, msg->msg_name); diff --git a/net/atm/common.c b/net/atm/common.c index 6a76515..9cd1cca 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -554,7 +554,7 @@ int vcc_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, msg->msg_flags |= MSG_TRUNC; } - error = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + error = skb_copy_datagram_msg(skb, 0, msg, copied); if (error) return error; sock_recv_ts_and_drops(msg, sk, skb); diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index c35c3f4..f4f835e19 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1634,7 +1634,7 @@ static int ax25_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_flags |= MSG_TRUNC; } - skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + skb_copy_datagram_msg(skb, 0, msg, copied); if (msg->msg_name) { ax25_digi digi; diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 339c74a..0a7cc56 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -237,7 +237,7 @@ int bt_sock_recvmsg(struct kiocb *iocb, struct socket *sock, } 
skb_reset_transport_header(skb); - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err == 0) { sock_recv_ts_and_drops(msg, sk, skb); @@ -328,7 +328,7 @@ int bt_sock_stream_recvmsg(struct kiocb *iocb, struct socket *sock, } chunk = min_t(unsigned int, skb->len, size); - if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, chunk)) { + if (skb_copy_datagram_msg(skb, 0, msg, chunk)) { skb_queue_head(&sk->sk_receive_queue, skb); if (!copied) copied = -EFAULT; diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 115f149..29e1ec7 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -878,7 +878,7 @@ static int hci_sock_recvmsg(struct kiocb *iocb, struct socket *sock, } skb_reset_transport_header(skb); - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); switch (hci_pi(sk)->channel) { case HCI_CHANNEL_RAW: diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 43f750e..fbcd156 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -293,7 +293,7 @@ static int caif_seqpkt_recvmsg(struct kiocb *iocb, struct socket *sock, copylen = len; } - ret = skb_copy_datagram_iovec(skb, 0, m->msg_iov, copylen); + ret = skb_copy_datagram_msg(skb, 0, m, copylen); if (ret) goto out_free; diff --git a/net/core/sock.c b/net/core/sock.c index 15e0c67..ac56dd0 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2457,7 +2457,7 @@ int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, msg->msg_flags |= MSG_TRUNC; copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free_skb; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 5ab6627..8e6ae94 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -896,7 +896,7 @@ verify_sock_status: else if (len < skb->len) msg->msg_flags |= MSG_TRUNC; - if 
(skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len)) { + if (skb_copy_datagram_msg(skb, 0, msg, len)) { /* Exception. Bailout! */ len = -EFAULT; break; diff --git a/net/ieee802154/dgram.c b/net/ieee802154/dgram.c index ef2ad8a..fc9193e 100644 --- a/net/ieee802154/dgram.c +++ b/net/ieee802154/dgram.c @@ -324,7 +324,7 @@ static int dgram_recvmsg(struct kiocb *iocb, struct sock *sk, } /* FIXME: skip headers if necessary ?! */ - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; diff --git a/net/ieee802154/raw.c b/net/ieee802154/raw.c index 9d1f648..73a4d53 100644 --- a/net/ieee802154/raw.c +++ b/net/ieee802154/raw.c @@ -195,7 +195,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index c373a9a..21894df 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -424,7 +424,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) msg->msg_flags |= MSG_TRUNC; copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free_skb; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 57f7c98..736236c 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -875,7 +875,7 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } /* Don't bother checking the checksum */ - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 739db31..ee8fa4b 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -718,7 +718,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, 
copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 39ec0c3..c239f47 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1377,7 +1377,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) /* XXX -- need to support SO_PEEK_OFF */ skb_queue_walk(&sk->sk_write_queue, skb) { - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len); + err = skb_copy_datagram_msg(skb, 0, msg, skb->len); if (err) break; @@ -1833,8 +1833,7 @@ do_prequeue: } if (!(flags & MSG_TRUNC)) { - err = skb_copy_datagram_iovec(skb, offset, - msg->msg_iov, used); + err = skb_copy_datagram_msg(skb, offset, msg, used); if (err) { /* Exception. Bailout! */ if (!copied) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cd0db54..d7266f7 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1281,8 +1281,8 @@ try_again: } if (skb_csum_unnecessary(skb)) - err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), - msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, sizeof(struct udphdr), + msg, copied); else { err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 2cdc383..5c6996e 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -351,7 +351,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) msg->msg_flags |= MSG_TRUNC; copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free_skb; @@ -445,7 +445,7 @@ int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len, msg->msg_flags |= MSG_TRUNC; copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free_skb; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 
896af88..f642598 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -486,11 +486,11 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk, } if (skb_csum_unnecessary(skb)) { - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); } else if (msg->msg_flags&MSG_TRUNC) { if (__skb_checksum_complete(skb)) goto csum_copy_err; - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); } else { err = skb_copy_and_csum_datagram_iovec(skb, 0, msg->msg_iov); if (err == -EINVAL) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index f6ba535..5f68cd72 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -424,8 +424,8 @@ try_again: } if (skb_csum_unnecessary(skb)) - err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), - msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, sizeof(struct udphdr), + msg, copied); else { err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov); if (err == -EINVAL) diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 91729b8..8b7ca1c 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -1805,7 +1805,7 @@ static int ipx_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_flags |= MSG_TRUNC; } - rc = skb_copy_datagram_iovec(skb, sizeof(struct ipxhdr), msg->msg_iov, + rc = skb_copy_datagram_msg(skb, sizeof(struct ipxhdr), msg, copied); if (rc) goto out_free; diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 3f3a6cb..3f1a37b 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -1394,7 +1394,7 @@ static int irda_recvmsg_dgram(struct kiocb *iocb, struct socket *sock, copied = size; msg->msg_flags |= MSG_TRUNC; } - skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + skb_copy_datagram_msg(skb, 0, msg, copied); skb_free_datagram(sk, skb); diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index a089b6b..057b564 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ 
-1355,7 +1355,7 @@ static int iucv_sock_recvmsg(struct kiocb *iocb, struct socket *sock, sk->sk_shutdown = sk->sk_shutdown | RCV_SHUTDOWN; cskb = skb; - if (skb_copy_datagram_iovec(cskb, offset, msg->msg_iov, copied)) { + if (skb_copy_datagram_msg(cskb, offset, msg, copied)) { if (!(flags & MSG_PEEK)) skb_queue_head(&sk->sk_receive_queue, skb); return -EFAULT; diff --git a/net/key/af_key.c b/net/key/af_key.c index 1847ec4..e588309 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3654,7 +3654,7 @@ static int pfkey_recvmsg(struct kiocb *kiocb, } skb_reset_transport_header(skb); - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free; diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 369a982..a6cc1fe 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -528,7 +528,7 @@ static int l2tp_ip_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 0edb263..2177b96 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -672,7 +672,7 @@ static int l2tp_ip6_recvmsg(struct kiocb *iocb, struct sock *sk, copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index b704a93..c559bcd 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -208,7 +208,7 @@ static int pppol2tp_recvmsg(struct kiocb *iocb, struct socket *sock, else if (len < skb->len) msg->msg_flags |= MSG_TRUNC; - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len); + err = skb_copy_datagram_msg(skb, 0, msg, len); if (likely(err == 0)) err = len; diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index bb9cbc1..8fa230b 100644 --- 
a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -819,8 +819,8 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, used = len; if (!(flags & MSG_TRUNC)) { - int rc = skb_copy_datagram_iovec(skb, offset, - msg->msg_iov, used); + int rc = skb_copy_datagram_msg(skb, offset, + msg, used); if (rc) { /* Exception. Bailout! */ if (!copied) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index f1de72d..580b794 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2401,7 +2401,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, } skb_reset_transport_header(data_skb); - err = skb_copy_datagram_iovec(data_skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(data_skb, 0, msg, copied); if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name); diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 1b06a1f..7e13f6a 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -1167,7 +1167,7 @@ static int nr_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_flags |= MSG_TRUNC; } - er = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + er = skb_copy_datagram_msg(skb, 0, msg, copied); if (er < 0) { skb_free_datagram(sk, skb); release_sock(sk); diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 51f077a..83bc785 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -832,7 +832,7 @@ static int llcp_sock_recvmsg(struct kiocb *iocb, struct socket *sock, copied = min_t(unsigned int, rlen, len); cskb = skb; - if (skb_copy_datagram_iovec(cskb, 0, msg->msg_iov, copied)) { + if (skb_copy_datagram_msg(cskb, 0, msg, copied)) { if (!(flags & MSG_PEEK)) skb_queue_head(&sk->sk_receive_queue, skb); return -EFAULT; diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index 11c3544..9d7d2b7 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -269,7 +269,7 @@ static int rawsock_recvmsg(struct kiocb *iocb, struct socket *sock, copied = len; } - 
rc = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + rc = skb_copy_datagram_msg(skb, 0, msg, copied); skb_free_datagram(sk, skb); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 87d20f4..4cd13d8 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2953,7 +2953,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_flags |= MSG_TRUNC; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free; diff --git a/net/phonet/datagram.c b/net/phonet/datagram.c index 290352c..0918bc2 100644 --- a/net/phonet/datagram.c +++ b/net/phonet/datagram.c @@ -150,7 +150,7 @@ static int pn_recvmsg(struct kiocb *iocb, struct sock *sk, copylen = len; } - rval = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copylen); + rval = skb_copy_datagram_msg(skb, 0, msg, copylen); if (rval) { rval = -EFAULT; goto out; diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 70a547e..44b2123 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -1296,7 +1296,7 @@ copy: else len = skb->len; - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len); + err = skb_copy_datagram_msg(skb, 0, msg, len); if (!err) err = (flags & MSG_TRUNC) ? 
skb->len : len; diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index a85c1a0..9b600c2 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -1249,7 +1249,7 @@ static int rose_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_flags |= MSG_TRUNC; } - skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + skb_copy_datagram_msg(skb, 0, msg, copied); if (msg->msg_name) { struct sockaddr_rose *srose; diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c index e9aaa65..4575485 100644 --- a/net/rxrpc/ar-recvmsg.c +++ b/net/rxrpc/ar-recvmsg.c @@ -180,7 +180,7 @@ int rxrpc_recvmsg(struct kiocb *iocb, struct socket *sock, if (copy > len - copied) copy = len - copied; - ret = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, copy); + ret = skb_copy_datagram_msg(skb, offset, msg, copy); if (ret < 0) goto copy_error; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 634a2ab..2120292 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2095,7 +2095,7 @@ static int sctp_recvmsg(struct kiocb *iocb, struct sock *sk, if (copied > len) copied = len; - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); event = sctp_skb2event(skb); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 51bddc2..f726eaa 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1372,8 +1372,8 @@ restart: sz = buf_len; m->msg_flags |= MSG_TRUNC; } - res = skb_copy_datagram_iovec(buf, msg_hdr_sz(msg), - m->msg_iov, sz); + res = skb_copy_datagram_msg(buf, msg_hdr_sz(msg), + m, sz); if (res) goto exit; res = sz; @@ -1473,8 +1473,8 @@ restart: needed = (buf_len - sz_copied); sz_to_copy = (sz <= needed) ? 
sz : needed; - res = skb_copy_datagram_iovec(buf, msg_hdr_sz(msg) + offset, - m->msg_iov, sz_to_copy); + res = skb_copy_datagram_msg(buf, msg_hdr_sz(msg) + offset, + m, sz_to_copy); if (res) goto exit; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e968843..350771a 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1825,7 +1825,7 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, else if (size < skb->len - skip) msg->msg_flags |= MSG_TRUNC; - err = skb_copy_datagram_iovec(skb, skip, msg->msg_iov, size); + err = skb_copy_datagram_msg(skb, skip, msg, size); if (err) goto out_free; @@ -2030,8 +2030,8 @@ again: } chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); - if (skb_copy_datagram_iovec(skb, UNIXCB(skb).consumed + skip, - msg->msg_iov, chunk)) { + if (skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, + msg, chunk)) { if (copied == 0) copied = -EFAULT; break; diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 9bb63ff..a57ddef 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -1773,8 +1773,7 @@ static int vmci_transport_dgram_dequeue(struct kiocb *kiocb, } /* Place the datagram payload in the user's iovec. */ - err = skb_copy_datagram_iovec(skb, sizeof(*dg), msg->msg_iov, - payload_len); + err = skb_copy_datagram_msg(skb, sizeof(*dg), msg, payload_len); if (err) goto out; diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 5ad4418..59e785b 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -1335,7 +1335,7 @@ static int x25_recvmsg(struct kiocb *iocb, struct socket *sock, /* Currently, each datagram always contains a complete record */ msg->msg_flags |= MSG_EOR; - rc = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + rc = skb_copy_datagram_msg(skb, 0, msg, copied); if (rc) goto out_free_dgram; ^ permalink raw reply related [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-04 15:13 ` Al Viro @ 2014-11-05 2:22 ` Herbert Xu 2014-11-05 3:27 ` David Miller 0 siblings, 1 reply; 82+ messages in thread From: Herbert Xu @ 2014-11-05 2:22 UTC (permalink / raw) To: Al Viro Cc: David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise On Tue, Nov 04, 2014 at 03:13:55PM +0000, Al Viro wrote: > > Think of it that way: every sendmsg/recvmsg path leading to memcpy_fromiovec > and its friends (including the open-coded ones) would need to be changed > at some point. Assuming we do not end up passing struct iov_iter * as > an extra argument through a fairly large part of net/* (and that would > be prohibitively hard and messy, not to mention the effects on the stack > footprint, etc.), the most obvious strategy is to have that thing passed > where msg_iov/msg_iovlen are - in struct msghdr. *IF* we go that way, > it makes a whole lot of sense to start with a bunch of cleanups that > will make sense on their own (most of callers of skb_copy_datagram_iovec > do look like skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); might > as well give it an inlined helper) and will reduce the amount of places > where ->msg_iov is used. With such cleanups standing on their own and > being splittable from the rest of the queue. And leaving us with fewer > places in code that deal with ->msg_iov and need to be dealt with. I think your solution is great. However, I don't see how my four patches impede in any way the work that you're doing. I presume your first patch will make skb_copy_datagram_msg just a wrapper around skb_copy_datagram_iovec. Since I'm not removing skb_copy_datagram_iovec (it can't be removed until all users are gone) you can still do that and when you're ready to switch over to iov_iter you can just move the wrapper over to skb_copy_datagram_iter. No?
Cheers, -- Email: Herbert Xu <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-05 2:22 ` Herbert Xu @ 2014-11-05 3:27 ` David Miller 2014-11-05 3:55 ` Al Viro 0 siblings, 1 reply; 82+ messages in thread From: David Miller @ 2014-11-05 3:27 UTC (permalink / raw) To: herbert; +Cc: viro, netdev, linux-kernel, bcrl From: Herbert Xu <herbert@gondor.apana.org.au> Date: Wed, 5 Nov 2014 10:22:51 +0800 > On Tue, Nov 04, 2014 at 03:13:55PM +0000, Al Viro wrote: >> >> Think of it that way: every sendmsg/recvmsg path leading to memcpy_fromiovec >> and its friends (including the open-coded ones) would need to be changed >> at some point. Assuming we do not end up passing struct iov_iter * as >> an extra argument through a fairly large part of net/* (and that would >> be prohibitively hard and messy, not to mention the effects on the stack >> footprint, etc.), the most obvious strategy is to have that thing passed >> where msg_iov/msg_iovlen are - in struct msghdr. *IF* we go that way, >> it makes a whole lot of sense to start with a bunch of cleanups that >> will make sense on their own (most of callers of skb_copy_datagram_iovec >> do look like skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); might >> as well give it an inlined helper) and will reduce the amount of places >> where ->msg_iov is used. With such cleanups standing on their own and >> being splittable from the rest of the queue. And leaving us with fewer >> places in code that deal with ->msg_iov and need to be dealt with. > > I think your solution is great. However, I don't see how my four > patches impede in anyway the work that you're doing. I presume > your first patch will make skb_copy_datagram_msg just a wrapper > around skb_copy_datagram_iovec. > > Since I'm not removing skb_copy_datagram_iovec (it can't be removed > until all users are gone) you can still do that and when you're > ready to switch over to iov_iter you can just move the wrapper over > to skb_copy_datagram_iter. No? 
Agreed, I think both efforts can proceed in parallel. Al, is this the helper you are talking about? diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c index 1be8228..a08057d 100644 --- a/drivers/isdn/mISDN/socket.c +++ b/drivers/isdn/mISDN/socket.c @@ -163,7 +163,7 @@ mISDN_sock_recvmsg(struct kiocb *iocb, struct socket *sock, memcpy(skb_push(skb, MISDN_HEADER_LEN), mISDN_HEAD_P(skb), MISDN_HEADER_LEN); - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msghdr(skb, msg, copied); mISDN_sock_cmsg(sk, msg, skb); diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index 6c9c16d..25234d9 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -981,7 +981,7 @@ static int pppoe_recvmsg(struct kiocb *iocb, struct socket *sock, if (skb) { total_len = min_t(size_t, total_len, skb->len); - error = skb_copy_datagram_iovec(skb, 0, m->msg_iov, total_len); + error = skb_copy_datagram_msghdr(skb, m, total_len); if (error == 0) { consume_skb(skb); return total_len; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5ad9675..19fe8cc 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -17,6 +17,7 @@ #include <linux/kernel.h> #include <linux/kmemcheck.h> #include <linux/compiler.h> +#include <linux/socket.h> #include <linux/time.h> #include <linux/bug.h> #include <linux/cache.h> @@ -2637,6 +2638,11 @@ unsigned int datagram_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int skb_copy_datagram_iovec(const struct sk_buff *from, int offset, struct iovec *to, int size); +static inline int skb_copy_datagram_msghdr(const struct sk_buff *from, + struct msghdr *msg, int size) +{ + return skb_copy_datagram_iovec(from, 0, msg->msg_iov, size); +} int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, int hlen, struct iovec *iov); int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, diff --git a/net/atm/common.c b/net/atm/common.c index 
6a76515..7e42bbe 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -554,7 +554,7 @@ int vcc_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, msg->msg_flags |= MSG_TRUNC; } - error = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + error = skb_copy_datagram_msghdr(skb, msg, copied); if (error) return error; sock_recv_ts_and_drops(msg, sk, skb); diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index c35c3f4..a91075c 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1634,7 +1634,7 @@ static int ax25_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_flags |= MSG_TRUNC; } - skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + skb_copy_datagram_msghdr(skb, msg, copied); if (msg->msg_name) { ax25_digi digi; diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 339c74a..a68dd75 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -237,7 +237,7 @@ int bt_sock_recvmsg(struct kiocb *iocb, struct socket *sock, } skb_reset_transport_header(skb); - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msghdr(skb, msg, copied); if (err == 0) { sock_recv_ts_and_drops(msg, sk, skb); @@ -328,7 +328,7 @@ int bt_sock_stream_recvmsg(struct kiocb *iocb, struct socket *sock, } chunk = min_t(unsigned int, skb->len, size); - if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, chunk)) { + if (skb_copy_datagram_msghdr(skb, msg, chunk)) { skb_queue_head(&sk->sk_receive_queue, skb); if (!copied) copied = -EFAULT; diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 115f149..45d4fba 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -878,7 +878,7 @@ static int hci_sock_recvmsg(struct kiocb *iocb, struct socket *sock, } skb_reset_transport_header(skb); - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msghdr(skb, msg, copied); switch (hci_pi(sk)->channel) { case 
HCI_CHANNEL_RAW: diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 43f750e..67e63b6 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -293,7 +293,7 @@ static int caif_seqpkt_recvmsg(struct kiocb *iocb, struct socket *sock, copylen = len; } - ret = skb_copy_datagram_iovec(skb, 0, m->msg_iov, copylen); + ret = skb_copy_datagram_msghdr(skb, m, copylen); if (ret) goto out_free; diff --git a/net/core/sock.c b/net/core/sock.c index 15e0c67..220c791 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2457,7 +2457,7 @@ int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, msg->msg_flags |= MSG_TRUNC; copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msghdr(skb, msg, copied); if (err) goto out_free_skb; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 5ab6627..7ccf58f 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -896,7 +896,7 @@ verify_sock_status: else if (len < skb->len) msg->msg_flags |= MSG_TRUNC; - if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len)) { + if (skb_copy_datagram_msghdr(skb, msg, len)) { /* Exception. Bailout! */ len = -EFAULT; break; diff --git a/net/ieee802154/dgram.c b/net/ieee802154/dgram.c index ef2ad8a..7017055 100644 --- a/net/ieee802154/dgram.c +++ b/net/ieee802154/dgram.c @@ -324,7 +324,7 @@ static int dgram_recvmsg(struct kiocb *iocb, struct sock *sk, } /* FIXME: skip headers if necessary ?! 
*/ - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msghdr(skb, msg, copied); if (err) goto done; diff --git a/net/ieee802154/raw.c b/net/ieee802154/raw.c index 9d1f648..5dd893a 100644 --- a/net/ieee802154/raw.c +++ b/net/ieee802154/raw.c @@ -195,7 +195,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msghdr(skb, msg, copied); if (err) goto done; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index c373a9a..d643882 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -424,7 +424,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) msg->msg_flags |= MSG_TRUNC; copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msghdr(skb, msg, copied); if (err) goto out_free_skb; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 57f7c98..a6e0197 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -875,7 +875,7 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } /* Don't bother checking the checksum */ - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msghdr(skb, msg, copied); if (err) goto done; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 739db31..2f4bb30 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -718,7 +718,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msghdr(skb, msg, copied); if (err) goto done; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 39ec0c3..3638679 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1377,7 +1377,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) /* XXX -- need to support SO_PEEK_OFF */ 
skb_queue_walk(&sk->sk_write_queue, skb) { - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len); + err = skb_copy_datagram_msghdr(skb, msg, skb->len); if (err) break; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 2cdc383..4bd84fd 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -351,7 +351,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) msg->msg_flags |= MSG_TRUNC; copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msghdr(skb, msg, copied); if (err) goto out_free_skb; @@ -445,7 +445,7 @@ int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len, msg->msg_flags |= MSG_TRUNC; copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msghdr(skb, msg, copied); if (err) goto out_free_skb; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 075a0fb..5d37aa1 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -486,11 +486,11 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk, } if (skb_csum_unnecessary(skb)) { - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msghdr(skb, msg, copied); } else if (msg->msg_flags&MSG_TRUNC) { if (__skb_checksum_complete(skb)) goto csum_copy_err; - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msghdr(skb, msg, copied); } else { err = skb_copy_and_csum_datagram_iovec(skb, 0, msg->msg_iov); if (err == -EINVAL) diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 92fafd4..54db6dc 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -1396,7 +1396,7 @@ static int irda_recvmsg_dgram(struct kiocb *iocb, struct socket *sock, copied = size; msg->msg_flags |= MSG_TRUNC; } - skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + skb_copy_datagram_msghdr(skb, msg, copied); skb_free_datagram(sk, skb); diff --git a/net/key/af_key.c b/net/key/af_key.c index 
1847ec4..f09a848 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3654,7 +3654,7 @@ static int pfkey_recvmsg(struct kiocb *kiocb, } skb_reset_transport_header(skb); - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msghdr(skb, msg, copied); if (err) goto out_free; diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 369a982..2913198 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -528,7 +528,7 @@ static int l2tp_ip_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msghdr(skb, msg, copied); if (err) goto done; diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 0edb263..8613881 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -672,7 +672,7 @@ static int l2tp_ip6_recvmsg(struct kiocb *iocb, struct sock *sk, copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msghdr(skb, msg, copied); if (err) goto done; diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index b704a93..5f3c0f5 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -208,7 +208,7 @@ static int pppol2tp_recvmsg(struct kiocb *iocb, struct socket *sock, else if (len < skb->len) msg->msg_flags |= MSG_TRUNC; - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len); + err = skb_copy_datagram_msghdr(skb, msg, len); if (likely(err == 0)) err = len; diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index f1de72d..123cffd 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2401,7 +2401,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, } skb_reset_transport_header(data_skb); - err = skb_copy_datagram_iovec(data_skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msghdr(data_skb, msg, copied); if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name); diff --git 
a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 1b06a1f..6bc3556 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -1167,7 +1167,7 @@ static int nr_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_flags |= MSG_TRUNC; } - er = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + er = skb_copy_datagram_msghdr(skb, msg, copied); if (er < 0) { skb_free_datagram(sk, skb); release_sock(sk); diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 51f077a..e962c07 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -832,7 +832,7 @@ static int llcp_sock_recvmsg(struct kiocb *iocb, struct socket *sock, copied = min_t(unsigned int, rlen, len); cskb = skb; - if (skb_copy_datagram_iovec(cskb, 0, msg->msg_iov, copied)) { + if (skb_copy_datagram_msghdr(cskb, msg, copied)) { if (!(flags & MSG_PEEK)) skb_queue_head(&sk->sk_receive_queue, skb); return -EFAULT; diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index 11c3544..4467b2c 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -269,7 +269,7 @@ static int rawsock_recvmsg(struct kiocb *iocb, struct socket *sock, copied = len; } - rc = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + rc = skb_copy_datagram_msghdr(skb, msg, copied); skb_free_datagram(sk, skb); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 87d20f4..390b776 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2953,7 +2953,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_flags |= MSG_TRUNC; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msghdr(skb, msg, copied); if (err) goto out_free; diff --git a/net/phonet/datagram.c b/net/phonet/datagram.c index 290352c..b4835bc 100644 --- a/net/phonet/datagram.c +++ b/net/phonet/datagram.c @@ -150,7 +150,7 @@ static int pn_recvmsg(struct kiocb *iocb, struct sock *sk, copylen = len; } - rval = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copylen); + 
rval = skb_copy_datagram_msghdr(skb, msg, copylen); if (rval) { rval = -EFAULT; goto out; diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 70a547e..f544658 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -1296,7 +1296,7 @@ copy: else len = skb->len; - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len); + err = skb_copy_datagram_msghdr(skb, msg, len); if (!err) err = (flags & MSG_TRUNC) ? skb->len : len; diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index a85c1a0..b660504 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -1249,7 +1249,7 @@ static int rose_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_flags |= MSG_TRUNC; } - skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + skb_copy_datagram_msghdr(skb, msg, copied); if (msg->msg_name) { struct sockaddr_rose *srose; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 634a2ab..0fca34c 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2095,7 +2095,7 @@ static int sctp_recvmsg(struct kiocb *iocb, struct sock *sk, if (copied > len) copied = len; - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msghdr(skb, msg, copied); event = sctp_skb2event(skb); diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 5ad4418..fad0702 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -1335,7 +1335,7 @@ static int x25_recvmsg(struct kiocb *iocb, struct socket *sock, /* Currently, each datagram always contains a complete record */ msg->msg_flags |= MSG_EOR; - rc = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + rc = skb_copy_datagram_msghdr(skb, msg, copied); if (rc) goto out_free_dgram; ^ permalink raw reply related [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-05 3:27 ` David Miller @ 2014-11-05 3:55 ` Al Viro 2014-11-05 4:12 ` Al Viro 2014-11-05 20:50 ` David Miller 0 siblings, 2 replies; 82+ messages in thread From: Al Viro @ 2014-11-05 3:55 UTC (permalink / raw) To: David Miller; +Cc: herbert, netdev, linux-kernel, bcrl On Tue, Nov 04, 2014 at 10:27:27PM -0500, David Miller wrote: > Al, is this the helper you are talking about? Mostly, except that I kept it 4-argument (and used skb_copy_datagram_msg() for name). Matter of taste - the ones you've missed because of that are net/appletalk/ddp.c:1761: err = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, copied); net/ipv4/tcp.c:1836: err = skb_copy_datagram_iovec(skb, offset, net/ipv4/udp.c:1284: err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), net/ipv6/udp.c:427: err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), net/ipx/af_ipx.c:1808: rc = skb_copy_datagram_iovec(skb, sizeof(struct ipxhdr), msg->msg_iov, net/iucv/af_iucv.c:1358: if (skb_copy_datagram_iovec(cskb, offset, msg->msg_iov, copied)) { net/llc/af_llc.c:822: int rc = skb_copy_datagram_iovec(skb, offset, net/rxrpc/ar-recvmsg.c:183: ret = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, copy); net/tipc/socket.c:1375: res = skb_copy_datagram_iovec(buf, msg_hdr_sz(msg), net/tipc/socket.c:1476: res = skb_copy_datagram_iovec(buf, msg_hdr_sz(msg) + offset, net/unix/af_unix.c:1828: err = skb_copy_datagram_iovec(skb, skip, msg->msg_iov, size); net/unix/af_unix.c:2033: if (skb_copy_datagram_iovec(skb, UNIXCB(skb).consumed + skip, net/vmw_vsock/vmci_transport.c:1776: err = skb_copy_datagram_iovec(skb, sizeof(*dg), msg->msg_iov, and back then I decided that 13 more converted instances might be worth keeping it in 4-argument form... What do you think of the trick with user_msghdr, BTW? ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-05 3:55 ` Al Viro @ 2014-11-05 4:12 ` Al Viro 2014-11-05 20:51 ` David Miller 2014-11-05 20:50 ` David Miller 1 sibling, 1 reply; 82+ messages in thread From: Al Viro @ 2014-11-05 4:12 UTC (permalink / raw) To: David Miller; +Cc: herbert, netdev, linux-kernel, bcrl On Wed, Nov 05, 2014 at 03:55:36AM +0000, Al Viro wrote: > What do you think of the trick with user_msghdr, BTW? PS: where do you prefer the branches to be based off? git://git.kernel.org/pub/scm/linux/kernel/git/davem/net#master, mainline, something else? I can certainly do that as patches over email, the question is what's best used as base... FWIW, the analysis I've posted was in 3.18-rc3 and it looks like it ought to be valid in net#master as well. ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-05 4:12 ` Al Viro @ 2014-11-05 20:51 ` David Miller 0 siblings, 0 replies; 82+ messages in thread From: David Miller @ 2014-11-05 20:51 UTC (permalink / raw) To: viro; +Cc: herbert, netdev, linux-kernel, bcrl From: Al Viro <viro@ZenIV.linux.org.uk> Date: Wed, 5 Nov 2014 04:12:32 +0000 > On Wed, Nov 05, 2014 at 03:55:36AM +0000, Al Viro wrote: >> What do you think of the trick with user_msghdr, BTW? > > PS: where do you prefer the branches to be based off? > git://git.kernel.org/pub/scm/linux/kernel/git/davem/net#master, mainline, > something else? I can certainly do that as patches over email, the > question is what's best used as base... FWIW, the analysis I've posted > was in 3.18-rc3 and it looks like it ought to be valid in net#master > as well. Let's work against net-next, i.e.: git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next#master I can integrate your, my, and Herbert's changes all into the same place. Thanks. ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-05 3:55 ` Al Viro 2014-11-05 4:12 ` Al Viro @ 2014-11-05 20:50 ` David Miller 2014-11-05 21:07 ` Al Viro 1 sibling, 1 reply; 82+ messages in thread From: David Miller @ 2014-11-05 20:50 UTC (permalink / raw) To: viro; +Cc: herbert, netdev, linux-kernel, bcrl From: Al Viro <viro@ZenIV.linux.org.uk> Date: Wed, 5 Nov 2014 03:55:36 +0000 > On Tue, Nov 04, 2014 at 10:27:27PM -0500, David Miller wrote: > >> Al, is this the helper you are talking about? > > Mostly, except that I kept it 4-argument (and used skb_copy_datagram_msg() > for name). Matter of taste - the ones you've missed because of that are ... > and back then I decided that 13 more converted instances might be worth keeping > it in 4-argument form... Ok, fixed up patch below: > What do you think of the trick with user_msghdr, BTW? I think we can get away with it if, as you say, we don't export a 'msghdr' from any uapi headers. And indeed, double checking, it's purely a linux/socket.h thing. If this patch is OK, mind if I toss it into net-next Al? 
diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c index 1be8228..dcbd858 100644 --- a/drivers/isdn/mISDN/socket.c +++ b/drivers/isdn/mISDN/socket.c @@ -163,7 +163,7 @@ mISDN_sock_recvmsg(struct kiocb *iocb, struct socket *sock, memcpy(skb_push(skb, MISDN_HEADER_LEN), mISDN_HEAD_P(skb), MISDN_HEADER_LEN); - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); mISDN_sock_cmsg(sk, msg, skb); diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index 6c9c16d..443cbbf 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -981,7 +981,7 @@ static int pppoe_recvmsg(struct kiocb *iocb, struct socket *sock, if (skb) { total_len = min_t(size_t, total_len, skb->len); - error = skb_copy_datagram_iovec(skb, 0, m->msg_iov, total_len); + error = skb_copy_datagram_msg(skb, 0, m, total_len); if (error == 0) { consume_skb(skb); return total_len; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5ad9675..31cdb7e 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -21,6 +21,7 @@ #include <linux/bug.h> #include <linux/cache.h> #include <linux/rbtree.h> +#include <linux/socket.h> #include <linux/atomic.h> #include <asm/types.h> @@ -2637,6 +2638,11 @@ unsigned int datagram_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int skb_copy_datagram_iovec(const struct sk_buff *from, int offset, struct iovec *to, int size); +static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset, + struct msghdr *msg, int size) +{ + return skb_copy_datagram_iovec(from, offset, msg->msg_iov, size); +} int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, int hlen, struct iovec *iov); int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index c00897f..425942d 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1758,7 +1758,7 @@ static int 
atalk_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr copied = size; msg->msg_flags |= MSG_TRUNC; } - err = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, offset, msg, copied); if (!err && msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_at *, sat, msg->msg_name); diff --git a/net/atm/common.c b/net/atm/common.c index 6a76515..9cd1cca 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -554,7 +554,7 @@ int vcc_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, msg->msg_flags |= MSG_TRUNC; } - error = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + error = skb_copy_datagram_msg(skb, 0, msg, copied); if (error) return error; sock_recv_ts_and_drops(msg, sk, skb); diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index c35c3f4..f4f835e 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1634,7 +1634,7 @@ static int ax25_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_flags |= MSG_TRUNC; } - skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + skb_copy_datagram_msg(skb, 0, msg, copied); if (msg->msg_name) { ax25_digi digi; diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 339c74a..0a7cc56 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -237,7 +237,7 @@ int bt_sock_recvmsg(struct kiocb *iocb, struct socket *sock, } skb_reset_transport_header(skb); - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err == 0) { sock_recv_ts_and_drops(msg, sk, skb); @@ -328,7 +328,7 @@ int bt_sock_stream_recvmsg(struct kiocb *iocb, struct socket *sock, } chunk = min_t(unsigned int, skb->len, size); - if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, chunk)) { + if (skb_copy_datagram_msg(skb, 0, msg, chunk)) { skb_queue_head(&sk->sk_receive_queue, skb); if (!copied) copied = -EFAULT; diff --git a/net/bluetooth/hci_sock.c 
b/net/bluetooth/hci_sock.c index 115f149..29e1ec7 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -878,7 +878,7 @@ static int hci_sock_recvmsg(struct kiocb *iocb, struct socket *sock, } skb_reset_transport_header(skb); - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); switch (hci_pi(sk)->channel) { case HCI_CHANNEL_RAW: diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 43f750e..fbcd156 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -293,7 +293,7 @@ static int caif_seqpkt_recvmsg(struct kiocb *iocb, struct socket *sock, copylen = len; } - ret = skb_copy_datagram_iovec(skb, 0, m->msg_iov, copylen); + ret = skb_copy_datagram_msg(skb, 0, m, copylen); if (ret) goto out_free; diff --git a/net/core/sock.c b/net/core/sock.c index 15e0c67..ac56dd0 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -2457,7 +2457,7 @@ int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, msg->msg_flags |= MSG_TRUNC; copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free_skb; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 5ab6627..8e6ae94 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -896,7 +896,7 @@ verify_sock_status: else if (len < skb->len) msg->msg_flags |= MSG_TRUNC; - if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len)) { + if (skb_copy_datagram_msg(skb, 0, msg, len)) { /* Exception. Bailout! */ len = -EFAULT; break; diff --git a/net/ieee802154/dgram.c b/net/ieee802154/dgram.c index ef2ad8a..fc9193e 100644 --- a/net/ieee802154/dgram.c +++ b/net/ieee802154/dgram.c @@ -324,7 +324,7 @@ static int dgram_recvmsg(struct kiocb *iocb, struct sock *sk, } /* FIXME: skip headers if necessary ?! 
*/ - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; diff --git a/net/ieee802154/raw.c b/net/ieee802154/raw.c index 9d1f648..73a4d53 100644 --- a/net/ieee802154/raw.c +++ b/net/ieee802154/raw.c @@ -195,7 +195,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index c373a9a..21894df 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -424,7 +424,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) msg->msg_flags |= MSG_TRUNC; copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free_skb; diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 57f7c98..736236c 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -875,7 +875,7 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } /* Don't bother checking the checksum */ - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 739db31..ee8fa4b 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -718,7 +718,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 39ec0c3..c239f47 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1377,7 +1377,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len) /* XXX -- need to support SO_PEEK_OFF */ 
skb_queue_walk(&sk->sk_write_queue, skb) { - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len); + err = skb_copy_datagram_msg(skb, 0, msg, skb->len); if (err) break; @@ -1833,8 +1833,7 @@ do_prequeue: } if (!(flags & MSG_TRUNC)) { - err = skb_copy_datagram_iovec(skb, offset, - msg->msg_iov, used); + err = skb_copy_datagram_msg(skb, offset, msg, used); if (err) { /* Exception. Bailout! */ if (!copied) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 3f001db..df19027 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1281,8 +1281,8 @@ try_again: } if (skb_csum_unnecessary(skb)) - err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), - msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, sizeof(struct udphdr), + msg, copied); else { err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index 2cdc383..5c6996e 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -351,7 +351,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) msg->msg_flags |= MSG_TRUNC; copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free_skb; @@ -445,7 +445,7 @@ int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len, msg->msg_flags |= MSG_TRUNC; copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free_skb; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 075a0fb..0cbcf98 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -486,11 +486,11 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk, } if (skb_csum_unnecessary(skb)) { - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); } else if (msg->msg_flags&MSG_TRUNC) { if (__skb_checksum_complete(skb)) goto csum_copy_err; - err = 
skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); } else { err = skb_copy_and_csum_datagram_iovec(skb, 0, msg->msg_iov); if (err == -EINVAL) diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index f6ba535..9b68092 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -424,8 +424,8 @@ try_again: } if (skb_csum_unnecessary(skb)) - err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr), - msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, sizeof(struct udphdr), + msg, copied); else { err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov); if (err == -EINVAL) diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 313ef46..a0c7536 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -1805,8 +1805,7 @@ static int ipx_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_flags |= MSG_TRUNC; } - rc = skb_copy_datagram_iovec(skb, sizeof(struct ipxhdr), msg->msg_iov, - copied); + rc = skb_copy_datagram_msg(skb, sizeof(struct ipxhdr), msg, copied); if (rc) goto out_free; if (skb->tstamp.tv64) diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 92fafd4..980bc26 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -1396,7 +1396,7 @@ static int irda_recvmsg_dgram(struct kiocb *iocb, struct socket *sock, copied = size; msg->msg_flags |= MSG_TRUNC; } - skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + skb_copy_datagram_msg(skb, 0, msg, copied); skb_free_datagram(sk, skb); diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index a089b6b..057b564 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1355,7 +1355,7 @@ static int iucv_sock_recvmsg(struct kiocb *iocb, struct socket *sock, sk->sk_shutdown = sk->sk_shutdown | RCV_SHUTDOWN; cskb = skb; - if (skb_copy_datagram_iovec(cskb, offset, msg->msg_iov, copied)) { + if (skb_copy_datagram_msg(cskb, offset, msg, copied)) { if (!(flags & MSG_PEEK)) skb_queue_head(&sk->sk_receive_queue, skb); return -EFAULT; diff 
--git a/net/key/af_key.c b/net/key/af_key.c index 1847ec4..e588309 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -3654,7 +3654,7 @@ static int pfkey_recvmsg(struct kiocb *kiocb, } skb_reset_transport_header(skb); - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free; diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 369a982..a6cc1fe 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -528,7 +528,7 @@ static int l2tp_ip_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 0edb263..2177b96 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -672,7 +672,7 @@ static int l2tp_ip6_recvmsg(struct kiocb *iocb, struct sock *sk, copied = len; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto done; diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index b704a93..c559bcd 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -208,7 +208,7 @@ static int pppol2tp_recvmsg(struct kiocb *iocb, struct socket *sock, else if (len < skb->len) msg->msg_flags |= MSG_TRUNC; - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len); + err = skb_copy_datagram_msg(skb, 0, msg, len); if (likely(err == 0)) err = len; diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index bb9cbc1..af66266 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -819,8 +819,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, used = len; if (!(flags & MSG_TRUNC)) { - int rc = skb_copy_datagram_iovec(skb, offset, - msg->msg_iov, used); + int rc = skb_copy_datagram_msg(skb, offset, msg, used); if (rc) { /* Exception. Bailout! 
*/ if (!copied) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index f1de72d..580b794 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -2401,7 +2401,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock, } skb_reset_transport_header(data_skb); - err = skb_copy_datagram_iovec(data_skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(data_skb, 0, msg, copied); if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name); diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 1b06a1f..7e13f6a 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -1167,7 +1167,7 @@ static int nr_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_flags |= MSG_TRUNC; } - er = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + er = skb_copy_datagram_msg(skb, 0, msg, copied); if (er < 0) { skb_free_datagram(sk, skb); release_sock(sk); diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 51f077a..83bc785 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -832,7 +832,7 @@ static int llcp_sock_recvmsg(struct kiocb *iocb, struct socket *sock, copied = min_t(unsigned int, rlen, len); cskb = skb; - if (skb_copy_datagram_iovec(cskb, 0, msg->msg_iov, copied)) { + if (skb_copy_datagram_msg(cskb, 0, msg, copied)) { if (!(flags & MSG_PEEK)) skb_queue_head(&sk->sk_receive_queue, skb); return -EFAULT; diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c index 11c3544..9d7d2b7 100644 --- a/net/nfc/rawsock.c +++ b/net/nfc/rawsock.c @@ -269,7 +269,7 @@ static int rawsock_recvmsg(struct kiocb *iocb, struct socket *sock, copied = len; } - rc = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + rc = skb_copy_datagram_msg(skb, 0, msg, copied); skb_free_datagram(sk, skb); diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 87d20f4..4cd13d8 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2953,7 +2953,7 @@ static int 
packet_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_flags |= MSG_TRUNC; } - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); if (err) goto out_free; diff --git a/net/phonet/datagram.c b/net/phonet/datagram.c index 290352c..0918bc2 100644 --- a/net/phonet/datagram.c +++ b/net/phonet/datagram.c @@ -150,7 +150,7 @@ static int pn_recvmsg(struct kiocb *iocb, struct sock *sk, copylen = len; } - rval = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copylen); + rval = skb_copy_datagram_msg(skb, 0, msg, copylen); if (rval) { rval = -EFAULT; goto out; diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 70a547e..44b2123 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -1296,7 +1296,7 @@ copy: else len = skb->len; - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len); + err = skb_copy_datagram_msg(skb, 0, msg, len); if (!err) err = (flags & MSG_TRUNC) ? skb->len : len; diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index a85c1a0..9b600c2 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -1249,7 +1249,7 @@ static int rose_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_flags |= MSG_TRUNC; } - skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + skb_copy_datagram_msg(skb, 0, msg, copied); if (msg->msg_name) { struct sockaddr_rose *srose; diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c index e9aaa65..4575485 100644 --- a/net/rxrpc/ar-recvmsg.c +++ b/net/rxrpc/ar-recvmsg.c @@ -180,7 +180,7 @@ int rxrpc_recvmsg(struct kiocb *iocb, struct socket *sock, if (copy > len - copied) copy = len - copied; - ret = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, copy); + ret = skb_copy_datagram_msg(skb, offset, msg, copy); if (ret < 0) goto copy_error; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 634a2ab..2120292 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2095,7 +2095,7 @@ static int sctp_recvmsg(struct kiocb *iocb, struct sock 
*sk, if (copied > len) copied = len; - err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + err = skb_copy_datagram_msg(skb, 0, msg, copied); event = sctp_skb2event(skb); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index ad8a1a1..591bbfa 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1372,8 +1372,7 @@ restart: sz = buf_len; m->msg_flags |= MSG_TRUNC; } - res = skb_copy_datagram_iovec(buf, msg_hdr_sz(msg), - m->msg_iov, sz); + res = skb_copy_datagram_msg(buf, msg_hdr_sz(msg), m, sz); if (res) goto exit; res = sz; @@ -1473,8 +1472,8 @@ restart: needed = (buf_len - sz_copied); sz_to_copy = (sz <= needed) ? sz : needed; - res = skb_copy_datagram_iovec(buf, msg_hdr_sz(msg) + offset, - m->msg_iov, sz_to_copy); + res = skb_copy_datagram_msg(buf, msg_hdr_sz(msg) + offset, + m, sz_to_copy); if (res) goto exit; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e968843..5eee625 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1825,7 +1825,7 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, else if (size < skb->len - skip) msg->msg_flags |= MSG_TRUNC; - err = skb_copy_datagram_iovec(skb, skip, msg->msg_iov, size); + err = skb_copy_datagram_msg(skb, skip, msg, size); if (err) goto out_free; @@ -2030,8 +2030,8 @@ again: } chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size); - if (skb_copy_datagram_iovec(skb, UNIXCB(skb).consumed + skip, - msg->msg_iov, chunk)) { + if (skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip, + msg, chunk)) { if (copied == 0) copied = -EFAULT; break; diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 9bb63ff..a57ddef 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -1773,8 +1773,7 @@ static int vmci_transport_dgram_dequeue(struct kiocb *kiocb, } /* Place the datagram payload in the user's iovec. 
*/ - err = skb_copy_datagram_iovec(skb, sizeof(*dg), msg->msg_iov, - payload_len); + err = skb_copy_datagram_msg(skb, sizeof(*dg), msg, payload_len); if (err) goto out; diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 5ad4418..59e785b 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -1335,7 +1335,7 @@ static int x25_recvmsg(struct kiocb *iocb, struct socket *sock, /* Currently, each datagram always contains a complete record */ msg->msg_flags |= MSG_EOR; - rc = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); + rc = skb_copy_datagram_msg(skb, 0, msg, copied); if (rc) goto out_free_dgram; ^ permalink raw reply related [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-05 20:50 ` David Miller @ 2014-11-05 21:07 ` Al Viro 2014-11-05 21:57 ` David Miller 0 siblings, 1 reply; 82+ messages in thread From: Al Viro @ 2014-11-05 21:07 UTC (permalink / raw) To: David Miller; +Cc: herbert, netdev, linux-kernel, bcrl On Wed, Nov 05, 2014 at 03:50:54PM -0500, David Miller wrote: > I think we can get away with it if, as you say, we don't export a 'msghdr' > from any uapi headers. OK. I'm about halfway through the review of struct msghdr instances in the current tree right now, will post user_msghdr patch once I'm done. Already found a dumb bug in o2net_send_tcp_msg() while doing that - broken by me back in 3.15 ;-/ Will send a fix to Linus in an hour or so... > And indeed, double checking, it's purely a linux/socket.h thing. > > If this patch is OK, mind if I toss it into net-next Al? Sure, no problem - AFAICS, the only real difference from rebase of April one I've quoted upthread is that you add include of socket.h into skbuff.h; the rest of the differences is pure whitespace noise. Ping me when you put it there, OK? I'll rebase the rest of old stuff on top of it (similar helpers, mostly). ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-05 21:07 ` Al Viro @ 2014-11-05 21:57 ` David Miller 2014-11-06 3:25 ` Al Viro 0 siblings, 1 reply; 82+ messages in thread From: David Miller @ 2014-11-05 21:57 UTC (permalink / raw) To: viro; +Cc: herbert, netdev, linux-kernel, bcrl From: Al Viro <viro@ZenIV.linux.org.uk> Date: Wed, 5 Nov 2014 21:07:45 +0000 > Ping me when you put it there, OK? I'll rebase the rest of old stuff on > top of it (similar helpers, mostly). I just pushed it into net-next, thanks Al. ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-05 21:57 ` David Miller @ 2014-11-06 3:25 ` Al Viro 2014-11-06 5:50 ` ipv4: Use standard iovec primitive in raw_probe_proto_opt Herbert Xu ` (3 more replies) 0 siblings, 4 replies; 82+ messages in thread From: Al Viro @ 2014-11-06 3:25 UTC (permalink / raw) To: David Miller; +Cc: herbert, netdev, linux-kernel, bcrl On Wed, Nov 05, 2014 at 04:57:19PM -0500, David Miller wrote: > From: Al Viro <viro@ZenIV.linux.org.uk> > Date: Wed, 5 Nov 2014 21:07:45 +0000 > > > Ping me when you put it there, OK? I'll rebase the rest of old stuff on > > top of it (similar helpers, mostly). > > I just pushed it into net-next, thanks Al. OK, I've taken the beginning of the old queue on top of net-next; it's in git://git.kernel.org//pub/scm/linux/kernel/git/viro/vfs.git iov_iter-net. From a quick look at the remaining ->msg_iov users: * I'll need to add several iov_iter primitives - counterparts of checksum.h stuff (copy_and_csum_{from,to}_iter(), maybe some more). Not a big deal, I'll do that tomorrow. That will give us a clean iov_iter-based counterpart of skb_copy_and_csum_datagram_iovec(). * a new helper: zerocopy_sg_from_iter(). I have it, actually, but I'd rather not step on Herbert's toes - it's too close to the areas his series will touch, so that's probably for when his series goes in. It will be needed for complete macvtap conversion... * why doesn't verify_iovec() use rw_copy_check_uvector()? The only real differences I see are that (a) you do allocation in callers (same as rw_copy_check_uvector() would've done), (b) you return EMSGSIZE in case of too long vector, while rw_copy_check_uvector() returns EINVAL in that case and (c) you don't do access_ok(). The last one is described as optimization, but for iov_iter primitives it's a serious PITA - for iovec-backed instances they are using __copy_from_user()/__copy_to_user(), etc. 
It certainly would be nice to have the same code doing all copying of iovecs from userland - readv/writev/aio/sendmsg/recvmsg/etc. Am I missing some subtle semantic difference in there? EMSGSIZE vs EINVAL is trivial (we can lift that check into the callers, if nothing else), but I could miss something more interesting... * various getfrag will need to grow iov_iter-based counterparts, but ip_append_output() needs no changes, AFAICS. * crypto stuff will be easy to convert - iov_iter_get_pages() would suffice for a primitive * there's some really weird stuff in there. Just what is this static int raw_probe_proto_opt(struct flowi4 *fl4, struct msghdr *msg) { struct iovec *iov; u8 __user *type = NULL; u8 __user *code = NULL; int probed = 0; unsigned int i; if (!msg->msg_iov) return 0; for (i = 0; i < msg->msg_iovlen; i++) { iov = &msg->msg_iov[i]; if (!iov) continue; trying to do? "If non-NULL pointer + i somehow happened to be NULL, skip it and try to use the same pointer + i + 1"? Huh? Had been that way since the function first went in back in 2004 ("[IPV4] XFRM: probe icmp type/code when sending packets via raw socket.", according to historical tree)... * rds, bluetooth and vsock are doing something odd; need to RTFS some more. * not sure I understand what TIPC is doing - does it prohibit too short first segment of ->msg_iov? net/tipc/socket.c:dest_name_check() looks odd _and_ potentially racy - we read the same data twice and hope our checks still apply. I asked TIPC folks about that race back in April, but it looks like that fell through the cracks... Overall, so far it looks more or less feasible - other than the missing csum primitives, current mm/iov_iter.c should suffice. I have _not_ seriously looked into sendpage yet; that might very well require some more. ^ permalink raw reply [flat|nested] 82+ messages in thread
* ipv4: Use standard iovec primitive in raw_probe_proto_opt 2014-11-06 3:25 ` Al Viro @ 2014-11-06 5:50 ` Herbert Xu 2014-11-06 6:43 ` Al Viro 2014-11-06 9:50 ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Jon Maloy ` (2 subsequent siblings) 3 siblings, 1 reply; 82+ messages in thread From: Herbert Xu @ 2014-11-06 5:50 UTC (permalink / raw) To: Al Viro Cc: David Miller, netdev, linux-kernel, bcrl, Masahide Nakamura, Hideaki YOSHIFUJI On Thu, Nov 06, 2014 at 03:25:34AM +0000, Al Viro wrote: > > * there's some really weird stuff in there. Just what is this > static int raw_probe_proto_opt(struct flowi4 *fl4, struct msghdr *msg) > { It looks like newbie coding that's all. There's nothing tricky here as far as I can tell. We're just trying to fetch the ICMP header to seed the IPsec lookup. So how about this rewrite? I'm assuming that you're not going to get rid of memcpy_fromiovecend/memcpy_toiovecend, if you are, let me know and I'll redo this with iterators. ipv4: Use standard iovec primitive in raw_probe_proto_opt The function raw_probe_proto_opt tries to extract the first two bytes from the user input in order to seed the IPsec lookup for ICMP packets. In doing so it's processing iovec by hand and overcomplicating things. This patch replaces the manual iovec processing with a call to memcpy_fromiovecend. 
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 739db31..04f67e1 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -422,48 +422,20 @@ error: static int raw_probe_proto_opt(struct flowi4 *fl4, struct msghdr *msg) { - struct iovec *iov; - u8 __user *type = NULL; - u8 __user *code = NULL; - int probed = 0; - unsigned int i; + struct icmphdr icmph; + int err; - if (!msg->msg_iov) + if (fl4->flowi4_proto != IPPROTO_ICMP) return 0; - for (i = 0; i < msg->msg_iovlen; i++) { - iov = &msg->msg_iov[i]; - if (!iov) - continue; - - switch (fl4->flowi4_proto) { - case IPPROTO_ICMP: - /* check if one-byte field is readable or not. */ - if (iov->iov_base && iov->iov_len < 1) - break; - - if (!type) { - type = iov->iov_base; - /* check if code field is readable or not. */ - if (iov->iov_len > 1) - code = type + 1; - } else if (!code) - code = iov->iov_base; - - if (type && code) { - if (get_user(fl4->fl4_icmp_type, type) || - get_user(fl4->fl4_icmp_code, code)) - return -EFAULT; - probed = 1; - } - break; - default: - probed = 1; - break; - } - if (probed) - break; - } + /* We only need the first two bytes. */ + err = memcpy_fromiovecend((void *)&icmph, msg->msg_iov, 0, 2); + if (err) + return err; + + fl4->fl4_icmp_type = icmph.type; + fl4->fl4_icmp_code = icmph.code; + return 0; } Cheers, -- Email: Herbert Xu <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply related [flat|nested] 82+ messages in thread
* Re: ipv4: Use standard iovec primitive in raw_probe_proto_opt 2014-11-06 5:50 ` ipv4: Use standard iovec primitive in raw_probe_proto_opt Herbert Xu @ 2014-11-06 6:43 ` Al Viro 2014-11-06 6:46 ` Herbert Xu 0 siblings, 1 reply; 82+ messages in thread From: Al Viro @ 2014-11-06 6:43 UTC (permalink / raw) To: Herbert Xu Cc: David Miller, netdev, linux-kernel, bcrl, Masahide Nakamura, Hideaki YOSHIFUJI On Thu, Nov 06, 2014 at 01:50:23PM +0800, Herbert Xu wrote: > + /* We only need the first two bytes. */ > + err = memcpy_fromiovecend((void *)&icmph, msg->msg_iov, 0, 2); > + if (err) > + return err; > + > + fl4->fl4_icmp_type = icmph.type; > + fl4->fl4_icmp_code = icmph.code; That's more readable, but that exposes another problem in there - we read the same piece of userland data twice, with no promise whatsoever that we'll get the same value both times... ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: ipv4: Use standard iovec primitive in raw_probe_proto_opt 2014-11-06 6:43 ` Al Viro @ 2014-11-06 6:46 ` Herbert Xu 2014-11-06 7:11 ` Al Viro 2014-11-06 21:28 ` David Miller 0 siblings, 2 replies; 82+ messages in thread From: Herbert Xu @ 2014-11-06 6:46 UTC (permalink / raw) To: Al Viro Cc: David Miller, netdev, linux-kernel, bcrl, Masahide Nakamura, Hideaki YOSHIFUJI On Thu, Nov 06, 2014 at 06:43:18AM +0000, Al Viro wrote: > On Thu, Nov 06, 2014 at 01:50:23PM +0800, Herbert Xu wrote: > > + /* We only need the first two bytes. */ > > + err = memcpy_fromiovecend((void *)&icmph, msg->msg_iov, 0, 2); > > + if (err) > > + return err; > > + > > + fl4->fl4_icmp_type = icmph.type; > > + fl4->fl4_icmp_code = icmph.code; > > That's more readable, but that exposes another problem in there - we read > the same piece of userland data twice, with no promise whatsoever that we'll > get the same value both times... Sure, but you have to be root anyway to write to raw sockets. Patches are welcome :) Cheers, -- Email: Herbert Xu <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: ipv4: Use standard iovec primitive in raw_probe_proto_opt 2014-11-06 6:46 ` Herbert Xu @ 2014-11-06 7:11 ` Al Viro 2014-11-06 9:55 ` Jon Maloy 2014-11-06 21:28 ` David Miller 1 sibling, 1 reply; 82+ messages in thread From: Al Viro @ 2014-11-06 7:11 UTC (permalink / raw) To: Herbert Xu Cc: David Miller, netdev, linux-kernel, bcrl, Masahide Nakamura, Hideaki YOSHIFUJI On Thu, Nov 06, 2014 at 02:46:29PM +0800, Herbert Xu wrote: > On Thu, Nov 06, 2014 at 06:43:18AM +0000, Al Viro wrote: > > On Thu, Nov 06, 2014 at 01:50:23PM +0800, Herbert Xu wrote: > > > + /* We only need the first two bytes. */ > > > + err = memcpy_fromiovecend((void *)&icmph, msg->msg_iov, 0, 2); > > > + if (err) > > > + return err; > > > + > > > + fl4->fl4_icmp_type = icmph.type; > > > + fl4->fl4_icmp_code = icmph.code; > > > > That's more readable, but that exposes another problem in there - we read > > the same piece of userland data twice, with no promise whatsoever that we'll > > get the same value both times... > > Sure, but you have to be root anyway to write to raw sockets. Point, but that might very well be a pattern to watch for - there's at least one more instance in TIPC (also not exploitable, according to TIPC folks) and such bugs are easily repeated... BTW, I've picked the tun and macvtap related bits from another part of old queue; see vfs.git#untested-macvtap - it's on top of #iov_iter-net and it's really completely untested. Back then I was mostly interested in killing as many ->aio_write() instances as I could, so it's only the "send" side of things. ^ permalink raw reply [flat|nested] 82+ messages in thread
* RE: ipv4: Use standard iovec primitive in raw_probe_proto_opt 2014-11-06 7:11 ` Al Viro @ 2014-11-06 9:55 ` Jon Maloy 2014-11-06 22:16 ` Al Viro 0 siblings, 1 reply; 82+ messages in thread From: Jon Maloy @ 2014-11-06 9:55 UTC (permalink / raw) To: Al Viro, Herbert Xu Cc: David Miller, netdev, linux-kernel, bcrl, Masahide Nakamura, Hideaki YOSHIFUJI > -----Original Message----- > From: netdev-owner@vger.kernel.org [mailto:netdev- > owner@vger.kernel.org] On Behalf Of Al Viro > Sent: November-06-14 8:11 AM > To: Herbert Xu > Cc: David Miller; netdev@vger.kernel.org; linux-kernel@vger.kernel.org; > bcrl@kvack.org; Masahide Nakamura; Hideaki YOSHIFUJI > Subject: Re: ipv4: Use standard iovec primitive in raw_probe_proto_opt > > On Thu, Nov 06, 2014 at 02:46:29PM +0800, Herbert Xu wrote: > > On Thu, Nov 06, 2014 at 06:43:18AM +0000, Al Viro wrote: > > > On Thu, Nov 06, 2014 at 01:50:23PM +0800, Herbert Xu wrote: > > > > + /* We only need the first two bytes. */ > > > > + err = memcpy_fromiovecend((void *)&icmph, msg->msg_iov, 0, 2); > > > > + if (err) > > > > + return err; > > > > + > > > > + fl4->fl4_icmp_type = icmph.type; > > > > + fl4->fl4_icmp_code = icmph.code; > > > > > > That's more readable, but that exposes another problem in there - we > > > read the same piece of userland data twice, with no promise > > > whatsoever that we'll get the same value both times... > > > > Sure, but you have to be root anyway to write to raw sockets. > > Point, but that might very well be a pattern to watch for - there's at least one > more instance in TIPC (also not exploitable, according to TIPC folks) and such I don't recall this, and I can't see where it would be either. Can you please point to where it is? ///jon > bugs are easily repeated... > > BTW, I've picked the tun and macvtap related bits from another part of old > queue; see vfs.git#untested-macvtap - it's on top of #iov_iter-net and it's > really completely untested. 
Back then I was mostly interested in killing as > many ->aio_write() instances as I could, so it's only the "send" side of things. > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in the body > of a message to majordomo@vger.kernel.org More majordomo info at > http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: ipv4: Use standard iovec primitive in raw_probe_proto_opt 2014-11-06 9:55 ` Jon Maloy @ 2014-11-06 22:16 ` Al Viro 2014-11-28 5:14 ` Al Viro 0 siblings, 1 reply; 82+ messages in thread From: Al Viro @ 2014-11-06 22:16 UTC (permalink / raw) To: Jon Maloy Cc: Herbert Xu, David Miller, netdev, linux-kernel, bcrl, Masahide Nakamura, Hideaki YOSHIFUJI On Thu, Nov 06, 2014 at 09:55:31AM +0000, Jon Maloy wrote: > > Point, but that might very well be a pattern to watch for - there's at least one > > more instance in TIPC (also not exploitable, according to TIPC folks) and such > > I don't recall this, and I can't see where it would be either. Can you please > point to where it is? The same dest_name_check() thing. This if (copy_from_user(&hdr, m->msg_iov[0].iov_base, sizeof(hdr))) return -EFAULT; if ((ntohs(hdr.tcm_type) & 0xC000) && (!capable(CAP_NET_ADMIN))) return -EACCES; is easily bypassed. Suppose you want to send a packet with these two bits in ->tcm_type not being 00, and you don't have CAP_NET_ADMIN. Not a problem - spawn two threads sharing memory, have one trying to call sendmsg() while another keeps flipping these two bits. Sooner or later you'll get the timing right and have these bits observed as 00 in dest_name_check() and 11 when it comes to memcpy_fromiovecend() actually copying the whole thing. And considering that the interval between those two is much longer than the loop in the second thread would take on each iteration, I'd expect the odds around 25% per attempted sendmsg(). IOW, this test is either pointless and can be removed completely, or there's an exploitable race. As far as I understand from your replies both back then and in another branch of this thread, it's the former and the proper fix is to remove at least that part of dest_name_check(). So this case is also not something exploitable, but it certainly matches the same pattern. 
My point was simply that this pattern is worth watching for - recurrent bug classes like that have a good chance to spawn an instance that will be exploitable. ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: ipv4: Use standard iovec primitive in raw_probe_proto_opt 2014-11-06 22:16 ` Al Viro @ 2014-11-28 5:14 ` Al Viro 0 siblings, 0 replies; 82+ messages in thread From: Al Viro @ 2014-11-28 5:14 UTC (permalink / raw) To: Jon Maloy Cc: Herbert Xu, David Miller, netdev, linux-kernel, bcrl, Masahide Nakamura, Hideaki YOSHIFUJI On Thu, Nov 06, 2014 at 10:16:08PM +0000, Al Viro wrote: > On Thu, Nov 06, 2014 at 09:55:31AM +0000, Jon Maloy wrote: > > > Point, but that might very well be a pattern to watch for - there's at least one > > > more instance in TIPC (also not exploitable, according to TIPC folks) and such > > > > I don't recall this, and I can't see where it would be either. Can you please > > point to where it is? > > The same dest_name_check() thing. This > if (copy_from_user(&hdr, m->msg_iov[0].iov_base, sizeof(hdr))) > return -EFAULT; > if ((ntohs(hdr.tcm_type) & 0xC000) && (!capable(CAP_NET_ADMIN))) > return -EACCES; > is easily bypassed. Suppose you want to send a packet with these two > bits in ->tcm_type not being 00, and you don't have CAP_NET_ADMIN. > Not a problem - spawn two threads sharing memory, have one trying to > call sendmsg() while another keeps flipping these two bits. Sooner > of later you'll get the timing right and have these bits observed as 00 > in dest_name_check() and 11 when it comes to memcpy_fromiovecend() actually > copying the whole thing. And considering that the interval between those > two is much longer than the loop in the second thread would take on > each iteration, I'd expect the odds around 25% per attempted sendmsg(). > > IOW, this test is either pointless and can be removed completely, or there's > an exploitable race. As far as I understand from your replies both back then > and in another branch of this thread, it's the former and the proper fix is > to remove at least that part of dest_name_check(). So this case is also > not something exploitable, but it certainly matches the same pattern. 
> > My point was simply that this pattern is worth watching for - recurrent bug > classes like that have a good chance to spawn an instance that will be > exploitable. Ping? Can we simply remove dest_name_check() completely? That's one of the few remaining obstacles to making ->sendmsg() iov_iter-clean. For now I'm simply commenting its call out in tipc_sendmsg(); if it _is_ needed for anything, we'll need to get rid of that double copying from userland. I can do that, but my impression from your comments back in April is that you planned to removed the damn check anyway. Another question: in tipc_send_stream() we have mtu = tsk->max_pkt; send = min_t(uint, dsz - sent, TIPC_MAX_USER_MSG_SIZE); __skb_queue_head_init(&head); rc = tipc_msg_build(mhdr, m, sent, send, mtu, &head); if (unlikely(rc < 0)) goto exit; do { if (likely(!tsk_conn_cong(tsk))) { rc = tipc_link_xmit(&head, dnode, ref); if (likely(!rc)) { tsk->sent_unacked++; sent += send; if (sent == dsz) break; goto next; } if (rc == -EMSGSIZE) { tsk->max_pkt = tipc_node_get_mtu(dnode, ref); goto next; } How can it hit that EMSGSIZE? AFAICS, it can come only from int __tipc_link_xmit(struct tipc_link *link, struct sk_buff_head *list) { struct tipc_msg *msg = buf_msg(skb_peek(list)); uint psz = msg_size(msg); ... uint mtu = link->max_pkt; ... /* Has valid packet limit been used ? */ if (unlikely(psz > mtu)) { __skb_queue_purge(list); return -EMSGSIZE; } and msg_size() is basically the bits copied into skb by tipc_msg_build() and set by msg_set_size() in there. And unless I'm seriously misreading that function, it can't be more than pktmax argument, i.e. mtu. So unless something manages to crap into our skb or change mtu right under us, it shouldn't be possible. And mtu (i.e. ->max_pkt) ought to be protected by lock_sock() there. What's going on there? ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: ipv4: Use standard iovec primitive in raw_probe_proto_opt 2014-11-06 6:46 ` Herbert Xu 2014-11-06 7:11 ` Al Viro @ 2014-11-06 21:28 ` David Miller 2014-11-07 2:00 ` Herbert Xu 1 sibling, 1 reply; 82+ messages in thread From: David Miller @ 2014-11-06 21:28 UTC (permalink / raw) To: herbert; +Cc: viro, netdev, linux-kernel, bcrl, nakam, yoshfuji From: Herbert Xu <herbert@gondor.apana.org.au> Date: Thu, 6 Nov 2014 14:46:29 +0800 > On Thu, Nov 06, 2014 at 06:43:18AM +0000, Al Viro wrote: >> On Thu, Nov 06, 2014 at 01:50:23PM +0800, Herbert Xu wrote: >> > + /* We only need the first two bytes. */ >> > + err = memcpy_fromiovecend((void *)&icmph, msg->msg_iov, 0, 2); >> > + if (err) >> > + return err; >> > + >> > + fl4->fl4_icmp_type = icmph.type; >> > + fl4->fl4_icmp_code = icmph.code; >> >> That's more readable, but that exposes another problem in there - we read >> the same piece of userland data twice, with no promise whatsoever that we'll >> get the same value both times... > > Sure, but you have to be root anyway to write to raw sockets. > > Patches are welcome :) I'd agree with this root-only argument maybe 15 years ago, but with containers and stuff like that we want to prevent root X from messing up the machine for root Y. This is a recurring topic, and I'd strongly like to avoid adding new ways that these kinds of problems can happen. For example, I'm still on the hook to address the AF_NETLINK mmap TX code, which has a similarly abusable issue. ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: ipv4: Use standard iovec primitive in raw_probe_proto_opt 2014-11-06 21:28 ` David Miller @ 2014-11-07 2:00 ` Herbert Xu 2014-11-07 13:25 ` [PATCH 0/2] ipv4: Simplify raw_probe_proto_opt and avoid reading user iov twice Herbert Xu 0 siblings, 1 reply; 82+ messages in thread From: Herbert Xu @ 2014-11-07 2:00 UTC (permalink / raw) To: David Miller; +Cc: viro, netdev, linux-kernel, bcrl, nakam, yoshfuji On Thu, Nov 06, 2014 at 04:28:08PM -0500, David Miller wrote: > From: Herbert Xu <herbert@gondor.apana.org.au> > Date: Thu, 6 Nov 2014 14:46:29 +0800 > > > On Thu, Nov 06, 2014 at 06:43:18AM +0000, Al Viro wrote: > >> On Thu, Nov 06, 2014 at 01:50:23PM +0800, Herbert Xu wrote: > >> > + /* We only need the first two bytes. */ > >> > + err = memcpy_fromiovecend((void *)&icmph, msg->msg_iov, 0, 2); > >> > + if (err) > >> > + return err; > >> > + > >> > + fl4->fl4_icmp_type = icmph.type; > >> > + fl4->fl4_icmp_code = icmph.code; > >> > >> That's more readable, but that exposes another problem in there - we read > >> the same piece of userland data twice, with no promise whatsoever that we'll > >> get the same value both times... > > > > Sure, but you have to be root anyway to write to raw sockets. > > > > Patches are welcome :) > > I'd agree with this root-only argument maybe 15 years ago, but with > containers and stuff like that we want to prevent root X from messing > up the machine for root Y. > > This is a recurring topic, and I'd strongly like to avoid adding new > ways that these kinds of problems can happen. > > For example, I'm still on the hook to address the AF_NETLINK mmap TX > code, which has a similarly abusable issue. Fair enough. Even though the bug existed prior to my patch I'll see if we could get rid of it. Cheers, -- Email: Herbert Xu <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 82+ messages in thread
* [PATCH 0/2] ipv4: Simplify raw_probe_proto_opt and avoid reading user iov twice 2014-11-07 2:00 ` Herbert Xu @ 2014-11-07 13:25 ` Herbert Xu 2014-11-07 13:27 ` [PATCH 1/2] ipv4: Use standard iovec primitive in raw_probe_proto_opt Herbert Xu ` (2 more replies) 0 siblings, 3 replies; 82+ messages in thread From: Herbert Xu @ 2014-11-07 13:25 UTC (permalink / raw) To: David Miller; +Cc: viro, netdev, linux-kernel, bcrl, nakam, yoshfuji Hi Dave: This series rewrites the function raw_probe_proto_opt in a more readable fashion, and then fixes the long-standing bug where we read the probed bytes twice which means that what we're using to probe may in fact be invalid. Cheers, -- Email: Herbert Xu <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 82+ messages in thread
* [PATCH 1/2] ipv4: Use standard iovec primitive in raw_probe_proto_opt 2014-11-07 13:25 ` [PATCH 0/2] ipv4: Simplify raw_probe_proto_opt and avoid reading user iov twice Herbert Xu @ 2014-11-07 13:27 ` Herbert Xu 2014-11-07 13:27 ` [PATCH 2/2] ipv4: Avoid reading user iov twice after raw_probe_proto_opt Herbert Xu 2014-11-10 19:26 ` [PATCH 0/2] ipv4: Simplify raw_probe_proto_opt and avoid reading user iov twice David Miller 2 siblings, 0 replies; 82+ messages in thread From: Herbert Xu @ 2014-11-07 13:27 UTC (permalink / raw) To: David Miller, viro, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki, nakam The function raw_probe_proto_opt tries to extract the first two bytes from the user input in order to seed the IPsec lookup for ICMP packets. In doing so it's processing iovec by hand and overcomplicating things. This patch replaces the manual iovec processing with a call to memcpy_fromiovecend. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> --- net/ipv4/raw.c | 50 +++++++++++--------------------------------------- 1 file changed, 11 insertions(+), 39 deletions(-) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index ee8fa4b..9be9050 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -422,48 +422,20 @@ error: static int raw_probe_proto_opt(struct flowi4 *fl4, struct msghdr *msg) { - struct iovec *iov; - u8 __user *type = NULL; - u8 __user *code = NULL; - int probed = 0; - unsigned int i; + struct icmphdr icmph; + int err; - if (!msg->msg_iov) + if (fl4->flowi4_proto != IPPROTO_ICMP) return 0; - for (i = 0; i < msg->msg_iovlen; i++) { - iov = &msg->msg_iov[i]; - if (!iov) - continue; - - switch (fl4->flowi4_proto) { - case IPPROTO_ICMP: - /* check if one-byte field is readable or not. */ - if (iov->iov_base && iov->iov_len < 1) - break; - - if (!type) { - type = iov->iov_base; - /* check if code field is readable or not. 
*/ - if (iov->iov_len > 1) - code = type + 1; - } else if (!code) - code = iov->iov_base; - - if (type && code) { - if (get_user(fl4->fl4_icmp_type, type) || - get_user(fl4->fl4_icmp_code, code)) - return -EFAULT; - probed = 1; - } - break; - default: - probed = 1; - break; - } - if (probed) - break; - } + /* We only need the first two bytes. */ + err = memcpy_fromiovecend((void *)&icmph, msg->msg_iov, 0, 2); + if (err) + return err; + + fl4->fl4_icmp_type = icmph.type; + fl4->fl4_icmp_code = icmph.code; + return 0; } ^ permalink raw reply related [flat|nested] 82+ messages in thread
* [PATCH 2/2] ipv4: Avoid reading user iov twice after raw_probe_proto_opt 2014-11-07 13:25 ` [PATCH 0/2] ipv4: Simplify raw_probe_proto_opt and avoid reading user iov twice Herbert Xu 2014-11-07 13:27 ` [PATCH 1/2] ipv4: Use standard iovec primitive in raw_probe_proto_opt Herbert Xu @ 2014-11-07 13:27 ` Herbert Xu 2014-11-10 19:26 ` [PATCH 0/2] ipv4: Simplify raw_probe_proto_opt and avoid reading user iov twice David Miller 2 siblings, 0 replies; 82+ messages in thread From: Herbert Xu @ 2014-11-07 13:27 UTC (permalink / raw) To: David Miller, viro, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki, nakam Ever since raw_probe_proto_opt was added it had the problem of causing the user iov to be read twice, once during the probe for the protocol header and once again in ip_append_data. This is a potential security problem since it means that whatever we're probing may be invalid. This patch plugs the hole by firstly advancing the iov so we don't read the same spot again, and secondly saving what we read the first time around for use by ip_append_data. 
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> --- net/ipv4/raw.c | 62 +++++++++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 54 insertions(+), 8 deletions(-) diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 9be9050..43385a9 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -79,6 +79,16 @@ #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> #include <linux/compat.h> +#include <linux/uio.h> + +struct raw_frag_vec { + struct iovec *iov; + union { + struct icmphdr icmph; + char c[1]; + } hdr; + int hlen; +}; static struct raw_hashinfo raw_v4_hashinfo = { .lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock), @@ -420,25 +430,57 @@ error: return err; } -static int raw_probe_proto_opt(struct flowi4 *fl4, struct msghdr *msg) +static int raw_probe_proto_opt(struct raw_frag_vec *rfv, struct flowi4 *fl4) { - struct icmphdr icmph; int err; if (fl4->flowi4_proto != IPPROTO_ICMP) return 0; /* We only need the first two bytes. */ - err = memcpy_fromiovecend((void *)&icmph, msg->msg_iov, 0, 2); + rfv->hlen = 2; + + err = memcpy_fromiovec(rfv->hdr.c, rfv->iov, rfv->hlen); if (err) return err; - fl4->fl4_icmp_type = icmph.type; - fl4->fl4_icmp_code = icmph.code; + fl4->fl4_icmp_type = rfv->hdr.icmph.type; + fl4->fl4_icmp_code = rfv->hdr.icmph.code; return 0; } +static int raw_getfrag(void *from, char *to, int offset, int len, int odd, + struct sk_buff *skb) +{ + struct raw_frag_vec *rfv = from; + + if (offset < rfv->hlen) { + int copy = min(rfv->hlen - offset, len); + + if (skb->ip_summed == CHECKSUM_PARTIAL) + memcpy(to, rfv->hdr.c + offset, copy); + else + skb->csum = csum_block_add( + skb->csum, + csum_partial_copy_nocheck(rfv->hdr.c + offset, + to, copy, 0), + odd); + + odd = 0; + offset += copy; + to += copy; + len -= copy; + + if (!len) + return 0; + } + + offset -= rfv->hlen; + + return ip_generic_getfrag(rfv->iov, to, offset, len, odd, skb); +} + static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len) 
{ @@ -452,6 +494,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, u8 tos; int err; struct ip_options_data opt_copy; + struct raw_frag_vec rfv; err = -EMSGSIZE; if (len > 0xFFFF) @@ -557,7 +600,10 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, daddr, saddr, 0, 0); if (!inet->hdrincl) { - err = raw_probe_proto_opt(&fl4, msg); + rfv.iov = msg->msg_iov; + rfv.hlen = 0; + + err = raw_probe_proto_opt(&rfv, &fl4); if (err) goto done; } @@ -588,8 +634,8 @@ back_from_confirm: if (!ipc.addr) ipc.addr = fl4.daddr; lock_sock(sk); - err = ip_append_data(sk, &fl4, ip_generic_getfrag, - msg->msg_iov, len, 0, + err = ip_append_data(sk, &fl4, raw_getfrag, + &rfv, len, 0, &ipc, &rt, msg->msg_flags); if (err) ip_flush_pending_frames(sk); ^ permalink raw reply related [flat|nested] 82+ messages in thread
* Re: [PATCH 0/2] ipv4: Simplify raw_probe_proto_opt and avoid reading user iov twice 2014-11-07 13:25 ` [PATCH 0/2] ipv4: Simplify raw_probe_proto_opt and avoid reading user iov twice Herbert Xu 2014-11-07 13:27 ` [PATCH 1/2] ipv4: Use standard iovec primitive in raw_probe_proto_opt Herbert Xu 2014-11-07 13:27 ` [PATCH 2/2] ipv4: Avoid reading user iov twice after raw_probe_proto_opt Herbert Xu @ 2014-11-10 19:26 ` David Miller 2 siblings, 0 replies; 82+ messages in thread From: David Miller @ 2014-11-10 19:26 UTC (permalink / raw) To: herbert; +Cc: viro, netdev, linux-kernel, bcrl, nakam, yoshfuji From: Herbert Xu <herbert@gondor.apana.org.au> Date: Fri, 7 Nov 2014 21:25:53 +0800 > This series rewrites the function raw_probe_proto_opt in a more > readable fasion, and then fixes the long-standing bug where we > read the probed bytes twice which means that what we're using to > probe may in fact be invalid. Series applied to net-next, thanks Herbert. ^ permalink raw reply [flat|nested] 82+ messages in thread
* RE: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-06 3:25 ` Al Viro 2014-11-06 5:50 ` ipv4: Use standard iovec primitive in raw_probe_proto_opt Herbert Xu @ 2014-11-06 9:50 ` Jon Maloy 2014-11-07 21:48 ` David Miller 2014-11-07 21:52 ` David Miller 3 siblings, 0 replies; 82+ messages in thread From: Jon Maloy @ 2014-11-06 9:50 UTC (permalink / raw) To: Al Viro, David Miller; +Cc: herbert, netdev, linux-kernel, bcrl > -----Original Message----- > From: netdev-owner@vger.kernel.org [mailto:netdev- > owner@vger.kernel.org] On Behalf Of Al Viro > Sent: November-06-14 4:26 AM > To: David Miller > Cc: herbert@gondor.apana.org.au; netdev@vger.kernel.org; linux- > kernel@vger.kernel.org; bcrl@kvack.org > Subject: Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter > > On Wed, Nov 05, 2014 at 04:57:19PM -0500, David Miller wrote: > > From: Al Viro <viro@ZenIV.linux.org.uk> > > Date: Wed, 5 Nov 2014 21:07:45 +0000 > > > > > Ping me when you put it there, OK? I'll rebase the rest of old > > > stuff on top of it (similar helpers, mostly). > > > > I just pushed it into net-next, thanks Al. > > OK, I've taken the beginning of the old queue on top of net-next; it's in > git://git.kernel.org//pub/scm/linux/kernel/git/viro/vfs.git iov_iter-net. > > From the quick look at the remaining ->msg_iov users: > > * I'll need to add several iov_iter primitives - counterparts of > checksum.h stuff (copy_and_csum_{from,to}_iter(), maybe some more). > Not a big deal, I'll do that tomorrow. That will give us a clean iov_iter-based > counterpart of skb_copy_and_csum_datagram_iovec(). > > * a new helper: zerocopy_sg_from_iter(). I have it, actually, but I'd > rather not step on Herbert's toes - it's too close to the areas his series will > touch, so that's probably for when his series goes in. > It will be needed for complete macvtap conversion... > > * why doesn't verify_iovec() use rw_copy_check_uvector()? 
The > only real differences I see is that (a) you do allocation in callers (same as > rw_copy_check_uvector() would've done), (b) you return EMSGSIZE in case > of too long vector, while rw_copy_check_uvector() returns EINVAL in that > case and (c) you don't do access_ok(). The last one is described as > optimization, but for iov_iter primitives it's a serious PITA - for iovec-backed > instances they are using __copy_from_user()/__copy_to_user(), etc. > It certainly would be nice to have the same code doing all copying of > iovecs from userland - readv/writev/aio/sendmsg/recvmsg/etc. Am I > missing something subtle semantical difference in there? EMSGSIZE vs > EINVAL is trivial (we can lift that check into the callers, if nothing else), but I > could miss something more interesting... > > * various getfrag will need to grow iov_iter-based counterparts, but > ip_append_output() needs no changes, AFAICS. > > * crypto stuff will be easy to convert - iov_iter_get_pages() would > suffice for a primitive > > * there's some really weird stuff in there. Just what is this static int > raw_probe_proto_opt(struct flowi4 *fl4, struct msghdr *msg) { > struct iovec *iov; > u8 __user *type = NULL; > u8 __user *code = NULL; > int probed = 0; > unsigned int i; > > if (!msg->msg_iov) > return 0; > > for (i = 0; i < msg->msg_iovlen; i++) { > iov = &msg->msg_iov[i]; > if (!iov) > continue; > trying to do? "If non-NULL pointer + i somehow happened to be NULL, skip it > and try to use the same pointer + i + 1"? Huh? Had been that way since the > function first went in back in 2004 ("[IPV4] XFRM: probe icmp type/code > when sending packets via raw socket.", according to historical tree)... > > * rds, bluetooth and vsock are doing something odd; need to RTFS > some more. > > * not sure I understand what TIPC is doing - does it prohibit too short > first segment of ->msg_iov? Yes, that is the purpose. 
It was needed in early versions of TIPC, which was using TIPC itself, instead of netlink, as carrier of configuration commands. This option is long gone, and we can safely remove those checks. I'll post a patch. ///jon net/tipc/socket.c:dest_name_check() looks > odd _and_ potentially racy - we read the same data twice and hope our > checks still apply. I asked TIPC folks about that race back in April, but it looks > like that fell through the cracks... > > Overall, so far it looks more or less feasible - other than the missing csum > primitives, current mm/iov_iter.c should suffice. I have _not_ seriously > looked into sendpage yet; that might very well require some more. > -- > To unsubscribe from this list: send the line "unsubscribe netdev" in the body > of a message to majordomo@vger.kernel.org More majordomo info at > http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-06 3:25 ` Al Viro 2014-11-06 5:50 ` ipv4: Use standard iovec primitive in raw_probe_proto_opt Herbert Xu 2014-11-06 9:50 ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Jon Maloy @ 2014-11-07 21:48 ` David Miller 2014-11-07 22:11 ` Al Viro 2014-11-07 21:52 ` David Miller 3 siblings, 1 reply; 82+ messages in thread From: David Miller @ 2014-11-07 21:48 UTC (permalink / raw) To: viro; +Cc: herbert, netdev, linux-kernel, bcrl From: Al Viro <viro@ZenIV.linux.org.uk> Date: Thu, 6 Nov 2014 03:25:34 +0000 > OK, I've taken the beginning of the old queue on top of net-next; it's > in git://git.kernel.org//pub/scm/linux/kernel/git/viro/vfs.git iov_iter-net. What I see in there looks good. I wonder if we can somehow make msghdr pointer args const... but this is not so important now. Some minor coding style nits, comments: /* Like * this. */ and for multi-line function calls in the networking, align the second and subsequent lines at the first column after the opening parenthesis of the first line. Thanks. ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-07 21:48 ` David Miller @ 2014-11-07 22:11 ` Al Viro 2014-11-07 22:31 ` Al Viro 2014-11-07 23:42 ` Al Viro 0 siblings, 2 replies; 82+ messages in thread From: Al Viro @ 2014-11-07 22:11 UTC (permalink / raw) To: David Miller; +Cc: herbert, netdev, linux-kernel, bcrl On Fri, Nov 07, 2014 at 04:48:59PM -0500, David Miller wrote: > From: Al Viro <viro@ZenIV.linux.org.uk> > Date: Thu, 6 Nov 2014 03:25:34 +0000 > > > OK, I've taken the beginning of the old queue on top of net-next; it's > > in git://git.kernel.org//pub/scm/linux/kernel/git/viro/vfs.git iov_iter-net. > > What I see in there looks good. I wonder if we can somehow make msghdr > pointer args const... but this is not so important now. > > Some minor coding style nits, comments: > > /* Like > * this. > */ > > and for multi-line function calls in the networking, align the second > and subsequent lines at the first column after the openning parenthesis > of the first line. OK... I played with csum side of things a bit, and I've noticed something really nasty - iov_iter primitives all assume that iovec has been validated, _including_ access_ok() on all ranges. That allows us to use __copy_...() in primitives, and on the read/write/readv/writev/aio side of things we have that validation done when we copy iovec from userland (or set a single-element iovec over the userland-supplied range, as in read(2)/write(2)). net/* primitives, OTOH, do access_ok() themselves - syscalls do _not_ check access_ok() on iovec from untrusted source and rely on the low-level stuff to do such checks. Result: regular IO syscalls on sockets (i.e. not recvmsg/sendmsg, usual read/write) do these checks (at least) twice and use of copy_from_iter() in ->recvmsg() opens quite a nasty hole - one can call recvmsg(2) with kernel address in ->msg_iov[0].base and have such an instance of ->recvmsg() stomp on the kernel memory. 
At the very least, with Herbert's patches we need to validate that somewhere on the way to tun and macvtap recvmsg instances. We can do that right there, and as a stopgap measure it might be a good idea. However, it's not a sane long-term solution. We could, of course, add those access_ok() in mm/iov_iter.c and drop them from fs/read_write.c and fs/aio.c, but I really don't see the point - why not do that along with the checks we do in verify_iovec() anyway? And drop them from memcpy_fromiovec() et.al. I'm looking through the tree right now; so far it looks like we can just move those suckers to the point where we validate iovec and lose them from low-level iovec and csum copying completely. I still haven't finished tracing all possible paths for address to arrive at the points where we currently check that stuff, but so far it looks very doable. Comments? ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-07 22:11 ` Al Viro @ 2014-11-07 22:31 ` Al Viro 2014-11-07 22:35 ` Al Viro 2014-11-07 23:42 ` Al Viro 1 sibling, 1 reply; 82+ messages in thread From: Al Viro @ 2014-11-07 22:31 UTC (permalink / raw) To: David Miller; +Cc: herbert, netdev, linux-kernel, bcrl On Fri, Nov 07, 2014 at 10:11:14PM +0000, Al Viro wrote: > I'm looking through the tree right now; so far it looks like we can just > move those suckers to the point where we validate iovec and lose them > from low-level iovec and csum copying completely. I still haven't finished > tracing all possible paths for address to arrive at the points where we > currently check that stuff, but so far it looks very doable. BTW, csum side of that is also chock-full of duplicate access_ok() - e.g. generic csum_and_copy_from_user() checks before calling csum_partial_copy_from_user(). And generic instance of that is using __copy_from_user(), all right, but a _lot_ of non-default instances repeat that access_ok(). ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-07 22:31 ` Al Viro @ 2014-11-07 22:35 ` Al Viro 0 siblings, 0 replies; 82+ messages in thread From: Al Viro @ 2014-11-07 22:35 UTC (permalink / raw) To: David Miller; +Cc: herbert, netdev, linux-kernel, bcrl On Fri, Nov 07, 2014 at 10:31:53PM +0000, Al Viro wrote: > On Fri, Nov 07, 2014 at 10:11:14PM +0000, Al Viro wrote: > > > I'm looking through the tree right now; so far it looks like we can just > > move those suckers to the point where we validate iovec and lose them > > from low-level iovec and csum copying completely. I still haven't finished > > tracing all possible paths for address to arrive at the points where we > > currently check that stuff, but so far it looks very doable. > > BTW, csum side of that is also chock-full of duplicate access_ok() - > e.g. generic csum_and_copy_from_user() checks before calling > csum_partial_copy_from_user(). And generic instance of that is using > __copy_from_user(), all right, but a _lot_ of non-default instances > repeat that access_ok(). While we are at it: here's the default csum_and_copy_to_user() static __inline__ __wsum csum_and_copy_to_user (const void *src, void __user *dst, int len, __wsum sum, int *err_ptr) { sum = csum_partial(src, len, sum); if (access_ok(VERIFY_WRITE, dst, len)) { if (copy_to_user(dst, src, len) == 0) return sum; } if (len) *err_ptr = -EFAULT; return (__force __wsum)-1; /* invalid checksum */ } Note that we do that access_ok() and follow it with copy_to_user() on exact same range, i.e. repeat the same damn check... ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-07 22:11 ` Al Viro 2014-11-07 22:31 ` Al Viro @ 2014-11-07 23:42 ` Al Viro 2014-11-08 2:21 ` Herbert Xu 2014-11-09 21:19 ` Al Viro 1 sibling, 2 replies; 82+ messages in thread From: Al Viro @ 2014-11-07 23:42 UTC (permalink / raw) To: David Miller; +Cc: herbert, netdev, linux-kernel, bcrl On Fri, Nov 07, 2014 at 10:11:14PM +0000, Al Viro wrote: > I'm looking through the tree right now; so far it looks like we can just > move those suckers to the point where we validate iovec and lose them > from low-level iovec and csum copying completely. I still haven't finished > tracing all possible paths for address to arrive at the points where we > currently check that stuff, but so far it looks very doable. Definitely doable. The only remaining interesting part is drivers/vhost with the stuff it puts in vq->iov[]. If we can guarantee that it satisfies the sanity checks (access_ok() and size-related ones), we are done - making verify_iovec() use rw_copy_check_uvector() (and verify_compat_iov() use compat_rw_copy_check_uvector()) will suffice to guarantee that none of csum_partial_copy_fromiovecend memcpy_fromiovec memcpy_toiovec memcpy_toiovecend memcpy_fromiovecend skb_copy_datagram_iovec skb_copy_datagram_iter skb_copy_datagram_from_iter zerocopy_sg_from_iter skb_copy_and_csum_datagram skb_copy_and_csum_datagram_iovec csum_and_copy_from_user csum_and_copy_to_user csum_partial_copy_from_user will ever see an address that doesn't satisfy access_ok() checks. And having looked at the data flow... we definitely want to do those checks on intake of iovec - as it is, we usually repeat them quite a few times for the same iovec segment, and we practically never end up _not_ doing them for some segment of iovec, unless we hit a failure exit before we get around to copying any data at all. 
I'll finish RTFS drivers/vhost and if it turns out to be OK I'll post the series moving those checks to the moment of copying iovec from userland, so that kernel-side we could always rely on ->msg_iov elements having been verified. ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-07 23:42 ` Al Viro @ 2014-11-08 2:21 ` Herbert Xu 2014-11-09 21:19 ` Al Viro 1 sibling, 0 replies; 82+ messages in thread From: Herbert Xu @ 2014-11-08 2:21 UTC (permalink / raw) To: Al Viro; +Cc: David Miller, netdev, linux-kernel, bcrl On Fri, Nov 07, 2014 at 11:42:53PM +0000, Al Viro wrote: > I'll finish RTFS drivers/vhost and if it turns out to be OK I'll post the > series moving those checks to the moment of copying iovec from userland, > so that kernel-side we could always rely on ->msg_iov elements having been > verified. Great thanks! Dave, please hold off on my skb_copy_datagram_iter series until this verify_iovec change is added. Cheers, -- Email: Herbert Xu <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-07 23:42 ` Al Viro 2014-11-08 2:21 ` Herbert Xu @ 2014-11-09 21:19 ` Al Viro 2014-11-10 5:20 ` David Miller 2014-11-10 10:14 ` Michael S. Tsirkin 1 sibling, 2 replies; 82+ messages in thread From: Al Viro @ 2014-11-09 21:19 UTC (permalink / raw) To: David Miller; +Cc: herbert, netdev, linux-kernel, bcrl, Michael S. Tsirkin [Michael Cc'd] On Fri, Nov 07, 2014 at 11:42:53PM +0000, Al Viro wrote: > I'll finish RTFS drivers/vhost and if it turns out to be OK I'll post the > series moving those checks to the moment of copying iovec from userland, > so that kernel-side we could always rely on ->msg_iov elements having been > verified. Two questions: 1) does sparc64 access_ok() need to differ for 32bit and 64bit tasks? AFAICS, x86 and ppc just check that address is OK for 64bit process - if a 32bit process passes the kernel an address that would be valid for 64bit process, but not for 32bit one, we just get a pagefault in __copy_from_user() and friends. No kernel objects are going to have a virtual address in that range, so access_ok() doesn't bother preventing such access attempts there... 2) shouldn't vhost_dev_cleanup() stop the worker thread before doing anything else? AFAICS, we do parts of vhost_dev teardown while the thread is still running; granted, we keep dev->mm pinned down until after it stops (or we would be _really_ screwed), but is it safe to do all those fput()s, etc. while it's still running? Michael? ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-09 21:19 ` Al Viro @ 2014-11-10 5:20 ` David Miller 2014-11-10 6:58 ` Al Viro 2014-11-10 10:14 ` Michael S. Tsirkin 1 sibling, 1 reply; 82+ messages in thread From: David Miller @ 2014-11-10 5:20 UTC (permalink / raw) To: viro; +Cc: herbert, netdev, linux-kernel, bcrl, mst From: Al Viro <viro@ZenIV.linux.org.uk> Date: Sun, 9 Nov 2014 21:19:08 +0000 > 1) does sparc64 access_ok() need to differ for 32bit and 64bit tasks? sparc64 will just fault no matter what kind of task it is. It is impossible for a user task to generate a reference to a kernel virtual address, as kernel and user accesses each go via a separate address space identifier. ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-10 5:20 ` David Miller @ 2014-11-10 6:58 ` Al Viro 2014-11-10 7:30 ` David Miller 0 siblings, 1 reply; 82+ messages in thread From: Al Viro @ 2014-11-10 6:58 UTC (permalink / raw) To: David Miller; +Cc: herbert, netdev, linux-kernel, bcrl, mst On Mon, Nov 10, 2014 at 12:20:20AM -0500, David Miller wrote: > From: Al Viro <viro@ZenIV.linux.org.uk> > Date: Sun, 9 Nov 2014 21:19:08 +0000 > > > 1) does sparc64 access_ok() need to differ for 32bit and 64bit tasks? > > sparc64 will just fault no matter what kind of task it is. > > It is impossible for a user task to generate a reference to > a kernel virtual address, as kernel and user accesses each > go via a separate address space identifier. Sure, but why do we have access_ok() there at all? I.e. why not just have it constant 1? ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-10 6:58 ` Al Viro @ 2014-11-10 7:30 ` David Miller 2014-11-10 9:09 ` Al Viro 0 siblings, 1 reply; 82+ messages in thread From: David Miller @ 2014-11-10 7:30 UTC (permalink / raw) To: viro; +Cc: herbert, netdev, linux-kernel, bcrl, mst From: Al Viro <viro@ZenIV.linux.org.uk> Date: Mon, 10 Nov 2014 06:58:17 +0000 > On Mon, Nov 10, 2014 at 12:20:20AM -0500, David Miller wrote: >> From: Al Viro <viro@ZenIV.linux.org.uk> >> Date: Sun, 9 Nov 2014 21:19:08 +0000 >> >> > 1) does sparc64 access_ok() need to differ for 32bit and 64bit tasks? >> >> sparc64 will just fault no matter what kind of task it is. >> >> It is impossible for a user task to generate a reference to >> a kernel virtual address, as kernel and user accesses each >> go via a separate address space identifier. > > Sure, but why do we have access_ok() there at all? I.e. why not just have > it constant 1? Since access_ok() is in fact constant 1 on sparc64, where we use it, does it really matter? ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-10 7:30 ` David Miller @ 2014-11-10 9:09 ` Al Viro 2014-11-10 16:18 ` David Miller 0 siblings, 1 reply; 82+ messages in thread From: Al Viro @ 2014-11-10 9:09 UTC (permalink / raw) To: David Miller; +Cc: herbert, netdev, linux-kernel, bcrl, mst On Mon, Nov 10, 2014 at 02:30:00AM -0500, David Miller wrote: > From: Al Viro <viro@ZenIV.linux.org.uk> > Date: Mon, 10 Nov 2014 06:58:17 +0000 > > > On Mon, Nov 10, 2014 at 12:20:20AM -0500, David Miller wrote: > >> From: Al Viro <viro@ZenIV.linux.org.uk> > >> Date: Sun, 9 Nov 2014 21:19:08 +0000 > >> > >> > 1) does sparc64 access_ok() need to differ for 32bit and 64bit tasks? > >> > >> sparc64 will just fault no matter what kind of task it is. > >> > >> It is impossible for a user task to generate a reference to > >> a kernel virtual address, as kernel and user accesses each > >> go via a separate address space identifier. > > > > Sure, but why do we have access_ok() there at all? I.e. why not just have > > it constant 1? > > Since access_ok() is in fact constant 1 on sparc64, where we use it, > does it really matter? *blink* My apologies - I've got confused by the maze of twisty includes, all alike. Right you are; in biarch case it *doesn't* depend on 32bit vs. 64bit. STACK_TOP-using one is sparc32 variant where we obviously don't have biarch at all. Anyway, the series switching to {compat_,}rw_copy_check_uvector() and getting rid of duplicate checks is in vfs.git#iov_iter-net. Warning: it's almost completely untested. It survives boot, ssh into it and runltp -f syscalls (no regressions), but that's about it. BTW, what's the usual regression suite used for net/* stuff? 3 commits in there, on top of net-next#master; head should be at 555126. There's a bunch of fairly obvious followups becoming possible after that, but let's keep those separate... ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-10 9:09 ` Al Viro @ 2014-11-10 16:18 ` David Miller 0 siblings, 0 replies; 82+ messages in thread From: David Miller @ 2014-11-10 16:18 UTC (permalink / raw) To: viro; +Cc: herbert, netdev, linux-kernel, bcrl, mst From: Al Viro <viro@ZenIV.linux.org.uk> Date: Mon, 10 Nov 2014 09:09:55 +0000 > BTW, what's the usual regression suite used for net/* stuff? There's a regression suite? :-) ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-09 21:19 ` Al Viro 2014-11-10 5:20 ` David Miller @ 2014-11-10 10:14 ` Michael S. Tsirkin 1 sibling, 0 replies; 82+ messages in thread From: Michael S. Tsirkin @ 2014-11-10 10:14 UTC (permalink / raw) To: Al Viro; +Cc: David Miller, herbert, netdev, linux-kernel, bcrl On Sun, Nov 09, 2014 at 09:19:08PM +0000, Al Viro wrote: > [Michael Cc'd] > > On Fri, Nov 07, 2014 at 11:42:53PM +0000, Al Viro wrote: > > > I'll finish RTFS drivers/vhost and if it turns out to be OK I'll post the > > series moving those checks to the moment of copying iovec from userland, > > so that kernel-side we could always rely on ->msg_iov elements having been > > verified. > > Two questions: > 1) does sparc64 access_ok() need to differ for 32bit and 64bit tasks? > AFAICS, x86 and ppc just check that address is OK for 64bit process - > if a 32bit process passes the kernel an address that would be valid > for 64bit process, but not for 32bit one, we just get a pagefault in > __copy_from_user() and friends. No kernel objects are going to have > a virtual address in that range, so access_ok() doesn't bother preventing > such access attempts there... > > 2) shouldn't vhost_dev_cleanup() stop the worker thread before doing anything > else? > AFAICS, we do parts of vhost_dev teardown while the thread is > still running; granted, we keep dev->mm pinned down until after it stops > (or we would be _really_ screwed), but is it safe to do all those fput()s, etc. > while it's still running? Michael? Before invoking vhost_dev_cleanup, the caller for vhost-net (vhost_net_release) sets private data to NULL (using vhost_net_stop_vq) which guarantees thread will do nothing at all. vhost scsi does it in vhost_scsi_clear_endpoint. -- MST ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-06 3:25 ` Al Viro ` (2 preceding siblings ...) 2014-11-07 21:48 ` David Miller @ 2014-11-07 21:52 ` David Miller 3 siblings, 0 replies; 82+ messages in thread From: David Miller @ 2014-11-07 21:52 UTC (permalink / raw) To: viro; +Cc: herbert, netdev, linux-kernel, bcrl From: Al Viro <viro@ZenIV.linux.org.uk> Date: Thu, 6 Nov 2014 03:25:34 +0000 > * a new helper: zerocopy_sg_from_iter(). I have it, actually, > but I'd rather not step on Herbert's toes - it's too close to the areas > his series will touch, so that's probably for when his series goes in. > It will be needed for complete macvtap conversion... Just a heads up, his series is applied to net-next. > * why doesn't verify_iovec() use rw_copy_check_uvector()? The only > real differences I see is that (a) you do allocation in callers (same as > rw_copy_check_uvector() would've done), (b) you return EMSGSIZE in case of > too long vector, while rw_copy_check_uvector() returns EINVAL in that case > and (c) you don't do access_ok(). The last one is described as optimization, > but for iov_iter primitives it's a serious PITA - for iovec-backed instances > they are using __copy_from_user()/__copy_to_user(), etc. The answer is that nobody knew about it and looked, that's why. > It certainly would be nice to have the same code doing all copying > of iovecs from userland - readv/writev/aio/sendmsg/recvmsg/etc. Am I > missing something subtle semantical difference in there? EMSGSIZE vs EINVAL > is trivial (we can lift that check into the callers, if nothing else), but > I could miss something more interesting... We also need compat counterparts. > * various getfrag will need to grow iov_iter-based counterparts, > but ip_append_output() needs no changes, AFAICS. Right. > * there's some really weird stuff in there. 
Just what is this > static int raw_probe_proto_opt(struct flowi4 *fl4, struct msghdr *msg) > { > struct iovec *iov; > u8 __user *type = NULL; > u8 __user *code = NULL; > int probed = 0; > unsigned int i; > > if (!msg->msg_iov) > return 0; > > for (i = 0; i < msg->msg_iovlen; i++) { > iov = &msg->msg_iov[i]; > if (!iov) > continue; > trying to do? "If non-NULL pointer + i somehow happened to be NULL, skip it > and try to use the same pointer + i + 1"? Huh? Had been that way since > the function first went in back in 2004 ("[IPV4] XFRM: probe icmp type/code > when sending packets via raw socket.", according to historical tree)... This is probably just bogus, because this address-of will never evaluate to NULL. > * rds, bluetooth and vsock are doing something odd; need to RTFS some > more. It is not surprising.... :-/ ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-04 8:31 ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Herbert Xu 2014-11-04 14:32 ` Al Viro @ 2014-11-05 20:24 ` David Miller 2014-11-06 8:23 ` Herbert Xu 2014-11-06 8:27 ` [PATCH 0/4] Replace skb_copy_datagram_const_iovec with iterator version Herbert Xu 1 sibling, 2 replies; 82+ messages in thread From: David Miller @ 2014-11-05 20:24 UTC (permalink / raw) To: herbert; +Cc: viro, netdev, linux-kernel, bcrl Herbert, please provide a cover letter for this series, and the most recent version of patch #2 gets various rejects when I try to apply it to net-next. Thanks. ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-05 20:24 ` David Miller @ 2014-11-06 8:23 ` Herbert Xu 2014-11-06 17:25 ` David Miller 2014-11-06 8:27 ` [PATCH 0/4] Replace skb_copy_datagram_const_iovec with iterator version Herbert Xu 1 sibling, 1 reply; 82+ messages in thread From: Herbert Xu @ 2014-11-06 8:23 UTC (permalink / raw) To: David Miller; +Cc: viro, netdev, linux-kernel, bcrl On Wed, Nov 05, 2014 at 03:24:10PM -0500, David Miller wrote: > > Herbert, please provide a cover letter for this series, and the most recent > version of patch #2 gets various rejects when I try to apply it to net-next. Sure, I'll regenerate them. However, while doing so I noticed that a number of my patches on tun/macvtap that you have previously set as accepted are missing from net-next. Could this be why you got the rejects? For example, this patch wasn't in net-next when I just did a pull. https://patchwork.ozlabs.org/patch/405966/ Cheers, -- Email: Herbert Xu <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-06 8:23 ` Herbert Xu @ 2014-11-06 17:25 ` David Miller 2014-11-07 1:59 ` Herbert Xu 0 siblings, 1 reply; 82+ messages in thread From: David Miller @ 2014-11-06 17:25 UTC (permalink / raw) To: herbert; +Cc: viro, netdev, linux-kernel, bcrl From: Herbert Xu <herbert@gondor.apana.org.au> Date: Thu, 6 Nov 2014 16:23:38 +0800 > On Wed, Nov 05, 2014 at 03:24:10PM -0500, David Miller wrote: >> >> Herbert, please provide a cover letter for this series, and the most recent >> version of patch #2 gets various rejects when I try to apply it to net-next. > > Sure, I'll regenerate them. However, while doing so I noticed that > a number of my patches on tun/macvtap that you have previously set > as accepted are missing from net-next. Could this be why you got > the rejects? Those were bug fixes so went into plain 'net', they will show up next time I do a merge and I will deal with the conflicts, if any. ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-06 17:25 ` David Miller @ 2014-11-07 1:59 ` Herbert Xu 2014-11-07 3:13 ` David Miller 0 siblings, 1 reply; 82+ messages in thread From: Herbert Xu @ 2014-11-07 1:59 UTC (permalink / raw) To: David Miller; +Cc: viro, netdev, linux-kernel, bcrl On Thu, Nov 06, 2014 at 12:25:00PM -0500, David Miller wrote: > From: Herbert Xu <herbert@gondor.apana.org.au> > Date: Thu, 6 Nov 2014 16:23:38 +0800 > > > On Wed, Nov 05, 2014 at 03:24:10PM -0500, David Miller wrote: > >> > >> Herbert, please provide a cover letter for this series, and the most recent > >> version of patch #2 gets various rejects when I try to apply it to net-next. > > > > Sure, I'll regenerate them. However, while doing so I noticed that > > a number of my patches on tun/macvtap that you have previously set > > as accepted are missing from net-next. Could this be why you got > > the rejects? > > Those were bug fixes so went into plain 'net', they will show up next > time I do a merge and I will deal with the conflicts, if any. I see. In that case it might be best to wait until those fixes hit net-next first before applying these patches as otherwise Stephen will get hit with some nasty merge conflicts. I'll repost them for RFC with the problems that Al pointed out fixed in the mean time. Cheers, -- Email: Herbert Xu <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-07 1:59 ` Herbert Xu @ 2014-11-07 3:13 ` David Miller 2014-11-07 13:21 ` [PATCH 0/4] Replace skb_copy_datagram_const_iovec with iterator version Herbert Xu 0 siblings, 1 reply; 82+ messages in thread From: David Miller @ 2014-11-07 3:13 UTC (permalink / raw) To: herbert; +Cc: viro, netdev, linux-kernel, bcrl From: Herbert Xu <herbert@gondor.apana.org.au> Date: Fri, 7 Nov 2014 09:59:44 +0800 > In that case it might be best to wait until those fixes hit net-next > first before applying these patches as otherwise Stephen will get > hit with some nasty merge conflicts. I just merged net into net-next, so this barrier should no longer be present. Thanks. ^ permalink raw reply [flat|nested] 82+ messages in thread
* [PATCH 0/4] Replace skb_copy_datagram_const_iovec with iterator version 2014-11-07 3:13 ` David Miller @ 2014-11-07 13:21 ` Herbert Xu 2014-11-07 13:22 ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Herbert Xu ` (3 more replies) 0 siblings, 4 replies; 82+ messages in thread From: Herbert Xu @ 2014-11-07 13:21 UTC (permalink / raw) To: David Miller; +Cc: viro, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki Hi Dave: This patch series adds the helper skb_copy_datagram_iter, which is meant to replace both skb_copy_datagram_iovec and its evil twin skb_copy_datagram_const_iovec. It then converts tun and macvtap over to the new helper and finally removes skb_copy_datagram_const_iovec which is only used by tun and macvtap. The copy_to_iter return value issue pointed out by Al has now been fixed. Cheers, -- Email: Herbert Xu <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 82+ messages in thread
* [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-07 13:21 ` [PATCH 0/4] Replace skb_copy_datagram_const_iovec with iterator version Herbert Xu @ 2014-11-07 13:22 ` Herbert Xu 2014-11-07 13:22 ` [PATCH 2/4] tun: Use iovec iterators Herbert Xu ` (2 subsequent siblings) 3 siblings, 0 replies; 82+ messages in thread From: Herbert Xu @ 2014-11-07 13:22 UTC (permalink / raw) To: David Miller, viro, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki This patch adds skb_copy_datagram_iter, which is identical to skb_copy_datagram_iovec except that it operates on iov_iter instead of iovec. Eventually all users of skb_copy_datagram_iovec should switch over to iov_iter and then we can remove skb_copy_datagram_iovec. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> --- include/linux/skbuff.h | 3 + net/core/datagram.c | 87 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 53f4f6c..933cfce 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -150,6 +150,7 @@ struct net_device; struct scatterlist; struct pipe_inode_info; +struct iov_iter; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct nf_conntrack { @@ -2653,6 +2654,8 @@ int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *frm, int skb_copy_datagram_const_iovec(const struct sk_buff *from, int offset, const struct iovec *to, int to_offset, int size); +int skb_copy_datagram_iter(const struct sk_buff *from, int offset, + struct iov_iter *to, int size); void skb_free_datagram(struct sock *sk, struct sk_buff *skb); void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb); int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags); diff --git a/net/core/datagram.c b/net/core/datagram.c index fdbc9a8..84d90d0 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -49,6 +49,7 @@ #include <linux/spinlock.h> #include <linux/slab.h> 
#include <linux/pagemap.h> +#include <linux/uio.h> #include <net/protocol.h> #include <linux/skbuff.h> @@ -482,6 +483,92 @@ fault: EXPORT_SYMBOL(skb_copy_datagram_const_iovec); /** + * skb_copy_datagram_iter - Copy a datagram to an iovec iterator. + * @skb: buffer to copy + * @offset: offset in the buffer to start copying from + * @to: iovec iterator to copy to + * @len: amount of data to copy from buffer to iovec + */ +int skb_copy_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + struct sk_buff *frag_iter; + + trace_skb_copy_datagram_iovec(skb, len); + + /* Copy header. */ + if (copy > 0) { + if (copy > len) + copy = len; + if (copy_to_iter(skb->data + offset, copy, to) != copy) + goto short_copy; + if ((len -= copy) == 0) + return 0; + offset += copy; + } + + /* Copy paged appendix. Hmm... why does this look so complicated? */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + WARN_ON(start > offset + len); + + end = start + skb_frag_size(frag); + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (copy_page_to_iter(skb_frag_page(frag), + frag->page_offset + offset - + start, copy, to) != copy) + goto short_copy; + if (!(len -= copy)) + return 0; + offset += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (skb_copy_datagram_iter(frag_iter, offset - start, + to, copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + } + start = end; + } + if (!len) + return 0; + + /* This is not really a user copy fault, but rather someone + * gave us a bogus length on the skb. We should probably + * print a warning here as it may indicate a kernel bug. 
+ */ + +fault: + return -EFAULT; + +short_copy: + if (iov_iter_count(to)) + goto fault; + + return 0; +} +EXPORT_SYMBOL(skb_copy_datagram_iter); + +/** * skb_copy_datagram_from_iovec - Copy a datagram from an iovec. * @skb: buffer to copy * @offset: offset in the buffer to start copying to ^ permalink raw reply related [flat|nested] 82+ messages in thread
* [PATCH 2/4] tun: Use iovec iterators 2014-11-07 13:21 ` [PATCH 0/4] Replace skb_copy_datagram_const_iovec with iterator version Herbert Xu 2014-11-07 13:22 ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Herbert Xu @ 2014-11-07 13:22 ` Herbert Xu 2014-11-07 13:22 ` [PATCH 3/4] macvtap: " Herbert Xu 2014-11-07 13:22 ` [PATCH 4/4] net: Kill skb_copy_datagram_const_iovec Herbert Xu 3 siblings, 0 replies; 82+ messages in thread From: Herbert Xu @ 2014-11-07 13:22 UTC (permalink / raw) To: David Miller, viro, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki This patch removes the use of skb_copy_datagram_const_iovec in favour of the iovec iterator-based skb_copy_datagram_iter. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> --- drivers/net/tun.c | 65 ++++++++++++++++++++++++------------------------------ 1 file changed, 30 insertions(+), 35 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 9dd3746..2ff769b 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -71,6 +71,7 @@ #include <net/rtnetlink.h> #include <net/sock.h> #include <linux/seq_file.h> +#include <linux/uio.h> #include <asm/uaccess.h> @@ -1230,11 +1231,11 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv, static ssize_t tun_put_user(struct tun_struct *tun, struct tun_file *tfile, struct sk_buff *skb, - const struct iovec *iv, int len) + struct iov_iter *iter) { struct tun_pi pi = { 0, skb->protocol }; - ssize_t total = 0; - int vlan_offset = 0, copied; + ssize_t total; + int vlan_offset; int vlan_hlen = 0; int vnet_hdr_sz = 0; @@ -1244,23 +1245,25 @@ static ssize_t tun_put_user(struct tun_struct *tun, if (tun->flags & TUN_VNET_HDR) vnet_hdr_sz = tun->vnet_hdr_sz; + total = skb->len + vlan_hlen + vnet_hdr_sz; + if (!(tun->flags & TUN_NO_PI)) { - if ((len -= sizeof(pi)) < 0) + if (iov_iter_count(iter) < sizeof(pi)) return -EINVAL; - if (len < skb->len + vlan_hlen + vnet_hdr_sz) { + total += sizeof(pi); + if (iov_iter_count(iter) < total) { /* Packet 
will be striped */ pi.flags |= TUN_PKT_STRIP; } - if (memcpy_toiovecend(iv, (void *) &pi, 0, sizeof(pi))) + if (copy_to_iter(&pi, sizeof(pi), iter) != sizeof(pi)) return -EFAULT; - total += sizeof(pi); } if (vnet_hdr_sz) { struct virtio_net_hdr gso = { 0 }; /* no info leak */ - if ((len -= vnet_hdr_sz) < 0) + if (iov_iter_count(iter) < vnet_hdr_sz) return -EINVAL; if (skb_is_gso(skb)) { @@ -1299,17 +1302,12 @@ static ssize_t tun_put_user(struct tun_struct *tun, gso.flags = VIRTIO_NET_HDR_F_DATA_VALID; } /* else everything is zero */ - if (unlikely(memcpy_toiovecend(iv, (void *)&gso, total, - sizeof(gso)))) + if (copy_to_iter(&gso, sizeof(gso), iter) != sizeof(gso)) return -EFAULT; - total += vnet_hdr_sz; } - copied = total; - len = min_t(int, skb->len + vlan_hlen, len); - total += skb->len + vlan_hlen; if (vlan_hlen) { - int copy, ret; + int ret; struct { __be16 h_vlan_proto; __be16 h_vlan_TCI; @@ -1320,36 +1318,32 @@ static ssize_t tun_put_user(struct tun_struct *tun, vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto); - copy = min_t(int, vlan_offset, len); - ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy); - len -= copy; - copied += copy; - if (ret || !len) + ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset); + if (ret || !iov_iter_count(iter)) goto done; - copy = min_t(int, sizeof(veth), len); - ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy); - len -= copy; - copied += copy; - if (ret || !len) + ret = copy_to_iter(&veth, sizeof(veth), iter); + if (ret != sizeof(veth) || !iov_iter_count(iter)) goto done; } - skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len); + skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset); done: tun->dev->stats.tx_packets++; - tun->dev->stats.tx_bytes += len; + tun->dev->stats.tx_bytes += skb->len + vlan_hlen; return total; } static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, - const struct iovec *iv, ssize_t len, int noblock) + const struct iovec 
*iv, unsigned long segs, + ssize_t len, int noblock) { struct sk_buff *skb; ssize_t ret = 0; int peeked, err, off = 0; + struct iov_iter iter; tun_debug(KERN_INFO, tun, "tun_do_read\n"); @@ -1362,11 +1356,12 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, /* Read frames from queue */ skb = __skb_recv_datagram(tfile->socket.sk, noblock ? MSG_DONTWAIT : 0, &peeked, &off, &err); - if (skb) { - ret = tun_put_user(tun, tfile, skb, iv, len); - kfree_skb(skb); - } else - ret = err; + if (!skb) + return ret; + + iov_iter_init(&iter, READ, iv, segs, len); + ret = tun_put_user(tun, tfile, skb, &iter); + kfree_skb(skb); return ret; } @@ -1387,7 +1382,7 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv, goto out; } - ret = tun_do_read(tun, tfile, iv, len, + ret = tun_do_read(tun, tfile, iv, count, len, file->f_flags & O_NONBLOCK); ret = min_t(ssize_t, ret, len); if (ret > 0) @@ -1488,7 +1483,7 @@ static int tun_recvmsg(struct kiocb *iocb, struct socket *sock, SOL_PACKET, TUN_TX_TIMESTAMP); goto out; } - ret = tun_do_read(tun, tfile, m->msg_iov, total_len, + ret = tun_do_read(tun, tfile, m->msg_iov, m->msg_iovlen, total_len, flags & MSG_DONTWAIT); if (ret > total_len) { m->msg_flags |= MSG_TRUNC; ^ permalink raw reply related [flat|nested] 82+ messages in thread
* [PATCH 3/4] macvtap: Use iovec iterators 2014-11-07 13:21 ` [PATCH 0/4] Replace skb_copy_datagram_const_iovec with iterator version Herbert Xu 2014-11-07 13:22 ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Herbert Xu 2014-11-07 13:22 ` [PATCH 2/4] tun: Use iovec iterators Herbert Xu @ 2014-11-07 13:22 ` Herbert Xu 2014-11-07 13:22 ` [PATCH 4/4] net: Kill skb_copy_datagram_const_iovec Herbert Xu 3 siblings, 0 replies; 82+ messages in thread From: Herbert Xu @ 2014-11-07 13:22 UTC (permalink / raw) To: David Miller, viro, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki This patch removes the use of skb_copy_datagram_const_iovec in favour of the iovec iterator-based skb_copy_datagram_iter. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> --- drivers/net/macvtap.c | 46 +++++++++++++++++++++------------------------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 880cc09..cea99d4 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -15,6 +15,7 @@ #include <linux/cdev.h> #include <linux/idr.h> #include <linux/fs.h> +#include <linux/uio.h> #include <net/ipv6.h> #include <net/net_namespace.h> @@ -778,31 +779,29 @@ static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv, /* Put packet to the user space buffer */ static ssize_t macvtap_put_user(struct macvtap_queue *q, const struct sk_buff *skb, - const struct iovec *iv, int len) + struct iov_iter *iter) { int ret; int vnet_hdr_len = 0; int vlan_offset = 0; - int copied, total; + int total; if (q->flags & IFF_VNET_HDR) { struct virtio_net_hdr vnet_hdr; vnet_hdr_len = q->vnet_hdr_sz; - if ((len -= vnet_hdr_len) < 0) + if (iov_iter_count(iter) < vnet_hdr_len) return -EINVAL; macvtap_skb_to_vnet_hdr(skb, &vnet_hdr); - if (memcpy_toiovecend(iv, (void *)&vnet_hdr, 0, sizeof(vnet_hdr))) + if (copy_to_iter(&vnet_hdr, sizeof(vnet_hdr), iter) != + sizeof(vnet_hdr)) return -EFAULT; } - total = copied = vnet_hdr_len; + total = 
vnet_hdr_len; total += skb->len; - if (!vlan_tx_tag_present(skb)) - len = min_t(int, skb->len, len); - else { - int copy; + if (vlan_tx_tag_present(skb)) { struct { __be16 h_vlan_proto; __be16 h_vlan_TCI; @@ -811,37 +810,33 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q, veth.h_vlan_TCI = htons(vlan_tx_tag_get(skb)); vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto); - len = min_t(int, skb->len + VLAN_HLEN, len); total += VLAN_HLEN; - copy = min_t(int, vlan_offset, len); - ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy); - len -= copy; - copied += copy; - if (ret || !len) + ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset); + if (ret || !iov_iter_count(iter)) goto done; - copy = min_t(int, sizeof(veth), len); - ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy); - len -= copy; - copied += copy; - if (ret || !len) + ret = copy_to_iter(&veth, sizeof(veth), iter); + if (ret != sizeof(veth) || !iov_iter_count(iter)) goto done; } - ret = skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len); + ret = skb_copy_datagram_iter(skb, vlan_offset, iter, + skb->len - vlan_offset); done: return ret ? 
ret : total; } static ssize_t macvtap_do_read(struct macvtap_queue *q, - const struct iovec *iv, unsigned long len, + const struct iovec *iv, unsigned long segs, + unsigned long len, int noblock) { DEFINE_WAIT(wait); struct sk_buff *skb; ssize_t ret = 0; + struct iov_iter iter; while (len) { if (!noblock) @@ -863,7 +858,8 @@ static ssize_t macvtap_do_read(struct macvtap_queue *q, schedule(); continue; } - ret = macvtap_put_user(q, skb, iv, len); + iov_iter_init(&iter, READ, iv, segs, len); + ret = macvtap_put_user(q, skb, &iter); kfree_skb(skb); break; } @@ -886,7 +882,7 @@ static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv, goto out; } - ret = macvtap_do_read(q, iv, len, file->f_flags & O_NONBLOCK); + ret = macvtap_do_read(q, iv, count, len, file->f_flags & O_NONBLOCK); ret = min_t(ssize_t, ret, len); if (ret > 0) iocb->ki_pos = ret; @@ -1117,7 +1113,7 @@ static int macvtap_recvmsg(struct kiocb *iocb, struct socket *sock, int ret; if (flags & ~(MSG_DONTWAIT|MSG_TRUNC)) return -EINVAL; - ret = macvtap_do_read(q, m->msg_iov, total_len, + ret = macvtap_do_read(q, m->msg_iov, m->msg_iovlen, total_len, flags & MSG_DONTWAIT); if (ret > total_len) { m->msg_flags |= MSG_TRUNC; ^ permalink raw reply related [flat|nested] 82+ messages in thread
* [PATCH 4/4] net: Kill skb_copy_datagram_const_iovec 2014-11-07 13:21 ` [PATCH 0/4] Replace skb_copy_datagram_const_iovec with iterator version Herbert Xu ` (2 preceding siblings ...) 2014-11-07 13:22 ` [PATCH 3/4] macvtap: " Herbert Xu @ 2014-11-07 13:22 ` Herbert Xu 3 siblings, 0 replies; 82+ messages in thread From: Herbert Xu @ 2014-11-07 13:22 UTC (permalink / raw) To: David Miller, viro, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki Now that both macvtap and tun are using skb_copy_datagram_iter, we can kill the abomination that is skb_copy_datagram_const_iovec. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> --- include/linux/skbuff.h | 3 - net/core/datagram.c | 89 ------------------------------------------------- 2 files changed, 92 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 933cfce..103fbe8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2651,9 +2651,6 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, int len); int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *frm, int offset, size_t count); -int skb_copy_datagram_const_iovec(const struct sk_buff *from, int offset, - const struct iovec *to, int to_offset, - int size); int skb_copy_datagram_iter(const struct sk_buff *from, int offset, struct iov_iter *to, int size); void skb_free_datagram(struct sock *sk, struct sk_buff *skb); diff --git a/net/core/datagram.c b/net/core/datagram.c index 84d90d0..26391a3 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -394,95 +394,6 @@ fault: EXPORT_SYMBOL(skb_copy_datagram_iovec); /** - * skb_copy_datagram_const_iovec - Copy a datagram to an iovec. - * @skb: buffer to copy - * @offset: offset in the buffer to start copying from - * @to: io vector to copy to - * @to_offset: offset in the io vector to start copying to - * @len: amount of data to copy from buffer to iovec - * - * Returns 0 or -EFAULT. - * Note: the iovec is not modified during the copy. 
- */ -int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset, - const struct iovec *to, int to_offset, - int len) -{ - int start = skb_headlen(skb); - int i, copy = start - offset; - struct sk_buff *frag_iter; - - /* Copy header. */ - if (copy > 0) { - if (copy > len) - copy = len; - if (memcpy_toiovecend(to, skb->data + offset, to_offset, copy)) - goto fault; - if ((len -= copy) == 0) - return 0; - offset += copy; - to_offset += copy; - } - - /* Copy paged appendix. Hmm... why does this look so complicated? */ - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int end; - const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - WARN_ON(start > offset + len); - - end = start + skb_frag_size(frag); - if ((copy = end - offset) > 0) { - int err; - u8 *vaddr; - struct page *page = skb_frag_page(frag); - - if (copy > len) - copy = len; - vaddr = kmap(page); - err = memcpy_toiovecend(to, vaddr + frag->page_offset + - offset - start, to_offset, copy); - kunmap(page); - if (err) - goto fault; - if (!(len -= copy)) - return 0; - offset += copy; - to_offset += copy; - } - start = end; - } - - skb_walk_frags(skb, frag_iter) { - int end; - - WARN_ON(start > offset + len); - - end = start + frag_iter->len; - if ((copy = end - offset) > 0) { - if (copy > len) - copy = len; - if (skb_copy_datagram_const_iovec(frag_iter, - offset - start, - to, to_offset, - copy)) - goto fault; - if ((len -= copy) == 0) - return 0; - offset += copy; - to_offset += copy; - } - start = end; - } - if (!len) - return 0; - -fault: - return -EFAULT; -} -EXPORT_SYMBOL(skb_copy_datagram_const_iovec); - -/** * skb_copy_datagram_iter - Copy a datagram to an iovec iterator. * @skb: buffer to copy * @offset: offset in the buffer to start copying from ^ permalink raw reply related [flat|nested] 82+ messages in thread
* [PATCH 0/4] Replace skb_copy_datagram_const_iovec with iterator version 2014-11-05 20:24 ` David Miller 2014-11-06 8:23 ` Herbert Xu @ 2014-11-06 8:27 ` Herbert Xu 2014-11-06 8:28 ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Herbert Xu ` (3 more replies) 1 sibling, 4 replies; 82+ messages in thread From: Herbert Xu @ 2014-11-06 8:27 UTC (permalink / raw) To: David Miller; +Cc: viro, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki Hi Dave: This patch series adds the helper skb_copy_datagram_iter, which is meant to replace both skb_copy_datagram_iovec and its evil twin skb_copy_datagram_const_iovec. It then converts tun and macvtap over to the new helper and finally removes skb_copy_datagram_const_iovec which is only used by tun and macvtap. Cheers, -- Email: Herbert Xu <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 82+ messages in thread
* [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-06 8:27 ` [PATCH 0/4] Replace skb_copy_datagram_const_iovec with iterator version Herbert Xu @ 2014-11-06 8:28 ` Herbert Xu 2014-11-06 17:30 ` Al Viro 2014-11-06 8:28 ` [PATCH 2/4] tun: Use iovec iterators Herbert Xu ` (2 subsequent siblings) 3 siblings, 1 reply; 82+ messages in thread From: Herbert Xu @ 2014-11-06 8:28 UTC (permalink / raw) To: David Miller, viro, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki This patch adds skb_copy_datagram_iter, which is identical to skb_copy_datagram_iovec except that it operates on iov_iter instead of iovec. Eventually all users of skb_copy_datagram_iovec should switch over to iov_iter and then we can remove skb_copy_datagram_iovec. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> --- include/linux/skbuff.h | 3 + net/core/datagram.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 85 insertions(+) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 39ec753..a405013 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -150,6 +150,7 @@ struct net_device; struct scatterlist; struct pipe_inode_info; +struct iov_iter; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct nf_conntrack { @@ -2655,6 +2656,8 @@ int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *frm, int skb_copy_datagram_const_iovec(const struct sk_buff *from, int offset, const struct iovec *to, int to_offset, int size); +int skb_copy_datagram_iter(const struct sk_buff *from, int offset, + struct iov_iter *to, int size); void skb_free_datagram(struct sock *sk, struct sk_buff *skb); void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb); int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags); diff --git a/net/core/datagram.c b/net/core/datagram.c index fdbc9a8..45a9d4d 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -49,6 +49,7 @@ #include <linux/spinlock.h> 
#include <linux/slab.h> #include <linux/pagemap.h> +#include <linux/uio.h> #include <net/protocol.h> #include <linux/skbuff.h> @@ -482,6 +483,87 @@ fault: EXPORT_SYMBOL(skb_copy_datagram_const_iovec); /** + * skb_copy_datagram_iter - Copy a datagram to an iovec iterator. + * @skb: buffer to copy + * @offset: offset in the buffer to start copying from + * @to: iovec iterator to copy to + * @len: amount of data to copy from buffer to iovec + */ +int skb_copy_datagram_iter(const struct sk_buff *skb, int offset, + struct iov_iter *to, int len) +{ + int start = skb_headlen(skb); + int i, copy = start - offset; + struct sk_buff *frag_iter; + + trace_skb_copy_datagram_iovec(skb, len); + + /* Copy header. */ + if (copy > 0) { + if (copy > len) + copy = len; + if (copy_to_iter(skb->data + offset, copy, to)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + } + + /* Copy paged appendix. Hmm... why does this look so complicated? */ + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { + int end; + const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + + WARN_ON(start > offset + len); + + end = start + skb_frag_size(frag); + if ((copy = end - offset) > 0) { + int err; + u8 *vaddr; + struct page *page = skb_frag_page(frag); + + if (copy > len) + copy = len; + vaddr = kmap(page); + err = copy_to_iter(vaddr + frag->page_offset + + offset - start, copy, to); + kunmap(page); + if (err) + goto fault; + if (!(len -= copy)) + return 0; + offset += copy; + } + start = end; + } + + skb_walk_frags(skb, frag_iter) { + int end; + + WARN_ON(start > offset + len); + + end = start + frag_iter->len; + if ((copy = end - offset) > 0) { + if (copy > len) + copy = len; + if (skb_copy_datagram_iter(frag_iter, offset - start, + to, copy)) + goto fault; + if ((len -= copy) == 0) + return 0; + offset += copy; + } + start = end; + } + if (!len) + return 0; + +fault: + return -EFAULT; +} +EXPORT_SYMBOL(skb_copy_datagram_iter); + +/** * skb_copy_datagram_from_iovec - Copy a datagram 
from an iovec. * @skb: buffer to copy * @offset: offset in the buffer to start copying to ^ permalink raw reply related [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-06 8:28 ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Herbert Xu @ 2014-11-06 17:30 ` Al Viro 2014-11-07 1:58 ` Herbert Xu 0 siblings, 1 reply; 82+ messages in thread From: Al Viro @ 2014-11-06 17:30 UTC (permalink / raw) To: Herbert Xu; +Cc: David Miller, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki On Thu, Nov 06, 2014 at 04:28:18PM +0800, Herbert Xu wrote: > + if (copy_to_iter(skb->data + offset, copy, to)) > + goto fault; Sorry, no - copy_to_iter() returns the number of bytes copied, not 0 or -EFAULT. > + vaddr = kmap(page); > + err = copy_to_iter(vaddr + frag->page_offset + > + offset - start, copy, to); > + kunmap(page); > + if (err) > + goto fault; And that one should be copied = copy_page_to_iter(page, frag->page_offset + offset - start, copy, to); if (copied != copy) goto fault; Don't bother with kmap(), vaddr and all that shite. The primitive is copy_page_to_iter(page, offset_in_page, nbytes, iter); it does all the needed kmap itself and it's smart enough to use kmap_atomic when it can get away with that. Similar for copy_page_from_iter(). Both of those (as well as copy_{to,from}_iter()) advance iov_iter and return the number of bytes actually copied. So the check for EFAULT is "it has copied less than you've asked it to copy *and* you haven't run out of that iov_iter". The second part is guaranteed to be true in this case - your code makes sure that 'copy' is no more than the space left in the iterator. In general, this check would be spelled if (copied != copy && iov_iter_count(to)) goto fault; ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter 2014-11-06 17:30 ` Al Viro @ 2014-11-07 1:58 ` Herbert Xu 0 siblings, 0 replies; 82+ messages in thread From: Herbert Xu @ 2014-11-07 1:58 UTC (permalink / raw) To: Al Viro; +Cc: David Miller, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki On Thu, Nov 06, 2014 at 05:30:12PM +0000, Al Viro wrote: > On Thu, Nov 06, 2014 at 04:28:18PM +0800, Herbert Xu wrote: > > + if (copy_to_iter(skb->data + offset, copy, to)) > > + goto fault; > > Sorry, no - copy_to_iter() returns the number of bytes copied, not 0 or -EFAULT. > > > + vaddr = kmap(page); > > + err = copy_to_iter(vaddr + frag->page_offset + > > + offset - start, copy, to); > > + kunmap(page); > > + if (err) > > + goto fault; > > And that one should be > copied = copy_page_to_iter(page, frag->page_offset + > offset - start, copy, to); > if (copied != copy) > goto fault; > > Don't bother with kmap(), vaddr and all that shite. The primitive is > copy_page_to_iter(page, offset_in_page, nbytes, iter) > it does all needed kmap itself and it's smart enough to use kmap_atomic > when it can get away with that. Similar for copy_page_from_iter(). > > Both of those (as well as copy_{to,from}_iter()) advance iov_iter and return > the number of bytes actually copied. So the check for EFAULT is "it has copied > less than you've asked it to copy *and* you haven't run out that iov_iter". > The second part is guaranteed to be true in this case - your code makes sure > that 'copy' is no more than the space left in iterator. > > In general, this check would be spelled > if (copied != copy && iov_iter_count(to)) > goto fault; Thanks, I'll redo the patches. -- Email: Herbert Xu <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply [flat|nested] 82+ messages in thread
* [PATCH 2/4] tun: Use iovec iterators 2014-11-06 8:27 ` [PATCH 0/4] Replace skb_copy_datagram_const_iovec with iterator version Herbert Xu 2014-11-06 8:28 ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Herbert Xu @ 2014-11-06 8:28 ` Herbert Xu 2014-11-06 8:28 ` [PATCH 3/4] macvtap: " Herbert Xu 2014-11-06 8:28 ` [PATCH 4/4] net: Kill skb_copy_datagram_const_iovec Herbert Xu 3 siblings, 0 replies; 82+ messages in thread From: Herbert Xu @ 2014-11-06 8:28 UTC (permalink / raw) To: David Miller, viro, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki This patch removes the use of skb_copy_datagram_const_iovec in favour of the iovec iterator-based skb_copy_datagram_iter. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> --- drivers/net/tun.c | 65 ++++++++++++++++++++++++------------------------------ 1 file changed, 30 insertions(+), 35 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 9dd3746..b4ac4d5 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -71,6 +71,7 @@ #include <net/rtnetlink.h> #include <net/sock.h> #include <linux/seq_file.h> +#include <linux/uio.h> #include <asm/uaccess.h> @@ -1230,11 +1231,11 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv, static ssize_t tun_put_user(struct tun_struct *tun, struct tun_file *tfile, struct sk_buff *skb, - const struct iovec *iv, int len) + struct iov_iter *iter) { struct tun_pi pi = { 0, skb->protocol }; - ssize_t total = 0; - int vlan_offset = 0, copied; + ssize_t total; + int vlan_offset; int vlan_hlen = 0; int vnet_hdr_sz = 0; @@ -1244,23 +1245,25 @@ static ssize_t tun_put_user(struct tun_struct *tun, if (tun->flags & TUN_VNET_HDR) vnet_hdr_sz = tun->vnet_hdr_sz; + total = skb->len + vlan_hlen + vnet_hdr_sz; + if (!(tun->flags & TUN_NO_PI)) { - if ((len -= sizeof(pi)) < 0) + if (iov_iter_count(iter) < sizeof(pi)) return -EINVAL; - if (len < skb->len + vlan_hlen + vnet_hdr_sz) { + total += sizeof(pi); + if (iov_iter_count(iter) < total) { /* Packet will be 
striped */ pi.flags |= TUN_PKT_STRIP; } - if (memcpy_toiovecend(iv, (void *) &pi, 0, sizeof(pi))) + if (copy_to_iter(&pi, sizeof(pi), iter)) return -EFAULT; - total += sizeof(pi); } if (vnet_hdr_sz) { struct virtio_net_hdr gso = { 0 }; /* no info leak */ - if ((len -= vnet_hdr_sz) < 0) + if (iov_iter_count(iter) < vnet_hdr_sz) return -EINVAL; if (skb_is_gso(skb)) { @@ -1299,17 +1302,12 @@ static ssize_t tun_put_user(struct tun_struct *tun, gso.flags = VIRTIO_NET_HDR_F_DATA_VALID; } /* else everything is zero */ - if (unlikely(memcpy_toiovecend(iv, (void *)&gso, total, - sizeof(gso)))) + if (copy_to_iter(&gso, sizeof(gso), iter)) return -EFAULT; - total += vnet_hdr_sz; } - copied = total; - len = min_t(int, skb->len + vlan_hlen, len); - total += skb->len + vlan_hlen; if (vlan_hlen) { - int copy, ret; + int ret; struct { __be16 h_vlan_proto; __be16 h_vlan_TCI; @@ -1320,36 +1318,32 @@ static ssize_t tun_put_user(struct tun_struct *tun, vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto); - copy = min_t(int, vlan_offset, len); - ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy); - len -= copy; - copied += copy; - if (ret || !len) + ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset); + if (ret || !iov_iter_count(iter)) goto done; - copy = min_t(int, sizeof(veth), len); - ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy); - len -= copy; - copied += copy; - if (ret || !len) + ret = copy_to_iter(&veth, sizeof(veth), iter); + if (ret || !iov_iter_count(iter)) goto done; } - skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len); + skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset); done: tun->dev->stats.tx_packets++; - tun->dev->stats.tx_bytes += len; + tun->dev->stats.tx_bytes += skb->len + vlan_hlen; return total; } static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, - const struct iovec *iv, ssize_t len, int noblock) + const struct iovec *iv, unsigned long segs, + ssize_t len, int noblock) 
{ struct sk_buff *skb; ssize_t ret = 0; int peeked, err, off = 0; + struct iov_iter iter; tun_debug(KERN_INFO, tun, "tun_do_read\n"); @@ -1362,11 +1356,12 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, /* Read frames from queue */ skb = __skb_recv_datagram(tfile->socket.sk, noblock ? MSG_DONTWAIT : 0, &peeked, &off, &err); - if (skb) { - ret = tun_put_user(tun, tfile, skb, iv, len); - kfree_skb(skb); - } else - ret = err; + if (!skb) + return ret; + + iov_iter_init(&iter, READ, iv, segs, len); + ret = tun_put_user(tun, tfile, skb, &iter); + kfree_skb(skb); return ret; } @@ -1387,7 +1382,7 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv, goto out; } - ret = tun_do_read(tun, tfile, iv, len, + ret = tun_do_read(tun, tfile, iv, count, len, file->f_flags & O_NONBLOCK); ret = min_t(ssize_t, ret, len); if (ret > 0) @@ -1488,7 +1483,7 @@ static int tun_recvmsg(struct kiocb *iocb, struct socket *sock, SOL_PACKET, TUN_TX_TIMESTAMP); goto out; } - ret = tun_do_read(tun, tfile, m->msg_iov, total_len, + ret = tun_do_read(tun, tfile, m->msg_iov, m->msg_iovlen, total_len, flags & MSG_DONTWAIT); if (ret > total_len) { m->msg_flags |= MSG_TRUNC; ^ permalink raw reply related [flat|nested] 82+ messages in thread
* [PATCH 3/4] macvtap: Use iovec iterators 2014-11-06 8:27 ` [PATCH 0/4] Replace skb_copy_datagram_const_iovec with iterator version Herbert Xu 2014-11-06 8:28 ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Herbert Xu 2014-11-06 8:28 ` [PATCH 2/4] tun: Use iovec iterators Herbert Xu @ 2014-11-06 8:28 ` Herbert Xu 2014-11-06 17:33 ` Al Viro 2014-11-06 8:28 ` [PATCH 4/4] net: Kill skb_copy_datagram_const_iovec Herbert Xu 3 siblings, 1 reply; 82+ messages in thread From: Herbert Xu @ 2014-11-06 8:28 UTC (permalink / raw) To: David Miller, viro, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki This patch removes the use of skb_copy_datagram_const_iovec in favour of the iovec iterator-based skb_copy_datagram_iter. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> --- drivers/net/macvtap.c | 45 ++++++++++++++++++++------------------------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 880cc09..a0e1dd7 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -15,6 +15,7 @@ #include <linux/cdev.h> #include <linux/idr.h> #include <linux/fs.h> +#include <linux/uio.h> #include <net/ipv6.h> #include <net/net_namespace.h> @@ -778,31 +779,28 @@ static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv, /* Put packet to the user space buffer */ static ssize_t macvtap_put_user(struct macvtap_queue *q, const struct sk_buff *skb, - const struct iovec *iv, int len) + struct iov_iter *iter) { int ret; int vnet_hdr_len = 0; int vlan_offset = 0; - int copied, total; + int total; if (q->flags & IFF_VNET_HDR) { struct virtio_net_hdr vnet_hdr; vnet_hdr_len = q->vnet_hdr_sz; - if ((len -= vnet_hdr_len) < 0) + if (iov_iter_count(iter) < vnet_hdr_len) return -EINVAL; macvtap_skb_to_vnet_hdr(skb, &vnet_hdr); - if (memcpy_toiovecend(iv, (void *)&vnet_hdr, 0, sizeof(vnet_hdr))) + if (copy_to_iter(&vnet_hdr, sizeof(vnet_hdr), iter)) return -EFAULT; } - total = copied = vnet_hdr_len; + total = 
vnet_hdr_len; total += skb->len; - if (!vlan_tx_tag_present(skb)) - len = min_t(int, skb->len, len); - else { - int copy; + if (vlan_tx_tag_present(skb)) { struct { __be16 h_vlan_proto; __be16 h_vlan_TCI; @@ -811,37 +809,33 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q, veth.h_vlan_TCI = htons(vlan_tx_tag_get(skb)); vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto); - len = min_t(int, skb->len + VLAN_HLEN, len); total += VLAN_HLEN; - copy = min_t(int, vlan_offset, len); - ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy); - len -= copy; - copied += copy; - if (ret || !len) + ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset); + if (ret || !iov_iter_count(iter)) goto done; - copy = min_t(int, sizeof(veth), len); - ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy); - len -= copy; - copied += copy; - if (ret || !len) + ret = copy_to_iter(&veth, sizeof(veth), iter); + if (ret || !iov_iter_count(iter)) goto done; } - ret = skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len); + ret = skb_copy_datagram_iter(skb, vlan_offset, iter, + skb->len - vlan_offset); done: return ret ? 
ret : total; } static ssize_t macvtap_do_read(struct macvtap_queue *q, - const struct iovec *iv, unsigned long len, + const struct iovec *iv, unsigned long segs, + unsigned long len, int noblock) { DEFINE_WAIT(wait); struct sk_buff *skb; ssize_t ret = 0; + struct iov_iter iter; while (len) { if (!noblock) @@ -863,7 +857,8 @@ static ssize_t macvtap_do_read(struct macvtap_queue *q, schedule(); continue; } - ret = macvtap_put_user(q, skb, iv, len); + iov_iter_init(&iter, READ, iv, segs, len); + ret = macvtap_put_user(q, skb, &iter); kfree_skb(skb); break; } @@ -886,7 +881,7 @@ static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv, goto out; } - ret = macvtap_do_read(q, iv, len, file->f_flags & O_NONBLOCK); + ret = macvtap_do_read(q, iv, count, len, file->f_flags & O_NONBLOCK); ret = min_t(ssize_t, ret, len); if (ret > 0) iocb->ki_pos = ret; @@ -1117,7 +1112,7 @@ static int macvtap_recvmsg(struct kiocb *iocb, struct socket *sock, int ret; if (flags & ~(MSG_DONTWAIT|MSG_TRUNC)) return -EINVAL; - ret = macvtap_do_read(q, m->msg_iov, total_len, + ret = macvtap_do_read(q, m->msg_iov, m->msg_iovlen, total_len, flags & MSG_DONTWAIT); if (ret > total_len) { m->msg_flags |= MSG_TRUNC; ^ permalink raw reply related [flat|nested] 82+ messages in thread
* Re: [PATCH 3/4] macvtap: Use iovec iterators 2014-11-06 8:28 ` [PATCH 3/4] macvtap: " Herbert Xu @ 2014-11-06 17:33 ` Al Viro 0 siblings, 0 replies; 82+ messages in thread From: Al Viro @ 2014-11-06 17:33 UTC (permalink / raw) To: Herbert Xu; +Cc: David Miller, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki On Thu, Nov 06, 2014 at 04:28:20PM +0800, Herbert Xu wrote: > + if (copy_to_iter(&vnet_hdr, sizeof(vnet_hdr), iter)) > return -EFAULT; Again, wrong calling conventions. It returns how much it has copied. > + ret = copy_to_iter(&veth, sizeof(veth), iter); > + if (ret || !iov_iter_count(iter)) > goto done; Ditto. ^ permalink raw reply [flat|nested] 82+ messages in thread
* [PATCH 4/4] net: Kill skb_copy_datagram_const_iovec 2014-11-06 8:27 ` [PATCH 0/4] Replace skb_copy_datagram_const_iovec with iterator version Herbert Xu ` (2 preceding siblings ...) 2014-11-06 8:28 ` [PATCH 3/4] macvtap: " Herbert Xu @ 2014-11-06 8:28 ` Herbert Xu 3 siblings, 0 replies; 82+ messages in thread From: Herbert Xu @ 2014-11-06 8:28 UTC (permalink / raw) To: David Miller, viro, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki Now that both macvtap and tun are using skb_copy_datagram_iter, we can kill the abomination that is skb_copy_datagram_const_iovec. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> --- include/linux/skbuff.h | 3 - net/core/datagram.c | 89 ------------------------------------------------- 2 files changed, 92 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index a405013..da59580 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2653,9 +2653,6 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, int len); int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *frm, int offset, size_t count); -int skb_copy_datagram_const_iovec(const struct sk_buff *from, int offset, - const struct iovec *to, int to_offset, - int size); int skb_copy_datagram_iter(const struct sk_buff *from, int offset, struct iov_iter *to, int size); void skb_free_datagram(struct sock *sk, struct sk_buff *skb); diff --git a/net/core/datagram.c b/net/core/datagram.c index 45a9d4d..93054b9 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -394,95 +394,6 @@ fault: EXPORT_SYMBOL(skb_copy_datagram_iovec); /** - * skb_copy_datagram_const_iovec - Copy a datagram to an iovec. - * @skb: buffer to copy - * @offset: offset in the buffer to start copying from - * @to: io vector to copy to - * @to_offset: offset in the io vector to start copying to - * @len: amount of data to copy from buffer to iovec - * - * Returns 0 or -EFAULT. - * Note: the iovec is not modified during the copy. 
- */ -int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset, - const struct iovec *to, int to_offset, - int len) -{ - int start = skb_headlen(skb); - int i, copy = start - offset; - struct sk_buff *frag_iter; - - /* Copy header. */ - if (copy > 0) { - if (copy > len) - copy = len; - if (memcpy_toiovecend(to, skb->data + offset, to_offset, copy)) - goto fault; - if ((len -= copy) == 0) - return 0; - offset += copy; - to_offset += copy; - } - - /* Copy paged appendix. Hmm... why does this look so complicated? */ - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int end; - const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - WARN_ON(start > offset + len); - - end = start + skb_frag_size(frag); - if ((copy = end - offset) > 0) { - int err; - u8 *vaddr; - struct page *page = skb_frag_page(frag); - - if (copy > len) - copy = len; - vaddr = kmap(page); - err = memcpy_toiovecend(to, vaddr + frag->page_offset + - offset - start, to_offset, copy); - kunmap(page); - if (err) - goto fault; - if (!(len -= copy)) - return 0; - offset += copy; - to_offset += copy; - } - start = end; - } - - skb_walk_frags(skb, frag_iter) { - int end; - - WARN_ON(start > offset + len); - - end = start + frag_iter->len; - if ((copy = end - offset) > 0) { - if (copy > len) - copy = len; - if (skb_copy_datagram_const_iovec(frag_iter, - offset - start, - to, to_offset, - copy)) - goto fault; - if ((len -= copy) == 0) - return 0; - offset += copy; - to_offset += copy; - } - start = end; - } - if (!len) - return 0; - -fault: - return -EFAULT; -} -EXPORT_SYMBOL(skb_copy_datagram_const_iovec); - -/** * skb_copy_datagram_iter - Copy a datagram to an iovec iterator. * @skb: buffer to copy * @offset: offset in the buffer to start copying from ^ permalink raw reply related [flat|nested] 82+ messages in thread
* [PATCH 2/4] tun: Use iovec iterators 2014-11-04 3:38 ` Herbert Xu 2014-11-04 8:31 ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Herbert Xu @ 2014-11-04 8:31 ` Herbert Xu 2014-11-04 8:37 ` Herbert Xu 2014-11-04 8:31 ` [PATCH 3/4] macvtap: " Herbert Xu 2014-11-04 8:31 ` [PATCH 4/4] net: Kill skb_copy_datagram_const_iovec Herbert Xu 3 siblings, 1 reply; 82+ messages in thread From: Herbert Xu @ 2014-11-04 8:31 UTC (permalink / raw) To: Al Viro, David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise This patch removes the use of skb_copy_datagram_const_iovec in favour of the iovec iterator-based skb_copy_datagram_iter. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> --- drivers/net/tun.c | 65 +++++++++++++++++++++++++----------------------------- 1 file changed, 31 insertions(+), 34 deletions(-) diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 9dd3746..cfb81ca 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -71,6 +71,7 @@ #include <net/rtnetlink.h> #include <net/sock.h> #include <linux/seq_file.h> +#include <linux/uio.h> #include <asm/uaccess.h> @@ -1230,11 +1231,11 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv, static ssize_t tun_put_user(struct tun_struct *tun, struct tun_file *tfile, struct sk_buff *skb, - const struct iovec *iv, int len) + struct iov_iter *iter) { struct tun_pi pi = { 0, skb->protocol }; - ssize_t total = 0; - int vlan_offset = 0, copied; + ssize_t total; + int vlan_offset; int vlan_hlen = 0; int vnet_hdr_sz = 0; @@ -1244,23 +1245,25 @@ static ssize_t tun_put_user(struct tun_struct *tun, if (tun->flags & TUN_VNET_HDR) vnet_hdr_sz = tun->vnet_hdr_sz; + total = skb->len + vlan_hlen + vnet_hdr_sz; + if (!(tun->flags & TUN_NO_PI)) { - if ((len -= sizeof(pi)) < 0) + if (iov_iter_count(iter) < sizeof(pi)) return -EINVAL; - if (len < skb->len + vlan_hlen + vnet_hdr_sz) { + if (iov_iter_count(iter) < total) { /* Packet will be striped */ pi.flags |= TUN_PKT_STRIP; } - if 
(memcpy_toiovecend(iv, (void *) &pi, 0, sizeof(pi))) + if (copy_to_iter(&pi, sizeof(pi), iter)) return -EFAULT; total += sizeof(pi); } if (vnet_hdr_sz) { struct virtio_net_hdr gso = { 0 }; /* no info leak */ - if ((len -= vnet_hdr_sz) < 0) + if (iov_iter_count(iter) < vnet_hdr_sz) return -EINVAL; if (skb_is_gso(skb)) { @@ -1299,17 +1302,12 @@ static ssize_t tun_put_user(struct tun_struct *tun, gso.flags = VIRTIO_NET_HDR_F_DATA_VALID; } /* else everything is zero */ - if (unlikely(memcpy_toiovecend(iv, (void *)&gso, total, - sizeof(gso)))) + if (copy_to_iter(&gso, sizeof(gso), iter)) return -EFAULT; - total += vnet_hdr_sz; } - copied = total; - len = min_t(int, skb->len + vlan_hlen, len); - total += skb->len + vlan_hlen; if (vlan_hlen) { - int copy, ret; + int ret; struct { __be16 h_vlan_proto; __be16 h_vlan_TCI; @@ -1320,36 +1318,34 @@ static ssize_t tun_put_user(struct tun_struct *tun, vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto); - copy = min_t(int, vlan_offset, len); - ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy); - len -= copy; - copied += copy; - if (ret || !len) + ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset); + if (ret || !iov_iter_count(iter)) goto done; - copy = min_t(int, sizeof(veth), len); - ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy); - len -= copy; - copied += copy; - if (ret || !len) + ret = copy_to_iter(&veth, sizeof(veth), iter); + if (ret || !iov_iter_count(iter)) goto done; + + __skb_pull(skb, vlan_offset); } - skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len); + skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset); done: tun->dev->stats.tx_packets++; - tun->dev->stats.tx_bytes += len; + tun->dev->stats.tx_bytes += skb->len + vlan_hlen; return total; } static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, - const struct iovec *iv, ssize_t len, int noblock) + const struct iovec *iv, unsigned long segs, + ssize_t len, int noblock) { struct 
sk_buff *skb; ssize_t ret = 0; int peeked, err, off = 0; + struct iov_iter iter; tun_debug(KERN_INFO, tun, "tun_do_read\n"); @@ -1362,11 +1358,12 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, /* Read frames from queue */ skb = __skb_recv_datagram(tfile->socket.sk, noblock ? MSG_DONTWAIT : 0, &peeked, &off, &err); - if (skb) { - ret = tun_put_user(tun, tfile, skb, iv, len); - kfree_skb(skb); - } else - ret = err; + if (!skb) + return ret; + + iov_iter_init(&iter, READ, iv, segs, len); + ret = tun_put_user(tun, tfile, skb, &iter); + kfree_skb(skb); return ret; } @@ -1387,7 +1384,7 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv, goto out; } - ret = tun_do_read(tun, tfile, iv, len, + ret = tun_do_read(tun, tfile, iv, count, len, file->f_flags & O_NONBLOCK); ret = min_t(ssize_t, ret, len); if (ret > 0) @@ -1488,7 +1485,7 @@ static int tun_recvmsg(struct kiocb *iocb, struct socket *sock, SOL_PACKET, TUN_TX_TIMESTAMP); goto out; } - ret = tun_do_read(tun, tfile, m->msg_iov, total_len, + ret = tun_do_read(tun, tfile, m->msg_iov, m->msg_iovlen, total_len, flags & MSG_DONTWAIT); if (ret > total_len) { m->msg_flags |= MSG_TRUNC; ^ permalink raw reply related [flat|nested] 82+ messages in thread
* Re: [PATCH 2/4] tun: Use iovec iterators 2014-11-04 8:31 ` [PATCH 2/4] tun: Use iovec iterators Herbert Xu @ 2014-11-04 8:37 ` Herbert Xu 2014-11-05 2:49 ` YOSHIFUJI Hideaki 0 siblings, 1 reply; 82+ messages in thread From: Herbert Xu @ 2014-11-04 8:37 UTC (permalink / raw) To: Al Viro, David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise Oops, this patch had a left-over skb_pull which made it broken. Here is a fixed version. tun: Use iovec iterators This patch removes the use of skb_copy_datagram_const_iovec in favour of the iovec iterator-based skb_copy_datagram_iter. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 9dd3746..ff955cdb 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -71,6 +71,7 @@ #include <net/rtnetlink.h> #include <net/sock.h> #include <linux/seq_file.h> +#include <linux/uio.h> #include <asm/uaccess.h> @@ -1230,11 +1231,11 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv, static ssize_t tun_put_user(struct tun_struct *tun, struct tun_file *tfile, struct sk_buff *skb, - const struct iovec *iv, int len) + struct iov_iter *iter) { struct tun_pi pi = { 0, skb->protocol }; - ssize_t total = 0; - int vlan_offset = 0, copied; + ssize_t total; + int vlan_offset; int vlan_hlen = 0; int vnet_hdr_sz = 0; @@ -1244,23 +1245,25 @@ static ssize_t tun_put_user(struct tun_struct *tun, if (tun->flags & TUN_VNET_HDR) vnet_hdr_sz = tun->vnet_hdr_sz; + total = skb->len + vlan_hlen + vnet_hdr_sz; + if (!(tun->flags & TUN_NO_PI)) { - if ((len -= sizeof(pi)) < 0) + if (iov_iter_count(iter) < sizeof(pi)) return -EINVAL; - if (len < skb->len + vlan_hlen + vnet_hdr_sz) { + if (iov_iter_count(iter) < total) { /* Packet will be striped */ pi.flags |= TUN_PKT_STRIP; } - if (memcpy_toiovecend(iv, (void *) &pi, 0, sizeof(pi))) + if (copy_to_iter(&pi, sizeof(pi), iter)) return -EFAULT; total += sizeof(pi); } if (vnet_hdr_sz) { struct virtio_net_hdr gso = { 0 
}; /* no info leak */ - if ((len -= vnet_hdr_sz) < 0) + if (iov_iter_count(iter) < vnet_hdr_sz) return -EINVAL; if (skb_is_gso(skb)) { @@ -1299,17 +1302,12 @@ static ssize_t tun_put_user(struct tun_struct *tun, gso.flags = VIRTIO_NET_HDR_F_DATA_VALID; } /* else everything is zero */ - if (unlikely(memcpy_toiovecend(iv, (void *)&gso, total, - sizeof(gso)))) + if (copy_to_iter(&gso, sizeof(gso), iter)) return -EFAULT; - total += vnet_hdr_sz; } - copied = total; - len = min_t(int, skb->len + vlan_hlen, len); - total += skb->len + vlan_hlen; if (vlan_hlen) { - int copy, ret; + int ret; struct { __be16 h_vlan_proto; __be16 h_vlan_TCI; @@ -1320,36 +1318,32 @@ static ssize_t tun_put_user(struct tun_struct *tun, vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto); - copy = min_t(int, vlan_offset, len); - ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy); - len -= copy; - copied += copy; - if (ret || !len) + ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset); + if (ret || !iov_iter_count(iter)) goto done; - copy = min_t(int, sizeof(veth), len); - ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy); - len -= copy; - copied += copy; - if (ret || !len) + ret = copy_to_iter(&veth, sizeof(veth), iter); + if (ret || !iov_iter_count(iter)) goto done; } - skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len); + skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset); done: tun->dev->stats.tx_packets++; - tun->dev->stats.tx_bytes += len; + tun->dev->stats.tx_bytes += skb->len + vlan_hlen; return total; } static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, - const struct iovec *iv, ssize_t len, int noblock) + const struct iovec *iv, unsigned long segs, + ssize_t len, int noblock) { struct sk_buff *skb; ssize_t ret = 0; int peeked, err, off = 0; + struct iov_iter iter; tun_debug(KERN_INFO, tun, "tun_do_read\n"); @@ -1362,11 +1356,12 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, 
/* Read frames from queue */ skb = __skb_recv_datagram(tfile->socket.sk, noblock ? MSG_DONTWAIT : 0, &peeked, &off, &err); - if (skb) { - ret = tun_put_user(tun, tfile, skb, iv, len); - kfree_skb(skb); - } else - ret = err; + if (!skb) + return ret; + + iov_iter_init(&iter, READ, iv, segs, len); + ret = tun_put_user(tun, tfile, skb, &iter); + kfree_skb(skb); return ret; } @@ -1387,7 +1382,7 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv, goto out; } - ret = tun_do_read(tun, tfile, iv, len, + ret = tun_do_read(tun, tfile, iv, count, len, file->f_flags & O_NONBLOCK); ret = min_t(ssize_t, ret, len); if (ret > 0) @@ -1488,7 +1483,7 @@ static int tun_recvmsg(struct kiocb *iocb, struct socket *sock, SOL_PACKET, TUN_TX_TIMESTAMP); goto out; } - ret = tun_do_read(tun, tfile, m->msg_iov, total_len, + ret = tun_do_read(tun, tfile, m->msg_iov, m->msg_iovlen, total_len, flags & MSG_DONTWAIT); if (ret > total_len) { m->msg_flags |= MSG_TRUNC; Cheers, -- Email: Herbert Xu <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply related [flat|nested] 82+ messages in thread
* Re: [PATCH 2/4] tun: Use iovec iterators 2014-11-04 8:37 ` Herbert Xu @ 2014-11-05 2:49 ` YOSHIFUJI Hideaki 2014-11-05 3:41 ` Herbert Xu 0 siblings, 1 reply; 82+ messages in thread From: YOSHIFUJI Hideaki @ 2014-11-05 2:49 UTC (permalink / raw) To: Herbert Xu, Al Viro, David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise Cc: hideaki.yoshifuji Hi, Herbert Xu wrote: > Oops, this patch had a left-over skb_pull which made it broken. > Here is a fixed version. > > tun: Use iovec iterators > > This patch removes the use of skb_copy_datagram_const_iovec in > favour of the iovec iterator-based skb_copy_datagram_iter. > > Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> > > diff --git a/drivers/net/tun.c b/drivers/net/tun.c > index 9dd3746..ff955cdb 100644 > --- a/drivers/net/tun.c > +++ b/drivers/net/tun.c : > @@ -1244,23 +1245,25 @@ static ssize_t tun_put_user(struct tun_struct *tun, > if (tun->flags & TUN_VNET_HDR) > vnet_hdr_sz = tun->vnet_hdr_sz; > > + total = skb->len + vlan_hlen + vnet_hdr_sz; > + > if (!(tun->flags & TUN_NO_PI)) { > - if ((len -= sizeof(pi)) < 0) > + if (iov_iter_count(iter) < sizeof(pi)) > return -EINVAL; > > - if (len < skb->len + vlan_hlen + vnet_hdr_sz) { > + if (iov_iter_count(iter) < total) { I guess this should be: sizeof(pi) + total -- Hideaki Yoshifuji <hideaki.yoshifuji@miraclelinux.com> Technical Division, MIRACLE LINUX CORPORATION ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [PATCH 2/4] tun: Use iovec iterators 2014-11-05 2:49 ` YOSHIFUJI Hideaki @ 2014-11-05 3:41 ` Herbert Xu 0 siblings, 0 replies; 82+ messages in thread From: Herbert Xu @ 2014-11-05 3:41 UTC (permalink / raw) To: YOSHIFUJI Hideaki Cc: viro, davem, netdev, linux-kernel, bcrl, hideaki.yoshifuji YOSHIFUJI Hideaki <hideaki.yoshifuji@miraclelinux.com> wrote: >> >> - if (len < skb->len + vlan_hlen + vnet_hdr_sz) { >> + if (iov_iter_count(iter) < total) { > > I guess this should be: sizeof(pi) + total Good catch! Here is a third update: tun: Use iovec iterators This patch removes the use of skb_copy_datagram_const_iovec in favour of the iovec iterator-based skb_copy_datagram_iter. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> diff --git a/drivers/net/tun.c b/drivers/net/tun.c index 9dd3746..b4ac4d5 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -71,6 +71,7 @@ #include <net/rtnetlink.h> #include <net/sock.h> #include <linux/seq_file.h> +#include <linux/uio.h> #include <asm/uaccess.h> @@ -1230,11 +1231,11 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv, static ssize_t tun_put_user(struct tun_struct *tun, struct tun_file *tfile, struct sk_buff *skb, - const struct iovec *iv, int len) + struct iov_iter *iter) { struct tun_pi pi = { 0, skb->protocol }; - ssize_t total = 0; - int vlan_offset = 0, copied; + ssize_t total; + int vlan_offset; int vlan_hlen = 0; int vnet_hdr_sz = 0; @@ -1244,23 +1245,25 @@ static ssize_t tun_put_user(struct tun_struct *tun, if (tun->flags & TUN_VNET_HDR) vnet_hdr_sz = tun->vnet_hdr_sz; + total = skb->len + vlan_hlen + vnet_hdr_sz; + if (!(tun->flags & TUN_NO_PI)) { - if ((len -= sizeof(pi)) < 0) + if (iov_iter_count(iter) < sizeof(pi)) return -EINVAL; - if (len < skb->len + vlan_hlen + vnet_hdr_sz) { + total += sizeof(pi); + if (iov_iter_count(iter) < total) { /* Packet will be striped */ pi.flags |= TUN_PKT_STRIP; } - if (memcpy_toiovecend(iv, (void *) &pi, 0, sizeof(pi))) + if 
(copy_to_iter(&pi, sizeof(pi), iter)) return -EFAULT; - total += sizeof(pi); } if (vnet_hdr_sz) { struct virtio_net_hdr gso = { 0 }; /* no info leak */ - if ((len -= vnet_hdr_sz) < 0) + if (iov_iter_count(iter) < vnet_hdr_sz) return -EINVAL; if (skb_is_gso(skb)) { @@ -1299,17 +1302,12 @@ static ssize_t tun_put_user(struct tun_struct *tun, gso.flags = VIRTIO_NET_HDR_F_DATA_VALID; } /* else everything is zero */ - if (unlikely(memcpy_toiovecend(iv, (void *)&gso, total, - sizeof(gso)))) + if (copy_to_iter(&gso, sizeof(gso), iter)) return -EFAULT; - total += vnet_hdr_sz; } - copied = total; - len = min_t(int, skb->len + vlan_hlen, len); - total += skb->len + vlan_hlen; if (vlan_hlen) { - int copy, ret; + int ret; struct { __be16 h_vlan_proto; __be16 h_vlan_TCI; @@ -1320,36 +1318,32 @@ static ssize_t tun_put_user(struct tun_struct *tun, vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto); - copy = min_t(int, vlan_offset, len); - ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy); - len -= copy; - copied += copy; - if (ret || !len) + ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset); + if (ret || !iov_iter_count(iter)) goto done; - copy = min_t(int, sizeof(veth), len); - ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy); - len -= copy; - copied += copy; - if (ret || !len) + ret = copy_to_iter(&veth, sizeof(veth), iter); + if (ret || !iov_iter_count(iter)) goto done; } - skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len); + skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset); done: tun->dev->stats.tx_packets++; - tun->dev->stats.tx_bytes += len; + tun->dev->stats.tx_bytes += skb->len + vlan_hlen; return total; } static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, - const struct iovec *iv, ssize_t len, int noblock) + const struct iovec *iv, unsigned long segs, + ssize_t len, int noblock) { struct sk_buff *skb; ssize_t ret = 0; int peeked, err, off = 0; + struct iov_iter iter; 
tun_debug(KERN_INFO, tun, "tun_do_read\n"); @@ -1362,11 +1356,12 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile, /* Read frames from queue */ skb = __skb_recv_datagram(tfile->socket.sk, noblock ? MSG_DONTWAIT : 0, &peeked, &off, &err); - if (skb) { - ret = tun_put_user(tun, tfile, skb, iv, len); - kfree_skb(skb); - } else - ret = err; + if (!skb) + return ret; + + iov_iter_init(&iter, READ, iv, segs, len); + ret = tun_put_user(tun, tfile, skb, &iter); + kfree_skb(skb); return ret; } @@ -1387,7 +1382,7 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv, goto out; } - ret = tun_do_read(tun, tfile, iv, len, + ret = tun_do_read(tun, tfile, iv, count, len, file->f_flags & O_NONBLOCK); ret = min_t(ssize_t, ret, len); if (ret > 0) @@ -1488,7 +1483,7 @@ static int tun_recvmsg(struct kiocb *iocb, struct socket *sock, SOL_PACKET, TUN_TX_TIMESTAMP); goto out; } - ret = tun_do_read(tun, tfile, m->msg_iov, total_len, + ret = tun_do_read(tun, tfile, m->msg_iov, m->msg_iovlen, total_len, flags & MSG_DONTWAIT); if (ret > total_len) { m->msg_flags |= MSG_TRUNC; Cheers, -- Email: Herbert Xu <herbert@gondor.apana.org.au> Home Page: http://gondor.apana.org.au/~herbert/ PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt ^ permalink raw reply related [flat|nested] 82+ messages in thread
* [PATCH 3/4] macvtap: Use iovec iterators 2014-11-04 3:38 ` Herbert Xu 2014-11-04 8:31 ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Herbert Xu 2014-11-04 8:31 ` [PATCH 2/4] tun: Use iovec iterators Herbert Xu @ 2014-11-04 8:31 ` Herbert Xu 2014-11-04 8:31 ` [PATCH 4/4] net: Kill skb_copy_datagram_const_iovec Herbert Xu 3 siblings, 0 replies; 82+ messages in thread From: Herbert Xu @ 2014-11-04 8:31 UTC (permalink / raw) To: Al Viro, David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise This patch removes the use of skb_copy_datagram_const_iovec in favour of the iovec iterator-based skb_copy_datagram_iter. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> --- drivers/net/macvtap.c | 45 ++++++++++++++++++++------------------------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c index 880cc09..a0e1dd7 100644 --- a/drivers/net/macvtap.c +++ b/drivers/net/macvtap.c @@ -15,6 +15,7 @@ #include <linux/cdev.h> #include <linux/idr.h> #include <linux/fs.h> +#include <linux/uio.h> #include <net/ipv6.h> #include <net/net_namespace.h> @@ -778,31 +779,28 @@ static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv, /* Put packet to the user space buffer */ static ssize_t macvtap_put_user(struct macvtap_queue *q, const struct sk_buff *skb, - const struct iovec *iv, int len) + struct iov_iter *iter) { int ret; int vnet_hdr_len = 0; int vlan_offset = 0; - int copied, total; + int total; if (q->flags & IFF_VNET_HDR) { struct virtio_net_hdr vnet_hdr; vnet_hdr_len = q->vnet_hdr_sz; - if ((len -= vnet_hdr_len) < 0) + if (iov_iter_count(iter) < vnet_hdr_len) return -EINVAL; macvtap_skb_to_vnet_hdr(skb, &vnet_hdr); - if (memcpy_toiovecend(iv, (void *)&vnet_hdr, 0, sizeof(vnet_hdr))) + if (copy_to_iter(&vnet_hdr, sizeof(vnet_hdr), iter)) return -EFAULT; } - total = copied = vnet_hdr_len; + total = vnet_hdr_len; total += skb->len; - if (!vlan_tx_tag_present(skb)) - len = min_t(int, 
skb->len, len); - else { - int copy; + if (vlan_tx_tag_present(skb)) { struct { __be16 h_vlan_proto; __be16 h_vlan_TCI; @@ -811,37 +809,33 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q, veth.h_vlan_TCI = htons(vlan_tx_tag_get(skb)); vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto); - len = min_t(int, skb->len + VLAN_HLEN, len); total += VLAN_HLEN; - copy = min_t(int, vlan_offset, len); - ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy); - len -= copy; - copied += copy; - if (ret || !len) + ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset); + if (ret || !iov_iter_count(iter)) goto done; - copy = min_t(int, sizeof(veth), len); - ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy); - len -= copy; - copied += copy; - if (ret || !len) + ret = copy_to_iter(&veth, sizeof(veth), iter); + if (ret || !iov_iter_count(iter)) goto done; } - ret = skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len); + ret = skb_copy_datagram_iter(skb, vlan_offset, iter, + skb->len - vlan_offset); done: return ret ? 
ret : total; } static ssize_t macvtap_do_read(struct macvtap_queue *q, - const struct iovec *iv, unsigned long len, + const struct iovec *iv, unsigned long segs, + unsigned long len, int noblock) { DEFINE_WAIT(wait); struct sk_buff *skb; ssize_t ret = 0; + struct iov_iter iter; while (len) { if (!noblock) @@ -863,7 +857,8 @@ static ssize_t macvtap_do_read(struct macvtap_queue *q, schedule(); continue; } - ret = macvtap_put_user(q, skb, iv, len); + iov_iter_init(&iter, READ, iv, segs, len); + ret = macvtap_put_user(q, skb, &iter); kfree_skb(skb); break; } @@ -886,7 +881,7 @@ static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv, goto out; } - ret = macvtap_do_read(q, iv, len, file->f_flags & O_NONBLOCK); + ret = macvtap_do_read(q, iv, count, len, file->f_flags & O_NONBLOCK); ret = min_t(ssize_t, ret, len); if (ret > 0) iocb->ki_pos = ret; @@ -1117,7 +1112,7 @@ static int macvtap_recvmsg(struct kiocb *iocb, struct socket *sock, int ret; if (flags & ~(MSG_DONTWAIT|MSG_TRUNC)) return -EINVAL; - ret = macvtap_do_read(q, m->msg_iov, total_len, + ret = macvtap_do_read(q, m->msg_iov, m->msg_iovlen, total_len, flags & MSG_DONTWAIT); if (ret > total_len) { m->msg_flags |= MSG_TRUNC; ^ permalink raw reply related [flat|nested] 82+ messages in thread
* [PATCH 4/4] net: Kill skb_copy_datagram_const_iovec 2014-11-04 3:38 ` Herbert Xu ` (2 preceding siblings ...) 2014-11-04 8:31 ` [PATCH 3/4] macvtap: " Herbert Xu @ 2014-11-04 8:31 ` Herbert Xu 3 siblings, 0 replies; 82+ messages in thread From: Herbert Xu @ 2014-11-04 8:31 UTC (permalink / raw) To: Al Viro, David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise Now that both macvtap and tun are using skb_copy_datagram_iter, we can kill the abomination that is skb_copy_datagram_const_iovec. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> --- include/linux/skbuff.h | 3 - net/core/datagram.c | 89 ------------------------------------------------- 2 files changed, 92 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 5ff7054..dfd8623 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2639,9 +2639,6 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset, int len); int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *frm, int offset, size_t count); -int skb_copy_datagram_const_iovec(const struct sk_buff *from, int offset, - const struct iovec *to, int to_offset, - int size); int skb_copy_datagram_iter(const struct sk_buff *from, int offset, struct iov_iter *to, int size); void skb_free_datagram(struct sock *sk, struct sk_buff *skb); diff --git a/net/core/datagram.c b/net/core/datagram.c index 45a9d4d..93054b9 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -394,95 +394,6 @@ fault: EXPORT_SYMBOL(skb_copy_datagram_iovec); /** - * skb_copy_datagram_const_iovec - Copy a datagram to an iovec. - * @skb: buffer to copy - * @offset: offset in the buffer to start copying from - * @to: io vector to copy to - * @to_offset: offset in the io vector to start copying to - * @len: amount of data to copy from buffer to iovec - * - * Returns 0 or -EFAULT. - * Note: the iovec is not modified during the copy. 
- */ -int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset, - const struct iovec *to, int to_offset, - int len) -{ - int start = skb_headlen(skb); - int i, copy = start - offset; - struct sk_buff *frag_iter; - - /* Copy header. */ - if (copy > 0) { - if (copy > len) - copy = len; - if (memcpy_toiovecend(to, skb->data + offset, to_offset, copy)) - goto fault; - if ((len -= copy) == 0) - return 0; - offset += copy; - to_offset += copy; - } - - /* Copy paged appendix. Hmm... why does this look so complicated? */ - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - int end; - const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; - - WARN_ON(start > offset + len); - - end = start + skb_frag_size(frag); - if ((copy = end - offset) > 0) { - int err; - u8 *vaddr; - struct page *page = skb_frag_page(frag); - - if (copy > len) - copy = len; - vaddr = kmap(page); - err = memcpy_toiovecend(to, vaddr + frag->page_offset + - offset - start, to_offset, copy); - kunmap(page); - if (err) - goto fault; - if (!(len -= copy)) - return 0; - offset += copy; - to_offset += copy; - } - start = end; - } - - skb_walk_frags(skb, frag_iter) { - int end; - - WARN_ON(start > offset + len); - - end = start + frag_iter->len; - if ((copy = end - offset) > 0) { - if (copy > len) - copy = len; - if (skb_copy_datagram_const_iovec(frag_iter, - offset - start, - to, to_offset, - copy)) - goto fault; - if ((len -= copy) == 0) - return 0; - offset += copy; - to_offset += copy; - } - start = end; - } - if (!len) - return 0; - -fault: - return -EFAULT; -} -EXPORT_SYMBOL(skb_copy_datagram_const_iovec); - -/** * skb_copy_datagram_iter - Copy a datagram to an iovec iterator. * @skb: buffer to copy * @offset: offset in the buffer to start copying from ^ permalink raw reply related [flat|nested] 82+ messages in thread
* Re: [0/3] net: Kill skb_copy_datagram_const_iovec 2014-11-03 20:05 ` [0/3] net: Kill skb_copy_datagram_const_iovec David Miller 2014-11-04 3:38 ` Herbert Xu @ 2014-11-04 5:45 ` Al Viro 2014-11-05 1:53 ` Al Viro 1 sibling, 1 reply; 82+ messages in thread From: Al Viro @ 2014-11-04 5:45 UTC (permalink / raw) To: David Miller; +Cc: herbert, netdev, linux-kernel, bcrl On Mon, Nov 03, 2014 at 03:05:53PM -0500, David Miller wrote: > I'll see if I can make some progress converting the networking over > to iov_iter. It can't be that difficult... albeit perhaps a little > time consuming. FWIW, I have a queue that got started back in April; basically, the plan of attack was * separate kernel-side and userland msghdr. * localize ->msg_iov uses - most of that gets taken care of by several new helpers, as in new helper: skb_copy_datagram_msg() Absolute majority of skb_copy_datagram_iovec() callers (49 out of 56) are passing it msg->msg_iov as iovec. Provide a trivial wrapper that takes msg as argument instead of iovec. and several like that (the numbers in the above are probably incorrect these days - it was done more than half a year ago). * switch kernel-side msghdr to iov_iter. That means diverging layouts; it's really not hard, since we have copying of msghdr from userland already localized. Initially - just a mechanical conversion (i.e. direct uses of iov_iter fields instead of ->msg_iov/->msg_iovlen; note that after the introduction of wrappers the number of such places is very much reduced). * start converting those relatively few places to iov_iter primitives. And that's where it got stalled, since we have to deal with expectations of callers. Syscall ones are trivial; that's not a problem. Unfortunately, there are kernel_{send,recv}msg() users, and those do care about the state the iovec is left in. Strictly speaking, the state of iovec after e.g. ->sendmsg() is undefined. 
And it's not just protocol-dependent - unless I'm seriously misreading it, tcp_sendmsg() ends up modifying iovec in case when it hits tcp_send_rcvq(), while in the normal case it leaves iovec unmodified. So in general you need to feed ->{send,recv}msg() a throwaway copy of iovec. Leads to wonders like /* NB we can't trust socket ops to either consume our iovs * or leave them alone. */ LASSERT (niov > 0); for (nob = i = 0; i < niov; i++) { scratchiov[i] = iov[i]; nob += scratchiov[i].iov_len; } LASSERT (nob <= conn->ksnc_rx_nob_wanted); rc = kernel_recvmsg(conn->ksnc_sock, &msg, (struct kvec *)scratchiov, niov, nob, MSG_DONTWAIT); etc. However, there are places that don't bother and do this: while (total_rx < data) { rx_loop = kernel_recvmsg(conn->sock, &msg, iov_p, iov_len, (data - total_rx), MSG_WAITALL); if (rx_loop <= 0) { pr_debug("rx_loop: %d total_rx: %d\n", rx_loop, total_rx); return rx_loop; } total_rx += rx_loop; pr_debug("rx_loop: %d, total_rx: %d, data: %d\n", rx_loop, total_rx, data); } (that's iscsit_do_rx_data()). Maybe it's a bug; maybe it's relying on specific behaviour of the protocol known to be used - this code clearly expects recvmsg to advance iovec, which seems to depend only on the protocol. At the moment. In any case, it's very brittle... Hell knows; I hadn't finished digging through that zoo - got sidetracked back then. *IF* all such places either use a throwaway copy or assume that iovec gets modified, we can do the following: switch the access to iovecs to iov_iter primitives, with the first kind of callers creating a throwaway iov_iter and the second just feeding the same iov_iter to e.g. kernel_recvmsg(). iovec will remain constant, iov_iter will be advanced. Moreover, in a lot of cases of first kind will be able to get rid of throwaway iov_iter (and of manually advancing it), effectively converting to the second one. If we have places that currently rely on iovec remaining unchanged (i.e. 
manually advancing it after kernel_{send,recv}msg()), the series will be more painful ;-/ I very much hope that no such places exist... FWIW, there is also a tactical question that needs to be dealt with. We can, of course, start with renaming the "kernel-side" (i.e. post copy_msghdr_from_user()/get_compat_msghdr()) to struct kmsghdr. OTOH, that's a _lot_ of churn for very little reason - most of the instances in the tree are of that kind. So I did it the other way round - introduced struct user_msghdr (only in linux/socket.h; note that we do *not* have struct msghdr in uabi/linux/socket.h, or anywhere else in uabi/*), made the syscalls take pointers to it and (initially) rely upon the identical layouts in copy_msghdr_from_user(); once we put iov_iter into kernel-side msghdr, we'll just do it like get_compat_msghdr() does. Is that acceptable? It would greatly reduce the amount of churn in net/* - we don't need to pass iov_iter separately and most of the functions in the middle of call chains are completely unchanged. Only the originators of ->sendmsg()/->recvmsg() and the places doing actual data copying need to be touched. OTOH, it makes for kernel struct msghdr looking odd - instead of normal ->msg_iov and ->msg_iovlen it would have ->msg_iov_iter, with ->sendmsg()/->recvmsg() callers needing to set it up... OTTH, the things *are* odd from userland programmer POV - sendmsg(2) and recvmsg(2) leave the iovec unchanged, and having it changed unpredictably in the kernel-side counterparts seems to make for a nasty trap. Certainly makes for a bunch of nasty comments in the code using those... Comments? ^ permalink raw reply [flat|nested] 82+ messages in thread
* Re: [0/3] net: Kill skb_copy_datagram_const_iovec 2014-11-04 5:45 ` [0/3] " Al Viro @ 2014-11-05 1:53 ` Al Viro 0 siblings, 0 replies; 82+ messages in thread From: Al Viro @ 2014-11-05 1:53 UTC (permalink / raw) To: David Miller Cc: herbert, netdev, linux-kernel, bcrl, Steve French, Sage Weil, Nicholas A. Bellinger On Tue, Nov 04, 2014 at 05:45:13AM +0000, Al Viro wrote: > Hell knows; I hadn't finished digging through that zoo - got sidetracked back > then. *IF* all such places either use a throwaway copy or assume that iovec > gets modified, we can do the following: switch the access to iovecs to > iov_iter primitives, with the first kind of callers creating a throwaway > iov_iter and the second just feeding the same iov_iter to e.g. > kernel_recvmsg(). iovec will remain constant, iov_iter will be advanced. > Moreover, in a lot of cases of first kind will be able to get rid of > throwaway iov_iter (and of manually advancing it), effectively converting > to the second one. All right, now I _have_ finished that. See the resulting notes below. TL;DR version: looks like hypothesis above is correct, modulo 2 places, both buggy - cifs smb_send_kvec() apparently relies on ->sendmsg() leaving the iovec unchanged and so does one of the ceph_tcp_sendmsg() callers (write_partial_kvec()). For TCP that's not always true. Another apparent bug caught in process is iscsi iscsit_do_tx_data() - assumes that iovec is being consumed by sendmsg(). I don't see how that could not be a bug - TCP sockets can get there and tcp_sendmsg() normally *doesn't* modify the iovec. Sometimes it does, unfortunately for other two places... Maintainers Cc'd... Full version follows: ----------------------------------------------------------------------------- ->sendmsg(): there's such method in struct proto_ops and in struct proto; the latter is called by (some of) the former, in cases when ->sendmsg() isn't the same for the entire family.
Instances of proto ->sendmsg() are, on several occasions, called directly; some of those calls are from another such instance (with unchanged payload). There are two exceptions to that - in tipc_accept() and tipc_connect() we call such instances with empty payload. All calls via method are from proto_ops ->sendmsg() instances, payload unchanged. Instances of proto_ops ->sendmsg() are almost never called directly. All exceptions are from another such instance with unchanged payload. There are two places that call proto_ops ->sendmsg() via method - __sock_sendmsg_nosec() in net/socket.c, and handle_tx() in drivers/vhost/net.c. The latter appears to be playing somewhat unusual games with passing NULL iocb, making it impossible to use the former... Everybody else in the kernel goes through __sock_sendmsg_nosec(), though - it's a chokepoint for sendmsg path. vhost callsite is somewhat worrying - granted, most of the ->sendmsg() instances don't give a damn about iocb at all. The rest, though... E.g. what happens if we do VHOST_NET_SET_BACKEND with backend.fd being an AF_UNIX socket? AFAICS, if it ever gets to that ->sendmsg() call afterwards, we'll get an oops when e.g. unix_dgram_sendmsg() calls kiocb_to_siocb(NULL). The same goes for AF_NETLINK; AF_TIPC is even more interesting, since there NULL iocb is used as "we are in weird callchain, socket is already locked" flag (for tipc_{accept,connect}() callsites). What's going on there? [After having talked with mst: drivers/vhost/net.c checks that it's AF_PACKET/SOCK_RAW (packet_sendmsg()) *or* comes from tun.c (tun_sendmsg()) or from macvtap.c (macvtap_sendmsg()) and all of those ignore iocb completely] FWIW, the situation with iocb is * AF_UNIX and AF_NETLINK use it to get to sock_iocb * AF_TIPC uses it as a flag - it never dereferences the damn thing and only compares it with NULL, to determine whether it's in a normal call chain, or in tipc_accept/tipc_connect one... 
* everything else ignores it completely, either directly or after passing it to something that ignores it (rxrpc has unusually deep chain, but it still ends up ignoring the sucker). ->recvmsg(): again, present in proto_ops and proto, in the same relationship to each other. The situation is a bit simpler there: there is only one direct caller of an instance of struct proto ->recvmsg() and it is in another such instance, arguments unchanged. All callers via method are in the instances of struct proto_ops ->recvmsg(), message-related arguments unchanged. No direct callers of struct proto_ops recvmsg, three call sites via method - regular one in __sock_recvmsg_nosec() plus two in drivers/vhost/net.c. The latter have NULL iocb and are saved from oopsing in ->recvmsg() by the same logic that saves vhost on sendmsg side. iocb is ignored by everything except AF_UNIX and AF_NETLINK (those use it for sock_iocb) and neither can be reached from vhost path. So it boils down to the following: drivers/vhost/net.c aside, everything goes through __sock_sendmsg_nosec() on the sendmsg side and __sock_recvmsg_nosec() on recvmsg one. Call chains leading to __sock_sendmsg_nosec(): __sock_sendmsg_nosec() <- sock_sendmsg_nosec() <- ___sys_sendmsg() <- __sys_sendmsg() <- sys_compat_sendmsg() <- sys_sendmsg() <- __sys_sendmmsg() <- sys_compat_sendmmsg() <- sys_sendmmsg() <- __sock_sendmsg() <- do_sock_write() <- sock_aio_write() == ->aio_write() <- sock_sendmsg() <- svc_sendto() [no iovec at all] <- ___sys_sendmsg() [see above] <- sys_sendto() <- kernel_sendmsg() All syscalls (and there's quite a tangled mess with sys_socketcall, assorted ARM wrappers, etc.) end up with iovec discarded. Ditto for ->aio_write() callers - they all free the iovec soon after ->aio_write() returns and never look at it before freeing.
Call chains leading to __sock_recvmsg_nosec(): __sock_recvmsg_nosec() <- sock_recvmsg_nosec() <- ___sys_recvmsg() <- __sys_recvmsg() <- sys_compat_recvmsg() <- sys_recvmsg() <- __sys_recvmmsg() <- sys_compat_recvmmsg() <- sys_recvmmsg() <- __sock_recvmsg() <- do_sock_read() <- sock_aio_read() == ->aio_read() <- sock_recvmsg() [why is it not static, BTW?] <- ___sys_recvmsg() [see above] <- sys_recvfrom() <- kernel_recvmsg() Again, both the syscalls and ->aio_read() callers end up discarding iovec. All of that leaves us with kernel_{send,recv}msg() as the next-order chokepoints. kernel_recvmsg() callers: drbd drbd_recv_short() - single-element iovec discarded nbd sock_xmit() - single-element iovec discarded; would be better off with advancing iov_iter. isdn l1oip_socket_thread() - single-element iovec discarded lustre ksocknal_lib_recv_iov() - iovec copied (with unhappy comment), copy passed to kernel_recvmsg() and discarded lustre ksocknal_lib_recv_kiov() - ditto. Would be much better off with bvec-based iov_iter lustre libcfs_sock_read() - single-element iovec discarded; would be better off with advancing iov_iter. iscsi iscsit_do_rx_data() - assumes that iovec is being consumed. AFAICS, it's guaranteed to be TCP and tcp_recvmsg() appears to act that way, so it's probably OK... usbip usbip_recv() - single-element iovec discarded; would be better off with advancing iov_iter cifs cifs_readv_from_socket() - iovec copied, copy passed to kernel_recvmsg() and discarded; *definitely* would be better off with advancing iov_iter. dlm receive_from_sock() - 1- or 2-element iovec discarded. ncpfs _recv() - single-element iovec discarded. ocfs2 o2net_recv_tcp_msg() - single-element iovec discarded. ceph ceph_tcp_recvmsg() - single-element iovec discarded; at least one of the loops using it would be better off with advancing iov_iter. ipvs ip_vs_receive() - single-element iovec discarded.
sunrpc svc_udp_recvfrom() - no payload (MSG_PEEK, that one) tipc tipc_receive_from_sock() - single-element iovec discarded. sunrpc svc_recvfrom() - confusing; looks like iovec is a throwaway one, though (and we might be better off if we could use an iov_iter of bvec sort instead). kernel_sendmsg() callers: drbd drbd_send() - single-element iovec discarded; would be better off with advancing iov_iter nbd sock_xmit() - ditto isdn l1oip_socket_send() - single-element iovec discarded iscsi iscsi_sw_tcp_xmit_segment() - single-element iovec discarded lustre ksocknal_lib_send_iov() - iovec copied (with unhappy comment), copy passed to kernel_sendmsg() and discarded lustre ksocknal_lib_send_kiov - ditto. Would be much better off with bvec iov_iter. lustre libcfs_sock_write() - single-element iovec discarded; better off with advancing iov_iter iscsi iscsit_do_tx_data() - assumes that iovec is being consumed. I don't see how that could not be a bug - TCP sockets can get there. usbip stub_send_ret_submit() - iovec is built and discarded; short write is treated as an error usbip stub_send_ret_unlink() - ditto usbip vhci_send_cmd_submit() - ditto usbip vhci_send_cmd_unlink() - ditto cifs smb_send_kvec() - apparently relies on ->sendmsg() leaving the iovec unchanged. Looks like a bug - tcp_sendmsg() might drain iovec in some cases. 
dlm sctp_send_shutdown() - no payload dlm sctp_init_assoc() - single-element iovec discarded ncpfs do_send() - single-element iovec discarded ocfs2 o2net_send_tcp_msg() - callers pass it a throwaway iovec bnep bnep_send() - single-element iovec discarded bnep_tx_frame() - short iovec is built and discarded cmtp_send_frame() - single-element iovec discarded hidp_send_frame() - single-element iovec discarded rfcomm_send_frame() - single-element iovec discarded rfcomm_send_test() - short iovec is built and discarded core sock_no_sendpage*() - single-element iovec discarded ipvs ip_vs_send_async() - single-element iovec discarded rds rds_tcp_sendmsg() - single-element iovec discarded rxrpc rxrpc_busy() - single-element iovec discarded rxrpc rxrpc_process_call() - short iovec is built and discarded rxrpc rxrpc_abort_connection() - short iovec is built and discarded rxrpc rxrpc_reject_packets() - assumes that sendmsg drains iovec; may be a bug. rxrpc rxrpc_send_packet() - single-element iovec discarded rxrpc rxkad_issue_challenge() - short iovec is built and discarded rxrpc rxkad_send_response - short iovec is built and discarded sunrpc xs_send_kvec() - single-element iovec discarded; really asks for iov_iter... tipc tipc_send_to_sock() - single-element iovec discarded; for some reason it seems to believe that short writes never happen... ceph ceph_tcp_sendmsg() - one caller appears to discard iovec, another (write_partial_kvec()) apparently assumes that iovec is unchanged by sendmsg. Not guaranteed to be true for TCP, AFAICS. ^ permalink raw reply [flat|nested] 82+ messages in thread
end of thread, other threads:[~2014-11-28 5:14 UTC | newest] Thread overview: 82+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2014-11-02 23:05 fs: Use non-const iov in aio_read/aio_write Herbert Xu 2014-11-03 0:16 ` Al Viro 2014-11-03 0:21 ` Al Viro 2014-11-03 0:22 ` Herbert Xu 2014-11-03 0:45 ` Al Viro 2014-11-03 5:37 ` [0/3] net: Kill skb_copy_datagram_const_iovec Herbert Xu 2014-11-03 5:44 ` [PATCH 1/3] tun: Modify const aio_read iovec per do_sock_read Herbert Xu 2014-11-03 5:44 ` [PATCH 3/3] net: Kill skb_copy_datagram_const_iovec Herbert Xu 2014-11-03 5:44 ` [PATCH 2/3] macvtap: Modify const aio_read iovec per do_sock_read Herbert Xu 2014-11-03 20:05 ` [0/3] net: Kill skb_copy_datagram_const_iovec David Miller 2014-11-04 3:38 ` Herbert Xu 2014-11-04 8:31 ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Herbert Xu 2014-11-04 14:32 ` Al Viro 2014-11-04 14:35 ` Al Viro 2014-11-04 14:44 ` Herbert Xu 2014-11-04 14:52 ` Al Viro 2014-11-04 14:55 ` Herbert Xu 2014-11-04 14:42 ` Herbert Xu 2014-11-04 15:13 ` Al Viro 2014-11-05 2:22 ` Herbert Xu 2014-11-05 3:27 ` David Miller 2014-11-05 3:55 ` Al Viro 2014-11-05 4:12 ` Al Viro 2014-11-05 20:51 ` David Miller 2014-11-05 20:50 ` David Miller 2014-11-05 21:07 ` Al Viro 2014-11-05 21:57 ` David Miller 2014-11-06 3:25 ` Al Viro 2014-11-06 5:50 ` ipv4: Use standard iovec primitive in raw_probe_proto_opt Herbert Xu 2014-11-06 6:43 ` Al Viro 2014-11-06 6:46 ` Herbert Xu 2014-11-06 7:11 ` Al Viro 2014-11-06 9:55 ` Jon Maloy 2014-11-06 22:16 ` Al Viro 2014-11-28 5:14 ` Al Viro 2014-11-06 21:28 ` David Miller 2014-11-07 2:00 ` Herbert Xu 2014-11-07 13:25 ` [PATCH 0/2] ipv4: Simplify raw_probe_proto_opt and avoid reading user iov twice Herbert Xu 2014-11-07 13:27 ` [PATCH 1/2] ipv4: Use standard iovec primitive in raw_probe_proto_opt Herbert Xu 2014-11-07 13:27 ` [PATCH 2/2] ipv4: Avoid reading user iov twice after raw_probe_proto_opt Herbert Xu 2014-11-10 19:26 ` [PATCH 0/2] 
ipv4: Simplify raw_probe_proto_opt and avoid reading user iov twice David Miller 2014-11-06 9:50 ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Jon Maloy 2014-11-07 21:48 ` David Miller 2014-11-07 22:11 ` Al Viro 2014-11-07 22:31 ` Al Viro 2014-11-07 22:35 ` Al Viro 2014-11-07 23:42 ` Al Viro 2014-11-08 2:21 ` Herbert Xu 2014-11-09 21:19 ` Al Viro 2014-11-10 5:20 ` David Miller 2014-11-10 6:58 ` Al Viro 2014-11-10 7:30 ` David Miller 2014-11-10 9:09 ` Al Viro 2014-11-10 16:18 ` David Miller 2014-11-10 10:14 ` Michael S. Tsirkin 2014-11-07 21:52 ` David Miller 2014-11-05 20:24 ` David Miller 2014-11-06 8:23 ` Herbert Xu 2014-11-06 17:25 ` David Miller 2014-11-07 1:59 ` Herbert Xu 2014-11-07 3:13 ` David Miller 2014-11-07 13:21 ` [PATCH 0/4] Replace skb_copy_datagram_const_iovec with iterator version Herbert Xu 2014-11-07 13:22 ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Herbert Xu 2014-11-07 13:22 ` [PATCH 2/4] tun: Use iovec iterators Herbert Xu 2014-11-07 13:22 ` [PATCH 3/4] macvtap: " Herbert Xu 2014-11-07 13:22 ` [PATCH 4/4] net: Kill skb_copy_datagram_const_iovec Herbert Xu 2014-11-06 8:27 ` [PATCH 0/4] Replace skb_copy_datagram_const_iovec with iterator version Herbert Xu 2014-11-06 8:28 ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Herbert Xu 2014-11-06 17:30 ` Al Viro 2014-11-07 1:58 ` Herbert Xu 2014-11-06 8:28 ` [PATCH 2/4] tun: Use iovec iterators Herbert Xu 2014-11-06 8:28 ` [PATCH 3/4] macvtap: " Herbert Xu 2014-11-06 17:33 ` Al Viro 2014-11-06 8:28 ` [PATCH 4/4] net: Kill skb_copy_datagram_const_iovec Herbert Xu 2014-11-04 8:31 ` [PATCH 2/4] tun: Use iovec iterators Herbert Xu 2014-11-04 8:37 ` Herbert Xu 2014-11-05 2:49 ` YOSHIFUJI Hideaki 2014-11-05 3:41 ` Herbert Xu 2014-11-04 8:31 ` [PATCH 3/4] macvtap: " Herbert Xu 2014-11-04 8:31 ` [PATCH 4/4] net: Kill skb_copy_datagram_const_iovec Herbert Xu 2014-11-04 5:45 ` [0/3] " Al Viro 2014-11-05 1:53 ` Al Viro
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for NNTP newsgroup(s).