linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* fs: Use non-const iov in aio_read/aio_write
@ 2014-11-02 23:05 Herbert Xu
  2014-11-03  0:16 ` Al Viro
  0 siblings, 1 reply; 82+ messages in thread
From: Herbert Xu @ 2014-11-02 23:05 UTC (permalink / raw)
  To: David S. Miller, netdev, Linux Kernel Mailing List; +Cc: Benjamin LaHaise

Currently the functions aio_read/aio_write use a const iov as
input.  This is unnecessary as all their callers supply a
stack-based or kmalloced iov which is never reused.  Conceptually
this is fine because iovs supplied to aio_read/aio_write ultimately
come from user-space so we always have to make a copy of them for
the kernel.

This is also a joke because for as long as we've had the const iov
(since 2.1.15), the network stack (currently through do_sock_read
and do_sock_write) has been casting the const away.  IOW if anybody
did supply a const iov they would crash and burn if they ever
entered the network stack.

The network stack needs a non-const iov because it iterates through
the iov as it reads/writes data.

So we have two alternatives, either change the network stack to
not touch the iovs or make the iovs non-const.

As there is no reason for the iovs to be const in the first place,
I have taken the second choice and changed all aio_read/aio_write
functions to use non-const iovs.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index b30753c..dfefc79 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -434,8 +434,8 @@ prototypes:
 	loff_t (*llseek) (struct file *, loff_t, int);
 	ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
 	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
-	ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
-	ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
+	ssize_t (*aio_read) (struct kiocb *, struct iovec *, unsigned long, loff_t);
+	ssize_t (*aio_write) (struct kiocb *, struct iovec *, unsigned long, loff_t);
 	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
 	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
 	int (*iterate) (struct file *, struct dir_context *);
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 20bf204..a2ba142 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -811,8 +811,8 @@ struct file_operations {
 	loff_t (*llseek) (struct file *, loff_t, int);
 	ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
 	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
-	ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
-	ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
+	ssize_t (*aio_read) (struct kiocb *, struct iovec *, unsigned long, loff_t);
+	ssize_t (*aio_write) (struct kiocb *, struct iovec *, unsigned long, loff_t);
 	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
 	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
 	int (*iterate) (struct file *, struct dir_context *);
diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c
index c952b98..c7490bd 100644
--- a/arch/s390/hypfs/inode.c
+++ b/arch/s390/hypfs/inode.c
@@ -144,7 +144,7 @@ static int hypfs_open(struct inode *inode, struct file *filp)
 	return nonseekable_open(inode, filp);
 }
 
-static ssize_t hypfs_aio_read(struct kiocb *iocb, const struct iovec *iov,
+static ssize_t hypfs_aio_read(struct kiocb *iocb, struct iovec *iov,
 			      unsigned long nr_segs, loff_t offset)
 {
 	char *data;
@@ -167,7 +167,7 @@ static ssize_t hypfs_aio_read(struct kiocb *iocb, const struct iovec *iov,
 
 	return ret;
 }
-static ssize_t hypfs_aio_write(struct kiocb *iocb, const struct iovec *iov,
+static ssize_t hypfs_aio_write(struct kiocb *iocb, struct iovec *iov,
 			      unsigned long nr_segs, loff_t offset)
 {
 	int rc;
diff --git a/drivers/char/mem.c b/drivers/char/mem.c
index 524b707..d94e5b0 100644
--- a/drivers/char/mem.c
+++ b/drivers/char/mem.c
@@ -598,13 +598,13 @@ static ssize_t write_null(struct file *file, const char __user *buf,
 	return count;
 }
 
-static ssize_t aio_read_null(struct kiocb *iocb, const struct iovec *iov,
+static ssize_t aio_read_null(struct kiocb *iocb, struct iovec *iov,
 			     unsigned long nr_segs, loff_t pos)
 {
 	return 0;
 }
 
-static ssize_t aio_write_null(struct kiocb *iocb, const struct iovec *iov,
+static ssize_t aio_write_null(struct kiocb *iocb, struct iovec *iov,
 			      unsigned long nr_segs, loff_t pos)
 {
 	return iov_length(iov, nr_segs);
diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c
index 6d7f453..8b75de4f 100644
--- a/drivers/infiniband/hw/ipath/ipath_file_ops.c
+++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c
@@ -53,7 +53,7 @@ static int ipath_open(struct inode *, struct file *);
 static int ipath_close(struct inode *, struct file *);
 static ssize_t ipath_write(struct file *, const char __user *, size_t,
 			   loff_t *);
-static ssize_t ipath_writev(struct kiocb *, const struct iovec *,
+static ssize_t ipath_writev(struct kiocb *, struct iovec *,
 			    unsigned long , loff_t);
 static unsigned int ipath_poll(struct file *, struct poll_table_struct *);
 static int ipath_mmap(struct file *, struct vm_area_struct *);
@@ -2414,7 +2414,7 @@ bail:
 	return ret;
 }
 
-static ssize_t ipath_writev(struct kiocb *iocb, const struct iovec *iov,
+static ssize_t ipath_writev(struct kiocb *iocb, struct iovec *iov,
 			    unsigned long dim, loff_t off)
 {
 	struct file *filp = iocb->ki_filp;
diff --git a/drivers/infiniband/hw/qib/qib_file_ops.c b/drivers/infiniband/hw/qib/qib_file_ops.c
index b15e34e..8872924 100644
--- a/drivers/infiniband/hw/qib/qib_file_ops.c
+++ b/drivers/infiniband/hw/qib/qib_file_ops.c
@@ -55,7 +55,7 @@
 static int qib_open(struct inode *, struct file *);
 static int qib_close(struct inode *, struct file *);
 static ssize_t qib_write(struct file *, const char __user *, size_t, loff_t *);
-static ssize_t qib_aio_write(struct kiocb *, const struct iovec *,
+static ssize_t qib_aio_write(struct kiocb *, struct iovec *,
 			     unsigned long, loff_t);
 static unsigned int qib_poll(struct file *, struct poll_table_struct *);
 static int qib_mmapf(struct file *, struct vm_area_struct *);
@@ -2245,7 +2245,7 @@ bail:
 	return ret;
 }
 
-static ssize_t qib_aio_write(struct kiocb *iocb, const struct iovec *iov,
+static ssize_t qib_aio_write(struct kiocb *iocb, struct iovec *iov,
 			     unsigned long dim, loff_t off)
 {
 	struct qib_filedata *fp = iocb->ki_filp->private_data;
diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 6f226de..823522e 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -761,7 +761,7 @@ err:
 	return err;
 }
 
-static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
+static ssize_t macvtap_aio_write(struct kiocb *iocb, struct iovec *iv,
 				 unsigned long count, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
@@ -871,7 +871,7 @@ static ssize_t macvtap_do_read(struct macvtap_queue *q,
 	return ret;
 }
 
-static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
+static ssize_t macvtap_aio_read(struct kiocb *iocb, struct iovec *iv,
 				unsigned long count, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 9dd3746..8d06816 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1206,7 +1206,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	return total_len;
 }
 
-static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
+static ssize_t tun_chr_aio_write(struct kiocb *iocb, struct iovec *iv,
 			      unsigned long count, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
@@ -1371,7 +1371,7 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
 	return ret;
 }
 
-static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
+static ssize_t tun_chr_aio_read(struct kiocb *iocb, struct iovec *iv,
 			    unsigned long count, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
index 63314ed..47fec3fd 100644
--- a/drivers/usb/gadget/function/f_fs.c
+++ b/drivers/usb/gadget/function/f_fs.c
@@ -958,7 +958,7 @@ static int ffs_aio_cancel(struct kiocb *kiocb)
 }
 
 static ssize_t ffs_epfile_aio_write(struct kiocb *kiocb,
-				    const struct iovec *iovec,
+				    struct iovec *iovec,
 				    unsigned long nr_segs, loff_t loff)
 {
 	struct ffs_io_data *io_data;
@@ -985,7 +985,7 @@ static ssize_t ffs_epfile_aio_write(struct kiocb *kiocb,
 }
 
 static ssize_t ffs_epfile_aio_read(struct kiocb *kiocb,
-				   const struct iovec *iovec,
+				   struct iovec *iovec,
 				   unsigned long nr_segs, loff_t loff)
 {
 	struct ffs_io_data *io_data;
diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
index c744e49..211ab83 100644
--- a/drivers/usb/gadget/legacy/inode.c
+++ b/drivers/usb/gadget/legacy/inode.c
@@ -695,7 +695,7 @@ fail:
 }
 
 static ssize_t
-ep_aio_read(struct kiocb *iocb, const struct iovec *iov,
+ep_aio_read(struct kiocb *iocb, struct iovec *iov,
 		unsigned long nr_segs, loff_t o)
 {
 	struct ep_data		*epdata = iocb->ki_filp->private_data;
@@ -712,7 +712,7 @@ ep_aio_read(struct kiocb *iocb, const struct iovec *iov,
 }
 
 static ssize_t
-ep_aio_write(struct kiocb *iocb, const struct iovec *iov,
+ep_aio_write(struct kiocb *iocb, struct iovec *iov,
 		unsigned long nr_segs, loff_t o)
 {
 	struct ep_data		*epdata = iocb->ki_filp->private_data;
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index afd2b44..ca3db8d 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -33,13 +33,13 @@ static ssize_t bad_file_write(struct file *filp, const char __user *buf,
         return -EIO;
 }
 
-static ssize_t bad_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+static ssize_t bad_file_aio_read(struct kiocb *iocb, struct iovec *iov,
 			unsigned long nr_segs, loff_t pos)
 {
 	return -EIO;
 }
 
-static ssize_t bad_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+static ssize_t bad_file_aio_write(struct kiocb *iocb, struct iovec *iov,
 			unsigned long nr_segs, loff_t pos)
 {
 	return -EIO;
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ca88731..88ce708 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1277,7 +1277,7 @@ static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
 	return err;
 }
 
-static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
+static ssize_t fuse_dev_read(struct kiocb *iocb, struct iovec *iov,
 			      unsigned long nr_segs, loff_t pos)
 {
 	struct fuse_copy_state cs;
@@ -1881,7 +1881,7 @@ static ssize_t fuse_dev_do_write(struct fuse_conn *fc,
 	return err;
 }
 
-static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
+static ssize_t fuse_dev_write(struct kiocb *iocb, struct iovec *iov,
 			      unsigned long nr_segs, loff_t pos)
 {
 	struct fuse_copy_state cs;
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 643faa4..2617860 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2114,7 +2114,7 @@ out:
 /**
  * ntfs_file_aio_write -
  */
-static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+static ssize_t ntfs_file_aio_write(struct kiocb *iocb, struct iovec *iov,
 		unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4e41a4a..2585428 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1485,8 +1485,8 @@ struct file_operations {
 	loff_t (*llseek) (struct file *, loff_t, int);
 	ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
 	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
-	ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
-	ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
+	ssize_t (*aio_read) (struct kiocb *, struct iovec *, unsigned long, loff_t);
+	ssize_t (*aio_write) (struct kiocb *, struct iovec *, unsigned long, loff_t);
 	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
 	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
 	int (*iterate) (struct file *, struct dir_context *);
diff --git a/net/socket.c b/net/socket.c
index fe20c31..3c6fbab 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -114,9 +114,9 @@ unsigned int sysctl_net_busy_poll __read_mostly;
 #endif
 
 static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
-static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
+static ssize_t sock_aio_read(struct kiocb *iocb, struct iovec *iov,
 			 unsigned long nr_segs, loff_t pos);
-static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov,
+static ssize_t sock_aio_write(struct kiocb *iocb, struct iovec *iov,
 			  unsigned long nr_segs, loff_t pos);
 static int sock_mmap(struct file *file, struct vm_area_struct *vma);
 
@@ -901,7 +901,7 @@ static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb,
 }
 
 static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb,
-		struct file *file, const struct iovec *iov,
+		struct file *file, struct iovec *iov,
 		unsigned long nr_segs)
 {
 	struct socket *sock = file->private_data;
@@ -915,14 +915,14 @@ static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb,
 	msg->msg_namelen = 0;
 	msg->msg_control = NULL;
 	msg->msg_controllen = 0;
-	msg->msg_iov = (struct iovec *)iov;
+	msg->msg_iov = iov;
 	msg->msg_iovlen = nr_segs;
 	msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
 
 	return __sock_recvmsg(iocb, sock, msg, size, msg->msg_flags);
 }
 
-static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
+static ssize_t sock_aio_read(struct kiocb *iocb, struct iovec *iov,
 				unsigned long nr_segs, loff_t pos)
 {
 	struct sock_iocb siocb, *x;
@@ -941,7 +941,7 @@ static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,
 }
 
 static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb,
-			struct file *file, const struct iovec *iov,
+			struct file *file, struct iovec *iov,
 			unsigned long nr_segs)
 {
 	struct socket *sock = file->private_data;
@@ -955,7 +955,7 @@ static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb,
 	msg->msg_namelen = 0;
 	msg->msg_control = NULL;
 	msg->msg_controllen = 0;
-	msg->msg_iov = (struct iovec *)iov;
+	msg->msg_iov = iov;
 	msg->msg_iovlen = nr_segs;
 	msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0;
 	if (sock->type == SOCK_SEQPACKET)
@@ -964,7 +964,7 @@ static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb,
 	return __sock_sendmsg(iocb, sock, msg, size);
 }
 
-static ssize_t sock_aio_write(struct kiocb *iocb, const struct iovec *iov,
+static ssize_t sock_aio_write(struct kiocb *iocb, struct iovec *iov,
 			  unsigned long nr_segs, loff_t pos)
 {
 	struct sock_iocb siocb, *x;
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c
index 166d59c..229b5a9 100644
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -2995,7 +2995,7 @@ static ssize_t snd_pcm_write(struct file *file, const char __user *buf,
 	return result;
 }
 
-static ssize_t snd_pcm_aio_read(struct kiocb *iocb, const struct iovec *iov,
+static ssize_t snd_pcm_aio_read(struct kiocb *iocb, struct iovec *iov,
 			     unsigned long nr_segs, loff_t pos)
 
 {
@@ -3031,7 +3031,7 @@ static ssize_t snd_pcm_aio_read(struct kiocb *iocb, const struct iovec *iov,
 	return result;
 }
 
-static ssize_t snd_pcm_aio_write(struct kiocb *iocb, const struct iovec *iov,
+static ssize_t snd_pcm_aio_write(struct kiocb *iocb, struct iovec *iov,
 			      unsigned long nr_segs, loff_t pos)
 {
 	struct snd_pcm_file *pcm_file;

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* Re: fs: Use non-const iov in aio_read/aio_write
  2014-11-02 23:05 fs: Use non-const iov in aio_read/aio_write Herbert Xu
@ 2014-11-03  0:16 ` Al Viro
  2014-11-03  0:21   ` Al Viro
  2014-11-03  0:22   ` Herbert Xu
  0 siblings, 2 replies; 82+ messages in thread
From: Al Viro @ 2014-11-03  0:16 UTC (permalink / raw)
  To: Herbert Xu
  Cc: David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise

On Mon, Nov 03, 2014 at 07:05:52AM +0800, Herbert Xu wrote:
> Currently the functions aio_read/aio_write use a const iov as
> input.  This is unnecessary as all their callers supply a
> stack-based or kmalloced iov which is never reused.  Conceptually
> this is fine because iovs supplied to aio_read/aio_write ultimately
> come from user-space so we always have to make a copy of them for
> the kernel.
> 
> This is also a joke because for as long (since 2.1.15) as we've
> had the const iov, the network stack (currently through do_sock_read
> and do_sock_write) has been casting the const away.  IOW if anybody
> did supply a const iov they would crash and burn if they ever
> entered the network stack.
> 
> The network stack needs a non-const iov because it iterates through
> the iov as it reads/writes data.
> 
> So we have two alternatives, either change the network stack to
> not touch the iovs or make the iovs non-const.
> 
> As there is no reason for the iovs to be const in the first place,
> I have taken the second choice and changed all aio_read/aio_write
> functions to use non-const iovs.

NAK with extreme prejudice.  The right way to deal with that is
to convert the socket side of things to iov_iter.  And give it a
consistent behaviour, while we are at it (some protocols do advance
the damn thing, some do not).  There are _very_ good reasons to have those
iovecs unchanged - if you look at the callers on the socket side, you'll
see a bunch that has to _copy_ iovec just to avoid it being buggered.
And you get rather suboptimal behaviour in memcpy_fromiovec() and friends,
exactly because you have to skip through the emptied elements.

IOW, no way in hell.

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: fs: Use non-const iov in aio_read/aio_write
  2014-11-03  0:16 ` Al Viro
@ 2014-11-03  0:21   ` Al Viro
  2014-11-03  0:22   ` Herbert Xu
  1 sibling, 0 replies; 82+ messages in thread
From: Al Viro @ 2014-11-03  0:21 UTC (permalink / raw)
  To: Herbert Xu
  Cc: David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise

On Mon, Nov 03, 2014 at 12:16:34AM +0000, Al Viro wrote:

> NAK with extreme prejudice.  The right way to deal with that is
> to convert the socket side of things to iov_iter.  And give it a
> consistent behaviour, while we are at it (some protocols do advance
> the damn thing, some do not).  There are _very_ good reasons to have those
> iovecs unchanged - if you look at the callers on the socket side, you'll
> see a bunch that has to _copy_ iovec just to avoid it being buggered.
> And you get rather suboptimal behaviour in memcpy_fromiovec() and friends,
> exactly because you have to skip through the emptied elements.
> 
> IOW, no way in hell.

PS: I do have the beginning of that stuff sitting in the local queue since
April; see http://marc.info/?l=linux-xfs&m=139179304710494&w=2 for the
beginning of the story.

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: fs: Use non-const iov in aio_read/aio_write
  2014-11-03  0:16 ` Al Viro
  2014-11-03  0:21   ` Al Viro
@ 2014-11-03  0:22   ` Herbert Xu
  2014-11-03  0:45     ` Al Viro
  1 sibling, 1 reply; 82+ messages in thread
From: Herbert Xu @ 2014-11-03  0:22 UTC (permalink / raw)
  To: Al Viro
  Cc: David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise

On Mon, Nov 03, 2014 at 12:16:34AM +0000, Al Viro wrote:
> 
> NAK with extreme prejudice.  The right way to deal with that is
> to convert the socket side of things to iov_iter.  And give it a
> consistent behaviour, while we are at it (some protocols do advance
> the damn thing, some do not).  There are _very_ good reasons to have those
> iovecs unchanged - if you look at the callers on the socket side, you'll
> see a bunch that has to _copy_ iovec just to avoid it being buggered.
> And you get rather suboptimal behaviour in memcpy_fromiovec() and friends,
> exactly because you have to skip through the emptied elements.
> 
> IOW, no way in hell.

You're welcome to send patches to fix every spot in the network stack
that writes to the iovec.  But until the network stack is all fixed
up, having a const struct iovec in aio_read/aio_write is a delusion.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: fs: Use non-const iov in aio_read/aio_write
  2014-11-03  0:22   ` Herbert Xu
@ 2014-11-03  0:45     ` Al Viro
  2014-11-03  5:37       ` [0/3] net: Kill skb_copy_datagram_const_iovec Herbert Xu
  0 siblings, 1 reply; 82+ messages in thread
From: Al Viro @ 2014-11-03  0:45 UTC (permalink / raw)
  To: Herbert Xu
  Cc: David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise

On Mon, Nov 03, 2014 at 08:22:07AM +0800, Herbert Xu wrote:
> On Mon, Nov 03, 2014 at 12:16:34AM +0000, Al Viro wrote:
> > 
> > NAK with extreme prejudice.  The right way to deal with that is
> > to convert the socket side of things to iov_iter.  And give it a
> > consistent behaviour, while we are at it (some protocols do advance
> > the damn thing, some do not).  There are _very_ good reasons to have those
> > iovecs unchanged - if you look at the callers on the socket side, you'll
> > see a bunch that has to _copy_ iovec just to avoid it being buggered.
> > And you get rather suboptimal behaviour in memcpy_fromiovec() and friends,
> > exactly because you have to skip through the emptied elements.
> > 
> > IOW, no way in hell.
> 
> You're welcome to send patches to fix every spot in the network stack
> that writes to the iovec.  But until the network stack is all fixed
> up, having a const struct iovec in aio_read/aio_write is a delusion.

Check how many ->aio_read() and ->aio_write() instances are left.  If you
are implying that dealing with the ones in net/* is not feasible, I invite
you to check the situation in fs/*, where we used to have quite a few.
Compare it with what used to be there in e.g. January.

Note, BTW, that there's a damn good reason to convert the socket side of
things to iov_iter - as it is, ->splice_write() there is basically done with
page-by-page mapping and doing kernel_sendmsg(); being able to deal with
"map and copy" stuff *inside* ->sendmsg() would not only reduce the overhead,
it would allow us to get rid of ->sendpage() completely.  Basically, let
->sendmsg() instances check the iov_iter type and play zerocopy games if
it's an "array of kernel pages" kind.  Compare ->sendpage() and ->sendmsg()
instances for the protocols that have nontrivial ->sendpage(); you'll see
that there's a lot of duplication.  Merging them looks very feasible, with
divergence happening only very deep in the call chain.

^ permalink raw reply	[flat|nested] 82+ messages in thread

* [0/3] net: Kill skb_copy_datagram_const_iovec
  2014-11-03  0:45     ` Al Viro
@ 2014-11-03  5:37       ` Herbert Xu
  2014-11-03  5:44         ` [PATCH 1/3] tun: Modify const aio_read iovec per do_sock_read Herbert Xu
                           ` (3 more replies)
  0 siblings, 4 replies; 82+ messages in thread
From: Herbert Xu @ 2014-11-03  5:37 UTC (permalink / raw)
  To: Al Viro
  Cc: David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise

On Mon, Nov 03, 2014 at 12:45:03AM +0000, Al Viro wrote:
>
> Note, BTW, that there's a damn good reason to convert the socket side of
> things to iov_iter - as it is, ->splice_write() there is basically done with
> page-by-page mapping and doing kernel_sendmsg(); being able to deal with
> "map and copy" stuff *inside* ->sendmsg() would not only reduce the overhead,
> it would allow to get rid of ->sendpage() completely.  Basically, let
> ->sendmsg() instances check the iov_iter type and play zerocopy games if
> it's an "array of kernel pages" kind.  Compare ->sendpage() and ->sendmsg()
> instances for the protocols that have nontrivial ->sendpage(); you'll see
> that there's a lot of duplication.  Merging them looks very feasible, with
> divergence happening only very deep in the call chain.

Honestly I don't really care which way we end up going as long as
we pick one solution and stick with it.  Right now we have an
abomination in the form of skb_copy_datagram_const_iovec which is
the worst of both worlds, plus it duplicates tons of code.

So here are a few patches to kill this crap.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 82+ messages in thread

* [PATCH 1/3] tun: Modify const aio_read iovec per do_sock_read
  2014-11-03  5:37       ` [0/3] net: Kill skb_copy_datagram_const_iovec Herbert Xu
@ 2014-11-03  5:44         ` Herbert Xu
  2014-11-03  5:44         ` [PATCH 3/3] net: Kill skb_copy_datagram_const_iovec Herbert Xu
                           ` (2 subsequent siblings)
  3 siblings, 0 replies; 82+ messages in thread
From: Herbert Xu @ 2014-11-03  5:44 UTC (permalink / raw)
  To: Al Viro, David S. Miller, netdev, Linux Kernel Mailing List,
	Benjamin LaHaise

I started working on this patch after discovering the horror of
skb_copy_datagram_iovec and skb_copy_datagram_const_iovec.  It's
ridiculous to have two versions of the same thing.  Especially when
the reason they exist is because of a stupid disagreement between
fs and net on how we should iterate over iovecs.

To reiterate, fs wants to keep the iovecs themselves constant and
use iterators to keep state while net is used to keeping the state
within the iovecs.

Without judging the merits of either approach, we should stick to
one of them.  And regardless of which one we end up picking, we
can always kill skb_copy_datagram_const_iovec which is plain wrong
as it starts from the very beginning of the iovec every single time.

This patch uses the do_sock_read approach of casting the const away
for the time being.  If we end up going the other way we can trivially
convert this over to using iterators.  In the mean time this would at
least allow us to kill skb_copy_datagram_const_iovec.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 drivers/net/tun.c |   28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 9dd3746..657f811 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1230,11 +1230,11 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
 static ssize_t tun_put_user(struct tun_struct *tun,
 			    struct tun_file *tfile,
 			    struct sk_buff *skb,
-			    const struct iovec *iv, int len)
+			    struct iovec *iv, int len)
 {
 	struct tun_pi pi = { 0, skb->protocol };
 	ssize_t total = 0;
-	int vlan_offset = 0, copied;
+	int vlan_offset = 0;
 	int vlan_hlen = 0;
 	int vnet_hdr_sz = 0;
 
@@ -1244,16 +1244,18 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 	if (tun->flags & TUN_VNET_HDR)
 		vnet_hdr_sz = tun->vnet_hdr_sz;
 
+	total = skb->len + vlan_hlen + vnet_hdr_sz;
+
 	if (!(tun->flags & TUN_NO_PI)) {
 		if ((len -= sizeof(pi)) < 0)
 			return -EINVAL;
 
-		if (len < skb->len + vlan_hlen + vnet_hdr_sz) {
+		if (len < total) {
 			/* Packet will be striped */
 			pi.flags |= TUN_PKT_STRIP;
 		}
 
-		if (memcpy_toiovecend(iv, (void *) &pi, 0, sizeof(pi)))
+		if (memcpy_toiovec(iv, (void *)&pi, sizeof(pi)))
 			return -EFAULT;
 		total += sizeof(pi);
 	}
@@ -1299,15 +1301,11 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 			gso.flags = VIRTIO_NET_HDR_F_DATA_VALID;
 		} /* else everything is zero */
 
-		if (unlikely(memcpy_toiovecend(iv, (void *)&gso, total,
-					       sizeof(gso))))
+		if (unlikely(memcpy_toiovec(iv, (void *)&gso, sizeof(gso))))
 			return -EFAULT;
-		total += vnet_hdr_sz;
 	}
 
-	copied = total;
 	len = min_t(int, skb->len + vlan_hlen, len);
-	total += skb->len + vlan_hlen;
 	if (vlan_hlen) {
 		int copy, ret;
 		struct {
@@ -1321,21 +1319,19 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
 
 		copy = min_t(int, vlan_offset, len);
-		ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy);
+		ret = skb_copy_datagram_iovec(skb, 0, iv, copy);
 		len -= copy;
-		copied += copy;
 		if (ret || !len)
 			goto done;
 
 		copy = min_t(int, sizeof(veth), len);
-		ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy);
+		ret = memcpy_toiovec(iv, (void *)&veth, copy);
 		len -= copy;
-		copied += copy;
 		if (ret || !len)
 			goto done;
 	}
 
-	skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len);
+	skb_copy_datagram_iovec(skb, vlan_offset, iv, len);
 
 done:
 	tun->dev->stats.tx_packets++;
@@ -1345,7 +1341,7 @@ done:
 }
 
 static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
-			   const struct iovec *iv, ssize_t len, int noblock)
+			   struct iovec *iv, ssize_t len, int noblock)
 {
 	struct sk_buff *skb;
 	ssize_t ret = 0;
@@ -1387,7 +1383,7 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
 		goto out;
 	}
 
-	ret = tun_do_read(tun, tfile, iv, len,
+	ret = tun_do_read(tun, tfile, (struct iovec *)iv, len,
 			  file->f_flags & O_NONBLOCK);
 	ret = min_t(ssize_t, ret, len);
 	if (ret > 0)

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* [PATCH 2/3] macvtap: Modify const aio_read iovec per do_sock_read
  2014-11-03  5:37       ` [0/3] net: Kill skb_copy_datagram_const_iovec Herbert Xu
  2014-11-03  5:44         ` [PATCH 1/3] tun: Modify const aio_read iovec per do_sock_read Herbert Xu
  2014-11-03  5:44         ` [PATCH 3/3] net: Kill skb_copy_datagram_const_iovec Herbert Xu
@ 2014-11-03  5:44         ` Herbert Xu
  2014-11-03 20:05         ` [0/3] net: Kill skb_copy_datagram_const_iovec David Miller
  3 siblings, 0 replies; 82+ messages in thread
From: Herbert Xu @ 2014-11-03  5:44 UTC (permalink / raw)
  To: Al Viro, David S. Miller, netdev, Linux Kernel Mailing List,
	Benjamin LaHaise

I started working on this patch after discovering the horror of
skb_copy_datagram_iovec and skb_copy_datagram_const_iovec.  It's
ridiculous to have two versions of the same thing.  Especially when
the reason they exist is because of a stupid disagreement between
fs and net on how we should iterate over iovecs.

To reiterate, fs wants to keep the iovecs themselves constant and
use iterators to keep state while net is used to keeping the state
within the iovecs.

Without judging the merits of either approach, we should stick to
one of them.  And regardless of which one we end up picking, we
can always kill skb_copy_datagram_const_iovec which is plain wrong
as it starts from the very beginning of the iovec every single time.

This patch uses the do_sock_read approach of casting the const away
for the time being.  If we end up going the other way we can trivially
convert this over to using iterators.  In the mean time this would at
least allow us to kill skb_copy_datagram_const_iovec.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 drivers/net/macvtap.c |   21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 6f226de..d830e25 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -776,12 +776,12 @@ static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
 /* Put packet to the user space buffer */
 static ssize_t macvtap_put_user(struct macvtap_queue *q,
 				const struct sk_buff *skb,
-				const struct iovec *iv, int len)
+				struct iovec *iv, int len)
 {
 	int ret;
 	int vnet_hdr_len = 0;
 	int vlan_offset = 0;
-	int copied, total;
+	int total;
 
 	if (q->flags & IFF_VNET_HDR) {
 		struct virtio_net_hdr vnet_hdr;
@@ -791,10 +791,10 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q,
 
 		macvtap_skb_to_vnet_hdr(skb, &vnet_hdr);
 
-		if (memcpy_toiovecend(iv, (void *)&vnet_hdr, 0, sizeof(vnet_hdr)))
+		if (memcpy_toiovec(iv, (void *)&vnet_hdr, sizeof(vnet_hdr)))
 			return -EFAULT;
 	}
-	total = copied = vnet_hdr_len;
+	total = vnet_hdr_len;
 	total += skb->len;
 
 	if (!vlan_tx_tag_present(skb))
@@ -813,28 +813,26 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q,
 		total += VLAN_HLEN;
 
 		copy = min_t(int, vlan_offset, len);
-		ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy);
+		ret = skb_copy_datagram_iovec(skb, 0, iv, copy);
 		len -= copy;
-		copied += copy;
 		if (ret || !len)
 			goto done;
 
 		copy = min_t(int, sizeof(veth), len);
-		ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy);
+		ret = memcpy_toiovec(iv, (void *)&veth, copy);
 		len -= copy;
-		copied += copy;
 		if (ret || !len)
 			goto done;
 	}
 
-	ret = skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len);
+	ret = skb_copy_datagram_iovec(skb, vlan_offset, iv, len);
 
 done:
 	return ret ? ret : total;
 }
 
 static ssize_t macvtap_do_read(struct macvtap_queue *q,
-			       const struct iovec *iv, unsigned long len,
+			       struct iovec *iv, unsigned long len,
 			       int noblock)
 {
 	DEFINE_WAIT(wait);
@@ -884,7 +882,8 @@ static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
 		goto out;
 	}
 
-	ret = macvtap_do_read(q, iv, len, file->f_flags & O_NONBLOCK);
+	ret = macvtap_do_read(q, (struct iovec *)iv, len,
+			      file->f_flags & O_NONBLOCK);
 	ret = min_t(ssize_t, ret, len);
 	if (ret > 0)
 		iocb->ki_pos = ret;

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* [PATCH 3/3] net: Kill skb_copy_datagram_const_iovec
  2014-11-03  5:37       ` [0/3] net: Kill skb_copy_datagram_const_iovec Herbert Xu
  2014-11-03  5:44         ` [PATCH 1/3] tun: Modify const aio_read iovec per do_sock_read Herbert Xu
@ 2014-11-03  5:44         ` Herbert Xu
  2014-11-03  5:44         ` [PATCH 2/3] macvtap: Modify const aio_read iovec per do_sock_read Herbert Xu
  2014-11-03 20:05         ` [0/3] net: Kill skb_copy_datagram_const_iovec David Miller
  3 siblings, 0 replies; 82+ messages in thread
From: Herbert Xu @ 2014-11-03  5:44 UTC (permalink / raw)
  To: Al Viro, David S. Miller, netdev, Linux Kernel Mailing List,
	Benjamin LaHaise

Now that both macvtap and tun are using skb_copy_datagram_iovec, we
can kill the abomination that is skb_copy_datagram_const_iovec.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 include/linux/skbuff.h |    3 -
 net/core/datagram.c    |   89 -------------------------------------------------
 2 files changed, 92 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 6c8b6f6..d12c81b 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2638,9 +2638,6 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
 				 int len);
 int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *frm,
 			   int offset, size_t count);
-int skb_copy_datagram_const_iovec(const struct sk_buff *from, int offset,
-				  const struct iovec *to, int to_offset,
-				  int size);
 void skb_free_datagram(struct sock *sk, struct sk_buff *skb);
 void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb);
 int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index fdbc9a8..30e2ebd 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -393,95 +393,6 @@ fault:
 EXPORT_SYMBOL(skb_copy_datagram_iovec);
 
 /**
- *	skb_copy_datagram_const_iovec - Copy a datagram to an iovec.
- *	@skb: buffer to copy
- *	@offset: offset in the buffer to start copying from
- *	@to: io vector to copy to
- *	@to_offset: offset in the io vector to start copying to
- *	@len: amount of data to copy from buffer to iovec
- *
- *	Returns 0 or -EFAULT.
- *	Note: the iovec is not modified during the copy.
- */
-int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset,
-				  const struct iovec *to, int to_offset,
-				  int len)
-{
-	int start = skb_headlen(skb);
-	int i, copy = start - offset;
-	struct sk_buff *frag_iter;
-
-	/* Copy header. */
-	if (copy > 0) {
-		if (copy > len)
-			copy = len;
-		if (memcpy_toiovecend(to, skb->data + offset, to_offset, copy))
-			goto fault;
-		if ((len -= copy) == 0)
-			return 0;
-		offset += copy;
-		to_offset += copy;
-	}
-
-	/* Copy paged appendix. Hmm... why does this look so complicated? */
-	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
-		int end;
-		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-
-		WARN_ON(start > offset + len);
-
-		end = start + skb_frag_size(frag);
-		if ((copy = end - offset) > 0) {
-			int err;
-			u8  *vaddr;
-			struct page *page = skb_frag_page(frag);
-
-			if (copy > len)
-				copy = len;
-			vaddr = kmap(page);
-			err = memcpy_toiovecend(to, vaddr + frag->page_offset +
-						offset - start, to_offset, copy);
-			kunmap(page);
-			if (err)
-				goto fault;
-			if (!(len -= copy))
-				return 0;
-			offset += copy;
-			to_offset += copy;
-		}
-		start = end;
-	}
-
-	skb_walk_frags(skb, frag_iter) {
-		int end;
-
-		WARN_ON(start > offset + len);
-
-		end = start + frag_iter->len;
-		if ((copy = end - offset) > 0) {
-			if (copy > len)
-				copy = len;
-			if (skb_copy_datagram_const_iovec(frag_iter,
-							  offset - start,
-							  to, to_offset,
-							  copy))
-				goto fault;
-			if ((len -= copy) == 0)
-				return 0;
-			offset += copy;
-			to_offset += copy;
-		}
-		start = end;
-	}
-	if (!len)
-		return 0;
-
-fault:
-	return -EFAULT;
-}
-EXPORT_SYMBOL(skb_copy_datagram_const_iovec);
-
-/**
  *	skb_copy_datagram_from_iovec - Copy a datagram from an iovec.
  *	@skb: buffer to copy
  *	@offset: offset in the buffer to start copying to

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* Re: [0/3] net: Kill skb_copy_datagram_const_iovec
  2014-11-03  5:37       ` [0/3] net: Kill skb_copy_datagram_const_iovec Herbert Xu
                           ` (2 preceding siblings ...)
  2014-11-03  5:44         ` [PATCH 2/3] macvtap: Modify const aio_read iovec per do_sock_read Herbert Xu
@ 2014-11-03 20:05         ` David Miller
  2014-11-04  3:38           ` Herbert Xu
  2014-11-04  5:45           ` [0/3] " Al Viro
  3 siblings, 2 replies; 82+ messages in thread
From: David Miller @ 2014-11-03 20:05 UTC (permalink / raw)
  To: herbert; +Cc: viro, netdev, linux-kernel, bcrl

From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Mon, 3 Nov 2014 13:37:51 +0800

> On Mon, Nov 03, 2014 at 12:45:03AM +0000, Al Viro wrote:
>>
>> Note, BTW, that there's a damn good reason to convert the socket side of
>> things to iov_iter - as it is, ->splice_write() there is basically done with
>> page-by-page mapping and doing kernel_sendmsg(); being able to deal with
>> "map and copy" stuff *inside* ->sendmsg() would not only reduce the overhead,
>> it would allow to get rid of ->sendpage() completely.  Basically, let
>> ->sendmsg() instances check the iov_iter type and play zerocopy games if
>> it's an "array of kernel pages" kind.  Compare ->sendpage() and ->sendmsg()
>> instances for the protocols that have nontrivial ->sendpage(); you'll see
>> that there's a lot of duplication.  Merging them looks very feasible, with
>> divergence happening only very deep in the call chain.
> 
> Honestly I don't really care which way we end up going as long as
> we pick one solution and stick with it.  Right now we have an
> abomination in the form of skb_copy_datagram_const_iovec which is
> the worst of both worlds, plus it duplicates tons of code.
> 
> So here's a few patches to kill this crap.

To pick one direction and go with it, I totally agree with.

But a patch set like this as an interim solution, I am not so happy
with.

If the method says const, we have a contract with the caller to not
modify the iovec.  That caller can assume that we have not done so.

So this patch set violated that contract and can result in real bugs
either now or in the future.

I'll see if I can make some progress converting the networking over
to iov_iter.  It can't be that difficult... albeit perhaps a little
time consuming.

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [0/3] net: Kill skb_copy_datagram_const_iovec
  2014-11-03 20:05         ` [0/3] net: Kill skb_copy_datagram_const_iovec David Miller
@ 2014-11-04  3:38           ` Herbert Xu
  2014-11-04  8:31             ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Herbert Xu
                               ` (3 more replies)
  2014-11-04  5:45           ` [0/3] " Al Viro
  1 sibling, 4 replies; 82+ messages in thread
From: Herbert Xu @ 2014-11-04  3:38 UTC (permalink / raw)
  To: David Miller; +Cc: viro, netdev, linux-kernel, bcrl

On Mon, Nov 03, 2014 at 03:05:53PM -0500, David Miller wrote:
>
> I'll see if I can make some progress converting the networking over
> to iov_iter.  It can't be that difficult... albeit perhaps a little
> time consuming.

OK great.  I'll try to convert tun/macvtap over to iov_iter.

Thanks,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [0/3] net: Kill skb_copy_datagram_const_iovec
  2014-11-03 20:05         ` [0/3] net: Kill skb_copy_datagram_const_iovec David Miller
  2014-11-04  3:38           ` Herbert Xu
@ 2014-11-04  5:45           ` Al Viro
  2014-11-05  1:53             ` Al Viro
  1 sibling, 1 reply; 82+ messages in thread
From: Al Viro @ 2014-11-04  5:45 UTC (permalink / raw)
  To: David Miller; +Cc: herbert, netdev, linux-kernel, bcrl

On Mon, Nov 03, 2014 at 03:05:53PM -0500, David Miller wrote:

> I'll see if I can make some progress converting the networking over
> to iov_iter.  It can't be that difficult... albeit perhaps a little
> time consuming.

FWIW, I have a queue that got started back in April; basically, the plan
of attack was
	* separate kernel-side and userland msghdr.
	* localize ->msg_iov uses - most of that gets taken care of by
several new helpers, as in
    new helper: skb_copy_datagram_msg()
    
    Absolute majority of skb_copy_datagram_iovec() callers (49 out of 56)
    are passing it msg->msg_iov as iovec.  Provide a trivial wrapper that
    takes msg as argument instead of iovec.
and several like that (the numbers in the above are probably incorrect these
days - it was done more than half a year ago).
	* switch kernel-side msghdr to iov_iter.  That means diverging
layouts; it's really not hard, since we have copying of msghdr from
userland already localized.  Initially - just a mechanical conversion
(i.e. direct uses of iov_iter fields instead of ->msg_iov/->msg_iovlen;
note that after the introduction of wrappers the number of such places
is very much reduced).
	* start converting those relatively few places to iov_iter primitives.

And that's where it got stalled, since we have to deal with expectations
of callers.  Syscall ones are trivial; that's not a problem.  Unfortunately,
there are kernel_{send,recv}msg() users, and those do care about the state the
iovec is left in.  Strictly speaking, the state of iovec after e.g.
->sendmsg() is undefined.  And it's not just protocol-dependent - unless
I'm seriously misreading it, tcp_sendmsg() ends up modifying iovec in case
when it hits tcp_send_rcvq(), while in the normal case it leaves iovec
unmodified.  So in general you need to feed ->{send,recv}msg() a throwaway copy
of iovec.  Leads to wonders like
        /* NB we can't trust socket ops to either consume our iovs
         * or leave them alone. */
        LASSERT (niov > 0);

        for (nob = i = 0; i < niov; i++) {
                scratchiov[i] = iov[i];
                nob += scratchiov[i].iov_len;
        }
        LASSERT (nob <= conn->ksnc_rx_nob_wanted);

        rc = kernel_recvmsg(conn->ksnc_sock, &msg,
                (struct kvec *)scratchiov, niov, nob, MSG_DONTWAIT);
etc.  However, there are places that don't bother and do this:
        while (total_rx < data) {
                rx_loop = kernel_recvmsg(conn->sock, &msg, iov_p, iov_len,
                                        (data - total_rx), MSG_WAITALL);
                if (rx_loop <= 0) {
                        pr_debug("rx_loop: %d total_rx: %d\n",
                                rx_loop, total_rx); 
                        return rx_loop;
                }
                total_rx += rx_loop;
                pr_debug("rx_loop: %d, total_rx: %d, data: %d\n",
                                rx_loop, total_rx, data);
        }
(that's iscsit_do_rx_data()).  Maybe it's a bug; maybe it's relying on
specific behaviour of the protocol known to be used - this code clearly
expects recvmsg to advance iovec, which seems to depend only on the
protocol.  At the moment.  In any case, it's very brittle...

Hell knows; I hadn't finished digging through that zoo - got sidetracked back
then.  *IF* all such places either use a throwaway copy or assume that iovec
gets modified, we can do the following: switch the access to iovecs to
iov_iter primitives, with the first kind of callers creating a throwaway
iov_iter and the second just feeding the same iov_iter to e.g.
kernel_recvmsg().  iovec will remain constant, iov_iter will be advanced.
Moreover, in a lot of cases callers of the first kind will be able to get rid
of the throwaway iov_iter (and of manually advancing it), effectively converting
to the second kind.

If we have places that currently rely on iovec remaining unchanged (i.e.
manually advancing it after kernel_{send,recv}msg()), the series will be
more painful ;-/  I very much hope that no such places exist...

FWIW, there is also a tactical question that needs to be dealt with.  We
can, of course, start with renaming the "kernel-side" msghdr (i.e. post
copy_msghdr_from_user()/get_compat_msghdr()) to struct kmsghdr.  OTOH,
that's a _lot_ of churn for very little reason - most of the instances
in the tree are of that kind.  So I did it the other way round - introduced
struct user_msghdr (only in linux/socket.h; note that we do *not* have
struct msghdr in uabi/linux/socket.h, or anywhere else in uabi/*),
made the syscalls take pointers to it and (initially) rely upon the identical
layouts in copy_msghdr_from_user(); once we put iov_iter into kernel-side
msghdr, we'll just do it like get_compat_msghdr() does.

Is that acceptable?  It would greatly reduce the amount of churn in net/* -
we don't need to pass iov_iter separately and most of the functions in
the middle of call chains are completely unchanged.  Only the originators
of ->sendmsg()/->recvmsg() and the places doing actual data copying
need to be touched.  OTOH, it makes for kernel struct msghdr looking
odd - instead of normal ->msg_iov and ->msg_iovlen it would have
->msg_iov_iter, with ->sendmsg()/->recvmsg() callers needing to set it
up...  OTTH, the things *are* odd from userland programmer POV - sendmsg(2)
and recvmsg(2) leave the iovec unchanged, and having it changed unpredictably
in the kernel-side counterparts seems to make for a nasty trap.  Certainly
makes for a bunch of nasty comments in the code using those...

Comments?

^ permalink raw reply	[flat|nested] 82+ messages in thread

* [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-04  3:38           ` Herbert Xu
@ 2014-11-04  8:31             ` Herbert Xu
  2014-11-04 14:32               ` Al Viro
  2014-11-05 20:24               ` David Miller
  2014-11-04  8:31             ` [PATCH 2/4] tun: Use iovec iterators Herbert Xu
                               ` (2 subsequent siblings)
  3 siblings, 2 replies; 82+ messages in thread
From: Herbert Xu @ 2014-11-04  8:31 UTC (permalink / raw)
  To: Al Viro, David S. Miller, netdev, Linux Kernel Mailing List,
	Benjamin LaHaise

This patch adds skb_copy_datagram_iter, which is identical to
skb_copy_datagram_iovec except that it operates on iov_iter
instead of iovec.

Eventually all users of skb_copy_datagram_iovec should switch
over to iov_iter and then we can remove skb_copy_datagram_iovec.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 include/linux/skbuff.h |    3 +
 net/core/datagram.c    |   82 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 85 insertions(+)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 6c8b6f6..5ff7054 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -148,6 +148,7 @@
 struct net_device;
 struct scatterlist;
 struct pipe_inode_info;
+struct iov_iter;
 
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 struct nf_conntrack {
@@ -2641,6 +2642,8 @@ int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *frm,
 int skb_copy_datagram_const_iovec(const struct sk_buff *from, int offset,
 				  const struct iovec *to, int to_offset,
 				  int size);
+int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
+			   struct iov_iter *to, int size);
 void skb_free_datagram(struct sock *sk, struct sk_buff *skb);
 void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb);
 int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index fdbc9a8..45a9d4d 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -49,6 +49,7 @@
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/pagemap.h>
+#include <linux/uio.h>
 
 #include <net/protocol.h>
 #include <linux/skbuff.h>
@@ -482,6 +483,87 @@ fault:
 EXPORT_SYMBOL(skb_copy_datagram_const_iovec);
 
 /**
+ *	skb_copy_datagram_iter - Copy a datagram to an iovec iterator.
+ *	@skb: buffer to copy
+ *	@offset: offset in the buffer to start copying from
+ *	@to: iovec iterator to copy to
+ *	@len: amount of data to copy from buffer to iovec
+ */
+int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
+			   struct iov_iter *to, int len)
+{
+	int start = skb_headlen(skb);
+	int i, copy = start - offset;
+	struct sk_buff *frag_iter;
+
+	trace_skb_copy_datagram_iovec(skb, len);
+
+	/* Copy header. */
+	if (copy > 0) {
+		if (copy > len)
+			copy = len;
+		if (copy_to_iter(skb->data + offset, copy, to))
+			goto fault;
+		if ((len -= copy) == 0)
+			return 0;
+		offset += copy;
+	}
+
+	/* Copy paged appendix. Hmm... why does this look so complicated? */
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		int end;
+		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+		WARN_ON(start > offset + len);
+
+		end = start + skb_frag_size(frag);
+		if ((copy = end - offset) > 0) {
+			int err;
+			u8  *vaddr;
+			struct page *page = skb_frag_page(frag);
+
+			if (copy > len)
+				copy = len;
+			vaddr = kmap(page);
+			err = copy_to_iter(vaddr + frag->page_offset +
+					   offset - start, copy, to);
+			kunmap(page);
+			if (err)
+				goto fault;
+			if (!(len -= copy))
+				return 0;
+			offset += copy;
+		}
+		start = end;
+	}
+
+	skb_walk_frags(skb, frag_iter) {
+		int end;
+
+		WARN_ON(start > offset + len);
+
+		end = start + frag_iter->len;
+		if ((copy = end - offset) > 0) {
+			if (copy > len)
+				copy = len;
+			if (skb_copy_datagram_iter(frag_iter, offset - start,
+						   to, copy))
+				goto fault;
+			if ((len -= copy) == 0)
+				return 0;
+			offset += copy;
+		}
+		start = end;
+	}
+	if (!len)
+		return 0;
+
+fault:
+	return -EFAULT;
+}
+EXPORT_SYMBOL(skb_copy_datagram_iter);
+
+/**
  *	skb_copy_datagram_from_iovec - Copy a datagram from an iovec.
  *	@skb: buffer to copy
  *	@offset: offset in the buffer to start copying to

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* [PATCH 2/4] tun: Use iovec iterators
  2014-11-04  3:38           ` Herbert Xu
  2014-11-04  8:31             ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Herbert Xu
@ 2014-11-04  8:31             ` Herbert Xu
  2014-11-04  8:37               ` Herbert Xu
  2014-11-04  8:31             ` [PATCH 3/4] macvtap: " Herbert Xu
  2014-11-04  8:31             ` [PATCH 4/4] net: Kill skb_copy_datagram_const_iovec Herbert Xu
  3 siblings, 1 reply; 82+ messages in thread
From: Herbert Xu @ 2014-11-04  8:31 UTC (permalink / raw)
  To: Al Viro, David S. Miller, netdev, Linux Kernel Mailing List,
	Benjamin LaHaise

This patch removes the use of skb_copy_datagram_const_iovec in
favour of the iovec iterator-based skb_copy_datagram_iter.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 drivers/net/tun.c |   65 +++++++++++++++++++++++++-----------------------------
 1 file changed, 31 insertions(+), 34 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 9dd3746..cfb81ca 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -71,6 +71,7 @@
 #include <net/rtnetlink.h>
 #include <net/sock.h>
 #include <linux/seq_file.h>
+#include <linux/uio.h>
 
 #include <asm/uaccess.h>
 
@@ -1230,11 +1231,11 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
 static ssize_t tun_put_user(struct tun_struct *tun,
 			    struct tun_file *tfile,
 			    struct sk_buff *skb,
-			    const struct iovec *iv, int len)
+			    struct iov_iter *iter)
 {
 	struct tun_pi pi = { 0, skb->protocol };
-	ssize_t total = 0;
-	int vlan_offset = 0, copied;
+	ssize_t total;
+	int vlan_offset;
 	int vlan_hlen = 0;
 	int vnet_hdr_sz = 0;
 
@@ -1244,23 +1245,25 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 	if (tun->flags & TUN_VNET_HDR)
 		vnet_hdr_sz = tun->vnet_hdr_sz;
 
+	total = skb->len + vlan_hlen + vnet_hdr_sz;
+
 	if (!(tun->flags & TUN_NO_PI)) {
-		if ((len -= sizeof(pi)) < 0)
+		if (iov_iter_count(iter) < sizeof(pi))
 			return -EINVAL;
 
-		if (len < skb->len + vlan_hlen + vnet_hdr_sz) {
+		if (iov_iter_count(iter) < total) {
 			/* Packet will be striped */
 			pi.flags |= TUN_PKT_STRIP;
 		}
 
-		if (memcpy_toiovecend(iv, (void *) &pi, 0, sizeof(pi)))
+		if (copy_to_iter(&pi, sizeof(pi), iter))
 			return -EFAULT;
 		total += sizeof(pi);
 	}
 
 	if (vnet_hdr_sz) {
 		struct virtio_net_hdr gso = { 0 }; /* no info leak */
-		if ((len -= vnet_hdr_sz) < 0)
+		if (iov_iter_count(iter) < vnet_hdr_sz)
 			return -EINVAL;
 
 		if (skb_is_gso(skb)) {
@@ -1299,17 +1302,12 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 			gso.flags = VIRTIO_NET_HDR_F_DATA_VALID;
 		} /* else everything is zero */
 
-		if (unlikely(memcpy_toiovecend(iv, (void *)&gso, total,
-					       sizeof(gso))))
+		if (copy_to_iter(&gso, sizeof(gso), iter))
 			return -EFAULT;
-		total += vnet_hdr_sz;
 	}
 
-	copied = total;
-	len = min_t(int, skb->len + vlan_hlen, len);
-	total += skb->len + vlan_hlen;
 	if (vlan_hlen) {
-		int copy, ret;
+		int ret;
 		struct {
 			__be16 h_vlan_proto;
 			__be16 h_vlan_TCI;
@@ -1320,36 +1318,34 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 
 		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
 
-		copy = min_t(int, vlan_offset, len);
-		ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy);
-		len -= copy;
-		copied += copy;
-		if (ret || !len)
+		ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset);
+		if (ret || !iov_iter_count(iter))
 			goto done;
 
-		copy = min_t(int, sizeof(veth), len);
-		ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy);
-		len -= copy;
-		copied += copy;
-		if (ret || !len)
+		ret = copy_to_iter(&veth, sizeof(veth), iter);
+		if (ret || !iov_iter_count(iter))
 			goto done;
+
+		__skb_pull(skb, vlan_offset);
 	}
 
-	skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len);
+	skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset);
 
 done:
 	tun->dev->stats.tx_packets++;
-	tun->dev->stats.tx_bytes += len;
+	tun->dev->stats.tx_bytes += skb->len + vlan_hlen;
 
 	return total;
 }
 
 static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
-			   const struct iovec *iv, ssize_t len, int noblock)
+			   const struct iovec *iv, unsigned long segs,
+			   ssize_t len, int noblock)
 {
 	struct sk_buff *skb;
 	ssize_t ret = 0;
 	int peeked, err, off = 0;
+	struct iov_iter iter;
 
 	tun_debug(KERN_INFO, tun, "tun_do_read\n");
 
@@ -1362,11 +1358,12 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
 	/* Read frames from queue */
 	skb = __skb_recv_datagram(tfile->socket.sk, noblock ? MSG_DONTWAIT : 0,
 				  &peeked, &off, &err);
-	if (skb) {
-		ret = tun_put_user(tun, tfile, skb, iv, len);
-		kfree_skb(skb);
-	} else
-		ret = err;
+	if (!skb)
+		return ret;
+
+	iov_iter_init(&iter, READ, iv, segs, len);
+	ret = tun_put_user(tun, tfile, skb, &iter);
+	kfree_skb(skb);
 
 	return ret;
 }
@@ -1387,7 +1384,7 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
 		goto out;
 	}
 
-	ret = tun_do_read(tun, tfile, iv, len,
+	ret = tun_do_read(tun, tfile, iv, count, len,
 			  file->f_flags & O_NONBLOCK);
 	ret = min_t(ssize_t, ret, len);
 	if (ret > 0)
@@ -1488,7 +1485,7 @@ static int tun_recvmsg(struct kiocb *iocb, struct socket *sock,
 					 SOL_PACKET, TUN_TX_TIMESTAMP);
 		goto out;
 	}
-	ret = tun_do_read(tun, tfile, m->msg_iov, total_len,
+	ret = tun_do_read(tun, tfile, m->msg_iov, m->msg_iovlen, total_len,
 			  flags & MSG_DONTWAIT);
 	if (ret > total_len) {
 		m->msg_flags |= MSG_TRUNC;

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* [PATCH 3/4] macvtap: Use iovec iterators
  2014-11-04  3:38           ` Herbert Xu
  2014-11-04  8:31             ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Herbert Xu
  2014-11-04  8:31             ` [PATCH 2/4] tun: Use iovec iterators Herbert Xu
@ 2014-11-04  8:31             ` Herbert Xu
  2014-11-04  8:31             ` [PATCH 4/4] net: Kill skb_copy_datagram_const_iovec Herbert Xu
  3 siblings, 0 replies; 82+ messages in thread
From: Herbert Xu @ 2014-11-04  8:31 UTC (permalink / raw)
  To: Al Viro, David S. Miller, netdev, Linux Kernel Mailing List,
	Benjamin LaHaise

This patch removes the use of skb_copy_datagram_const_iovec in
favour of the iovec iterator-based skb_copy_datagram_iter.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 drivers/net/macvtap.c |   45 ++++++++++++++++++++-------------------------
 1 file changed, 20 insertions(+), 25 deletions(-)

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 880cc09..a0e1dd7 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -15,6 +15,7 @@
 #include <linux/cdev.h>
 #include <linux/idr.h>
 #include <linux/fs.h>
+#include <linux/uio.h>
 
 #include <net/ipv6.h>
 #include <net/net_namespace.h>
@@ -778,31 +779,28 @@ static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
 /* Put packet to the user space buffer */
 static ssize_t macvtap_put_user(struct macvtap_queue *q,
 				const struct sk_buff *skb,
-				const struct iovec *iv, int len)
+				struct iov_iter *iter)
 {
 	int ret;
 	int vnet_hdr_len = 0;
 	int vlan_offset = 0;
-	int copied, total;
+	int total;
 
 	if (q->flags & IFF_VNET_HDR) {
 		struct virtio_net_hdr vnet_hdr;
 		vnet_hdr_len = q->vnet_hdr_sz;
-		if ((len -= vnet_hdr_len) < 0)
+		if (iov_iter_count(iter) < vnet_hdr_len)
 			return -EINVAL;
 
 		macvtap_skb_to_vnet_hdr(skb, &vnet_hdr);
 
-		if (memcpy_toiovecend(iv, (void *)&vnet_hdr, 0, sizeof(vnet_hdr)))
+		if (copy_to_iter(&vnet_hdr, sizeof(vnet_hdr), iter))
 			return -EFAULT;
 	}
-	total = copied = vnet_hdr_len;
+	total = vnet_hdr_len;
 	total += skb->len;
 
-	if (!vlan_tx_tag_present(skb))
-		len = min_t(int, skb->len, len);
-	else {
-		int copy;
+	if (vlan_tx_tag_present(skb)) {
 		struct {
 			__be16 h_vlan_proto;
 			__be16 h_vlan_TCI;
@@ -811,37 +809,33 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q,
 		veth.h_vlan_TCI = htons(vlan_tx_tag_get(skb));
 
 		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
-		len = min_t(int, skb->len + VLAN_HLEN, len);
 		total += VLAN_HLEN;
 
-		copy = min_t(int, vlan_offset, len);
-		ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy);
-		len -= copy;
-		copied += copy;
-		if (ret || !len)
+		ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset);
+		if (ret || !iov_iter_count(iter))
 			goto done;
 
-		copy = min_t(int, sizeof(veth), len);
-		ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy);
-		len -= copy;
-		copied += copy;
-		if (ret || !len)
+		ret = copy_to_iter(&veth, sizeof(veth), iter);
+		if (ret || !iov_iter_count(iter))
 			goto done;
 	}
 
-	ret = skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len);
+	ret = skb_copy_datagram_iter(skb, vlan_offset, iter,
+				     skb->len - vlan_offset);
 
 done:
 	return ret ? ret : total;
 }
 
 static ssize_t macvtap_do_read(struct macvtap_queue *q,
-			       const struct iovec *iv, unsigned long len,
+			       const struct iovec *iv, unsigned long segs,
+			       unsigned long len,
 			       int noblock)
 {
 	DEFINE_WAIT(wait);
 	struct sk_buff *skb;
 	ssize_t ret = 0;
+	struct iov_iter iter;
 
 	while (len) {
 		if (!noblock)
@@ -863,7 +857,8 @@ static ssize_t macvtap_do_read(struct macvtap_queue *q,
 			schedule();
 			continue;
 		}
-		ret = macvtap_put_user(q, skb, iv, len);
+		iov_iter_init(&iter, READ, iv, segs, len);
+		ret = macvtap_put_user(q, skb, &iter);
 		kfree_skb(skb);
 		break;
 	}
@@ -886,7 +881,7 @@ static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
 		goto out;
 	}
 
-	ret = macvtap_do_read(q, iv, len, file->f_flags & O_NONBLOCK);
+	ret = macvtap_do_read(q, iv, count, len, file->f_flags & O_NONBLOCK);
 	ret = min_t(ssize_t, ret, len);
 	if (ret > 0)
 		iocb->ki_pos = ret;
@@ -1117,7 +1112,7 @@ static int macvtap_recvmsg(struct kiocb *iocb, struct socket *sock,
 	int ret;
 	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
 		return -EINVAL;
-	ret = macvtap_do_read(q, m->msg_iov, total_len,
+	ret = macvtap_do_read(q, m->msg_iov, m->msg_iovlen, total_len,
 			  flags & MSG_DONTWAIT);
 	if (ret > total_len) {
 		m->msg_flags |= MSG_TRUNC;

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* [PATCH 4/4] net: Kill skb_copy_datagram_const_iovec
  2014-11-04  3:38           ` Herbert Xu
                               ` (2 preceding siblings ...)
  2014-11-04  8:31             ` [PATCH 3/4] macvtap: " Herbert Xu
@ 2014-11-04  8:31             ` Herbert Xu
  3 siblings, 0 replies; 82+ messages in thread
From: Herbert Xu @ 2014-11-04  8:31 UTC (permalink / raw)
  To: Al Viro, David S. Miller, netdev, Linux Kernel Mailing List,
	Benjamin LaHaise

Now that both macvtap and tun are using skb_copy_datagram_iter, we
can kill the abomination that is skb_copy_datagram_const_iovec.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 include/linux/skbuff.h |    3 -
 net/core/datagram.c    |   89 -------------------------------------------------
 2 files changed, 92 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 5ff7054..dfd8623 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2639,9 +2639,6 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
 				 int len);
 int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *frm,
 			   int offset, size_t count);
-int skb_copy_datagram_const_iovec(const struct sk_buff *from, int offset,
-				  const struct iovec *to, int to_offset,
-				  int size);
 int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
 			   struct iov_iter *to, int size);
 void skb_free_datagram(struct sock *sk, struct sk_buff *skb);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 45a9d4d..93054b9 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -394,95 +394,6 @@ fault:
 EXPORT_SYMBOL(skb_copy_datagram_iovec);
 
 /**
- *	skb_copy_datagram_const_iovec - Copy a datagram to an iovec.
- *	@skb: buffer to copy
- *	@offset: offset in the buffer to start copying from
- *	@to: io vector to copy to
- *	@to_offset: offset in the io vector to start copying to
- *	@len: amount of data to copy from buffer to iovec
- *
- *	Returns 0 or -EFAULT.
- *	Note: the iovec is not modified during the copy.
- */
-int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset,
-				  const struct iovec *to, int to_offset,
-				  int len)
-{
-	int start = skb_headlen(skb);
-	int i, copy = start - offset;
-	struct sk_buff *frag_iter;
-
-	/* Copy header. */
-	if (copy > 0) {
-		if (copy > len)
-			copy = len;
-		if (memcpy_toiovecend(to, skb->data + offset, to_offset, copy))
-			goto fault;
-		if ((len -= copy) == 0)
-			return 0;
-		offset += copy;
-		to_offset += copy;
-	}
-
-	/* Copy paged appendix. Hmm... why does this look so complicated? */
-	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
-		int end;
-		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-
-		WARN_ON(start > offset + len);
-
-		end = start + skb_frag_size(frag);
-		if ((copy = end - offset) > 0) {
-			int err;
-			u8  *vaddr;
-			struct page *page = skb_frag_page(frag);
-
-			if (copy > len)
-				copy = len;
-			vaddr = kmap(page);
-			err = memcpy_toiovecend(to, vaddr + frag->page_offset +
-						offset - start, to_offset, copy);
-			kunmap(page);
-			if (err)
-				goto fault;
-			if (!(len -= copy))
-				return 0;
-			offset += copy;
-			to_offset += copy;
-		}
-		start = end;
-	}
-
-	skb_walk_frags(skb, frag_iter) {
-		int end;
-
-		WARN_ON(start > offset + len);
-
-		end = start + frag_iter->len;
-		if ((copy = end - offset) > 0) {
-			if (copy > len)
-				copy = len;
-			if (skb_copy_datagram_const_iovec(frag_iter,
-							  offset - start,
-							  to, to_offset,
-							  copy))
-				goto fault;
-			if ((len -= copy) == 0)
-				return 0;
-			offset += copy;
-			to_offset += copy;
-		}
-		start = end;
-	}
-	if (!len)
-		return 0;
-
-fault:
-	return -EFAULT;
-}
-EXPORT_SYMBOL(skb_copy_datagram_const_iovec);
-
-/**
  *	skb_copy_datagram_iter - Copy a datagram to an iovec iterator.
  *	@skb: buffer to copy
  *	@offset: offset in the buffer to start copying from

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* Re: [PATCH 2/4] tun: Use iovec iterators
  2014-11-04  8:31             ` [PATCH 2/4] tun: Use iovec iterators Herbert Xu
@ 2014-11-04  8:37               ` Herbert Xu
  2014-11-05  2:49                 ` YOSHIFUJI Hideaki
  0 siblings, 1 reply; 82+ messages in thread
From: Herbert Xu @ 2014-11-04  8:37 UTC (permalink / raw)
  To: Al Viro, David S. Miller, netdev, Linux Kernel Mailing List,
	Benjamin LaHaise

Oops, this patch had a left-over skb_pull which made it broken.
Here is a fixed version.

tun: Use iovec iterators

This patch removes the use of skb_copy_datagram_const_iovec in
favour of the iovec iterator-based skb_copy_datagram_iter.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 9dd3746..ff955cdb 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -71,6 +71,7 @@
 #include <net/rtnetlink.h>
 #include <net/sock.h>
 #include <linux/seq_file.h>
+#include <linux/uio.h>
 
 #include <asm/uaccess.h>
 
@@ -1230,11 +1231,11 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
 static ssize_t tun_put_user(struct tun_struct *tun,
 			    struct tun_file *tfile,
 			    struct sk_buff *skb,
-			    const struct iovec *iv, int len)
+			    struct iov_iter *iter)
 {
 	struct tun_pi pi = { 0, skb->protocol };
-	ssize_t total = 0;
-	int vlan_offset = 0, copied;
+	ssize_t total;
+	int vlan_offset;
 	int vlan_hlen = 0;
 	int vnet_hdr_sz = 0;
 
@@ -1244,23 +1245,25 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 	if (tun->flags & TUN_VNET_HDR)
 		vnet_hdr_sz = tun->vnet_hdr_sz;
 
+	total = skb->len + vlan_hlen + vnet_hdr_sz;
+
 	if (!(tun->flags & TUN_NO_PI)) {
-		if ((len -= sizeof(pi)) < 0)
+		if (iov_iter_count(iter) < sizeof(pi))
 			return -EINVAL;
 
-		if (len < skb->len + vlan_hlen + vnet_hdr_sz) {
+		if (iov_iter_count(iter) < total) {
 			/* Packet will be striped */
 			pi.flags |= TUN_PKT_STRIP;
 		}
 
-		if (memcpy_toiovecend(iv, (void *) &pi, 0, sizeof(pi)))
+		if (copy_to_iter(&pi, sizeof(pi), iter))
 			return -EFAULT;
 		total += sizeof(pi);
 	}
 
 	if (vnet_hdr_sz) {
 		struct virtio_net_hdr gso = { 0 }; /* no info leak */
-		if ((len -= vnet_hdr_sz) < 0)
+		if (iov_iter_count(iter) < vnet_hdr_sz)
 			return -EINVAL;
 
 		if (skb_is_gso(skb)) {
@@ -1299,17 +1302,12 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 			gso.flags = VIRTIO_NET_HDR_F_DATA_VALID;
 		} /* else everything is zero */
 
-		if (unlikely(memcpy_toiovecend(iv, (void *)&gso, total,
-					       sizeof(gso))))
+		if (copy_to_iter(&gso, sizeof(gso), iter))
 			return -EFAULT;
-		total += vnet_hdr_sz;
 	}
 
-	copied = total;
-	len = min_t(int, skb->len + vlan_hlen, len);
-	total += skb->len + vlan_hlen;
 	if (vlan_hlen) {
-		int copy, ret;
+		int ret;
 		struct {
 			__be16 h_vlan_proto;
 			__be16 h_vlan_TCI;
@@ -1320,36 +1318,32 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 
 		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
 
-		copy = min_t(int, vlan_offset, len);
-		ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy);
-		len -= copy;
-		copied += copy;
-		if (ret || !len)
+		ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset);
+		if (ret || !iov_iter_count(iter))
 			goto done;
 
-		copy = min_t(int, sizeof(veth), len);
-		ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy);
-		len -= copy;
-		copied += copy;
-		if (ret || !len)
+		ret = copy_to_iter(&veth, sizeof(veth), iter);
+		if (ret || !iov_iter_count(iter))
 			goto done;
 	}
 
-	skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len);
+	skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset);
 
 done:
 	tun->dev->stats.tx_packets++;
-	tun->dev->stats.tx_bytes += len;
+	tun->dev->stats.tx_bytes += skb->len + vlan_hlen;
 
 	return total;
 }
 
 static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
-			   const struct iovec *iv, ssize_t len, int noblock)
+			   const struct iovec *iv, unsigned long segs,
+			   ssize_t len, int noblock)
 {
 	struct sk_buff *skb;
 	ssize_t ret = 0;
 	int peeked, err, off = 0;
+	struct iov_iter iter;
 
 	tun_debug(KERN_INFO, tun, "tun_do_read\n");
 
@@ -1362,11 +1356,12 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
 	/* Read frames from queue */
 	skb = __skb_recv_datagram(tfile->socket.sk, noblock ? MSG_DONTWAIT : 0,
 				  &peeked, &off, &err);
-	if (skb) {
-		ret = tun_put_user(tun, tfile, skb, iv, len);
-		kfree_skb(skb);
-	} else
-		ret = err;
+	if (!skb)
+		return ret;
+
+	iov_iter_init(&iter, READ, iv, segs, len);
+	ret = tun_put_user(tun, tfile, skb, &iter);
+	kfree_skb(skb);
 
 	return ret;
 }
@@ -1387,7 +1382,7 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
 		goto out;
 	}
 
-	ret = tun_do_read(tun, tfile, iv, len,
+	ret = tun_do_read(tun, tfile, iv, count, len,
 			  file->f_flags & O_NONBLOCK);
 	ret = min_t(ssize_t, ret, len);
 	if (ret > 0)
@@ -1488,7 +1483,7 @@ static int tun_recvmsg(struct kiocb *iocb, struct socket *sock,
 					 SOL_PACKET, TUN_TX_TIMESTAMP);
 		goto out;
 	}
-	ret = tun_do_read(tun, tfile, m->msg_iov, total_len,
+	ret = tun_do_read(tun, tfile, m->msg_iov, m->msg_iovlen, total_len,
 			  flags & MSG_DONTWAIT);
 	if (ret > total_len) {
 		m->msg_flags |= MSG_TRUNC;

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-04  8:31             ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Herbert Xu
@ 2014-11-04 14:32               ` Al Viro
  2014-11-04 14:35                 ` Al Viro
  2014-11-04 14:42                 ` Herbert Xu
  2014-11-05 20:24               ` David Miller
  1 sibling, 2 replies; 82+ messages in thread
From: Al Viro @ 2014-11-04 14:32 UTC (permalink / raw)
  To: Herbert Xu
  Cc: David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise

On Tue, Nov 04, 2014 at 04:31:34PM +0800, Herbert Xu wrote:
> This patch adds skb_copy_datagram_iter, which is identical to
> skb_copy_datagram_iovec except that it operates on iov_iter
> instead of iovec.
> 
> Eventually all users of skb_copy_datagram_iovec should switch
> over to iov_iter and then we can remove skb_copy_datagram_iovec.

Too noisy, IMO.  How about skb_copy_datagram_msg() first?  The fewer
places have to even think of iovec or iov_iter, the better...

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-04 14:32               ` Al Viro
@ 2014-11-04 14:35                 ` Al Viro
  2014-11-04 14:44                   ` Herbert Xu
  2014-11-04 14:42                 ` Herbert Xu
  1 sibling, 1 reply; 82+ messages in thread
From: Al Viro @ 2014-11-04 14:35 UTC (permalink / raw)
  To: Herbert Xu
  Cc: David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise

On Tue, Nov 04, 2014 at 02:32:00PM +0000, Al Viro wrote:
> On Tue, Nov 04, 2014 at 04:31:34PM +0800, Herbert Xu wrote:
> > This patch adds skb_copy_datagram_iter, which is identical to
> > skb_copy_datagram_iovec except that it operates on iov_iter
> > instead of iovec.
> > 
> > Eventually all users of skb_copy_datagram_iovec should switch
> > over to iov_iter and then we can remove skb_copy_datagram_iovec.
> 
> Too noisy, IMO.  How about skb_copy_datagram_msg() first?  The fewer
> places have to even think of iovec or iov_iter, the better...

PS: "too noisy" is about turning every callsite of skb_copy_datagram_iovec
into that of skb_copy_datagram_iter; the helper itself is just fine.

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-04 14:32               ` Al Viro
  2014-11-04 14:35                 ` Al Viro
@ 2014-11-04 14:42                 ` Herbert Xu
  2014-11-04 15:13                   ` Al Viro
  1 sibling, 1 reply; 82+ messages in thread
From: Herbert Xu @ 2014-11-04 14:42 UTC (permalink / raw)
  To: Al Viro
  Cc: David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise

On Tue, Nov 04, 2014 at 02:32:00PM +0000, Al Viro wrote:
>
> Too noisy, IMO.  How about skb_copy_datagram_msg() first?  The fewer
> places have to even think of iovec or iov_iter, the better...

We have places like tcp ucopy and tun that do not have msghdr.
So doing skb_copy_datagram_msg means that we'd have to create
a fake msghdr wrapper around them.  The point is not everything
comes in via sendmsg/recvmsg.

What is your motivation for hiding iov/iov_iter? Do you plan to
change their API at some future point?

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-04 14:35                 ` Al Viro
@ 2014-11-04 14:44                   ` Herbert Xu
  2014-11-04 14:52                     ` Al Viro
  0 siblings, 1 reply; 82+ messages in thread
From: Herbert Xu @ 2014-11-04 14:44 UTC (permalink / raw)
  To: Al Viro
  Cc: David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise

On Tue, Nov 04, 2014 at 02:35:36PM +0000, Al Viro wrote:
> On Tue, Nov 04, 2014 at 02:32:00PM +0000, Al Viro wrote:
>
> > Too noisy, IMO.  How about skb_copy_datagram_msg() first?  The fewer
> > places have to even think of iovec or iov_iter, the better...
> 
> PS: "too noisy" is about turning every callsite of skb_copy_datagram_iovec
> into that of skb_copy_datagram_iter; the helper itself is just fine.

Hmm if that is your concern then I don't see how skb_copy_datagram_msg
changes things as you'd still have to convert every existing caller
of skb_copy_datagram_iovec.  Colour me confused.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-04 14:44                   ` Herbert Xu
@ 2014-11-04 14:52                     ` Al Viro
  2014-11-04 14:55                       ` Herbert Xu
  0 siblings, 1 reply; 82+ messages in thread
From: Al Viro @ 2014-11-04 14:52 UTC (permalink / raw)
  To: Herbert Xu
  Cc: David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise

On Tue, Nov 04, 2014 at 10:44:16PM +0800, Herbert Xu wrote:
> On Tue, Nov 04, 2014 at 02:35:36PM +0000, Al Viro wrote:
> > On Tue, Nov 04, 2014 at 02:32:00PM +0000, Al Viro wrote:
> >
> > > Too noisy, IMO.  How about skb_copy_datagram_msg() first?  The fewer
> > > places have to even think of iovec or iov_iter, the better...
> > 
> > PS: "too noisy" is about turning every callsite of skb_copy_datagram_iovec
> > into that of skb_copy_datagram_iter; the helper itself is just fine.
> 
> Hmm if that is your concern then I don't see how skb_copy_datagram_msg
> changes things as you'd still have to convert every existing caller
> of skb_copy_datagram_iovec.  Colour me confused.

Fewer places having to even think of iovec/iov_iter...

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-04 14:52                     ` Al Viro
@ 2014-11-04 14:55                       ` Herbert Xu
  0 siblings, 0 replies; 82+ messages in thread
From: Herbert Xu @ 2014-11-04 14:55 UTC (permalink / raw)
  To: Al Viro
  Cc: David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise

On Tue, Nov 04, 2014 at 02:52:22PM +0000, Al Viro wrote:
>
> > Hmm if that is your concern then I don't see how skb_copy_datagram_msg
> > changes things as you'd still have to convert every existing caller
> > of skb_copy_datagram_iovec.  Colour me confused.
> 
> Fewer places having to even think of iovec/iov_iter...

Well it's the difference between

	skb_copy_datagram_iter(..., &kmsghdr->iov_iter, ...)

and

	skb_copy_datagram_msg(..., kmsghdr, ...)

Heck we could even make skb_copy_datagram_msg an inline wrapper
around skb_copy_datagram_iter if you like.

Anyway, the point is that not everything comes with a kmsghdr.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-04 14:42                 ` Herbert Xu
@ 2014-11-04 15:13                   ` Al Viro
  2014-11-05  2:22                     ` Herbert Xu
  0 siblings, 1 reply; 82+ messages in thread
From: Al Viro @ 2014-11-04 15:13 UTC (permalink / raw)
  To: Herbert Xu
  Cc: David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise

On Tue, Nov 04, 2014 at 10:42:58PM +0800, Herbert Xu wrote:
> On Tue, Nov 04, 2014 at 02:32:00PM +0000, Al Viro wrote:
> >
> > Too noisy, IMO.  How about skb_copy_datagram_msg() first?  The fewer
> > places have to even think of iovec or iov_iter, the better...
> 
> We have places like tcp ucopy and tun that do not have msghdr.
> So doing skb_copy_datagram_msg means that we'd have to create
> a fake msghdr wrapper around them.  The point is not everything
> comes in via sendmsg/recvmsg.

I'm certainly not suggesting it as a primitive.

> What is your motivation for hiding iov/iov_iter? Do you plan to
> change their API at some future point?

Think of it this way: every sendmsg/recvmsg path leading to memcpy_fromiovec
and its friends (including the open-coded ones) would need to be changed
at some point.  Assuming we do not end up passing struct iov_iter * as
an extra argument through a fairly large part of net/* (and that would
be prohibitively hard and messy, not to mention the effects on the stack
footprint, etc.), the most obvious strategy is to have that thing passed
where msg_iov/msg_iovlen are - in struct msghdr.  *IF* we go that way,
it makes a whole lot of sense to start with a bunch of cleanups that
will make sense on their own (most of callers of skb_copy_datagram_iovec
do look like skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); might
as well give it an inlined helper) and will reduce the number of places
where ->msg_iov is used.  Such cleanups stand on their own and are
splittable from the rest of the queue, and they leave us with fewer
places in the code that deal with ->msg_iov and need to be dealt with.

Please, look through my yesterday posting upthread.  Outline of the
proposed strategy is there...

FWIW, this is from the beginning of the April queue - rebased to current,
but very likely incomplete.  The variant taking iov_iter would come later
and yes, it would replace the ..._iovec() one as the primitive, with far
fewer places to worry about.

commit 8241142acab3451239029085286b717ca30aac33
Author: Al Viro <viro@zeniv.linux.org.uk>
Date:   Sun Apr 6 18:41:28 2014 -0400

    new helper: skb_copy_datagram_msg()
    
    Absolute majority of skb_copy_datagram_iovec() callers (49 out of 56)
    are passing it msg->msg_iov as iovec.  Provide a trivial wrapper that
    takes msg as argument instead of iovec.
    
    Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>

diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c
index 1be8228..dcbd858 100644
--- a/drivers/isdn/mISDN/socket.c
+++ b/drivers/isdn/mISDN/socket.c
@@ -163,7 +163,7 @@ mISDN_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
 	memcpy(skb_push(skb, MISDN_HEADER_LEN), mISDN_HEAD_P(skb),
 	       MISDN_HEADER_LEN);
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 
 	mISDN_sock_cmsg(sk, msg, skb);
 
diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c
index 6c9c16d..443cbbf 100644
--- a/drivers/net/ppp/pppoe.c
+++ b/drivers/net/ppp/pppoe.c
@@ -981,7 +981,7 @@ static int pppoe_recvmsg(struct kiocb *iocb, struct socket *sock,
 
 	if (skb) {
 		total_len = min_t(size_t, total_len, skb->len);
-		error = skb_copy_datagram_iovec(skb, 0, m->msg_iov, total_len);
+		error = skb_copy_datagram_msg(skb, 0, m, total_len);
 		if (error == 0) {
 			consume_skb(skb);
 			return total_len;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 6c8b6f6..379ab46 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2662,6 +2662,12 @@ unsigned int skb_gso_transport_seglen(const struct sk_buff *skb);
 struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features);
 struct sk_buff *skb_vlan_untag(struct sk_buff *skb);
 
+static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset,
+			    struct msghdr *msg, int size)
+{
+	return skb_copy_datagram_iovec(from, offset, msg->msg_iov, size);
+}
+
 struct skb_checksum_ops {
 	__wsum (*update)(const void *mem, int len, __wsum wsum);
 	__wsum (*combine)(__wsum csum, __wsum csum2, int offset, int len);
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index c00897f..425942d 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1758,7 +1758,7 @@ static int atalk_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr
 		copied = size;
 		msg->msg_flags |= MSG_TRUNC;
 	}
-	err = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, offset, msg, copied);
 
 	if (!err && msg->msg_name) {
 		DECLARE_SOCKADDR(struct sockaddr_at *, sat, msg->msg_name);
diff --git a/net/atm/common.c b/net/atm/common.c
index 6a76515..9cd1cca 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -554,7 +554,7 @@ int vcc_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 		msg->msg_flags |= MSG_TRUNC;
 	}
 
-	error = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	error = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (error)
 		return error;
 	sock_recv_ts_and_drops(msg, sk, skb);
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index c35c3f4..f4f835e19 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1634,7 +1634,7 @@ static int ax25_recvmsg(struct kiocb *iocb, struct socket *sock,
 		msg->msg_flags |= MSG_TRUNC;
 	}
 
-	skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	skb_copy_datagram_msg(skb, 0, msg, copied);
 
 	if (msg->msg_name) {
 		ax25_digi digi;
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 339c74a..0a7cc56 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -237,7 +237,7 @@ int bt_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
 	}
 
 	skb_reset_transport_header(skb);
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err == 0) {
 		sock_recv_ts_and_drops(msg, sk, skb);
 
@@ -328,7 +328,7 @@ int bt_sock_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 		}
 
 		chunk = min_t(unsigned int, skb->len, size);
-		if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, chunk)) {
+		if (skb_copy_datagram_msg(skb, 0, msg, chunk)) {
 			skb_queue_head(&sk->sk_receive_queue, skb);
 			if (!copied)
 				copied = -EFAULT;
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 115f149..29e1ec7 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -878,7 +878,7 @@ static int hci_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
 	}
 
 	skb_reset_transport_header(skb);
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 
 	switch (hci_pi(sk)->channel) {
 	case HCI_CHANNEL_RAW:
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 43f750e..fbcd156 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -293,7 +293,7 @@ static int caif_seqpkt_recvmsg(struct kiocb *iocb, struct socket *sock,
 		copylen = len;
 	}
 
-	ret = skb_copy_datagram_iovec(skb, 0, m->msg_iov, copylen);
+	ret = skb_copy_datagram_msg(skb, 0, m, copylen);
 	if (ret)
 		goto out_free;
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 15e0c67..ac56dd0 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2457,7 +2457,7 @@ int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
 		msg->msg_flags |= MSG_TRUNC;
 		copied = len;
 	}
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto out_free_skb;
 
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 5ab6627..8e6ae94 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -896,7 +896,7 @@ verify_sock_status:
 		else if (len < skb->len)
 			msg->msg_flags |= MSG_TRUNC;
 
-		if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len)) {
+		if (skb_copy_datagram_msg(skb, 0, msg, len)) {
 			/* Exception. Bailout! */
 			len = -EFAULT;
 			break;
diff --git a/net/ieee802154/dgram.c b/net/ieee802154/dgram.c
index ef2ad8a..fc9193e 100644
--- a/net/ieee802154/dgram.c
+++ b/net/ieee802154/dgram.c
@@ -324,7 +324,7 @@ static int dgram_recvmsg(struct kiocb *iocb, struct sock *sk,
 	}
 
 	/* FIXME: skip headers if necessary ?! */
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto done;
 
diff --git a/net/ieee802154/raw.c b/net/ieee802154/raw.c
index 9d1f648..73a4d53 100644
--- a/net/ieee802154/raw.c
+++ b/net/ieee802154/raw.c
@@ -195,7 +195,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		copied = len;
 	}
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto done;
 
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index c373a9a..21894df 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -424,7 +424,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
 		msg->msg_flags |= MSG_TRUNC;
 		copied = len;
 	}
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto out_free_skb;
 
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 57f7c98..736236c 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -875,7 +875,7 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	}
 
 	/* Don't bother checking the checksum */
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto done;
 
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 739db31..ee8fa4b 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -718,7 +718,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		copied = len;
 	}
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto done;
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 39ec0c3..c239f47 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1377,7 +1377,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
 	/* XXX -- need to support SO_PEEK_OFF */
 
 	skb_queue_walk(&sk->sk_write_queue, skb) {
-		err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len);
+		err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
 		if (err)
 			break;
 
@@ -1833,8 +1833,7 @@ do_prequeue:
 		}
 
 		if (!(flags & MSG_TRUNC)) {
-			err = skb_copy_datagram_iovec(skb, offset,
-						      msg->msg_iov, used);
+			err = skb_copy_datagram_msg(skb, offset, msg, used);
 			if (err) {
 				/* Exception. Bailout! */
 				if (!copied)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index cd0db54..d7266f7 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1281,8 +1281,8 @@ try_again:
 	}
 
 	if (skb_csum_unnecessary(skb))
-		err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr),
-					      msg->msg_iov, copied);
+		err = skb_copy_datagram_msg(skb, sizeof(struct udphdr),
+					      msg, copied);
 	else {
 		err = skb_copy_and_csum_datagram_iovec(skb,
 						       sizeof(struct udphdr),
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 2cdc383..5c6996e 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -351,7 +351,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
 		msg->msg_flags |= MSG_TRUNC;
 		copied = len;
 	}
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto out_free_skb;
 
@@ -445,7 +445,7 @@ int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len,
 		msg->msg_flags |= MSG_TRUNC;
 		copied = len;
 	}
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto out_free_skb;
 
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 896af88..f642598 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -486,11 +486,11 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk,
 	}
 
 	if (skb_csum_unnecessary(skb)) {
-		err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+		err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	} else if (msg->msg_flags&MSG_TRUNC) {
 		if (__skb_checksum_complete(skb))
 			goto csum_copy_err;
-		err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+		err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	} else {
 		err = skb_copy_and_csum_datagram_iovec(skb, 0, msg->msg_iov);
 		if (err == -EINVAL)
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index f6ba535..5f68cd72 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -424,8 +424,8 @@ try_again:
 	}
 
 	if (skb_csum_unnecessary(skb))
-		err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr),
-					      msg->msg_iov, copied);
+		err = skb_copy_datagram_msg(skb, sizeof(struct udphdr),
+					      msg, copied);
 	else {
 		err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov);
 		if (err == -EINVAL)
diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c
index 91729b8..8b7ca1c 100644
--- a/net/ipx/af_ipx.c
+++ b/net/ipx/af_ipx.c
@@ -1805,7 +1805,7 @@ static int ipx_recvmsg(struct kiocb *iocb, struct socket *sock,
 		msg->msg_flags |= MSG_TRUNC;
 	}
 
-	rc = skb_copy_datagram_iovec(skb, sizeof(struct ipxhdr), msg->msg_iov,
+	rc = skb_copy_datagram_msg(skb, sizeof(struct ipxhdr), msg,
 				     copied);
 	if (rc)
 		goto out_free;
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 3f3a6cb..3f1a37b 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -1394,7 +1394,7 @@ static int irda_recvmsg_dgram(struct kiocb *iocb, struct socket *sock,
 		copied = size;
 		msg->msg_flags |= MSG_TRUNC;
 	}
-	skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	skb_copy_datagram_msg(skb, 0, msg, copied);
 
 	skb_free_datagram(sk, skb);
 
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index a089b6b..057b564 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -1355,7 +1355,7 @@ static int iucv_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
 		sk->sk_shutdown = sk->sk_shutdown | RCV_SHUTDOWN;
 
 	cskb = skb;
-	if (skb_copy_datagram_iovec(cskb, offset, msg->msg_iov, copied)) {
+	if (skb_copy_datagram_msg(cskb, offset, msg, copied)) {
 		if (!(flags & MSG_PEEK))
 			skb_queue_head(&sk->sk_receive_queue, skb);
 		return -EFAULT;
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 1847ec4..e588309 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -3654,7 +3654,7 @@ static int pfkey_recvmsg(struct kiocb *kiocb,
 	}
 
 	skb_reset_transport_header(skb);
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto out_free;
 
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 369a982..a6cc1fe 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -528,7 +528,7 @@ static int l2tp_ip_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m
 		copied = len;
 	}
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto done;
 
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 0edb263..2177b96 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -672,7 +672,7 @@ static int l2tp_ip6_recvmsg(struct kiocb *iocb, struct sock *sk,
 		copied = len;
 	}
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto done;
 
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index b704a93..c559bcd 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -208,7 +208,7 @@ static int pppol2tp_recvmsg(struct kiocb *iocb, struct socket *sock,
 	else if (len < skb->len)
 		msg->msg_flags |= MSG_TRUNC;
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len);
+	err = skb_copy_datagram_msg(skb, 0, msg, len);
 	if (likely(err == 0))
 		err = len;
 
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index bb9cbc1..8fa230b 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -819,8 +819,8 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock,
 			used = len;
 
 		if (!(flags & MSG_TRUNC)) {
-			int rc = skb_copy_datagram_iovec(skb, offset,
-							 msg->msg_iov, used);
+			int rc = skb_copy_datagram_msg(skb, offset,
+							 msg, used);
 			if (rc) {
 				/* Exception. Bailout! */
 				if (!copied)
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index f1de72d..580b794 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2401,7 +2401,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
 	}
 
 	skb_reset_transport_header(data_skb);
-	err = skb_copy_datagram_iovec(data_skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(data_skb, 0, msg, copied);
 
 	if (msg->msg_name) {
 		DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name);
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index 1b06a1f..7e13f6a 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -1167,7 +1167,7 @@ static int nr_recvmsg(struct kiocb *iocb, struct socket *sock,
 		msg->msg_flags |= MSG_TRUNC;
 	}
 
-	er = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	er = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (er < 0) {
 		skb_free_datagram(sk, skb);
 		release_sock(sk);
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index 51f077a..83bc785 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -832,7 +832,7 @@ static int llcp_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
 	copied = min_t(unsigned int, rlen, len);
 
 	cskb = skb;
-	if (skb_copy_datagram_iovec(cskb, 0, msg->msg_iov, copied)) {
+	if (skb_copy_datagram_msg(cskb, 0, msg, copied)) {
 		if (!(flags & MSG_PEEK))
 			skb_queue_head(&sk->sk_receive_queue, skb);
 		return -EFAULT;
diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c
index 11c3544..9d7d2b7 100644
--- a/net/nfc/rawsock.c
+++ b/net/nfc/rawsock.c
@@ -269,7 +269,7 @@ static int rawsock_recvmsg(struct kiocb *iocb, struct socket *sock,
 		copied = len;
 	}
 
-	rc = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	rc = skb_copy_datagram_msg(skb, 0, msg, copied);
 
 	skb_free_datagram(sk, skb);
 
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 87d20f4..4cd13d8 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2953,7 +2953,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
 		msg->msg_flags |= MSG_TRUNC;
 	}
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto out_free;
 
diff --git a/net/phonet/datagram.c b/net/phonet/datagram.c
index 290352c..0918bc2 100644
--- a/net/phonet/datagram.c
+++ b/net/phonet/datagram.c
@@ -150,7 +150,7 @@ static int pn_recvmsg(struct kiocb *iocb, struct sock *sk,
 		copylen = len;
 	}
 
-	rval = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copylen);
+	rval = skb_copy_datagram_msg(skb, 0, msg, copylen);
 	if (rval) {
 		rval = -EFAULT;
 		goto out;
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index 70a547e..44b2123 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -1296,7 +1296,7 @@ copy:
 	else
 		len = skb->len;
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len);
+	err = skb_copy_datagram_msg(skb, 0, msg, len);
 	if (!err)
 		err = (flags & MSG_TRUNC) ? skb->len : len;
 
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index a85c1a0..9b600c2 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -1249,7 +1249,7 @@ static int rose_recvmsg(struct kiocb *iocb, struct socket *sock,
 		msg->msg_flags |= MSG_TRUNC;
 	}
 
-	skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	skb_copy_datagram_msg(skb, 0, msg, copied);
 
 	if (msg->msg_name) {
 		struct sockaddr_rose *srose;
diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c
index e9aaa65..4575485 100644
--- a/net/rxrpc/ar-recvmsg.c
+++ b/net/rxrpc/ar-recvmsg.c
@@ -180,7 +180,7 @@ int rxrpc_recvmsg(struct kiocb *iocb, struct socket *sock,
 		if (copy > len - copied)
 			copy = len - copied;
 
-		ret = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, copy);
+		ret = skb_copy_datagram_msg(skb, offset, msg, copy);
 
 		if (ret < 0)
 			goto copy_error;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 634a2ab..2120292 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2095,7 +2095,7 @@ static int sctp_recvmsg(struct kiocb *iocb, struct sock *sk,
 	if (copied > len)
 		copied = len;
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 
 	event = sctp_skb2event(skb);
 
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 51bddc2..f726eaa 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -1372,8 +1372,8 @@ restart:
 			sz = buf_len;
 			m->msg_flags |= MSG_TRUNC;
 		}
-		res = skb_copy_datagram_iovec(buf, msg_hdr_sz(msg),
-					      m->msg_iov, sz);
+		res = skb_copy_datagram_msg(buf, msg_hdr_sz(msg),
+					      m, sz);
 		if (res)
 			goto exit;
 		res = sz;
@@ -1473,8 +1473,8 @@ restart:
 		needed = (buf_len - sz_copied);
 		sz_to_copy = (sz <= needed) ? sz : needed;
 
-		res = skb_copy_datagram_iovec(buf, msg_hdr_sz(msg) + offset,
-					      m->msg_iov, sz_to_copy);
+		res = skb_copy_datagram_msg(buf, msg_hdr_sz(msg) + offset,
+					      m, sz_to_copy);
 		if (res)
 			goto exit;
 
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index e968843..350771a 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1825,7 +1825,7 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
 	else if (size < skb->len - skip)
 		msg->msg_flags |= MSG_TRUNC;
 
-	err = skb_copy_datagram_iovec(skb, skip, msg->msg_iov, size);
+	err = skb_copy_datagram_msg(skb, skip, msg, size);
 	if (err)
 		goto out_free;
 
@@ -2030,8 +2030,8 @@ again:
 		}
 
 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
-		if (skb_copy_datagram_iovec(skb, UNIXCB(skb).consumed + skip,
-					    msg->msg_iov, chunk)) {
+		if (skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
+					    msg, chunk)) {
 			if (copied == 0)
 				copied = -EFAULT;
 			break;
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index 9bb63ff..a57ddef 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -1773,8 +1773,7 @@ static int vmci_transport_dgram_dequeue(struct kiocb *kiocb,
 	}
 
 	/* Place the datagram payload in the user's iovec. */
-	err = skb_copy_datagram_iovec(skb, sizeof(*dg), msg->msg_iov,
-		payload_len);
+	err = skb_copy_datagram_msg(skb, sizeof(*dg), msg, payload_len);
 	if (err)
 		goto out;
 
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 5ad4418..59e785b 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -1335,7 +1335,7 @@ static int x25_recvmsg(struct kiocb *iocb, struct socket *sock,
 	/* Currently, each datagram always contains a complete record */
 	msg->msg_flags |= MSG_EOR;
 
-	rc = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	rc = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (rc)
 		goto out_free_dgram;
 

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* Re: [0/3] net: Kill skb_copy_datagram_const_iovec
  2014-11-04  5:45           ` [0/3] " Al Viro
@ 2014-11-05  1:53             ` Al Viro
  0 siblings, 0 replies; 82+ messages in thread
From: Al Viro @ 2014-11-05  1:53 UTC (permalink / raw)
  To: David Miller
  Cc: herbert, netdev, linux-kernel, bcrl, Steve French, Sage Weil,
	Nicholas A. Bellinger

On Tue, Nov 04, 2014 at 05:45:13AM +0000, Al Viro wrote:

> Hell knows; I hadn't finished digging through that zoo - got sidetracked back
> then.  *IF* all such places either use a throwaway copy or assume that iovec
> gets modified, we can do the following: switch the access to iovecs to
> iov_iter primitives, with the first kind of callers creating a throwaway
> iov_iter and the second just feeding the same iov_iter to e.g.
> kernel_recvmsg().  iovec will remain constant, iov_iter will be advanced.
> Moreover, in a lot of cases of first kind we will be able to get rid of
> throwaway iov_iter (and of manually advancing it), effectively converting
> to the second one.

All right, now I _have_ finished that.  See the resulting notes below.
TL;DR version: looks like hypothesis above is correct, modulo 2 places,
both buggy - cifs smb_send_kvec() apparently relies on ->sendmsg() leaving the
iovec unchanged and so does one of the ceph_tcp_sendmsg() callers
(write_partial_kvec()).  For TCP that's not always true.  Another apparent
bug caught in process is iscsi iscsit_do_tx_data() - assumes that iovec is
being consumed by sendmsg().  I don't see how that could not be a bug - TCP
sockets can get there and tcp_sendmsg() normally *doesn't* modify the iovec.
Sometimes it does, unfortunately for other two places...

Maintainers Cc'd...


Full version follows:
-----------------------------------------------------------------------------
->sendmsg(): there's such method in struct proto_ops and in struct proto; the
latter is called by (some of) the former, in cases when ->sendmsg() isn't the
same for the entire family.

Instances of proto ->sendmsg() are, on several occasions, called directly; some
of those calls are from another such instance (with unchanged payload).  There
are two exceptions to that - in tipc_accept() and tipc_connect() we call such
instances with empty payload.  All calls via method are from proto_ops
->sendmsg() instances, payload unchanged.

Instances of proto_ops ->sendmsg() are almost never called directly.  All
exceptions are from another such instance with unchanged payload.

There are two places that call proto_ops ->sendmsg() via method -
__sock_sendmsg_nosec() in net/socket.c, and handle_tx() in drivers/vhost/net.c.
The latter appears to be playing somewhat unusual games with passing NULL iocb,
making it impossible to use the former...  Everybody else in the kernel goes
through __sock_sendmsg_nosec(), though - it's a chokepoint for sendmsg path.

vhost callsite is somewhat worrying - granted, most of the ->sendmsg()
instances don't give a damn about iocb at all.  The rest, though...
E.g. what happens if we do VHOST_NET_SET_BACKEND with backend.fd being an
AF_UNIX socket?  AFAICS, if it ever gets to that ->sendmsg() call afterwards,
we'll get an oops when e.g. unix_dgram_sendmsg() calls kiocb_to_siocb(NULL).
The same goes for AF_NETLINK; AF_TIPC is even more interesting, since there
NULL iocb is used as "we are in weird callchain, socket is already locked"
flag (for tipc_{accept,connect}() callsites).  What's going on there?
[After having talked with mst: drivers/vhost/net.c checks that it's
AF_PACKET/SOCK_RAW (packet_sendmsg()) *or* comes from tun.c (tun_sendmsg())
or from macvtap.c (macvtap_sendmsg()) and all of those ignore iocb completely]

FWIW, the situation with iocb is
	* AF_UNIX and AF_NETLINK use it to get to sock_iocb
	* AF_TIPC uses it as a flag - it never dereferences the damn thing and
only compares it with NULL, to determine whether it's in a normal call chain,
or in tipc_accept/tipc_connect one...
	* everything else ignores it completely, either directly or after
passing it to something that ignores it (rxrpc has unusually deep chain, but it
still ends up ignoring the sucker).

->recvmsg(): again, present in proto_ops and proto, in the same relationship
to each other.  The situation is a bit simpler there: there is only one direct
caller of an instance of struct proto ->recvmsg() and it is in another such
instance, arguments unchanged.  All callers via method are in the instances of
struct proto_ops ->recvmsg(), message-related arguments unchanged.  No direct
callers of struct proto_ops recvmsg, three call sites via method - regular one
in __sock_recvmsg_nosec() plus two in drivers/vhost/net.c.  The latter have
NULL iocb and are saved from oopsing in ->recvmsg() by the same logic that
saves vhost on sendmsg side.  iocb is ignored by everything except AF_UNIX
and AF_NETLINK (those use it for sock_iocb) and neither can be reached from
vhost path.

So it boils down to the following: drivers/vhost/net.c aside, everything goes
through __sock_sendmsg_nosec() on the sendmsg side and __sock_recvmsg_nosec()
on recvmsg one.

Call chains leading to __sock_sendmsg_nosec():

__sock_sendmsg_nosec()
	<- sock_sendmsg_nosec()
		<- ___sys_sendmsg()
			<- __sys_sendmsg()
				<- sys_compat_sendmsg()
				<- sys_sendmsg()
			<- __sys_sendmmsg()
				<- sys_compat_sendmmsg()
				<- sys_sendmmsg()
	<- __sock_sendmsg()
		<- do_sock_write()
			<- sock_aio_write()
				== ->aio_write()
		<- sock_sendmsg()
			<- svc_sendto()	[no iovec at all]
			<- ___sys_sendmsg() [see above]
			<- sys_sendto()
			<- kernel_sendmsg()
All syscalls (and there's quite a tangled mess with sys_socketcall,
assorted ARM wrappers, etc.) end up with iovec discarded.
Ditto for ->aio_write() callers - they all free the iovec soon after
->aio_write() returns and never look at it before freeing.

Call chains leading to __sock_recvmsg_nosec():

__sock_recvmsg_nosec()
	<- sock_recvmsg_nosec()
		<- ___sys_recvmsg()
			<- __sys_recvmsg()
				<- sys_compat_recvmsg()
				<- sys_recvmsg()
			<- __sys_recvmmsg()
				<- sys_compat_recvmmsg()
				<- sys_recvmmsg()
	<- __sock_recvmsg()
		<- do_sock_read()
			<- sock_aio_read()
				== ->aio_read()
		<- sock_recvmsg() [why is it not static, BTW?]
			<- ___sys_recvmsg() [see above]
			<- sys_recvfrom()
			<- kernel_recvmsg()
Again, both the syscalls and ->aio_read() callers end up discarding
iovec.

All of that leaves us with kernel_{send,recv}msg() as the next-order
chokepoints.

kernel_recvmsg() callers:
	drbd drbd_recv_short() - single-element iovec discarded
	nbd sock_xmit() - single-element iovec discarded; would be better off
with advancing iov_iter.
	isdn l1oip_socket_thread() - single-element iovec discarded
	lustre ksocknal_lib_recv_iov() - iovec copied (with unhappy comment),
copy passed to kernel_recvmsg() and discarded
	lustre ksocknal_lib_recv_kiov() - ditto.  Would be much better off
with bvec-based iov_iter
	lustre libcfs_sock_read() - single-element iovec discarded; would be
better off with advancing iov_iter.
	iscsi iscsit_do_rx_data() - assumes that iovec is being consumed.
AFAICS, it's guaranteed to be TCP and tcp_recvmsg() appears to act that way,
so it's probably OK...
	usbip usbip_recv() - single-element iovec discarded; would be better
off with advancing iov_iter
	cifs cifs_readv_from_socket() - iovec copied, copy passed to
kernel_recvmsg() and discarded; *definitely* would be better off with advancing
iov_iter.
	dlm receive_from_sock() - 1- or 2-element iovec discarded.
	ncpfs _recv() - single-element iovec discarded.
	ocfs2 o2net_recv_tcp_msg() - single-element iovec discarded.
	ceph ceph_tcp_recvmsg() - single-element iovec discarded; at least one
of the loops using it would be better off with advancing iov_iter.
	ipvs ip_vs_receive() - single-element iovec discarded.
	sunrpc svc_udp_recvfrom() - no payload (MSG_PEEK, that one)
	tipc tipc_receive_from_sock() - single-element iovec discarded.
	sunrpc svc_recvfrom() - confusing; looks like iovec is a throwaway
one, though (and we might be better off if we could use an iov_iter of bvec
sort instead).

kernel_sendmsg() callers:
	drbd drbd_send() - single-element iovec discarded; would be better
off with advancing iov_iter
	nbd sock_xmit() - ditto
	isdn l1oip_socket_send() - single-element iovec discarded
	iscsi iscsi_sw_tcp_xmit_segment() - single-element iovec discarded
	lustre ksocknal_lib_send_iov() - iovec copied (with unhappy comment),
copy passed to kernel_sendmsg() and discarded
	lustre ksocknal_lib_send_kiov - ditto.  Would be much better off
with bvec iov_iter.
	lustre libcfs_sock_write() - single-element iovec discarded; better
off with advancing iov_iter
	iscsi iscsit_do_tx_data() - assumes that iovec is being consumed.
I don't see how that could not be a bug - TCP sockets can get there.
	usbip stub_send_ret_submit() - iovec is built and discarded;
short write is treated as an error
	usbip stub_send_ret_unlink() - ditto
	usbip vhci_send_cmd_submit() - ditto
	usbip vhci_send_cmd_unlink() - ditto
	cifs smb_send_kvec() - apparently relies on ->sendmsg() leaving the
iovec unchanged.  Looks like a bug - tcp_sendmsg() might drain iovec in some
cases.
	dlm sctp_send_shutdown() - no payload
	dlm sctp_init_assoc() - single-element iovec discarded
	ncpfs do_send() - single-element iovec discarded
	ocfs2 o2net_send_tcp_msg() - callers pass it a throwaway iovec
	bnep bnep_send() - single-element iovec discarded
	bnep_tx_frame() - short iovec is built and discarded
	cmtp_send_frame() - single-element iovec discarded
	hidp_send_frame() - single-element iovec discarded
	rfcomm_send_frame() - single-element iovec discarded
	rfcomm_send_test() - short iovec is built and discarded
	core sock_no_sendpage*() - single-element iovec discarded
	ipvs ip_vs_send_async() - single-element iovec discarded
	rds rds_tcp_sendmsg() - single-element iovec discarded
	rxrpc rxrpc_busy() - single-element iovec discarded
	rxrpc rxrpc_process_call() - short iovec is built and discarded
	rxrpc rxrpc_abort_connection() - short iovec is built and discarded
	rxrpc rxrpc_reject_packets() - assumes that sendmsg drains iovec;
may be a bug.
	rxrpc rxrpc_send_packet() - single-element iovec discarded
	rxrpc rxkad_issue_challenge() - short iovec is built and discarded
	rxrpc rxkad_send_response - short iovec is built and discarded
	sunrpc xs_send_kvec() - single-element iovec discarded; really asks
for iov_iter...
	tipc tipc_send_to_sock() - single-element iovec discarded; for some
reason it seems to believe that short writes never happen...
	ceph ceph_tcp_sendmsg() - one caller appears to discard iovec, another
(write_partial_kvec()) apparently assumes that iovec is unchanged by sendmsg.
Not guaranteed to be true for TCP, AFAICS.

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-04 15:13                   ` Al Viro
@ 2014-11-05  2:22                     ` Herbert Xu
  2014-11-05  3:27                       ` David Miller
  0 siblings, 1 reply; 82+ messages in thread
From: Herbert Xu @ 2014-11-05  2:22 UTC (permalink / raw)
  To: Al Viro
  Cc: David S. Miller, netdev, Linux Kernel Mailing List, Benjamin LaHaise

On Tue, Nov 04, 2014 at 03:13:55PM +0000, Al Viro wrote:
> 
> Think of it that way: every sendmsg/recvmsg path leading to memcpy_fromiovec
> and its friends (including the open-coded ones) would need to be changed
> at some point.  Assuming we do not end up passing struct iov_iter * as
> an extra argument through a fairly large part of net/* (and that would
> be prohibitively hard and messy, not to mention the effects on the stack
> footprint, etc.), the most obvious strategy is to have that thing passed
> where msg_iov/msg_iovlen are - in struct msghdr.  *IF* we go that way,
> it makes a whole lot of sense to start with a bunch of cleanups that
> will make sense on their own (most of callers of skb_copy_datagram_iovec
> do look like skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); might
> as well give it an inlined helper) and will reduce the amount of places
> where ->msg_iov is used.  With such cleanups standing on their own and
> being splittable from the rest of the queue.  And leaving us with fewer
> places in code that deal with ->msg_iov and need to be dealt with.

I think your solution is great.  However, I don't see how my four
patches impede in any way the work that you're doing.  I presume
your first patch will make skb_copy_datagram_msg just a wrapper
around skb_copy_datagram_iovec.

Since I'm not removing skb_copy_datagram_iovec (it can't be removed
until all users are gone) you can still do that and when you're
ready to switch over to iov_iter you can just move the wrapper over
to skb_copy_datagram_iter.  No?

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 2/4] tun: Use iovec iterators
  2014-11-04  8:37               ` Herbert Xu
@ 2014-11-05  2:49                 ` YOSHIFUJI Hideaki
  2014-11-05  3:41                   ` Herbert Xu
  0 siblings, 1 reply; 82+ messages in thread
From: YOSHIFUJI Hideaki @ 2014-11-05  2:49 UTC (permalink / raw)
  To: Herbert Xu, Al Viro, David S. Miller, netdev,
	Linux Kernel Mailing List, Benjamin LaHaise
  Cc: hideaki.yoshifuji

Hi,

Herbert Xu wrote:
> Oops, this patch had a left-over skb_pull which made it broken.
> Here is a fixed version.
>
> tun: Use iovec iterators
>
> This patch removes the use of skb_copy_datagram_const_iovec in
> favour of the iovec iterator-based skb_copy_datagram_iter.
>
> Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
>
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index 9dd3746..ff955cdb 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
:
> @@ -1244,23 +1245,25 @@ static ssize_t tun_put_user(struct tun_struct *tun,
>   	if (tun->flags & TUN_VNET_HDR)
>   		vnet_hdr_sz = tun->vnet_hdr_sz;
>
> +	total = skb->len + vlan_hlen + vnet_hdr_sz;
> +
>   	if (!(tun->flags & TUN_NO_PI)) {
> -		if ((len -= sizeof(pi)) < 0)
> +		if (iov_iter_count(iter) < sizeof(pi))
>   			return -EINVAL;
>
> -		if (len < skb->len + vlan_hlen + vnet_hdr_sz) {
> +		if (iov_iter_count(iter) < total) {

I guess this should be: sizeof(pi) + total

-- 
Hideaki Yoshifuji <hideaki.yoshifuji@miraclelinux.com>
Technical Division, MIRACLE LINUX CORPORATION

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-05  2:22                     ` Herbert Xu
@ 2014-11-05  3:27                       ` David Miller
  2014-11-05  3:55                         ` Al Viro
  0 siblings, 1 reply; 82+ messages in thread
From: David Miller @ 2014-11-05  3:27 UTC (permalink / raw)
  To: herbert; +Cc: viro, netdev, linux-kernel, bcrl

From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Wed, 5 Nov 2014 10:22:51 +0800

> On Tue, Nov 04, 2014 at 03:13:55PM +0000, Al Viro wrote:
>> 
>> Think of it that way: every sendmsg/recvmsg path leading to memcpy_fromiovec
>> and its friends (including the open-coded ones) would need to be changed
>> at some point.  Assuming we do not end up passing struct iov_iter * as
>> an extra argument through a fairly large part of net/* (and that would
>> be prohibitively hard and messy, not to mention the effects on the stack
>> footprint, etc.), the most obvious strategy is to have that thing passed
>> where msg_iov/msg_iovlen are - in struct msghdr.  *IF* we go that way,
>> it makes a whole lot of sense to start with a bunch of cleanups that
>> will make sense on their own (most of callers of skb_copy_datagram_iovec
>> do look like skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied); might
>> as well give it an inlined helper) and will reduce the amount of places
>> where ->msg_iov is used.  With such cleanups standing on their own and
>> being splittable from the rest of the queue.  And leaving us with fewer
>> places in code that deal with ->msg_iov and need to be dealt with.
> 
> I think your solution is great.  However, I don't see how my four
> patches impede in any way the work that you're doing.  I presume
> your first patch will make skb_copy_datagram_msg just a wrapper
> around skb_copy_datagram_iovec.
> 
> Since I'm not removing skb_copy_datagram_iovec (it can't be removed
> until all users are gone) you can still do that and when you're
> ready to switch over to iov_iter you can just move the wrapper over
> to skb_copy_datagram_iter.  No?

Agreed, I think both efforts can proceed in parallel.

Al, is this the helper you are talking about?

diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c
index 1be8228..a08057d 100644
--- a/drivers/isdn/mISDN/socket.c
+++ b/drivers/isdn/mISDN/socket.c
@@ -163,7 +163,7 @@ mISDN_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
 	memcpy(skb_push(skb, MISDN_HEADER_LEN), mISDN_HEAD_P(skb),
 	       MISDN_HEADER_LEN);
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msghdr(skb, msg, copied);
 
 	mISDN_sock_cmsg(sk, msg, skb);
 
diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c
index 6c9c16d..25234d9 100644
--- a/drivers/net/ppp/pppoe.c
+++ b/drivers/net/ppp/pppoe.c
@@ -981,7 +981,7 @@ static int pppoe_recvmsg(struct kiocb *iocb, struct socket *sock,
 
 	if (skb) {
 		total_len = min_t(size_t, total_len, skb->len);
-		error = skb_copy_datagram_iovec(skb, 0, m->msg_iov, total_len);
+		error = skb_copy_datagram_msghdr(skb, m, total_len);
 		if (error == 0) {
 			consume_skb(skb);
 			return total_len;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 5ad9675..19fe8cc 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -17,6 +17,7 @@
 #include <linux/kernel.h>
 #include <linux/kmemcheck.h>
 #include <linux/compiler.h>
+#include <linux/socket.h>
 #include <linux/time.h>
 #include <linux/bug.h>
 #include <linux/cache.h>
@@ -2637,6 +2638,11 @@ unsigned int datagram_poll(struct file *file, struct socket *sock,
 			   struct poll_table_struct *wait);
 int skb_copy_datagram_iovec(const struct sk_buff *from, int offset,
 			    struct iovec *to, int size);
+static inline int skb_copy_datagram_msghdr(const struct sk_buff *from,
+					   struct msghdr *msg, int size)
+{
+	return skb_copy_datagram_iovec(from, 0, msg->msg_iov, size);
+}
 int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, int hlen,
 				     struct iovec *iov);
 int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
diff --git a/net/atm/common.c b/net/atm/common.c
index 6a76515..7e42bbe 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -554,7 +554,7 @@ int vcc_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 		msg->msg_flags |= MSG_TRUNC;
 	}
 
-	error = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	error = skb_copy_datagram_msghdr(skb, msg, copied);
 	if (error)
 		return error;
 	sock_recv_ts_and_drops(msg, sk, skb);
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index c35c3f4..a91075c 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1634,7 +1634,7 @@ static int ax25_recvmsg(struct kiocb *iocb, struct socket *sock,
 		msg->msg_flags |= MSG_TRUNC;
 	}
 
-	skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	skb_copy_datagram_msghdr(skb, msg, copied);
 
 	if (msg->msg_name) {
 		ax25_digi digi;
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 339c74a..a68dd75 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -237,7 +237,7 @@ int bt_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
 	}
 
 	skb_reset_transport_header(skb);
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msghdr(skb, msg, copied);
 	if (err == 0) {
 		sock_recv_ts_and_drops(msg, sk, skb);
 
@@ -328,7 +328,7 @@ int bt_sock_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 		}
 
 		chunk = min_t(unsigned int, skb->len, size);
-		if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, chunk)) {
+		if (skb_copy_datagram_msghdr(skb, msg, chunk)) {
 			skb_queue_head(&sk->sk_receive_queue, skb);
 			if (!copied)
 				copied = -EFAULT;
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 115f149..45d4fba 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -878,7 +878,7 @@ static int hci_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
 	}
 
 	skb_reset_transport_header(skb);
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msghdr(skb, msg, copied);
 
 	switch (hci_pi(sk)->channel) {
 	case HCI_CHANNEL_RAW:
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 43f750e..67e63b6 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -293,7 +293,7 @@ static int caif_seqpkt_recvmsg(struct kiocb *iocb, struct socket *sock,
 		copylen = len;
 	}
 
-	ret = skb_copy_datagram_iovec(skb, 0, m->msg_iov, copylen);
+	ret = skb_copy_datagram_msghdr(skb, m, copylen);
 	if (ret)
 		goto out_free;
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 15e0c67..220c791 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2457,7 +2457,7 @@ int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
 		msg->msg_flags |= MSG_TRUNC;
 		copied = len;
 	}
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msghdr(skb, msg, copied);
 	if (err)
 		goto out_free_skb;
 
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 5ab6627..7ccf58f 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -896,7 +896,7 @@ verify_sock_status:
 		else if (len < skb->len)
 			msg->msg_flags |= MSG_TRUNC;
 
-		if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len)) {
+		if (skb_copy_datagram_msghdr(skb, msg, len)) {
 			/* Exception. Bailout! */
 			len = -EFAULT;
 			break;
diff --git a/net/ieee802154/dgram.c b/net/ieee802154/dgram.c
index ef2ad8a..7017055 100644
--- a/net/ieee802154/dgram.c
+++ b/net/ieee802154/dgram.c
@@ -324,7 +324,7 @@ static int dgram_recvmsg(struct kiocb *iocb, struct sock *sk,
 	}
 
 	/* FIXME: skip headers if necessary ?! */
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msghdr(skb, msg, copied);
 	if (err)
 		goto done;
 
diff --git a/net/ieee802154/raw.c b/net/ieee802154/raw.c
index 9d1f648..5dd893a 100644
--- a/net/ieee802154/raw.c
+++ b/net/ieee802154/raw.c
@@ -195,7 +195,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		copied = len;
 	}
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msghdr(skb, msg, copied);
 	if (err)
 		goto done;
 
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index c373a9a..d643882 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -424,7 +424,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
 		msg->msg_flags |= MSG_TRUNC;
 		copied = len;
 	}
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msghdr(skb, msg, copied);
 	if (err)
 		goto out_free_skb;
 
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 57f7c98..a6e0197 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -875,7 +875,7 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	}
 
 	/* Don't bother checking the checksum */
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msghdr(skb, msg, copied);
 	if (err)
 		goto done;
 
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 739db31..2f4bb30 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -718,7 +718,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		copied = len;
 	}
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msghdr(skb, msg, copied);
 	if (err)
 		goto done;
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 39ec0c3..3638679 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1377,7 +1377,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
 	/* XXX -- need to support SO_PEEK_OFF */
 
 	skb_queue_walk(&sk->sk_write_queue, skb) {
-		err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len);
+		err = skb_copy_datagram_msghdr(skb, msg, skb->len);
 		if (err)
 			break;
 
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 2cdc383..4bd84fd 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -351,7 +351,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
 		msg->msg_flags |= MSG_TRUNC;
 		copied = len;
 	}
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msghdr(skb, msg, copied);
 	if (err)
 		goto out_free_skb;
 
@@ -445,7 +445,7 @@ int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len,
 		msg->msg_flags |= MSG_TRUNC;
 		copied = len;
 	}
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msghdr(skb, msg, copied);
 	if (err)
 		goto out_free_skb;
 
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 075a0fb..5d37aa1 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -486,11 +486,11 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk,
 	}
 
 	if (skb_csum_unnecessary(skb)) {
-		err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+		err = skb_copy_datagram_msghdr(skb, msg, copied);
 	} else if (msg->msg_flags&MSG_TRUNC) {
 		if (__skb_checksum_complete(skb))
 			goto csum_copy_err;
-		err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+		err = skb_copy_datagram_msghdr(skb, msg, copied);
 	} else {
 		err = skb_copy_and_csum_datagram_iovec(skb, 0, msg->msg_iov);
 		if (err == -EINVAL)
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 92fafd4..54db6dc 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -1396,7 +1396,7 @@ static int irda_recvmsg_dgram(struct kiocb *iocb, struct socket *sock,
 		copied = size;
 		msg->msg_flags |= MSG_TRUNC;
 	}
-	skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	skb_copy_datagram_msghdr(skb, msg, copied);
 
 	skb_free_datagram(sk, skb);
 
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 1847ec4..f09a848 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -3654,7 +3654,7 @@ static int pfkey_recvmsg(struct kiocb *kiocb,
 	}
 
 	skb_reset_transport_header(skb);
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msghdr(skb, msg, copied);
 	if (err)
 		goto out_free;
 
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 369a982..2913198 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -528,7 +528,7 @@ static int l2tp_ip_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m
 		copied = len;
 	}
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msghdr(skb, msg, copied);
 	if (err)
 		goto done;
 
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 0edb263..8613881 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -672,7 +672,7 @@ static int l2tp_ip6_recvmsg(struct kiocb *iocb, struct sock *sk,
 		copied = len;
 	}
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msghdr(skb, msg, copied);
 	if (err)
 		goto done;
 
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index b704a93..5f3c0f5 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -208,7 +208,7 @@ static int pppol2tp_recvmsg(struct kiocb *iocb, struct socket *sock,
 	else if (len < skb->len)
 		msg->msg_flags |= MSG_TRUNC;
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len);
+	err = skb_copy_datagram_msghdr(skb, msg, len);
 	if (likely(err == 0))
 		err = len;
 
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index f1de72d..123cffd 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2401,7 +2401,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
 	}
 
 	skb_reset_transport_header(data_skb);
-	err = skb_copy_datagram_iovec(data_skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msghdr(data_skb, msg, copied);
 
 	if (msg->msg_name) {
 		DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name);
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index 1b06a1f..6bc3556 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -1167,7 +1167,7 @@ static int nr_recvmsg(struct kiocb *iocb, struct socket *sock,
 		msg->msg_flags |= MSG_TRUNC;
 	}
 
-	er = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	er = skb_copy_datagram_msghdr(skb, msg, copied);
 	if (er < 0) {
 		skb_free_datagram(sk, skb);
 		release_sock(sk);
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index 51f077a..e962c07 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -832,7 +832,7 @@ static int llcp_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
 	copied = min_t(unsigned int, rlen, len);
 
 	cskb = skb;
-	if (skb_copy_datagram_iovec(cskb, 0, msg->msg_iov, copied)) {
+	if (skb_copy_datagram_msghdr(cskb, msg, copied)) {
 		if (!(flags & MSG_PEEK))
 			skb_queue_head(&sk->sk_receive_queue, skb);
 		return -EFAULT;
diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c
index 11c3544..4467b2c 100644
--- a/net/nfc/rawsock.c
+++ b/net/nfc/rawsock.c
@@ -269,7 +269,7 @@ static int rawsock_recvmsg(struct kiocb *iocb, struct socket *sock,
 		copied = len;
 	}
 
-	rc = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	rc = skb_copy_datagram_msghdr(skb, msg, copied);
 
 	skb_free_datagram(sk, skb);
 
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 87d20f4..390b776 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2953,7 +2953,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
 		msg->msg_flags |= MSG_TRUNC;
 	}
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msghdr(skb, msg, copied);
 	if (err)
 		goto out_free;
 
diff --git a/net/phonet/datagram.c b/net/phonet/datagram.c
index 290352c..b4835bc 100644
--- a/net/phonet/datagram.c
+++ b/net/phonet/datagram.c
@@ -150,7 +150,7 @@ static int pn_recvmsg(struct kiocb *iocb, struct sock *sk,
 		copylen = len;
 	}
 
-	rval = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copylen);
+	rval = skb_copy_datagram_msghdr(skb, msg, copylen);
 	if (rval) {
 		rval = -EFAULT;
 		goto out;
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index 70a547e..f544658 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -1296,7 +1296,7 @@ copy:
 	else
 		len = skb->len;
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len);
+	err = skb_copy_datagram_msghdr(skb, msg, len);
 	if (!err)
 		err = (flags & MSG_TRUNC) ? skb->len : len;
 
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index a85c1a0..b660504 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -1249,7 +1249,7 @@ static int rose_recvmsg(struct kiocb *iocb, struct socket *sock,
 		msg->msg_flags |= MSG_TRUNC;
 	}
 
-	skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	skb_copy_datagram_msghdr(skb, msg, copied);
 
 	if (msg->msg_name) {
 		struct sockaddr_rose *srose;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 634a2ab..0fca34c 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2095,7 +2095,7 @@ static int sctp_recvmsg(struct kiocb *iocb, struct sock *sk,
 	if (copied > len)
 		copied = len;
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msghdr(skb, msg, copied);
 
 	event = sctp_skb2event(skb);
 
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 5ad4418..fad0702 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -1335,7 +1335,7 @@ static int x25_recvmsg(struct kiocb *iocb, struct socket *sock,
 	/* Currently, each datagram always contains a complete record */
 	msg->msg_flags |= MSG_EOR;
 
-	rc = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	rc = skb_copy_datagram_msghdr(skb, msg, copied);
 	if (rc)
 		goto out_free_dgram;
 

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* Re: [PATCH 2/4] tun: Use iovec iterators
  2014-11-05  2:49                 ` YOSHIFUJI Hideaki
@ 2014-11-05  3:41                   ` Herbert Xu
  0 siblings, 0 replies; 82+ messages in thread
From: Herbert Xu @ 2014-11-05  3:41 UTC (permalink / raw)
  To: YOSHIFUJI Hideaki
  Cc: viro, davem, netdev, linux-kernel, bcrl, hideaki.yoshifuji

YOSHIFUJI Hideaki <hideaki.yoshifuji@miraclelinux.com> wrote:
>>
>> -             if (len < skb->len + vlan_hlen + vnet_hdr_sz) {
>> +             if (iov_iter_count(iter) < total) {
> 
> I guess this should be: sizeof(pi) + total

Good catch! Here is a third update:

tun: Use iovec iterators

This patch removes the use of skb_copy_datagram_const_iovec in
favour of the iovec iterator-based skb_copy_datagram_iter.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 9dd3746..b4ac4d5 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -71,6 +71,7 @@
 #include <net/rtnetlink.h>
 #include <net/sock.h>
 #include <linux/seq_file.h>
+#include <linux/uio.h>
 
 #include <asm/uaccess.h>
 
@@ -1230,11 +1231,11 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
 static ssize_t tun_put_user(struct tun_struct *tun,
 			    struct tun_file *tfile,
 			    struct sk_buff *skb,
-			    const struct iovec *iv, int len)
+			    struct iov_iter *iter)
 {
 	struct tun_pi pi = { 0, skb->protocol };
-	ssize_t total = 0;
-	int vlan_offset = 0, copied;
+	ssize_t total;
+	int vlan_offset;
 	int vlan_hlen = 0;
 	int vnet_hdr_sz = 0;
 
@@ -1244,23 +1245,25 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 	if (tun->flags & TUN_VNET_HDR)
 		vnet_hdr_sz = tun->vnet_hdr_sz;
 
+	total = skb->len + vlan_hlen + vnet_hdr_sz;
+
 	if (!(tun->flags & TUN_NO_PI)) {
-		if ((len -= sizeof(pi)) < 0)
+		if (iov_iter_count(iter) < sizeof(pi))
 			return -EINVAL;
 
-		if (len < skb->len + vlan_hlen + vnet_hdr_sz) {
+		total += sizeof(pi);
+		if (iov_iter_count(iter) < total) {
 			/* Packet will be striped */
 			pi.flags |= TUN_PKT_STRIP;
 		}
 
-		if (memcpy_toiovecend(iv, (void *) &pi, 0, sizeof(pi)))
+		if (copy_to_iter(&pi, sizeof(pi), iter))
 			return -EFAULT;
-		total += sizeof(pi);
 	}
 
 	if (vnet_hdr_sz) {
 		struct virtio_net_hdr gso = { 0 }; /* no info leak */
-		if ((len -= vnet_hdr_sz) < 0)
+		if (iov_iter_count(iter) < vnet_hdr_sz)
 			return -EINVAL;
 
 		if (skb_is_gso(skb)) {
@@ -1299,17 +1302,12 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 			gso.flags = VIRTIO_NET_HDR_F_DATA_VALID;
 		} /* else everything is zero */
 
-		if (unlikely(memcpy_toiovecend(iv, (void *)&gso, total,
-					       sizeof(gso))))
+		if (copy_to_iter(&gso, sizeof(gso), iter))
 			return -EFAULT;
-		total += vnet_hdr_sz;
 	}
 
-	copied = total;
-	len = min_t(int, skb->len + vlan_hlen, len);
-	total += skb->len + vlan_hlen;
 	if (vlan_hlen) {
-		int copy, ret;
+		int ret;
 		struct {
 			__be16 h_vlan_proto;
 			__be16 h_vlan_TCI;
@@ -1320,36 +1318,32 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 
 		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
 
-		copy = min_t(int, vlan_offset, len);
-		ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy);
-		len -= copy;
-		copied += copy;
-		if (ret || !len)
+		ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset);
+		if (ret || !iov_iter_count(iter))
 			goto done;
 
-		copy = min_t(int, sizeof(veth), len);
-		ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy);
-		len -= copy;
-		copied += copy;
-		if (ret || !len)
+		ret = copy_to_iter(&veth, sizeof(veth), iter);
+		if (ret || !iov_iter_count(iter))
 			goto done;
 	}
 
-	skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len);
+	skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset);
 
 done:
 	tun->dev->stats.tx_packets++;
-	tun->dev->stats.tx_bytes += len;
+	tun->dev->stats.tx_bytes += skb->len + vlan_hlen;
 
 	return total;
 }
 
 static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
-			   const struct iovec *iv, ssize_t len, int noblock)
+			   const struct iovec *iv, unsigned long segs,
+			   ssize_t len, int noblock)
 {
 	struct sk_buff *skb;
 	ssize_t ret = 0;
 	int peeked, err, off = 0;
+	struct iov_iter iter;
 
 	tun_debug(KERN_INFO, tun, "tun_do_read\n");
 
@@ -1362,11 +1356,12 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
 	/* Read frames from queue */
 	skb = __skb_recv_datagram(tfile->socket.sk, noblock ? MSG_DONTWAIT : 0,
 				  &peeked, &off, &err);
-	if (skb) {
-		ret = tun_put_user(tun, tfile, skb, iv, len);
-		kfree_skb(skb);
-	} else
-		ret = err;
+	if (!skb)
+		return ret;
+
+	iov_iter_init(&iter, READ, iv, segs, len);
+	ret = tun_put_user(tun, tfile, skb, &iter);
+	kfree_skb(skb);
 
 	return ret;
 }
@@ -1387,7 +1382,7 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
 		goto out;
 	}
 
-	ret = tun_do_read(tun, tfile, iv, len,
+	ret = tun_do_read(tun, tfile, iv, count, len,
 			  file->f_flags & O_NONBLOCK);
 	ret = min_t(ssize_t, ret, len);
 	if (ret > 0)
@@ -1488,7 +1483,7 @@ static int tun_recvmsg(struct kiocb *iocb, struct socket *sock,
 					 SOL_PACKET, TUN_TX_TIMESTAMP);
 		goto out;
 	}
-	ret = tun_do_read(tun, tfile, m->msg_iov, total_len,
+	ret = tun_do_read(tun, tfile, m->msg_iov, m->msg_iovlen, total_len,
 			  flags & MSG_DONTWAIT);
 	if (ret > total_len) {
 		m->msg_flags |= MSG_TRUNC;

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-05  3:27                       ` David Miller
@ 2014-11-05  3:55                         ` Al Viro
  2014-11-05  4:12                           ` Al Viro
  2014-11-05 20:50                           ` David Miller
  0 siblings, 2 replies; 82+ messages in thread
From: Al Viro @ 2014-11-05  3:55 UTC (permalink / raw)
  To: David Miller; +Cc: herbert, netdev, linux-kernel, bcrl

On Tue, Nov 04, 2014 at 10:27:27PM -0500, David Miller wrote:

> Al, is this the helper you are talking about?

Mostly, except that I kept it 4-argument (and used skb_copy_datagram_msg()
for name).  Matter of taste - the ones you've missed because of that are

net/appletalk/ddp.c:1761:	err = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, copied);
net/ipv4/tcp.c:1836:			err = skb_copy_datagram_iovec(skb, offset,
net/ipv4/udp.c:1284:		err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr),
net/ipv6/udp.c:427:			err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr),
net/ipx/af_ipx.c:1808:	rc = skb_copy_datagram_iovec(skb, sizeof(struct ipxhdr), msg->msg_iov,
net/iucv/af_iucv.c:1358:	if (skb_copy_datagram_iovec(cskb, offset, msg->msg_iov, copied)) {
net/llc/af_llc.c:822:			int rc = skb_copy_datagram_iovec(skb, offset,
net/rxrpc/ar-recvmsg.c:183:		ret = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, copy);
net/tipc/socket.c:1375:		res = skb_copy_datagram_iovec(buf, msg_hdr_sz(msg),
net/tipc/socket.c:1476:		res = skb_copy_datagram_iovec(buf, msg_hdr_sz(msg) + offset,
net/unix/af_unix.c:1828:	err = skb_copy_datagram_iovec(skb, skip, msg->msg_iov, size);
net/unix/af_unix.c:2033:		if (skb_copy_datagram_iovec(skb, UNIXCB(skb).consumed + skip,
net/vmw_vsock/vmci_transport.c:1776:	err = skb_copy_datagram_iovec(skb, sizeof(*dg), msg->msg_iov,

and back then I decided that 13 more converted instances might be worth keeping
it in 4-argument form...

What do you think of the trick with user_msghdr, BTW?

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-05  3:55                         ` Al Viro
@ 2014-11-05  4:12                           ` Al Viro
  2014-11-05 20:51                             ` David Miller
  2014-11-05 20:50                           ` David Miller
  1 sibling, 1 reply; 82+ messages in thread
From: Al Viro @ 2014-11-05  4:12 UTC (permalink / raw)
  To: David Miller; +Cc: herbert, netdev, linux-kernel, bcrl

On Wed, Nov 05, 2014 at 03:55:36AM +0000, Al Viro wrote:
> What do you think of the trick with user_msghdr, BTW?

	PS: what would you prefer the branches to be based on?
git://git.kernel.org/pub/scm/linux/kernel/git/davem/net#master, mainline,
something else?  I can certainly do that as patches over email, the
question is what's best used as base...  FWIW, the analysis I've posted
was in 3.18-rc3 and it looks like it ought to be valid in net#master
as well.

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-04  8:31             ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Herbert Xu
  2014-11-04 14:32               ` Al Viro
@ 2014-11-05 20:24               ` David Miller
  2014-11-06  8:23                 ` Herbert Xu
  2014-11-06  8:27                 ` [PATCH 0/4] Replace skb_copy_datagram_const_iovec with iterator version Herbert Xu
  1 sibling, 2 replies; 82+ messages in thread
From: David Miller @ 2014-11-05 20:24 UTC (permalink / raw)
  To: herbert; +Cc: viro, netdev, linux-kernel, bcrl


Herbert, please provide a cover letter for this series.  Also, the most recent
version of patch #2 gets various rejects when I try to apply it to net-next.

Thanks.

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-05  3:55                         ` Al Viro
  2014-11-05  4:12                           ` Al Viro
@ 2014-11-05 20:50                           ` David Miller
  2014-11-05 21:07                             ` Al Viro
  1 sibling, 1 reply; 82+ messages in thread
From: David Miller @ 2014-11-05 20:50 UTC (permalink / raw)
  To: viro; +Cc: herbert, netdev, linux-kernel, bcrl

From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Wed, 5 Nov 2014 03:55:36 +0000

> On Tue, Nov 04, 2014 at 10:27:27PM -0500, David Miller wrote:
> 
>> Al, is this the helper you are talking about?
> 
> Mostly, except that I kept it 4-argument (and used skb_copy_datagram_msg()
> for name).  Matter of taste - the ones you've missed because of that are
 ...
> and back then I decided that 13 more converted instances might be worth keeping
> it in 4-argument form...

Ok, fixed up patch below:

> What do you think of the trick with user_msghdr, BTW?

I think we can get away with it if, as you say, we don't export a 'msghdr'
from any uapi headers.

And indeed, double checking, it's purely a linux/socket.h thing.

If this patch is OK, mind if I toss it into net-next, Al?

diff --git a/drivers/isdn/mISDN/socket.c b/drivers/isdn/mISDN/socket.c
index 1be8228..dcbd858 100644
--- a/drivers/isdn/mISDN/socket.c
+++ b/drivers/isdn/mISDN/socket.c
@@ -163,7 +163,7 @@ mISDN_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
 	memcpy(skb_push(skb, MISDN_HEADER_LEN), mISDN_HEAD_P(skb),
 	       MISDN_HEADER_LEN);
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 
 	mISDN_sock_cmsg(sk, msg, skb);
 
diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c
index 6c9c16d..443cbbf 100644
--- a/drivers/net/ppp/pppoe.c
+++ b/drivers/net/ppp/pppoe.c
@@ -981,7 +981,7 @@ static int pppoe_recvmsg(struct kiocb *iocb, struct socket *sock,
 
 	if (skb) {
 		total_len = min_t(size_t, total_len, skb->len);
-		error = skb_copy_datagram_iovec(skb, 0, m->msg_iov, total_len);
+		error = skb_copy_datagram_msg(skb, 0, m, total_len);
 		if (error == 0) {
 			consume_skb(skb);
 			return total_len;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 5ad9675..31cdb7e 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -21,6 +21,7 @@
 #include <linux/bug.h>
 #include <linux/cache.h>
 #include <linux/rbtree.h>
+#include <linux/socket.h>
 
 #include <linux/atomic.h>
 #include <asm/types.h>
@@ -2637,6 +2638,11 @@ unsigned int datagram_poll(struct file *file, struct socket *sock,
 			   struct poll_table_struct *wait);
 int skb_copy_datagram_iovec(const struct sk_buff *from, int offset,
 			    struct iovec *to, int size);
+static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset,
+					struct msghdr *msg, int size)
+{
+	return skb_copy_datagram_iovec(from, offset, msg->msg_iov, size);
+}
 int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, int hlen,
 				     struct iovec *iov);
 int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index c00897f..425942d 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1758,7 +1758,7 @@ static int atalk_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr
 		copied = size;
 		msg->msg_flags |= MSG_TRUNC;
 	}
-	err = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, offset, msg, copied);
 
 	if (!err && msg->msg_name) {
 		DECLARE_SOCKADDR(struct sockaddr_at *, sat, msg->msg_name);
diff --git a/net/atm/common.c b/net/atm/common.c
index 6a76515..9cd1cca 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -554,7 +554,7 @@ int vcc_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
 		msg->msg_flags |= MSG_TRUNC;
 	}
 
-	error = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	error = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (error)
 		return error;
 	sock_recv_ts_and_drops(msg, sk, skb);
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index c35c3f4..f4f835e 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1634,7 +1634,7 @@ static int ax25_recvmsg(struct kiocb *iocb, struct socket *sock,
 		msg->msg_flags |= MSG_TRUNC;
 	}
 
-	skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	skb_copy_datagram_msg(skb, 0, msg, copied);
 
 	if (msg->msg_name) {
 		ax25_digi digi;
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 339c74a..0a7cc56 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -237,7 +237,7 @@ int bt_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
 	}
 
 	skb_reset_transport_header(skb);
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err == 0) {
 		sock_recv_ts_and_drops(msg, sk, skb);
 
@@ -328,7 +328,7 @@ int bt_sock_stream_recvmsg(struct kiocb *iocb, struct socket *sock,
 		}
 
 		chunk = min_t(unsigned int, skb->len, size);
-		if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, chunk)) {
+		if (skb_copy_datagram_msg(skb, 0, msg, chunk)) {
 			skb_queue_head(&sk->sk_receive_queue, skb);
 			if (!copied)
 				copied = -EFAULT;
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index 115f149..29e1ec7 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -878,7 +878,7 @@ static int hci_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
 	}
 
 	skb_reset_transport_header(skb);
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 
 	switch (hci_pi(sk)->channel) {
 	case HCI_CHANNEL_RAW:
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 43f750e..fbcd156 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -293,7 +293,7 @@ static int caif_seqpkt_recvmsg(struct kiocb *iocb, struct socket *sock,
 		copylen = len;
 	}
 
-	ret = skb_copy_datagram_iovec(skb, 0, m->msg_iov, copylen);
+	ret = skb_copy_datagram_msg(skb, 0, m, copylen);
 	if (ret)
 		goto out_free;
 
diff --git a/net/core/sock.c b/net/core/sock.c
index 15e0c67..ac56dd0 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -2457,7 +2457,7 @@ int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len,
 		msg->msg_flags |= MSG_TRUNC;
 		copied = len;
 	}
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto out_free_skb;
 
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index 5ab6627..8e6ae94 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -896,7 +896,7 @@ verify_sock_status:
 		else if (len < skb->len)
 			msg->msg_flags |= MSG_TRUNC;
 
-		if (skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len)) {
+		if (skb_copy_datagram_msg(skb, 0, msg, len)) {
 			/* Exception. Bailout! */
 			len = -EFAULT;
 			break;
diff --git a/net/ieee802154/dgram.c b/net/ieee802154/dgram.c
index ef2ad8a..fc9193e 100644
--- a/net/ieee802154/dgram.c
+++ b/net/ieee802154/dgram.c
@@ -324,7 +324,7 @@ static int dgram_recvmsg(struct kiocb *iocb, struct sock *sk,
 	}
 
 	/* FIXME: skip headers if necessary ?! */
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto done;
 
diff --git a/net/ieee802154/raw.c b/net/ieee802154/raw.c
index 9d1f648..73a4d53 100644
--- a/net/ieee802154/raw.c
+++ b/net/ieee802154/raw.c
@@ -195,7 +195,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		copied = len;
 	}
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto done;
 
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index c373a9a..21894df 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -424,7 +424,7 @@ int ip_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
 		msg->msg_flags |= MSG_TRUNC;
 		copied = len;
 	}
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto out_free_skb;
 
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index 57f7c98..736236c 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -875,7 +875,7 @@ int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	}
 
 	/* Don't bother checking the checksum */
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto done;
 
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 739db31..ee8fa4b 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -718,7 +718,7 @@ static int raw_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		copied = len;
 	}
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto done;
 
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 39ec0c3..c239f47 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1377,7 +1377,7 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
 	/* XXX -- need to support SO_PEEK_OFF */
 
 	skb_queue_walk(&sk->sk_write_queue, skb) {
-		err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, skb->len);
+		err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
 		if (err)
 			break;
 
@@ -1833,8 +1833,7 @@ do_prequeue:
 		}
 
 		if (!(flags & MSG_TRUNC)) {
-			err = skb_copy_datagram_iovec(skb, offset,
-						      msg->msg_iov, used);
+			err = skb_copy_datagram_msg(skb, offset, msg, used);
 			if (err) {
 				/* Exception. Bailout! */
 				if (!copied)
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 3f001db..df19027 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1281,8 +1281,8 @@ try_again:
 	}
 
 	if (skb_csum_unnecessary(skb))
-		err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr),
-					      msg->msg_iov, copied);
+		err = skb_copy_datagram_msg(skb, sizeof(struct udphdr),
+					    msg, copied);
 	else {
 		err = skb_copy_and_csum_datagram_iovec(skb,
 						       sizeof(struct udphdr),
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index 2cdc383..5c6996e 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -351,7 +351,7 @@ int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
 		msg->msg_flags |= MSG_TRUNC;
 		copied = len;
 	}
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto out_free_skb;
 
@@ -445,7 +445,7 @@ int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len,
 		msg->msg_flags |= MSG_TRUNC;
 		copied = len;
 	}
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto out_free_skb;
 
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index 075a0fb..0cbcf98 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -486,11 +486,11 @@ static int rawv6_recvmsg(struct kiocb *iocb, struct sock *sk,
 	}
 
 	if (skb_csum_unnecessary(skb)) {
-		err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+		err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	} else if (msg->msg_flags&MSG_TRUNC) {
 		if (__skb_checksum_complete(skb))
 			goto csum_copy_err;
-		err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+		err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	} else {
 		err = skb_copy_and_csum_datagram_iovec(skb, 0, msg->msg_iov);
 		if (err == -EINVAL)
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index f6ba535..9b68092 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -424,8 +424,8 @@ try_again:
 	}
 
 	if (skb_csum_unnecessary(skb))
-		err = skb_copy_datagram_iovec(skb, sizeof(struct udphdr),
-					      msg->msg_iov, copied);
+		err = skb_copy_datagram_msg(skb, sizeof(struct udphdr),
+					    msg, copied);
 	else {
 		err = skb_copy_and_csum_datagram_iovec(skb, sizeof(struct udphdr), msg->msg_iov);
 		if (err == -EINVAL)
diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c
index 313ef46..a0c7536 100644
--- a/net/ipx/af_ipx.c
+++ b/net/ipx/af_ipx.c
@@ -1805,8 +1805,7 @@ static int ipx_recvmsg(struct kiocb *iocb, struct socket *sock,
 		msg->msg_flags |= MSG_TRUNC;
 	}
 
-	rc = skb_copy_datagram_iovec(skb, sizeof(struct ipxhdr), msg->msg_iov,
-				     copied);
+	rc = skb_copy_datagram_msg(skb, sizeof(struct ipxhdr), msg, copied);
 	if (rc)
 		goto out_free;
 	if (skb->tstamp.tv64)
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index 92fafd4..980bc26 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -1396,7 +1396,7 @@ static int irda_recvmsg_dgram(struct kiocb *iocb, struct socket *sock,
 		copied = size;
 		msg->msg_flags |= MSG_TRUNC;
 	}
-	skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	skb_copy_datagram_msg(skb, 0, msg, copied);
 
 	skb_free_datagram(sk, skb);
 
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index a089b6b..057b564 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -1355,7 +1355,7 @@ static int iucv_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
 		sk->sk_shutdown = sk->sk_shutdown | RCV_SHUTDOWN;
 
 	cskb = skb;
-	if (skb_copy_datagram_iovec(cskb, offset, msg->msg_iov, copied)) {
+	if (skb_copy_datagram_msg(cskb, offset, msg, copied)) {
 		if (!(flags & MSG_PEEK))
 			skb_queue_head(&sk->sk_receive_queue, skb);
 		return -EFAULT;
diff --git a/net/key/af_key.c b/net/key/af_key.c
index 1847ec4..e588309 100644
--- a/net/key/af_key.c
+++ b/net/key/af_key.c
@@ -3654,7 +3654,7 @@ static int pfkey_recvmsg(struct kiocb *kiocb,
 	}
 
 	skb_reset_transport_header(skb);
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto out_free;
 
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index 369a982..a6cc1fe 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -528,7 +528,7 @@ static int l2tp_ip_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *m
 		copied = len;
 	}
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto done;
 
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index 0edb263..2177b96 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -672,7 +672,7 @@ static int l2tp_ip6_recvmsg(struct kiocb *iocb, struct sock *sk,
 		copied = len;
 	}
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto done;
 
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index b704a93..c559bcd 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -208,7 +208,7 @@ static int pppol2tp_recvmsg(struct kiocb *iocb, struct socket *sock,
 	else if (len < skb->len)
 		msg->msg_flags |= MSG_TRUNC;
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len);
+	err = skb_copy_datagram_msg(skb, 0, msg, len);
 	if (likely(err == 0))
 		err = len;
 
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index bb9cbc1..af66266 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -819,8 +819,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock,
 			used = len;
 
 		if (!(flags & MSG_TRUNC)) {
-			int rc = skb_copy_datagram_iovec(skb, offset,
-							 msg->msg_iov, used);
+			int rc = skb_copy_datagram_msg(skb, offset, msg, used);
 			if (rc) {
 				/* Exception. Bailout! */
 				if (!copied)
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index f1de72d..580b794 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -2401,7 +2401,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
 	}
 
 	skb_reset_transport_header(data_skb);
-	err = skb_copy_datagram_iovec(data_skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(data_skb, 0, msg, copied);
 
 	if (msg->msg_name) {
 		DECLARE_SOCKADDR(struct sockaddr_nl *, addr, msg->msg_name);
diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c
index 1b06a1f..7e13f6a 100644
--- a/net/netrom/af_netrom.c
+++ b/net/netrom/af_netrom.c
@@ -1167,7 +1167,7 @@ static int nr_recvmsg(struct kiocb *iocb, struct socket *sock,
 		msg->msg_flags |= MSG_TRUNC;
 	}
 
-	er = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	er = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (er < 0) {
 		skb_free_datagram(sk, skb);
 		release_sock(sk);
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index 51f077a..83bc785 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -832,7 +832,7 @@ static int llcp_sock_recvmsg(struct kiocb *iocb, struct socket *sock,
 	copied = min_t(unsigned int, rlen, len);
 
 	cskb = skb;
-	if (skb_copy_datagram_iovec(cskb, 0, msg->msg_iov, copied)) {
+	if (skb_copy_datagram_msg(cskb, 0, msg, copied)) {
 		if (!(flags & MSG_PEEK))
 			skb_queue_head(&sk->sk_receive_queue, skb);
 		return -EFAULT;
diff --git a/net/nfc/rawsock.c b/net/nfc/rawsock.c
index 11c3544..9d7d2b7 100644
--- a/net/nfc/rawsock.c
+++ b/net/nfc/rawsock.c
@@ -269,7 +269,7 @@ static int rawsock_recvmsg(struct kiocb *iocb, struct socket *sock,
 		copied = len;
 	}
 
-	rc = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	rc = skb_copy_datagram_msg(skb, 0, msg, copied);
 
 	skb_free_datagram(sk, skb);
 
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 87d20f4..4cd13d8 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2953,7 +2953,7 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
 		msg->msg_flags |= MSG_TRUNC;
 	}
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (err)
 		goto out_free;
 
diff --git a/net/phonet/datagram.c b/net/phonet/datagram.c
index 290352c..0918bc2 100644
--- a/net/phonet/datagram.c
+++ b/net/phonet/datagram.c
@@ -150,7 +150,7 @@ static int pn_recvmsg(struct kiocb *iocb, struct sock *sk,
 		copylen = len;
 	}
 
-	rval = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copylen);
+	rval = skb_copy_datagram_msg(skb, 0, msg, copylen);
 	if (rval) {
 		rval = -EFAULT;
 		goto out;
diff --git a/net/phonet/pep.c b/net/phonet/pep.c
index 70a547e..44b2123 100644
--- a/net/phonet/pep.c
+++ b/net/phonet/pep.c
@@ -1296,7 +1296,7 @@ copy:
 	else
 		len = skb->len;
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, len);
+	err = skb_copy_datagram_msg(skb, 0, msg, len);
 	if (!err)
 		err = (flags & MSG_TRUNC) ? skb->len : len;
 
diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c
index a85c1a0..9b600c2 100644
--- a/net/rose/af_rose.c
+++ b/net/rose/af_rose.c
@@ -1249,7 +1249,7 @@ static int rose_recvmsg(struct kiocb *iocb, struct socket *sock,
 		msg->msg_flags |= MSG_TRUNC;
 	}
 
-	skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	skb_copy_datagram_msg(skb, 0, msg, copied);
 
 	if (msg->msg_name) {
 		struct sockaddr_rose *srose;
diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c
index e9aaa65..4575485 100644
--- a/net/rxrpc/ar-recvmsg.c
+++ b/net/rxrpc/ar-recvmsg.c
@@ -180,7 +180,7 @@ int rxrpc_recvmsg(struct kiocb *iocb, struct socket *sock,
 		if (copy > len - copied)
 			copy = len - copied;
 
-		ret = skb_copy_datagram_iovec(skb, offset, msg->msg_iov, copy);
+		ret = skb_copy_datagram_msg(skb, offset, msg, copy);
 
 		if (ret < 0)
 			goto copy_error;
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 634a2ab..2120292 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2095,7 +2095,7 @@ static int sctp_recvmsg(struct kiocb *iocb, struct sock *sk,
 	if (copied > len)
 		copied = len;
 
-	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_msg(skb, 0, msg, copied);
 
 	event = sctp_skb2event(skb);
 
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index ad8a1a1..591bbfa 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -1372,8 +1372,7 @@ restart:
 			sz = buf_len;
 			m->msg_flags |= MSG_TRUNC;
 		}
-		res = skb_copy_datagram_iovec(buf, msg_hdr_sz(msg),
-					      m->msg_iov, sz);
+		res = skb_copy_datagram_msg(buf, msg_hdr_sz(msg), m, sz);
 		if (res)
 			goto exit;
 		res = sz;
@@ -1473,8 +1472,8 @@ restart:
 		needed = (buf_len - sz_copied);
 		sz_to_copy = (sz <= needed) ? sz : needed;
 
-		res = skb_copy_datagram_iovec(buf, msg_hdr_sz(msg) + offset,
-					      m->msg_iov, sz_to_copy);
+		res = skb_copy_datagram_msg(buf, msg_hdr_sz(msg) + offset,
+					    m, sz_to_copy);
 		if (res)
 			goto exit;
 
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index e968843..5eee625 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1825,7 +1825,7 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock,
 	else if (size < skb->len - skip)
 		msg->msg_flags |= MSG_TRUNC;
 
-	err = skb_copy_datagram_iovec(skb, skip, msg->msg_iov, size);
+	err = skb_copy_datagram_msg(skb, skip, msg, size);
 	if (err)
 		goto out_free;
 
@@ -2030,8 +2030,8 @@ again:
 		}
 
 		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
-		if (skb_copy_datagram_iovec(skb, UNIXCB(skb).consumed + skip,
-					    msg->msg_iov, chunk)) {
+		if (skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
+					  msg, chunk)) {
 			if (copied == 0)
 				copied = -EFAULT;
 			break;
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index 9bb63ff..a57ddef 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -1773,8 +1773,7 @@ static int vmci_transport_dgram_dequeue(struct kiocb *kiocb,
 	}
 
 	/* Place the datagram payload in the user's iovec. */
-	err = skb_copy_datagram_iovec(skb, sizeof(*dg), msg->msg_iov,
-		payload_len);
+	err = skb_copy_datagram_msg(skb, sizeof(*dg), msg, payload_len);
 	if (err)
 		goto out;
 
diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c
index 5ad4418..59e785b 100644
--- a/net/x25/af_x25.c
+++ b/net/x25/af_x25.c
@@ -1335,7 +1335,7 @@ static int x25_recvmsg(struct kiocb *iocb, struct socket *sock,
 	/* Currently, each datagram always contains a complete record */
 	msg->msg_flags |= MSG_EOR;
 
-	rc = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
+	rc = skb_copy_datagram_msg(skb, 0, msg, copied);
 	if (rc)
 		goto out_free_dgram;
 

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-05  4:12                           ` Al Viro
@ 2014-11-05 20:51                             ` David Miller
  0 siblings, 0 replies; 82+ messages in thread
From: David Miller @ 2014-11-05 20:51 UTC (permalink / raw)
  To: viro; +Cc: herbert, netdev, linux-kernel, bcrl

From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Wed, 5 Nov 2014 04:12:32 +0000

> On Wed, Nov 05, 2014 at 03:55:36AM +0000, Al Viro wrote:
>> What do you think of the trick with user_msghdr, BTW?
> 
> 	PS: where do you prefer the branches to be based off?
> git://git.kernel.org/pub/scm/linux/kernel/git/davem/net#master, mainline,
> something else?  I can certainly do that as patches over email, the
> question is what's best used as base...  FWIW, the analysis I've posted
> was in 3.18-rc3 and it looks like it ought to be valid in net#master
> as well.

Let's work against net-next, ie:

git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next#master

I can integrate your, mine, and Herbert's changes all into the same
place.

Thanks.

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-05 20:50                           ` David Miller
@ 2014-11-05 21:07                             ` Al Viro
  2014-11-05 21:57                               ` David Miller
  0 siblings, 1 reply; 82+ messages in thread
From: Al Viro @ 2014-11-05 21:07 UTC (permalink / raw)
  To: David Miller; +Cc: herbert, netdev, linux-kernel, bcrl

On Wed, Nov 05, 2014 at 03:50:54PM -0500, David Miller wrote:
> I think we can get away with it if, as you say, we don't export a 'msghdr'
> from any uapi headers.

OK.  I'm about halfway through the review of struct msghdr instances in the
current tree right now, will post user_msghdr patch once I'm done.  Already
found a dumb bug in o2net_send_tcp_msg() while doing that - broken by me
back in 3.15 ;-/  Will send a fix to Linus in an hour or so...

> And indeed, double checking, it's purely a linux/socket.h thing.
> 
> If this patch is OK, mind if I toss it into net-next Al?

Sure, no problem - AFAICS, the only real difference from the rebase of the
April one I've quoted upthread is that you add an include of socket.h to
skbuff.h; the rest of the differences are pure whitespace noise.

Ping me when you put it there, OK?  I'll rebase the rest of old stuff on
top of it (similar helpers, mostly).

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-05 21:07                             ` Al Viro
@ 2014-11-05 21:57                               ` David Miller
  2014-11-06  3:25                                 ` Al Viro
  0 siblings, 1 reply; 82+ messages in thread
From: David Miller @ 2014-11-05 21:57 UTC (permalink / raw)
  To: viro; +Cc: herbert, netdev, linux-kernel, bcrl

From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Wed, 5 Nov 2014 21:07:45 +0000

> Ping me when you put it there, OK?  I'll rebase the rest of old stuff on
> top of it (similar helpers, mostly).

I just pushed it into net-next, thanks Al.

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-05 21:57                               ` David Miller
@ 2014-11-06  3:25                                 ` Al Viro
  2014-11-06  5:50                                   ` ipv4: Use standard iovec primitive in raw_probe_proto_opt Herbert Xu
                                                     ` (3 more replies)
  0 siblings, 4 replies; 82+ messages in thread
From: Al Viro @ 2014-11-06  3:25 UTC (permalink / raw)
  To: David Miller; +Cc: herbert, netdev, linux-kernel, bcrl

On Wed, Nov 05, 2014 at 04:57:19PM -0500, David Miller wrote:
> From: Al Viro <viro@ZenIV.linux.org.uk>
> Date: Wed, 5 Nov 2014 21:07:45 +0000
> 
> > Ping me when you put it there, OK?  I'll rebase the rest of old stuff on
> > top of it (similar helpers, mostly).
> 
> I just pushed it into net-next, thanks Al.

OK, I've taken the beginning of the old queue on top of net-next; it's
in git://git.kernel.org//pub/scm/linux/kernel/git/viro/vfs.git iov_iter-net.

From the quick look at the remaining ->msg_iov users:

	* I'll need to add several iov_iter primitives - counterparts of
checksum.h stuff (copy_and_csum_{from,to}_iter(), maybe some more).  Not
a big deal, I'll do that tomorrow.  That will give us a clean iov_iter-based
counterpart of skb_copy_and_csum_datagram_iovec().

	* a new helper: zerocopy_sg_from_iter().  I have it, actually,
but I'd rather not step on Herbert's toes - it's too close to the areas
his series will touch, so that's probably for when his series goes in.
It will be needed for complete macvtap conversion...

	* why doesn't verify_iovec() use rw_copy_check_uvector()?  The only
real differences I see are that (a) you do allocation in callers (same as
rw_copy_check_uvector() would've done), (b) you return EMSGSIZE in case of
a too-long vector, while rw_copy_check_uvector() returns EINVAL in that case
and (c) you don't do access_ok().  The last one is described as optimization,
but for iov_iter primitives it's a serious PITA - for iovec-backed instances
they are using __copy_from_user()/__copy_to_user(), etc.
	It certainly would be nice to have the same code doing all copying
of iovecs from userland - readv/writev/aio/sendmsg/recvmsg/etc.  Am I
missing some subtle semantic difference in there?  EMSGSIZE vs EINVAL
is trivial (we can lift that check into the callers, if nothing else), but
I could miss something more interesting...

	* various getfrag will need to grow iov_iter-based counterparts,
but ip_append_output() needs no changes, AFAICS.

	* crypto stuff will be easy to convert - iov_iter_get_pages()
would suffice for a primitive

	* there's some really weird stuff in there.  Just what is this
static int raw_probe_proto_opt(struct flowi4 *fl4, struct msghdr *msg)
{
        struct iovec *iov;
        u8 __user *type = NULL;
        u8 __user *code = NULL;
        int probed = 0;
        unsigned int i;

        if (!msg->msg_iov)
                return 0;

        for (i = 0; i < msg->msg_iovlen; i++) {
                iov = &msg->msg_iov[i];
                if (!iov)
                        continue;
trying to do?  "If non-NULL pointer + i somehow happened to be NULL, skip it
and try to use the same pointer + i + 1"?  Huh?  Had been that way since
the function first went in back in 2004 ("[IPV4] XFRM: probe icmp type/code
when sending packets via raw socket.", according to historical tree)...

	* rds, bluetooth and vsock are doing something odd; need to RTFS some
more.

	* not sure I understand what TIPC is doing - does it prohibit too
short first segment of ->msg_iov?  net/tipc/socket.c:dest_name_check() looks
odd _and_ potentially racy - we read the same data twice and hope our checks
still apply.  I asked TIPC folks about that race back in April, but it
looks like that fell through the cracks...

Overall, so far it looks more or less feasible - other than the missing csum
primitives, current mm/iov_iter.c should suffice.  I have _not_ seriously
looked into sendpage yet; that might very well require some more.

^ permalink raw reply	[flat|nested] 82+ messages in thread

* ipv4: Use standard iovec primitive in raw_probe_proto_opt
  2014-11-06  3:25                                 ` Al Viro
@ 2014-11-06  5:50                                   ` Herbert Xu
  2014-11-06  6:43                                     ` Al Viro
  2014-11-06  9:50                                   ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Jon Maloy
                                                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 82+ messages in thread
From: Herbert Xu @ 2014-11-06  5:50 UTC (permalink / raw)
  To: Al Viro
  Cc: David Miller, netdev, linux-kernel, bcrl, Masahide Nakamura,
	Hideaki YOSHIFUJI

On Thu, Nov 06, 2014 at 03:25:34AM +0000, Al Viro wrote:
>
> 	* there's some really weird stuff in there.  Just what is this
> static int raw_probe_proto_opt(struct flowi4 *fl4, struct msghdr *msg)
> {

It looks like newbie coding, that's all.  There's nothing tricky
here as far as I can tell.  We're just trying to fetch the ICMP
header to seed the IPsec lookup.

So how about this rewrite? I'm assuming that you're not going
to get rid of memcpy_fromiovecend/memcpy_toiovecend; if you
are, let me know and I'll redo this with iterators.

ipv4: Use standard iovec primitive in raw_probe_proto_opt

The function raw_probe_proto_opt tries to extract the first two
bytes from the user input in order to seed the IPsec lookup for
ICMP packets.  In doing so it's processing iovec by hand and
overcomplicating things.

This patch replaces the manual iovec processing with a call to
memcpy_fromiovecend.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>

diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 739db31..04f67e1 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -422,48 +422,20 @@ error:
 
 static int raw_probe_proto_opt(struct flowi4 *fl4, struct msghdr *msg)
 {
-	struct iovec *iov;
-	u8 __user *type = NULL;
-	u8 __user *code = NULL;
-	int probed = 0;
-	unsigned int i;
+	struct icmphdr icmph;
+	int err;
 
-	if (!msg->msg_iov)
+	if (fl4->flowi4_proto != IPPROTO_ICMP)
 		return 0;
 
-	for (i = 0; i < msg->msg_iovlen; i++) {
-		iov = &msg->msg_iov[i];
-		if (!iov)
-			continue;
-
-		switch (fl4->flowi4_proto) {
-		case IPPROTO_ICMP:
-			/* check if one-byte field is readable or not. */
-			if (iov->iov_base && iov->iov_len < 1)
-				break;
-
-			if (!type) {
-				type = iov->iov_base;
-				/* check if code field is readable or not. */
-				if (iov->iov_len > 1)
-					code = type + 1;
-			} else if (!code)
-				code = iov->iov_base;
-
-			if (type && code) {
-				if (get_user(fl4->fl4_icmp_type, type) ||
-				    get_user(fl4->fl4_icmp_code, code))
-					return -EFAULT;
-				probed = 1;
-			}
-			break;
-		default:
-			probed = 1;
-			break;
-		}
-		if (probed)
-			break;
-	}
+	/* We only need the first two bytes. */
+	err = memcpy_fromiovecend((void *)&icmph, msg->msg_iov, 0, 2);
+	if (err)
+		return err;
+
+	fl4->fl4_icmp_type = icmph.type;
+	fl4->fl4_icmp_code = icmph.code;
+
 	return 0;
 }
 

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* Re: ipv4: Use standard iovec primitive in raw_probe_proto_opt
  2014-11-06  5:50                                   ` ipv4: Use standard iovec primitive in raw_probe_proto_opt Herbert Xu
@ 2014-11-06  6:43                                     ` Al Viro
  2014-11-06  6:46                                       ` Herbert Xu
  0 siblings, 1 reply; 82+ messages in thread
From: Al Viro @ 2014-11-06  6:43 UTC (permalink / raw)
  To: Herbert Xu
  Cc: David Miller, netdev, linux-kernel, bcrl, Masahide Nakamura,
	Hideaki YOSHIFUJI

On Thu, Nov 06, 2014 at 01:50:23PM +0800, Herbert Xu wrote:
> +	/* We only need the first two bytes. */
> +	err = memcpy_fromiovecend((void *)&icmph, msg->msg_iov, 0, 2);
> +	if (err)
> +		return err;
> +
> +	fl4->fl4_icmp_type = icmph.type;
> +	fl4->fl4_icmp_code = icmph.code;

That's more readable, but that exposes another problem in there - we read
the same piece of userland data twice, with no promise whatsoever that we'll
get the same value both times...

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: ipv4: Use standard iovec primitive in raw_probe_proto_opt
  2014-11-06  6:43                                     ` Al Viro
@ 2014-11-06  6:46                                       ` Herbert Xu
  2014-11-06  7:11                                         ` Al Viro
  2014-11-06 21:28                                         ` David Miller
  0 siblings, 2 replies; 82+ messages in thread
From: Herbert Xu @ 2014-11-06  6:46 UTC (permalink / raw)
  To: Al Viro
  Cc: David Miller, netdev, linux-kernel, bcrl, Masahide Nakamura,
	Hideaki YOSHIFUJI

On Thu, Nov 06, 2014 at 06:43:18AM +0000, Al Viro wrote:
> On Thu, Nov 06, 2014 at 01:50:23PM +0800, Herbert Xu wrote:
> > +	/* We only need the first two bytes. */
> > +	err = memcpy_fromiovecend((void *)&icmph, msg->msg_iov, 0, 2);
> > +	if (err)
> > +		return err;
> > +
> > +	fl4->fl4_icmp_type = icmph.type;
> > +	fl4->fl4_icmp_code = icmph.code;
> 
> That's more readable, but that exposes another problem in there - we read
> the same piece of userland data twice, with no promise whatsoever that we'll
> get the same value both times...

Sure, but you have to be root anyway to write to raw sockets.

Patches are welcome :)

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: ipv4: Use standard iovec primitive in raw_probe_proto_opt
  2014-11-06  6:46                                       ` Herbert Xu
@ 2014-11-06  7:11                                         ` Al Viro
  2014-11-06  9:55                                           ` Jon Maloy
  2014-11-06 21:28                                         ` David Miller
  1 sibling, 1 reply; 82+ messages in thread
From: Al Viro @ 2014-11-06  7:11 UTC (permalink / raw)
  To: Herbert Xu
  Cc: David Miller, netdev, linux-kernel, bcrl, Masahide Nakamura,
	Hideaki YOSHIFUJI

On Thu, Nov 06, 2014 at 02:46:29PM +0800, Herbert Xu wrote:
> On Thu, Nov 06, 2014 at 06:43:18AM +0000, Al Viro wrote:
> > On Thu, Nov 06, 2014 at 01:50:23PM +0800, Herbert Xu wrote:
> > > +	/* We only need the first two bytes. */
> > > +	err = memcpy_fromiovecend((void *)&icmph, msg->msg_iov, 0, 2);
> > > +	if (err)
> > > +		return err;
> > > +
> > > +	fl4->fl4_icmp_type = icmph.type;
> > > +	fl4->fl4_icmp_code = icmph.code;
> > 
> > That's more readable, but that exposes another problem in there - we read
> > the same piece of userland data twice, with no promise whatsoever that we'll
> > get the same value both times...
> 
> Sure, but you have to be root anyway to write to raw sockets.

Point, but that might very well be a pattern to watch for - there's at least
one more instance in TIPC (also not exploitable, according to TIPC folks)
and such bugs are easily repeated...

BTW, I've picked the tun and macvtap related bits from another part of old
queue; see vfs.git#untested-macvtap - it's on top of #iov_iter-net and it's
really completely untested.  Back then I was mostly interested in killing
as many ->aio_write() instances as I could, so it's only the "send" side of
things.

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-05 20:24               ` David Miller
@ 2014-11-06  8:23                 ` Herbert Xu
  2014-11-06 17:25                   ` David Miller
  2014-11-06  8:27                 ` [PATCH 0/4] Replace skb_copy_datagram_const_iovec with iterator version Herbert Xu
  1 sibling, 1 reply; 82+ messages in thread
From: Herbert Xu @ 2014-11-06  8:23 UTC (permalink / raw)
  To: David Miller; +Cc: viro, netdev, linux-kernel, bcrl

On Wed, Nov 05, 2014 at 03:24:10PM -0500, David Miller wrote:
> 
> Herbert, please provide a cover letter for this series, and the most recent
> version of patch #2 gets various rejects when I try to apply it to net-next.

Sure, I'll regenerate them.  However, while doing so I noticed that
a number of my patches on tun/macvtap that you have previously set
as accepted are missing from net-next.  Could this be why you got
the rejects?

For example, this patch wasn't in net-next when I just did a pull.

https://patchwork.ozlabs.org/patch/405966/

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 82+ messages in thread

* [PATCH 0/4] Replace skb_copy_datagram_const_iovec with iterator version
  2014-11-05 20:24               ` David Miller
  2014-11-06  8:23                 ` Herbert Xu
@ 2014-11-06  8:27                 ` Herbert Xu
  2014-11-06  8:28                   ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Herbert Xu
                                     ` (3 more replies)
  1 sibling, 4 replies; 82+ messages in thread
From: Herbert Xu @ 2014-11-06  8:27 UTC (permalink / raw)
  To: David Miller; +Cc: viro, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki

Hi Dave:

This patch series adds the helper skb_copy_datagram_iter, which
is meant to replace both skb_copy_datagram_iovec and its evil
twin skb_copy_datagram_const_iovec.

It then converts tun and macvtap over to the new helper and finally
removes skb_copy_datagram_const_iovec which is only used by tun
and macvtap.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 82+ messages in thread

* [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-06  8:27                 ` [PATCH 0/4] Replace skb_copy_datagram_const_iovec with iterator version Herbert Xu
@ 2014-11-06  8:28                   ` Herbert Xu
  2014-11-06 17:30                     ` Al Viro
  2014-11-06  8:28                   ` [PATCH 2/4] tun: Use iovec iterators Herbert Xu
                                     ` (2 subsequent siblings)
  3 siblings, 1 reply; 82+ messages in thread
From: Herbert Xu @ 2014-11-06  8:28 UTC (permalink / raw)
  To: David Miller, viro, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki

This patch adds skb_copy_datagram_iter, which is identical to
skb_copy_datagram_iovec except that it operates on iov_iter
instead of iovec.

Eventually all users of skb_copy_datagram_iovec should switch
over to iov_iter and then we can remove skb_copy_datagram_iovec.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 include/linux/skbuff.h |    3 +
 net/core/datagram.c    |   82 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 85 insertions(+)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 39ec753..a405013 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -150,6 +150,7 @@
 struct net_device;
 struct scatterlist;
 struct pipe_inode_info;
+struct iov_iter;
 
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 struct nf_conntrack {
@@ -2655,6 +2656,8 @@ int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *frm,
 int skb_copy_datagram_const_iovec(const struct sk_buff *from, int offset,
 				  const struct iovec *to, int to_offset,
 				  int size);
+int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
+			   struct iov_iter *to, int size);
 void skb_free_datagram(struct sock *sk, struct sk_buff *skb);
 void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb);
 int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index fdbc9a8..45a9d4d 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -49,6 +49,7 @@
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/pagemap.h>
+#include <linux/uio.h>
 
 #include <net/protocol.h>
 #include <linux/skbuff.h>
@@ -482,6 +483,87 @@ fault:
 EXPORT_SYMBOL(skb_copy_datagram_const_iovec);
 
 /**
+ *	skb_copy_datagram_iter - Copy a datagram to an iovec iterator.
+ *	@skb: buffer to copy
+ *	@offset: offset in the buffer to start copying from
+ *	@to: iovec iterator to copy to
+ *	@len: amount of data to copy from buffer to iovec
+ */
+int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
+			   struct iov_iter *to, int len)
+{
+	int start = skb_headlen(skb);
+	int i, copy = start - offset;
+	struct sk_buff *frag_iter;
+
+	trace_skb_copy_datagram_iovec(skb, len);
+
+	/* Copy header. */
+	if (copy > 0) {
+		if (copy > len)
+			copy = len;
+		if (copy_to_iter(skb->data + offset, copy, to))
+			goto fault;
+		if ((len -= copy) == 0)
+			return 0;
+		offset += copy;
+	}
+
+	/* Copy paged appendix. Hmm... why does this look so complicated? */
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		int end;
+		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+		WARN_ON(start > offset + len);
+
+		end = start + skb_frag_size(frag);
+		if ((copy = end - offset) > 0) {
+			int err;
+			u8  *vaddr;
+			struct page *page = skb_frag_page(frag);
+
+			if (copy > len)
+				copy = len;
+			vaddr = kmap(page);
+			err = copy_to_iter(vaddr + frag->page_offset +
+					   offset - start, copy, to);
+			kunmap(page);
+			if (err)
+				goto fault;
+			if (!(len -= copy))
+				return 0;
+			offset += copy;
+		}
+		start = end;
+	}
+
+	skb_walk_frags(skb, frag_iter) {
+		int end;
+
+		WARN_ON(start > offset + len);
+
+		end = start + frag_iter->len;
+		if ((copy = end - offset) > 0) {
+			if (copy > len)
+				copy = len;
+			if (skb_copy_datagram_iter(frag_iter, offset - start,
+						   to, copy))
+				goto fault;
+			if ((len -= copy) == 0)
+				return 0;
+			offset += copy;
+		}
+		start = end;
+	}
+	if (!len)
+		return 0;
+
+fault:
+	return -EFAULT;
+}
+EXPORT_SYMBOL(skb_copy_datagram_iter);
+
+/**
  *	skb_copy_datagram_from_iovec - Copy a datagram from an iovec.
  *	@skb: buffer to copy
  *	@offset: offset in the buffer to start copying to

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* [PATCH 2/4] tun: Use iovec iterators
  2014-11-06  8:27                 ` [PATCH 0/4] Replace skb_copy_datagram_const_iovec with iterator version Herbert Xu
  2014-11-06  8:28                   ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Herbert Xu
@ 2014-11-06  8:28                   ` Herbert Xu
  2014-11-06  8:28                   ` [PATCH 3/4] macvtap: " Herbert Xu
  2014-11-06  8:28                   ` [PATCH 4/4] net: Kill skb_copy_datagram_const_iovec Herbert Xu
  3 siblings, 0 replies; 82+ messages in thread
From: Herbert Xu @ 2014-11-06  8:28 UTC (permalink / raw)
  To: David Miller, viro, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki

This patch removes the use of skb_copy_datagram_const_iovec in
favour of the iovec iterator-based skb_copy_datagram_iter.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 drivers/net/tun.c |   65 ++++++++++++++++++++++++------------------------------
 1 file changed, 30 insertions(+), 35 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 9dd3746..b4ac4d5 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -71,6 +71,7 @@
 #include <net/rtnetlink.h>
 #include <net/sock.h>
 #include <linux/seq_file.h>
+#include <linux/uio.h>
 
 #include <asm/uaccess.h>
 
@@ -1230,11 +1231,11 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
 static ssize_t tun_put_user(struct tun_struct *tun,
 			    struct tun_file *tfile,
 			    struct sk_buff *skb,
-			    const struct iovec *iv, int len)
+			    struct iov_iter *iter)
 {
 	struct tun_pi pi = { 0, skb->protocol };
-	ssize_t total = 0;
-	int vlan_offset = 0, copied;
+	ssize_t total;
+	int vlan_offset;
 	int vlan_hlen = 0;
 	int vnet_hdr_sz = 0;
 
@@ -1244,23 +1245,25 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 	if (tun->flags & TUN_VNET_HDR)
 		vnet_hdr_sz = tun->vnet_hdr_sz;
 
+	total = skb->len + vlan_hlen + vnet_hdr_sz;
+
 	if (!(tun->flags & TUN_NO_PI)) {
-		if ((len -= sizeof(pi)) < 0)
+		if (iov_iter_count(iter) < sizeof(pi))
 			return -EINVAL;
 
-		if (len < skb->len + vlan_hlen + vnet_hdr_sz) {
+		total += sizeof(pi);
+		if (iov_iter_count(iter) < total) {
 			/* Packet will be striped */
 			pi.flags |= TUN_PKT_STRIP;
 		}
 
-		if (memcpy_toiovecend(iv, (void *) &pi, 0, sizeof(pi)))
+		if (copy_to_iter(&pi, sizeof(pi), iter))
 			return -EFAULT;
-		total += sizeof(pi);
 	}
 
 	if (vnet_hdr_sz) {
 		struct virtio_net_hdr gso = { 0 }; /* no info leak */
-		if ((len -= vnet_hdr_sz) < 0)
+		if (iov_iter_count(iter) < vnet_hdr_sz)
 			return -EINVAL;
 
 		if (skb_is_gso(skb)) {
@@ -1299,17 +1302,12 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 			gso.flags = VIRTIO_NET_HDR_F_DATA_VALID;
 		} /* else everything is zero */
 
-		if (unlikely(memcpy_toiovecend(iv, (void *)&gso, total,
-					       sizeof(gso))))
+		if (copy_to_iter(&gso, sizeof(gso), iter))
 			return -EFAULT;
-		total += vnet_hdr_sz;
 	}
 
-	copied = total;
-	len = min_t(int, skb->len + vlan_hlen, len);
-	total += skb->len + vlan_hlen;
 	if (vlan_hlen) {
-		int copy, ret;
+		int ret;
 		struct {
 			__be16 h_vlan_proto;
 			__be16 h_vlan_TCI;
@@ -1320,36 +1318,32 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 
 		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
 
-		copy = min_t(int, vlan_offset, len);
-		ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy);
-		len -= copy;
-		copied += copy;
-		if (ret || !len)
+		ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset);
+		if (ret || !iov_iter_count(iter))
 			goto done;
 
-		copy = min_t(int, sizeof(veth), len);
-		ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy);
-		len -= copy;
-		copied += copy;
-		if (ret || !len)
+		ret = copy_to_iter(&veth, sizeof(veth), iter);
+		if (ret || !iov_iter_count(iter))
 			goto done;
 	}
 
-	skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len);
+	skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset);
 
 done:
 	tun->dev->stats.tx_packets++;
-	tun->dev->stats.tx_bytes += len;
+	tun->dev->stats.tx_bytes += skb->len + vlan_hlen;
 
 	return total;
 }
 
 static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
-			   const struct iovec *iv, ssize_t len, int noblock)
+			   const struct iovec *iv, unsigned long segs,
+			   ssize_t len, int noblock)
 {
 	struct sk_buff *skb;
 	ssize_t ret = 0;
 	int peeked, err, off = 0;
+	struct iov_iter iter;
 
 	tun_debug(KERN_INFO, tun, "tun_do_read\n");
 
@@ -1362,11 +1356,12 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
 	/* Read frames from queue */
 	skb = __skb_recv_datagram(tfile->socket.sk, noblock ? MSG_DONTWAIT : 0,
 				  &peeked, &off, &err);
-	if (skb) {
-		ret = tun_put_user(tun, tfile, skb, iv, len);
-		kfree_skb(skb);
-	} else
-		ret = err;
+	if (!skb)
+		return ret;
+
+	iov_iter_init(&iter, READ, iv, segs, len);
+	ret = tun_put_user(tun, tfile, skb, &iter);
+	kfree_skb(skb);
 
 	return ret;
 }
@@ -1387,7 +1382,7 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
 		goto out;
 	}
 
-	ret = tun_do_read(tun, tfile, iv, len,
+	ret = tun_do_read(tun, tfile, iv, count, len,
 			  file->f_flags & O_NONBLOCK);
 	ret = min_t(ssize_t, ret, len);
 	if (ret > 0)
@@ -1488,7 +1483,7 @@ static int tun_recvmsg(struct kiocb *iocb, struct socket *sock,
 					 SOL_PACKET, TUN_TX_TIMESTAMP);
 		goto out;
 	}
-	ret = tun_do_read(tun, tfile, m->msg_iov, total_len,
+	ret = tun_do_read(tun, tfile, m->msg_iov, m->msg_iovlen, total_len,
 			  flags & MSG_DONTWAIT);
 	if (ret > total_len) {
 		m->msg_flags |= MSG_TRUNC;

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* [PATCH 3/4] macvtap: Use iovec iterators
  2014-11-06  8:27                 ` [PATCH 0/4] Replace skb_copy_datagram_const_iovec with iterator version Herbert Xu
  2014-11-06  8:28                   ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Herbert Xu
  2014-11-06  8:28                   ` [PATCH 2/4] tun: Use iovec iterators Herbert Xu
@ 2014-11-06  8:28                   ` Herbert Xu
  2014-11-06 17:33                     ` Al Viro
  2014-11-06  8:28                   ` [PATCH 4/4] net: Kill skb_copy_datagram_const_iovec Herbert Xu
  3 siblings, 1 reply; 82+ messages in thread
From: Herbert Xu @ 2014-11-06  8:28 UTC (permalink / raw)
  To: David Miller, viro, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki

This patch removes the use of skb_copy_datagram_const_iovec in
favour of the iovec iterator-based skb_copy_datagram_iter.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 drivers/net/macvtap.c |   45 ++++++++++++++++++++-------------------------
 1 file changed, 20 insertions(+), 25 deletions(-)

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 880cc09..a0e1dd7 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -15,6 +15,7 @@
 #include <linux/cdev.h>
 #include <linux/idr.h>
 #include <linux/fs.h>
+#include <linux/uio.h>
 
 #include <net/ipv6.h>
 #include <net/net_namespace.h>
@@ -778,31 +779,28 @@ static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
 /* Put packet to the user space buffer */
 static ssize_t macvtap_put_user(struct macvtap_queue *q,
 				const struct sk_buff *skb,
-				const struct iovec *iv, int len)
+				struct iov_iter *iter)
 {
 	int ret;
 	int vnet_hdr_len = 0;
 	int vlan_offset = 0;
-	int copied, total;
+	int total;
 
 	if (q->flags & IFF_VNET_HDR) {
 		struct virtio_net_hdr vnet_hdr;
 		vnet_hdr_len = q->vnet_hdr_sz;
-		if ((len -= vnet_hdr_len) < 0)
+		if (iov_iter_count(iter) < vnet_hdr_len)
 			return -EINVAL;
 
 		macvtap_skb_to_vnet_hdr(skb, &vnet_hdr);
 
-		if (memcpy_toiovecend(iv, (void *)&vnet_hdr, 0, sizeof(vnet_hdr)))
+		if (copy_to_iter(&vnet_hdr, sizeof(vnet_hdr), iter))
 			return -EFAULT;
 	}
-	total = copied = vnet_hdr_len;
+	total = vnet_hdr_len;
 	total += skb->len;
 
-	if (!vlan_tx_tag_present(skb))
-		len = min_t(int, skb->len, len);
-	else {
-		int copy;
+	if (vlan_tx_tag_present(skb)) {
 		struct {
 			__be16 h_vlan_proto;
 			__be16 h_vlan_TCI;
@@ -811,37 +809,33 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q,
 		veth.h_vlan_TCI = htons(vlan_tx_tag_get(skb));
 
 		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
-		len = min_t(int, skb->len + VLAN_HLEN, len);
 		total += VLAN_HLEN;
 
-		copy = min_t(int, vlan_offset, len);
-		ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy);
-		len -= copy;
-		copied += copy;
-		if (ret || !len)
+		ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset);
+		if (ret || !iov_iter_count(iter))
 			goto done;
 
-		copy = min_t(int, sizeof(veth), len);
-		ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy);
-		len -= copy;
-		copied += copy;
-		if (ret || !len)
+		ret = copy_to_iter(&veth, sizeof(veth), iter);
+		if (ret || !iov_iter_count(iter))
 			goto done;
 	}
 
-	ret = skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len);
+	ret = skb_copy_datagram_iter(skb, vlan_offset, iter,
+				     skb->len - vlan_offset);
 
 done:
 	return ret ? ret : total;
 }
 
 static ssize_t macvtap_do_read(struct macvtap_queue *q,
-			       const struct iovec *iv, unsigned long len,
+			       const struct iovec *iv, unsigned long segs,
+			       unsigned long len,
 			       int noblock)
 {
 	DEFINE_WAIT(wait);
 	struct sk_buff *skb;
 	ssize_t ret = 0;
+	struct iov_iter iter;
 
 	while (len) {
 		if (!noblock)
@@ -863,7 +857,8 @@ static ssize_t macvtap_do_read(struct macvtap_queue *q,
 			schedule();
 			continue;
 		}
-		ret = macvtap_put_user(q, skb, iv, len);
+		iov_iter_init(&iter, READ, iv, segs, len);
+		ret = macvtap_put_user(q, skb, &iter);
 		kfree_skb(skb);
 		break;
 	}
@@ -886,7 +881,7 @@ static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
 		goto out;
 	}
 
-	ret = macvtap_do_read(q, iv, len, file->f_flags & O_NONBLOCK);
+	ret = macvtap_do_read(q, iv, count, len, file->f_flags & O_NONBLOCK);
 	ret = min_t(ssize_t, ret, len);
 	if (ret > 0)
 		iocb->ki_pos = ret;
@@ -1117,7 +1112,7 @@ static int macvtap_recvmsg(struct kiocb *iocb, struct socket *sock,
 	int ret;
 	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
 		return -EINVAL;
-	ret = macvtap_do_read(q, m->msg_iov, total_len,
+	ret = macvtap_do_read(q, m->msg_iov, m->msg_iovlen, total_len,
 			  flags & MSG_DONTWAIT);
 	if (ret > total_len) {
 		m->msg_flags |= MSG_TRUNC;

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* [PATCH 4/4] net: Kill skb_copy_datagram_const_iovec
  2014-11-06  8:27                 ` [PATCH 0/4] Replace skb_copy_datagram_const_iovec with iterator version Herbert Xu
                                     ` (2 preceding siblings ...)
  2014-11-06  8:28                   ` [PATCH 3/4] macvtap: " Herbert Xu
@ 2014-11-06  8:28                   ` Herbert Xu
  3 siblings, 0 replies; 82+ messages in thread
From: Herbert Xu @ 2014-11-06  8:28 UTC (permalink / raw)
  To: David Miller, viro, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki

Now that both macvtap and tun are using skb_copy_datagram_iter, we
can kill the abomination that is skb_copy_datagram_const_iovec.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 include/linux/skbuff.h |    3 -
 net/core/datagram.c    |   89 -------------------------------------------------
 2 files changed, 92 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index a405013..da59580 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2653,9 +2653,6 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
 				 int len);
 int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *frm,
 			   int offset, size_t count);
-int skb_copy_datagram_const_iovec(const struct sk_buff *from, int offset,
-				  const struct iovec *to, int to_offset,
-				  int size);
 int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
 			   struct iov_iter *to, int size);
 void skb_free_datagram(struct sock *sk, struct sk_buff *skb);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 45a9d4d..93054b9 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -394,95 +394,6 @@ fault:
 EXPORT_SYMBOL(skb_copy_datagram_iovec);
 
 /**
- *	skb_copy_datagram_const_iovec - Copy a datagram to an iovec.
- *	@skb: buffer to copy
- *	@offset: offset in the buffer to start copying from
- *	@to: io vector to copy to
- *	@to_offset: offset in the io vector to start copying to
- *	@len: amount of data to copy from buffer to iovec
- *
- *	Returns 0 or -EFAULT.
- *	Note: the iovec is not modified during the copy.
- */
-int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset,
-				  const struct iovec *to, int to_offset,
-				  int len)
-{
-	int start = skb_headlen(skb);
-	int i, copy = start - offset;
-	struct sk_buff *frag_iter;
-
-	/* Copy header. */
-	if (copy > 0) {
-		if (copy > len)
-			copy = len;
-		if (memcpy_toiovecend(to, skb->data + offset, to_offset, copy))
-			goto fault;
-		if ((len -= copy) == 0)
-			return 0;
-		offset += copy;
-		to_offset += copy;
-	}
-
-	/* Copy paged appendix. Hmm... why does this look so complicated? */
-	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
-		int end;
-		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-
-		WARN_ON(start > offset + len);
-
-		end = start + skb_frag_size(frag);
-		if ((copy = end - offset) > 0) {
-			int err;
-			u8  *vaddr;
-			struct page *page = skb_frag_page(frag);
-
-			if (copy > len)
-				copy = len;
-			vaddr = kmap(page);
-			err = memcpy_toiovecend(to, vaddr + frag->page_offset +
-						offset - start, to_offset, copy);
-			kunmap(page);
-			if (err)
-				goto fault;
-			if (!(len -= copy))
-				return 0;
-			offset += copy;
-			to_offset += copy;
-		}
-		start = end;
-	}
-
-	skb_walk_frags(skb, frag_iter) {
-		int end;
-
-		WARN_ON(start > offset + len);
-
-		end = start + frag_iter->len;
-		if ((copy = end - offset) > 0) {
-			if (copy > len)
-				copy = len;
-			if (skb_copy_datagram_const_iovec(frag_iter,
-							  offset - start,
-							  to, to_offset,
-							  copy))
-				goto fault;
-			if ((len -= copy) == 0)
-				return 0;
-			offset += copy;
-			to_offset += copy;
-		}
-		start = end;
-	}
-	if (!len)
-		return 0;
-
-fault:
-	return -EFAULT;
-}
-EXPORT_SYMBOL(skb_copy_datagram_const_iovec);
-
-/**
  *	skb_copy_datagram_iter - Copy a datagram to an iovec iterator.
  *	@skb: buffer to copy
  *	@offset: offset in the buffer to start copying from

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* RE: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-06  3:25                                 ` Al Viro
  2014-11-06  5:50                                   ` ipv4: Use standard iovec primitive in raw_probe_proto_opt Herbert Xu
@ 2014-11-06  9:50                                   ` Jon Maloy
  2014-11-07 21:48                                   ` David Miller
  2014-11-07 21:52                                   ` David Miller
  3 siblings, 0 replies; 82+ messages in thread
From: Jon Maloy @ 2014-11-06  9:50 UTC (permalink / raw)
  To: Al Viro, David Miller; +Cc: herbert, netdev, linux-kernel, bcrl



> -----Original Message-----
> From: netdev-owner@vger.kernel.org [mailto:netdev-
> owner@vger.kernel.org] On Behalf Of Al Viro
> Sent: November-06-14 4:26 AM
> To: David Miller
> Cc: herbert@gondor.apana.org.au; netdev@vger.kernel.org; linux-
> kernel@vger.kernel.org; bcrl@kvack.org
> Subject: Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
> 
> On Wed, Nov 05, 2014 at 04:57:19PM -0500, David Miller wrote:
> > From: Al Viro <viro@ZenIV.linux.org.uk>
> > Date: Wed, 5 Nov 2014 21:07:45 +0000
> >
> > > Ping me when you put it there, OK?  I'll rebase the rest of old
> > > stuff on top of it (similar helpers, mostly).
> >
> > I just pushed it into net-next, thanks Al.
> 
> OK, I've taken the beginning of the old queue on top of net-next; it's in
> git://git.kernel.org//pub/scm/linux/kernel/git/viro/vfs.git iov_iter-net.
> 
> From the quick look at the remaining ->msg_iov users:
> 
> 	* I'll need to add several iov_iter primitives - counterparts of
> checksum.h stuff (copy_and_csum_{from,to}_iter(), maybe some more).
> Not a big deal, I'll do that tomorrow.  That will give us a clean iov_iter-based
> counterpart of skb_copy_and_csum_datagram_iovec().
> 
> 	* a new helper: zerocopy_sg_from_iter().  I have it, actually, but I'd
> rather not step on Herbert's toes - it's too close to the areas his series will
> touch, so that's probably for when his series goes in.
> It will be needed for complete macvtap conversion...
> 
> 	* why doesn't verify_iovec() use rw_copy_check_uvector()?  The
> only real differences I see is that (a) you do allocation in callers (same as
> rw_copy_check_uvector() would've done), (b) you return EMSGSIZE in case
> of too long vector, while rw_copy_check_uvector() returns EINVAL in that
> case and (c) you don't do access_ok().  The last one is described as
> optimization, but for iov_iter primitives it's a serious PITA - for iovec-backed
> instances they are using __copy_from_user()/__copy_to_user(), etc.
> 	It certainly would be nice to have the same code doing all copying of
> iovecs from userland - readv/writev/aio/sendmsg/recvmsg/etc.  Am I
> missing some subtle semantic difference in there?  EMSGSIZE vs
> EINVAL is trivial (we can lift that check into the callers, if nothing else), but I
> could miss something more interesting...
> 
> 	* various getfrag will need to grow iov_iter-based counterparts, but
> ip_append_output() needs no changes, AFAICS.
> 
> 	* crypto stuff will be easy to convert - iov_iter_get_pages() would
> suffice for a primitive
> 
> 	* there's some really weird stuff in there.  Just what is this static int
> raw_probe_proto_opt(struct flowi4 *fl4, struct msghdr *msg) {
>         struct iovec *iov;
>         u8 __user *type = NULL;
>         u8 __user *code = NULL;
>         int probed = 0;
>         unsigned int i;
> 
>         if (!msg->msg_iov)
>                 return 0;
> 
>         for (i = 0; i < msg->msg_iovlen; i++) {
>                 iov = &msg->msg_iov[i];
>                 if (!iov)
>                         continue;
> trying to do?  "If non-NULL pointer + i somehow happened to be NULL, skip it
> and try to use the same pointer + i + 1"?  Huh?  Had been that way since the
> function first went in back in 2004 ("[IPV4] XFRM: probe icmp type/code
> when sending packets via raw socket.", according to historical tree)...
> 
> 	* rds, bluetooth and vsock are doing something odd; need to RTFS
> some more.
> 
> 	* not sure I understand what TIPC is doing - does it prohibit too short
> first segment of ->msg_iov?  

Yes, that is the purpose. It was needed in early versions of TIPC, which was
using TIPC itself, instead of netlink, as carrier of configuration commands.
This option is long gone, and we can safely remove those checks.  I'll
post a patch.

///jon

> net/tipc/socket.c:dest_name_check() looks
> odd _and_ potentially racy - we read the same data twice and hope our
> checks still apply.  I asked TIPC folks about that race back in April, but it looks
> like that fell through the cracks...
> 
> Overall, so far it looks more or less feasible - other than the missing csum
> primitives, current mm/iov_iter.c should suffice.  I have _not_ seriously
> looked into sendpage yet; that might very well require some more.
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in the body
> of a message to majordomo@vger.kernel.org More majordomo info at
> http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 82+ messages in thread

* RE: ipv4: Use standard iovec primitive in raw_probe_proto_opt
  2014-11-06  7:11                                         ` Al Viro
@ 2014-11-06  9:55                                           ` Jon Maloy
  2014-11-06 22:16                                             ` Al Viro
  0 siblings, 1 reply; 82+ messages in thread
From: Jon Maloy @ 2014-11-06  9:55 UTC (permalink / raw)
  To: Al Viro, Herbert Xu
  Cc: David Miller, netdev, linux-kernel, bcrl, Masahide Nakamura,
	Hideaki YOSHIFUJI



> -----Original Message-----
> From: netdev-owner@vger.kernel.org [mailto:netdev-
> owner@vger.kernel.org] On Behalf Of Al Viro
> Sent: November-06-14 8:11 AM
> To: Herbert Xu
> Cc: David Miller; netdev@vger.kernel.org; linux-kernel@vger.kernel.org;
> bcrl@kvack.org; Masahide Nakamura; Hideaki YOSHIFUJI
> Subject: Re: ipv4: Use standard iovec primitive in raw_probe_proto_opt
> 
> On Thu, Nov 06, 2014 at 02:46:29PM +0800, Herbert Xu wrote:
> > On Thu, Nov 06, 2014 at 06:43:18AM +0000, Al Viro wrote:
> > > On Thu, Nov 06, 2014 at 01:50:23PM +0800, Herbert Xu wrote:
> > > > +	/* We only need the first two bytes. */
> > > > +	err = memcpy_fromiovecend((void *)&icmph, msg->msg_iov, 0, 2);
> > > > +	if (err)
> > > > +		return err;
> > > > +
> > > > +	fl4->fl4_icmp_type = icmph.type;
> > > > +	fl4->fl4_icmp_code = icmph.code;
> > >
> > > That's more readable, but that exposes another problem in there - we
> > > read the same piece of userland data twice, with no promise
> > > whatsoever that we'll get the same value both times...
> >
> > Sure, but you have to be root anyway to write to raw sockets.
> 
> Point, but that might very well be a pattern to watch for - there's at least one
> more instance in TIPC (also not exploitable, according to TIPC folks) and such

I don't recall this, and I can't see where it would be either. Can you please
point to where it is?

///jon

> bugs are easily repeated...
> 
> BTW, I've picked the tun and macvtap related bits from another part of old
> queue; see vfs.git#untested-macvtap - it's on top of #iov_iter-net and it's
> really completely untested.  Back then I was mostly interested in killing as
> many ->aio_write() instances as I could, so it's only the "send" side of things.
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in the body
> of a message to majordomo@vger.kernel.org More majordomo info at
> http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-06  8:23                 ` Herbert Xu
@ 2014-11-06 17:25                   ` David Miller
  2014-11-07  1:59                     ` Herbert Xu
  0 siblings, 1 reply; 82+ messages in thread
From: David Miller @ 2014-11-06 17:25 UTC (permalink / raw)
  To: herbert; +Cc: viro, netdev, linux-kernel, bcrl

From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 6 Nov 2014 16:23:38 +0800

> On Wed, Nov 05, 2014 at 03:24:10PM -0500, David Miller wrote:
>> 
>> Herbert, please provide a cover letter for this series, and the most recent
>> version of patch #2 gets various rejects when I try to apply it to net-next.
> 
> Sure, I'll regenerate them.  However, while doing so I noticed that
> a number of my patches on tun/macvtap that you have previously set
> as accepted are missing from net-next.  Could this be why you got
> the rejects?

Those were bug fixes so went into plain 'net', they will show up next
time I do a merge and I will deal with the conflicts, if any.

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-06  8:28                   ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Herbert Xu
@ 2014-11-06 17:30                     ` Al Viro
  2014-11-07  1:58                       ` Herbert Xu
  0 siblings, 1 reply; 82+ messages in thread
From: Al Viro @ 2014-11-06 17:30 UTC (permalink / raw)
  To: Herbert Xu; +Cc: David Miller, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki

On Thu, Nov 06, 2014 at 04:28:18PM +0800, Herbert Xu wrote:
> +		if (copy_to_iter(skb->data + offset, copy, to))
> +			goto fault;

Sorry, no - copy_to_iter() returns the number of bytes copied, not 0 or -EFAULT.

> +			vaddr = kmap(page);
> +			err = copy_to_iter(vaddr + frag->page_offset +
> +					   offset - start, copy, to);
> +			kunmap(page);
> +			if (err)
> +				goto fault;

And that one should be
			copied = copy_page_to_iter(page, frag->page_offset +
					   offset - start, copy, to);
			if (copied != copy)
				goto fault;

Don't bother with kmap(), vaddr and all that shite.  The primitive is
	copy_page_to_iter(page, offset_in_page, nbytes, iter)
it does all needed kmap itself and it's smart enough to use kmap_atomic
when it can get away with that.  Similar for copy_page_from_iter().

Both of those (as well as copy_{to,from}_iter()) advance iov_iter and return
the number of bytes actually copied.  So the check for EFAULT is "it has copied
less than you've asked it to copy *and* you haven't run out of that iov_iter".
The second part is guaranteed to be true in this case - your code makes sure
that 'copy' is no more than the space left in iterator.

In general, this check would be spelled
			if (copied != copy && iov_iter_count(to))
				goto fault;

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 3/4] macvtap: Use iovec iterators
  2014-11-06  8:28                   ` [PATCH 3/4] macvtap: " Herbert Xu
@ 2014-11-06 17:33                     ` Al Viro
  0 siblings, 0 replies; 82+ messages in thread
From: Al Viro @ 2014-11-06 17:33 UTC (permalink / raw)
  To: Herbert Xu; +Cc: David Miller, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki

On Thu, Nov 06, 2014 at 04:28:20PM +0800, Herbert Xu wrote:

> +		if (copy_to_iter(&vnet_hdr, sizeof(vnet_hdr), iter))
>  			return -EFAULT;

Again, wrong calling conventions.  It returns how much it has copied.

> +		ret = copy_to_iter(&veth, sizeof(veth), iter);
> +		if (ret || !iov_iter_count(iter))
>  			goto done;
Ditto.

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: ipv4: Use standard iovec primitive in raw_probe_proto_opt
  2014-11-06  6:46                                       ` Herbert Xu
  2014-11-06  7:11                                         ` Al Viro
@ 2014-11-06 21:28                                         ` David Miller
  2014-11-07  2:00                                           ` Herbert Xu
  1 sibling, 1 reply; 82+ messages in thread
From: David Miller @ 2014-11-06 21:28 UTC (permalink / raw)
  To: herbert; +Cc: viro, netdev, linux-kernel, bcrl, nakam, yoshfuji

From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Thu, 6 Nov 2014 14:46:29 +0800

> On Thu, Nov 06, 2014 at 06:43:18AM +0000, Al Viro wrote:
>> On Thu, Nov 06, 2014 at 01:50:23PM +0800, Herbert Xu wrote:
>> > +	/* We only need the first two bytes. */
>> > +	err = memcpy_fromiovecend((void *)&icmph, msg->msg_iov, 0, 2);
>> > +	if (err)
>> > +		return err;
>> > +
>> > +	fl4->fl4_icmp_type = icmph.type;
>> > +	fl4->fl4_icmp_code = icmph.code;
>> 
>> That's more readable, but that exposes another problem in there - we read
>> the same piece of userland data twice, with no promise whatsoever that we'll
>> get the same value both times...
> 
> Sure, but you have to be root anyway to write to raw sockets.
> 
> Patches are welcome :)

I'd agree with this root-only argument maybe 15 years ago, but with
containers and stuff like that we want to prevent root X from messing
up the machine for root Y.

This is a recurring topic, and I'd strongly like to avoid adding new
ways that these kinds of problems can happen.

For example, I'm still on the hook to address the AF_NETLINK mmap TX
code, which has a similarly abusable issue.

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: ipv4: Use standard iovec primitive in raw_probe_proto_opt
  2014-11-06  9:55                                           ` Jon Maloy
@ 2014-11-06 22:16                                             ` Al Viro
  2014-11-28  5:14                                               ` Al Viro
  0 siblings, 1 reply; 82+ messages in thread
From: Al Viro @ 2014-11-06 22:16 UTC (permalink / raw)
  To: Jon Maloy
  Cc: Herbert Xu, David Miller, netdev, linux-kernel, bcrl,
	Masahide Nakamura, Hideaki YOSHIFUJI

On Thu, Nov 06, 2014 at 09:55:31AM +0000, Jon Maloy wrote:
> > Point, but that might very well be a pattern to watch for - there's at least one
> > more instance in TIPC (also not exploitable, according to TIPC folks) and such
> 
> I don't recall this, and I can't see where it would be either. Can you please
> point to where it is?

The same dest_name_check() thing.  This
        if (copy_from_user(&hdr, m->msg_iov[0].iov_base, sizeof(hdr)))
                return -EFAULT;
        if ((ntohs(hdr.tcm_type) & 0xC000) && (!capable(CAP_NET_ADMIN)))
                return -EACCES;
is easily bypassed.  Suppose you want to send a packet with these two
bits in ->tcm_type not being 00, and you don't have CAP_NET_ADMIN.
Not a problem - spawn two threads sharing memory, have one trying to
call sendmsg() while another keeps flipping these two bits.  Sooner
or later you'll get the timing right and have these bits observed as 00
in dest_name_check() and 11 when it comes to memcpy_fromiovecend() actually
copying the whole thing.  And considering that the interval between those
two is much longer than the loop in the second thread would take on
each iteration, I'd expect the odds around 25% per attempted sendmsg().

IOW, this test is either pointless and can be removed completely, or there's
an exploitable race.  As far as I understand from your replies both back then
and in another branch of this thread, it's the former and the proper fix is
to remove at least that part of dest_name_check().  So this case is also
not something exploitable, but it certainly matches the same pattern.

My point was simply that this pattern is worth watching for - recurrent bug
classes like that have a good chance to spawn an instance that will be
exploitable.

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-06 17:30                     ` Al Viro
@ 2014-11-07  1:58                       ` Herbert Xu
  0 siblings, 0 replies; 82+ messages in thread
From: Herbert Xu @ 2014-11-07  1:58 UTC (permalink / raw)
  To: Al Viro; +Cc: David Miller, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki

On Thu, Nov 06, 2014 at 05:30:12PM +0000, Al Viro wrote:
> On Thu, Nov 06, 2014 at 04:28:18PM +0800, Herbert Xu wrote:
> > +		if (copy_to_iter(skb->data + offset, copy, to))
> > +			goto fault;
> 
> Sorry, no - copy_to_iter() returns the number of bytes copied, not 0 or -EFAULT.
>
> > +			vaddr = kmap(page);
> > +			err = copy_to_iter(vaddr + frag->page_offset +
> > +					   offset - start, copy, to);
> > +			kunmap(page);
> > +			if (err)
> > +				goto fault;
> 
> And that one should be
> 			copied = copy_page_to_iter(page, frag->page_offset +
> 					   offset - start, copy, to);
> 			if (copied != copy)
> 				goto fault;
> 
> Don't bother with kmap(), vaddr and all that shite.  The primitive is
> 	copy_page_to_iter(page, offset_in_page, nbytes, iter)
> it does all needed kmap itself and it's smart enough to use kmap_atomic
> when it can get away with that.  Similar for copy_page_from_iter().
> 
> Both of those (as well as copy_{to,from}_iter()) advance iov_iter and return
> the number of bytes actually copied.  So the check for EFAULT is "it has copied
> less than you've asked it to copy *and* you haven't run out of that iov_iter".
> The second part is guaranteed to be true in this case - your code makes sure
> that 'copy' is no more than the space left in iterator.
> 
> In general, this check would be spelled
> 			if (copied != copy && iov_iter_count(to))
> 				goto fault;

Thanks, I'll redo the patches.
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-06 17:25                   ` David Miller
@ 2014-11-07  1:59                     ` Herbert Xu
  2014-11-07  3:13                       ` David Miller
  0 siblings, 1 reply; 82+ messages in thread
From: Herbert Xu @ 2014-11-07  1:59 UTC (permalink / raw)
  To: David Miller; +Cc: viro, netdev, linux-kernel, bcrl

On Thu, Nov 06, 2014 at 12:25:00PM -0500, David Miller wrote:
> From: Herbert Xu <herbert@gondor.apana.org.au>
> Date: Thu, 6 Nov 2014 16:23:38 +0800
> 
> > On Wed, Nov 05, 2014 at 03:24:10PM -0500, David Miller wrote:
> >> 
> >> Herbert, please provide a cover letter for this series, and the most recent
> >> version of patch #2 gets various rejects when I try to apply it to net-next.
> > 
> > Sure, I'll regenerate them.  However, while doing so I noticed that
> > a number of my patches on tun/macvtap that you have previously set
> > as accepted are missing from net-next.  Could this be why you got
> > the rejects?
> 
> Those were bug fixes so went into plain 'net', they will show up next
> time I do a merge and I will deal with the conflicts, if any.

I see.  In that case it might be best to wait until those fixes hit
net-next first before applying these patches as otherwise Stephen will
get hit with some nasty merge conflicts.

I'll repost them for RFC with the problems that Al pointed out fixed
in the meantime.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: ipv4: Use standard iovec primitive in raw_probe_proto_opt
  2014-11-06 21:28                                         ` David Miller
@ 2014-11-07  2:00                                           ` Herbert Xu
  2014-11-07 13:25                                             ` [PATCH 0/2] ipv4: Simplify raw_probe_proto_opt and avoid reading user iov twice Herbert Xu
  0 siblings, 1 reply; 82+ messages in thread
From: Herbert Xu @ 2014-11-07  2:00 UTC (permalink / raw)
  To: David Miller; +Cc: viro, netdev, linux-kernel, bcrl, nakam, yoshfuji

On Thu, Nov 06, 2014 at 04:28:08PM -0500, David Miller wrote:
> From: Herbert Xu <herbert@gondor.apana.org.au>
> Date: Thu, 6 Nov 2014 14:46:29 +0800
> 
> > On Thu, Nov 06, 2014 at 06:43:18AM +0000, Al Viro wrote:
> >> On Thu, Nov 06, 2014 at 01:50:23PM +0800, Herbert Xu wrote:
> >> > +	/* We only need the first two bytes. */
> >> > +	err = memcpy_fromiovecend((void *)&icmph, msg->msg_iov, 0, 2);
> >> > +	if (err)
> >> > +		return err;
> >> > +
> >> > +	fl4->fl4_icmp_type = icmph.type;
> >> > +	fl4->fl4_icmp_code = icmph.code;
> >> 
> >> That's more readable, but that exposes another problem in there - we read
> >> the same piece of userland data twice, with no promise whatsoever that we'll
> >> get the same value both times...
> > 
> > Sure, but you have to be root anyway to write to raw sockets.
> > 
> > Patches are welcome :)
> 
> I'd agree with this root-only argument maybe 15 years ago, but with
> containers and stuff like that we want to prevent root X from messing
> up the machine for root Y.
> 
> This is a recurring topic, and I'd strongly like to avoid adding new
> ways that these kinds of problems can happen.
> 
> For example, I'm still on the hook to address the AF_NETLINK mmap TX
> code, which has a similarly abusable issue.

Fair enough.  Even though the bug existed prior to my patch I'll
see if we could get rid of it.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-07  1:59                     ` Herbert Xu
@ 2014-11-07  3:13                       ` David Miller
  2014-11-07 13:21                         ` [PATCH 0/4] Replace skb_copy_datagram_const_iovec with iterator version Herbert Xu
  0 siblings, 1 reply; 82+ messages in thread
From: David Miller @ 2014-11-07  3:13 UTC (permalink / raw)
  To: herbert; +Cc: viro, netdev, linux-kernel, bcrl

From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 7 Nov 2014 09:59:44 +0800

> In that case it might be best to wait until those fixes hit net-next
> first before applying these patches as otherwise Stephen will get
> hit with some nasty merge conflicts.

I just merged net into net-next, so this barrier should no longer
be present.

Thanks.

^ permalink raw reply	[flat|nested] 82+ messages in thread

* [PATCH 0/4] Replace skb_copy_datagram_const_iovec with iterator version
  2014-11-07  3:13                       ` David Miller
@ 2014-11-07 13:21                         ` Herbert Xu
  2014-11-07 13:22                           ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Herbert Xu
                                             ` (3 more replies)
  0 siblings, 4 replies; 82+ messages in thread
From: Herbert Xu @ 2014-11-07 13:21 UTC (permalink / raw)
  To: David Miller; +Cc: viro, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki

Hi Dave:

This patch series adds the helper skb_copy_datagram_iter, which
is meant to replace both skb_copy_datagram_iovec and its evil
twin skb_copy_datagram_const_iovec.

It then converts tun and macvtap over to the new helper and finally
removes skb_copy_datagram_const_iovec which is only used by tun
and macvtap.

The copy_to_iter return value issue pointed out by Al has now been
fixed.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 82+ messages in thread

* [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-07 13:21                         ` [PATCH 0/4] Replace skb_copy_datagram_const_iovec with iterator version Herbert Xu
@ 2014-11-07 13:22                           ` Herbert Xu
  2014-11-07 13:22                           ` [PATCH 2/4] tun: Use iovec iterators Herbert Xu
                                             ` (2 subsequent siblings)
  3 siblings, 0 replies; 82+ messages in thread
From: Herbert Xu @ 2014-11-07 13:22 UTC (permalink / raw)
  To: David Miller, viro, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki

This patch adds skb_copy_datagram_iter, which is identical to
skb_copy_datagram_iovec except that it operates on iov_iter
instead of iovec.

Eventually all users of skb_copy_datagram_iovec should switch
over to iov_iter and then we can remove skb_copy_datagram_iovec.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 include/linux/skbuff.h |    3 +
 net/core/datagram.c    |   87 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 90 insertions(+)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 53f4f6c..933cfce 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -150,6 +150,7 @@
 struct net_device;
 struct scatterlist;
 struct pipe_inode_info;
+struct iov_iter;
 
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 struct nf_conntrack {
@@ -2653,6 +2654,8 @@ int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *frm,
 int skb_copy_datagram_const_iovec(const struct sk_buff *from, int offset,
 				  const struct iovec *to, int to_offset,
 				  int size);
+int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
+			   struct iov_iter *to, int size);
 void skb_free_datagram(struct sock *sk, struct sk_buff *skb);
 void skb_free_datagram_locked(struct sock *sk, struct sk_buff *skb);
 int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index fdbc9a8..84d90d0 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -49,6 +49,7 @@
 #include <linux/spinlock.h>
 #include <linux/slab.h>
 #include <linux/pagemap.h>
+#include <linux/uio.h>
 
 #include <net/protocol.h>
 #include <linux/skbuff.h>
@@ -482,6 +483,92 @@ fault:
 EXPORT_SYMBOL(skb_copy_datagram_const_iovec);
 
 /**
+ *	skb_copy_datagram_iter - Copy a datagram to an iovec iterator.
+ *	@skb: buffer to copy
+ *	@offset: offset in the buffer to start copying from
+ *	@to: iovec iterator to copy to
+ *	@len: amount of data to copy from buffer to iovec
+ */
+int skb_copy_datagram_iter(const struct sk_buff *skb, int offset,
+			   struct iov_iter *to, int len)
+{
+	int start = skb_headlen(skb);
+	int i, copy = start - offset;
+	struct sk_buff *frag_iter;
+
+	trace_skb_copy_datagram_iovec(skb, len);
+
+	/* Copy header. */
+	if (copy > 0) {
+		if (copy > len)
+			copy = len;
+		if (copy_to_iter(skb->data + offset, copy, to) != copy)
+			goto short_copy;
+		if ((len -= copy) == 0)
+			return 0;
+		offset += copy;
+	}
+
+	/* Copy paged appendix. Hmm... why does this look so complicated? */
+	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+		int end;
+		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+
+		WARN_ON(start > offset + len);
+
+		end = start + skb_frag_size(frag);
+		if ((copy = end - offset) > 0) {
+			if (copy > len)
+				copy = len;
+			if (copy_page_to_iter(skb_frag_page(frag),
+					      frag->page_offset + offset -
+					      start, copy, to) != copy)
+				goto short_copy;
+			if (!(len -= copy))
+				return 0;
+			offset += copy;
+		}
+		start = end;
+	}
+
+	skb_walk_frags(skb, frag_iter) {
+		int end;
+
+		WARN_ON(start > offset + len);
+
+		end = start + frag_iter->len;
+		if ((copy = end - offset) > 0) {
+			if (copy > len)
+				copy = len;
+			if (skb_copy_datagram_iter(frag_iter, offset - start,
+						   to, copy))
+				goto fault;
+			if ((len -= copy) == 0)
+				return 0;
+			offset += copy;
+		}
+		start = end;
+	}
+	if (!len)
+		return 0;
+
+	/* This is not really a user copy fault, but rather someone
+	 * gave us a bogus length on the skb.  We should probably
+	 * print a warning here as it may indicate a kernel bug.
+	 */
+
+fault:
+	return -EFAULT;
+
+short_copy:
+	if (iov_iter_count(to))
+		goto fault;
+
+	return 0;
+}
+EXPORT_SYMBOL(skb_copy_datagram_iter);
+
+/**
  *	skb_copy_datagram_from_iovec - Copy a datagram from an iovec.
  *	@skb: buffer to copy
  *	@offset: offset in the buffer to start copying to

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* [PATCH 2/4] tun: Use iovec iterators
  2014-11-07 13:21                         ` [PATCH 0/4] Replace skb_copy_datagram_const_iovec with iterator version Herbert Xu
  2014-11-07 13:22                           ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Herbert Xu
@ 2014-11-07 13:22                           ` Herbert Xu
  2014-11-07 13:22                           ` [PATCH 3/4] macvtap: " Herbert Xu
  2014-11-07 13:22                           ` [PATCH 4/4] net: Kill skb_copy_datagram_const_iovec Herbert Xu
  3 siblings, 0 replies; 82+ messages in thread
From: Herbert Xu @ 2014-11-07 13:22 UTC (permalink / raw)
  To: David Miller, viro, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki

This patch removes the use of skb_copy_datagram_const_iovec in
favour of the iovec iterator-based skb_copy_datagram_iter.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 drivers/net/tun.c |   65 ++++++++++++++++++++++++------------------------------
 1 file changed, 30 insertions(+), 35 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 9dd3746..2ff769b 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -71,6 +71,7 @@
 #include <net/rtnetlink.h>
 #include <net/sock.h>
 #include <linux/seq_file.h>
+#include <linux/uio.h>
 
 #include <asm/uaccess.h>
 
@@ -1230,11 +1231,11 @@ static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
 static ssize_t tun_put_user(struct tun_struct *tun,
 			    struct tun_file *tfile,
 			    struct sk_buff *skb,
-			    const struct iovec *iv, int len)
+			    struct iov_iter *iter)
 {
 	struct tun_pi pi = { 0, skb->protocol };
-	ssize_t total = 0;
-	int vlan_offset = 0, copied;
+	ssize_t total;
+	int vlan_offset;
 	int vlan_hlen = 0;
 	int vnet_hdr_sz = 0;
 
@@ -1244,23 +1245,25 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 	if (tun->flags & TUN_VNET_HDR)
 		vnet_hdr_sz = tun->vnet_hdr_sz;
 
+	total = skb->len + vlan_hlen + vnet_hdr_sz;
+
 	if (!(tun->flags & TUN_NO_PI)) {
-		if ((len -= sizeof(pi)) < 0)
+		if (iov_iter_count(iter) < sizeof(pi))
 			return -EINVAL;
 
-		if (len < skb->len + vlan_hlen + vnet_hdr_sz) {
+		total += sizeof(pi);
+		if (iov_iter_count(iter) < total) {
 			/* Packet will be striped */
 			pi.flags |= TUN_PKT_STRIP;
 		}
 
-		if (memcpy_toiovecend(iv, (void *) &pi, 0, sizeof(pi)))
+		if (copy_to_iter(&pi, sizeof(pi), iter) != sizeof(pi))
 			return -EFAULT;
-		total += sizeof(pi);
 	}
 
 	if (vnet_hdr_sz) {
 		struct virtio_net_hdr gso = { 0 }; /* no info leak */
-		if ((len -= vnet_hdr_sz) < 0)
+		if (iov_iter_count(iter) < vnet_hdr_sz)
 			return -EINVAL;
 
 		if (skb_is_gso(skb)) {
@@ -1299,17 +1302,12 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 			gso.flags = VIRTIO_NET_HDR_F_DATA_VALID;
 		} /* else everything is zero */
 
-		if (unlikely(memcpy_toiovecend(iv, (void *)&gso, total,
-					       sizeof(gso))))
+		if (copy_to_iter(&gso, sizeof(gso), iter) != sizeof(gso))
 			return -EFAULT;
-		total += vnet_hdr_sz;
 	}
 
-	copied = total;
-	len = min_t(int, skb->len + vlan_hlen, len);
-	total += skb->len + vlan_hlen;
 	if (vlan_hlen) {
-		int copy, ret;
+		int ret;
 		struct {
 			__be16 h_vlan_proto;
 			__be16 h_vlan_TCI;
@@ -1320,36 +1318,32 @@ static ssize_t tun_put_user(struct tun_struct *tun,
 
 		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
 
-		copy = min_t(int, vlan_offset, len);
-		ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy);
-		len -= copy;
-		copied += copy;
-		if (ret || !len)
+		ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset);
+		if (ret || !iov_iter_count(iter))
 			goto done;
 
-		copy = min_t(int, sizeof(veth), len);
-		ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy);
-		len -= copy;
-		copied += copy;
-		if (ret || !len)
+		ret = copy_to_iter(&veth, sizeof(veth), iter);
+		if (ret != sizeof(veth) || !iov_iter_count(iter))
 			goto done;
 	}
 
-	skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len);
+	skb_copy_datagram_iter(skb, vlan_offset, iter, skb->len - vlan_offset);
 
 done:
 	tun->dev->stats.tx_packets++;
-	tun->dev->stats.tx_bytes += len;
+	tun->dev->stats.tx_bytes += skb->len + vlan_hlen;
 
 	return total;
 }
 
 static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
-			   const struct iovec *iv, ssize_t len, int noblock)
+			   const struct iovec *iv, unsigned long segs,
+			   ssize_t len, int noblock)
 {
 	struct sk_buff *skb;
 	ssize_t ret = 0;
 	int peeked, err, off = 0;
+	struct iov_iter iter;
 
 	tun_debug(KERN_INFO, tun, "tun_do_read\n");
 
@@ -1362,11 +1356,12 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
 	/* Read frames from queue */
 	skb = __skb_recv_datagram(tfile->socket.sk, noblock ? MSG_DONTWAIT : 0,
 				  &peeked, &off, &err);
-	if (skb) {
-		ret = tun_put_user(tun, tfile, skb, iv, len);
-		kfree_skb(skb);
-	} else
-		ret = err;
+	if (!skb)
+		return ret;
+
+	iov_iter_init(&iter, READ, iv, segs, len);
+	ret = tun_put_user(tun, tfile, skb, &iter);
+	kfree_skb(skb);
 
 	return ret;
 }
@@ -1387,7 +1382,7 @@ static ssize_t tun_chr_aio_read(struct kiocb *iocb, const struct iovec *iv,
 		goto out;
 	}
 
-	ret = tun_do_read(tun, tfile, iv, len,
+	ret = tun_do_read(tun, tfile, iv, count, len,
 			  file->f_flags & O_NONBLOCK);
 	ret = min_t(ssize_t, ret, len);
 	if (ret > 0)
@@ -1488,7 +1483,7 @@ static int tun_recvmsg(struct kiocb *iocb, struct socket *sock,
 					 SOL_PACKET, TUN_TX_TIMESTAMP);
 		goto out;
 	}
-	ret = tun_do_read(tun, tfile, m->msg_iov, total_len,
+	ret = tun_do_read(tun, tfile, m->msg_iov, m->msg_iovlen, total_len,
 			  flags & MSG_DONTWAIT);
 	if (ret > total_len) {
 		m->msg_flags |= MSG_TRUNC;

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* [PATCH 3/4] macvtap: Use iovec iterators
  2014-11-07 13:21                         ` [PATCH 0/4] Replace skb_copy_datagram_const_iovec with iterator version Herbert Xu
  2014-11-07 13:22                           ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Herbert Xu
  2014-11-07 13:22                           ` [PATCH 2/4] tun: Use iovec iterators Herbert Xu
@ 2014-11-07 13:22                           ` Herbert Xu
  2014-11-07 13:22                           ` [PATCH 4/4] net: Kill skb_copy_datagram_const_iovec Herbert Xu
  3 siblings, 0 replies; 82+ messages in thread
From: Herbert Xu @ 2014-11-07 13:22 UTC (permalink / raw)
  To: David Miller, viro, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki

This patch removes the use of skb_copy_datagram_const_iovec in
favour of the iovec iterator-based skb_copy_datagram_iter.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 drivers/net/macvtap.c |   46 +++++++++++++++++++++-------------------------
 1 file changed, 21 insertions(+), 25 deletions(-)

diff --git a/drivers/net/macvtap.c b/drivers/net/macvtap.c
index 880cc09..cea99d4 100644
--- a/drivers/net/macvtap.c
+++ b/drivers/net/macvtap.c
@@ -15,6 +15,7 @@
 #include <linux/cdev.h>
 #include <linux/idr.h>
 #include <linux/fs.h>
+#include <linux/uio.h>
 
 #include <net/ipv6.h>
 #include <net/net_namespace.h>
@@ -778,31 +779,29 @@ static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv,
 /* Put packet to the user space buffer */
 static ssize_t macvtap_put_user(struct macvtap_queue *q,
 				const struct sk_buff *skb,
-				const struct iovec *iv, int len)
+				struct iov_iter *iter)
 {
 	int ret;
 	int vnet_hdr_len = 0;
 	int vlan_offset = 0;
-	int copied, total;
+	int total;
 
 	if (q->flags & IFF_VNET_HDR) {
 		struct virtio_net_hdr vnet_hdr;
 		vnet_hdr_len = q->vnet_hdr_sz;
-		if ((len -= vnet_hdr_len) < 0)
+		if (iov_iter_count(iter) < vnet_hdr_len)
 			return -EINVAL;
 
 		macvtap_skb_to_vnet_hdr(skb, &vnet_hdr);
 
-		if (memcpy_toiovecend(iv, (void *)&vnet_hdr, 0, sizeof(vnet_hdr)))
+		if (copy_to_iter(&vnet_hdr, sizeof(vnet_hdr), iter) !=
+		    sizeof(vnet_hdr))
 			return -EFAULT;
 	}
-	total = copied = vnet_hdr_len;
+	total = vnet_hdr_len;
 	total += skb->len;
 
-	if (!vlan_tx_tag_present(skb))
-		len = min_t(int, skb->len, len);
-	else {
-		int copy;
+	if (vlan_tx_tag_present(skb)) {
 		struct {
 			__be16 h_vlan_proto;
 			__be16 h_vlan_TCI;
@@ -811,37 +810,33 @@ static ssize_t macvtap_put_user(struct macvtap_queue *q,
 		veth.h_vlan_TCI = htons(vlan_tx_tag_get(skb));
 
 		vlan_offset = offsetof(struct vlan_ethhdr, h_vlan_proto);
-		len = min_t(int, skb->len + VLAN_HLEN, len);
 		total += VLAN_HLEN;
 
-		copy = min_t(int, vlan_offset, len);
-		ret = skb_copy_datagram_const_iovec(skb, 0, iv, copied, copy);
-		len -= copy;
-		copied += copy;
-		if (ret || !len)
+		ret = skb_copy_datagram_iter(skb, 0, iter, vlan_offset);
+		if (ret || !iov_iter_count(iter))
 			goto done;
 
-		copy = min_t(int, sizeof(veth), len);
-		ret = memcpy_toiovecend(iv, (void *)&veth, copied, copy);
-		len -= copy;
-		copied += copy;
-		if (ret || !len)
+		ret = copy_to_iter(&veth, sizeof(veth), iter);
+		if (ret != sizeof(veth) || !iov_iter_count(iter))
 			goto done;
 	}
 
-	ret = skb_copy_datagram_const_iovec(skb, vlan_offset, iv, copied, len);
+	ret = skb_copy_datagram_iter(skb, vlan_offset, iter,
+				     skb->len - vlan_offset);
 
 done:
 	return ret ? ret : total;
 }
 
 static ssize_t macvtap_do_read(struct macvtap_queue *q,
-			       const struct iovec *iv, unsigned long len,
+			       const struct iovec *iv, unsigned long segs,
+			       unsigned long len,
 			       int noblock)
 {
 	DEFINE_WAIT(wait);
 	struct sk_buff *skb;
 	ssize_t ret = 0;
+	struct iov_iter iter;
 
 	while (len) {
 		if (!noblock)
@@ -863,7 +858,8 @@ static ssize_t macvtap_do_read(struct macvtap_queue *q,
 			schedule();
 			continue;
 		}
-		ret = macvtap_put_user(q, skb, iv, len);
+		iov_iter_init(&iter, READ, iv, segs, len);
+		ret = macvtap_put_user(q, skb, &iter);
 		kfree_skb(skb);
 		break;
 	}
@@ -886,7 +882,7 @@ static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv,
 		goto out;
 	}
 
-	ret = macvtap_do_read(q, iv, len, file->f_flags & O_NONBLOCK);
+	ret = macvtap_do_read(q, iv, count, len, file->f_flags & O_NONBLOCK);
 	ret = min_t(ssize_t, ret, len);
 	if (ret > 0)
 		iocb->ki_pos = ret;
@@ -1117,7 +1113,7 @@ static int macvtap_recvmsg(struct kiocb *iocb, struct socket *sock,
 	int ret;
 	if (flags & ~(MSG_DONTWAIT|MSG_TRUNC))
 		return -EINVAL;
-	ret = macvtap_do_read(q, m->msg_iov, total_len,
+	ret = macvtap_do_read(q, m->msg_iov, m->msg_iovlen, total_len,
 			  flags & MSG_DONTWAIT);
 	if (ret > total_len) {
 		m->msg_flags |= MSG_TRUNC;

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* [PATCH 4/4] net: Kill skb_copy_datagram_const_iovec
  2014-11-07 13:21                         ` [PATCH 0/4] Replace skb_copy_datagram_const_iovec with iterator version Herbert Xu
                                             ` (2 preceding siblings ...)
  2014-11-07 13:22                           ` [PATCH 3/4] macvtap: " Herbert Xu
@ 2014-11-07 13:22                           ` Herbert Xu
  3 siblings, 0 replies; 82+ messages in thread
From: Herbert Xu @ 2014-11-07 13:22 UTC (permalink / raw)
  To: David Miller, viro, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki

Now that both macvtap and tun are using skb_copy_datagram_iter, we
can kill the abomination that is skb_copy_datagram_const_iovec.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 include/linux/skbuff.h |    3 -
 net/core/datagram.c    |   89 -------------------------------------------------
 2 files changed, 92 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 933cfce..103fbe8 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -2651,9 +2651,6 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
 				 int len);
 int zerocopy_sg_from_iovec(struct sk_buff *skb, const struct iovec *frm,
 			   int offset, size_t count);
-int skb_copy_datagram_const_iovec(const struct sk_buff *from, int offset,
-				  const struct iovec *to, int to_offset,
-				  int size);
 int skb_copy_datagram_iter(const struct sk_buff *from, int offset,
 			   struct iov_iter *to, int size);
 void skb_free_datagram(struct sock *sk, struct sk_buff *skb);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 84d90d0..26391a3 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -394,95 +394,6 @@ fault:
 EXPORT_SYMBOL(skb_copy_datagram_iovec);
 
 /**
- *	skb_copy_datagram_const_iovec - Copy a datagram to an iovec.
- *	@skb: buffer to copy
- *	@offset: offset in the buffer to start copying from
- *	@to: io vector to copy to
- *	@to_offset: offset in the io vector to start copying to
- *	@len: amount of data to copy from buffer to iovec
- *
- *	Returns 0 or -EFAULT.
- *	Note: the iovec is not modified during the copy.
- */
-int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset,
-				  const struct iovec *to, int to_offset,
-				  int len)
-{
-	int start = skb_headlen(skb);
-	int i, copy = start - offset;
-	struct sk_buff *frag_iter;
-
-	/* Copy header. */
-	if (copy > 0) {
-		if (copy > len)
-			copy = len;
-		if (memcpy_toiovecend(to, skb->data + offset, to_offset, copy))
-			goto fault;
-		if ((len -= copy) == 0)
-			return 0;
-		offset += copy;
-		to_offset += copy;
-	}
-
-	/* Copy paged appendix. Hmm... why does this look so complicated? */
-	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
-		int end;
-		const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
-
-		WARN_ON(start > offset + len);
-
-		end = start + skb_frag_size(frag);
-		if ((copy = end - offset) > 0) {
-			int err;
-			u8  *vaddr;
-			struct page *page = skb_frag_page(frag);
-
-			if (copy > len)
-				copy = len;
-			vaddr = kmap(page);
-			err = memcpy_toiovecend(to, vaddr + frag->page_offset +
-						offset - start, to_offset, copy);
-			kunmap(page);
-			if (err)
-				goto fault;
-			if (!(len -= copy))
-				return 0;
-			offset += copy;
-			to_offset += copy;
-		}
-		start = end;
-	}
-
-	skb_walk_frags(skb, frag_iter) {
-		int end;
-
-		WARN_ON(start > offset + len);
-
-		end = start + frag_iter->len;
-		if ((copy = end - offset) > 0) {
-			if (copy > len)
-				copy = len;
-			if (skb_copy_datagram_const_iovec(frag_iter,
-							  offset - start,
-							  to, to_offset,
-							  copy))
-				goto fault;
-			if ((len -= copy) == 0)
-				return 0;
-			offset += copy;
-			to_offset += copy;
-		}
-		start = end;
-	}
-	if (!len)
-		return 0;
-
-fault:
-	return -EFAULT;
-}
-EXPORT_SYMBOL(skb_copy_datagram_const_iovec);
-
-/**
  *	skb_copy_datagram_iter - Copy a datagram to an iovec iterator.
  *	@skb: buffer to copy
  *	@offset: offset in the buffer to start copying from

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* [PATCH 0/2] ipv4: Simplify raw_probe_proto_opt and avoid reading user iov twice
  2014-11-07  2:00                                           ` Herbert Xu
@ 2014-11-07 13:25                                             ` Herbert Xu
  2014-11-07 13:27                                               ` [PATCH 1/2] ipv4: Use standard iovec primitive in raw_probe_proto_opt Herbert Xu
                                                                 ` (2 more replies)
  0 siblings, 3 replies; 82+ messages in thread
From: Herbert Xu @ 2014-11-07 13:25 UTC (permalink / raw)
  To: David Miller; +Cc: viro, netdev, linux-kernel, bcrl, nakam, yoshfuji

Hi Dave:

This series rewrites the function raw_probe_proto_opt in a more
readable fashion, and then fixes the long-standing bug where we
read the probed bytes twice which means that what we're using to
probe may in fact be invalid.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 82+ messages in thread

* [PATCH 1/2] ipv4: Use standard iovec primitive in raw_probe_proto_opt
  2014-11-07 13:25                                             ` [PATCH 0/2] ipv4: Simplify raw_probe_proto_opt and avoid reading user iov twice Herbert Xu
@ 2014-11-07 13:27                                               ` Herbert Xu
  2014-11-07 13:27                                               ` [PATCH 2/2] ipv4: Avoid reading user iov twice after raw_probe_proto_opt Herbert Xu
  2014-11-10 19:26                                               ` [PATCH 0/2] ipv4: Simplify raw_probe_proto_opt and avoid reading user iov twice David Miller
  2 siblings, 0 replies; 82+ messages in thread
From: Herbert Xu @ 2014-11-07 13:27 UTC (permalink / raw)
  To: David Miller, viro, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki, nakam

The function raw_probe_proto_opt tries to extract the first two
bytes from the user input in order to seed the IPsec lookup for
ICMP packets.  In doing so it's processing iovec by hand and
overcomplicating things.

This patch replaces the manual iovec processing with a call to
memcpy_fromiovecend.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 net/ipv4/raw.c |   50 +++++++++++---------------------------------------
 1 file changed, 11 insertions(+), 39 deletions(-)

diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index ee8fa4b..9be9050 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -422,48 +422,20 @@ error:
 
 static int raw_probe_proto_opt(struct flowi4 *fl4, struct msghdr *msg)
 {
-	struct iovec *iov;
-	u8 __user *type = NULL;
-	u8 __user *code = NULL;
-	int probed = 0;
-	unsigned int i;
+	struct icmphdr icmph;
+	int err;
 
-	if (!msg->msg_iov)
+	if (fl4->flowi4_proto != IPPROTO_ICMP)
 		return 0;
 
-	for (i = 0; i < msg->msg_iovlen; i++) {
-		iov = &msg->msg_iov[i];
-		if (!iov)
-			continue;
-
-		switch (fl4->flowi4_proto) {
-		case IPPROTO_ICMP:
-			/* check if one-byte field is readable or not. */
-			if (iov->iov_base && iov->iov_len < 1)
-				break;
-
-			if (!type) {
-				type = iov->iov_base;
-				/* check if code field is readable or not. */
-				if (iov->iov_len > 1)
-					code = type + 1;
-			} else if (!code)
-				code = iov->iov_base;
-
-			if (type && code) {
-				if (get_user(fl4->fl4_icmp_type, type) ||
-				    get_user(fl4->fl4_icmp_code, code))
-					return -EFAULT;
-				probed = 1;
-			}
-			break;
-		default:
-			probed = 1;
-			break;
-		}
-		if (probed)
-			break;
-	}
+	/* We only need the first two bytes. */
+	err = memcpy_fromiovecend((void *)&icmph, msg->msg_iov, 0, 2);
+	if (err)
+		return err;
+
+	fl4->fl4_icmp_type = icmph.type;
+	fl4->fl4_icmp_code = icmph.code;
+
 	return 0;
 }
 

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* [PATCH 2/2] ipv4: Avoid reading user iov twice after raw_probe_proto_opt
  2014-11-07 13:25                                             ` [PATCH 0/2] ipv4: Simplify raw_probe_proto_opt and avoid reading user iov twice Herbert Xu
  2014-11-07 13:27                                               ` [PATCH 1/2] ipv4: Use standard iovec primitive in raw_probe_proto_opt Herbert Xu
@ 2014-11-07 13:27                                               ` Herbert Xu
  2014-11-10 19:26                                               ` [PATCH 0/2] ipv4: Simplify raw_probe_proto_opt and avoid reading user iov twice David Miller
  2 siblings, 0 replies; 82+ messages in thread
From: Herbert Xu @ 2014-11-07 13:27 UTC (permalink / raw)
  To: David Miller, viro, netdev, linux-kernel, bcrl, YOSHIFUJI Hideaki, nakam

Ever since raw_probe_proto_opt was added it had the problem of
causing the user iov to be read twice, once during the probe for
the protocol header and once again in ip_append_data.

This is a potential security problem since it means that whatever
we're probing may be invalid.  This patch plugs the hole by
firstly advancing the iov so we don't read the same spot again,
and secondly saving what we read the first time around for use
by ip_append_data.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---

 net/ipv4/raw.c |   62 +++++++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 54 insertions(+), 8 deletions(-)

diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 9be9050..43385a9 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -79,6 +79,16 @@
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/compat.h>
+#include <linux/uio.h>
+
+struct raw_frag_vec {
+	struct iovec *iov;
+	union {
+		struct icmphdr icmph;
+		char c[1];
+	} hdr;
+	int hlen;
+};
 
 static struct raw_hashinfo raw_v4_hashinfo = {
 	.lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock),
@@ -420,25 +430,57 @@ error:
 	return err;
 }
 
-static int raw_probe_proto_opt(struct flowi4 *fl4, struct msghdr *msg)
+static int raw_probe_proto_opt(struct raw_frag_vec *rfv, struct flowi4 *fl4)
 {
-	struct icmphdr icmph;
 	int err;
 
 	if (fl4->flowi4_proto != IPPROTO_ICMP)
 		return 0;
 
 	/* We only need the first two bytes. */
-	err = memcpy_fromiovecend((void *)&icmph, msg->msg_iov, 0, 2);
+	rfv->hlen = 2;
+
+	err = memcpy_fromiovec(rfv->hdr.c, rfv->iov, rfv->hlen);
 	if (err)
 		return err;
 
-	fl4->fl4_icmp_type = icmph.type;
-	fl4->fl4_icmp_code = icmph.code;
+	fl4->fl4_icmp_type = rfv->hdr.icmph.type;
+	fl4->fl4_icmp_code = rfv->hdr.icmph.code;
 
 	return 0;
 }
 
+static int raw_getfrag(void *from, char *to, int offset, int len, int odd,
+		       struct sk_buff *skb)
+{
+	struct raw_frag_vec *rfv = from;
+
+	if (offset < rfv->hlen) {
+		int copy = min(rfv->hlen - offset, len);
+
+		if (skb->ip_summed == CHECKSUM_PARTIAL)
+			memcpy(to, rfv->hdr.c + offset, copy);
+		else
+			skb->csum = csum_block_add(
+				skb->csum,
+				csum_partial_copy_nocheck(rfv->hdr.c + offset,
+							  to, copy, 0),
+				odd);
+
+		odd = 0;
+		offset += copy;
+		to += copy;
+		len -= copy;
+
+		if (!len)
+			return 0;
+	}
+
+	offset -= rfv->hlen;
+
+	return ip_generic_getfrag(rfv->iov, to, offset, len, odd, skb);
+}
+
 static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 		       size_t len)
 {
@@ -452,6 +494,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 	u8  tos;
 	int err;
 	struct ip_options_data opt_copy;
+	struct raw_frag_vec rfv;
 
 	err = -EMSGSIZE;
 	if (len > 0xFFFF)
@@ -557,7 +600,10 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			   daddr, saddr, 0, 0);
 
 	if (!inet->hdrincl) {
-		err = raw_probe_proto_opt(&fl4, msg);
+		rfv.iov = msg->msg_iov;
+		rfv.hlen = 0;
+
+		err = raw_probe_proto_opt(&rfv, &fl4);
 		if (err)
 			goto done;
 	}
@@ -588,8 +634,8 @@ back_from_confirm:
 		if (!ipc.addr)
 			ipc.addr = fl4.daddr;
 		lock_sock(sk);
-		err = ip_append_data(sk, &fl4, ip_generic_getfrag,
-				     msg->msg_iov, len, 0,
+		err = ip_append_data(sk, &fl4, raw_getfrag,
+				     &rfv, len, 0,
 				     &ipc, &rt, msg->msg_flags);
 		if (err)
 			ip_flush_pending_frames(sk);

^ permalink raw reply related	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-06  3:25                                 ` Al Viro
  2014-11-06  5:50                                   ` ipv4: Use standard iovec primitive in raw_probe_proto_opt Herbert Xu
  2014-11-06  9:50                                   ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Jon Maloy
@ 2014-11-07 21:48                                   ` David Miller
  2014-11-07 22:11                                     ` Al Viro
  2014-11-07 21:52                                   ` David Miller
  3 siblings, 1 reply; 82+ messages in thread
From: David Miller @ 2014-11-07 21:48 UTC (permalink / raw)
  To: viro; +Cc: herbert, netdev, linux-kernel, bcrl

From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Thu, 6 Nov 2014 03:25:34 +0000

> OK, I've taken the beginning of the old queue on top of net-next; it's
> in git://git.kernel.org//pub/scm/linux/kernel/git/viro/vfs.git iov_iter-net.

What I see in there looks good.   I wonder if we can somehow make msghdr
pointer args const... but this is not so important now.

Some minor coding style nits, comments:

	/* Like
	 * this.
	 */

and for multi-line function calls in the networking, align the second
and subsequent lines at the first column after the opening parenthesis
of the first line.

Thanks.

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-06  3:25                                 ` Al Viro
                                                     ` (2 preceding siblings ...)
  2014-11-07 21:48                                   ` David Miller
@ 2014-11-07 21:52                                   ` David Miller
  3 siblings, 0 replies; 82+ messages in thread
From: David Miller @ 2014-11-07 21:52 UTC (permalink / raw)
  To: viro; +Cc: herbert, netdev, linux-kernel, bcrl

From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Thu, 6 Nov 2014 03:25:34 +0000

> 	* a new helper: zerocopy_sg_from_iter().  I have it, actually,
> but I'd rather not step on Herbert's toes - it's too close to the areas
> his series will touch, so that's probably for when his series goes in.
> It will be needed for complete macvtap conversion...

Just a heads up, his series is applied to net-next.

> 	* why doesn't verify_iovec() use rw_copy_check_uvector()?  The only
> real differences I see is that (a) you do allocation in callers (same as
> rw_copy_check_uvector() would've done), (b) you return EMSGSIZE in case of
> too long vector, while rw_copy_check_uvector() returns EINVAL in that case
> and (c) you don't do access_ok().  The last one is described as optimization,
> but for iov_iter primitives it's a serious PITA - for iovec-backed instances
> they are using __copy_from_user()/__copy_to_user(), etc.

The answer is that nobody knew about it and looked, that's why.

> 	It certainly would be nice to have the same code doing all copying
> of iovecs from userland - readv/writev/aio/sendmsg/recvmsg/etc.  Am I
> missing some subtle semantic difference in there?  EMSGSIZE vs EINVAL
> is trivial (we can lift that check into the callers, if nothing else), but
> I could miss something more interesting...

We also need compat counterparts.

> 	* various getfrag will need to grow iov_iter-based counterparts,
> but ip_append_output() needs no changes, AFAICS.

Right.

> 	* there's some really weird stuff in there.  Just what is this
> static int raw_probe_proto_opt(struct flowi4 *fl4, struct msghdr *msg)
> {
>         struct iovec *iov;
>         u8 __user *type = NULL;
>         u8 __user *code = NULL;
>         int probed = 0;
>         unsigned int i;
> 
>         if (!msg->msg_iov)
>                 return 0;
> 
>         for (i = 0; i < msg->msg_iovlen; i++) {
>                 iov = &msg->msg_iov[i];
>                 if (!iov)
>                         continue;
> trying to do?  "If non-NULL pointer + i somehow happened to be NULL, skip it
> and try to use the same pointer + i + 1"?  Huh?  Had been that way since
> the function first went in back in 2004 ("[IPV4] XFRM: probe icmp type/code
> when sending packets via raw socket.", according to historical tree)...

This is probably just bogus, because this address-of will never evaluate to
NULL.

> 	* rds, bluetooth and vsock are doing something odd; need to RTFS some
> more.

It is not surprising.... :-/

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-07 21:48                                   ` David Miller
@ 2014-11-07 22:11                                     ` Al Viro
  2014-11-07 22:31                                       ` Al Viro
  2014-11-07 23:42                                       ` Al Viro
  0 siblings, 2 replies; 82+ messages in thread
From: Al Viro @ 2014-11-07 22:11 UTC (permalink / raw)
  To: David Miller; +Cc: herbert, netdev, linux-kernel, bcrl

On Fri, Nov 07, 2014 at 04:48:59PM -0500, David Miller wrote:
> From: Al Viro <viro@ZenIV.linux.org.uk>
> Date: Thu, 6 Nov 2014 03:25:34 +0000
> 
> > OK, I've taken the beginning of the old queue on top of net-next; it's
> > in git://git.kernel.org//pub/scm/linux/kernel/git/viro/vfs.git iov_iter-net.
> 
> What I see in there looks good.   I wonder if we can somehow make msghdr
> pointer args const... but this is not so important now.
> 
> Some minor coding style nits, comments:
> 
> 	/* Like
> 	 * this.
> 	 */
> 
> and for multi-line function calls in the networking, align the second
> and subsequent lines at the first column after the opening parenthesis
> of the first line.

OK...  I played with csum side of things a bit, and I've noticed something
really nasty - iov_iter primitives all assume that iovec has been validated,
_including_ access_ok() on all ranges.  That allows us to use __copy_...()
in primitives, and on the read/write/readv/writev/aio side of things we have
that validation done when we copy iovec from userland (or set a single-element
iovec over the userland-supplied range, as in read(2)/write(2)).

net/* primitives, OTOH, do access_ok() themselves - syscalls do _not_ check
access_ok() on iovec from untrusted source and rely on the low-level stuff
to do such checks.

Result: regular IO syscalls on sockets (i.e. not recvmsg/sendmsg, usual
read/write) do these checks (at least) twice and use of copy_from_iter()
in ->recvmsg() opens quite a nasty hole - one can call recvmsg(2) with
kernel address in ->msg_iov[0].base and have such an instance of ->recvmsg()
stomp on the kernel memory.  At the very least, with Herbert's patches
we need to validate that somewhere on the way to tun and macvtap recvmsg
instances.  We can do that right there, and as a stopgap measure it might
be a good idea.  However, it's not a sane long-term solution.

We could, of course, add those access_ok() in mm/iov_iter.c and drop them
from fs/read_write.c and fs/aio.c, but I really don't see the point - why
not do that along with the checks we do in verify_iovec() anyway?  And drop
them from memcpy_fromiovec() et.al.

I'm looking through the tree right now; so far it looks like we can just
move those suckers to the point where we validate iovec and lose them
from low-level iovec and csum copying completely.  I still haven't finished
tracing all possible paths for address to arrive at the points where we
currently check that stuff, but so far it looks very doable.

Comments?

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-07 22:11                                     ` Al Viro
@ 2014-11-07 22:31                                       ` Al Viro
  2014-11-07 22:35                                         ` Al Viro
  2014-11-07 23:42                                       ` Al Viro
  1 sibling, 1 reply; 82+ messages in thread
From: Al Viro @ 2014-11-07 22:31 UTC (permalink / raw)
  To: David Miller; +Cc: herbert, netdev, linux-kernel, bcrl

On Fri, Nov 07, 2014 at 10:11:14PM +0000, Al Viro wrote:

> I'm looking through the tree right now; so far it looks like we can just
> move those suckers to the point where we validate iovec and lose them
> from low-level iovec and csum copying completely.  I still haven't finished
> tracing all possible paths for address to arrive at the points where we
> currently check that stuff, but so far it looks very doable.

BTW, csum side of that is also chock-full of duplicate access_ok() -
e.g. generic csum_and_copy_from_user() checks before calling
csum_partial_copy_from_user().  And generic instance of that is using
__copy_from_user(), all right, but a _lot_ of non-default instances
repeat that access_ok().

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-07 22:31                                       ` Al Viro
@ 2014-11-07 22:35                                         ` Al Viro
  0 siblings, 0 replies; 82+ messages in thread
From: Al Viro @ 2014-11-07 22:35 UTC (permalink / raw)
  To: David Miller; +Cc: herbert, netdev, linux-kernel, bcrl

On Fri, Nov 07, 2014 at 10:31:53PM +0000, Al Viro wrote:
> On Fri, Nov 07, 2014 at 10:11:14PM +0000, Al Viro wrote:
> 
> > I'm looking through the tree right now; so far it looks like we can just
> > move those suckers to the point where we validate iovec and lose them
> > from low-level iovec and csum copying completely.  I still haven't finished
> > tracing all possible paths for address to arrive at the points where we
> > currently check that stuff, but so far it looks very doable.
> 
> BTW, csum side of that is also chock-full of duplicate access_ok() -
> e.g. generic csum_and_copy_from_user() checks before calling
> csum_partial_copy_from_user().  And generic instance of that is using
> __copy_from_user(), all right, but a _lot_ of non-default instances
> repeat that access_ok().

While we are at it: here's the default csum_and_copy_to_user()
static __inline__ __wsum csum_and_copy_to_user
(const void *src, void __user *dst, int len, __wsum sum, int *err_ptr)
{
        sum = csum_partial(src, len, sum);

        if (access_ok(VERIFY_WRITE, dst, len)) {
                if (copy_to_user(dst, src, len) == 0)
                        return sum;
        }
        if (len)
                *err_ptr = -EFAULT;

        return (__force __wsum)-1; /* invalid checksum */
}

Note that we do that access_ok() and follow it with copy_to_user() on exact
same range, i.e. repeat the same damn check...

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-07 22:11                                     ` Al Viro
  2014-11-07 22:31                                       ` Al Viro
@ 2014-11-07 23:42                                       ` Al Viro
  2014-11-08  2:21                                         ` Herbert Xu
  2014-11-09 21:19                                         ` Al Viro
  1 sibling, 2 replies; 82+ messages in thread
From: Al Viro @ 2014-11-07 23:42 UTC (permalink / raw)
  To: David Miller; +Cc: herbert, netdev, linux-kernel, bcrl

On Fri, Nov 07, 2014 at 10:11:14PM +0000, Al Viro wrote:

> I'm looking through the tree right now; so far it looks like we can just
> move those suckers to the point where we validate iovec and lose them
> from low-level iovec and csum copying completely.  I still haven't finished
> tracing all possible paths for address to arrive at the points where we
> currently check that stuff, but so far it looks very doable.

Definitely doable.  The only remaining interesting part is drivers/vhost
with the stuff it puts in vq->iov[].  If we can guarantee that it satisfies
the sanity checks (access_ok() and size-related ones), we are done -
making verify_iovec() use rw_copy_check_uvector() (and verify_compat_iov()
use compat_rw_copy_check_uvector()) will suffice to guarantee that none of
	csum_partial_copy_fromiovecend
	memcpy_fromiovec
	memcpy_toiovec
	memcpy_toiovecend
	memcpy_fromiovecend
	skb_copy_datagram_iovec
	skb_copy_datagram_iter
	skb_copy_datagram_from_iter
	zerocopy_sg_from_iter
	skb_copy_and_csum_datagram
	skb_copy_and_csum_datagram_iovec
	csum_and_copy_from_user
	csum_and_copy_to_user
	csum_partial_copy_from_user
will ever see an address that doesn't satisfy access_ok() checks.  And
having looked at the data flow...  we definitely want to do those checks
on intake of iovec - as it is, we usually repeat them quite a few times
for the same iovec segment, and we practically never end up _not_ doing them
for some segment of iovec, unless we hit a failure exit before we get around
to copying any data at all.

I'll finish RTFS drivers/vhost and if it turns out to be OK I'll post the
series moving those checks to the moment of copying iovec from userland,
so that kernel-side we could always rely on ->msg_iov elements having been
verified.

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-07 23:42                                       ` Al Viro
@ 2014-11-08  2:21                                         ` Herbert Xu
  2014-11-09 21:19                                         ` Al Viro
  1 sibling, 0 replies; 82+ messages in thread
From: Herbert Xu @ 2014-11-08  2:21 UTC (permalink / raw)
  To: Al Viro; +Cc: David Miller, netdev, linux-kernel, bcrl

On Fri, Nov 07, 2014 at 11:42:53PM +0000, Al Viro wrote:
> I'll finish RTFS drivers/vhost and if it turns out to be OK I'll post the
> series moving those checks to the moment of copying iovec from userland,
> so that kernel-side we could always rely on ->msg_iov elements having been
> verified.

Great thanks!

Dave, please hold off on my skb_copy_datagram_iter series until
this verify_iovec change is added.

Cheers,
-- 
Email: Herbert Xu <herbert@gondor.apana.org.au>
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-07 23:42                                       ` Al Viro
  2014-11-08  2:21                                         ` Herbert Xu
@ 2014-11-09 21:19                                         ` Al Viro
  2014-11-10  5:20                                           ` David Miller
  2014-11-10 10:14                                           ` Michael S. Tsirkin
  1 sibling, 2 replies; 82+ messages in thread
From: Al Viro @ 2014-11-09 21:19 UTC (permalink / raw)
  To: David Miller; +Cc: herbert, netdev, linux-kernel, bcrl, Michael S. Tsirkin

[Michael Cc'd]

On Fri, Nov 07, 2014 at 11:42:53PM +0000, Al Viro wrote:

> I'll finish RTFS drivers/vhost and if it turns out to be OK I'll post the
> series moving those checks to the moment of copying iovec from userland,
> so that kernel-side we could always rely on ->msg_iov elements having been
> verified.

Two questions:
1) does sparc64 access_ok() need to differ for 32bit and 64bit tasks?
AFAICS, x86 and ppc just check that address is OK for 64bit process -
if a 32bit process passes the kernel an address that would be valid
for 64bit process, but not for 32bit one, we just get a pagefault in
__copy_from_user() and friends.  No kernel objects are going to have
a virtual address in that range, so access_ok() doesn't bother preventing
such access attempts there...

2) shouldn't vhost_dev_cleanup() stop the worker thread before doing anything
else?  AFAICS, we do parts of vhost_dev teardown while the thread is
still running; granted, we keep dev->mm pinned down until after it stops
(or we would be _really_ screwed), but is it safe to do all those fput()s, etc.
while it's still running?  Michael?

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-09 21:19                                         ` Al Viro
@ 2014-11-10  5:20                                           ` David Miller
  2014-11-10  6:58                                             ` Al Viro
  2014-11-10 10:14                                           ` Michael S. Tsirkin
  1 sibling, 1 reply; 82+ messages in thread
From: David Miller @ 2014-11-10  5:20 UTC (permalink / raw)
  To: viro; +Cc: herbert, netdev, linux-kernel, bcrl, mst

From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Sun, 9 Nov 2014 21:19:08 +0000

> 1) does sparc64 access_ok() need to differ for 32bit and 64bit tasks?

sparc64 will just fault no matter what kind of task it is.

It is impossible for a user task to generate a reference to
a kernel virtual address, as kernel and user accesses each
go via a separate address space identifier.

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-10  5:20                                           ` David Miller
@ 2014-11-10  6:58                                             ` Al Viro
  2014-11-10  7:30                                               ` David Miller
  0 siblings, 1 reply; 82+ messages in thread
From: Al Viro @ 2014-11-10  6:58 UTC (permalink / raw)
  To: David Miller; +Cc: herbert, netdev, linux-kernel, bcrl, mst

On Mon, Nov 10, 2014 at 12:20:20AM -0500, David Miller wrote:
> From: Al Viro <viro@ZenIV.linux.org.uk>
> Date: Sun, 9 Nov 2014 21:19:08 +0000
> 
> > 1) does sparc64 access_ok() need to differ for 32bit and 64bit tasks?
> 
> sparc64 will just fault no matter what kind of task it is.
> 
> It is impossible for a user task to generate a reference to
> a kernel virtual address, as kernel and user accesses each
> go via a separate address space identifier.

Sure, but why do we have access_ok() there at all?  I.e. why not just have
it constant 1?

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-10  6:58                                             ` Al Viro
@ 2014-11-10  7:30                                               ` David Miller
  2014-11-10  9:09                                                 ` Al Viro
  0 siblings, 1 reply; 82+ messages in thread
From: David Miller @ 2014-11-10  7:30 UTC (permalink / raw)
  To: viro; +Cc: herbert, netdev, linux-kernel, bcrl, mst

From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Mon, 10 Nov 2014 06:58:17 +0000

> On Mon, Nov 10, 2014 at 12:20:20AM -0500, David Miller wrote:
>> From: Al Viro <viro@ZenIV.linux.org.uk>
>> Date: Sun, 9 Nov 2014 21:19:08 +0000
>> 
>> > 1) does sparc64 access_ok() need to differ for 32bit and 64bit tasks?
>> 
>> sparc64 will just fault no matter what kind of task it is.
>> 
>> It is impossible for a user task to generate a reference to
>> a kernel virtual address, as kernel and user accesses each
>> go via a separate address space identifier.
> 
> Sure, but why do we have access_ok() there at all?  I.e. why not just have
> it constant 1?

Since access_ok() is in fact constant 1 on sparc64, where we use it,
does it really matter?

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-10  7:30                                               ` David Miller
@ 2014-11-10  9:09                                                 ` Al Viro
  2014-11-10 16:18                                                   ` David Miller
  0 siblings, 1 reply; 82+ messages in thread
From: Al Viro @ 2014-11-10  9:09 UTC (permalink / raw)
  To: David Miller; +Cc: herbert, netdev, linux-kernel, bcrl, mst

On Mon, Nov 10, 2014 at 02:30:00AM -0500, David Miller wrote:
> From: Al Viro <viro@ZenIV.linux.org.uk>
> Date: Mon, 10 Nov 2014 06:58:17 +0000
> 
> > On Mon, Nov 10, 2014 at 12:20:20AM -0500, David Miller wrote:
> >> From: Al Viro <viro@ZenIV.linux.org.uk>
> >> Date: Sun, 9 Nov 2014 21:19:08 +0000
> >> 
> >> > 1) does sparc64 access_ok() need to differ for 32bit and 64bit tasks?
> >> 
> >> sparc64 will just fault no matter what kind of task it is.
> >> 
> >> It is impossible for a user task to generate a reference to
> >> a kernel virtual address, as kernel and user accesses each
> >> go via a separate address space identifier.
> > 
> > Sure, but why do we have access_ok() there at all?  I.e. why not just have
> > it constant 1?
> 
> Since access_ok() is in fact constant 1 on sparc64, where we use it,
> does it really matter?

*blink*

My apologies - I've got confused by the maze of twisty includes, all alike.
Right you are; in biarch case it *doesn't* depend on 32bit vs. 64bit.
STACK_TOP-using one is sparc32 variant where we obviously don't have
biarch at all.

Anyway, the series switching to {compat_,}rw_copy_check_uvector() and
getting rid of duplicate checks is in vfs.git#iov_iter-net.  Warning:
it's almost completely untested.  It survives boot, ssh into it and
runltp -f syscalls (no regressions), but that's about it.  BTW, what's
the usual regression suite used for net/* stuff?

3 commits in there, on top of net-next#master; head should be at 555126.
There's a bunch of fairly obvious followups becoming possible after that,
but let's keep those separate...

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-09 21:19                                         ` Al Viro
  2014-11-10  5:20                                           ` David Miller
@ 2014-11-10 10:14                                           ` Michael S. Tsirkin
  1 sibling, 0 replies; 82+ messages in thread
From: Michael S. Tsirkin @ 2014-11-10 10:14 UTC (permalink / raw)
  To: Al Viro; +Cc: David Miller, herbert, netdev, linux-kernel, bcrl

On Sun, Nov 09, 2014 at 09:19:08PM +0000, Al Viro wrote:
> [Michael Cc'd]
> 
> On Fri, Nov 07, 2014 at 11:42:53PM +0000, Al Viro wrote:
> 
> > I'll finish RTFS drivers/vhost and if it turns out to be OK I'll post the
> > series moving those checks to the moment of copying iovec from userland,
> > so that kernel-side we could always rely on ->msg_iov elements having been
> > verified.
> 
> Two questions:
> 1) does sparc64 access_ok() need to differ for 32bit and 64bit tasks?
> AFAICS, x86 and ppc just check that address is OK for 64bit process -
> if a 32bit process passes the kernel an address that would be valid
> for 64bit process, but not for 32bit one, we just get a pagefault in
> __copy_from_user() and friends.  No kernel objects are going to have
> a virtual address in that range, so access_ok() doesn't bother preventing
> such access attempts there...
> 
> 2) shouldn't vhost_dev_cleanup() stop the worker thread before doing anything
> else?
>  AFAICS, we do parts of vhost_dev teardown while the thread is
> still running; granted, we keep dev->mm pinned down until after it stops
> (or we would be _really_ screwed), but is it safe to do all those fput()s, etc.
> while it's still running?  Michael?

Before invoking vhost_dev_cleanup,
the caller for vhost-net (vhost_net_release) sets private data to NULL
(using vhost_net_stop_vq) which guarantees thread will do nothing at all.
vhost scsi does it in vhost_scsi_clear_endpoint.


-- 
MST

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 1/4] inet: Add skb_copy_datagram_iter
  2014-11-10  9:09                                                 ` Al Viro
@ 2014-11-10 16:18                                                   ` David Miller
  0 siblings, 0 replies; 82+ messages in thread
From: David Miller @ 2014-11-10 16:18 UTC (permalink / raw)
  To: viro; +Cc: herbert, netdev, linux-kernel, bcrl, mst

From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Mon, 10 Nov 2014 09:09:55 +0000

> BTW, what's the usual regression suite used for net/* stuff?

There's a regression suite? :-)

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: [PATCH 0/2] ipv4: Simplify raw_probe_proto_opt and avoid reading user iov twice
  2014-11-07 13:25                                             ` [PATCH 0/2] ipv4: Simplify raw_probe_proto_opt and avoid reading user iov twice Herbert Xu
  2014-11-07 13:27                                               ` [PATCH 1/2] ipv4: Use standard iovec primitive in raw_probe_proto_opt Herbert Xu
  2014-11-07 13:27                                               ` [PATCH 2/2] ipv4: Avoid reading user iov twice after raw_probe_proto_opt Herbert Xu
@ 2014-11-10 19:26                                               ` David Miller
  2 siblings, 0 replies; 82+ messages in thread
From: David Miller @ 2014-11-10 19:26 UTC (permalink / raw)
  To: herbert; +Cc: viro, netdev, linux-kernel, bcrl, nakam, yoshfuji

From: Herbert Xu <herbert@gondor.apana.org.au>
Date: Fri, 7 Nov 2014 21:25:53 +0800

> This series rewrites the function raw_probe_proto_opt in a more
> readable fashion, and then fixes the long-standing bug where we
> read the probed bytes twice which means that what we're using to
> probe may in fact be invalid.

Series applied to net-next, thanks Herbert.

^ permalink raw reply	[flat|nested] 82+ messages in thread

* Re: ipv4: Use standard iovec primitive in raw_probe_proto_opt
  2014-11-06 22:16                                             ` Al Viro
@ 2014-11-28  5:14                                               ` Al Viro
  0 siblings, 0 replies; 82+ messages in thread
From: Al Viro @ 2014-11-28  5:14 UTC (permalink / raw)
  To: Jon Maloy
  Cc: Herbert Xu, David Miller, netdev, linux-kernel, bcrl,
	Masahide Nakamura, Hideaki YOSHIFUJI

On Thu, Nov 06, 2014 at 10:16:08PM +0000, Al Viro wrote:
> On Thu, Nov 06, 2014 at 09:55:31AM +0000, Jon Maloy wrote:
> > > Point, but that might very well be a pattern to watch for - there's at least one
> > > more instance in TIPC (also not exploitable, according to TIPC folks) and such
> > 
> > I don't recall this, and I can't see where it would be either. Can you please
> > point to where it is?
> 
> The same dest_name_check() thing.  This
>         if (copy_from_user(&hdr, m->msg_iov[0].iov_base, sizeof(hdr)))
>                 return -EFAULT;
>         if ((ntohs(hdr.tcm_type) & 0xC000) && (!capable(CAP_NET_ADMIN)))
>                 return -EACCES;
> is easily bypassed.  Suppose you want to send a packet with these two
> bits in ->tcm_type not being 00, and you don't have CAP_NET_ADMIN.
> Not a problem - spawn two threads sharing memory, have one trying to
> call sendmsg() while another keeps flipping these two bits.  Sooner
> or later you'll get the timing right and have these bits observed as 00
> in dest_name_check() and 11 when it comes to memcpy_fromiovecend() actually
> copying the whole thing.  And considering that the interval between those
> two is much longer than the loop in the second thread would take on
> each iteration, I'd expect the odds around 25% per attempted sendmsg().
> 
> IOW, this test is either pointless and can be removed completely, or there's
> an exploitable race.  As far as I understand from your replies both back then
> and in another branch of this thread, it's the former and the proper fix is
> to remove at least that part of dest_name_check().  So this case is also
> not something exploitable, but it certainly matches the same pattern.
> 
> My point was simply that this pattern is worth watching for - recurrent bug
> classes like that have a good chance to spawn an instance that will be
> exploitable.

Ping?  Can we simply remove dest_name_check() completely?  That's one of the
few remaining obstacles to making ->sendmsg() iov_iter-clean.  For now I'm
simply commenting its call out in tipc_sendmsg(); if it _is_ needed for
anything, we'll need to get rid of that double copying from userland.  I can
do that, but my impression from your comments back in April is that you
planned to remove the damn check anyway.

Another question: in tipc_send_stream() we have
        mtu = tsk->max_pkt;
        send = min_t(uint, dsz - sent, TIPC_MAX_USER_MSG_SIZE);
        __skb_queue_head_init(&head);
        rc = tipc_msg_build(mhdr, m, sent, send, mtu, &head);
        if (unlikely(rc < 0))
                goto exit;
        do {   
                if (likely(!tsk_conn_cong(tsk))) {
                        rc = tipc_link_xmit(&head, dnode, ref);
                        if (likely(!rc)) {
                                tsk->sent_unacked++;
                                sent += send;
                                if (sent == dsz)
                                        break;
                                goto next;
                        }
                        if (rc == -EMSGSIZE) {
                                tsk->max_pkt = tipc_node_get_mtu(dnode, ref);
                                goto next;
                        }

How can it hit that EMSGSIZE?  AFAICS, it can come only from
int __tipc_link_xmit(struct tipc_link *link, struct sk_buff_head *list)
{
        struct tipc_msg *msg = buf_msg(skb_peek(list));
        uint psz = msg_size(msg);
...
        uint mtu = link->max_pkt;
...
        /* Has valid packet limit been used ? */   
        if (unlikely(psz > mtu)) {
                __skb_queue_purge(list);
                return -EMSGSIZE;
        }

and msg_size() is basically the bits copied into skb by tipc_msg_build() and
set by msg_set_size() in there.  And unless I'm seriously misreading that
function, it can't be more than pktmax argument, i.e. mtu.  So unless something
manages to crap into our skb or change mtu right under us, it shouldn't be
possible.  And mtu (i.e. ->max_pkt) ought to be protected by lock_sock() there.

What's going on there?

^ permalink raw reply	[flat|nested] 82+ messages in thread

end of thread, other threads:[~2014-11-28  5:14 UTC | newest]

Thread overview: 82+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-11-02 23:05 fs: Use non-const iov in aio_read/aio_write Herbert Xu
2014-11-03  0:16 ` Al Viro
2014-11-03  0:21   ` Al Viro
2014-11-03  0:22   ` Herbert Xu
2014-11-03  0:45     ` Al Viro
2014-11-03  5:37       ` [0/3] net: Kill skb_copy_datagram_const_iovec Herbert Xu
2014-11-03  5:44         ` [PATCH 1/3] tun: Modify const aio_read iovec per do_sock_read Herbert Xu
2014-11-03  5:44         ` [PATCH 3/3] net: Kill skb_copy_datagram_const_iovec Herbert Xu
2014-11-03  5:44         ` [PATCH 2/3] macvtap: Modify const aio_read iovec per do_sock_read Herbert Xu
2014-11-03 20:05         ` [0/3] net: Kill skb_copy_datagram_const_iovec David Miller
2014-11-04  3:38           ` Herbert Xu
2014-11-04  8:31             ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Herbert Xu
2014-11-04 14:32               ` Al Viro
2014-11-04 14:35                 ` Al Viro
2014-11-04 14:44                   ` Herbert Xu
2014-11-04 14:52                     ` Al Viro
2014-11-04 14:55                       ` Herbert Xu
2014-11-04 14:42                 ` Herbert Xu
2014-11-04 15:13                   ` Al Viro
2014-11-05  2:22                     ` Herbert Xu
2014-11-05  3:27                       ` David Miller
2014-11-05  3:55                         ` Al Viro
2014-11-05  4:12                           ` Al Viro
2014-11-05 20:51                             ` David Miller
2014-11-05 20:50                           ` David Miller
2014-11-05 21:07                             ` Al Viro
2014-11-05 21:57                               ` David Miller
2014-11-06  3:25                                 ` Al Viro
2014-11-06  5:50                                   ` ipv4: Use standard iovec primitive in raw_probe_proto_opt Herbert Xu
2014-11-06  6:43                                     ` Al Viro
2014-11-06  6:46                                       ` Herbert Xu
2014-11-06  7:11                                         ` Al Viro
2014-11-06  9:55                                           ` Jon Maloy
2014-11-06 22:16                                             ` Al Viro
2014-11-28  5:14                                               ` Al Viro
2014-11-06 21:28                                         ` David Miller
2014-11-07  2:00                                           ` Herbert Xu
2014-11-07 13:25                                             ` [PATCH 0/2] ipv4: Simplify raw_probe_proto_opt and avoid reading user iov twice Herbert Xu
2014-11-07 13:27                                               ` [PATCH 1/2] ipv4: Use standard iovec primitive in raw_probe_proto_opt Herbert Xu
2014-11-07 13:27                                               ` [PATCH 2/2] ipv4: Avoid reading user iov twice after raw_probe_proto_opt Herbert Xu
2014-11-10 19:26                                               ` [PATCH 0/2] ipv4: Simplify raw_probe_proto_opt and avoid reading user iov twice David Miller
2014-11-06  9:50                                   ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Jon Maloy
2014-11-07 21:48                                   ` David Miller
2014-11-07 22:11                                     ` Al Viro
2014-11-07 22:31                                       ` Al Viro
2014-11-07 22:35                                         ` Al Viro
2014-11-07 23:42                                       ` Al Viro
2014-11-08  2:21                                         ` Herbert Xu
2014-11-09 21:19                                         ` Al Viro
2014-11-10  5:20                                           ` David Miller
2014-11-10  6:58                                             ` Al Viro
2014-11-10  7:30                                               ` David Miller
2014-11-10  9:09                                                 ` Al Viro
2014-11-10 16:18                                                   ` David Miller
2014-11-10 10:14                                           ` Michael S. Tsirkin
2014-11-07 21:52                                   ` David Miller
2014-11-05 20:24               ` David Miller
2014-11-06  8:23                 ` Herbert Xu
2014-11-06 17:25                   ` David Miller
2014-11-07  1:59                     ` Herbert Xu
2014-11-07  3:13                       ` David Miller
2014-11-07 13:21                         ` [PATCH 0/4] Replace skb_copy_datagram_const_iovec with iterator version Herbert Xu
2014-11-07 13:22                           ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Herbert Xu
2014-11-07 13:22                           ` [PATCH 2/4] tun: Use iovec iterators Herbert Xu
2014-11-07 13:22                           ` [PATCH 3/4] macvtap: " Herbert Xu
2014-11-07 13:22                           ` [PATCH 4/4] net: Kill skb_copy_datagram_const_iovec Herbert Xu
2014-11-06  8:27                 ` [PATCH 0/4] Replace skb_copy_datagram_const_iovec with iterator version Herbert Xu
2014-11-06  8:28                   ` [PATCH 1/4] inet: Add skb_copy_datagram_iter Herbert Xu
2014-11-06 17:30                     ` Al Viro
2014-11-07  1:58                       ` Herbert Xu
2014-11-06  8:28                   ` [PATCH 2/4] tun: Use iovec iterators Herbert Xu
2014-11-06  8:28                   ` [PATCH 3/4] macvtap: " Herbert Xu
2014-11-06 17:33                     ` Al Viro
2014-11-06  8:28                   ` [PATCH 4/4] net: Kill skb_copy_datagram_const_iovec Herbert Xu
2014-11-04  8:31             ` [PATCH 2/4] tun: Use iovec iterators Herbert Xu
2014-11-04  8:37               ` Herbert Xu
2014-11-05  2:49                 ` YOSHIFUJI Hideaki
2014-11-05  3:41                   ` Herbert Xu
2014-11-04  8:31             ` [PATCH 3/4] macvtap: " Herbert Xu
2014-11-04  8:31             ` [PATCH 4/4] net: Kill skb_copy_datagram_const_iovec Herbert Xu
2014-11-04  5:45           ` [0/3] " Al Viro
2014-11-05  1:53             ` Al Viro

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).