All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH v5] tools/libxc: Implement writev_exact() in the same style as write_exact()
@ 2015-02-19 11:45 Andrew Cooper
  2015-02-19 16:39 ` Ian Campbell
  0 siblings, 1 reply; 5+ messages in thread
From: Andrew Cooper @ 2015-02-19 11:45 UTC (permalink / raw)
  To: Xen-devel; +Cc: Andrew Cooper, Ian Jackson, Ian Campbell, Wei Liu

Implement a writev() wrapper which takes care of ensuring that partial writes
are completed.

writev_exact() is slightly more flexible than writev() in that it will cope
with an iovcnt greater than IOV_MAX, and will take care of correctly
submitting POSIX-compliant writev() calls.  The caller of writev_exact() is
however still required to ensure that the sum of the iov_len values does not
overflow an ssize_t.

Signed-off-by: Andrew Cooper <andrew.cooper3@citrix.com>
CC: Ian Campbell <Ian.Campbell@citrix.com>
CC: Ian Jackson <Ian.Jackson@eu.citrix.com>
CC: Wei Liu <wei.liu2@citrix.com>

---
v5:
 * Rewrite commit message.
v4:
 * Allow this to compile in a stubdom environment.
v3:
 * Re-add adjustment for partial writes.
 * Split min/max adjustment into separate patch.
v2:
 * Remove adjustment for partial writes of a specific iov[] entry.
---
 tools/libxc/xc_private.c |   79 ++++++++++++++++++++++++++++++++++++++++++++++
 tools/libxc/xc_private.h |   14 ++++++++
 2 files changed, 93 insertions(+)

diff --git a/tools/libxc/xc_private.c b/tools/libxc/xc_private.c
index df6cd9b..35c514a 100644
--- a/tools/libxc/xc_private.c
+++ b/tools/libxc/xc_private.c
@@ -860,6 +860,85 @@ int write_exact(int fd, const void *data, size_t size)
     return 0;
 }
 
+#if defined(__MINIOS__)
+/*
+ * MiniOS's libc doesn't know about writev(). Implement it as multiple write()s.
+ */
+int writev_exact(int fd, const struct iovec *iov, int iovcnt)
+{
+    int rc, i;
+
+    for ( i = 0; i < iovcnt; ++i )
+    {
+        rc = write_exact(fd, iov[i].iov_base, iov[i].iov_len);
+        if ( rc )
+            return rc;
+    }
+
+    return 0;
+}
+#else
+int writev_exact(int fd, const struct iovec *iov, int iovcnt)
+{
+    struct iovec *local_iov = NULL;
+    int rc = 0, iov_idx = 0, saved_errno = 0;
+    ssize_t len;
+
+    while ( iov_idx < iovcnt )
+    {
+        /* Skip over iov[] entries with 0 length. */
+        while ( iov[iov_idx].iov_len == 0 )
+            if ( ++iov_idx == iovcnt )
+                goto out;
+
+        len = writev(fd, &iov[iov_idx], min(iovcnt - iov_idx, IOV_MAX));
+        saved_errno = errno;
+
+        if ( (len == -1) && (errno == EINTR) )
+            continue;
+        if ( len <= 0 )
+        {
+            rc = -1;
+            goto out;
+        }
+
+        /* Check iov[] to see whether we had a partial or complete write. */
+        while ( len > 0 && (iov_idx < iovcnt) )
+        {
+            if ( len >= iov[iov_idx].iov_len )
+                len -= iov[iov_idx++].iov_len;
+            else
+            {
+                /* Partial write of iov[iov_idx]. Copy iov so we can adjust
+                 * element iov_idx and resubmit the rest. */
+                if ( !local_iov )
+                {
+                    local_iov = malloc(iovcnt * sizeof(*iov));
+                    if ( !local_iov )
+                    {
+                        saved_errno = ENOMEM;
+                        goto out;
+                    }
+
+                    iov = memcpy(local_iov, iov, iovcnt * sizeof(*iov));
+                }
+
+                local_iov[iov_idx].iov_base += len;
+                local_iov[iov_idx].iov_len  -= len;
+                break;
+            }
+        }
+    }
+
+    saved_errno = 0;
+
+ out:
+    free(local_iov);
+    errno = saved_errno;
+    return rc;
+}
+#endif
+
 int xc_ffs8(uint8_t x)
 {
     int i;
diff --git a/tools/libxc/xc_private.h b/tools/libxc/xc_private.h
index 45b8644..f74f7d7 100644
--- a/tools/libxc/xc_private.h
+++ b/tools/libxc/xc_private.h
@@ -42,6 +42,19 @@
 #define VALGRIND_MAKE_MEM_UNDEFINED(addr, len) /* addr, len */
 #endif
 
+#if defined(__MINIOS__)
+/*
+ * MiniOS's libc doesn't know about sys/uio.h or writev().
+ * Declare enough of sys/uio.h to compile.
+ */
+struct iovec {
+    void *iov_base;
+    size_t iov_len;
+};
+#else
+#include <sys/uio.h>
+#endif
+
 #define DECLARE_HYPERCALL privcmd_hypercall_t hypercall
 #define DECLARE_DOMCTL struct xen_domctl domctl
 #define DECLARE_SYSCTL struct xen_sysctl sysctl
@@ -395,6 +408,7 @@ int xc_add_mmu_update(xc_interface *xch, struct xc_mmu *mmu,
 /* Return 0 on success; -1 on error setting errno. */
 int read_exact(int fd, void *data, size_t size); /* EOF => -1, errno=0 */
 int write_exact(int fd, const void *data, size_t size);
+int writev_exact(int fd, const struct iovec *iov, int iovcnt);
 
 int xc_ffs8(uint8_t x);
 int xc_ffs16(uint16_t x);
-- 
1.7.10.4

^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH v5] tools/libxc: Implement writev_exact() in the same style as write_exact()
  2015-02-19 11:45 [PATCH v5] tools/libxc: Implement writev_exact() in the same style as write_exact() Andrew Cooper
@ 2015-02-19 16:39 ` Ian Campbell
  2015-02-19 16:58   ` Andrew Cooper
  0 siblings, 1 reply; 5+ messages in thread
From: Ian Campbell @ 2015-02-19 16:39 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: Wei Liu, Ian Jackson, Xen-devel

On Thu, 2015-02-19 at 11:45 +0000, Andrew Cooper wrote:

> +    while ( iov_idx < iovcnt )
> +    {
> +        /* Skip over iov[] entries with 0 length. */
> +        while ( iov[iov_idx].iov_len == 0 )
> +            if ( ++iov_idx == iovcnt )
> +                goto out;

Is this required for some reason or just an optimisation?

> +
> +        len = writev(fd, &iov[iov_idx], min(iovcnt - iov_idx, IOV_MAX));
> +        saved_errno = errno;
> +
> +        if ( (len == -1) && (errno == EINTR) )
> +            continue;
> +        if ( len <= 0 )
> +        {
> +            rc = -1;
> +            goto out;
> +        }
> +
> +        /* Check iov[] to see whether we had a partial or complete write. */
> +        while ( len > 0 && (iov_idx < iovcnt) )
> +        {
> +            if ( len >= iov[iov_idx].iov_len )
> +                len -= iov[iov_idx++].iov_len;
> +            else
> +            {
> +                /* Partial write of iov[iov_idx]. Copy iov so we can adjust
> +                 * element iov_idx and resubmit the rest. */

I suppose we can't / don't want to just declare that the input is
non-const and potentially corrupted?

> +                if ( !local_iov )
> +                {
> +                    local_iov = malloc(iovcnt * sizeof(*iov));
> +                    if ( !local_iov )
> +                    {
> +                        saved_errno = ENOMEM;
> +                        goto out;

What is rc at this point? I think it is 0, but I think you want it to be
-1?

It might be better to drop the initialiser of rc and set it in the one or
two places which would then need it (the goto out in the skip-0-length
loop and just before the out label AFAICT).

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v5] tools/libxc: Implement writev_exact() in the same style as write_exact()
  2015-02-19 16:39 ` Ian Campbell
@ 2015-02-19 16:58   ` Andrew Cooper
  2015-02-19 17:01     ` Ian Campbell
  0 siblings, 1 reply; 5+ messages in thread
From: Andrew Cooper @ 2015-02-19 16:58 UTC (permalink / raw)
  To: Ian Campbell; +Cc: Wei Liu, Ian Jackson, Xen-devel

On 19/02/15 16:39, Ian Campbell wrote:
> On Thu, 2015-02-19 at 11:45 +0000, Andrew Cooper wrote:
>
>> +    while ( iov_idx < iovcnt )
>> +    {
>> +        /* Skip over iov[] entries with 0 length. */
>> +        while ( iov[iov_idx].iov_len == 0 )
>> +            if ( ++iov_idx == iovcnt )
>> +                goto out;
> Is this required for some reason or just an optimisation?

Experimentally, submitting a writev() with every iov of length 0 results
in an EINVAL on CentOS 5.x.

This causes a failure if a partial write adjustment happens and only iov
entries of length 0 remain in the set.

>
>> +
>> +        len = writev(fd, &iov[iov_idx], min(iovcnt - iov_idx, IOV_MAX));
>> +        saved_errno = errno;
>> +
>> +        if ( (len == -1) && (errno == EINTR) )
>> +            continue;
>> +        if ( len <= 0 )
>> +        {
>> +            rc = -1;
>> +            goto out;
>> +        }
>> +
>> +        /* Check iov[] to see whether we had a partial or complete write. */
>> +        while ( len > 0 && (iov_idx < iovcnt) )
>> +        {
>> +            if ( len >= iov[iov_idx].iov_len )
>> +                len -= iov[iov_idx++].iov_len;
>> +            else
>> +            {
>> +                /* Partial write of iov[iov_idx]. Copy iov so we can adjust
>> +                 * element iov_idx and resubmit the rest. */
> I suppose we can't / don't want to just declare that the input is
> non-const and potentially corrupted?

That was the v1 implementation, and specifically objected to during review.

>
>> +                if ( !local_iov )
>> +                {
>> +                    local_iov = malloc(iovcnt * sizeof(*iov));
>> +                    if ( !local_iov )
>> +                    {
>> +                        saved_errno = ENOMEM;
>> +                        goto out;
> What is rc at this point? I think it is 0, but I think you want it to be
> -1?
>
> It might be better to drop the inialiser of rc and set it in the one or
> two places which would then need it (the goto out in the skip-0-length
> loop and just before the out label AFAICT).
>
>

Hmm yes - I shall do.

For what it is worth, I can't find any way of provoking a partial write
of an individual iov element, and I don't realistically expect this
codepath to actually be used.  POSIX however doesn't guarantee that it
can't happen.

~Andrew

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v5] tools/libxc: Implement writev_exact() in the same style as write_exact()
  2015-02-19 16:58   ` Andrew Cooper
@ 2015-02-19 17:01     ` Ian Campbell
  2015-02-19 18:20       ` Andrew Cooper
  0 siblings, 1 reply; 5+ messages in thread
From: Ian Campbell @ 2015-02-19 17:01 UTC (permalink / raw)
  To: Andrew Cooper; +Cc: Wei Liu, Ian Jackson, Xen-devel

On Thu, 2015-02-19 at 16:58 +0000, Andrew Cooper wrote:
> On 19/02/15 16:39, Ian Campbell wrote:
> > On Thu, 2015-02-19 at 11:45 +0000, Andrew Cooper wrote:
> >
> >> +    while ( iov_idx < iovcnt )
> >> +    {
> >> +        /* Skip over iov[] entries with 0 length. */
> >> +        while ( iov[iov_idx].iov_len == 0 )
> >> +            if ( ++iov_idx == iovcnt )
> >> +                goto out;
> > Is this required for some reason or just an optimisation?
> 
> Experimentally, submitting a writev() with every iov of length 0 results
> in an EINVAL on CentOS 5.x

How exciting!

> This causes a failure if a partial write adjustment happens and only iov
> entries of length 0 remain in the set.

If only entries of length 0 remain then isn't that a complete-write of
the final non-empty entry?

> 
> >
> >> +
> >> +        len = writev(fd, &iov[iov_idx], min(iovcnt - iov_idx, IOV_MAX));
> >> +        saved_errno = errno;
> >> +
> >> +        if ( (len == -1) && (errno == EINTR) )
> >> +            continue;
> >> +        if ( len <= 0 )
> >> +        {
> >> +            rc = -1;
> >> +            goto out;
> >> +        }
> >> +
> >> +        /* Check iov[] to see whether we had a partial or complete write. */
> >> +        while ( len > 0 && (iov_idx < iovcnt) )
> >> +        {
> >> +            if ( len >= iov[iov_idx].iov_len )
> >> +                len -= iov[iov_idx++].iov_len;
> >> +            else
> >> +            {
> >> +                /* Partial write of iov[iov_idx]. Copy iov so we can adjust
> >> +                 * element iov_idx and resubmit the rest. */
> > I suppose we can't / don't want to just declare that the input is
> > non-const and potentially corrupted?
> 
> That was the v1 implementation, and specifically objected to during review.

OK.

> 
> >
> >> +                if ( !local_iov )
> >> +                {
> >> +                    local_iov = malloc(iovcnt * sizeof(*iov));
> >> +                    if ( !local_iov )
> >> +                    {
> >> +                        saved_errno = ENOMEM;
> >> +                        goto out;
> > What is rc at this point? I think it is 0, but I think you want it to be
> > -1?
> >
> > It might be better to drop the inialiser of rc and set it in the one or
> > two places which would then need it (the goto out in the skip-0-length
> > loop and just before the out label AFAICT).
> >
> >
> 
> Hmm yes - I shall do.
> 
> For what it is worth, I can't find any way of provoking a partial write
> of an individual iov element, and I don't realistically expect this
> codepath to actually be used.  POSIX however doesn't guarantee that it
> can't happen.

Better safe than sorry then.

Ian.

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v5] tools/libxc: Implement writev_exact() in the same style as write_exact()
  2015-02-19 17:01     ` Ian Campbell
@ 2015-02-19 18:20       ` Andrew Cooper
  0 siblings, 0 replies; 5+ messages in thread
From: Andrew Cooper @ 2015-02-19 18:20 UTC (permalink / raw)
  To: Ian Campbell; +Cc: Wei Liu, Ian Jackson, Xen-devel

On 19/02/15 17:01, Ian Campbell wrote:
> On Thu, 2015-02-19 at 16:58 +0000, Andrew Cooper wrote:
>> On 19/02/15 16:39, Ian Campbell wrote:
>>> On Thu, 2015-02-19 at 11:45 +0000, Andrew Cooper wrote:
>>>
>>>> +    while ( iov_idx < iovcnt )
>>>> +    {
>>>> +        /* Skip over iov[] entries with 0 length. */
>>>> +        while ( iov[iov_idx].iov_len == 0 )
>>>> +            if ( ++iov_idx == iovcnt )
>>>> +                goto out;
>>> Is this required for some reason or just an optimisation?
>> Experimentally, submitting a writev() with every iov of length 0 results
>> in an EINVAL on CentOS 5.x
> How exciting!
>
>> This causes a failure if a partial write adjustment happens and only iov
>> entries of length 0 remain in the set.
> If only entries of length 0 remain then isn't that a complete-write of
> the final non-empty entry?

Hmm - I think you are right.  This loop might now be redundant with the
partial iov[] handling below.  That bit of code did the hokey-cokey
several times.

On the other hand, I seem to remember that it ended up like this very
deliberately, and that I couldn't cover the edge case with only one
loop.  I think I am going to have to debug this back into existence again.

~Andrew

^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2015-02-19 18:20 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-02-19 11:45 [PATCH v5] tools/libxc: Implement writev_exact() in the same style as write_exact() Andrew Cooper
2015-02-19 16:39 ` Ian Campbell
2015-02-19 16:58   ` Andrew Cooper
2015-02-19 17:01     ` Ian Campbell
2015-02-19 18:20       ` Andrew Cooper

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.