From: Stefano Stabellini <stefano.stabellini@eu.citrix.com>
To: Jan Beulich <JBeulich@suse.com>
Cc: Wei Liu <wei.liu2@citrix.com>,
Stefano Stabellini <stefano.stabellini@eu.citrix.com>,
Andrew Cooper <andrew.cooper3@citrix.com>,
Ian Jackson <Ian.Jackson@eu.citrix.com>, Tim Deegan <tim@xen.org>,
Ian Campbell <Ian.Campbell@eu.citrix.com>,
xen-devel <xen-devel@lists.xenproject.org>,
Keir Fraser <keir@xen.org>
Subject: Re: [PATCH v2] x86/HVM: avoid pointer wraparound in bufioreq handling
Date: Tue, 21 Jul 2015 17:18:24 +0100 [thread overview]
Message-ID: <alpine.DEB.2.02.1507211511010.17378@kaball.uk.xensource.com> (raw)
In-Reply-To: <5582E0660200007800086A27@mail.emea.novell.com>
On Thu, 18 Jun 2015, Jan Beulich wrote:
> The number of slots per page being 511 (i.e. not a power of two) means
> that the (32-bit) read and write indexes going beyond 2^32 will likely
> disturb operation. Extend I/O req server creation so the caller can
> indicate that it is using suitable atomic accesses where needed (not
> all accesses to the two pointers really need to be atomic), allowing
> the hypervisor to atomically canonicalize both pointers when both have
> gone through at least one cycle.
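(For anyone wondering why 511 matters here: the slot index pointer % IOREQ_BUFFER_SLOT_NUM is discontinuous across the 32-bit wrap, because 2^32 is not a multiple of 511. A quick standalone illustration, numbers mine rather than from the patch:

    /* Illustration only: slot index just before and just after the wrap. */
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t wp = UINT32_MAX;                  /* pointer just before wrapping */

        printf("%u\n", wp % 511);                  /* 31 */
        printf("%u\n", (uint32_t)(wp + 1) % 511);  /* 0, not the expected 32 */
        return 0;
    }

so a consumer indexing buf_ioreq[] by pointer % 511 would jump from slot 31 back to slot 0 instead of continuing at 32 as soon as either pointer wraps.)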
>
> Signed-off-by: Jan Beulich <jbeulich@suse.com>
> Acked-by: Ian Campbell <ian.campbell@citrix.com>
> ---
> v2: Limit canonicalization loop to IOREQ_BUFFER_SLOT_NUM iterations.
> Adjust xc_hvm_create_ioreq_server() documentation.
>
> --- a/tools/libxc/include/xenctrl.h
> +++ b/tools/libxc/include/xenctrl.h
> @@ -1933,7 +1933,8 @@ int xc_get_hvm_param(xc_interface *handl
> *
> * @parm xch a handle to an open hypervisor interface.
> * @parm domid the domain id to be serviced
> - * @parm handle_bufioreq should the IOREQ Server handle buffered requests?
> + * @parm handle_bufioreq how should the IOREQ Server handle buffered requests
> + * (HVM_IOREQSRV_BUFIOREQ_*)?
> * @parm id pointer to an ioservid_t to receive the IOREQ Server id.
> * @return 0 on success, -1 on failure.
> */
> --- a/tools/libxc/xc_domain.c
> +++ b/tools/libxc/xc_domain.c
> @@ -1411,7 +1411,7 @@ int xc_hvm_create_ioreq_server(xc_interf
> hypercall.arg[1] = HYPERCALL_BUFFER_AS_ARG(arg);
>
> arg->domid = domid;
> - arg->handle_bufioreq = !!handle_bufioreq;
> + arg->handle_bufioreq = handle_bufioreq;
>
> rc = do_xen_hypercall(xch, &hypercall);
>
> --- a/xen/arch/x86/hvm/hvm.c
> +++ b/xen/arch/x86/hvm/hvm.c
> @@ -921,7 +921,7 @@ static void hvm_ioreq_server_disable(str
>
> static int hvm_ioreq_server_init(struct hvm_ioreq_server *s, struct domain *d,
> domid_t domid, bool_t is_default,
> - bool_t handle_bufioreq, ioservid_t id)
> + int bufioreq_handling, ioservid_t id)
uint8_t?
> {
> struct vcpu *v;
> int rc;
> @@ -938,7 +938,11 @@ static int hvm_ioreq_server_init(struct
> if ( rc )
> return rc;
>
> - rc = hvm_ioreq_server_setup_pages(s, is_default, handle_bufioreq);
> + if ( bufioreq_handling == HVM_IOREQSRV_BUFIOREQ_ATOMIC )
> + s->bufioreq_atomic = 1;
> +
> + rc = hvm_ioreq_server_setup_pages(
> + s, is_default, bufioreq_handling != HVM_IOREQSRV_BUFIOREQ_OFF);
> if ( rc )
> goto fail_map;
>
> @@ -997,12 +1001,15 @@ static ioservid_t next_ioservid(struct d
> }
>
> static int hvm_create_ioreq_server(struct domain *d, domid_t domid,
> - bool_t is_default, bool_t handle_bufioreq,
> + bool_t is_default, int bufioreq_handling,
uint8_t?
> ioservid_t *id)
> {
> struct hvm_ioreq_server *s;
> int rc;
>
> + if ( bufioreq_handling > HVM_IOREQSRV_BUFIOREQ_ATOMIC )
> + return -EINVAL;
> +
> rc = -ENOMEM;
> s = xzalloc(struct hvm_ioreq_server);
> if ( !s )
> @@ -1015,7 +1022,7 @@ static int hvm_create_ioreq_server(struc
> if ( is_default && d->arch.hvm_domain.default_ioreq_server != NULL )
> goto fail2;
>
> - rc = hvm_ioreq_server_init(s, d, domid, is_default, handle_bufioreq,
> + rc = hvm_ioreq_server_init(s, d, domid, is_default, bufioreq_handling,
> next_ioservid(d));
> if ( rc )
> goto fail3;
> @@ -2560,7 +2567,7 @@ int hvm_buffered_io_send(ioreq_t *p)
> spin_lock(&s->bufioreq_lock);
>
> - if ( (pg->write_pointer - pg->read_pointer) >=
> + if ( (pg->ptrs.write_pointer - pg->ptrs.read_pointer) >=
> (IOREQ_BUFFER_SLOT_NUM - qw) )
> {
> /* The queue is full: send the iopacket through the normal path. */
> @@ -2568,17 +2575,29 @@ int hvm_buffered_io_send(ioreq_t *p)
> return 0;
> }
>
> - pg->buf_ioreq[pg->write_pointer % IOREQ_BUFFER_SLOT_NUM] = bp;
> + pg->buf_ioreq[pg->ptrs.write_pointer % IOREQ_BUFFER_SLOT_NUM] = bp;
>
> if ( qw )
> {
> bp.data = p->data >> 32;
> - pg->buf_ioreq[(pg->write_pointer+1) % IOREQ_BUFFER_SLOT_NUM] = bp;
> + pg->buf_ioreq[(pg->ptrs.write_pointer+1) % IOREQ_BUFFER_SLOT_NUM] = bp;
> }
>
> /* Make the ioreq_t visible /before/ write_pointer. */
> wmb();
> - pg->write_pointer += qw ? 2 : 1;
> + pg->ptrs.write_pointer += qw ? 2 : 1;
> +
> + /* Canonicalize read/write pointers to prevent their overflow. */
> + while ( s->bufioreq_atomic && qw++ < IOREQ_BUFFER_SLOT_NUM &&
> + pg->ptrs.read_pointer >= IOREQ_BUFFER_SLOT_NUM )
> + {
> + union bufioreq_pointers old = pg->ptrs, new;
> + unsigned int n = old.read_pointer / IOREQ_BUFFER_SLOT_NUM;
> +
> + new.read_pointer = old.read_pointer - n * IOREQ_BUFFER_SLOT_NUM;
> + new.write_pointer = old.write_pointer - n * IOREQ_BUFFER_SLOT_NUM;
> + cmpxchg(&pg->ptrs.full, old.full, new.full);
> + }
This is not safe: if the reader increments read_pointer (atomically)
after you read old.read_pointer and before cmpxchg, the increment is
lost.

I think you need to remove the cmpxchg and subtract a multiple of
IOREQ_BUFFER_SLOT_NUM from read_pointer atomically here. Unfortunately
if we do that, we cannot update both write_pointer and read_pointer
together anymore. But if we decrement write_pointer atomically before
read_pointer, I think we should be fine if we appropriately fix the
reader side too:
QEMU:

    atomic_read(&read_pointer);
    read write_pointer;
    xen_rmb();
    while (read_pointer < write_pointer) {

        [work]

        atomic_add(&read_pointer, stuff);
        read write_pointer;
        xen_rmb();
    }

Xen:

    [work]

    increment write_pointer;
    xen_wmb();
    if (read_pointer >= IOREQ_BUFFER_SLOT_NUM) {
        write_pointer -= IOREQ_BUFFER_SLOT_NUM;
        xen_wmb();
        atomic_sub(read_pointer, IOREQ_BUFFER_SLOT_NUM);
    }
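On the Xen side that would boil down to something like the following (a sketch only, replacing the cmpxchg loop in the patch and still running under s->bufioreq_lock; atomic_sub() stands in for whatever fetch-and-subtract primitive we end up using):

    pg->ptrs.write_pointer += qw ? 2 : 1;
    wmb();

    /* Canonicalize without cmpxchg: the reader only ever adds to
     * read_pointer, so subtracting the same constant from both pointers,
     * write_pointer first, keeps the pair consistent. */
    if ( s->bufioreq_atomic &&
         pg->ptrs.read_pointer >= IOREQ_BUFFER_SLOT_NUM )
    {
        /* The reader may briefly observe an apparently empty ring here,
         * which is harmless. */
        pg->ptrs.write_pointer -= IOREQ_BUFFER_SLOT_NUM;
        wmb();
        atomic_sub(&pg->ptrs.read_pointer, IOREQ_BUFFER_SLOT_NUM);
    }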
Thread overview: 7+ messages
2015-06-18 13:14 [PATCH v2] x86/HVM: avoid pointer wraparound in bufioreq handling Jan Beulich
2015-06-18 13:41 ` Andrew Cooper
2015-07-21 16:18 ` Stefano Stabellini [this message]
2015-07-22 13:38 ` Jan Beulich
2015-07-22 14:49 ` Stefano Stabellini
2015-07-22 15:21 ` Jan Beulich
2015-07-22 15:45 ` Stefano Stabellini