Re: [KVM PATCH v9 2/2] KVM: add iosignalfd support

From: Gregory Haskins <ghaskins@novell.com>
To: "Michael S. Tsirkin" <mst@redhat.com>
Cc: kvm@vger.kernel.org, linux-kernel@vger.kernel.org, avi@redhat.com
Subject: Re: [KVM PATCH v9 2/2] KVM: add iosignalfd support
Date: Tue, 07 Jul 2009 08:15:08 -0400	[thread overview]
Message-ID: <4A533C4C.7020202@novell.com> (raw)
In-Reply-To: <20090707112024.GA3647@redhat.com>

[-- Attachment #1: Type: text/plain, Size: 15320 bytes --]

Michael S. Tsirkin wrote:
> Some comments below. Sorry, I know it's late in the series, but
> previously I've been too confused with complicated locking to notice
> anything else :(.
>
> On Mon, Jul 06, 2009 at 04:33:21PM -0400, Gregory Haskins wrote:
>   
>
>> +#define KVM_IOSIGNALFD_FLAG_PIO       (1 << 1) /* is a pio (otherwise mmio) */
>> +#define KVM_IOSIGNALFD_FLAG_DEASSIGN  (1 << 2)
>> +
>> +struct kvm_iosignalfd {
>> +	__u64 trigger;
>>     
>
> for length <8, it's the 8*len least significant bits that are used, right?
> That's a bit ugly ... Maybe just put an 8 byte array here instead, then
> the first len bytes are valid.
>   

The original series uses an array that I kmalloc'ed to size, or left it
NULL for a wildcard.  Avi didn't like this and requested a u64 for all
values.  I don't care either way, but since you two are asking for
conflicting designs, I will let you debate.

>   
>>  
>>  void
>> -kvm_irqfd_init(struct kvm *kvm)
>> +kvm_eventfd_init(struct kvm *kvm)
>>  {
>>  	spin_lock_init(&kvm->irqfds.lock);
>>  	INIT_LIST_HEAD(&kvm->irqfds.items);
>> +
>>     
>
> don't need this empty line
>   

Ack
>   
>> +	INIT_LIST_HEAD(&kvm->iosignalfds);
>>  }
>>  
>>  /*
>> @@ -327,3 +333,275 @@ static void __exit irqfd_module_exit(void)
>>  
>>  module_init(irqfd_module_init);
>>  module_exit(irqfd_module_exit);
>> +
>> +/*
>> + * --------------------------------------------------------------------
>> + * iosignalfd: translate a PIO/MMIO memory write to an eventfd signal.
>> + *
>> + * userspace can register a PIO/MMIO address with an eventfd for recieving
>>     
>
> recieving -> receiving
>
>   
Ack

/me is embarrassed

>> + * notification when the memory has been touched.
>> + * --------------------------------------------------------------------
>> + */
>> +
>> +struct _iosignalfd {
>> +	struct list_head     list;
>> +	u64                  addr;
>> +	size_t               length;
>>     
>
> "int length" should be enough: the value is 1, 2, 4 or 8.
> and put wildcard near it if you want to save some space
>
>   
Ok

>> +	struct eventfd_ctx  *eventfd;
>> +	u64                  match;
>>     
>
> match -> value
>
>   
Ok

>> +	struct kvm_io_device dev;
>> +	int                  wildcard:1;
>>     
>
> don't use bitfields
>   

bool?
>   
>> +};
>> +
>> +static inline struct _iosignalfd *
>> +to_iosignalfd(struct kvm_io_device *dev)
>> +{
>> +	return container_of(dev, struct _iosignalfd, dev);
>> +}
>> +
>> +static void
>> +iosignalfd_release(struct _iosignalfd *p)
>> +{
>> +	eventfd_ctx_put(p->eventfd);
>> +	list_del(&p->list);
>> +	kfree(p);
>> +}
>> +
>> +static bool
>> +iosignalfd_in_range(struct _iosignalfd *p, gpa_t addr, int len, const void *val)
>> +{
>> +	u64 _val;
>> +
>> +	if (!(addr == p->addr && len == p->length))
>>     
>
> de-morgan's laws can help simplify this
>
>   
>> +		/* address-range must be precise for a hit */
>>     
>
> So there's apparently no way to specify that
> you want 1,2, or 4 byte writes at address X?
>   

No, they can be any legal size (1, 2, 4, or 8).  The only limitation is
that any overlap of two or more registrations have to be uniform in
addr/len.
>   
>> +		return false;
>> +
>> +	if (p->wildcard)
>> +		/* all else equal, wildcard is always a hit */
>> +		return true;
>> +
>> +	/* otherwise, we have to actually compare the data */
>> +
>> +	BUG_ON(!IS_ALIGNED((unsigned long)val, len));
>> +
>> +	switch (len) {
>> +	case 1:
>> +		_val = *(u8 *)val;
>> +		break;
>> +	case 2:
>> +		_val = *(u16 *)val;
>> +		break;
>> +	case 4:
>> +		_val = *(u32 *)val;
>> +		break;
>> +	case 8:
>> +		_val = *(u64 *)val;
>> +		break;
>> +	default:
>> +		return false;
>> +	}
>> +
>> +	return _val == p->match ? true : false;
>>     
>
> Life be simpler if we use an 8 byte array for match
> and just do memcmp here.
>
>   
You would need to use an n-byte array, technically (to avoid endian
issues).  As mentioned earlier, I already did it that way in earlier
versions but Avi wanted to see it this current (u64 based) way.

>> +}
>> +
>> +/*
>> + * MMIO/PIO writes trigger an event if the addr/val match
>> + */
>>     
>
> single line comment can look like this:
> /* MMIO/PIO writes trigger an event if the addr/val match */
>
>   
Ack

>> +static int
>> +iosignalfd_write(struct kvm_io_device *this, gpa_t addr, int len,
>> +		 const void *val)
>> +{
>> +	struct _iosignalfd *p = to_iosignalfd(this);
>> +
>> +	if (!iosignalfd_in_range(p, addr, len, val))
>> +		return -EOPNOTSUPP;
>> +
>> +	eventfd_signal(p->eventfd, 1);
>> +	return 0;
>> +}
>> +
>> +/*
>> + * This function is called as KVM is completely shutting down.  We do not
>> + * need to worry about locking just nuke anything we have as quickly as possible
>> + */
>> +static void
>> +iosignalfd_destructor(struct kvm_io_device *this)
>> +{
>> +	struct _iosignalfd *p = to_iosignalfd(this);
>> +
>> +	iosignalfd_release(p);
>> +}
>> +
>> +static const struct kvm_io_device_ops iosignalfd_ops = {
>> +	.write      = iosignalfd_write,
>> +	.destructor = iosignalfd_destructor,
>> +};
>> +
>> +static bool
>> +iosignalfd_overlap(struct _iosignalfd *lhs, struct _iosignalfd *rhs)
>>     
>
> this checks both region overlap and data collision.
> better split into two helpers?
>
>   
Why?

>> +{
>> +	/*
>> +	 * Check for completely non-overlapping regions.  We can simply
>> +	 * return "false" for non-overlapping regions and be done with
>> +	 * it.
>> +	 */
>>     
>
> done with it -> ignore the value
>
>   
I think that is a valid expression (at least here in the states), but
ok.  Note that "false" means we are accepting the request, not ignoring
any value.  I will construct a better comment either way.

>> +	if ((rhs->addr + rhs->length) <= lhs->addr)
>> +		return false;
>>     
>
> rhs->addr + rhs->length <= lhs->addr
> is not less clear, as precedence for simple math
> follows the familiar rules from school.
>
>   

Yes, but the "eye compiler" isn't as efficient as a machine driven tool.
;)  The annotation is for the readers benefit (or at least me), not
technical/mathematical correctness.

But whatever, I'll take it out.

>> +
>> +	if ((lhs->addr + lhs->length) <= rhs->addr)
>>     
>
> this math can overflow.
>
>   
Well, we should probably have vetted that during assign since
addr+length that overflows is not a valid region.  I will put a check in
for that.

>> +		return false;
>>     
>
> or shorter:
>  	if (rhs->addr + rhs->length <= lhs->addr ||
>  	    lhs->addr + lhs->length <= rhs->addr)
>            return true;
>
>   
Ok

>> +
>> +	/*
>> +	 * If we get here, we know there is *some* overlap, but we don't
>> +	 * yet know how much.
>>     
>
> how much?
>   
Well, as stated we don't know yet.

>   
>>  Make sure its a "precise" overlap, or
>>     
>
> precise overlap -> address/len pairs match
>
>   

Precisely.

>> +	 * its rejected as incompatible
>> +	 */
>>     
>
> "rejected" does not seem to make sense in the context of a boolean
> function
>
>   

Why?  true = rejected, false = accepted.  Seems boolean to me.  Whats
wrong with that?

>> +	if (lhs->addr != rhs->addr)
>> +		return true;
>> +
>> +	if (lhs->length != rhs->length)
>> +		return true;
>> +
>>     
>
> or shorter:
> 	if (lhs->addr != rhs->addr || lhs->length != rhs->length)
>            return true;
>   

Ok

>
>   
>> +	/*
>> +	 * If we get here, the request should be a precise overlap
>> +	 * between rhs+lhs.  The only thing left to check is for
>> +	 * data-match overlap.  If the data match is distinctly different
>> +	 * we can allow the two to co-exist.  Any kind of wild-card
>> +	 * consitutes an incompatible range, so reject any wild-cards,
>> +	 * or if the match token is identical.
>> +	 */
>> +	if (lhs->wildcard || rhs->wildcard || lhs->match == rhs->match)
>> +		return true;
>> +
>> +	return false;
>> +}
>>     
>
>
>   
>> +
>> +/* assumes kvm->slots_lock write-lock held */
>>     
>
> it seems you only need read lock?
>
>   

The caller needs write-lock, so we just inherit that state.  I can
update the comment though (I just ran a find/replace on "kvm->lock held"
while updating to your new interface, thus the vague comment)

>> +static bool
>> +iosignalfd_check_collision(struct kvm *kvm, struct _iosignalfd *p)
>> +{
>> +	struct _iosignalfd *_p;
>> +
>> +	list_for_each_entry(_p, &kvm->iosignalfds, list)
>> +		if (iosignalfd_overlap(_p, p))
>>     
>
> This looks wrong: let's assume I want one signal with length 1 and one
> with length 2 at address 0. They never trigger together, so it should
> be ok to have 2 such devices.
>   

We have previously decided to not support mis-matched overlaps.  It's
more complicated to handle, and there isn't a compelling use-case for it
that I am aware of.

>   
>> +			return true;
>> +
>> +	return false;
>> +}
>> +
>> +static int
>> +kvm_assign_iosignalfd(struct kvm *kvm, struct kvm_iosignalfd *args)
>> +{
>> +	int                       pio = args->flags & KVM_IOSIGNALFD_FLAG_PIO;
>> +	struct kvm_io_bus        *bus = pio ? &kvm->pio_bus : &kvm->mmio_bus;
>> +	struct _iosignalfd       *p;
>> +	struct eventfd_ctx       *eventfd;
>> +	int                       ret;
>> +
>> +	switch (args->len) {
>> +	case 1:
>> +	case 2:
>> +	case 4:
>> +	case 8:
>> +		break;
>> +	default:
>> +		return -EINVAL;
>> +	}
>> +
>> +	eventfd = eventfd_ctx_fdget(args->fd);
>> +	if (IS_ERR(eventfd))
>> +		return PTR_ERR(eventfd);
>>     
>
> since this eventfd is kept around indefinitely, we should keep the
> file * around as well, so that this eventfd is accounted for
> properly with # of open files limit set by the admin.
>   

I agree.  The fact that I am missing that is a side-effect to the recent
change in eventfd-upstream.  If I acquire both a file* and ctx* and hold
them, it should work around the issue, but perhaps we should let the
eventfd interface handle this.

>   
>> +
>> +	p = kzalloc(sizeof(*p), GFP_KERNEL);
>> +	if (!p) {
>> +		ret = -ENOMEM;
>> +		goto fail;
>> +	}
>> +
>> +	INIT_LIST_HEAD(&p->list);
>> +	p->addr    = args->addr;
>> +	p->length  = args->len;
>> +	p->eventfd = eventfd;
>> +
>> +	/*
>> +	 * A trigger address is optional, otherwise this is a wildcard
>> +	 */
>>     
>
> A single line comment can look like this:
> 	/* A trigger address is optional, otherwise this is a wildcard */
>
>   
>> +	if (args->flags & KVM_IOSIGNALFD_FLAG_TRIGGER)
>> +		p->match = args->trigger;
>>     
>
> For len < 8, high bits in trigger are ignored.
> it's better to check that they are 0, less confusing
> if the user e.g. gets the endian-ness wrong.
>
>   
>> +	else
>> +		p->wildcard = true;
>> +
>>     
>
>   
>> +	down_write(&kvm->slots_lock);
>> +
>> +	/* Verify that there isnt a match already */
>>     
>
> Better to put documentation to where function is declared,
> not where it is used.
>
>   
>> +	if (iosignalfd_check_collision(kvm, p)) {
>> +		ret = -EEXIST;
>> +		goto unlock_fail;
>> +	}
>> +
>> +	kvm_iodevice_init(&p->dev, &iosignalfd_ops);
>> +
>> +	ret = __kvm_io_bus_register_dev(bus, &p->dev);
>> +	if (ret < 0)
>> +		goto unlock_fail;
>> +
>> +	list_add_tail(&p->list, &kvm->iosignalfds);
>> +
>> +	up_write(&kvm->slots_lock);
>> +
>> +	return 0;
>> +
>>     
>
> we probably do not need an empty line after each line of code here
>
>   
>> +unlock_fail:
>> +	up_write(&kvm->slots_lock);
>> +fail:
>> +	/*
>> +	 * it would have never made it to the list in the failure path, so
>> +	 * we dont need to worry about removing it
>> +	 */
>>     
>
> of course: you goto fail before list_add
> can just kill this comment
>
>   
>> +	kfree(p);
>> +
>> +	eventfd_ctx_put(eventfd);
>> +
>> +	return ret;
>>     
>
> we probably do not need an empty line after each line of code here
>
>
>   
>> +}
>> +
>> +
>>     
>
> two empty lines
>   

Ack
>   
>> +static int
>> +kvm_deassign_iosignalfd(struct kvm *kvm, struct kvm_iosignalfd *args)
>> +{
>> +	int                       pio = args->flags & KVM_IOSIGNALFD_FLAG_PIO;
>> +	struct kvm_io_bus        *bus = pio ? &kvm->pio_bus : &kvm->mmio_bus;
>> +	struct _iosignalfd       *p, *tmp;
>> +	struct eventfd_ctx       *eventfd;
>> +	int                       ret = 0;
>> +
>> +	eventfd = eventfd_ctx_fdget(args->fd);
>> +	if (IS_ERR(eventfd))
>> +		return PTR_ERR(eventfd);
>> +
>> +	down_write(&kvm->slots_lock);
>> +
>> +	list_for_each_entry_safe(p, tmp, &kvm->iosignalfds, list) {
>> +
>>     
>
> kill empty line
>
>   
>> +		if (p->eventfd != eventfd)
>> +			continue;
>>     
>
> So for deassign, you ignore all arguments besides fd?  Is this
> intentional? Looks strange - think of multiple addresses triggering a
> single eventfd. But if so, it's better to have a different ioctl with
> just the fields we need.
>   

Hmm... I suspect the check for a range-match got lost along the way.  I
agree we should probably qualify this with more than just the eventfd.

>   
>> +
>> +		__kvm_io_bus_unregister_dev(bus, &p->dev);
>> +		iosignalfd_release(p);
>>     
>
> a single deassign removed multiple irqfds? Looks ugly.
>   

Avi requested this general concept.

>   
>> +	}
>> +
>>     
>
> kill empty line
>
>   
>> +	up_write(&kvm->slots_lock);
>> +
>> +	eventfd_ctx_put(eventfd);
>> +
>> +	return ret;
>> +}
>>     
>
> return error status if no device was found?
>   

Ack
>   
>> +
>> +int
>> +kvm_iosignalfd(struct kvm *kvm, struct kvm_iosignalfd *args)
>> +{
>> +	if (args->flags & KVM_IOSIGNALFD_FLAG_DEASSIGN)
>> +		return kvm_deassign_iosignalfd(kvm, args);
>>     
>
> Better check that only known flag values are present.
> Otherwise when you add more flags things just break
> silently.
>   

Ok
>   
>> +
>> +	return kvm_assign_iosignalfd(kvm, args);
>> +}
>> diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
>> index 11595c7..5ac381b 100644
>> --- a/virt/kvm/kvm_main.c
>> +++ b/virt/kvm/kvm_main.c
>> @@ -979,7 +979,7 @@ static struct kvm *kvm_create_vm(void)
>>  	spin_lock_init(&kvm->mmu_lock);
>>  	spin_lock_init(&kvm->requests_lock);
>>  	kvm_io_bus_init(&kvm->pio_bus);
>> -	kvm_irqfd_init(kvm);
>> +	kvm_eventfd_init(kvm);
>>  	mutex_init(&kvm->lock);
>>  	mutex_init(&kvm->irq_lock);
>>  	kvm_io_bus_init(&kvm->mmio_bus);
>> @@ -2271,6 +2271,15 @@ static long kvm_vm_ioctl(struct file *filp,
>>  		r = kvm_irqfd(kvm, data.fd, data.gsi, data.flags);
>>  		break;
>>  	}
>> +	case KVM_IOSIGNALFD: {
>> +		struct kvm_iosignalfd data;
>> +
>> +		r = -EFAULT;
>>     
>
> this trick is nice, it saves a line of code for the closing brace
> but why waste it on an empty line above then?
>
>   
>> +		if (copy_from_user(&data, argp, sizeof data))
>> +			goto out;
>> +		r = kvm_iosignalfd(kvm, &data);
>> +		break;
>> +	}
>>  #ifdef CONFIG_KVM_APIC_ARCHITECTURE
>>  	case KVM_SET_BOOT_CPU_ID:
>>  		r = 0;
>>     

[-- Attachment #2: OpenPGP digital signature --]
[-- Type: application/pgp-signature, Size: 266 bytes --]