* [PATCH v3] e1000e: using bottom half to send packets
@ 2020-07-21 15:17 Li Qiang
  2020-07-22  3:32 ` Jason Wang
  0 siblings, 1 reply; 7+ messages in thread
From: Li Qiang @ 2020-07-21 15:17 UTC (permalink / raw)
  To: dmitry.fleytman, jasowang, pbonzini
  Cc: alxndr, Li Qiang, liq3ea, qemu-devel, ppandit

Alexander Bulekov reported a use-after-free (UAF) bug related to e1000e packet transmission.

-->https://bugs.launchpad.net/qemu/+bug/1886362

This happens because the guest triggers an e1000e packet transmission
with the data buffer's address pointing into e1000e's own MMIO region.
When e1000e then performs the DMA, it writes to that MMIO region again,
triggering re-entrancy and finally causing the UAF.
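
To illustrate the cycle (a rough sketch, not taken verbatim from the
report; the function names follow the code touched below and the exact
path may differ):

    e1000e_set_tdt()                   vCPU MMIO write starts TX
        e1000e_start_xmit()
            e1000e_process_tx_desc()
                DMA to the descriptor's buffer address, which the
                guest pointed back at e1000e's own MMIO BAR
                    e1000e MMIO write handler re-entered
                        e1000e_set_tdt()/e1000e_start_xmit() again,
                        while the outer call is still in progress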

Paolo suggested using a bottom half whenever MMIO is doing complicated
things, here:
-->https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg03342.html

Reference here:
'The easiest solution is to delay processing of descriptors to a bottom
half whenever MMIO is doing something complicated.  This is also better
for latency because it will free the vCPU thread more quickly and leave
the work to the I/O thread.'

This patch fixes this UAF.

Reported-by: Alexander Bulekov <alxndr@bu.edu>
Signed-off-by: Li Qiang <liq3ea@163.com>
---
Changes since v2:
1. Add comments for the tx bh scheduling when the VM resumes
2. Leave the ICS-setting code in 'e1000e_start_xmit'
3. Cancel the tx bh and reset tx_waiting in e1000e_core_reset

Changes since v1:
Per Jason's review here:
-- https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg05368.html
1. Cancel and schedule the tx bh when the VM is stopped or resumed
2. Add a tx_burst configuration option for e1000e to throttle the bh execution
3. Add a tx_waiting flag to record whether the bh is pending or not
Don't take the BQL in the tx_bh handler, since the BQL is already held
when tx_bh is executed (see the illustrative sketch below).
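
For readers unfamiliar with QEMU bottom halves, here is a minimal sketch
of the deferral pattern this patch relies on (illustrative only; 'MyDev'
and the my_dev_* names are made up, the qemu_bh_* calls are the real API):

    /* The bh callback runs later in the main loop / I/O thread,
     * with the BQL already held, so it must not take it again. */
    static void my_dev_tx_bh(void *opaque)
    {
        MyDev *d = opaque;
        my_dev_do_tx(d);                 /* do the deferred work here */
    }

    /* at realize time */
    d->tx_bh = qemu_bh_new(my_dev_tx_bh, d);

    /* from the MMIO write handler: defer instead of doing DMA now */
    qemu_bh_schedule(d->tx_bh);

    /* on VM stop / device reset */
    qemu_bh_cancel(d->tx_bh);

    /* at unrealize time */
    qemu_bh_delete(d->tx_bh);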

 hw/net/e1000e.c      |   6 +++
 hw/net/e1000e_core.c | 107 +++++++++++++++++++++++++++++++++++--------
 hw/net/e1000e_core.h |   8 ++++
 3 files changed, 101 insertions(+), 20 deletions(-)

diff --git a/hw/net/e1000e.c b/hw/net/e1000e.c
index fda34518c9..24e35a78bf 100644
--- a/hw/net/e1000e.c
+++ b/hw/net/e1000e.c
@@ -77,10 +77,14 @@ typedef struct E1000EState {
 
     bool disable_vnet;
 
+    int32_t tx_burst;
+
     E1000ECore core;
 
 } E1000EState;
 
+#define TX_BURST 256
+
 #define E1000E_MMIO_IDX     0
 #define E1000E_FLASH_IDX    1
 #define E1000E_IO_IDX       2
@@ -263,6 +267,7 @@ static void e1000e_core_realize(E1000EState *s)
 {
     s->core.owner = &s->parent_obj;
     s->core.owner_nic = s->nic;
+    s->core.tx_burst = s->tx_burst;
 }
 
 static void
@@ -665,6 +670,7 @@ static Property e1000e_properties[] = {
                         e1000e_prop_subsys_ven, uint16_t),
     DEFINE_PROP_SIGNED("subsys", E1000EState, subsys, 0,
                         e1000e_prop_subsys, uint16_t),
+    DEFINE_PROP_INT32("x-txburst", E1000EState, tx_burst, TX_BURST),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c
index bcd186cac5..2fdfc23204 100644
--- a/hw/net/e1000e_core.c
+++ b/hw/net/e1000e_core.c
@@ -910,18 +910,17 @@ e1000e_rx_ring_init(E1000ECore *core, E1000E_RxRing *rxr, int idx)
 }
 
 static void
-e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
+e1000e_start_xmit(struct e1000e_tx *q)
 {
+    E1000ECore *core = q->core;
     dma_addr_t base;
     struct e1000_tx_desc desc;
-    bool ide = false;
-    const E1000E_RingInfo *txi = txr->i;
-    uint32_t cause = E1000_ICS_TXQE;
+    const E1000E_RingInfo *txi;
+    E1000E_TxRing txr;
+    int32_t num_packets = 0;
 
-    if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
-        trace_e1000e_tx_disabled();
-        return;
-    }
+    e1000e_tx_ring_init(core, &txr, q - &core->tx[0]);
+    txi = txr.i;
 
     while (!e1000e_ring_empty(core, txi)) {
         base = e1000e_ring_head_descr(core, txi);
@@ -931,14 +930,24 @@ e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
         trace_e1000e_tx_descr((void *)(intptr_t)desc.buffer_addr,
                               desc.lower.data, desc.upper.data);
 
-        e1000e_process_tx_desc(core, txr->tx, &desc, txi->idx);
-        cause |= e1000e_txdesc_writeback(core, base, &desc, &ide, txi->idx);
+        e1000e_process_tx_desc(core, txr.tx, &desc, txi->idx);
+        q->cause |= e1000e_txdesc_writeback(core, base, &desc,
+                                            &q->ide, txi->idx);
 
         e1000e_ring_advance(core, txi, 1);
+        if (++num_packets >= core->tx_burst) {
+            break;
+        }
+    }
+
+    if (num_packets >= core->tx_burst) {
+        qemu_bh_schedule(q->tx_bh);
+        q->tx_waiting = 1;
+        return;
     }
 
-    if (!ide || !e1000e_intrmgr_delay_tx_causes(core, &cause)) {
-        e1000e_set_interrupt_cause(core, cause);
+    if (!q->ide || !e1000e_intrmgr_delay_tx_causes(core, &q->cause)) {
+        e1000e_set_interrupt_cause(core, q->cause);
     }
 }
 
@@ -2423,32 +2432,41 @@ e1000e_set_dbal(E1000ECore *core, int index, uint32_t val)
 static void
 e1000e_set_tctl(E1000ECore *core, int index, uint32_t val)
 {
-    E1000E_TxRing txr;
     core->mac[index] = val;
 
     if (core->mac[TARC0] & E1000_TARC_ENABLE) {
-        e1000e_tx_ring_init(core, &txr, 0);
-        e1000e_start_xmit(core, &txr);
+        if (core->tx[0].tx_waiting) {
+            return;
+        }
+        core->tx[0].tx_waiting = 1;
+        if (!core->vm_running) {
+            return;
+        }
+        qemu_bh_schedule(core->tx[0].tx_bh);
     }
 
     if (core->mac[TARC1] & E1000_TARC_ENABLE) {
-        e1000e_tx_ring_init(core, &txr, 1);
-        e1000e_start_xmit(core, &txr);
+        if (core->tx[1].tx_waiting) {
+            return;
+        }
+        core->tx[1].tx_waiting = 1;
+        if (!core->vm_running) {
+            return;
+        }
+        qemu_bh_schedule(core->tx[1].tx_bh);
     }
 }
 
 static void
 e1000e_set_tdt(E1000ECore *core, int index, uint32_t val)
 {
-    E1000E_TxRing txr;
     int qidx = e1000e_mq_queue_idx(TDT, index);
     uint32_t tarc_reg = (qidx == 0) ? TARC0 : TARC1;
 
     core->mac[index] = val & 0xffff;
 
     if (core->mac[tarc_reg] & E1000_TARC_ENABLE) {
-        e1000e_tx_ring_init(core, &txr, qidx);
-        e1000e_start_xmit(core, &txr);
+        qemu_bh_schedule(core->tx[qidx].tx_bh);
     }
 }
 
@@ -3315,11 +3333,52 @@ e1000e_vm_state_change(void *opaque, int running, RunState state)
         trace_e1000e_vm_state_running();
         e1000e_intrmgr_resume(core);
         e1000e_autoneg_resume(core);
+        core->vm_running = 1;
+
+        for (int i = 0; i < E1000E_NUM_QUEUES; i++) {
+            /*
+             * Schedule tx bh unconditionally to make sure
+             * tx work after live migration since we don't
+             * migrate tx_waiting.
+             */
+            qemu_bh_schedule(core->tx[i].tx_bh);
+        }
+
     } else {
         trace_e1000e_vm_state_stopped();
+
+        for (int i = 0; i < E1000E_NUM_QUEUES; i++) {
+            qemu_bh_cancel(core->tx[i].tx_bh);
+        }
+
         e1000e_autoneg_pause(core);
         e1000e_intrmgr_pause(core);
+        core->vm_running = 0;
+    }
+}
+
+
+static void e1000e_core_tx_bh(void *opaque)
+{
+    struct e1000e_tx *q = opaque;
+    E1000ECore *core = q->core;
+
+    if (!core->vm_running) {
+        assert(q->tx_waiting);
+        return;
+    }
+
+    q->tx_waiting = 0;
+
+    if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
+        trace_e1000e_tx_disabled();
+        return;
     }
+
+    q->cause = E1000_ICS_TXQE;
+    q->ide = false;
+
+    e1000e_start_xmit(q);
 }
 
 void
@@ -3334,12 +3393,15 @@ e1000e_core_pci_realize(E1000ECore     *core,
                                        e1000e_autoneg_timer, core);
     e1000e_intrmgr_pci_realize(core);
 
+    core->vm_running = runstate_is_running();
     core->vmstate =
         qemu_add_vm_change_state_handler(e1000e_vm_state_change, core);
 
     for (i = 0; i < E1000E_NUM_QUEUES; i++) {
         net_tx_pkt_init(&core->tx[i].tx_pkt, core->owner,
                         E1000E_MAX_TX_FRAGS, core->has_vnet);
+        core->tx[i].core = core;
+        core->tx[i].tx_bh = qemu_bh_new(e1000e_core_tx_bh, &core->tx[i]);
     }
 
     net_rx_pkt_init(&core->rx_pkt, core->has_vnet);
@@ -3367,6 +3429,9 @@ e1000e_core_pci_uninit(E1000ECore *core)
     for (i = 0; i < E1000E_NUM_QUEUES; i++) {
         net_tx_pkt_reset(core->tx[i].tx_pkt);
         net_tx_pkt_uninit(core->tx[i].tx_pkt);
+        qemu_bh_cancel(core->tx[i].tx_bh);
+        qemu_bh_delete(core->tx[i].tx_bh);
+        core->tx[i].tx_bh = NULL;
     }
 
     net_rx_pkt_uninit(core->rx_pkt);
@@ -3480,6 +3545,8 @@ e1000e_core_reset(E1000ECore *core)
         net_tx_pkt_reset(core->tx[i].tx_pkt);
         memset(&core->tx[i].props, 0, sizeof(core->tx[i].props));
         core->tx[i].skip_cp = false;
+        qemu_bh_cancel(core->tx[i].tx_bh);
+        core->tx[i].tx_waiting = 0;
     }
 }
 
diff --git a/hw/net/e1000e_core.h b/hw/net/e1000e_core.h
index aee32f7e48..0c16dce3a6 100644
--- a/hw/net/e1000e_core.h
+++ b/hw/net/e1000e_core.h
@@ -77,10 +77,18 @@ struct E1000Core {
         unsigned char sum_needed;
         bool cptse;
         struct NetTxPkt *tx_pkt;
+        QEMUBH *tx_bh;
+        uint32_t tx_waiting;
+        uint32_t cause;
+        bool ide;
+        E1000ECore *core;
     } tx[E1000E_NUM_QUEUES];
 
     struct NetRxPkt *rx_pkt;
 
+    int32_t tx_burst;
+
+    bool vm_running;
     bool has_vnet;
     int max_queue_num;
 
-- 
2.17.1



^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH v3] e1000e: using bottom half to send packets
  2020-07-21 15:17 [PATCH v3] e1000e: using bottom half to send packets Li Qiang
@ 2020-07-22  3:32 ` Jason Wang
  2020-07-22  4:47   ` Li Qiang
  2020-07-22  4:52   ` P J P
  0 siblings, 2 replies; 7+ messages in thread
From: Jason Wang @ 2020-07-22  3:32 UTC (permalink / raw)
  To: Li Qiang, dmitry.fleytman, pbonzini; +Cc: alxndr, liq3ea, qemu-devel, ppandit


On 2020/7/21 at 11:17 PM, Li Qiang wrote:
> Alexander Bulekov reported a UAF bug related e1000e packets send.
>
> -->https://bugs.launchpad.net/qemu/+bug/1886362
>
> This is because the guest trigger a e1000e packet send and set the
> data's address to e1000e's MMIO address. So when the e1000e do DMA
> it will write the MMIO again and trigger re-entrancy and finally
> causes this UAF.
>
> Paolo suggested to use a bottom half whenever MMIO is doing complicate
> things in here:
> -->https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg03342.html
>
> Reference here:
> 'The easiest solution is to delay processing of descriptors to a bottom
> half whenever MMIO is doing something complicated.  This is also better
> for latency because it will free the vCPU thread more quickly and leave
> the work to the I/O thread.'
>
> This patch fixes this UAF.
>
> Reported-by: Alexander Bulekov <alxndr@bu.edu>
> Signed-off-by: Li Qiang <liq3ea@163.com>
> ---
> Change since v2:
> 1. Add comments for the tx bh schdule when VM resumes
> 2. Leave the set ics code in 'e1000e_start_xmit'
> 3. Cancel the tx bh and reset tx_waiting in e1000e_core_reset


So based on our discussion this is probably not sufficient. It solves
the TX re-entrancy issue but not RX (e.g. RX DMA to RDT?). Or is
e1000e's RX path reentrant?

Thanks





^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v3] e1000e: using bottom half to send packets
  2020-07-22  3:32 ` Jason Wang
@ 2020-07-22  4:47   ` Li Qiang
  2020-07-22  5:49     ` Jason Wang
  2020-07-22  4:52   ` P J P
  1 sibling, 1 reply; 7+ messages in thread
From: Li Qiang @ 2020-07-22  4:47 UTC (permalink / raw)
  To: Jason Wang
  Cc: Dmitry Fleytman, Li Qiang, Qemu Developers, P J P,
	Alexander Bulekov, Paolo Bonzini

Jason Wang <jasowang@redhat.com> wrote on Wednesday, 2020-07-22 at 11:32 AM:
>
>
> On 2020/7/21 at 11:17 PM, Li Qiang wrote:
> > Alexander Bulekov reported a UAF bug related e1000e packets send.
> >
> > -->https://bugs.launchpad.net/qemu/+bug/1886362
> >
> > This is because the guest trigger a e1000e packet send and set the
> > data's address to e1000e's MMIO address. So when the e1000e do DMA
> > it will write the MMIO again and trigger re-entrancy and finally
> > causes this UAF.
> >
> > Paolo suggested to use a bottom half whenever MMIO is doing complicate
> > things in here:
> > -->https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg03342.html
> >
> > Reference here:
> > 'The easiest solution is to delay processing of descriptors to a bottom
> > half whenever MMIO is doing something complicated.  This is also better
> > for latency because it will free the vCPU thread more quickly and leave
> > the work to the I/O thread.'
> >
> > This patch fixes this UAF.
> >
> > Reported-by: Alexander Bulekov <alxndr@bu.edu>
> > Signed-off-by: Li Qiang <liq3ea@163.com>
> > ---
> > Change since v2:
> > 1. Add comments for the tx bh schdule when VM resumes
> > 2. Leave the set ics code in 'e1000e_start_xmit'
> > 3. Cancel the tx bh and reset tx_waiting in e1000e_core_reset
>
>
> So based on our discussion this is probably not sufficient. It solves
> the issue TX re-entrancy but not RX (e.g RX DMA to RDT?) Or is e1000e's
> RX is reentrant?
>

It seems RX has no re-entrancy issue once we address the TX re-entrancy issue.
Though we can point the RX-related DMA registers at MMIO, so that when we
receive packets we end up writing arbitrary data to MMIO, that is no
different from the guest writing arbitrary data to MMIO directly.

I think we can hold off on this patch and discuss this MMIO re-entrancy
issue some more before making a decision.

Thanks,
Li Qiang




^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v3] e1000e: using bottom half to send packets
  2020-07-22  3:32 ` Jason Wang
  2020-07-22  4:47   ` Li Qiang
@ 2020-07-22  4:52   ` P J P
  1 sibling, 0 replies; 7+ messages in thread
From: P J P @ 2020-07-22  4:52 UTC (permalink / raw)
  To: Jason Wang
  Cc: dmitry.fleytman, Li Qiang, liq3ea, qemu-devel, alxndr, pbonzini


+-- On Wed, 22 Jul 2020, Jason Wang wrote --+
| On 2020/7/21 at 11:17 PM, Li Qiang wrote:
| > Alexander Bulekov reported a UAF bug related e1000e packets send.
| >
| > -->https://bugs.launchpad.net/qemu/+bug/1886362
| >
| > This is because the guest trigger a e1000e packet send and set the
| > data's address to e1000e's MMIO address. So when the e1000e do DMA
| > it will write the MMIO again and trigger re-entrancy and finally
| > causes this UAF.
| >
| > Paolo suggested to use a bottom half whenever MMIO is doing complicate
| > things in here:
| > -->https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg03342.html
| >
| > Reference here:
| > 'The easiest solution is to delay processing of descriptors to a bottom
| > half whenever MMIO is doing something complicated.  This is also better
| > for latency because it will free the vCPU thread more quickly and leave
| > the work to the I/O thread.'
| >
| > This patch fixes this UAF.
| >
| > Reported-by: Alexander Bulekov <alxndr@bu.edu>
| > Signed-off-by: Li Qiang <liq3ea@163.com>
| > ---
| > Change since v2:
| > 1. Add comments for the tx bh schdule when VM resumes
| > 2. Leave the set ics code in 'e1000e_start_xmit'
| > 3. Cancel the tx bh and reset tx_waiting in e1000e_core_reset
| 
| 
| So based on our discussion this is probably not sufficient. It solves the
| issue TX re-entrancy but not RX (e.g RX DMA to RDT?) Or is e1000e's RX is
| reentrant?

Fixes: (CVE-2020-15859) -> https://bugzilla.redhat.com/show_bug.cgi?id=1859168

Thank you.
--
Prasad J Pandit / Red Hat Product Security Team
8685 545E B54C 486B C6EB 271E E285 8B5A F050 DE8D

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v3] e1000e: using bottom half to send packets
  2020-07-22  4:47   ` Li Qiang
@ 2020-07-22  5:49     ` Jason Wang
  2020-07-22  8:31       ` Jason Wang
  2020-07-22 10:45       ` Li Qiang
  0 siblings, 2 replies; 7+ messages in thread
From: Jason Wang @ 2020-07-22  5:49 UTC (permalink / raw)
  To: Li Qiang
  Cc: Dmitry Fleytman, Li Qiang, Qemu Developers, P J P,
	Alexander Bulekov, Paolo Bonzini


On 2020/7/22 at 12:47 PM, Li Qiang wrote:
> Jason Wang <jasowang@redhat.com> wrote on Wednesday, 2020-07-22 at 11:32 AM:
>>
>> On 2020/7/21 at 11:17 PM, Li Qiang wrote:
>>> Alexander Bulekov reported a UAF bug related e1000e packets send.
>>>
>>> -->https://bugs.launchpad.net/qemu/+bug/1886362
>>>
>>> This is because the guest trigger a e1000e packet send and set the
>>> data's address to e1000e's MMIO address. So when the e1000e do DMA
>>> it will write the MMIO again and trigger re-entrancy and finally
>>> causes this UAF.
>>>
>>> Paolo suggested to use a bottom half whenever MMIO is doing complicate
>>> things in here:
>>> -->https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg03342.html
>>>
>>> Reference here:
>>> 'The easiest solution is to delay processing of descriptors to a bottom
>>> half whenever MMIO is doing something complicated.  This is also better
>>> for latency because it will free the vCPU thread more quickly and leave
>>> the work to the I/O thread.'
>>>
>>> This patch fixes this UAF.
>>>
>>> Reported-by: Alexander Bulekov <alxndr@bu.edu>
>>> Signed-off-by: Li Qiang <liq3ea@163.com>
>>> ---
>>> Change since v2:
>>> 1. Add comments for the tx bh schdule when VM resumes
>>> 2. Leave the set ics code in 'e1000e_start_xmit'
>>> 3. Cancel the tx bh and reset tx_waiting in e1000e_core_reset
>>
>> So based on our discussion this is probably not sufficient. It solves
>> the issue TX re-entrancy but not RX (e.g RX DMA to RDT?) Or is e1000e's
>> RX is reentrant?
>>
> It seems the RX has no reentrant issue if we address the TX reentrant issue.


Just to make sure I understand.

Did you mean

1) RX code is reentrant

or

2) the following can't happen

e1000e_receive()
     e1000e_write_packet_to_guest()
         e1000e_write_to_rx_buffers()
             pci_dma_write()
                 ...
                     e1000e_set_rdt()
                         qemu_flush_queued_packets()
                             e1000e_receive()

?


> Though we can write the RX-related DMA register, then when we receive
> packets, we write to the MMIO with any data.
> However this is not different between the guest write any data to MMIO.
>
> I think we can hold on this patch and discuss more about this MMIO
> reentrant issue then make
> the decision.


Right, and we need to start thinking about how to test this, e.g. with qtest?

Thanks





^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v3] e1000e: using bottom half to send packets
  2020-07-22  5:49     ` Jason Wang
@ 2020-07-22  8:31       ` Jason Wang
  2020-07-22 10:45       ` Li Qiang
  1 sibling, 0 replies; 7+ messages in thread
From: Jason Wang @ 2020-07-22  8:31 UTC (permalink / raw)
  To: Li Qiang
  Cc: Dmitry Fleytman, Li Qiang, Qemu Developers, P J P,
	Alexander Bulekov, Paolo Bonzini


On 2020/7/22 at 1:49 PM, Jason Wang wrote:
>
> On 2020/7/22 at 12:47 PM, Li Qiang wrote:
>> Jason Wang <jasowang@redhat.com> wrote on Wednesday, 2020-07-22 at 11:32 AM:
>>>
>>> On 2020/7/21 at 11:17 PM, Li Qiang wrote:
>>>> Alexander Bulekov reported a UAF bug related e1000e packets send.
>>>>
>>>> -->https://bugs.launchpad.net/qemu/+bug/1886362
>>>>
>>>> This is because the guest trigger a e1000e packet send and set the
>>>> data's address to e1000e's MMIO address. So when the e1000e do DMA
>>>> it will write the MMIO again and trigger re-entrancy and finally
>>>> causes this UAF.
>>>>
>>>> Paolo suggested to use a bottom half whenever MMIO is doing complicate
>>>> things in here:
>>>> -->https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg03342.html 
>>>>
>>>>
>>>> Reference here:
>>>> 'The easiest solution is to delay processing of descriptors to a 
>>>> bottom
>>>> half whenever MMIO is doing something complicated.  This is also 
>>>> better
>>>> for latency because it will free the vCPU thread more quickly and 
>>>> leave
>>>> the work to the I/O thread.'
>>>>
>>>> This patch fixes this UAF.
>>>>
>>>> Reported-by: Alexander Bulekov <alxndr@bu.edu>
>>>> Signed-off-by: Li Qiang <liq3ea@163.com>
>>>> ---
>>>> Change since v2:
>>>> 1. Add comments for the tx bh schdule when VM resumes
>>>> 2. Leave the set ics code in 'e1000e_start_xmit'
>>>> 3. Cancel the tx bh and reset tx_waiting in e1000e_core_reset
>>>
>>> So based on our discussion this is probably not sufficient. It solves
>>> the issue TX re-entrancy but not RX (e.g RX DMA to RDT?) Or is e1000e's
>>> RX is reentrant?
>>>
>> It seems the RX has no reentrant issue if we address the TX reentrant 
>> issue.
>
>
> Just to make sure I understand.
>
> Did you mean
>
> 1) RX code is reentrant
>
> or
>
> 2) the following can't happen
>
> e1000_receive()
>     e1000e_write_packet_to_guest()
>         e1000e_write_to_rx_buffers()
>             pci_dma_write()
>                 ...
>                     e1000e_set_rdt()
>                         qemu_flush_queued_packets()
>                             e1000e_receive()
>
> ?


Ok, I think we can simply prevent the RX re-entrancy by checking
queue->delivering in qemu_net_queue_flush(), as has been done for TX.

And for e1000e, we can simply detect the reentrancy and return early.
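
A rough sketch of that idea (illustrative only, not an actual patch;
qemu_net_queue_flush() and the NetQueue 'delivering' field exist, while
the e1000e guard flag below is an assumption):

    bool qemu_net_queue_flush(NetQueue *queue)
    {
        if (queue->delivering) {
            /* already delivering a packet: a re-entrant flush would
             * recurse into the receive path, so bail out early */
            return false;
        }
        ...
    }

    /* and in the e1000e RX path, something along these lines: */
    if (core->rx_in_progress) {      /* hypothetical guard flag */
        return;                      /* drop the re-entrant delivery */
    }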

Let me cook patches.

Thanks





^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH v3] e1000e: using bottom half to send packets
  2020-07-22  5:49     ` Jason Wang
  2020-07-22  8:31       ` Jason Wang
@ 2020-07-22 10:45       ` Li Qiang
  1 sibling, 0 replies; 7+ messages in thread
From: Li Qiang @ 2020-07-22 10:45 UTC (permalink / raw)
  To: Jason Wang
  Cc: Dmitry Fleytman, Li Qiang, Qemu Developers, P J P,
	Alexander Bulekov, Paolo Bonzini

Jason Wang <jasowang@redhat.com> wrote on Wednesday, July 22, 2020 at 1:49 PM:
>
>
> On 2020/7/22 下午12:47, Li Qiang wrote:
> > Jason Wang <jasowang@redhat.com> 于2020年7月22日周三 上午11:32写道:
> >>
> >> On 2020/7/21 下午11:17, Li Qiang wrote:
> >>> Alexander Bulekov reported a UAF bug related e1000e packets send.
> >>>
> >>> -->https://bugs.launchpad.net/qemu/+bug/1886362
> >>>
> >>> This is because the guest trigger a e1000e packet send and set the
> >>> data's address to e1000e's MMIO address. So when the e1000e do DMA
> >>> it will write the MMIO again and trigger re-entrancy and finally
> >>> causes this UAF.
> >>>
> >>> Paolo suggested to use a bottom half whenever MMIO is doing complicate
> >>> things in here:
> >>> -->https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg03342.html
> >>>
> >>> Reference here:
> >>> 'The easiest solution is to delay processing of descriptors to a bottom
> >>> half whenever MMIO is doing something complicated.  This is also better
> >>> for latency because it will free the vCPU thread more quickly and leave
> >>> the work to the I/O thread.'
> >>>
> >>> This patch fixes this UAF.
> >>>
> >>> Reported-by: Alexander Bulekov <alxndr@bu.edu>
> >>> Signed-off-by: Li Qiang <liq3ea@163.com>
> >>> ---
> >>> Change since v2:
> >>> 1. Add comments for the tx bh schdule when VM resumes
> >>> 2. Leave the set ics code in 'e1000e_start_xmit'
> >>> 3. Cancel the tx bh and reset tx_waiting in e1000e_core_reset
> >>
> >> So based on our discussion this is probably not sufficient. It solves
> >> the TX re-entrancy issue but not RX (e.g. RX DMA to RDT?). Or is e1000e's
> >> RX re-entrant?
> >>
> > It seems RX has no re-entrancy issue if we address the TX re-entrancy issue.
>
>
> Just to make sure I understand.
>
> Did you mean
>
> 1) RX code is reentrant
>
> or
>
> 2) the following can't happen
>
> e1000_receive()
>      e1000e_write_packet_to_guest()
>          e1000e_write_to_rx_buffers()
>              pci_dma_write()
>                  ...
>                      e1000e_set_rdt()
>                          qemu_flush_queued_packets()
>                              e1000e_receive()
>
> ?
>

At first I didn't think such a path existed, only RX->TX->RX (loopback).
So in my last email I assumed that solving the TX re-entry issue would
also remove the RX re-entry issue, but that is not the case.
Anyway, your new patch seems more elegant.
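
To make the re-entrancy shape above concrete, here is a tiny standalone toy
(plain C, made-up names, not QEMU code). With SYNC_XMIT defined, the "device"
transmits from inside its own MMIO handler and re-enters the handler while the
outer call is still on the stack, which is the shape of the reported UAF;
without it, the handler only marks work pending and the "bottom half" runs
later at top level.

#include <stdio.h>
#include <stdbool.h>

static bool tx_pending;
static int depth;

static void mmio_write(int reg);

static void start_xmit(void)
{
    /* pretend the DMA target is the device's own MMIO window */
    mmio_write(0);
}

static void mmio_write(int reg)
{
    (void)reg;
    depth++;
    printf("mmio_write, nesting depth %d\n", depth);
    if (depth > 1) {
        printf("  -> handler re-entered while still running (UAF territory)\n");
        depth--;
        return;
    }
#ifdef SYNC_XMIT
    start_xmit();          /* old behaviour: transmit inside the handler */
#else
    tx_pending = true;     /* new behaviour: only mark the work as pending */
#endif
    depth--;
}

int main(void)
{
    mmio_write(0);         /* guest write that kicks transmit, e.g. TDT */
    if (tx_pending) {      /* drained later from the main loop, like a QEMUBH */
        tx_pending = false;
        start_xmit();      /* DMA may still hit MMIO, but no handler is
                              nested above us any more */
    }
    return 0;
}

Compile it once with -DSYNC_XMIT and once without to compare the two traces.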

Thanks,
Li Qiang


>
> > Though the guest can point the RX-related DMA registers at MMIO, so that
> > received packets are written into MMIO with arbitrary data, this is no
> > different from the guest writing arbitrary data to MMIO directly.
> >
> > I think we can hold this patch and discuss the MMIO re-entrancy issue
> > further before making a decision.
>
>
> Right, and we need to start thinking about how to test this, e.g. with qtest?
>
> Thanks
>
>
> >
> > Thanks,
> > Li Qiang
> >
> >
> >> Thanks
> >>
> >>
> >>> Change since v1:
> >>> Per Jason's review here:
> >>> -- https://lists.nongnu.org/archive/html/qemu-devel/2020-07/msg05368.html
> >>> 1. Cancel and schedule the tx bh when VM is stopped or resume
> >>> 2. Add a tx_burst for e1000e configuration to throttle the bh execution
> >>> 3. Add a tx_waiting to record whether the bh is pending or not
> >>> Don't use BQL in the tx_bh handler as when tx_bh is executed, the BQL is
> >>> acquired.
> >>>
> >>>    hw/net/e1000e.c      |   6 +++
> >>>    hw/net/e1000e_core.c | 107 +++++++++++++++++++++++++++++++++++--------
> >>>    hw/net/e1000e_core.h |   8 ++++
> >>>    3 files changed, 101 insertions(+), 20 deletions(-)
> >>>
> >>> diff --git a/hw/net/e1000e.c b/hw/net/e1000e.c
> >>> index fda34518c9..24e35a78bf 100644
> >>> --- a/hw/net/e1000e.c
> >>> +++ b/hw/net/e1000e.c
> >>> @@ -77,10 +77,14 @@ typedef struct E1000EState {
> >>>
> >>>        bool disable_vnet;
> >>>
> >>> +    int32_t tx_burst;
> >>> +
> >>>        E1000ECore core;
> >>>
> >>>    } E1000EState;
> >>>
> >>> +#define TX_BURST 256
> >>> +
> >>>    #define E1000E_MMIO_IDX     0
> >>>    #define E1000E_FLASH_IDX    1
> >>>    #define E1000E_IO_IDX       2
> >>> @@ -263,6 +267,7 @@ static void e1000e_core_realize(E1000EState *s)
> >>>    {
> >>>        s->core.owner = &s->parent_obj;
> >>>        s->core.owner_nic = s->nic;
> >>> +    s->core.tx_burst = s->tx_burst;
> >>>    }
> >>>
> >>>    static void
> >>> @@ -665,6 +670,7 @@ static Property e1000e_properties[] = {
> >>>                            e1000e_prop_subsys_ven, uint16_t),
> >>>        DEFINE_PROP_SIGNED("subsys", E1000EState, subsys, 0,
> >>>                            e1000e_prop_subsys, uint16_t),
> >>> +    DEFINE_PROP_INT32("x-txburst", E1000EState, tx_burst, TX_BURST),
> >>>        DEFINE_PROP_END_OF_LIST(),
> >>>    };
> >>>
> >>> diff --git a/hw/net/e1000e_core.c b/hw/net/e1000e_core.c
> >>> index bcd186cac5..2fdfc23204 100644
> >>> --- a/hw/net/e1000e_core.c
> >>> +++ b/hw/net/e1000e_core.c
> >>> @@ -910,18 +910,17 @@ e1000e_rx_ring_init(E1000ECore *core, E1000E_RxRing *rxr, int idx)
> >>>    }
> >>>
> >>>    static void
> >>> -e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
> >>> +e1000e_start_xmit(struct e1000e_tx *q)
> >>>    {
> >>> +    E1000ECore *core = q->core;
> >>>        dma_addr_t base;
> >>>        struct e1000_tx_desc desc;
> >>> -    bool ide = false;
> >>> -    const E1000E_RingInfo *txi = txr->i;
> >>> -    uint32_t cause = E1000_ICS_TXQE;
> >>> +    const E1000E_RingInfo *txi;
> >>> +    E1000E_TxRing txr;
> >>> +    int32_t num_packets = 0;
> >>>
> >>> -    if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
> >>> -        trace_e1000e_tx_disabled();
> >>> -        return;
> >>> -    }
> >>> +    e1000e_tx_ring_init(core, &txr, q - &core->tx[0]);
> >>> +    txi = txr.i;
> >>>
> >>>        while (!e1000e_ring_empty(core, txi)) {
> >>>            base = e1000e_ring_head_descr(core, txi);
> >>> @@ -931,14 +930,24 @@ e1000e_start_xmit(E1000ECore *core, const E1000E_TxRing *txr)
> >>>            trace_e1000e_tx_descr((void *)(intptr_t)desc.buffer_addr,
> >>>                                  desc.lower.data, desc.upper.data);
> >>>
> >>> -        e1000e_process_tx_desc(core, txr->tx, &desc, txi->idx);
> >>> -        cause |= e1000e_txdesc_writeback(core, base, &desc, &ide, txi->idx);
> >>> +        e1000e_process_tx_desc(core, txr.tx, &desc, txi->idx);
> >>> +        q->cause |= e1000e_txdesc_writeback(core, base, &desc,
> >>> +                                            &q->ide, txi->idx);
> >>>
> >>>            e1000e_ring_advance(core, txi, 1);
> >>> +        if (++num_packets >= core->tx_burst) {
> >>> +            break;
> >>> +        }
> >>> +    }
> >>> +
> >>> +    if (num_packets >= core->tx_burst) {
> >>> +        qemu_bh_schedule(q->tx_bh);
> >>> +        q->tx_waiting = 1;
> >>> +        return;
> >>>        }
> >>>
> >>> -    if (!ide || !e1000e_intrmgr_delay_tx_causes(core, &cause)) {
> >>> -        e1000e_set_interrupt_cause(core, cause);
> >>> +    if (!q->ide || !e1000e_intrmgr_delay_tx_causes(core, &q->cause)) {
> >>> +        e1000e_set_interrupt_cause(core, q->cause);
> >>>        }
> >>>    }
> >>>
> >>> @@ -2423,32 +2432,41 @@ e1000e_set_dbal(E1000ECore *core, int index, uint32_t val)
> >>>    static void
> >>>    e1000e_set_tctl(E1000ECore *core, int index, uint32_t val)
> >>>    {
> >>> -    E1000E_TxRing txr;
> >>>        core->mac[index] = val;
> >>>
> >>>        if (core->mac[TARC0] & E1000_TARC_ENABLE) {
> >>> -        e1000e_tx_ring_init(core, &txr, 0);
> >>> -        e1000e_start_xmit(core, &txr);
> >>> +        if (core->tx[0].tx_waiting) {
> >>> +            return;
> >>> +        }
> >>> +        core->tx[0].tx_waiting = 1;
> >>> +        if (!core->vm_running) {
> >>> +            return;
> >>> +        }
> >>> +        qemu_bh_schedule(core->tx[0].tx_bh);
> >>>        }
> >>>
> >>>        if (core->mac[TARC1] & E1000_TARC_ENABLE) {
> >>> -        e1000e_tx_ring_init(core, &txr, 1);
> >>> -        e1000e_start_xmit(core, &txr);
> >>> +        if (core->tx[1].tx_waiting) {
> >>> +            return;
> >>> +        }
> >>> +        core->tx[1].tx_waiting = 1;
> >>> +        if (!core->vm_running) {
> >>> +            return;
> >>> +        }
> >>> +        qemu_bh_schedule(core->tx[1].tx_bh);
> >>>        }
> >>>    }
> >>>
> >>>    static void
> >>>    e1000e_set_tdt(E1000ECore *core, int index, uint32_t val)
> >>>    {
> >>> -    E1000E_TxRing txr;
> >>>        int qidx = e1000e_mq_queue_idx(TDT, index);
> >>>        uint32_t tarc_reg = (qidx == 0) ? TARC0 : TARC1;
> >>>
> >>>        core->mac[index] = val & 0xffff;
> >>>
> >>>        if (core->mac[tarc_reg] & E1000_TARC_ENABLE) {
> >>> -        e1000e_tx_ring_init(core, &txr, qidx);
> >>> -        e1000e_start_xmit(core, &txr);
> >>> +        qemu_bh_schedule(core->tx[qidx].tx_bh);
> >>>        }
> >>>    }
> >>>
> >>> @@ -3315,11 +3333,52 @@ e1000e_vm_state_change(void *opaque, int running, RunState state)
> >>>            trace_e1000e_vm_state_running();
> >>>            e1000e_intrmgr_resume(core);
> >>>            e1000e_autoneg_resume(core);
> >>> +        core->vm_running = 1;
> >>> +
> >>> +        for (int i = 0; i < E1000E_NUM_QUEUES; i++) {
> >>> +            /*
> >>> +             * Schedule tx bh unconditionally to make sure
> >>> +             * tx work after live migration since we don't
> >>> +             * migrate tx_waiting.
> >>> +             */
> >>> +            qemu_bh_schedule(core->tx[i].tx_bh);
> >>> +        }
> >>> +
> >>>        } else {
> >>>            trace_e1000e_vm_state_stopped();
> >>> +
> >>> +        for (int i = 0; i < E1000E_NUM_QUEUES; i++) {
> >>> +            qemu_bh_cancel(core->tx[i].tx_bh);
> >>> +        }
> >>> +
> >>>            e1000e_autoneg_pause(core);
> >>>            e1000e_intrmgr_pause(core);
> >>> +        core->vm_running = 0;
> >>> +    }
> >>> +}
> >>> +
> >>> +
> >>> +static void e1000e_core_tx_bh(void *opaque)
> >>> +{
> >>> +    struct e1000e_tx *q = opaque;
> >>> +    E1000ECore *core = q->core;
> >>> +
> >>> +    if (!core->vm_running) {
> >>> +        assert(q->tx_waiting);
> >>> +        return;
> >>> +    }
> >>> +
> >>> +    q->tx_waiting = 0;
> >>> +
> >>> +    if (!(core->mac[TCTL] & E1000_TCTL_EN)) {
> >>> +        trace_e1000e_tx_disabled();
> >>> +        return;
> >>>        }
> >>> +
> >>> +    q->cause = E1000_ICS_TXQE;
> >>> +    q->ide = false;
> >>> +
> >>> +    e1000e_start_xmit(q);
> >>>    }
> >>>
> >>>    void
> >>> @@ -3334,12 +3393,15 @@ e1000e_core_pci_realize(E1000ECore     *core,
> >>>                                           e1000e_autoneg_timer, core);
> >>>        e1000e_intrmgr_pci_realize(core);
> >>>
> >>> +    core->vm_running = runstate_is_running();
> >>>        core->vmstate =
> >>>            qemu_add_vm_change_state_handler(e1000e_vm_state_change, core);
> >>>
> >>>        for (i = 0; i < E1000E_NUM_QUEUES; i++) {
> >>>            net_tx_pkt_init(&core->tx[i].tx_pkt, core->owner,
> >>>                            E1000E_MAX_TX_FRAGS, core->has_vnet);
> >>> +        core->tx[i].core = core;
> >>> +        core->tx[i].tx_bh = qemu_bh_new(e1000e_core_tx_bh, &core->tx[i]);
> >>>        }
> >>>
> >>>        net_rx_pkt_init(&core->rx_pkt, core->has_vnet);
> >>> @@ -3367,6 +3429,9 @@ e1000e_core_pci_uninit(E1000ECore *core)
> >>>        for (i = 0; i < E1000E_NUM_QUEUES; i++) {
> >>>            net_tx_pkt_reset(core->tx[i].tx_pkt);
> >>>            net_tx_pkt_uninit(core->tx[i].tx_pkt);
> >>> +        qemu_bh_cancel(core->tx[i].tx_bh);
> >>> +        qemu_bh_delete(core->tx[i].tx_bh);
> >>> +        core->tx[i].tx_bh = NULL;
> >>>        }
> >>>
> >>>        net_rx_pkt_uninit(core->rx_pkt);
> >>> @@ -3480,6 +3545,8 @@ e1000e_core_reset(E1000ECore *core)
> >>>            net_tx_pkt_reset(core->tx[i].tx_pkt);
> >>>            memset(&core->tx[i].props, 0, sizeof(core->tx[i].props));
> >>>            core->tx[i].skip_cp = false;
> >>> +        qemu_bh_cancel(core->tx[i].tx_bh);
> >>> +        core->tx[i].tx_waiting = 0;
> >>>        }
> >>>    }
> >>>
> >>> diff --git a/hw/net/e1000e_core.h b/hw/net/e1000e_core.h
> >>> index aee32f7e48..0c16dce3a6 100644
> >>> --- a/hw/net/e1000e_core.h
> >>> +++ b/hw/net/e1000e_core.h
> >>> @@ -77,10 +77,18 @@ struct E1000Core {
> >>>            unsigned char sum_needed;
> >>>            bool cptse;
> >>>            struct NetTxPkt *tx_pkt;
> >>> +        QEMUBH *tx_bh;
> >>> +        uint32_t tx_waiting;
> >>> +        uint32_t cause;
> >>> +        bool ide;
> >>> +        E1000ECore *core;
> >>>        } tx[E1000E_NUM_QUEUES];
> >>>
> >>>        struct NetRxPkt *rx_pkt;
> >>>
> >>> +    int32_t tx_burst;
> >>> +
> >>> +    bool vm_running;
> >>>        bool has_vnet;
> >>>        int max_queue_num;
> >>>
>

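Picking up the qtest suggestion above, a reproducer for the original report
could look roughly like the sketch below. This is only an illustration, not
the actual fuzzer test case: the PCI slot, the chosen BAR address and the
register offsets/bits (TDBAL/TDBAH/TDLEN/TARC0/TCTL/TDT follow the usual
e1000 layout, but treat them as assumptions) would need to be checked, and
it assumes QEMU's libqtest helpers (qtest_init/qtest_outl/qtest_writel).

#include "qemu/osdep.h"
#include "libqtest.h"

/* PCI config-space write via ports 0xcf8/0xcfc (pc machine, bus 0, fn 0) */
static void pci_cfg_writel(QTestState *s, int slot, int off, uint32_t val)
{
    qtest_outl(s, 0xcf8, 0x80000000u | (slot << 11) | (off & 0xfc));
    qtest_outl(s, 0xcfc, val);
}

int main(void)
{
    const int slot = 3;                  /* assumed slot of "-device e1000e" */
    const uint64_t bar = 0xfebc0000;     /* where we choose to map BAR0 */
    QTestState *s = qtest_init("-machine pc -device e1000e");

    pci_cfg_writel(s, slot, 0x10, bar);  /* BAR0 */
    pci_cfg_writel(s, slot, 0x04, 0x6);  /* memory space + bus master */

    /*
     * Point the TX descriptor ring at the device's own MMIO window, so
     * descriptor fetch/write-back becomes DMA into MMIO, which is the
     * re-entrant path the patch defers to a bottom half.
     */
    qtest_writel(s, bar + 0x3800, bar);      /* TDBAL */
    qtest_writel(s, bar + 0x3804, 0);        /* TDBAH */
    qtest_writel(s, bar + 0x3808, 0x1000);   /* TDLEN */
    qtest_writel(s, bar + 0x3840, 1u << 10); /* TARC0.ENABLE (assumed bit) */
    qtest_writel(s, bar + 0x0400, 1u << 1);  /* TCTL.EN */
    qtest_writel(s, bar + 0x3818, 1);        /* TDT: kick transmit */

    qtest_quit(s);
    return 0;
}

With the bottom-half patch applied, the TDT write above only schedules the
tx bh; without it, the transmit (and the DMA into MMIO) happens inside the
MMIO handler itself.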

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2020-07-22 10:47 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-07-21 15:17 [PATCH v3] e1000e: using bottom half to send packets Li Qiang
2020-07-22  3:32 ` Jason Wang
2020-07-22  4:47   ` Li Qiang
2020-07-22  5:49     ` Jason Wang
2020-07-22  8:31       ` Jason Wang
2020-07-22 10:45       ` Li Qiang
2020-07-22  4:52   ` P J P
