> Zhengui's theory that notify_me doesn't work properly on ARM is more
> promising, but he couldn't provide a clear explanation of why he thought
> notify_me is involved. In particular, I would have expected notify_me to
> be wrong if the qemu_poll_ns call came from aio_ctx_dispatch, for example:
>
>
> glib_pollfds_fill
> g_main_context_prepare
> aio_ctx_prepare
> atomic_or(&ctx->notify_me, 1)
> qemu_poll_ns
> glib_pollfds_poll
> g_main_context_check
> aio_ctx_check
> atomic_and(&ctx->notify_me, ~1)
> g_main_context_dispatch
> aio_ctx_dispatch
> /* do something for event */
> qemu_poll_ns
>
Paolo,

I tried confining execution to a single NUMA domain (cpu & mem) and
still faced the issue. I then added a mutex "ctx->notify_me_lcktest"
to the context to protect "ctx->notify_me", as shown below, and it
seems to have either fixed or mitigated the problem.

Before the change I was able to cause the hang once every 3 or 4 runs;
I have now run qemu-img convert more than 30 times and couldn't
reproduce it again.

Next step is to play with the barriers and check why the existing ones
aren't enough for ordering access to ctx->notify_me ... or should I
try/do something else, in your opinion?
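
For reference, the pairing I understand the existing barriers are meant
to enforce is sketched below. This is illustrative C11-atomics code of
my own, not QEMU code: notify_me and scheduled stand in for
ctx->notify_me and bh->scheduled, and the two helper names are made up.

#include <stdatomic.h>
#include <stdbool.h>

/* Illustrative sketch only (not QEMU code): the Dekker-style pairing
 * between the poller (aio_ctx_prepare + poll) and the notifier
 * (aio_notify), using C11 atomics in place of QEMU's atomic_*(). */
static atomic_uint notify_me;   /* stands in for ctx->notify_me */
static atomic_bool scheduled;   /* stands in for bh->scheduled  */

/* Poller: advertise that we may block, then check for pending work. */
static bool poller_may_block(void)
{
    atomic_fetch_or(&notify_me, 1);            /* "I am about to sleep" */
    atomic_thread_fence(memory_order_seq_cst); /* order store vs. load  */
    return !atomic_load(&scheduled);           /* block only if idle    */
}

/* Notifier: publish the work, then check if the poller must be woken. */
static bool notifier_must_wake(void)
{
    atomic_store(&scheduled, true);            /* publish the event     */
    atomic_thread_fence(memory_order_seq_cst); /* order store vs. load  */
    return atomic_load(&notify_me) != 0;       /* poller may be asleep  */
}

If either of those full barriers is missing, or turns out weaker than
expected on aarch64, the poller can miss "scheduled" at the same time
the notifier misses "notify_me", and both sides go to sleep: that would
match this hang. It would also explain why the mutex helps, since
qemu_mutex_lock/unlock already impose ordering around every notify_me
access.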
This is the arch/machine (Huawei D06):
$ lscpu
Architecture: aarch64
Byte Order: Little Endian
CPU(s): 96
On-line CPU(s) list: 0-95
Thread(s) per core: 1
Core(s) per socket: 48
Socket(s): 2
NUMA node(s): 4
Vendor ID: 0x48
Model: 0
Stepping: 0x0
CPU max MHz: 2000.0000
CPU min MHz: 200.0000
BogoMIPS: 200.00
L1d cache: 64K
L1i cache: 64K
L2 cache: 512K
L3 cache: 32768K
NUMA node0 CPU(s): 0-23
NUMA node1 CPU(s): 24-47
NUMA node2 CPU(s): 48-71
NUMA node3 CPU(s): 72-95
Flags: fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics
cpuid asimdrdm dcpop
----
diff --git a/include/block/aio.h b/include/block/aio.h
index 0ca25dfec6..0724086d91 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -84,6 +84,7 @@ struct AioContext {
      * dispatch phase, hence a simple counter is enough for them.
      */
     uint32_t notify_me;
+    QemuMutex notify_me_lcktest;
 
     /* A lock to protect between QEMUBH and AioHandler adders and deleter,
      * and to ensure that no callbacks are removed while we're walking and
diff --git a/util/aio-posix.c b/util/aio-posix.c
index 51c41ed3c9..031d6e2997 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -529,7 +529,9 @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
     bool progress;
     int64_t start_time, elapsed_time;
 
+    qemu_mutex_lock(&ctx->notify_me_lcktest);
     assert(ctx->notify_me);
+    qemu_mutex_unlock(&ctx->notify_me_lcktest);
     assert(qemu_lockcnt_count(&ctx->list_lock) > 0);
 
     trace_run_poll_handlers_begin(ctx, max_ns, *timeout);
@@ -601,8 +603,10 @@ bool aio_poll(AioContext *ctx, bool blocking)
      * so disable the optimization now.
      */
     if (blocking) {
+        qemu_mutex_lock(&ctx->notify_me_lcktest);
         assert(in_aio_context_home_thread(ctx));
         atomic_add(&ctx->notify_me, 2);
+        qemu_mutex_unlock(&ctx->notify_me_lcktest);
     }
 
     qemu_lockcnt_inc(&ctx->list_lock);
@@ -647,8 +651,10 @@ bool aio_poll(AioContext *ctx, bool blocking)
     }
 
     if (blocking) {
+        qemu_mutex_lock(&ctx->notify_me_lcktest);
         atomic_sub(&ctx->notify_me, 2);
         aio_notify_accept(ctx);
+        qemu_mutex_unlock(&ctx->notify_me_lcktest);
     }
 
     /* Adjust polling time */
diff --git a/util/async.c b/util/async.c
index c10642a385..140e1e86f5 100644
--- a/util/async.c
+++ b/util/async.c
@@ -221,7 +221,9 @@ aio_ctx_prepare(GSource *source, gint *timeout)
 {
     AioContext *ctx = (AioContext *) source;
 
+    qemu_mutex_lock(&ctx->notify_me_lcktest);
     atomic_or(&ctx->notify_me, 1);
+    qemu_mutex_unlock(&ctx->notify_me_lcktest);
 
     /* We assume there is no timeout already supplied */
     *timeout = qemu_timeout_ns_to_ms(aio_compute_timeout(ctx));
@@ -239,8 +241,10 @@ aio_ctx_check(GSource *source)
     AioContext *ctx = (AioContext *) source;
     QEMUBH *bh;
 
+    qemu_mutex_lock(&ctx->notify_me_lcktest);
     atomic_and(&ctx->notify_me, ~1);
     aio_notify_accept(ctx);
+    qemu_mutex_unlock(&ctx->notify_me_lcktest);
 
     for (bh = ctx->first_bh; bh; bh = bh->next) {
@@ -346,11 +350,13 @@ void aio_notify(AioContext *ctx)
     /* Write e.g. bh->scheduled before reading ctx->notify_me.  Pairs
      * with atomic_or in aio_ctx_prepare or atomic_add in aio_poll.
      */
-    smp_mb();
+    //smp_mb();
+    qemu_mutex_lock(&ctx->notify_me_lcktest);
     if (ctx->notify_me) {
         event_notifier_set(&ctx->notifier);
         atomic_mb_set(&ctx->notified, true);
     }
+    qemu_mutex_unlock(&ctx->notify_me_lcktest);
 }
 
 void aio_notify_accept(AioContext *ctx)
@@ -424,6 +430,8 @@ AioContext *aio_context_new(Error **errp)
     ctx->co_schedule_bh = aio_bh_new(ctx, co_schedule_bh_cb, ctx);
     QSLIST_INIT(&ctx->scheduled_coroutines);
 
+    qemu_rec_mutex_init(&ctx->notify_me_lcktest);
+
     aio_set_event_notifier(ctx, &ctx->notifier,
                            false,
                            (EventNotifierHandler *)