On Wed, Sep 11, 2019 at 04:09:25PM -0300, Rafael David Tinoco wrote:
> > Zhengui's theory that notify_me doesn't work properly on ARM is more
> > promising, but he couldn't provide a clear explanation of why he thought
> > notify_me is involved. In particular, I would have expected notify_me to
> > be wrong if the qemu_poll_ns call came from aio_ctx_dispatch, for example:
> >
> >
> >   glib_pollfds_fill
> >     g_main_context_prepare
> >       aio_ctx_prepare
> >         atomic_or(&ctx->notify_me, 1)
> >   qemu_poll_ns
> >   glib_pollfds_poll
> >     g_main_context_check
> >       aio_ctx_check
> >         atomic_and(&ctx->notify_me, ~1)
> >     g_main_context_dispatch
> >       aio_ctx_dispatch
> >         /* do something for event */
> >         qemu_poll_ns
> >
>
> Paolo,
>
> I tried confining execution to a single NUMA domain (cpu & mem) and
> still faced the issue. I then added a mutex "ctx->notify_me_lcktest"
> to the context to protect "ctx->notify_me", as shown below, and it
> seems to have either fixed or mitigated the problem.
>
> I was able to cause the hang once every 3 or 4 runs. I have now run
> qemu-img convert more than 30 times and couldn't reproduce it again.
>
> The next step is to play with the barriers and figure out why the
> existing ones aren't enough to order access to ctx->notify_me ... or
> should I try something else, in your opinion?
>
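> To make the ordering question concrete, here is the Dekker-style
> pairing I understand the code relies on (a simplified sketch using
> QEMU's atomic/barrier primitives; "work_pending" is a stand-in for
> the real dispatch checks, not an actual function):
>
>     /* poller side: aio_ctx_prepare() / aio_poll() */
>     atomic_or(&ctx->notify_me, 1);      /* "I am about to sleep" */
>     /* a full barrier is needed here so the store above is ordered
>      * before the load below */
>     if (!work_pending(ctx)) {
>         qemu_poll_ns(...);              /* sleep until notified */
>     }
>
>     /* notifier side: aio_notify() */
>     bh->scheduled = 1;                  /* publish the work */
>     smp_mb();                           /* pairs with the barrier above */
>     if (atomic_read(&ctx->notify_me)) {
>         event_notifier_set(&ctx->notifier);   /* wake the poller */
>     }
>
> If either side's store can be observed after its following load, the
> poller can go to sleep just as the notifier concludes no wakeup is
> needed, and the loop hangs. That would also be consistent with the
> mutex helping: the lock/unlock pairs imply full barriers around every
> access to ctx->notify_me.
>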
> This arch/machine (Huawei D06):
>
> $ lscpu
> Architecture: aarch64
> Byte Order: Little Endian
> CPU(s): 96
> On-line CPU(s) list: 0-95
> Thread(s) per core: 1
> Core(s) per socket: 48
> Socket(s): 2
> NUMA node(s): 4
> Vendor ID: 0x48
> Model: 0
> Stepping: 0x0
> CPU max MHz: 2000.0000
> CPU min MHz: 200.0000
> BogoMIPS: 200.00
> L1d cache: 64K
> L1i cache: 64K
> L2 cache: 512K
> L3 cache: 32768K
> NUMA node0 CPU(s): 0-23
> NUMA node1 CPU(s): 24-47
> NUMA node2 CPU(s): 48-71
> NUMA node3 CPU(s): 72-95
> Flags: fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics cpuid asimdrdm dcpop
Note that I'm also seeing this on a ThunderX2 (same calltrace):
$ lscpu
Architecture: aarch64
Byte Order: Little Endian
CPU(s): 224
On-line CPU(s) list: 0-223
Thread(s) per core: 4
Core(s) per socket: 28
Socket(s): 2
NUMA node(s): 2
Vendor ID: Cavium
Model: 1
Model name: ThunderX2 99xx
Stepping: 0x1
BogoMIPS: 400.00
L1d cache: 32K
L1i cache: 32K
L2 cache: 256K
L3 cache: 32768K
NUMA node0 CPU(s): 0-111
NUMA node1 CPU(s): 112-223
Flags: fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics cpuid asimdrdm
-dann
> ---
>
> diff --git a/include/block/aio.h b/include/block/aio.h
> index 0ca25dfec6..0724086d91 100644
> --- a/include/block/aio.h
> +++ b/include/block/aio.h
> @@ -84,6 +84,7 @@ struct AioContext {
>       * dispatch phase, hence a simple counter is enough for them.
>       */
>      uint32_t notify_me;
> +    QemuMutex notify_me_lcktest;
>
>      /* A lock to protect between QEMUBH and AioHandler adders and deleter,
>       * and to ensure that no callbacks are removed while we're walking and
> diff --git a/util/aio-posix.c b/util/aio-posix.c
> index 51c41ed3c9..031d6e2997 100644
> --- a/util/aio-posix.c
> +++ b/util/aio-posix.c
> @@ -529,7 +529,9 @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
>      bool progress;
>      int64_t start_time, elapsed_time;
>
> +    qemu_mutex_lock(&ctx->notify_me_lcktest);
>      assert(ctx->notify_me);
> +    qemu_mutex_unlock(&ctx->notify_me_lcktest);
>      assert(qemu_lockcnt_count(&ctx->list_lock) > 0);
>
>      trace_run_poll_handlers_begin(ctx, max_ns, *timeout);
> @@ -601,8 +603,10 @@ bool aio_poll(AioContext *ctx, bool blocking)
>       * so disable the optimization now.
>       */
>      if (blocking) {
> +        qemu_mutex_lock(&ctx->notify_me_lcktest);
>          assert(in_aio_context_home_thread(ctx));
>          atomic_add(&ctx->notify_me, 2);
> +        qemu_mutex_unlock(&ctx->notify_me_lcktest);
>      }
>
>      qemu_lockcnt_inc(&ctx->list_lock);
> @@ -647,8 +651,10 @@ bool aio_poll(AioContext *ctx, bool blocking)
>      }
>
>      if (blocking) {
> +        qemu_mutex_lock(&ctx->notify_me_lcktest);
>          atomic_sub(&ctx->notify_me, 2);
>          aio_notify_accept(ctx);
> +        qemu_mutex_unlock(&ctx->notify_me_lcktest);
>      }
>
>      /* Adjust polling time */
> diff --git a/util/async.c b/util/async.c
> index c10642a385..140e1e86f5 100644
> --- a/util/async.c
> +++ b/util/async.c
> @@ -221,7 +221,9 @@ aio_ctx_prepare(GSource *source, gint *timeout)
>  {
>      AioContext *ctx = (AioContext *) source;
>
> +    qemu_mutex_lock(&ctx->notify_me_lcktest);
>      atomic_or(&ctx->notify_me, 1);
> +    qemu_mutex_unlock(&ctx->notify_me_lcktest);
>
>      /* We assume there is no timeout already supplied */
>      *timeout = qemu_timeout_ns_to_ms(aio_compute_timeout(ctx));
> @@ -239,8 +241,10 @@ aio_ctx_check(GSource *source)
>      AioContext *ctx = (AioContext *) source;
>      QEMUBH *bh;
>
> +    qemu_mutex_lock(&ctx->notify_me_lcktest);
>      atomic_and(&ctx->notify_me, ~1);
>      aio_notify_accept(ctx);
> +    qemu_mutex_unlock(&ctx->notify_me_lcktest);
>
>      for (bh = ctx->first_bh; bh; bh = bh->next) {
>          if (bh->scheduled) {
> @@ -346,11 +350,13 @@ void aio_notify(AioContext *ctx)
>      /* Write e.g. bh->scheduled before reading ctx->notify_me.  Pairs
>       * with atomic_or in aio_ctx_prepare or atomic_add in aio_poll.
>       */
> -    smp_mb();
> +    //smp_mb();
> +    qemu_mutex_lock(&ctx->notify_me_lcktest);
>      if (ctx->notify_me) {
>          event_notifier_set(&ctx->notifier);
>          atomic_mb_set(&ctx->notified, true);
>      }
> +    qemu_mutex_unlock(&ctx->notify_me_lcktest);
>  }
>
>  void aio_notify_accept(AioContext *ctx)
> @@ -424,6 +430,8 @@ AioContext *aio_context_new(Error **errp)
>      ctx->co_schedule_bh = aio_bh_new(ctx, co_schedule_bh_cb, ctx);
>      QSLIST_INIT(&ctx->scheduled_coroutines);
>
> +    qemu_rec_mutex_init(&ctx->notify_me_lcktest);
> +
>      aio_set_event_notifier(ctx, &ctx->notifier,
>                             false,
>                             (EventNotifierHandler *)