On Wed, Sep 11, 2019 at 04:09:25PM -0300, Rafael David Tinoco wrote:
> > Zhengui's theory that notify_me doesn't work properly on ARM is more
> > promising, but he couldn't provide a clear explanation of why he thought
> > notify_me is involved. In particular, I would have expected notify_me to
> > be wrong if the qemu_poll_ns call came from aio_ctx_dispatch, for example:
> >
> >
> >   glib_pollfds_fill
> >     g_main_context_prepare
> >       aio_ctx_prepare
> >         atomic_or(&ctx->notify_me, 1)
> >   qemu_poll_ns
> >   glib_pollfds_poll
> >     g_main_context_check
> >       aio_ctx_check
> >         atomic_and(&ctx->notify_me, ~1)
> >     g_main_context_dispatch
> >       aio_ctx_dispatch
> >         /* do something for event */
> >         qemu_poll_ns
> >
>
> Paolo,
>
> I tried confining execution to a single NUMA domain (cpu & mem) and
> still faced the issue. I then added a mutex "ctx->notify_me_lcktest"
> to the context to protect "ctx->notify_me", as shown below, and it
> seems to have either fixed or mitigated the problem.
>
> I was able to cause the hang once every 3 or 4 runs. I have now run
> qemu-img convert more than 30 times and couldn't reproduce it again.
>
> The next step is to play with the barriers and figure out why the
> existing ones aren't enough to order access to ctx->notify_me ... or
> should I try something else, in your opinion?
>
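> To make the ordering question concrete, here is the Dekker-style
> pairing I understand the code relies on (a simplified sketch using
> QEMU's atomic/barrier primitives; "work_pending" is a stand-in for
> the real dispatch checks, not an actual function):
>
>     /* poller side: aio_ctx_prepare() / aio_poll() */
>     atomic_or(&ctx->notify_me, 1);      /* "I am about to sleep" */
>     /* a full barrier is needed here so the store above is ordered
>      * before the load below */
>     if (!work_pending(ctx)) {
>         qemu_poll_ns(...);              /* sleep until notified */
>     }
>
>     /* notifier side: aio_notify() */
>     bh->scheduled = 1;                  /* publish the work */
>     smp_mb();                           /* pairs with the barrier above */
>     if (atomic_read(&ctx->notify_me)) {
>         event_notifier_set(&ctx->notifier);   /* wake the poller */
>     }
>
> If either side's store can be observed after its following load, the
> poller can go to sleep just as the notifier concludes no wakeup is
> needed, and the loop hangs. That would also be consistent with the
> mutex helping: the lock/unlock pairs imply full barriers around every
> access to ctx->notify_me.
>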
> This arch/machine (Huawei D06):
>
> $ lscpu
> Architecture: aarch64
> Byte Order: Little Endian
> CPU(s): 96
> On-line CPU(s) list: 0-95
> Thread(s) per core: 1
> Core(s) per socket: 48
> Socket(s): 2
> NUMA node(s): 4
> Vendor ID: 0x48
> Model: 0
> Stepping: 0x0
> CPU max MHz: 2000.0000
> CPU min MHz: 200.0000
> BogoMIPS: 200.00
> L1d cache: 64K
> L1i cache: 64K
> L2 cache: 512K
> L3 cache: 32768K
> NUMA node0 CPU(s): 0-23
> NUMA node1 CPU(s): 24-47
> NUMA node2 CPU(s): 48-71
> NUMA node3 CPU(s): 72-95
> Flags: fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics cpuid asimdrdm dcpop
Note that I'm also seeing this on a ThunderX2 (same calltrace):
$ lscpu
Architecture: aarch64
Byte Order: Little Endian
CPU(s): 224
On-line CPU(s) list: 0-223
Thread(s) per core: 4
Core(s) per socket: 28
Socket(s): 2
NUMA node(s): 2
Vendor ID: Cavium
Model: 1
Model name: ThunderX2 99xx
Stepping: 0x1
BogoMIPS: 400.00
L1d cache: 32K
L1i cache: 32K
L2 cache: 256K
L3 cache: 32768K
NUMA node0 CPU(s): 0-111
NUMA node1 CPU(s): 112-223
Flags: fp asimd evtstrm aes pmull sha1 sha2 crc32 atomics cpuid asimdrdm
-dann
> ---
>
> diff --git a/include/block/aio.h b/include/block/aio.h
> index 0ca25dfec6..0724086d91 100644
> --- a/include/block/aio.h
> +++ b/include/block/aio.h
> @@ -84,6 +84,7 @@ struct AioContext {
>       * dispatch phase, hence a simple counter is enough for them.
>       */
>      uint32_t notify_me;
> +    QemuMutex notify_me_lcktest;
>
>      /* A lock to protect between QEMUBH and AioHandler adders and deleter,
>       * and to ensure that no callbacks are removed while we're walking and
> diff --git a/util/aio-posix.c b/util/aio-posix.c
> index 51c41ed3c9..031d6e2997 100644
> --- a/util/aio-posix.c
> +++ b/util/aio-posix.c
> @@ -529,7 +529,9 @@ static bool run_poll_handlers(AioContext *ctx, int64_t max_ns, int64_t *timeout)
>      bool progress;
>      int64_t start_time, elapsed_time;
>
> +    qemu_mutex_lock(&ctx->notify_me_lcktest);
>      assert(ctx->notify_me);
> +    qemu_mutex_unlock(&ctx->notify_me_lcktest);
>      assert(qemu_lockcnt_count(&ctx->list_lock) > 0);
>
>      trace_run_poll_handlers_begin(ctx, max_ns, *timeout);
> @@ -601,8 +603,10 @@ bool aio_poll(AioContext *ctx, bool blocking)
>       * so disable the optimization now.
>       */
>      if (blocking) {
> +        qemu_mutex_lock(&ctx->notify_me_lcktest);
>          assert(in_aio_context_home_thread(ctx));
>          atomic_add(&ctx->notify_me, 2);
> +        qemu_mutex_unlock(&ctx->notify_me_lcktest);
>      }
>
>      qemu_lockcnt_inc(&ctx->list_lock);
> @@ -647,8 +651,10 @@ bool aio_poll(AioContext *ctx, bool blocking)
>      }
>
>      if (blocking) {
> +        qemu_mutex_lock(&ctx->notify_me_lcktest);
>          atomic_sub(&ctx->notify_me, 2);
>          aio_notify_accept(ctx);
> +        qemu_mutex_unlock(&ctx->notify_me_lcktest);
>      }
>
>      /* Adjust polling time */
> diff --git a/util/async.c b/util/async.c
> index c10642a385..140e1e86f5 100644
> --- a/util/async.c
> +++ b/util/async.c
> @@ -221,7 +221,9 @@ aio_ctx_prepare(GSource *source, gint *timeout)
>  {
>      AioContext *ctx = (AioContext *) source;
>
> +    qemu_mutex_lock(&ctx->notify_me_lcktest);
>      atomic_or(&ctx->notify_me, 1);
> +    qemu_mutex_unlock(&ctx->notify_me_lcktest);
>
>      /* We assume there is no timeout already supplied */
>      *timeout = qemu_timeout_ns_to_ms(aio_compute_timeout(ctx));
> @@ -239,8 +241,10 @@ aio_ctx_check(GSource *source)
>      AioContext *ctx = (AioContext *) source;
>      QEMUBH *bh;
>
> +    qemu_mutex_lock(&ctx->notify_me_lcktest);
>      atomic_and(&ctx->notify_me, ~1);
>      aio_notify_accept(ctx);
> +    qemu_mutex_unlock(&ctx->notify_me_lcktest);
>
>      for (bh = ctx->first_bh; bh; bh = bh->next) {
>          if (bh->scheduled) {
> @@ -346,11 +350,13 @@ void aio_notify(AioContext *ctx)
>      /* Write e.g. bh->scheduled before reading ctx->notify_me.  Pairs
>       * with atomic_or in aio_ctx_prepare or atomic_add in aio_poll.
>       */
> -    smp_mb();
> +    //smp_mb();
> +    qemu_mutex_lock(&ctx->notify_me_lcktest);
>      if (ctx->notify_me) {
>          event_notifier_set(&ctx->notifier);
>          atomic_mb_set(&ctx->notified, true);
>      }
> +    qemu_mutex_unlock(&ctx->notify_me_lcktest);
>  }
>
>  void aio_notify_accept(AioContext *ctx)
> @@ -424,6 +430,8 @@ AioContext *aio_context_new(Error **errp)
>      ctx->co_schedule_bh = aio_bh_new(ctx, co_schedule_bh_cb, ctx);
>      QSLIST_INIT(&ctx->scheduled_coroutines);
>
> +    qemu_rec_mutex_init(&ctx->notify_me_lcktest);
> +
>      aio_set_event_notifier(ctx, &ctx->notifier,
>                             false,
>                             (EventNotifierHandler *)