qemu/util/qemu-thread-posix.c
Akihiko Odaki d1895f4c17 qemu-thread: Avoid futex abstraction for non-Linux
qemu-thread used to abstract pthread primitives into futex for the
QemuEvent implementation of POSIX systems other than Linux. However,
this abstraction has one key difference: unlike futex, pthread
primitives require an explicit destruction, and it must be ordered after
wait and wake operations.

It would be easier to perform destruction if a wait operation ensures
the corresponding wake operation finishes as POSIX semaphore does, but
that requires to protect state accesses in qemu_event_set() and
qemu_event_wait() with a mutex. On the other hand, real futex does not
need such a protection but needs complex barrier and atomic operations
to ensure ordering between the two functions.

Add special implementations of qemu_event_set() and qemu_event_wait()
using pthread primitives. qemu_event_wait() will ensure qemu_event_set()
finishes, and these functions will avoid complex barrier and atomic
operations to ensure ordering between them.

Signed-off-by: Akihiko Odaki <akihiko.odaki@daynix.com>
Tested-by: Phil Dennis-Jordan <phil@philjordan.eu>
Reviewed-by: Phil Dennis-Jordan <phil@philjordan.eu>
Link: https://lore.kernel.org/r/20250526-event-v4-5-5b784cc8e1de@daynix.com
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
2025-06-06 14:32:55 +02:00

708 lines
18 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* Wrappers around mutex/cond/thread functions
*
* Copyright Red Hat, Inc. 2009
*
* Author:
* Marcelo Tosatti <mtosatti@redhat.com>
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*
*/
#include "qemu/osdep.h"
#include "qemu/thread.h"
#include "qemu/atomic.h"
#include "qemu/notify.h"
#include "qemu-thread-common.h"
#include "qemu/tsan.h"
#include "qemu/bitmap.h"
#ifdef CONFIG_PTHREAD_SET_NAME_NP
#include <pthread_np.h>
#endif
static bool name_threads;
void qemu_thread_naming(bool enable)
{
name_threads = enable;
#if !defined CONFIG_PTHREAD_SETNAME_NP_W_TID && \
!defined CONFIG_PTHREAD_SETNAME_NP_WO_TID && \
!defined CONFIG_PTHREAD_SET_NAME_NP
/* This is a debugging option, not fatal */
if (enable) {
fprintf(stderr, "qemu: thread naming not supported on this host\n");
}
#endif
}
static void error_exit(int err, const char *msg)
{
fprintf(stderr, "qemu: %s: %s\n", msg, strerror(err));
abort();
}
static inline clockid_t qemu_timedwait_clockid(void)
{
#ifdef CONFIG_PTHREAD_CONDATTR_SETCLOCK
return CLOCK_MONOTONIC;
#else
return CLOCK_REALTIME;
#endif
}
static void compute_abs_deadline(struct timespec *ts, int ms)
{
clock_gettime(qemu_timedwait_clockid(), ts);
ts->tv_nsec += (ms % 1000) * 1000000;
ts->tv_sec += ms / 1000;
if (ts->tv_nsec >= 1000000000) {
ts->tv_sec++;
ts->tv_nsec -= 1000000000;
}
}
void qemu_mutex_init(QemuMutex *mutex)
{
int err;
err = pthread_mutex_init(&mutex->lock, NULL);
if (err)
error_exit(err, __func__);
qemu_mutex_post_init(mutex);
}
void qemu_mutex_destroy(QemuMutex *mutex)
{
int err;
assert(mutex->initialized);
mutex->initialized = false;
err = pthread_mutex_destroy(&mutex->lock);
if (err)
error_exit(err, __func__);
}
void qemu_mutex_lock_impl(QemuMutex *mutex, const char *file, const int line)
{
int err;
assert(mutex->initialized);
qemu_mutex_pre_lock(mutex, file, line);
err = pthread_mutex_lock(&mutex->lock);
if (err)
error_exit(err, __func__);
qemu_mutex_post_lock(mutex, file, line);
}
int qemu_mutex_trylock_impl(QemuMutex *mutex, const char *file, const int line)
{
int err;
assert(mutex->initialized);
err = pthread_mutex_trylock(&mutex->lock);
if (err == 0) {
qemu_mutex_post_lock(mutex, file, line);
return 0;
}
if (err != EBUSY) {
error_exit(err, __func__);
}
return -EBUSY;
}
void qemu_mutex_unlock_impl(QemuMutex *mutex, const char *file, const int line)
{
int err;
assert(mutex->initialized);
qemu_mutex_pre_unlock(mutex, file, line);
err = pthread_mutex_unlock(&mutex->lock);
if (err)
error_exit(err, __func__);
}
void qemu_rec_mutex_init(QemuRecMutex *mutex)
{
int err;
pthread_mutexattr_t attr;
pthread_mutexattr_init(&attr);
pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
err = pthread_mutex_init(&mutex->m.lock, &attr);
pthread_mutexattr_destroy(&attr);
if (err) {
error_exit(err, __func__);
}
mutex->m.initialized = true;
}
void qemu_rec_mutex_destroy(QemuRecMutex *mutex)
{
qemu_mutex_destroy(&mutex->m);
}
void qemu_rec_mutex_lock_impl(QemuRecMutex *mutex, const char *file, int line)
{
qemu_mutex_lock_impl(&mutex->m, file, line);
}
int qemu_rec_mutex_trylock_impl(QemuRecMutex *mutex, const char *file, int line)
{
return qemu_mutex_trylock_impl(&mutex->m, file, line);
}
void qemu_rec_mutex_unlock_impl(QemuRecMutex *mutex, const char *file, int line)
{
qemu_mutex_unlock_impl(&mutex->m, file, line);
}
void qemu_cond_init(QemuCond *cond)
{
pthread_condattr_t attr;
int err;
err = pthread_condattr_init(&attr);
if (err) {
error_exit(err, __func__);
}
#ifdef CONFIG_PTHREAD_CONDATTR_SETCLOCK
err = pthread_condattr_setclock(&attr, qemu_timedwait_clockid());
if (err) {
error_exit(err, __func__);
}
#endif
err = pthread_cond_init(&cond->cond, &attr);
if (err) {
error_exit(err, __func__);
}
err = pthread_condattr_destroy(&attr);
if (err) {
error_exit(err, __func__);
}
cond->initialized = true;
}
void qemu_cond_destroy(QemuCond *cond)
{
int err;
assert(cond->initialized);
cond->initialized = false;
err = pthread_cond_destroy(&cond->cond);
if (err)
error_exit(err, __func__);
}
void qemu_cond_signal(QemuCond *cond)
{
int err;
assert(cond->initialized);
err = pthread_cond_signal(&cond->cond);
if (err)
error_exit(err, __func__);
}
void qemu_cond_broadcast(QemuCond *cond)
{
int err;
assert(cond->initialized);
err = pthread_cond_broadcast(&cond->cond);
if (err)
error_exit(err, __func__);
}
void qemu_cond_wait_impl(QemuCond *cond, QemuMutex *mutex, const char *file, const int line)
{
int err;
assert(cond->initialized);
qemu_mutex_pre_unlock(mutex, file, line);
err = pthread_cond_wait(&cond->cond, &mutex->lock);
qemu_mutex_post_lock(mutex, file, line);
if (err)
error_exit(err, __func__);
}
static bool TSA_NO_TSA
qemu_cond_timedwait_ts(QemuCond *cond, QemuMutex *mutex, struct timespec *ts,
const char *file, const int line)
{
int err;
assert(cond->initialized);
trace_qemu_mutex_unlock(mutex, file, line);
err = pthread_cond_timedwait(&cond->cond, &mutex->lock, ts);
trace_qemu_mutex_locked(mutex, file, line);
if (err && err != ETIMEDOUT) {
error_exit(err, __func__);
}
return err != ETIMEDOUT;
}
bool qemu_cond_timedwait_impl(QemuCond *cond, QemuMutex *mutex, int ms,
const char *file, const int line)
{
struct timespec ts;
compute_abs_deadline(&ts, ms);
return qemu_cond_timedwait_ts(cond, mutex, &ts, file, line);
}
void qemu_sem_init(QemuSemaphore *sem, int init)
{
qemu_mutex_init(&sem->mutex);
qemu_cond_init(&sem->cond);
if (init < 0) {
error_exit(EINVAL, __func__);
}
sem->count = init;
}
void qemu_sem_destroy(QemuSemaphore *sem)
{
qemu_cond_destroy(&sem->cond);
qemu_mutex_destroy(&sem->mutex);
}
void qemu_sem_post(QemuSemaphore *sem)
{
qemu_mutex_lock(&sem->mutex);
if (sem->count == UINT_MAX) {
error_exit(EINVAL, __func__);
} else {
sem->count++;
qemu_cond_signal(&sem->cond);
}
qemu_mutex_unlock(&sem->mutex);
}
int qemu_sem_timedwait(QemuSemaphore *sem, int ms)
{
bool rc = true;
struct timespec ts;
compute_abs_deadline(&ts, ms);
qemu_mutex_lock(&sem->mutex);
while (sem->count == 0) {
if (ms == 0) {
rc = false;
} else {
rc = qemu_cond_timedwait_ts(&sem->cond, &sem->mutex, &ts,
__FILE__, __LINE__);
}
if (!rc) { /* timeout */
break;
}
}
if (rc) {
--sem->count;
}
qemu_mutex_unlock(&sem->mutex);
return (rc ? 0 : -1);
}
void qemu_sem_wait(QemuSemaphore *sem)
{
qemu_mutex_lock(&sem->mutex);
while (sem->count == 0) {
qemu_cond_wait(&sem->cond, &sem->mutex);
}
--sem->count;
qemu_mutex_unlock(&sem->mutex);
}
#ifdef CONFIG_LINUX
#include "qemu/futex.h"
#endif
/* Valid transitions:
* - FREE -> SET (qemu_event_set)
* - BUSY -> SET (qemu_event_set)
* - SET -> FREE (qemu_event_reset)
* - FREE -> BUSY (qemu_event_wait)
*
* With futex, the waking and blocking operations follow
* BUSY -> SET and FREE -> BUSY, respectively.
*
* Without futex, BUSY -> SET and FREE -> BUSY never happen. Instead, the waking
* operation follows FREE -> SET and the blocking operation will happen in
* qemu_event_wait() if the event is not SET.
*
* SET->BUSY does not happen (it can be observed from the outside but
* it really is SET->FREE->BUSY).
*
* busy->free provably cannot happen; to enforce it, the set->free transition
* is done with an OR, which becomes a no-op if the event has concurrently
* transitioned to free or busy.
*/
#define EV_SET 0
#define EV_FREE 1
#define EV_BUSY -1
void qemu_event_init(QemuEvent *ev, bool init)
{
#ifndef CONFIG_LINUX
pthread_mutex_init(&ev->lock, NULL);
pthread_cond_init(&ev->cond, NULL);
#endif
ev->value = (init ? EV_SET : EV_FREE);
ev->initialized = true;
}
void qemu_event_destroy(QemuEvent *ev)
{
assert(ev->initialized);
ev->initialized = false;
#ifndef CONFIG_LINUX
pthread_mutex_destroy(&ev->lock);
pthread_cond_destroy(&ev->cond);
#endif
}
void qemu_event_set(QemuEvent *ev)
{
assert(ev->initialized);
#ifdef CONFIG_LINUX
/*
* Pairs with both qemu_event_reset() and qemu_event_wait().
*
* qemu_event_set has release semantics, but because it *loads*
* ev->value we need a full memory barrier here.
*/
smp_mb();
if (qatomic_read(&ev->value) != EV_SET) {
int old = qatomic_xchg(&ev->value, EV_SET);
/* Pairs with memory barrier in kernel futex_wait system call. */
smp_mb__after_rmw();
if (old == EV_BUSY) {
/* There were waiters, wake them up. */
qemu_futex_wake_all(ev);
}
}
#else
pthread_mutex_lock(&ev->lock);
/* Pairs with qemu_event_reset()'s load acquire. */
qatomic_store_release(&ev->value, EV_SET);
pthread_cond_broadcast(&ev->cond);
pthread_mutex_unlock(&ev->lock);
#endif
}
void qemu_event_reset(QemuEvent *ev)
{
assert(ev->initialized);
#ifdef CONFIG_LINUX
/*
* If there was a concurrent reset (or even reset+wait),
* do nothing. Otherwise change EV_SET->EV_FREE.
*/
qatomic_or(&ev->value, EV_FREE);
/*
* Order reset before checking the condition in the caller.
* Pairs with the first memory barrier in qemu_event_set().
*/
smp_mb__after_rmw();
#else
/*
* If futexes are not available, there are no EV_FREE->EV_BUSY
* transitions because wakeups are done entirely through the
* condition variable. Since qatomic_set() only writes EV_FREE,
* the load seems useless but in reality, the acquire synchronizes
* with qemu_event_set()'s store release: if qemu_event_reset()
* sees EV_SET here, then the caller will certainly see a
* successful condition and skip qemu_event_wait():
*
* done = 1; if (done == 0)
* qemu_event_set() { qemu_event_reset() {
* lock();
* ev->value = EV_SET -----> load ev->value
* ev->value = old value | EV_FREE
* cond_broadcast()
* unlock(); }
* } if (done == 0)
* // qemu_event_wait() not called
*/
qatomic_set(&ev->value, qatomic_load_acquire(&ev->value) | EV_FREE);
#endif
}
void qemu_event_wait(QemuEvent *ev)
{
assert(ev->initialized);
#ifdef CONFIG_LINUX
while (true) {
/*
* qemu_event_wait must synchronize with qemu_event_set even if it does
* not go down the slow path, so this load-acquire is needed that
* synchronizes with the first memory barrier in qemu_event_set().
*/
unsigned value = qatomic_load_acquire(&ev->value);
if (value == EV_SET) {
break;
}
if (value == EV_FREE) {
/*
* Leave the event reset and tell qemu_event_set that there are
* waiters. No need to retry, because there cannot be a concurrent
* busy->free transition. After the CAS, the event will be either
* set or busy.
*
* This cmpxchg doesn't have particular ordering requirements if it
* succeeds (moving the store earlier can only cause qemu_event_set()
* to issue _more_ wakeups), the failing case needs acquire semantics
* like the load above.
*/
if (qatomic_cmpxchg(&ev->value, EV_FREE, EV_BUSY) == EV_SET) {
break;
}
}
/*
* This is the final check for a concurrent set, so it does need
* a smp_mb() pairing with the second barrier of qemu_event_set().
* The barrier is inside the FUTEX_WAIT system call.
*/
qemu_futex_wait(ev, EV_BUSY);
}
#else
pthread_mutex_lock(&ev->lock);
while (qatomic_read(&ev->value) != EV_SET) {
pthread_cond_wait(&ev->cond, &ev->lock);
}
pthread_mutex_unlock(&ev->lock);
#endif
}
static __thread NotifierList thread_exit;
/*
* Note that in this implementation you can register a thread-exit
* notifier for the main thread, but it will never be called.
* This is OK because main thread exit can only happen when the
* entire process is exiting, and the API allows notifiers to not
* be called on process exit.
*/
void qemu_thread_atexit_add(Notifier *notifier)
{
notifier_list_add(&thread_exit, notifier);
}
void qemu_thread_atexit_remove(Notifier *notifier)
{
notifier_remove(notifier);
}
static void qemu_thread_atexit_notify(void *arg)
{
/*
* Called when non-main thread exits (via qemu_thread_exit()
* or by returning from its start routine.)
*/
notifier_list_notify(&thread_exit, NULL);
}
typedef struct {
void *(*start_routine)(void *);
void *arg;
char *name;
} QemuThreadArgs;
static void *qemu_thread_start(void *args)
{
QemuThreadArgs *qemu_thread_args = args;
void *(*start_routine)(void *) = qemu_thread_args->start_routine;
void *arg = qemu_thread_args->arg;
void *r;
/* Attempt to set the threads name; note that this is for debug, so
* we're not going to fail if we can't set it.
*/
if (name_threads && qemu_thread_args->name) {
# if defined(CONFIG_PTHREAD_SETNAME_NP_W_TID)
pthread_setname_np(pthread_self(), qemu_thread_args->name);
# elif defined(CONFIG_PTHREAD_SETNAME_NP_WO_TID)
pthread_setname_np(qemu_thread_args->name);
# elif defined(CONFIG_PTHREAD_SET_NAME_NP)
pthread_set_name_np(pthread_self(), qemu_thread_args->name);
# endif
}
QEMU_TSAN_ANNOTATE_THREAD_NAME(qemu_thread_args->name);
g_free(qemu_thread_args->name);
g_free(qemu_thread_args);
/*
* GCC 11 with glibc 2.17 on PowerPC reports
*
* qemu-thread-posix.c:540:5: error: __sigsetjmp accessing 656 bytes
* in a region of size 528 [-Werror=stringop-overflow=]
* 540 | pthread_cleanup_push(qemu_thread_atexit_notify, NULL);
* | ^~~~~~~~~~~~~~~~~~~~
*
* which is clearly nonsense.
*/
#pragma GCC diagnostic push
#ifndef __clang__
#pragma GCC diagnostic ignored "-Wstringop-overflow"
#endif
pthread_cleanup_push(qemu_thread_atexit_notify, NULL);
r = start_routine(arg);
pthread_cleanup_pop(1);
#pragma GCC diagnostic pop
return r;
}
void qemu_thread_create(QemuThread *thread, const char *name,
void *(*start_routine)(void*),
void *arg, int mode)
{
sigset_t set, oldset;
int err;
pthread_attr_t attr;
QemuThreadArgs *qemu_thread_args;
err = pthread_attr_init(&attr);
if (err) {
error_exit(err, __func__);
}
if (mode == QEMU_THREAD_DETACHED) {
pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
}
/* Leave signal handling to the iothread. */
sigfillset(&set);
/* Blocking the signals can result in undefined behaviour. */
sigdelset(&set, SIGSEGV);
sigdelset(&set, SIGFPE);
sigdelset(&set, SIGILL);
/* TODO avoid SIGBUS loss on macOS */
pthread_sigmask(SIG_SETMASK, &set, &oldset);
qemu_thread_args = g_new0(QemuThreadArgs, 1);
qemu_thread_args->name = g_strdup(name);
qemu_thread_args->start_routine = start_routine;
qemu_thread_args->arg = arg;
err = pthread_create(&thread->thread, &attr,
qemu_thread_start, qemu_thread_args);
if (err)
error_exit(err, __func__);
pthread_sigmask(SIG_SETMASK, &oldset, NULL);
pthread_attr_destroy(&attr);
}
int qemu_thread_set_affinity(QemuThread *thread, unsigned long *host_cpus,
unsigned long nbits)
{
#if defined(CONFIG_PTHREAD_AFFINITY_NP)
const size_t setsize = CPU_ALLOC_SIZE(nbits);
unsigned long value;
cpu_set_t *cpuset;
int err;
cpuset = CPU_ALLOC(nbits);
g_assert(cpuset);
CPU_ZERO_S(setsize, cpuset);
value = find_first_bit(host_cpus, nbits);
while (value < nbits) {
CPU_SET_S(value, setsize, cpuset);
value = find_next_bit(host_cpus, nbits, value + 1);
}
err = pthread_setaffinity_np(thread->thread, setsize, cpuset);
CPU_FREE(cpuset);
return err;
#else
return -ENOSYS;
#endif
}
int qemu_thread_get_affinity(QemuThread *thread, unsigned long **host_cpus,
unsigned long *nbits)
{
#if defined(CONFIG_PTHREAD_AFFINITY_NP)
unsigned long tmpbits;
cpu_set_t *cpuset;
size_t setsize;
int i, err;
tmpbits = CPU_SETSIZE;
while (true) {
setsize = CPU_ALLOC_SIZE(tmpbits);
cpuset = CPU_ALLOC(tmpbits);
g_assert(cpuset);
err = pthread_getaffinity_np(thread->thread, setsize, cpuset);
if (err) {
CPU_FREE(cpuset);
if (err != -EINVAL) {
return err;
}
tmpbits *= 2;
} else {
break;
}
}
/* Convert the result into a proper bitmap. */
*nbits = tmpbits;
*host_cpus = bitmap_new(tmpbits);
for (i = 0; i < tmpbits; i++) {
if (CPU_ISSET(i, cpuset)) {
set_bit(i, *host_cpus);
}
}
CPU_FREE(cpuset);
return 0;
#else
return -ENOSYS;
#endif
}
void qemu_thread_get_self(QemuThread *thread)
{
thread->thread = pthread_self();
}
bool qemu_thread_is_self(QemuThread *thread)
{
return pthread_equal(pthread_self(), thread->thread);
}
void qemu_thread_exit(void *retval)
{
pthread_exit(retval);
}
void *qemu_thread_join(QemuThread *thread)
{
int err;
void *ret;
err = pthread_join(thread->thread, &ret);
if (err) {
error_exit(err, __func__);
}
return ret;
}