qemu/include/sysemu/kvm_int.h
Peter Xu fde43f4b8f KVM: Dynamic sized kvm memslots array
Zhiyi reported an infinite loop issue in VFIO use case.  The cause of that
was a separate discussion, however during that I found a regression of
dirty sync slowness when profiling.

Each KVMMemoryListerner maintains an array of kvm memslots.  Currently it's
statically allocated to be the max supported by the kernel.  However after
Linux commit 4fc096a99e ("KVM: Raise the maximum number of user memslots"),
the max supported memslots reported now grows to some number large enough
so that it may not be wise to always statically allocate with the max
reported.

What's worse, QEMU kvm code still walks all the allocated memslots entries
to do any form of lookups.  It can drastically slow down all memslot
operations because each of such loop can run over 32K times on the new
kernels.

Fix this issue by making the memslots to be allocated dynamically.

Here the initial size was set to 16 because it should cover the basic VM
usages, so that the hope is the majority VM use case may not even need to
grow at all (e.g. if one starts a VM with ./qemu-system-x86_64 by default
it'll consume 9 memslots), however not too large to waste memory.

There can also be even better way to address this, but so far this is the
simplest and should be already better even than before we grow the max
supported memslots.  For example, in the case of above issue when VFIO was
attached on a 32GB system, there are only ~10 memslots used.  So it could
be good enough as of now.

In the above VFIO context, measurement shows that the precopy dirty sync
shrinked from ~86ms to ~3ms after this patch applied.  It should also apply
to any KVM enabled VM even without VFIO.

NOTE: we don't have a FIXES tag for this patch because there's no real
commit that regressed this in QEMU. Such behavior existed for a long time,
but only start to be a problem when the kernel reports very large
nr_slots_max value.  However that's pretty common now (the kernel change
was merged in 2021) so we attached cc:stable because we'll want this change
to be backported to stable branches.

Cc: qemu-stable <qemu-stable@nongnu.org>
Reported-by: Zhiyi Guo <zhguo@redhat.com>
Tested-by: Zhiyi Guo <zhguo@redhat.com>
Signed-off-by: Peter Xu <peterx@redhat.com>
Acked-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Fabiano Rosas <farosas@suse.de>
Link: https://lore.kernel.org/r/20240917163835.194664-2-peterx@redhat.com
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
(cherry picked from commit 5504a81261)
Signed-off-by: Michael Tokarev <mjt@tls.msk.ru>
(Mjt: context fixup in accel/kvm/trace-events)
2024-11-08 13:02:57 +03:00

143 lines
3.9 KiB
C

/*
* Internal definitions for a target's KVM support
*
* This work is licensed under the terms of the GNU GPL, version 2 or later.
* See the COPYING file in the top-level directory.
*
*/
#ifndef QEMU_KVM_INT_H
#define QEMU_KVM_INT_H
#include "exec/memory.h"
#include "qapi/qapi-types-common.h"
#include "qemu/accel.h"
#include "qemu/queue.h"
#include "sysemu/kvm.h"
typedef struct KVMSlot
{
hwaddr start_addr;
ram_addr_t memory_size;
void *ram;
int slot;
int flags;
int old_flags;
/* Dirty bitmap cache for the slot */
unsigned long *dirty_bmap;
unsigned long dirty_bmap_size;
/* Cache of the address space ID */
int as_id;
/* Cache of the offset in ram address space */
ram_addr_t ram_start_offset;
} KVMSlot;
typedef struct KVMMemoryUpdate {
QSIMPLEQ_ENTRY(KVMMemoryUpdate) next;
MemoryRegionSection section;
} KVMMemoryUpdate;
typedef struct KVMMemoryListener {
MemoryListener listener;
KVMSlot *slots;
unsigned int nr_used_slots;
unsigned int nr_slots_allocated;
int as_id;
QSIMPLEQ_HEAD(, KVMMemoryUpdate) transaction_add;
QSIMPLEQ_HEAD(, KVMMemoryUpdate) transaction_del;
} KVMMemoryListener;
#define KVM_MSI_HASHTAB_SIZE 256
enum KVMDirtyRingReaperState {
KVM_DIRTY_RING_REAPER_NONE = 0,
/* The reaper is sleeping */
KVM_DIRTY_RING_REAPER_WAIT,
/* The reaper is reaping for dirty pages */
KVM_DIRTY_RING_REAPER_REAPING,
};
/*
* KVM reaper instance, responsible for collecting the KVM dirty bits
* via the dirty ring.
*/
struct KVMDirtyRingReaper {
/* The reaper thread */
QemuThread reaper_thr;
volatile uint64_t reaper_iteration; /* iteration number of reaper thr */
volatile enum KVMDirtyRingReaperState reaper_state; /* reap thr state */
};
struct KVMState
{
AccelState parent_obj;
int nr_slots;
int fd;
int vmfd;
int coalesced_mmio;
int coalesced_pio;
struct kvm_coalesced_mmio_ring *coalesced_mmio_ring;
bool coalesced_flush_in_progress;
int vcpu_events;
#ifdef KVM_CAP_SET_GUEST_DEBUG
QTAILQ_HEAD(, kvm_sw_breakpoint) kvm_sw_breakpoints;
#endif
int max_nested_state_len;
int kvm_shadow_mem;
bool kernel_irqchip_allowed;
bool kernel_irqchip_required;
OnOffAuto kernel_irqchip_split;
bool sync_mmu;
uint64_t manual_dirty_log_protect;
/* The man page (and posix) say ioctl numbers are signed int, but
* they're not. Linux, glibc and *BSD all treat ioctl numbers as
* unsigned, and treating them as signed here can break things */
unsigned irq_set_ioctl;
unsigned int sigmask_len;
GHashTable *gsimap;
#ifdef KVM_CAP_IRQ_ROUTING
struct kvm_irq_routing *irq_routes;
int nr_allocated_irq_routes;
unsigned long *used_gsi_bitmap;
unsigned int gsi_count;
#endif
KVMMemoryListener memory_listener;
QLIST_HEAD(, KVMParkedVcpu) kvm_parked_vcpus;
/* For "info mtree -f" to tell if an MR is registered in KVM */
int nr_as;
struct KVMAs {
KVMMemoryListener *ml;
AddressSpace *as;
} *as;
uint64_t kvm_dirty_ring_bytes; /* Size of the per-vcpu dirty ring */
uint32_t kvm_dirty_ring_size; /* Number of dirty GFNs per ring */
bool kvm_dirty_ring_with_bitmap;
uint64_t kvm_eager_split_size; /* Eager Page Splitting chunk size */
struct KVMDirtyRingReaper reaper;
NotifyVmexitOption notify_vmexit;
uint32_t notify_window;
uint32_t xen_version;
uint32_t xen_caps;
uint16_t xen_gnttab_max_frames;
uint16_t xen_evtchn_max_pirq;
char *device;
};
void kvm_memory_listener_register(KVMState *s, KVMMemoryListener *kml,
AddressSpace *as, int as_id, const char *name);
void kvm_set_max_memslot_size(hwaddr max_slot_size);
/**
* kvm_hwpoison_page_add:
*
* Parameters:
* @ram_addr: the address in the RAM for the poisoned page
*
* Add a poisoned page to the list
*
* Return: None.
*/
void kvm_hwpoison_page_add(ram_addr_t ram_addr);
#endif