Merge tag 'vfio-updates-20230307.1' of https://gitlab.com/alex.williamson/qemu into staging

VFIO updates for 8.0

 * Device level dirty page tracking support for vfio migration, as well as
   various cleanups and consolidations. (Avihai Horon, Joao Martins)

 * Trivial cleanup of migration entry points. (Alex Williamson)

 * Fix trace event typo. (Cédric Le Goater)

# -----BEGIN PGP SIGNATURE-----
#
# iQJPBAABCAA5FiEEQvbATlQL0amee4qQI5ubbjuwiyIFAmQHgCUbHGFsZXgud2ls
# bGlhbXNvbkByZWRoYXQuY29tAAoJECObm247sIsi4i0P/RwP3TJ4jDBEW9JNa52O
# 6Hu6tWDccjSZFX7W/pnUztFtIqYBG6Jcms5VLZhaqrSda2BKa3dVoY+iU2finHRn
# q4CNQ4EVbKBG0HvA9SEd7WchAKADBCVpjeUBAF6jVQHBCQECHnfWtA2Y0T5oEGgw
# H1dwuw3YX6Jwyh5RmT/m7wNtOo2ms/CpDAc7d5rfLg0cDQ0vXPCu/CVvqAXbBpVd
# g7NrMLw1wfhKLYN2eWYkiZ+pGwNX5uxsp0jOSA7leFcfkuLX2KzQ99JpCNhX1oRd
# H5bedA62ffFLGQdlM2zyiAi37CgmeElKSlnaJdBX91Y4DQ3HSdbHYWoiYtzl89rB
# 7QxYHG7XOMdYKssN7qz+oVUpI+ycB18wSW2D/h4fJCNkH92cSHMyJ/yEA3r39eX4
# 7rgu0j8cg2iwIiGlh/klguXfatMDJvbrazDHYixKUJD5vlDXQvTe9LVpwUaUhGGM
# Gh4g8wx9gmDE9H1FbQ0kQqut70sO1Hnw2Pj19qzfdwfL6LeYWk+5AfQZmyziYGFM
# CGRKz5RhlN/Ori9gTKfn00stuxdD09Md5fPllKyMq7a1tkQt58RxLSkUN8hygeki
# Uqnlx5KXBLQ/7ZtnQNoe8frn5FhKBBSLC3tA71PyL4kIbcuiHXLvxIOeE9oJpSPi
# Bt8sTr3eCnVF9mys1ZmGmaYY
# =nM9d
# -----END PGP SIGNATURE-----
# gpg: Signature made Tue 07 Mar 2023 18:19:17 GMT
# gpg:                using RSA key 42F6C04E540BD1A99E7B8A90239B9B6E3BB08B22
# gpg:                issuer "alex.williamson@redhat.com"
# gpg: Good signature from "Alex Williamson <alex.williamson@redhat.com>" [full]
# gpg:                 aka "Alex Williamson <alex@shazbot.org>" [full]
# gpg:                 aka "Alex Williamson <alwillia@redhat.com>" [full]
# gpg:                 aka "Alex Williamson <alex.l.williamson@gmail.com>" [full]
# Primary key fingerprint: 42F6 C04E 540B D1A9 9E7B  8A90 239B 9B6E 3BB0 8B22

* tag 'vfio-updates-20230307.1' of https://gitlab.com/alex.williamson/qemu:
  vfio: Fix vfio_get_dev_region() trace event
  vfio/migration: Rename entry points
  docs/devel: Document VFIO device dirty page tracking
  vfio/migration: Query device dirty page tracking support
  vfio/migration: Block migration with vIOMMU
  vfio/common: Add device dirty page bitmap sync
  vfio/common: Extract code from vfio_get_dirty_bitmap() to new function
  vfio/common: Add device dirty page tracking start/stop
  vfio/common: Record DMA mapped IOVA ranges
  vfio/common: Add helper to consolidate iova/end calculation
  vfio/common: Consolidate skip/invalid section into helper
  vfio/common: Use a single tracepoint for skipped sections
  vfio/common: Add helper to validate iova/end against hostwin
  vfio/common: Add VFIOBitmap and alloc function
  vfio/common: Abort migration if dirty log start/stop/sync fails
  vfio/common: Fix wrong %m usages
  vfio/common: Fix error reporting in vfio_get_dirty_bitmap()

Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
commit 66a6aa8f9a
Peter Maydell, 2023-03-09 15:19:44 +00:00
6 changed files with 637 additions and 156 deletions
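
Before diving into the diff, a rough orientation on the new uAPI usage: device-level dirty tracking is driven through the VFIO_DEVICE_FEATURE ioctl with the DMA_LOGGING_START/STOP/REPORT features. The sketch below is a minimal, hypothetical userspace illustration of the probe/start pattern (the device fd, helper names, and the single tracked range are assumptions, not code from this series); the QEMU-side equivalents are vfio_dma_logging_supported() and vfio_devices_dma_logging_start() in the hunks that follow.

#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/vfio.h>

/* Hypothetical sketch: probe whether a VFIO device supports DMA logging
 * (i.e. device-level dirty page tracking). */
static bool dma_logging_supported(int device_fd)
{
    uint64_t buf[(sizeof(struct vfio_device_feature) + 7) / 8] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;

    feature->argsz = sizeof(buf);
    feature->flags = VFIO_DEVICE_FEATURE_PROBE |
                     VFIO_DEVICE_FEATURE_DMA_LOGGING_START;

    return !ioctl(device_fd, VFIO_DEVICE_FEATURE, feature);
}

/* Hypothetical sketch: start DMA logging on a single assumed IOVA range,
 * using the host page size as the logging granularity. */
static int dma_logging_start(int device_fd, uint64_t iova, uint64_t length)
{
    struct vfio_device_feature_dma_logging_range range = {
        .iova = iova,
        .length = length,
    };
    uint64_t buf[(sizeof(struct vfio_device_feature) +
                  sizeof(struct vfio_device_feature_dma_logging_control) + 7) / 8] = {};
    struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
    struct vfio_device_feature_dma_logging_control *control =
        (struct vfio_device_feature_dma_logging_control *)feature->data;

    feature->argsz = sizeof(buf);
    feature->flags = VFIO_DEVICE_FEATURE_SET |
                     VFIO_DEVICE_FEATURE_DMA_LOGGING_START;
    control->page_size = (uint64_t)sysconf(_SC_PAGESIZE);
    control->num_ranges = 1;
    control->ranges = (uintptr_t)&range;

    return ioctl(device_fd, VFIO_DEVICE_FEATURE, feature) ? -errno : 0;
}
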


@ -42,6 +42,7 @@
#include "migration/migration.h"
#include "migration/misc.h"
#include "migration/blocker.h"
#include "migration/qemu-file.h"
#include "sysemu/tpm.h"
VFIOGroupList vfio_group_list =
@ -319,6 +320,28 @@ const MemoryRegionOps vfio_region_ops = {
* Device state interfaces
*/
typedef struct {
unsigned long *bitmap;
hwaddr size;
hwaddr pages;
} VFIOBitmap;
static int vfio_bitmap_alloc(VFIOBitmap *vbmap, hwaddr size)
{
vbmap->pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size();
vbmap->size = ROUND_UP(vbmap->pages, sizeof(__u64) * BITS_PER_BYTE) /
BITS_PER_BYTE;
vbmap->bitmap = g_try_malloc0(vbmap->size);
if (!vbmap->bitmap) {
return -ENOMEM;
}
return 0;
}
static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
uint64_t size, ram_addr_t ram_addr);
bool vfio_mig_active(void)
{
VFIOGroup *group;
@ -339,6 +362,7 @@ bool vfio_mig_active(void)
}
static Error *multiple_devices_migration_blocker;
static Error *giommu_migration_blocker;
static unsigned int vfio_migratable_device_num(void)
{
@ -390,6 +414,64 @@ void vfio_unblock_multiple_devices_migration(void)
multiple_devices_migration_blocker = NULL;
}
static bool vfio_viommu_preset(void)
{
VFIOAddressSpace *space;
QLIST_FOREACH(space, &vfio_address_spaces, list) {
if (space->as != &address_space_memory) {
return true;
}
}
return false;
}
int vfio_block_giommu_migration(Error **errp)
{
int ret;
if (giommu_migration_blocker ||
!vfio_viommu_preset()) {
return 0;
}
error_setg(&giommu_migration_blocker,
"Migration is currently not supported with vIOMMU enabled");
ret = migrate_add_blocker(giommu_migration_blocker, errp);
if (ret < 0) {
error_free(giommu_migration_blocker);
giommu_migration_blocker = NULL;
}
return ret;
}
void vfio_migration_finalize(void)
{
if (!giommu_migration_blocker ||
vfio_viommu_preset()) {
return;
}
migrate_del_blocker(giommu_migration_blocker);
error_free(giommu_migration_blocker);
giommu_migration_blocker = NULL;
}
static void vfio_set_migration_error(int err)
{
MigrationState *ms = migrate_get_current();
if (migration_is_setup_or_active(ms->state)) {
WITH_QEMU_LOCK_GUARD(&ms->qemu_file_lock) {
if (ms->to_dst_file) {
qemu_file_set_error(ms->to_dst_file, err);
}
}
}
}
static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
{
VFIOGroup *group;
@ -417,6 +499,22 @@ static bool vfio_devices_all_dirty_tracking(VFIOContainer *container)
return true;
}
static bool vfio_devices_all_device_dirty_tracking(VFIOContainer *container)
{
VFIOGroup *group;
VFIODevice *vbasedev;
QLIST_FOREACH(group, &container->group_list, container_next) {
QLIST_FOREACH(vbasedev, &group->device_list, next) {
if (!vbasedev->dirty_pages_supported) {
return false;
}
}
}
return true;
}
/*
* Check if all VFIO devices are running and migration is active, which is
* essentially equivalent to the migration being in pre-copy phase.
@ -454,9 +552,14 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container,
{
struct vfio_iommu_type1_dma_unmap *unmap;
struct vfio_bitmap *bitmap;
uint64_t pages = REAL_HOST_PAGE_ALIGN(size) / qemu_real_host_page_size();
VFIOBitmap vbmap;
int ret;
ret = vfio_bitmap_alloc(&vbmap, size);
if (ret) {
return ret;
}
unmap = g_malloc0(sizeof(*unmap) + sizeof(*bitmap));
unmap->argsz = sizeof(*unmap) + sizeof(*bitmap);
@ -470,35 +573,28 @@ static int vfio_dma_unmap_bitmap(VFIOContainer *container,
* qemu_real_host_page_size to mark those dirty. Hence set bitmap_pgsize
* to qemu_real_host_page_size.
*/
bitmap->pgsize = qemu_real_host_page_size();
bitmap->size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
BITS_PER_BYTE;
bitmap->size = vbmap.size;
bitmap->data = (__u64 *)vbmap.bitmap;
if (bitmap->size > container->max_dirty_bitmap_size) {
error_report("UNMAP: Size of bitmap too big 0x%"PRIx64,
(uint64_t)bitmap->size);
if (vbmap.size > container->max_dirty_bitmap_size) {
error_report("UNMAP: Size of bitmap too big 0x%"PRIx64, vbmap.size);
ret = -E2BIG;
goto unmap_exit;
}
bitmap->data = g_try_malloc0(bitmap->size);
if (!bitmap->data) {
ret = -ENOMEM;
goto unmap_exit;
}
ret = ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, unmap);
if (!ret) {
cpu_physical_memory_set_dirty_lebitmap((unsigned long *)bitmap->data,
iotlb->translated_addr, pages);
cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap,
iotlb->translated_addr, vbmap.pages);
} else {
error_report("VFIO_UNMAP_DMA with DIRTY_BITMAP : %m");
}
g_free(bitmap->data);
unmap_exit:
g_free(unmap);
g_free(vbmap.bitmap);
return ret;
}
@ -515,10 +611,16 @@ static int vfio_dma_unmap(VFIOContainer *container,
.iova = iova,
.size = size,
};
bool need_dirty_sync = false;
int ret;
if (iotlb && container->dirty_pages_supported &&
vfio_devices_all_running_and_mig_active(container)) {
return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
if (iotlb && vfio_devices_all_running_and_mig_active(container)) {
if (!vfio_devices_all_device_dirty_tracking(container) &&
container->dirty_pages_supported) {
return vfio_dma_unmap_bitmap(container, iova, size, iotlb);
}
need_dirty_sync = true;
}
while (ioctl(container->fd, VFIO_IOMMU_UNMAP_DMA, &unmap)) {
@ -544,10 +646,12 @@ static int vfio_dma_unmap(VFIOContainer *container,
return -errno;
}
if (iotlb && vfio_devices_all_running_and_mig_active(container)) {
cpu_physical_memory_set_dirty_range(iotlb->translated_addr, size,
tcg_enabled() ? DIRTY_CLIENTS_ALL :
DIRTY_CLIENTS_NOCODE);
if (need_dirty_sync) {
ret = vfio_get_dirty_bitmap(container, iova, size,
iotlb->translated_addr);
if (ret) {
return ret;
}
}
return 0;
@ -680,6 +784,7 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
if (iotlb->target_as != &address_space_memory) {
error_report("Wrong target AS \"%s\", only system memory is allowed",
iotlb->target_as->name ? iotlb->target_as->name : "none");
vfio_set_migration_error(-EINVAL);
return;
}
@ -703,17 +808,18 @@ static void vfio_iommu_map_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
read_only);
if (ret) {
error_report("vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
"0x%"HWADDR_PRIx", %p) = %d (%m)",
"0x%"HWADDR_PRIx", %p) = %d (%s)",
container, iova,
iotlb->addr_mask + 1, vaddr, ret);
iotlb->addr_mask + 1, vaddr, ret, strerror(-ret));
}
} else {
ret = vfio_dma_unmap(container, iova, iotlb->addr_mask + 1, iotlb);
if (ret) {
error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
"0x%"HWADDR_PRIx") = %d (%m)",
"0x%"HWADDR_PRIx") = %d (%s)",
container, iova,
iotlb->addr_mask + 1, ret);
iotlb->addr_mask + 1, ret, strerror(-ret));
vfio_set_migration_error(ret);
}
}
out:
@ -868,6 +974,22 @@ static void vfio_unregister_ram_discard_listener(VFIOContainer *container,
g_free(vrdl);
}
static VFIOHostDMAWindow *vfio_find_hostwin(VFIOContainer *container,
hwaddr iova, hwaddr end)
{
VFIOHostDMAWindow *hostwin;
bool hostwin_found = false;
QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
hostwin_found = true;
break;
}
}
return hostwin_found ? hostwin : NULL;
}
static bool vfio_known_safe_misalignment(MemoryRegionSection *section)
{
MemoryRegion *mr = section->mr;
@ -884,24 +1006,15 @@ static bool vfio_known_safe_misalignment(MemoryRegionSection *section)
return true;
}
static void vfio_listener_region_add(MemoryListener *listener,
MemoryRegionSection *section)
static bool vfio_listener_valid_section(MemoryRegionSection *section,
const char *name)
{
VFIOContainer *container = container_of(listener, VFIOContainer, listener);
hwaddr iova, end;
Int128 llend, llsize;
void *vaddr;
int ret;
VFIOHostDMAWindow *hostwin;
bool hostwin_found;
Error *err = NULL;
if (vfio_listener_skipped_section(section)) {
trace_vfio_listener_region_add_skip(
trace_vfio_listener_region_skip(name,
section->offset_within_address_space,
section->offset_within_address_space +
int128_get64(int128_sub(section->size, int128_one())));
return;
return false;
}
if (unlikely((section->offset_within_address_space &
@ -916,15 +1029,53 @@ static void vfio_listener_region_add(MemoryListener *listener,
section->offset_within_region,
qemu_real_host_page_size());
}
return;
return false;
}
return true;
}
static bool vfio_get_section_iova_range(VFIOContainer *container,
MemoryRegionSection *section,
hwaddr *out_iova, hwaddr *out_end,
Int128 *out_llend)
{
Int128 llend;
hwaddr iova;
iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
llend = int128_make64(section->offset_within_address_space);
llend = int128_add(llend, section->size);
llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
if (int128_ge(int128_make64(iova), llend)) {
return false;
}
*out_iova = iova;
*out_end = int128_get64(int128_sub(llend, int128_one()));
if (out_llend) {
*out_llend = llend;
}
return true;
}
static void vfio_listener_region_add(MemoryListener *listener,
MemoryRegionSection *section)
{
VFIOContainer *container = container_of(listener, VFIOContainer, listener);
hwaddr iova, end;
Int128 llend, llsize;
void *vaddr;
int ret;
VFIOHostDMAWindow *hostwin;
Error *err = NULL;
if (!vfio_listener_valid_section(section, "region_add")) {
return;
}
if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) {
if (memory_region_is_ram_device(section->mr)) {
trace_vfio_listener_region_add_no_dma_map(
memory_region_name(section->mr),
@ -934,7 +1085,6 @@ static void vfio_listener_region_add(MemoryListener *listener,
}
return;
}
end = int128_get64(int128_sub(llend, int128_one()));
if (container->iommu_type == VFIO_SPAPR_TCE_v2_IOMMU) {
hwaddr pgsize = 0;
@ -994,15 +1144,8 @@ static void vfio_listener_region_add(MemoryListener *listener,
#endif
}
hostwin_found = false;
QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
hostwin_found = true;
break;
}
}
if (!hostwin_found) {
hostwin = vfio_find_hostwin(container, iova, end);
if (!hostwin) {
error_setg(&err, "Container %p can't map guest IOVA region"
" 0x%"HWADDR_PRIx"..0x%"HWADDR_PRIx, container, iova, end);
goto fail;
@ -1095,8 +1238,9 @@ static void vfio_listener_region_add(MemoryListener *listener,
vaddr, section->readonly);
if (ret) {
error_setg(&err, "vfio_dma_map(%p, 0x%"HWADDR_PRIx", "
"0x%"HWADDR_PRIx", %p) = %d (%m)",
container, iova, int128_get64(llsize), vaddr, ret);
"0x%"HWADDR_PRIx", %p) = %d (%s)",
container, iova, int128_get64(llsize), vaddr, ret,
strerror(-ret));
if (memory_region_is_ram_device(section->mr)) {
/* Allow unexpected mappings not to be fatal for RAM devices */
error_report_err(err);
@ -1140,26 +1284,7 @@ static void vfio_listener_region_del(MemoryListener *listener,
int ret;
bool try_unmap = true;
if (vfio_listener_skipped_section(section)) {
trace_vfio_listener_region_del_skip(
section->offset_within_address_space,
section->offset_within_address_space +
int128_get64(int128_sub(section->size, int128_one())));
return;
}
if (unlikely((section->offset_within_address_space &
~qemu_real_host_page_mask()) !=
(section->offset_within_region & ~qemu_real_host_page_mask()))) {
if (!vfio_known_safe_misalignment(section)) {
error_report("%s received unaligned region %s iova=0x%"PRIx64
" offset_within_region=0x%"PRIx64
" qemu_real_host_page_size=0x%"PRIxPTR,
__func__, memory_region_name(section->mr),
section->offset_within_address_space,
section->offset_within_region,
qemu_real_host_page_size());
}
if (!vfio_listener_valid_section(section, "region_del")) {
return;
}
@ -1186,15 +1311,9 @@ static void vfio_listener_region_del(MemoryListener *listener,
*/
}
iova = REAL_HOST_PAGE_ALIGN(section->offset_within_address_space);
llend = int128_make64(section->offset_within_address_space);
llend = int128_add(llend, section->size);
llend = int128_and(llend, int128_exts64(qemu_real_host_page_mask()));
if (int128_ge(int128_make64(iova), llend)) {
if (!vfio_get_section_iova_range(container, section, &iova, &end, &llend)) {
return;
}
end = int128_get64(int128_sub(llend, int128_one()));
llsize = int128_sub(llend, int128_make64(iova));
@ -1203,15 +1322,9 @@ static void vfio_listener_region_del(MemoryListener *listener,
if (memory_region_is_ram_device(section->mr)) {
hwaddr pgmask;
VFIOHostDMAWindow *hostwin;
bool hostwin_found = false;
QLIST_FOREACH(hostwin, &container->hostwin_list, hostwin_next) {
if (hostwin->min_iova <= iova && end <= hostwin->max_iova) {
hostwin_found = true;
break;
}
}
assert(hostwin_found); /* or region_add() would have failed */
hostwin = vfio_find_hostwin(container, iova, end);
assert(hostwin); /* or region_add() would have failed */
pgmask = (1ULL << ctz64(hostwin->iova_pgsizes)) - 1;
try_unmap = !((iova & pgmask) || (int128_get64(llsize) & pgmask));
@ -1228,16 +1341,18 @@ static void vfio_listener_region_del(MemoryListener *listener,
ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
if (ret) {
error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
"0x%"HWADDR_PRIx") = %d (%m)",
container, iova, int128_get64(llsize), ret);
"0x%"HWADDR_PRIx") = %d (%s)",
container, iova, int128_get64(llsize), ret,
strerror(-ret));
}
iova += int128_get64(llsize);
}
ret = vfio_dma_unmap(container, iova, int128_get64(llsize), NULL);
if (ret) {
error_report("vfio_dma_unmap(%p, 0x%"HWADDR_PRIx", "
"0x%"HWADDR_PRIx") = %d (%m)",
container, iova, int128_get64(llsize), ret);
"0x%"HWADDR_PRIx") = %d (%s)",
container, iova, int128_get64(llsize), ret,
strerror(-ret));
}
}
@ -1256,7 +1371,7 @@ static void vfio_listener_region_del(MemoryListener *listener,
}
}
static void vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
static int vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
{
int ret;
struct vfio_iommu_type1_dirty_bitmap dirty = {
@ -1264,7 +1379,7 @@ static void vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
};
if (!container->dirty_pages_supported) {
return;
return 0;
}
if (start) {
@ -1275,40 +1390,327 @@ static void vfio_set_dirty_page_tracking(VFIOContainer *container, bool start)
ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, &dirty);
if (ret) {
ret = -errno;
error_report("Failed to set dirty tracking flag 0x%x errno: %d",
dirty.flags, errno);
}
return ret;
}
typedef struct VFIODirtyRanges {
hwaddr min32;
hwaddr max32;
hwaddr min64;
hwaddr max64;
} VFIODirtyRanges;
typedef struct VFIODirtyRangesListener {
VFIOContainer *container;
VFIODirtyRanges ranges;
MemoryListener listener;
} VFIODirtyRangesListener;
static void vfio_dirty_tracking_update(MemoryListener *listener,
MemoryRegionSection *section)
{
VFIODirtyRangesListener *dirty = container_of(listener,
VFIODirtyRangesListener,
listener);
VFIODirtyRanges *range = &dirty->ranges;
hwaddr iova, end, *min, *max;
if (!vfio_listener_valid_section(section, "tracking_update") ||
!vfio_get_section_iova_range(dirty->container, section,
&iova, &end, NULL)) {
return;
}
/*
* The address space passed to the dirty tracker is reduced to two ranges:
* one for 32-bit DMA ranges, and another one for 64-bit DMA ranges.
* The underlying reports of dirty will query a sub-interval of each of
* these ranges.
*
* The purpose of the dual range handling is to handle known cases of big
* holes in the address space, like the x86 AMD 1T hole. The alternative
* would be an IOVATree but that has a much bigger runtime overhead and
* unnecessary complexity.
*/
min = (end <= UINT32_MAX) ? &range->min32 : &range->min64;
max = (end <= UINT32_MAX) ? &range->max32 : &range->max64;
if (*min > iova) {
*min = iova;
}
if (*max < end) {
*max = end;
}
trace_vfio_device_dirty_tracking_update(iova, end, *min, *max);
return;
}
static const MemoryListener vfio_dirty_tracking_listener = {
.name = "vfio-tracking",
.region_add = vfio_dirty_tracking_update,
};
static void vfio_dirty_tracking_init(VFIOContainer *container,
VFIODirtyRanges *ranges)
{
VFIODirtyRangesListener dirty;
memset(&dirty, 0, sizeof(dirty));
dirty.ranges.min32 = UINT32_MAX;
dirty.ranges.min64 = UINT64_MAX;
dirty.listener = vfio_dirty_tracking_listener;
dirty.container = container;
memory_listener_register(&dirty.listener,
container->space->as);
*ranges = dirty.ranges;
/*
* The memory listener is synchronous, and used to calculate the range
* to dirty tracking. Unregister it after we are done as we are not
* interested in any follow-up updates.
*/
memory_listener_unregister(&dirty.listener);
}
static void vfio_devices_dma_logging_stop(VFIOContainer *container)
{
uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
sizeof(uint64_t))] = {};
struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
VFIODevice *vbasedev;
VFIOGroup *group;
feature->argsz = sizeof(buf);
feature->flags = VFIO_DEVICE_FEATURE_SET |
VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP;
QLIST_FOREACH(group, &container->group_list, container_next) {
QLIST_FOREACH(vbasedev, &group->device_list, next) {
if (!vbasedev->dirty_tracking) {
continue;
}
if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
warn_report("%s: Failed to stop DMA logging, err %d (%s)",
vbasedev->name, -errno, strerror(errno));
}
vbasedev->dirty_tracking = false;
}
}
}
static struct vfio_device_feature *
vfio_device_feature_dma_logging_start_create(VFIOContainer *container,
VFIODirtyRanges *tracking)
{
struct vfio_device_feature *feature;
size_t feature_size;
struct vfio_device_feature_dma_logging_control *control;
struct vfio_device_feature_dma_logging_range *ranges;
feature_size = sizeof(struct vfio_device_feature) +
sizeof(struct vfio_device_feature_dma_logging_control);
feature = g_try_malloc0(feature_size);
if (!feature) {
errno = ENOMEM;
return NULL;
}
feature->argsz = feature_size;
feature->flags = VFIO_DEVICE_FEATURE_SET |
VFIO_DEVICE_FEATURE_DMA_LOGGING_START;
control = (struct vfio_device_feature_dma_logging_control *)feature->data;
control->page_size = qemu_real_host_page_size();
/*
* DMA logging uAPI guarantees to support at least a number of ranges that
* fits into a single host kernel base page.
*/
control->num_ranges = !!tracking->max32 + !!tracking->max64;
ranges = g_try_new0(struct vfio_device_feature_dma_logging_range,
control->num_ranges);
if (!ranges) {
g_free(feature);
errno = ENOMEM;
return NULL;
}
control->ranges = (__u64)(uintptr_t)ranges;
if (tracking->max32) {
ranges->iova = tracking->min32;
ranges->length = (tracking->max32 - tracking->min32) + 1;
ranges++;
}
if (tracking->max64) {
ranges->iova = tracking->min64;
ranges->length = (tracking->max64 - tracking->min64) + 1;
}
trace_vfio_device_dirty_tracking_start(control->num_ranges,
tracking->min32, tracking->max32,
tracking->min64, tracking->max64);
return feature;
}
static void vfio_device_feature_dma_logging_start_destroy(
struct vfio_device_feature *feature)
{
struct vfio_device_feature_dma_logging_control *control =
(struct vfio_device_feature_dma_logging_control *)feature->data;
struct vfio_device_feature_dma_logging_range *ranges =
(struct vfio_device_feature_dma_logging_range *)(uintptr_t)control->ranges;
g_free(ranges);
g_free(feature);
}
static int vfio_devices_dma_logging_start(VFIOContainer *container)
{
struct vfio_device_feature *feature;
VFIODirtyRanges ranges;
VFIODevice *vbasedev;
VFIOGroup *group;
int ret = 0;
vfio_dirty_tracking_init(container, &ranges);
feature = vfio_device_feature_dma_logging_start_create(container,
&ranges);
if (!feature) {
return -errno;
}
QLIST_FOREACH(group, &container->group_list, container_next) {
QLIST_FOREACH(vbasedev, &group->device_list, next) {
if (vbasedev->dirty_tracking) {
continue;
}
ret = ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
if (ret) {
ret = -errno;
error_report("%s: Failed to start DMA logging, err %d (%s)",
vbasedev->name, ret, strerror(errno));
goto out;
}
vbasedev->dirty_tracking = true;
}
}
out:
if (ret) {
vfio_devices_dma_logging_stop(container);
}
vfio_device_feature_dma_logging_start_destroy(feature);
return ret;
}
static void vfio_listener_log_global_start(MemoryListener *listener)
{
VFIOContainer *container = container_of(listener, VFIOContainer, listener);
int ret;
vfio_set_dirty_page_tracking(container, true);
if (vfio_devices_all_device_dirty_tracking(container)) {
ret = vfio_devices_dma_logging_start(container);
} else {
ret = vfio_set_dirty_page_tracking(container, true);
}
if (ret) {
error_report("vfio: Could not start dirty page tracking, err: %d (%s)",
ret, strerror(-ret));
vfio_set_migration_error(ret);
}
}
static void vfio_listener_log_global_stop(MemoryListener *listener)
{
VFIOContainer *container = container_of(listener, VFIOContainer, listener);
int ret = 0;
vfio_set_dirty_page_tracking(container, false);
if (vfio_devices_all_device_dirty_tracking(container)) {
vfio_devices_dma_logging_stop(container);
} else {
ret = vfio_set_dirty_page_tracking(container, false);
}
if (ret) {
error_report("vfio: Could not stop dirty page tracking, err: %d (%s)",
ret, strerror(-ret));
vfio_set_migration_error(ret);
}
}
static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
uint64_t size, ram_addr_t ram_addr)
static int vfio_device_dma_logging_report(VFIODevice *vbasedev, hwaddr iova,
hwaddr size, void *bitmap)
{
uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature) +
sizeof(struct vfio_device_feature_dma_logging_report),
sizeof(__u64))] = {};
struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
struct vfio_device_feature_dma_logging_report *report =
(struct vfio_device_feature_dma_logging_report *)feature->data;
report->iova = iova;
report->length = size;
report->page_size = qemu_real_host_page_size();
report->bitmap = (__u64)(uintptr_t)bitmap;
feature->argsz = sizeof(buf);
feature->flags = VFIO_DEVICE_FEATURE_GET |
VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT;
if (ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature)) {
return -errno;
}
return 0;
}
static int vfio_devices_query_dirty_bitmap(VFIOContainer *container,
VFIOBitmap *vbmap, hwaddr iova,
hwaddr size)
{
VFIODevice *vbasedev;
VFIOGroup *group;
int ret;
QLIST_FOREACH(group, &container->group_list, container_next) {
QLIST_FOREACH(vbasedev, &group->device_list, next) {
ret = vfio_device_dma_logging_report(vbasedev, iova, size,
vbmap->bitmap);
if (ret) {
error_report("%s: Failed to get DMA logging report, iova: "
"0x%" HWADDR_PRIx ", size: 0x%" HWADDR_PRIx
", err: %d (%s)",
vbasedev->name, iova, size, ret, strerror(-ret));
return ret;
}
}
}
return 0;
}
static int vfio_query_dirty_bitmap(VFIOContainer *container, VFIOBitmap *vbmap,
hwaddr iova, hwaddr size)
{
struct vfio_iommu_type1_dirty_bitmap *dbitmap;
struct vfio_iommu_type1_dirty_bitmap_get *range;
uint64_t pages;
int ret;
if (!container->dirty_pages_supported) {
cpu_physical_memory_set_dirty_range(ram_addr, size,
tcg_enabled() ? DIRTY_CLIENTS_ALL :
DIRTY_CLIENTS_NOCODE);
return 0;
}
dbitmap = g_malloc0(sizeof(*dbitmap) + sizeof(*range));
dbitmap->argsz = sizeof(*dbitmap) + sizeof(*range);
@ -1323,36 +1725,63 @@ static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
* to qemu_real_host_page_size.
*/
range->bitmap.pgsize = qemu_real_host_page_size();
pages = REAL_HOST_PAGE_ALIGN(range->size) / qemu_real_host_page_size();
range->bitmap.size = ROUND_UP(pages, sizeof(__u64) * BITS_PER_BYTE) /
BITS_PER_BYTE;
range->bitmap.data = g_try_malloc0(range->bitmap.size);
if (!range->bitmap.data) {
ret = -ENOMEM;
goto err_out;
}
range->bitmap.size = vbmap->size;
range->bitmap.data = (__u64 *)vbmap->bitmap;
ret = ioctl(container->fd, VFIO_IOMMU_DIRTY_PAGES, dbitmap);
if (ret) {
ret = -errno;
error_report("Failed to get dirty bitmap for iova: 0x%"PRIx64
" size: 0x%"PRIx64" err: %d", (uint64_t)range->iova,
(uint64_t)range->size, errno);
goto err_out;
}
cpu_physical_memory_set_dirty_lebitmap((unsigned long *)range->bitmap.data,
ram_addr, pages);
trace_vfio_get_dirty_bitmap(container->fd, range->iova, range->size,
range->bitmap.size, ram_addr);
err_out:
g_free(range->bitmap.data);
g_free(dbitmap);
return ret;
}
static int vfio_get_dirty_bitmap(VFIOContainer *container, uint64_t iova,
uint64_t size, ram_addr_t ram_addr)
{
bool all_device_dirty_tracking =
vfio_devices_all_device_dirty_tracking(container);
VFIOBitmap vbmap;
int ret;
if (!container->dirty_pages_supported && !all_device_dirty_tracking) {
cpu_physical_memory_set_dirty_range(ram_addr, size,
tcg_enabled() ? DIRTY_CLIENTS_ALL :
DIRTY_CLIENTS_NOCODE);
return 0;
}
ret = vfio_bitmap_alloc(&vbmap, size);
if (ret) {
return ret;
}
if (all_device_dirty_tracking) {
ret = vfio_devices_query_dirty_bitmap(container, &vbmap, iova, size);
} else {
ret = vfio_query_dirty_bitmap(container, &vbmap, iova, size);
}
if (ret) {
goto out;
}
cpu_physical_memory_set_dirty_lebitmap(vbmap.bitmap, ram_addr,
vbmap.pages);
trace_vfio_get_dirty_bitmap(container->fd, iova, size, vbmap.size,
ram_addr);
out:
g_free(vbmap.bitmap);
return ret;
}
typedef struct {
IOMMUNotifier n;
VFIOGuestIOMMU *giommu;
@ -1366,29 +1795,33 @@ static void vfio_iommu_map_dirty_notify(IOMMUNotifier *n, IOMMUTLBEntry *iotlb)
VFIOContainer *container = giommu->container;
hwaddr iova = iotlb->iova + giommu->iommu_offset;
ram_addr_t translated_addr;
int ret = -EINVAL;
trace_vfio_iommu_map_dirty_notify(iova, iova + iotlb->addr_mask);
if (iotlb->target_as != &address_space_memory) {
error_report("Wrong target AS \"%s\", only system memory is allowed",
iotlb->target_as->name ? iotlb->target_as->name : "none");
return;
goto out;
}
rcu_read_lock();
if (vfio_get_xlat_addr(iotlb, NULL, &translated_addr, NULL)) {
int ret;
ret = vfio_get_dirty_bitmap(container, iova, iotlb->addr_mask + 1,
translated_addr);
if (ret) {
error_report("vfio_iommu_map_dirty_notify(%p, 0x%"HWADDR_PRIx", "
"0x%"HWADDR_PRIx") = %d (%m)",
container, iova,
iotlb->addr_mask + 1, ret);
"0x%"HWADDR_PRIx") = %d (%s)",
container, iova, iotlb->addr_mask + 1, ret,
strerror(-ret));
}
}
rcu_read_unlock();
out:
if (ret) {
vfio_set_migration_error(ret);
}
}
static int vfio_ram_discard_get_dirty_bitmap(MemoryRegionSection *section,
@ -1481,13 +1914,19 @@ static void vfio_listener_log_sync(MemoryListener *listener,
MemoryRegionSection *section)
{
VFIOContainer *container = container_of(listener, VFIOContainer, listener);
int ret;
if (vfio_listener_skipped_section(section)) {
return;
}
if (vfio_devices_all_dirty_tracking(container)) {
vfio_sync_dirty_bitmap(container, section);
ret = vfio_sync_dirty_bitmap(container, section);
if (ret) {
error_report("vfio: Failed to sync dirty bitmap, err: %d (%s)", ret,
strerror(-ret));
vfio_set_migration_error(ret);
}
}
}
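
As an aside (not part of the diff): the two-range reduction performed by vfio_dirty_tracking_update() above can be illustrated standalone. The helper name and sample IOVA values below are hypothetical; the point is that sections ending at or below UINT32_MAX collapse into one [min32, max32] window and everything above it into [min64, max64], so a large gap between low and high memory (such as the AMD 1T hole mentioned in the code comment) is not included in the tracked ranges.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical standalone version of the VFIODirtyRanges reduction above. */
struct dirty_ranges {
    uint64_t min32, max32;
    uint64_t min64, max64;
};

static void update_ranges(struct dirty_ranges *r, uint64_t iova, uint64_t end)
{
    uint64_t *min = (end <= UINT32_MAX) ? &r->min32 : &r->min64;
    uint64_t *max = (end <= UINT32_MAX) ? &r->max32 : &r->max64;

    if (*min > iova) {
        *min = iova;
    }
    if (*max < end) {
        *max = end;
    }
}

int main(void)
{
    /* Same initialization as vfio_dirty_tracking_init(): mins start high,
     * maxes start at zero. */
    struct dirty_ranges r = { .min32 = UINT32_MAX, .min64 = UINT64_MAX };

    /* Sample guest RAM sections: two below 4 GiB, one high range above it. */
    update_ranges(&r, 0x00000000ULL, 0x7fffffffULL);
    update_ranges(&r, 0xc0000000ULL, 0xffffffffULL);
    update_ranges(&r, 0x380000000000ULL, 0x380fffffffffULL);

    printf("32-bit window: 0x%" PRIx64 " - 0x%" PRIx64 "\n", r.min32, r.max32);
    printf("64-bit window: 0x%" PRIx64 " - 0x%" PRIx64 "\n", r.min64, r.max64);
    return 0;
}

With these sample inputs the device would be asked to log [0x0, 0xffffffff] and [0x380000000000, 0x380fffffffff], leaving the gap in between untracked.
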


@ -521,7 +521,7 @@ static void vfio_migration_state_notifier(Notifier *notifier, void *data)
}
}
static void vfio_migration_exit(VFIODevice *vbasedev)
static void vfio_migration_free(VFIODevice *vbasedev)
{
g_free(vbasedev->migration);
vbasedev->migration = NULL;
@ -555,6 +555,19 @@ static int vfio_migration_query_flags(VFIODevice *vbasedev, uint64_t *mig_flags)
return 0;
}
static bool vfio_dma_logging_supported(VFIODevice *vbasedev)
{
uint64_t buf[DIV_ROUND_UP(sizeof(struct vfio_device_feature),
sizeof(uint64_t))] = {};
struct vfio_device_feature *feature = (struct vfio_device_feature *)buf;
feature->argsz = sizeof(buf);
feature->flags = VFIO_DEVICE_FEATURE_PROBE |
VFIO_DEVICE_FEATURE_DMA_LOGGING_START;
return !ioctl(vbasedev->fd, VFIO_DEVICE_FEATURE, feature);
}
static int vfio_migration_init(VFIODevice *vbasedev)
{
int ret;
@ -589,6 +602,8 @@ static int vfio_migration_init(VFIODevice *vbasedev)
migration->device_state = VFIO_DEVICE_STATE_RUNNING;
migration->data_fd = -1;
vbasedev->dirty_pages_supported = vfio_dma_logging_supported(vbasedev);
oid = vmstate_if_get_id(VMSTATE_IF(DEVICE(obj)));
if (oid) {
path = g_strdup_printf("%s/vfio", oid);
@ -616,7 +631,7 @@ int64_t vfio_mig_bytes_transferred(void)
return bytes_transferred;
}
int vfio_migration_probe(VFIODevice *vbasedev, Error **errp)
int vfio_migration_realize(VFIODevice *vbasedev, Error **errp)
{
int ret = -ENOTSUP;
@ -634,6 +649,11 @@ int vfio_migration_probe(VFIODevice *vbasedev, Error **errp)
return ret;
}
ret = vfio_block_giommu_migration(errp);
if (ret) {
return ret;
}
trace_vfio_migration_probe(vbasedev->name);
return 0;
@ -649,7 +669,7 @@ add_blocker:
return ret;
}
void vfio_migration_finalize(VFIODevice *vbasedev)
void vfio_migration_exit(VFIODevice *vbasedev)
{
if (vbasedev->migration) {
VFIOMigration *migration = vbasedev->migration;
@ -657,7 +677,7 @@ void vfio_migration_finalize(VFIODevice *vbasedev)
remove_migration_state_change_notifier(&migration->migration_state);
qemu_del_vm_change_state_handler(migration->vm_state);
unregister_savevm(VMSTATE_IF(vbasedev->dev), "vfio", vbasedev);
vfio_migration_exit(vbasedev);
vfio_migration_free(vbasedev);
vfio_unblock_multiple_devices_migration();
}


@ -3145,7 +3145,7 @@ static void vfio_realize(PCIDevice *pdev, Error **errp)
}
if (!pdev->failover_pair_id) {
ret = vfio_migration_probe(vbasedev, errp);
ret = vfio_migration_realize(vbasedev, errp);
if (ret) {
error_report("%s: Migration disabled", vbasedev->name);
}
@ -3185,6 +3185,7 @@ static void vfio_instance_finalize(Object *obj)
*/
vfio_put_device(vdev);
vfio_put_group(group);
vfio_migration_finalize();
}
static void vfio_exitfn(PCIDevice *pdev)
@ -3203,7 +3204,7 @@ static void vfio_exitfn(PCIDevice *pdev)
}
vfio_teardown_msi(vdev);
vfio_bars_exit(vdev);
vfio_migration_finalize(&vdev->vbasedev);
vfio_migration_exit(&vdev->vbasedev);
}
static void vfio_pci_reset(DeviceState *dev)


@ -96,14 +96,15 @@ vfio_pci_igd_lpc_bridge_enabled(const char *name) "%s"
vfio_region_write(const char *name, int index, uint64_t addr, uint64_t data, unsigned size) " (%s:region%d+0x%"PRIx64", 0x%"PRIx64 ", %d)"
vfio_region_read(char *name, int index, uint64_t addr, unsigned size, uint64_t data) " (%s:region%d+0x%"PRIx64", %d) = 0x%"PRIx64
vfio_iommu_map_notify(const char *op, uint64_t iova_start, uint64_t iova_end) "iommu %s @ 0x%"PRIx64" - 0x%"PRIx64
vfio_listener_region_add_skip(uint64_t start, uint64_t end) "SKIPPING region_add 0x%"PRIx64" - 0x%"PRIx64
vfio_listener_region_skip(const char *name, uint64_t start, uint64_t end) "SKIPPING %s 0x%"PRIx64" - 0x%"PRIx64
vfio_spapr_group_attach(int groupfd, int tablefd) "Attached groupfd %d to liobn fd %d"
vfio_listener_region_add_iommu(uint64_t start, uint64_t end) "region_add [iommu] 0x%"PRIx64" - 0x%"PRIx64
vfio_listener_region_add_ram(uint64_t iova_start, uint64_t iova_end, void *vaddr) "region_add [ram] 0x%"PRIx64" - 0x%"PRIx64" [%p]"
vfio_known_safe_misalignment(const char *name, uint64_t iova, uint64_t offset_within_region, uintptr_t page_size) "Region \"%s\" iova=0x%"PRIx64" offset_within_region=0x%"PRIx64" qemu_real_host_page_size=0x%"PRIxPTR
vfio_listener_region_add_no_dma_map(const char *name, uint64_t iova, uint64_t size, uint64_t page_size) "Region \"%s\" 0x%"PRIx64" size=0x%"PRIx64" is not aligned to 0x%"PRIx64" and cannot be mapped for DMA"
vfio_listener_region_del_skip(uint64_t start, uint64_t end) "SKIPPING region_del 0x%"PRIx64" - 0x%"PRIx64
vfio_listener_region_del(uint64_t start, uint64_t end) "region_del 0x%"PRIx64" - 0x%"PRIx64
vfio_device_dirty_tracking_update(uint64_t start, uint64_t end, uint64_t min, uint64_t max) "section 0x%"PRIx64" - 0x%"PRIx64" -> update [0x%"PRIx64" - 0x%"PRIx64"]"
vfio_device_dirty_tracking_start(int nr_ranges, uint64_t min32, uint64_t max32, uint64_t min64, uint64_t max64) "nr_ranges %d 32:[0x%"PRIx64" - 0x%"PRIx64"], 64:[0x%"PRIx64" - 0x%"PRIx64"]"
vfio_disconnect_container(int fd) "close container->fd=%d"
vfio_put_group(int fd) "close group->fd=%d"
vfio_get_device(const char * name, unsigned int flags, unsigned int num_regions, unsigned int num_irqs) "Device %s flags: %u, regions: %u, irqs: %u"
@ -117,7 +118,7 @@ vfio_region_mmaps_set_enabled(const char *name, bool enabled) "Region %s mmaps e
vfio_region_unmap(const char *name, unsigned long offset, unsigned long end) "Region %s unmap [0x%lx - 0x%lx]"
vfio_region_sparse_mmap_header(const char *name, int index, int nr_areas) "Device %s region %d: %d sparse mmap entries"
vfio_region_sparse_mmap_entry(int i, unsigned long start, unsigned long end) "sparse entry %d [0x%lx - 0x%lx]"
vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%0x8"
vfio_get_dev_region(const char *name, int index, uint32_t type, uint32_t subtype) "%s index %d, %08x/%08x"
vfio_dma_unmap_overflow_workaround(void) ""
vfio_get_dirty_bitmap(int fd, uint64_t iova, uint64_t size, uint64_t bitmap_size, uint64_t start) "container fd=%d, iova=0x%"PRIx64" size= 0x%"PRIx64" bitmap_size=0x%"PRIx64" start=0x%"PRIx64
vfio_iommu_map_dirty_notify(uint64_t iova_start, uint64_t iova_end) "iommu dirty @ 0x%"PRIx64" - 0x%"PRIx64