mirror of
https://github.com/Motorhead1991/qemu.git
synced 2025-08-03 07:43:54 -06:00
migration: allow fault thread to pause
Allow the fault thread to stop handling page faults temporarily. When a network failure happens (and we expect a recovery afterwards), we should not allow the fault thread to continue sending requests to the source; instead, it should halt until the connection is rebuilt. When the destination main thread notices the failure, it kicks the fault thread to switch it to the paused state. Reviewed-by: Dr. David Alan Gilbert <dgilbert@redhat.com> Signed-off-by: Peter Xu <peterx@redhat.com> Message-Id: <20180502104740.12123-7-peterx@redhat.com> Signed-off-by: Juan Quintela <quintela@redhat.com>
This commit is contained in:
parent
14b1742eaa
commit
3a7804c306
5 changed files with 57 additions and 4 deletions
|
@ -830,6 +830,17 @@ static void mark_postcopy_blocktime_end(uintptr_t addr)
|
|||
affected_cpu);
|
||||
}
|
||||
|
||||
/*
 * Park the calling (fault) thread until it is told to carry on.
 *
 * Emits a trace event, blocks on mis->postcopy_pause_sem_fault, and emits
 * a second trace event once the wait returns.
 *
 * NOTE(review): the semaphore is presumably posted once the return-path
 * channel has been rebuilt after a network failure; the poster is not
 * visible here -- confirm against the recovery code.
 *
 * @mis: incoming migration state that owns the pause semaphore.
 *
 * Returns: true, meaning the caller may resume fault handling. This
 * implementation never returns false.
 */
static bool postcopy_pause_fault_thread(MigrationIncomingState *mis)
{
    trace_postcopy_pause_fault_thread();

    /* Sleep here until someone posts the fault-thread pause semaphore. */
    qemu_sem_wait(&mis->postcopy_pause_sem_fault);

    trace_postcopy_pause_fault_thread_continued();

    return true;
}
|
||||
|
||||
/*
|
||||
* Handle faults detected by the USERFAULT markings
|
||||
*/
|
||||
|
@ -880,6 +891,22 @@ static void *postcopy_ram_fault_thread(void *opaque)
|
|||
break;
|
||||
}
|
||||
|
||||
if (!mis->to_src_file) {
|
||||
/*
|
||||
* Possibly someone tells us that the return path is
|
||||
* broken already using the event. We should hold until
|
||||
* the channel is rebuilt.
|
||||
*/
|
||||
if (postcopy_pause_fault_thread(mis)) {
|
||||
mis->last_rb = NULL;
|
||||
/* Continue to read the userfaultfd */
|
||||
} else {
|
||||
error_report("%s: paused but don't allow to continue",
|
||||
__func__);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (pfd[1].revents) {
|
||||
uint64_t tmp64 = 0;
|
||||
|
||||
|
@ -942,18 +969,37 @@ static void *postcopy_ram_fault_thread(void *opaque)
|
|||
(uintptr_t)(msg.arg.pagefault.address),
|
||||
msg.arg.pagefault.feat.ptid, rb);
|
||||
|
||||
retry:
|
||||
/*
|
||||
* Send the request to the source - we want to request one
|
||||
* of our host page sizes (which is >= TPS)
|
||||
*/
|
||||
if (rb != mis->last_rb) {
|
||||
mis->last_rb = rb;
|
||||
migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
|
||||
rb_offset, qemu_ram_pagesize(rb));
|
||||
ret = migrate_send_rp_req_pages(mis,
|
||||
qemu_ram_get_idstr(rb),
|
||||
rb_offset,
|
||||
qemu_ram_pagesize(rb));
|
||||
} else {
|
||||
/* Save some space */
|
||||
migrate_send_rp_req_pages(mis, NULL,
|
||||
rb_offset, qemu_ram_pagesize(rb));
|
||||
ret = migrate_send_rp_req_pages(mis,
|
||||
NULL,
|
||||
rb_offset,
|
||||
qemu_ram_pagesize(rb));
|
||||
}
|
||||
|
||||
if (ret) {
|
||||
/* May be network failure, try to wait for recovery */
|
||||
if (ret == -EIO && postcopy_pause_fault_thread(mis)) {
|
||||
/* We got reconnected somehow, try to continue */
|
||||
mis->last_rb = NULL;
|
||||
goto retry;
|
||||
} else {
|
||||
/* This is a unavoidable fault */
|
||||
error_report("%s: migrate_send_rp_req_pages() get %d",
|
||||
__func__, ret);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue