mirror of
https://github.com/Motorhead1991/qemu.git
synced 2025-08-10 19:14:58 -06:00
Pull request
v4: * Add PCI_EXPRESS Kconfig dependency to fix s390x in "multi-process: setup PCI host bridge for remote device" [Philippe and Thomas] -----BEGIN PGP SIGNATURE----- iQEzBAABCAAdFiEEhpWov9P5fNqsNXdanKSrs4Grc8gFAmAjprYACgkQnKSrs4Gr c8g8fwgAspVVcW0x2uBdrbbi2PSOtpTu+GGC8gYcvGS526WSINv+mY8QhUFPdaQO MErATW1FuIYuXBkvXcaIRWQGGZkfWd6AyjxjEls2jdcp1ScqJ+wnZmTDsgR8yIb3 vOdsA03YJugrAmF4Lsdpkyq9KaWYlCUxrdPoagEopQCEETkKeEOXq40c3pxzeIBI 75s8i28makJ8Logdvr9SBLsR7DL4mpoVQErUe2IMLGGbPSe5F49VD+b/qI20agoI 7L8mDXqL02nc2qJL63hk2dq7pq++MWq0uDs8bbaCouxiFNrNA8zQ+flLP0VrFcP/ gJUa92unwoKs+yVVppadBZKwjtkbYA== =r5oQ -----END PGP SIGNATURE----- Merge remote-tracking branch 'remotes/stefanha-gitlab/tags/block-pull-request' into staging Pull request v4: * Add PCI_EXPRESS Kconfig dependency to fix s390x in "multi-process: setup PCI host bridge for remote device" [Philippe and Thomas] # gpg: Signature made Wed 10 Feb 2021 09:26:14 GMT # gpg: using RSA key 8695A8BFD3F97CDAAC35775A9CA4ABB381AB73C8 # gpg: Good signature from "Stefan Hajnoczi <stefanha@redhat.com>" [full] # gpg: aka "Stefan Hajnoczi <stefanha@gmail.com>" [full] # Primary key fingerprint: 8695 A8BF D3F9 7CDA AC35 775A 9CA4 ABB3 81AB 73C8 * remotes/stefanha-gitlab/tags/block-pull-request: (27 commits) docs: fix Parallels Image "dirty bitmap" section multi-process: perform device reset in the remote process multi-process: Retrieve PCI info from remote process multi-process: create IOHUB object to handle irq multi-process: Synchronize remote memory multi-process: PCI BAR read/write handling for proxy & remote endpoints multi-process: Forward PCI config space acceses to the remote process multi-process: add proxy communication functions multi-process: introduce proxy object multi-process: setup memory manager for remote device multi-process: Associate fd of a PCIDevice with its object multi-process: Initialize message handler in remote device multi-process: define MPQemuMsg format and transmission functions io: add 
qio_channel_readv_full_all_eof & qio_channel_readv_full_all helpers io: add qio_channel_writev_full_all helper multi-process: setup a machine object for remote device process multi-process: setup PCI host bridge for remote device multi-process: Add config option for multi-process QEMU memory: alloc RAM from file at offset multi-process: add configure and usage information ... Signed-off-by: Peter Maydell <peter.maydell@linaro.org>
This commit is contained in:
commit
83339e21d0
53 changed files with 3296 additions and 70 deletions
8
.github/lockdown.yml
vendored
8
.github/lockdown.yml
vendored
|
@ -10,8 +10,8 @@ issues:
|
||||||
comment: |
|
comment: |
|
||||||
Thank you for your interest in the QEMU project.
|
Thank you for your interest in the QEMU project.
|
||||||
|
|
||||||
This repository is a read-only mirror of the project's master
|
This repository is a read-only mirror of the project's repostories hosted
|
||||||
repostories hosted on https://git.qemu.org/git/qemu.git.
|
at https://gitlab.com/qemu-project/qemu.git.
|
||||||
The project does not process issues filed on GitHub.
|
The project does not process issues filed on GitHub.
|
||||||
|
|
||||||
The project issues are tracked on Launchpad:
|
The project issues are tracked on Launchpad:
|
||||||
|
@ -24,8 +24,8 @@ pulls:
|
||||||
comment: |
|
comment: |
|
||||||
Thank you for your interest in the QEMU project.
|
Thank you for your interest in the QEMU project.
|
||||||
|
|
||||||
This repository is a read-only mirror of the project's master
|
This repository is a read-only mirror of the project's repostories hosted
|
||||||
repostories hosted on https://git.qemu.org/git/qemu.git.
|
on https://gitlab.com/qemu-project/qemu.git.
|
||||||
The project does not process merge requests filed on GitHub.
|
The project does not process merge requests filed on GitHub.
|
||||||
|
|
||||||
QEMU welcomes contributions of code (either fixing bugs or adding new
|
QEMU welcomes contributions of code (either fixing bugs or adding new
|
||||||
|
|
|
@ -18,7 +18,6 @@ include:
|
||||||
image: $CI_REGISTRY_IMAGE/qemu/$IMAGE:latest
|
image: $CI_REGISTRY_IMAGE/qemu/$IMAGE:latest
|
||||||
before_script:
|
before_script:
|
||||||
- JOBS=$(expr $(nproc) + 1)
|
- JOBS=$(expr $(nproc) + 1)
|
||||||
- sed -i s,git.qemu.org/git,gitlab.com/qemu-project, .gitmodules
|
|
||||||
script:
|
script:
|
||||||
- mkdir build
|
- mkdir build
|
||||||
- cd build
|
- cd build
|
||||||
|
|
44
.gitmodules
vendored
44
.gitmodules
vendored
|
@ -1,66 +1,66 @@
|
||||||
[submodule "roms/seabios"]
|
[submodule "roms/seabios"]
|
||||||
path = roms/seabios
|
path = roms/seabios
|
||||||
url = https://git.qemu.org/git/seabios.git/
|
url = https://gitlab.com/qemu-project/seabios.git/
|
||||||
[submodule "roms/SLOF"]
|
[submodule "roms/SLOF"]
|
||||||
path = roms/SLOF
|
path = roms/SLOF
|
||||||
url = https://git.qemu.org/git/SLOF.git
|
url = https://gitlab.com/qemu-project/SLOF.git
|
||||||
[submodule "roms/ipxe"]
|
[submodule "roms/ipxe"]
|
||||||
path = roms/ipxe
|
path = roms/ipxe
|
||||||
url = https://git.qemu.org/git/ipxe.git
|
url = https://gitlab.com/qemu-project/ipxe.git
|
||||||
[submodule "roms/openbios"]
|
[submodule "roms/openbios"]
|
||||||
path = roms/openbios
|
path = roms/openbios
|
||||||
url = https://git.qemu.org/git/openbios.git
|
url = https://gitlab.com/qemu-project/openbios.git
|
||||||
[submodule "roms/qemu-palcode"]
|
[submodule "roms/qemu-palcode"]
|
||||||
path = roms/qemu-palcode
|
path = roms/qemu-palcode
|
||||||
url = https://git.qemu.org/git/qemu-palcode.git
|
url = https://gitlab.com/qemu-project/qemu-palcode.git
|
||||||
[submodule "roms/sgabios"]
|
[submodule "roms/sgabios"]
|
||||||
path = roms/sgabios
|
path = roms/sgabios
|
||||||
url = https://git.qemu.org/git/sgabios.git
|
url = https://gitlab.com/qemu-project/sgabios.git
|
||||||
[submodule "dtc"]
|
[submodule "dtc"]
|
||||||
path = dtc
|
path = dtc
|
||||||
url = https://git.qemu.org/git/dtc.git
|
url = https://gitlab.com/qemu-project/dtc.git
|
||||||
[submodule "roms/u-boot"]
|
[submodule "roms/u-boot"]
|
||||||
path = roms/u-boot
|
path = roms/u-boot
|
||||||
url = https://git.qemu.org/git/u-boot.git
|
url = https://gitlab.com/qemu-project/u-boot.git
|
||||||
[submodule "roms/skiboot"]
|
[submodule "roms/skiboot"]
|
||||||
path = roms/skiboot
|
path = roms/skiboot
|
||||||
url = https://git.qemu.org/git/skiboot.git
|
url = https://gitlab.com/qemu-project/skiboot.git
|
||||||
[submodule "roms/QemuMacDrivers"]
|
[submodule "roms/QemuMacDrivers"]
|
||||||
path = roms/QemuMacDrivers
|
path = roms/QemuMacDrivers
|
||||||
url = https://git.qemu.org/git/QemuMacDrivers.git
|
url = https://gitlab.com/qemu-project/QemuMacDrivers.git
|
||||||
[submodule "ui/keycodemapdb"]
|
[submodule "ui/keycodemapdb"]
|
||||||
path = ui/keycodemapdb
|
path = ui/keycodemapdb
|
||||||
url = https://git.qemu.org/git/keycodemapdb.git
|
url = https://gitlab.com/qemu-project/keycodemapdb.git
|
||||||
[submodule "capstone"]
|
[submodule "capstone"]
|
||||||
path = capstone
|
path = capstone
|
||||||
url = https://git.qemu.org/git/capstone.git
|
url = https://gitlab.com/qemu-project/capstone.git
|
||||||
[submodule "roms/seabios-hppa"]
|
[submodule "roms/seabios-hppa"]
|
||||||
path = roms/seabios-hppa
|
path = roms/seabios-hppa
|
||||||
url = https://git.qemu.org/git/seabios-hppa.git
|
url = https://gitlab.com/qemu-project/seabios-hppa.git
|
||||||
[submodule "roms/u-boot-sam460ex"]
|
[submodule "roms/u-boot-sam460ex"]
|
||||||
path = roms/u-boot-sam460ex
|
path = roms/u-boot-sam460ex
|
||||||
url = https://git.qemu.org/git/u-boot-sam460ex.git
|
url = https://gitlab.com/qemu-project/u-boot-sam460ex.git
|
||||||
[submodule "tests/fp/berkeley-testfloat-3"]
|
[submodule "tests/fp/berkeley-testfloat-3"]
|
||||||
path = tests/fp/berkeley-testfloat-3
|
path = tests/fp/berkeley-testfloat-3
|
||||||
url = https://git.qemu.org/git/berkeley-testfloat-3.git
|
url = https://gitlab.com/qemu-project/berkeley-testfloat-3.git
|
||||||
[submodule "tests/fp/berkeley-softfloat-3"]
|
[submodule "tests/fp/berkeley-softfloat-3"]
|
||||||
path = tests/fp/berkeley-softfloat-3
|
path = tests/fp/berkeley-softfloat-3
|
||||||
url = https://git.qemu.org/git/berkeley-softfloat-3.git
|
url = https://gitlab.com/qemu-project/berkeley-softfloat-3.git
|
||||||
[submodule "roms/edk2"]
|
[submodule "roms/edk2"]
|
||||||
path = roms/edk2
|
path = roms/edk2
|
||||||
url = https://git.qemu.org/git/edk2.git
|
url = https://gitlab.com/qemu-project/edk2.git
|
||||||
[submodule "slirp"]
|
[submodule "slirp"]
|
||||||
path = slirp
|
path = slirp
|
||||||
url = https://git.qemu.org/git/libslirp.git
|
url = https://gitlab.com/qemu-project/libslirp.git
|
||||||
[submodule "roms/opensbi"]
|
[submodule "roms/opensbi"]
|
||||||
path = roms/opensbi
|
path = roms/opensbi
|
||||||
url = https://git.qemu.org/git/opensbi.git
|
url = https://gitlab.com/qemu-project/opensbi.git
|
||||||
[submodule "roms/qboot"]
|
[submodule "roms/qboot"]
|
||||||
path = roms/qboot
|
path = roms/qboot
|
||||||
url = https://git.qemu.org/git/qboot.git
|
url = https://gitlab.com/qemu-project/qboot.git
|
||||||
[submodule "meson"]
|
[submodule "meson"]
|
||||||
path = meson
|
path = meson
|
||||||
url = https://git.qemu.org/git/meson.git
|
url = https://gitlab.com/qemu-project/meson.git
|
||||||
[submodule "roms/vbootrom"]
|
[submodule "roms/vbootrom"]
|
||||||
path = roms/vbootrom
|
path = roms/vbootrom
|
||||||
url = https://git.qemu.org/git/vbootrom.git
|
url = https://gitlab.com/qemu-project/vbootrom.git
|
||||||
|
|
|
@ -37,3 +37,7 @@ config VIRTFS
|
||||||
|
|
||||||
config PVRDMA
|
config PVRDMA
|
||||||
bool
|
bool
|
||||||
|
|
||||||
|
config MULTIPROCESS_ALLOWED
|
||||||
|
bool
|
||||||
|
imply MULTIPROCESS
|
||||||
|
|
24
MAINTAINERS
24
MAINTAINERS
|
@ -3200,6 +3200,30 @@ S: Maintained
|
||||||
F: hw/semihosting/
|
F: hw/semihosting/
|
||||||
F: include/hw/semihosting/
|
F: include/hw/semihosting/
|
||||||
|
|
||||||
|
Multi-process QEMU
|
||||||
|
M: Elena Ufimtseva <elena.ufimtseva@oracle.com>
|
||||||
|
M: Jagannathan Raman <jag.raman@oracle.com>
|
||||||
|
M: John G Johnson <john.g.johnson@oracle.com>
|
||||||
|
S: Maintained
|
||||||
|
F: docs/devel/multi-process.rst
|
||||||
|
F: docs/system/multi-process.rst
|
||||||
|
F: hw/pci-host/remote.c
|
||||||
|
F: include/hw/pci-host/remote.h
|
||||||
|
F: hw/remote/machine.c
|
||||||
|
F: include/hw/remote/machine.h
|
||||||
|
F: hw/remote/mpqemu-link.c
|
||||||
|
F: include/hw/remote/mpqemu-link.h
|
||||||
|
F: hw/remote/message.c
|
||||||
|
F: hw/remote/remote-obj.c
|
||||||
|
F: include/hw/remote/memory.h
|
||||||
|
F: hw/remote/memory.c
|
||||||
|
F: hw/remote/proxy.c
|
||||||
|
F: include/hw/remote/proxy.h
|
||||||
|
F: hw/remote/proxy-memory-listener.c
|
||||||
|
F: include/hw/remote/proxy-memory-listener.h
|
||||||
|
F: hw/remote/iohub.c
|
||||||
|
F: include/hw/remote/iohub.h
|
||||||
|
|
||||||
Build and test automation
|
Build and test automation
|
||||||
-------------------------
|
-------------------------
|
||||||
Build and test automation
|
Build and test automation
|
||||||
|
|
|
@ -60,7 +60,7 @@ The QEMU source code is maintained under the GIT version control system.
|
||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
git clone https://git.qemu.org/git/qemu.git
|
git clone https://gitlab.com/qemu-project/qemu.git
|
||||||
|
|
||||||
When submitting patches, one common approach is to use 'git
|
When submitting patches, one common approach is to use 'git
|
||||||
format-patch' and/or 'git send-email' to format & send the mail to the
|
format-patch' and/or 'git send-email' to format & send the mail to the
|
||||||
|
@ -78,7 +78,7 @@ The QEMU website is also maintained under source control.
|
||||||
|
|
||||||
.. code-block:: shell
|
.. code-block:: shell
|
||||||
|
|
||||||
git clone https://git.qemu.org/git/qemu-web.git
|
git clone https://gitlab.com/qemu-project/qemu-web.git
|
||||||
|
|
||||||
* `<https://www.qemu.org/2017/02/04/the-new-qemu-website-is-up/>`_
|
* `<https://www.qemu.org/2017/02/04/the-new-qemu-website-is-up/>`_
|
||||||
|
|
||||||
|
|
|
@ -55,7 +55,7 @@ memfd_backend_memory_alloc(HostMemoryBackend *backend, Error **errp)
|
||||||
name = host_memory_backend_get_name(backend);
|
name = host_memory_backend_get_name(backend);
|
||||||
memory_region_init_ram_from_fd(&backend->mr, OBJECT(backend),
|
memory_region_init_ram_from_fd(&backend->mr, OBJECT(backend),
|
||||||
name, backend->size,
|
name, backend->size,
|
||||||
backend->share, fd, errp);
|
backend->share, fd, 0, errp);
|
||||||
g_free(name);
|
g_free(name);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
10
configure
vendored
10
configure
vendored
|
@ -463,6 +463,7 @@ skip_meson=no
|
||||||
gettext="auto"
|
gettext="auto"
|
||||||
fuse="auto"
|
fuse="auto"
|
||||||
fuse_lseek="auto"
|
fuse_lseek="auto"
|
||||||
|
multiprocess="no"
|
||||||
|
|
||||||
malloc_trim="auto"
|
malloc_trim="auto"
|
||||||
|
|
||||||
|
@ -797,6 +798,7 @@ Linux)
|
||||||
linux="yes"
|
linux="yes"
|
||||||
linux_user="yes"
|
linux_user="yes"
|
||||||
vhost_user=${default_feature:-yes}
|
vhost_user=${default_feature:-yes}
|
||||||
|
multiprocess=${default_feature:-yes}
|
||||||
;;
|
;;
|
||||||
esac
|
esac
|
||||||
|
|
||||||
|
@ -1556,6 +1558,10 @@ for opt do
|
||||||
;;
|
;;
|
||||||
--disable-fuse-lseek) fuse_lseek="disabled"
|
--disable-fuse-lseek) fuse_lseek="disabled"
|
||||||
;;
|
;;
|
||||||
|
--enable-multiprocess) multiprocess="yes"
|
||||||
|
;;
|
||||||
|
--disable-multiprocess) multiprocess="no"
|
||||||
|
;;
|
||||||
*)
|
*)
|
||||||
echo "ERROR: unknown option $opt"
|
echo "ERROR: unknown option $opt"
|
||||||
echo "Try '$0 --help' for more information"
|
echo "Try '$0 --help' for more information"
|
||||||
|
@ -1908,6 +1914,7 @@ disabled with --disable-FEATURE, default is enabled if available
|
||||||
libdaxctl libdaxctl support
|
libdaxctl libdaxctl support
|
||||||
fuse FUSE block device export
|
fuse FUSE block device export
|
||||||
fuse-lseek SEEK_HOLE/SEEK_DATA support for FUSE exports
|
fuse-lseek SEEK_HOLE/SEEK_DATA support for FUSE exports
|
||||||
|
multiprocess Multiprocess QEMU support
|
||||||
|
|
||||||
NOTE: The object files are built at the place where configure is launched
|
NOTE: The object files are built at the place where configure is launched
|
||||||
EOF
|
EOF
|
||||||
|
@ -6082,6 +6089,9 @@ fi
|
||||||
if test "$have_mlockall" = "yes" ; then
|
if test "$have_mlockall" = "yes" ; then
|
||||||
echo "HAVE_MLOCKALL=y" >> $config_host_mak
|
echo "HAVE_MLOCKALL=y" >> $config_host_mak
|
||||||
fi
|
fi
|
||||||
|
if test "$multiprocess" = "yes" ; then
|
||||||
|
echo "CONFIG_MULTIPROCESS_ALLOWED=y" >> $config_host_mak
|
||||||
|
fi
|
||||||
if test "$fuzzing" = "yes" ; then
|
if test "$fuzzing" = "yes" ; then
|
||||||
# If LIB_FUZZING_ENGINE is set, assume we are running on OSS-Fuzz, and the
|
# If LIB_FUZZING_ENGINE is set, assume we are running on OSS-Fuzz, and the
|
||||||
# needed CFLAGS have already been provided
|
# needed CFLAGS have already been provided
|
||||||
|
|
|
@ -37,3 +37,4 @@ Contents:
|
||||||
clocks
|
clocks
|
||||||
qom
|
qom
|
||||||
block-coroutine-wrapper
|
block-coroutine-wrapper
|
||||||
|
multi-process
|
||||||
|
|
966
docs/devel/multi-process.rst
Normal file
966
docs/devel/multi-process.rst
Normal file
|
@ -0,0 +1,966 @@
|
||||||
|
This is the design document for multi-process QEMU. It does not
|
||||||
|
necessarily reflect the status of the current implementation, which
|
||||||
|
may lack features or be considerably different from what is described
|
||||||
|
in this document. This document is still useful as a description of
|
||||||
|
the goals and general direction of this feature.
|
||||||
|
|
||||||
|
Please refer to the following wiki for latest details:
|
||||||
|
https://wiki.qemu.org/Features/MultiProcessQEMU
|
||||||
|
|
||||||
|
Multi-process QEMU
|
||||||
|
===================
|
||||||
|
|
||||||
|
QEMU is often used as the hypervisor for virtual machines running in the
|
||||||
|
Oracle cloud. Since one of the advantages of cloud computing is the
|
||||||
|
ability to run many VMs from different tenants in the same cloud
|
||||||
|
infrastructure, a guest that compromised its hypervisor could
|
||||||
|
potentially use the hypervisor's access privileges to access data it is
|
||||||
|
not authorized for.
|
||||||
|
|
||||||
|
QEMU can be susceptible to security attacks because it is a large,
|
||||||
|
monolithic program that provides many features to the VMs it services.
|
||||||
|
Many of these features can be configured out of QEMU, but even a reduced
|
||||||
|
configuration QEMU has a large amount of code a guest can potentially
|
||||||
|
attack. Separating QEMU reduces the attack surface by helping to
|
||||||
|
limit each component in the system to only access the resources that
|
||||||
|
it needs to perform its job.
|
||||||
|
|
||||||
|
QEMU services
|
||||||
|
-------------
|
||||||
|
|
||||||
|
QEMU can be broadly described as providing three main services. One is a
|
||||||
|
VM control point, where VMs can be created, migrated, re-configured, and
|
||||||
|
destroyed. A second is to emulate the CPU instructions within the VM,
|
||||||
|
often accelerated by HW virtualization features such as Intel's VT
|
||||||
|
extensions. Finally, it provides IO services to the VM by emulating HW
|
||||||
|
IO devices, such as disk and network devices.
|
||||||
|
|
||||||
|
A multi-process QEMU
|
||||||
|
~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
A multi-process QEMU involves separating QEMU services into separate
|
||||||
|
host processes. Each of these processes can be given only the privileges
|
||||||
|
it needs to provide its service, e.g., a disk service could be given
|
||||||
|
access only to the disk images it provides, and not be allowed to
|
||||||
|
access other files, or any network devices. An attacker who compromised
|
||||||
|
this service would not be able to use this exploit to access files or
|
||||||
|
devices beyond what the disk service was given access to.
|
||||||
|
|
||||||
|
A QEMU control process would remain, but in multi-process mode, will
|
||||||
|
have no direct interfaces to the VM. During VM execution, it would still
|
||||||
|
provide the user interface to hot-plug devices or live migrate the VM.
|
||||||
|
|
||||||
|
A first step in creating a multi-process QEMU is to separate IO services
|
||||||
|
from the main QEMU program, which would continue to provide CPU
|
||||||
|
emulation. i.e., the control process would also be the CPU emulation
|
||||||
|
process. In a later phase, CPU emulation could be separated from the
|
||||||
|
control process.
|
||||||
|
|
||||||
|
Separating IO services
|
||||||
|
----------------------
|
||||||
|
|
||||||
|
Separating IO services into individual host processes is a good place to
|
||||||
|
begin for a couple of reasons. One is that the sheer number of IO devices QEMU
|
||||||
|
can emulate provides a large surface of interfaces which could potentially
|
||||||
|
be exploited, and, indeed, have been a source of exploits in the past.
|
||||||
|
Another is that the modular nature of QEMU device emulation code provides
|
||||||
|
interface points where the QEMU functions that perform device emulation
|
||||||
|
can be separated from the QEMU functions that manage the emulation of
|
||||||
|
guest CPU instructions. The devices emulated in the separate process are
|
||||||
|
referred to as remote devices.
|
||||||
|
|
||||||
|
QEMU device emulation
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
QEMU uses an object oriented SW architecture for device emulation code.
|
||||||
|
Configured objects are all compiled into the QEMU binary, then objects
|
||||||
|
are instantiated by name when used by the guest VM. For example, the
|
||||||
|
code to emulate a device named "foo" is always present in QEMU, but its
|
||||||
|
instantiation code is only run when the device is included in the target
|
||||||
|
VM. (e.g., via the QEMU command line as *-device foo*)
|
||||||
|
|
||||||
|
The object model is hierarchical, so device emulation code names its
|
||||||
|
parent object (such as "pci-device" for a PCI device) and QEMU will
|
||||||
|
instantiate a parent object before calling the device's instantiation
|
||||||
|
code.
|
||||||
|
|
||||||
|
Current separation models
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
In order to separate the device emulation code from the CPU emulation
|
||||||
|
code, the device object code must run in a different process. There are
|
||||||
|
a couple of existing QEMU features that can run emulation code
|
||||||
|
separately from the main QEMU process. These are examined below.
|
||||||
|
|
||||||
|
vhost user model
|
||||||
|
^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
Virtio guest device drivers can be connected to vhost user applications
|
||||||
|
in order to perform their IO operations. This model uses special virtio
|
||||||
|
device drivers in the guest and vhost user device objects in QEMU, but
|
||||||
|
once the QEMU vhost user code has configured the vhost user application,
|
||||||
|
mission-mode IO is performed by the application. The vhost user
|
||||||
|
application is a daemon process that can be contacted via a known UNIX
|
||||||
|
domain socket.
|
||||||
|
|
||||||
|
vhost socket
|
||||||
|
''''''''''''
|
||||||
|
|
||||||
|
As mentioned above, one of the tasks of the vhost device object within
|
||||||
|
QEMU is to contact the vhost application and send it configuration
|
||||||
|
information about this device instance. As part of the configuration
|
||||||
|
process, the application can also be sent other file descriptors over
|
||||||
|
the socket, which then can be used by the vhost user application in
|
||||||
|
various ways, some of which are described below.
|
||||||
|
|
||||||
|
vhost MMIO store acceleration
|
||||||
|
'''''''''''''''''''''''''''''
|
||||||
|
|
||||||
|
VMs are often run using HW virtualization features via the KVM kernel
|
||||||
|
driver. This driver allows QEMU to accelerate the emulation of guest CPU
|
||||||
|
instructions by running the guest in a virtual HW mode. When the guest
|
||||||
|
executes instructions that cannot be executed by virtual HW mode,
|
||||||
|
execution returns to the KVM driver so it can inform QEMU to emulate the
|
||||||
|
instructions in SW.
|
||||||
|
|
||||||
|
One of the events that can cause a return to QEMU is when a guest device
|
||||||
|
driver accesses an IO location. QEMU then dispatches the memory
|
||||||
|
operation to the corresponding QEMU device object. In the case of a
|
||||||
|
vhost user device, the memory operation would need to be sent over a
|
||||||
|
socket to the vhost application. This path is accelerated by the QEMU
|
||||||
|
virtio code by setting up an eventfd file descriptor through which the vhost
|
||||||
|
application can directly receive MMIO store notifications from the KVM
|
||||||
|
driver, instead of needing them to be sent to the QEMU process first.
|
||||||
|
|
||||||
|
vhost interrupt acceleration
|
||||||
|
''''''''''''''''''''''''''''
|
||||||
|
|
||||||
|
Another optimization used by the vhost application is the ability to
|
||||||
|
directly inject interrupts into the VM via the KVM driver, again,
|
||||||
|
bypassing the need to send the interrupt back to the QEMU process first.
|
||||||
|
The QEMU virtio setup code configures the KVM driver with an eventfd
|
||||||
|
that triggers the device interrupt in the guest when the eventfd is
|
||||||
|
written. This irqfd file descriptor is then passed to the vhost user
|
||||||
|
application program.
|
||||||
|
|
||||||
|
vhost access to guest memory
|
||||||
|
''''''''''''''''''''''''''''
|
||||||
|
|
||||||
|
The vhost application is also allowed to directly access guest memory,
|
||||||
|
instead of needing to send the data as messages to QEMU. This is also
|
||||||
|
done with file descriptors sent to the vhost user application by QEMU.
|
||||||
|
These descriptors can be passed to ``mmap()`` by the vhost application
|
||||||
|
to map the guest address space into the vhost application.
|
||||||
|
|
||||||
|
IOMMUs introduce another level of complexity, since the address given to
|
||||||
|
the guest virtio device to DMA to or from is not a guest physical
|
||||||
|
address. This case is handled by having vhost code within QEMU register
|
||||||
|
as a listener for IOMMU mapping changes. The vhost application maintains
|
||||||
|
a cache of IOMMU translations: sending translation requests back to
|
||||||
|
QEMU on cache misses, and in turn receiving flush requests from QEMU
|
||||||
|
when mappings are purged.
|
||||||
|
|
||||||
|
applicability to device separation
|
||||||
|
''''''''''''''''''''''''''''''''''
|
||||||
|
|
||||||
|
Much of the vhost model can be re-used by separated device emulation. In
|
||||||
|
particular, the ideas of using a socket between QEMU and the device
|
||||||
|
emulation application, using a file descriptor to inject interrupts into
|
||||||
|
the VM via KVM, and allowing the application to ``mmap()`` the guest
|
||||||
|
should be re-used.
|
||||||
|
|
||||||
|
There are, however, some notable differences between how a vhost
|
||||||
|
application works and the needs of separated device emulation. The most
|
||||||
|
basic is that vhost uses custom virtio device drivers which always
|
||||||
|
trigger IO with MMIO stores. A separated device emulation model must
|
||||||
|
work with existing IO device models and guest device drivers. MMIO loads
|
||||||
|
break vhost store acceleration since they are synchronous - guest
|
||||||
|
progress cannot continue until the load has been emulated. By contrast,
|
||||||
|
stores are asynchronous, the guest can continue after the store event
|
||||||
|
has been sent to the vhost application.
|
||||||
|
|
||||||
|
Another difference is that in the vhost user model, a single daemon can
|
||||||
|
support multiple QEMU instances. This is contrary to the security regime
|
||||||
|
desired, in which the emulation application should only be allowed to
|
||||||
|
access the files or devices the VM it's running on behalf of can access.
|
||||||
|
|
||||||
|
qemu-io model
|
||||||
|
^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
Qemu-io is a test harness used to test changes to the QEMU block backend
|
||||||
|
object code. (e.g., the code that implements disk images for disk driver
|
||||||
|
emulation) Qemu-io is not a device emulation application per se, but it
|
||||||
|
does compile the QEMU block objects into a separate binary from the main
|
||||||
|
QEMU one. This could be useful for disk device emulation, since its
|
||||||
|
emulation applications will need to include the QEMU block objects.
|
||||||
|
|
||||||
|
New separation model based on proxy objects
|
||||||
|
-------------------------------------------
|
||||||
|
|
||||||
|
A different model based on proxy objects in the QEMU program
|
||||||
|
communicating with remote emulation programs could provide separation
|
||||||
|
while minimizing the changes needed to the device emulation code. The
|
||||||
|
rest of this section is a discussion of how a proxy object model would
|
||||||
|
work.
|
||||||
|
|
||||||
|
Remote emulation processes
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
The remote emulation process will run the QEMU object hierarchy without
|
||||||
|
modification. The device emulation objects will also be based on the
|
||||||
|
QEMU code, because for anything but the simplest device, it would not be
|
||||||
|
tractable to re-implement both the object model and the many device
|
||||||
|
backends that QEMU has.
|
||||||
|
|
||||||
|
The processes will communicate with the QEMU process over UNIX domain
|
||||||
|
sockets. The processes can be executed either as standalone processes,
|
||||||
|
or be executed by QEMU. In both cases, the host backends the emulation
|
||||||
|
processes will provide are specified on its command line, as they would
|
||||||
|
be for QEMU. For example:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
disk-proc -blockdev driver=file,node-name=file0,filename=disk-file0 \
|
||||||
|
-blockdev driver=qcow2,node-name=drive0,file=file0
|
||||||
|
|
||||||
|
would indicate process *disk-proc* uses a qcow2 emulated disk named
|
||||||
|
*file0* as its backend.
|
||||||
|
|
||||||
|
Emulation processes may emulate more than one guest controller. A common
|
||||||
|
configuration might be to put all controllers of the same device class
|
||||||
|
(e.g., disk, network, etc.) in a single process, so that all backends of
|
||||||
|
the same type can be managed by a single QMP monitor.
|
||||||
|
|
||||||
|
communication with QEMU
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
The first argument to the remote emulation process will be a Unix domain
|
||||||
|
socket that connects with the Proxy object. This is a required argument.
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
disk-proc <socket number> <backend list>
|
||||||
|
|
||||||
|
remote process QMP monitor
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
Remote emulation processes can be monitored via QMP, similar to QEMU
|
||||||
|
itself. The QMP monitor socket is specified the same as for a QEMU
|
||||||
|
process:
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
disk-proc -qmp unix:/tmp/disk-mon,server
|
||||||
|
|
||||||
|
can be monitored over the UNIX socket path */tmp/disk-mon*.
|
||||||
|
|
||||||
|
QEMU command line
|
||||||
|
~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
Each remote device emulated in a remote process on the host is
|
||||||
|
represented as a *-device* of type *pci-proxy-dev*. A socket
|
||||||
|
sub-option to this option specifies the Unix socket that connects
|
||||||
|
to the remote process. An *id* sub-option is required, and it should
|
||||||
|
be the same id as used in the remote process.
|
||||||
|
|
||||||
|
::
|
||||||
|
|
||||||
|
qemu-system-x86_64 ... -device pci-proxy-dev,id=lsi0,socket=3
|
||||||
|
|
||||||
|
can be used to add a device emulated in a remote process
|
||||||
|
|
||||||
|
|
||||||
|
QEMU management of remote processes
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
QEMU is not aware of the type of the remote PCI device. It is
|
||||||
|
a pass through device as far as QEMU is concerned.
|
||||||
|
|
||||||
|
communication with emulation process
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
primary channel
|
||||||
|
'''''''''''''''
|
||||||
|
|
||||||
|
The primary channel (referred to as com in the code) is used to bootstrap
|
||||||
|
the remote process. It is also used to pass on device-agnostic commands
|
||||||
|
like reset.
|
||||||
|
|
||||||
|
per-device channels
|
||||||
|
'''''''''''''''''''
|
||||||
|
|
||||||
|
Each remote device communicates with QEMU using a dedicated communication
|
||||||
|
channel. The proxy object sets up this channel using the primary
|
||||||
|
channel during its initialization.
|
||||||
|
|
||||||
|
QEMU device proxy objects
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
QEMU has an object model based on sub-classes inherited from the
|
||||||
|
"object" super-class. The sub-classes that are of interest here are the
|
||||||
|
"device" and "bus" sub-classes whose child sub-classes make up the
|
||||||
|
device tree of a QEMU emulated system.
|
||||||
|
|
||||||
|
The proxy object model will use device proxy objects to replace the
|
||||||
|
device emulation code within the QEMU process. These objects will live
|
||||||
|
in the same place in the object and bus hierarchies as the objects they
|
||||||
|
replace. i.e., the proxy object for an LSI SCSI controller will be a
|
||||||
|
sub-class of the "pci-device" class, and will have the same PCI bus
|
||||||
|
parent and the same SCSI bus child objects as the LSI controller object
|
||||||
|
it replaces.
|
||||||
|
|
||||||
|
It is worth noting that the same proxy object is used to mediate with
|
||||||
|
all types of remote PCI devices.
|
||||||
|
|
||||||
|
object initialization
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
The Proxy device objects are initialized in the exact same manner in
|
||||||
|
which any other QEMU device would be initialized.
|
||||||
|
|
||||||
|
In addition, the Proxy objects perform the following two tasks:
|
||||||
|
- Parses the "socket" sub-option and connects to the remote process
|
||||||
|
using this channel
|
||||||
|
- Uses the "id" sub-option to connect to the emulated device on the
|
||||||
|
separate process
|
||||||
|
|
||||||
|
class\_init
|
||||||
|
'''''''''''
|
||||||
|
|
||||||
|
The ``class_init()`` method of a proxy object will, in general, behave
|
||||||
|
similarly to the object it replaces, including setting any static
|
||||||
|
properties and methods needed by the proxy.
|
||||||
|
|
||||||
|
instance\_init / realize
|
||||||
|
''''''''''''''''''''''''
|
||||||
|
|
||||||
|
The ``instance_init()`` and ``realize()`` functions would only need to
|
||||||
|
perform tasks related to being a proxy, such as registering its own
|
||||||
|
MMIO handlers, or creating a child bus that other proxy devices can be
|
||||||
|
attached to later.
|
||||||
|
|
||||||
|
Other tasks will be device-specific. For example, PCI device objects
|
||||||
|
will initialize the PCI config space in order to make a valid PCI device
|
||||||
|
tree within the QEMU process.
|
||||||
|
|
||||||
|
address space registration
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
Most devices are driven by guest device driver accesses to IO addresses
|
||||||
|
or ports. The QEMU device emulation code uses QEMU's memory region
|
||||||
|
function calls (such as ``memory_region_init_io()``) to add callback
|
||||||
|
functions that QEMU will invoke when the guest accesses the device's
|
||||||
|
areas of the IO address space. When a guest driver does access the
|
||||||
|
device, the VM will exit HW virtualization mode and return to QEMU,
|
||||||
|
which will then look up and execute the corresponding callback function.
|
||||||
|
|
||||||
|
A proxy object would need to mirror the memory region calls the actual
|
||||||
|
device emulator would perform in its initialization code, but with its
|
||||||
|
own callbacks. When invoked by QEMU as a result of a guest IO operation,
|
||||||
|
they will forward the operation to the device emulation process.
|
||||||
|
|
||||||
|
PCI config space
|
||||||
|
^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
PCI devices also have a configuration space that can be accessed by the
|
||||||
|
guest driver. Guest accesses to this space are not handled by the device
|
||||||
|
emulation object, but by its PCI parent object. Much of this space is
|
||||||
|
read-only, but certain registers (especially BAR and MSI-related ones)
|
||||||
|
need to be propagated to the emulation process.
|
||||||
|
|
||||||
|
PCI parent proxy
|
||||||
|
''''''''''''''''
|
||||||
|
|
||||||
|
One way to propagate guest PCI config accesses is to create a
|
||||||
|
"pci-device-proxy" class that can serve as the parent of a PCI device
|
||||||
|
proxy object. This class's parent would be "pci-device" and it would
|
||||||
|
override the PCI parent's ``config_read()`` and ``config_write()``
|
||||||
|
methods with ones that forward these operations to the emulation
|
||||||
|
program.
|
||||||
|
|
||||||
|
interrupt receipt
|
||||||
|
^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
A proxy for a device that generates interrupts will need to create a
|
||||||
|
socket to receive interrupt indications from the emulation process. An
|
||||||
|
incoming interrupt indication would then be sent up to its bus parent to
|
||||||
|
be injected into the guest. For example, a PCI device object may use
|
||||||
|
``pci_set_irq()``.
|
||||||
|
|
||||||
|
live migration
|
||||||
|
^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
The proxy will register to save and restore any *vmstate* it needs over
|
||||||
|
a live migration event. The device proxy does not need to manage the
|
||||||
|
remote device's *vmstate*; that will be handled by the remote process
|
||||||
|
proxy (see below).
|
||||||
|
|
||||||
|
QEMU remote device operation
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
Generic device operations, such as DMA, will be performed by the remote
|
||||||
|
process proxy by sending messages to the remote process.
|
||||||
|
|
||||||
|
DMA operations
|
||||||
|
^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
DMA operations would be handled much like vhost applications do. One of
|
||||||
|
the initial messages sent to the emulation process is a guest memory
|
||||||
|
table. Each entry in this table consists of a file descriptor and size
|
||||||
|
that the emulation process can ``mmap()`` to directly access guest
|
||||||
|
memory, similar to ``vhost_user_set_mem_table()``. Note guest memory
|
||||||
|
must be backed by file descriptors, such as when QEMU is given the
|
||||||
|
*-mem-path* command line option.
|
||||||
|
|
||||||
|
IOMMU operations
|
||||||
|
^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
When the emulated system includes an IOMMU, the remote process proxy in
|
||||||
|
QEMU will need to create a socket for IOMMU requests from the emulation
|
||||||
|
process. It will handle those requests with an
|
||||||
|
``address_space_get_iotlb_entry()`` call. In order to handle IOMMU
|
||||||
|
unmaps, the remote process proxy will also register as a listener on the
|
||||||
|
device's DMA address space. When an IOMMU memory region is created
|
||||||
|
within the DMA address space, an IOMMU notifier for unmaps will be added
|
||||||
|
to the memory region that will forward unmaps to the emulation process
|
||||||
|
over the IOMMU socket.
|
||||||
|
|
||||||
|
device hot-plug via QMP
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
A QMP "device\_add" command can add a device emulated by a remote
|
||||||
|
process. It will also have a "rid" option to the command, just as the
|
||||||
|
*-device* command line option does. The remote process may either be one
|
||||||
|
started at QEMU startup, or be one added by the "add-process" QMP
|
||||||
|
command described above. In either case, the remote process proxy will
|
||||||
|
forward the new device's JSON description to the corresponding emulation
|
||||||
|
process.
|
||||||
|
|
||||||
|
live migration
|
||||||
|
^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
The remote process proxy will also register for live migration
|
||||||
|
notifications with ``vmstate_register()``. When called to save state,
|
||||||
|
the proxy will send the remote process a secondary socket file
|
||||||
|
descriptor to save the remote process's device *vmstate* over. The
|
||||||
|
incoming byte stream length and data will be saved as the proxy's
|
||||||
|
*vmstate*. When the proxy is resumed on its new host, this *vmstate*
|
||||||
|
will be extracted, and a secondary socket file descriptor will be sent
|
||||||
|
to the new remote process through which it receives the *vmstate* in
|
||||||
|
order to restore the devices there.
|
||||||
|
|
||||||
|
device emulation in remote process
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
The parts of QEMU that the emulation program will need include the
|
||||||
|
object model; the memory emulation objects; the device emulation objects
|
||||||
|
of the targeted device, and any dependent devices; and, the device's
|
||||||
|
backends. It will also need code to setup the machine environment,
|
||||||
|
handle requests from the QEMU process, and route machine-level requests
|
||||||
|
(such as interrupts or IOMMU mappings) back to the QEMU process.
|
||||||
|
|
||||||
|
initialization
|
||||||
|
^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
The process initialization sequence will follow the same sequence
|
||||||
|
followed by QEMU. It will first initialize the backend objects, then
|
||||||
|
device emulation objects. The JSON descriptions sent by the QEMU process
|
||||||
|
will drive which objects need to be created.
|
||||||
|
|
||||||
|
- address spaces
|
||||||
|
|
||||||
|
Before the device objects are created, the initial address spaces and
|
||||||
|
memory regions must be configured with ``memory_map_init()``. This
|
||||||
|
creates a RAM memory region object (*system\_memory*) and an IO memory
|
||||||
|
region object (*system\_io*).
|
||||||
|
|
||||||
|
- RAM
|
||||||
|
|
||||||
|
RAM memory region creation will follow how ``pc_memory_init()`` creates
|
||||||
|
them, but must use ``memory_region_init_ram_from_fd()`` instead of
|
||||||
|
``memory_region_allocate_system_memory()``. The file descriptors needed
|
||||||
|
will be supplied by the guest memory table from above. Those RAM regions
|
||||||
|
would then be added to the *system\_memory* memory region with
|
||||||
|
``memory_region_add_subregion()``.
|
||||||
|
|
||||||
|
- PCI
|
||||||
|
|
||||||
|
IO initialization will be driven by the JSON descriptions sent from the
|
||||||
|
QEMU process. For a PCI device, a PCI bus will need to be created with
|
||||||
|
``pci_root_bus_new()``, and a PCI memory region will need to be created
|
||||||
|
and added to the *system\_memory* memory region with
|
||||||
|
``memory_region_add_subregion_overlap()``. The overlap version is
|
||||||
|
required for architectures where PCI memory overlaps with RAM memory.
|
||||||
|
|
||||||
|
MMIO handling
|
||||||
|
^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
The device emulation objects will use ``memory_region_init_io()`` to
|
||||||
|
install their MMIO handlers, and ``pci_register_bar()`` to associate
|
||||||
|
those handlers with a PCI BAR, as they do within QEMU currently.
|
||||||
|
|
||||||
|
In order to use ``address_space_rw()`` in the emulation process to
|
||||||
|
handle MMIO requests from QEMU, the PCI physical addresses must be the
|
||||||
|
same in the QEMU process and the device emulation process. In order to
|
||||||
|
accomplish that, guest BAR programming must also be forwarded from QEMU
|
||||||
|
to the emulation process.
|
||||||
|
|
||||||
|
interrupt injection
|
||||||
|
^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
When device emulation wants to inject an interrupt into the VM, the
|
||||||
|
request climbs the device's bus object hierarchy until the point where a
|
||||||
|
bus object knows how to signal the interrupt to the guest. The details
|
||||||
|
depend on the type of interrupt being raised.
|
||||||
|
|
||||||
|
- PCI pin interrupts
|
||||||
|
|
||||||
|
On x86 systems, there is an emulated IOAPIC object attached to the root
|
||||||
|
PCI bus object, and the root PCI object forwards interrupt requests to
|
||||||
|
it. The IOAPIC object, in turn, calls the KVM driver to inject the
|
||||||
|
corresponding interrupt into the VM. The simplest way to handle this in
|
||||||
|
an emulation process would be to setup the root PCI bus driver (via
|
||||||
|
``pci_bus_irqs()``) to send an interrupt request back to the QEMU
|
||||||
|
process, and have the device proxy object reflect it up the PCI tree
|
||||||
|
there.
|
||||||
|
|
||||||
|
- PCI MSI/X interrupts
|
||||||
|
|
||||||
|
PCI MSI/X interrupts are implemented in HW as DMA writes to a
|
||||||
|
CPU-specific PCI address. In QEMU on x86, a KVM APIC object receives
|
||||||
|
these DMA writes, then calls into the KVM driver to inject the interrupt
|
||||||
|
into the VM. A simple emulation process implementation would be to send
|
||||||
|
the MSI DMA address from QEMU as a message at initialization, then
|
||||||
|
install an address space handler at that address which forwards the MSI
|
||||||
|
message back to QEMU.
|
||||||
|
|
||||||
|
DMA operations
|
||||||
|
^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
When an emulation object wants to DMA into or out of guest memory, it
|
||||||
|
first must use dma\_memory\_map() to convert the DMA address to a local
|
||||||
|
virtual address. The emulation process memory region objects setup above
|
||||||
|
will be used to translate the DMA address to a local virtual address the
|
||||||
|
device emulation code can access.
|
||||||
|
|
||||||
|
IOMMU
|
||||||
|
^^^^^
|
||||||
|
|
||||||
|
When an IOMMU is in use in QEMU, DMA translation uses IOMMU memory
|
||||||
|
regions to translate the DMA address to a guest physical address before
|
||||||
|
that physical address can be translated to a local virtual address. The
|
||||||
|
emulation process will need similar functionality.
|
||||||
|
|
||||||
|
- IOTLB cache
|
||||||
|
|
||||||
|
The emulation process will maintain a cache of recent IOMMU translations
|
||||||
|
(the IOTLB). When the translate() callback of an IOMMU memory region is
|
||||||
|
invoked, the IOTLB cache will be searched for an entry that will map the
|
||||||
|
DMA address to a guest PA. On a cache miss, a message will be sent back
|
||||||
|
to QEMU requesting the corresponding translation entry, which will both be
|
||||||
|
used to return a guest address and be added to the cache.
|
||||||
|
|
||||||
|
- IOTLB purge
|
||||||
|
|
||||||
|
The IOMMU emulation will also need to act on unmap requests from QEMU.
|
||||||
|
These happen when the guest IOMMU driver purges an entry from the
|
||||||
|
guest's translation table.
|
||||||
|
|
||||||
|
live migration
|
||||||
|
^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
When a remote process receives a live migration indication from QEMU, it
|
||||||
|
will set up a channel using the received file descriptor with
|
||||||
|
``qio_channel_socket_new_fd()``. This channel will be used to create a
|
||||||
|
*QEMUfile* that can be passed to ``qemu_save_device_state()`` to send
|
||||||
|
the process's device state back to QEMU. This method will be reversed on
|
||||||
|
restore - the channel will be passed to ``qemu_loadvm_state()`` to
|
||||||
|
restore the device state.
|
||||||
|
|
||||||
|
Accelerating device emulation
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
The messages that are required to be sent between QEMU and the emulation
|
||||||
|
process can add considerable latency to IO operations. The optimizations
|
||||||
|
described below attempt to ameliorate this effect by allowing the
|
||||||
|
emulation process to communicate directly with the kernel KVM driver.
|
||||||
|
The KVM file descriptors created would be passed to the emulation process
|
||||||
|
via initialization messages, much like the guest memory table is done.
|
||||||
|

MMIO acceleration
^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
Vhost user applications can receive guest virtio driver stores directly
|
||||||
|
from KVM. The issue with the eventfd mechanism used by vhost user is
|
||||||
|
that it does not pass any data with the event indication, so it cannot
|
||||||
|
handle guest loads or guest stores that carry store data. This concept
|
||||||
|
could, however, be expanded to cover more cases.
|
||||||
|
|
||||||
|
The expanded idea would require a new type of KVM device:
|
||||||
|
*KVM\_DEV\_TYPE\_USER*. This device has two file descriptors: a master
|
||||||
|
descriptor that QEMU can use for configuration, and a slave descriptor
|
||||||
|
that the emulation process can use to receive MMIO notifications. QEMU
|
||||||
|
would create both descriptors using the KVM driver, and pass the slave
|
||||||
|
descriptor to the emulation process via an initialization message.
|
||||||
|
|
||||||
|
data structures
|
||||||
|
^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
- guest physical range
|
||||||
|
|
||||||
|
The guest physical range structure describes the address range that a
|
||||||
|
device will respond to. It includes the base and length of the range, as
|
||||||
|
well as which bus the range resides on (e.g., on an x86 machine, it can
|
||||||
|
specify whether the range refers to memory or IO addresses).
|
||||||
|
|
||||||
|
A device can have multiple physical address ranges it responds to (e.g.,
|
||||||
|
a PCI device can have multiple BARs), so the structure will also include
|
||||||
|
an enumerated identifier to specify which of the device's ranges is
|
||||||
|
being referred to.
|
||||||
|
|
||||||
|
+--------+----------------------------+
|
||||||
|
| Name | Description |
|
||||||
|
+========+============================+
|
||||||
|
| addr | range base address |
|
||||||
|
+--------+----------------------------+
|
||||||
|
| len | range length |
|
||||||
|
+--------+----------------------------+
|
||||||
|
| bus | addr type (memory or IO) |
|
||||||
|
+--------+----------------------------+
|
||||||
|
| id | range ID (e.g., PCI BAR) |
|
||||||
|
+--------+----------------------------+
|
||||||
|
|
||||||
|
- MMIO request structure
|
||||||
|
|
||||||
|
This structure describes an MMIO operation. It includes which guest
|
||||||
|
physical range the MMIO was within, the offset within that range, the
|
||||||
|
MMIO type (e.g., load or store), and its length and data. It also
|
||||||
|
includes a sequence number that can be used to reply to the MMIO, and
|
||||||
|
the CPU that issued the MMIO.
|
||||||
|
|
||||||
|
+----------+------------------------+
|
||||||
|
| Name | Description |
|
||||||
|
+==========+========================+
|
||||||
|
| rid | range MMIO is within |
|
||||||
|
+----------+------------------------+
|
||||||
|
| offset | offset within *rid* |
|
||||||
|
+----------+------------------------+
|
||||||
|
| type | e.g., load or store |
|
||||||
|
+----------+------------------------+
|
||||||
|
| len | MMIO length |
|
||||||
|
+----------+------------------------+
|
||||||
|
| data | store data |
|
||||||
|
+----------+------------------------+
|
||||||
|
| seq | sequence ID |
|
||||||
|
+----------+------------------------+
|
||||||
|
|
||||||
|
- MMIO request queues
|
||||||
|
|
||||||
|
MMIO request queues are FIFO arrays of MMIO request structures. There
|
||||||
|
are two queues: the pending queue is for MMIOs that haven't been read by the
|
||||||
|
emulation program, and the sent queue is for MMIOs that haven't been
|
||||||
|
acknowledged. The main use of the second queue is to validate MMIO
|
||||||
|
replies from the emulation program.
|
||||||
|
|
||||||
|
- scoreboard
|
||||||
|
|
||||||
|
Each CPU in the VM is emulated in QEMU by a separate thread, so multiple
|
||||||
|
MMIOs may be waiting to be consumed by an emulation program and multiple
|
||||||
|
threads may be waiting for MMIO replies. The scoreboard would contain a
|
||||||
|
wait queue and sequence number for the per-CPU threads, allowing them to
|
||||||
|
be individually woken when the MMIO reply is received from the emulation
|
||||||
|
program. It also tracks the number of posted MMIO stores to the device
|
||||||
|
that haven't been replied to, in order to satisfy the PCI constraint
|
||||||
|
that a load to a device will not complete until all previous stores to
|
||||||
|
that device have been completed.
|
||||||
|
|
||||||
|
- device shadow memory
|
||||||
|
|
||||||
|
Some MMIO loads do not have device side-effects. These MMIOs can be
|
||||||
|
completed without sending a MMIO request to the emulation program if the
|
||||||
|
emulation program shares a shadow image of the device's memory image
|
||||||
|
with the KVM driver.
|
||||||
|
|
||||||
|
The emulation program will ask the KVM driver to allocate memory for the
|
||||||
|
shadow image, and will then use ``mmap()`` to directly access it. The
|
||||||
|
emulation program can control KVM access to the shadow image by sending
|
||||||
|
KVM an access map telling it which areas of the image have no
|
||||||
|
side-effects (and can be completed immediately), and which require a
|
||||||
|
MMIO request to the emulation program. The access map can also inform
|
||||||
|
the KVM driver which size accesses are allowed to the image.
|
||||||
|
|
||||||
|
master descriptor
|
||||||
|
^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
The master descriptor is used by QEMU to configure the new KVM device.
|
||||||
|
The descriptor would be returned by the KVM driver when QEMU issues a
|
||||||
|
*KVM\_CREATE\_DEVICE* ``ioctl()`` with a *KVM\_DEV\_TYPE\_USER* type.
|
||||||
|
|
||||||
|
KVM\_DEV\_TYPE\_USER device ops
|
||||||
|
|
||||||
|
|
||||||
|
The *KVM\_DEV\_TYPE\_USER* operations vector will be registered by a
|
||||||
|
``kvm_register_device_ops()`` call when the KVM system is initialized by
|
||||||
|
``kvm_init()``. These device ops are called by the KVM driver when QEMU
|
||||||
|
executes certain ``ioctl()`` operations on its KVM file descriptor. They
|
||||||
|
include:
|
||||||
|
|
||||||
|
- create
|
||||||
|
|
||||||
|
This routine is called when QEMU issues a *KVM\_CREATE\_DEVICE*
|
||||||
|
``ioctl()`` on its per-VM file descriptor. It will allocate and
|
||||||
|
initialize a KVM user device specific data structure, and assign the
|
||||||
|
*kvm\_device* private field to it.
|
||||||
|
|
||||||
|
- ioctl
|
||||||
|
|
||||||
|
This routine is invoked when QEMU issues an ``ioctl()`` on the master
|
||||||
|
descriptor. The ``ioctl()`` commands supported are defined by the KVM
|
||||||
|
device type. *KVM\_DEV\_TYPE\_USER* ones will need several commands:
|
||||||
|
|
||||||
|
*KVM\_DEV\_USER\_SLAVE\_FD* creates the slave file descriptor that will
|
||||||
|
be passed to the device emulation program. Only one slave can be created
|
||||||
|
by each master descriptor. The file operations performed by this
|
||||||
|
descriptor are described below.
|
||||||
|
|
||||||
|
The *KVM\_DEV\_USER\_PA\_RANGE* command configures a guest physical
|
||||||
|
address range that the slave descriptor will receive MMIO notifications
|
||||||
|
for. The range is specified by a guest physical range structure
|
||||||
|
argument. For buses that assign addresses to devices dynamically, this
|
||||||
|
command can be executed while the guest is running, such as the case
|
||||||
|
when a guest changes a device's PCI BAR registers.
|
||||||
|
|
||||||
|
*KVM\_DEV\_USER\_PA\_RANGE* will use ``kvm_io_bus_register_dev()`` to
|
||||||
|
register *kvm\_io\_device\_ops* callbacks to be invoked when the guest
|
||||||
|
performs a MMIO operation within the range. When a range is changed,
|
||||||
|
``kvm_io_bus_unregister_dev()`` is used to remove the previous
|
||||||
|
instantiation.
|
||||||
|
|
||||||
|
*KVM\_DEV\_USER\_TIMEOUT* will configure a timeout value that specifies
|
||||||
|
how long KVM will wait for the emulation process to respond to a MMIO
|
||||||
|
indication.
|
||||||
|
|
||||||
|
- destroy
|
||||||
|
|
||||||
|
This routine is called when the VM instance is destroyed. It will need
|
||||||
|
to destroy the slave descriptor; and free any memory allocated by the
|
||||||
|
driver, as well as the *kvm\_device* structure itself.
|
||||||
|
|
||||||
|
slave descriptor
|
||||||
|
^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
The slave descriptor will have its own file operations vector, which
|
||||||
|
responds to system calls on the descriptor performed by the device
|
||||||
|
emulation program.
|
||||||
|
|
||||||
|
- read
|
||||||
|
|
||||||
|
A read returns any pending MMIO requests from the KVM driver as MMIO
|
||||||
|
request structures. Multiple structures can be returned if there are
|
||||||
|
multiple MMIO operations pending. The MMIO requests are moved from the
|
||||||
|
pending queue to the sent queue, and if there are threads waiting for
|
||||||
|
space in the pending queue to add new MMIO operations, they will be woken
|
||||||
|
here.
|
||||||
|
|
||||||
|
- write
|
||||||
|
|
||||||
|
A write also consists of a set of MMIO requests. They are compared to
|
||||||
|
the MMIO requests in the sent queue. Matches are removed from the sent
|
||||||
|
queue, and any threads waiting for the reply are woken. If a store is
|
||||||
|
removed, then the number of posted stores in the per-CPU scoreboard is
|
||||||
|
decremented. When the number is zero, and a non side-effect load was
|
||||||
|
waiting for posted stores to complete, the load is continued.
|
||||||
|
|
||||||
|
- ioctl
|
||||||
|
|
||||||
|
There are several ioctl()s that can be performed on the slave
|
||||||
|
descriptor.
|
||||||
|
|
||||||
|
A *KVM\_DEV\_USER\_SHADOW\_SIZE* ``ioctl()`` causes the KVM driver to
|
||||||
|
allocate memory for the shadow image. This memory can later be
|
||||||
|
``mmap()``\ ed by the emulation process to share the emulation's view of
|
||||||
|
device memory with the KVM driver.
|
||||||
|
|
||||||
|
A *KVM\_DEV\_USER\_SHADOW\_CTRL* ``ioctl()`` controls access to the
|
||||||
|
shadow image. It will send the KVM driver a shadow control map, which
|
||||||
|
specifies which areas of the image can complete guest loads without
|
||||||
|
sending the load request to the emulation program. It will also specify
|
||||||
|
the size of load operations that are allowed.
|
||||||
|
|
||||||
|
- poll
|
||||||
|
|
||||||
|
An emulation program will use the ``poll()`` call with a *POLLIN* flag
|
||||||
|
to determine if there are MMIO requests waiting to be read. It will
|
||||||
|
return if the pending MMIO request queue is not empty.
|
||||||
|
|
||||||
|
- mmap
|
||||||
|
|
||||||
|
This call allows the emulation program to directly access the shadow
|
||||||
|
image allocated by the KVM driver. As device emulation updates device
|
||||||
|
memory, changes with no side-effects will be reflected in the shadow,
|
||||||
|
and the KVM driver can satisfy guest loads from the shadow image without
|
||||||
|
needing to wait for the emulation program.
|
||||||
|
|
||||||
|
kvm\_io\_device ops
|
||||||
|
^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
Each KVM per-CPU thread can handle MMIO operations on behalf of the guest
|
||||||
|
VM. KVM will use the MMIO's guest physical address to search for a
|
||||||
|
matching *kvm\_io\_device* to see if the MMIO can be handled by the KVM
|
||||||
|
driver instead of exiting back to QEMU. If a match is found, the
|
||||||
|
corresponding callback will be invoked.
|
||||||
|
|
||||||
|
- read
|
||||||
|
|
||||||
|
This callback is invoked when the guest performs a load to the device.
|
||||||
|
Loads with side-effects must be handled synchronously, with the KVM
|
||||||
|
driver putting the QEMU thread to sleep waiting for the emulation
|
||||||
|
process reply before re-starting the guest. Loads that do not have
|
||||||
|
side-effects may be optimized by satisfying them from the shadow image,
|
||||||
|
if there are no outstanding stores to the device by this CPU. PCI memory
|
||||||
|
ordering demands that a load cannot complete before all older stores to
|
||||||
|
the same device have been completed.
|
||||||
|
|
||||||
|
- write
|
||||||
|
|
||||||
|
Stores can be handled asynchronously unless the pending MMIO request
|
||||||
|
queue is full. In this case, the QEMU thread must sleep waiting for
|
||||||
|
space in the queue. Stores will increment the number of posted stores in
|
||||||
|
the per-CPU scoreboard, in order to implement the PCI ordering
|
||||||
|
constraint above.
|
||||||
|
|
||||||
|
interrupt acceleration
|
||||||
|
^^^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
This performance optimization would work much like a vhost user
|
||||||
|
application does, where the QEMU process sets up *eventfds* that cause
|
||||||
|
the device's corresponding interrupt to be triggered by the KVM driver.
|
||||||
|
These irq file descriptors are sent to the emulation process at
|
||||||
|
initialization, and are used when the emulation code raises a device
|
||||||
|
interrupt.
|
||||||
|
|
||||||
|
intx acceleration
|
||||||
|
'''''''''''''''''
|
||||||
|
|
||||||
|
Traditional PCI pin interrupts are level based, so, in addition to an
|
||||||
|
irq file descriptor, a re-sampling file descriptor needs to be sent to
|
||||||
|
the emulation program. This second file descriptor allows multiple
|
||||||
|
devices sharing an irq to be notified when the interrupt has been
|
||||||
|
acknowledged by the guest, so they can re-trigger the interrupt if their
|
||||||
|
device has not de-asserted its interrupt.
|
||||||
|
|
||||||
|
intx irq descriptor
|
||||||
|
|
||||||
|
|
||||||
|
The irq descriptors are created by the proxy object
|
||||||
|
using ``event_notifier_init()`` to create the irq and re-sampling
|
||||||
|
*eventfds*, and ``kvm_vm_ioctl(KVM_IRQFD)`` to bind them to an interrupt.
|
||||||
|
The interrupt route can be found with
|
||||||
|
``pci_device_route_intx_to_irq()``.
|
||||||
|
|
||||||
|
intx routing changes
|
||||||
|
|
||||||
|
|
||||||
|
Intx routing can be changed when the guest programs the APIC the device
|
||||||
|
pin is connected to. The proxy object in QEMU will use
|
||||||
|
``pci_device_set_intx_routing_notifier()`` to be informed of any guest
|
||||||
|
changes to the route. This handler will broadly follow the VFIO
|
||||||
|
interrupt logic to change the route: de-assigning the existing irq
|
||||||
|
descriptor from its route, then assigning it the new route. (see
|
||||||
|
``vfio_intx_update()``)
|
||||||
|
|
||||||
|
MSI/X acceleration
|
||||||
|
''''''''''''''''''
|
||||||
|
|
||||||
|
MSI/X interrupts are sent as DMA transactions to the host. The interrupt
|
||||||
|
data contains a vector that is programmed by the guest. A device may have
|
||||||
|
multiple MSI interrupts associated with it, so multiple irq descriptors
|
||||||
|
may need to be sent to the emulation program.
|
||||||
|
|
||||||
|
MSI/X irq descriptor
|
||||||
|
|
||||||
|
|
||||||
|
This case will also follow the VFIO example. For each MSI/X interrupt,
|
||||||
|
an *eventfd* is created, a virtual interrupt is allocated by
|
||||||
|
``kvm_irqchip_add_msi_route()``, and the virtual interrupt is bound to
|
||||||
|
the eventfd with ``kvm_irqchip_add_irqfd_notifier()``.
|
||||||
|
|
||||||
|
MSI/X config space changes
|
||||||
|
|
||||||
|
|
||||||
|
The guest may dynamically update several MSI-related tables in the
|
||||||
|
device's PCI config space. These include per-MSI interrupt enables and
|
||||||
|
vector data. Additionally, MSIX tables exist in device memory space, not
|
||||||
|
config space. Much like the BAR case above, the proxy object must look
|
||||||
|
at guest config space programming to keep the MSI interrupt state
|
||||||
|
consistent between QEMU and the emulation program.
|
||||||
|
|
||||||
|
--------------
|
||||||
|
|
||||||
|
Disaggregated CPU emulation
|
||||||
|
---------------------------
|
||||||
|
|
||||||
|
After IO services have been disaggregated, a second phase would be to
|
||||||
|
separate a process to handle CPU instruction emulation from the main
|
||||||
|
QEMU control function. There are no object separation points for this
|
||||||
|
code, so the first task would be to create one.
|
||||||
|
|
||||||
|
Host access controls
|
||||||
|
--------------------
|
||||||
|
|
||||||
|
Separating QEMU relies on the host OS's access restriction mechanisms to
|
||||||
|
enforce that the differing processes can only access the objects they
|
||||||
|
are entitled to. There are a couple types of mechanisms usually provided
|
||||||
|
by general purpose OSs.
|
||||||
|
|
||||||
|
Discretionary access control
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
Discretionary access control allows each user to control who can access
|
||||||
|
their files. In Linux, this type of control is usually too coarse for
|
||||||
|
QEMU separation, since it only provides three separate access controls:
|
||||||
|
one for the same user ID, the second for user IDs with the same group
|
||||||
|
ID, and the third for all other user IDs. Each device instance would
|
||||||
|
need a separate user ID to provide access control, which is likely to be
|
||||||
|
unwieldy for dynamically created VMs.
|
||||||
|
|
||||||
|
Mandatory access control
|
||||||
|
~~~~~~~~~~~~~~~~~~~~~~~~
|
||||||
|
|
||||||
|
Mandatory access control allows the OS to add an additional set of
|
||||||
|
controls on top of discretionary access for the OS to control. It also
|
||||||
|
adds other attributes to processes and files such as types, roles, and
|
||||||
|
categories, and can establish rules for how processes and files can
|
||||||
|
interact.
|
||||||
|
|
||||||
|
Type enforcement
|
||||||
|
^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
Type enforcement assigns a *type* attribute to processes and files, and
|
||||||
|
allows rules to be written on what operations a process with a given
|
||||||
|
type can perform on a file with a given type. QEMU separation could take
|
||||||
|
advantage of type enforcement by running the emulation processes with
|
||||||
|
different types, both from the main QEMU process, and from the emulation
|
||||||
|
processes of different classes of devices.
|
||||||
|
|
||||||
|
For example, guest disk images and disk emulation processes could have
|
||||||
|
types separate from the main QEMU process and non-disk emulation
|
||||||
|
processes, and the type rules could prevent processes other than disk
|
||||||
|
emulation ones from accessing guest disk images. Similarly, network
|
||||||
|
emulation processes can have a type separate from the main QEMU process
|
||||||
|
and non-network emulation processes, and only that type can access the
|
||||||
|
host tun/tap device used to provide guest networking.
|
||||||
|
|
||||||
|
Category enforcement
|
||||||
|
^^^^^^^^^^^^^^^^^^^^
|
||||||
|
|
||||||
|
Category enforcement assigns a set of numbers within a given range to
|
||||||
|
the process or file. The process is granted access to the file if the
|
||||||
|
process's set is a superset of the file's set. This enforcement can be
|
||||||
|
used to separate multiple instances of devices in the same class.
|
||||||
|
|
||||||
|
For example, if there are multiple disk devices provided to a guest,
|
||||||
|
each device emulation process could be provisioned with a separate
|
||||||
|
category. The different device emulation processes would not be able to
|
||||||
|
access each other's backing disk images.
|
||||||
|
|
||||||
|
Alternatively, categories could be used in lieu of the type enforcement
|
||||||
|
scheme described above. In this scenario, different categories would be
|
||||||
|
used to prevent device emulation processes in different classes from
|
||||||
|
accessing resources assigned to other classes.
|
|
@ -208,7 +208,7 @@ of its data area are:
|
||||||
28 - 31: l1_size
|
28 - 31: l1_size
|
||||||
The number of entries in the L1 table of the bitmap.
|
The number of entries in the L1 table of the bitmap.
|
||||||
|
|
||||||
variable: l1 (64 * l1_size bytes)
|
variable: l1_table (8 * l1_size bytes)
|
||||||
L1 offset table (in bytes)
|
L1 offset table (in bytes)
|
||||||
|
|
||||||
A dirty bitmap is stored using a one-level structure for the mapping to host
|
A dirty bitmap is stored using a one-level structure for the mapping to host
|
||||||
|
|
|
@ -34,6 +34,7 @@ Contents:
|
||||||
pr-manager
|
pr-manager
|
||||||
targets
|
targets
|
||||||
security
|
security
|
||||||
|
multi-process
|
||||||
deprecated
|
deprecated
|
||||||
removed-features
|
removed-features
|
||||||
build-platforms
|
build-platforms
|
||||||
|
|
64
docs/system/multi-process.rst
Normal file
64
docs/system/multi-process.rst
Normal file
|
@ -0,0 +1,64 @@
|
||||||
|
Multi-process QEMU
|
||||||
|
==================
|
||||||
|
|
||||||
|
This document describes how to configure and use multi-process qemu.
|
||||||
|
For the design document refer to docs/devel/qemu-multiprocess.
|
||||||
|
|
||||||
|
1) Configuration
|
||||||
|
----------------
|
||||||
|
|
||||||
|
multi-process is enabled by default for targets that enable KVM
|
||||||
|
|
||||||
|
|
||||||
|
2) Usage
|
||||||
|
--------
|
||||||
|
|
||||||
|
Multi-process QEMU requires an orchestrator to launch.
|
||||||
|
|
||||||
|
The following is a description of the command-line used to launch mpqemu.
|
||||||
|
|
||||||
|
* Orchestrator:
|
||||||
|
|
||||||
|
- The Orchestrator creates a unix socketpair
|
||||||
|
|
||||||
|
- It launches the remote process and passes one of the
|
||||||
|
sockets to it via command-line.
|
||||||
|
|
||||||
|
- It then launches QEMU and specifies the other socket as an option
|
||||||
|
to the Proxy device object
|
||||||
|
|
||||||
|
* Remote Process:
|
||||||
|
|
||||||
|
- QEMU can enter remote process mode by using the "x-remote" machine
|
||||||
|
option.
|
||||||
|
|
||||||
|
- The orchestrator creates an "x-remote-object" with details about
|
||||||
|
the device and the file descriptor for the device
|
||||||
|
|
||||||
|
- The remaining options are no different from how one launches QEMU with
|
||||||
|
devices.
|
||||||
|
|
||||||
|
- Example command-line for the remote process is as follows:
|
||||||
|
|
||||||
|
/usr/bin/qemu-system-x86_64 \
|
||||||
|
-machine x-remote \
|
||||||
|
-device lsi53c895a,id=lsi0 \
|
||||||
|
-drive id=drive_image2,file=/build/ol7-nvme-test-1.qcow2 \
|
||||||
|
-device scsi-hd,id=drive2,drive=drive_image2,bus=lsi0.0,scsi-id=0 \
|
||||||
|
-object x-remote-object,id=robj1,devid=lsi0,fd=4
|
||||||
|
|
||||||
|
* QEMU:
|
||||||
|
|
||||||
|
- Since parts of the RAM are shared between QEMU & remote process, a
|
||||||
|
memory-backend-memfd is required to facilitate this, as follows:
|
||||||
|
|
||||||
|
-object memory-backend-memfd,id=mem,size=2G
|
||||||
|
|
||||||
|
- A "x-pci-proxy-dev" device is created for each of the PCI devices emulated
|
||||||
|
in the remote process. A "socket" sub-option specifies the other end of the
|
||||||
|
unix channel created by the orchestrator. The "id" sub-option must be specified
|
||||||
|
and should be the same as the "id" specified for the remote PCI device
|
||||||
|
|
||||||
|
- Example command-line for QEMU is as follows:
|
||||||
|
|
||||||
|
-device x-pci-proxy-dev,id=lsi0,socket=3
|
|
@ -27,6 +27,7 @@ source pci-host/Kconfig
|
||||||
source pcmcia/Kconfig
|
source pcmcia/Kconfig
|
||||||
source pci/Kconfig
|
source pci/Kconfig
|
||||||
source rdma/Kconfig
|
source rdma/Kconfig
|
||||||
|
source remote/Kconfig
|
||||||
source rtc/Kconfig
|
source rtc/Kconfig
|
||||||
source scsi/Kconfig
|
source scsi/Kconfig
|
||||||
source sd/Kconfig
|
source sd/Kconfig
|
||||||
|
|
|
@ -56,6 +56,7 @@ subdir('moxie')
|
||||||
subdir('nios2')
|
subdir('nios2')
|
||||||
subdir('openrisc')
|
subdir('openrisc')
|
||||||
subdir('ppc')
|
subdir('ppc')
|
||||||
|
subdir('remote')
|
||||||
subdir('riscv')
|
subdir('riscv')
|
||||||
subdir('rx')
|
subdir('rx')
|
||||||
subdir('s390x')
|
subdir('s390x')
|
||||||
|
|
|
@ -495,7 +495,8 @@ static void process_msg_shmem(IVShmemState *s, int fd, Error **errp)
|
||||||
|
|
||||||
/* mmap the region and map into the BAR2 */
|
/* mmap the region and map into the BAR2 */
|
||||||
memory_region_init_ram_from_fd(&s->server_bar2, OBJECT(s),
|
memory_region_init_ram_from_fd(&s->server_bar2, OBJECT(s),
|
||||||
"ivshmem.bar2", size, true, fd, &local_err);
|
"ivshmem.bar2", size, true, fd, 0,
|
||||||
|
&local_err);
|
||||||
if (local_err) {
|
if (local_err) {
|
||||||
error_propagate(errp, local_err);
|
error_propagate(errp, local_err);
|
||||||
return;
|
return;
|
||||||
|
|
|
@ -65,3 +65,6 @@ config PCI_POWERNV
|
||||||
select PCI_EXPRESS
|
select PCI_EXPRESS
|
||||||
select MSI_NONBROKEN
|
select MSI_NONBROKEN
|
||||||
select PCIE_PORT
|
select PCIE_PORT
|
||||||
|
|
||||||
|
config REMOTE_PCIHOST
|
||||||
|
bool
|
||||||
|
|
|
@ -9,6 +9,7 @@ pci_ss.add(when: 'CONFIG_PCI_EXPRESS_XILINX', if_true: files('xilinx-pcie.c'))
|
||||||
pci_ss.add(when: 'CONFIG_PCI_I440FX', if_true: files('i440fx.c'))
|
pci_ss.add(when: 'CONFIG_PCI_I440FX', if_true: files('i440fx.c'))
|
||||||
pci_ss.add(when: 'CONFIG_PCI_SABRE', if_true: files('sabre.c'))
|
pci_ss.add(when: 'CONFIG_PCI_SABRE', if_true: files('sabre.c'))
|
||||||
pci_ss.add(when: 'CONFIG_XEN_IGD_PASSTHROUGH', if_true: files('xen_igd_pt.c'))
|
pci_ss.add(when: 'CONFIG_XEN_IGD_PASSTHROUGH', if_true: files('xen_igd_pt.c'))
|
||||||
|
pci_ss.add(when: 'CONFIG_REMOTE_PCIHOST', if_true: files('remote.c'))
|
||||||
|
|
||||||
# PPC devices
|
# PPC devices
|
||||||
pci_ss.add(when: 'CONFIG_PREP_PCI', if_true: files('prep.c'))
|
pci_ss.add(when: 'CONFIG_PREP_PCI', if_true: files('prep.c'))
|
||||||
|
|
75
hw/pci-host/remote.c
Normal file
75
hw/pci-host/remote.c
Normal file
|
@ -0,0 +1,75 @@
|
||||||
|
/*
|
||||||
|
* Remote PCI host device
|
||||||
|
*
|
||||||
|
* Unlike PCI host devices that model physical hardware, the purpose
|
||||||
|
* of this PCI host is to host multi-process QEMU devices.
|
||||||
|
*
|
||||||
|
* Multi-process QEMU extends the PCI host of a QEMU machine into a
|
||||||
|
* remote process. Any PCI device attached to the remote process is
|
||||||
|
* visible in the QEMU guest. This allows existing QEMU device models
|
||||||
|
* to be reused in the remote process.
|
||||||
|
*
|
||||||
|
* This PCI host is purely a container for PCI devices. It's fake in the
|
||||||
|
* sense that the guest never sees this PCI host and has no way of
|
||||||
|
* accessing it. Its job is just to provide the environment that QEMU
|
||||||
|
* PCI device models need when running in a remote process.
|
||||||
|
*
|
||||||
|
* Copyright © 2018, 2021 Oracle and/or its affiliates.
|
||||||
|
*
|
||||||
|
* This work is licensed under the terms of the GNU GPL, version 2 or later.
|
||||||
|
* See the COPYING file in the top-level directory.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "qemu/osdep.h"
|
||||||
|
#include "qemu-common.h"
|
||||||
|
|
||||||
|
#include "hw/pci/pci.h"
|
||||||
|
#include "hw/pci/pci_host.h"
|
||||||
|
#include "hw/pci/pcie_host.h"
|
||||||
|
#include "hw/qdev-properties.h"
|
||||||
|
#include "hw/pci-host/remote.h"
|
||||||
|
#include "exec/memory.h"
|
||||||
|
|
||||||
|
static const char *remote_pcihost_root_bus_path(PCIHostState *host_bridge,
|
||||||
|
PCIBus *rootbus)
|
||||||
|
{
|
||||||
|
return "0000:00";
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * DeviceClass::realize hook: create the root bus of the remote PCI host.
 *
 * The bus is named "remote-pci", is of type TYPE_PCIE_BUS, starts at
 * devfn 0 and uses the mr_pci_mem / mr_sys_io regions already stored in
 * the RemotePCIHost. @errp is not written by this function.
 */
static void remote_pcihost_realize(DeviceState *dev, Error **errp)
{
    PCIHostState *host = PCI_HOST_BRIDGE(dev);
    RemotePCIHost *remote = REMOTE_PCIHOST(dev);
    PCIBus *root_bus;

    root_bus = pci_root_bus_new(DEVICE(remote), "remote-pci",
                                remote->mr_pci_mem, remote->mr_sys_io,
                                0, TYPE_PCIE_BUS);
    host->bus = root_bus;
}
|
||||||
|
|
||||||
|
/*
 * Class initialiser for the remote PCI host: installs the realize and
 * root-bus-path hooks, tags the device as a bridge with firmware name
 * "pci", and marks it as not user creatable.
 */
static void remote_pcihost_class_init(ObjectClass *klass, void *data)
{
    DeviceClass *dev_class = DEVICE_CLASS(klass);
    PCIHostBridgeClass *host_class = PCI_HOST_BRIDGE_CLASS(klass);

    /* Host-bridge level hook */
    host_class->root_bus_path = remote_pcihost_root_bus_path;

    /* Generic device level settings */
    dev_class->realize = remote_pcihost_realize;
    dev_class->fw_name = "pci";
    dev_class->user_creatable = false;
    set_bit(DEVICE_CATEGORY_BRIDGE, dev_class->categories);
}
|
||||||
|
|
||||||
|
static const TypeInfo remote_pcihost_info = {
|
||||||
|
.name = TYPE_REMOTE_PCIHOST,
|
||||||
|
.parent = TYPE_PCIE_HOST_BRIDGE,
|
||||||
|
.instance_size = sizeof(RemotePCIHost),
|
||||||
|
.class_init = remote_pcihost_class_init,
|
||||||
|
};
|
||||||
|
|
||||||
|
static void remote_pcihost_register(void)
|
||||||
|
{
|
||||||
|
type_register_static(&remote_pcihost_info);
|
||||||
|
}
|
||||||
|
|
||||||
|
type_init(remote_pcihost_register)
|
4
hw/remote/Kconfig
Normal file
4
hw/remote/Kconfig
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
config MULTIPROCESS
|
||||||
|
bool
|
||||||
|
depends on PCI && PCI_EXPRESS && KVM
|
||||||
|
select REMOTE_PCIHOST
|
119
hw/remote/iohub.c
Normal file
119
hw/remote/iohub.c
Normal file
|
@ -0,0 +1,119 @@
|
||||||
|
/*
|
||||||
|
* Remote IO Hub
|
||||||
|
*
|
||||||
|
* Copyright © 2018, 2021 Oracle and/or its affiliates.
|
||||||
|
*
|
||||||
|
* This work is licensed under the terms of the GNU GPL, version 2 or later.
|
||||||
|
* See the COPYING file in the top-level directory.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "qemu/osdep.h"
|
||||||
|
#include "qemu-common.h"
|
||||||
|
|
||||||
|
#include "hw/pci/pci.h"
|
||||||
|
#include "hw/pci/pci_ids.h"
|
||||||
|
#include "hw/pci/pci_bus.h"
|
||||||
|
#include "qemu/thread.h"
|
||||||
|
#include "hw/boards.h"
|
||||||
|
#include "hw/remote/machine.h"
|
||||||
|
#include "hw/remote/iohub.h"
|
||||||
|
#include "qemu/main-loop.h"
|
||||||
|
|
||||||
|
/*
 * Bring every PIRQ slot of @iohub into its idle state: notifier arrays
 * zeroed, a freshly initialised level lock, an assertion count of zero,
 * and irq/resample notifiers initialised with fd -1.
 */
void remote_iohub_init(RemoteIOHubState *iohub)
{
    int slot = 0;

    memset(&iohub->irqfds, 0, sizeof(iohub->irqfds));
    memset(&iohub->resamplefds, 0, sizeof(iohub->resamplefds));

    while (slot < REMOTE_IOHUB_NB_PIRQS) {
        qemu_mutex_init(&iohub->irq_level_lock[slot]);
        iohub->irq_level[slot] = 0;
        event_notifier_init_fd(&iohub->irqfds[slot], -1);
        event_notifier_init_fd(&iohub->resamplefds[slot], -1);
        slot++;
    }
}
|
||||||
|
|
||||||
|
/*
 * Tear down every PIRQ slot of @iohub: unregister the fd handler of the
 * resample notifier, clean up both notifiers, then destroy the level lock.
 */
void remote_iohub_finalize(RemoteIOHubState *iohub)
{
    int slot = 0;

    while (slot < REMOTE_IOHUB_NB_PIRQS) {
        int resample_fd = event_notifier_get_fd(&iohub->resamplefds[slot]);

        /* Passing NULL callbacks removes any handler installed on the fd */
        qemu_set_fd_handler(resample_fd, NULL, NULL, NULL);

        event_notifier_cleanup(&iohub->irqfds[slot]);
        event_notifier_cleanup(&iohub->resamplefds[slot]);
        qemu_mutex_destroy(&iohub->irq_level_lock[slot]);
        slot++;
    }
}
|
||||||
|
|
||||||
|
/*
 * Map a device interrupt to a hub PIRQ. The PIRQ number is simply the
 * device's devfn; @intx does not take part in the mapping.
 */
int remote_iohub_map_irq(PCIDevice *pci_dev, int intx)
{
    int pirq = pci_dev->devfn;

    return pirq;
}
|
||||||
|
|
||||||
|
void remote_iohub_set_irq(void *opaque, int pirq, int level)
|
||||||
|
{
|
||||||
|
RemoteIOHubState *iohub = opaque;
|
||||||
|
|
||||||
|
assert(pirq >= 0);
|
||||||
|
assert(pirq < PCI_DEVFN_MAX);
|
||||||
|
|
||||||
|
QEMU_LOCK_GUARD(&iohub->irq_level_lock[pirq]);
|
||||||
|
|
||||||
|
if (level) {
|
||||||
|
if (++iohub->irq_level[pirq] == 1) {
|
||||||
|
event_notifier_set(&iohub->irqfds[pirq]);
|
||||||
|
}
|
||||||
|
} else if (iohub->irq_level[pirq] > 0) {
|
||||||
|
iohub->irq_level[pirq]--;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void intr_resample_handler(void *opaque)
|
||||||
|
{
|
||||||
|
ResampleToken *token = opaque;
|
||||||
|
RemoteIOHubState *iohub = token->iohub;
|
||||||
|
int pirq, s;
|
||||||
|
|
||||||
|
pirq = token->pirq;
|
||||||
|
|
||||||
|
s = event_notifier_test_and_clear(&iohub->resamplefds[pirq]);
|
||||||
|
|
||||||
|
assert(s >= 0);
|
||||||
|
|
||||||
|
QEMU_LOCK_GUARD(&iohub->irq_level_lock[pirq]);
|
||||||
|
|
||||||
|
if (iohub->irq_level[pirq]) {
|
||||||
|
event_notifier_set(&iohub->irqfds[pirq]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Install the irq and resample file descriptors carried by @msg for the
 * PIRQ that @pci_dev maps to.
 *
 * msg->fds[0] becomes the irqfd and msg->fds[1] the resample fd. If the
 * PIRQ already had an irqfd (fd != -1), the old resample handler is
 * removed and the old notifiers and token are cleaned up first. A resample
 * handler is then registered on the new resample fd.
 */
void process_set_irqfd_msg(PCIDevice *pci_dev, MPQemuMsg *msg)
{
    RemoteMachineState *machine = REMOTE_MACHINE(current_machine);
    RemoteIOHubState *iohub = &machine->iohub;
    int intx = pci_get_byte(pci_dev->config + PCI_INTERRUPT_PIN) - 1;
    int pirq = remote_iohub_map_irq(pci_dev, intx);
    bool already_set = event_notifier_get_fd(&iohub->irqfds[pirq]) != -1;

    if (already_set) {
        int old_resample_fd = event_notifier_get_fd(&iohub->resamplefds[pirq]);

        qemu_set_fd_handler(old_resample_fd, NULL, NULL, NULL);
        event_notifier_cleanup(&iohub->irqfds[pirq]);
        event_notifier_cleanup(&iohub->resamplefds[pirq]);
        memset(&iohub->token[pirq], 0, sizeof(ResampleToken));
    }

    event_notifier_init_fd(&iohub->irqfds[pirq], msg->fds[0]);
    event_notifier_init_fd(&iohub->resamplefds[pirq], msg->fds[1]);

    /* The token tells the resample handler which hub/PIRQ it serves */
    iohub->token[pirq].iohub = iohub;
    iohub->token[pirq].pirq = pirq;

    qemu_set_fd_handler(msg->fds[1], intr_resample_handler, NULL,
                        &iohub->token[pirq]);
}
|
80
hw/remote/machine.c
Normal file
80
hw/remote/machine.c
Normal file
|
@ -0,0 +1,80 @@
|
||||||
|
/*
|
||||||
|
* Machine for remote device
|
||||||
|
*
|
||||||
|
* This machine type is used by the remote device process in multi-process
|
||||||
|
* QEMU. QEMU device models depend on parent busses, interrupt controllers,
|
||||||
|
* memory regions, etc. The remote machine type offers this environment so
|
||||||
|
* that QEMU device models can be used as remote devices.
|
||||||
|
*
|
||||||
|
* Copyright © 2018, 2021 Oracle and/or its affiliates.
|
||||||
|
*
|
||||||
|
* This work is licensed under the terms of the GNU GPL, version 2 or later.
|
||||||
|
* See the COPYING file in the top-level directory.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "qemu/osdep.h"
|
||||||
|
#include "qemu-common.h"
|
||||||
|
|
||||||
|
#include "hw/remote/machine.h"
|
||||||
|
#include "exec/address-spaces.h"
|
||||||
|
#include "exec/memory.h"
|
||||||
|
#include "qapi/error.h"
|
||||||
|
#include "hw/pci/pci_host.h"
|
||||||
|
#include "hw/remote/iohub.h"
|
||||||
|
|
||||||
|
/*
 * MachineClass::init hook of the remote machine.
 *
 * Creates a "pci" memory region, creates the remote PCI host and hands it
 * the PCI, system-memory and system-IO regions, attaches the host as the
 * "remote-pcihost" child of the machine, overlays the PCI region on system
 * memory at address 0 with priority -1, realizes the host on the default
 * sysbus, and finally initialises the IO hub and installs it as the
 * interrupt router of the host's root bus.
 */
static void remote_machine_init(MachineState *machine)
{
    RemoteMachineState *s = REMOTE_MACHINE(machine);
    MemoryRegion *sysmem = get_system_memory();
    MemoryRegion *sysio = get_system_io();
    MemoryRegion *pcimem;
    RemotePCIHost *rem_host;

    pcimem = g_new(MemoryRegion, 1);
    memory_region_init(pcimem, NULL, "pci", UINT64_MAX);

    rem_host = REMOTE_PCIHOST(qdev_new(TYPE_REMOTE_PCIHOST));
    rem_host->mr_pci_mem = pcimem;
    rem_host->mr_sys_mem = sysmem;
    rem_host->mr_sys_io = sysio;
    s->host = rem_host;

    object_property_add_child(OBJECT(s), "remote-pcihost", OBJECT(rem_host));
    memory_region_add_subregion_overlap(sysmem, 0x0, pcimem, -1);

    /* A realize failure is fatal for the remote process */
    qdev_realize(DEVICE(rem_host), sysbus_get_default(), &error_fatal);

    remote_iohub_init(&s->iohub);

    pci_bus_irqs(PCI_HOST_BRIDGE(rem_host)->bus,
                 remote_iohub_set_irq, remote_iohub_map_irq,
                 &s->iohub, REMOTE_IOHUB_NB_PIRQS);
}
|
||||||
|
|
||||||
|
/* Class initialiser: set the description and init hook of the remote machine */
static void remote_machine_class_init(ObjectClass *oc, void *data)
{
    MachineClass *machine_class = MACHINE_CLASS(oc);

    machine_class->desc = "Experimental remote machine";
    machine_class->init = remote_machine_init;
}
|
||||||
|
|
||||||
|
static const TypeInfo remote_machine = {
|
||||||
|
.name = TYPE_REMOTE_MACHINE,
|
||||||
|
.parent = TYPE_MACHINE,
|
||||||
|
.instance_size = sizeof(RemoteMachineState),
|
||||||
|
.class_init = remote_machine_class_init,
|
||||||
|
};
|
||||||
|
|
||||||
|
static void remote_machine_register_types(void)
|
||||||
|
{
|
||||||
|
type_register_static(&remote_machine);
|
||||||
|
}
|
||||||
|
|
||||||
|
type_init(remote_machine_register_types);
|
65
hw/remote/memory.c
Normal file
65
hw/remote/memory.c
Normal file
|
@ -0,0 +1,65 @@
|
||||||
|
/*
|
||||||
|
* Memory manager for remote device
|
||||||
|
*
|
||||||
|
* Copyright © 2018, 2021 Oracle and/or its affiliates.
|
||||||
|
*
|
||||||
|
* This work is licensed under the terms of the GNU GPL, version 2 or later.
|
||||||
|
* See the COPYING file in the top-level directory.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "qemu/osdep.h"
|
||||||
|
#include "qemu-common.h"
|
||||||
|
|
||||||
|
#include "hw/remote/memory.h"
|
||||||
|
#include "exec/address-spaces.h"
|
||||||
|
#include "exec/ram_addr.h"
|
||||||
|
#include "qapi/error.h"
|
||||||
|
|
||||||
|
static void remote_sysmem_reset(void)
|
||||||
|
{
|
||||||
|
MemoryRegion *sysmem, *subregion, *next;
|
||||||
|
|
||||||
|
sysmem = get_system_memory();
|
||||||
|
|
||||||
|
QTAILQ_FOREACH_SAFE(subregion, &sysmem->subregions, subregions_link, next) {
|
||||||
|
if (subregion->ram) {
|
||||||
|
memory_region_del_subregion(sysmem, subregion);
|
||||||
|
object_unparent(OBJECT(subregion));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Rebuild the RAM layout of the remote process from a sync-sysmem message.
 *
 * All RAM subregions currently attached to system memory are dropped, then
 * one fd-backed RAM region is created per file descriptor in @msg, using
 * the size, fd offset and guest physical address found at the same index
 * of the message payload.
 *
 * @msg:  message whose data.sync_sysmem payload and fds[] describe the
 *        regions; msg->num_fds is the number of regions.
 * @errp: set if creating a region fails; in that case the regions added so
 *        far are removed again before returning.
 */
void remote_sysmem_reconfig(MPQemuMsg *msg, Error **errp)
{
    ERRP_GUARD();
    SyncSysmemMsg *sysmem_info = &msg->data.sync_sysmem;
    MemoryRegion *sysmem, *subregion;
    /* Keeps growing across calls so that every region name is unique */
    static unsigned int suffix;
    int region;

    sysmem = get_system_memory();

    remote_sysmem_reset();

    for (region = 0; region < msg->num_fds; region++) {
        /*
         * g_autofree variables must be initialised where they are
         * declared, so that the cleanup handler never sees garbage.
         */
        g_autofree char *name = g_strdup_printf("remote-mem-%u", suffix++);

        subregion = g_new(MemoryRegion, 1);
        memory_region_init_ram_from_fd(subregion, NULL,
                                       name, sysmem_info->sizes[region],
                                       true, msg->fds[region],
                                       sysmem_info->offsets[region],
                                       errp);

        if (*errp) {
            g_free(subregion);
            remote_sysmem_reset();
            return;
        }

        memory_region_add_subregion(sysmem, sysmem_info->gpas[region],
                                    subregion);
    }
}
|
13
hw/remote/meson.build
Normal file
13
hw/remote/meson.build
Normal file
|
@ -0,0 +1,13 @@
|
||||||
|
remote_ss = ss.source_set()
|
||||||
|
|
||||||
|
remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('machine.c'))
|
||||||
|
remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('mpqemu-link.c'))
|
||||||
|
remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('message.c'))
|
||||||
|
remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('remote-obj.c'))
|
||||||
|
remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('proxy.c'))
|
||||||
|
remote_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('iohub.c'))
|
||||||
|
|
||||||
|
specific_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('memory.c'))
|
||||||
|
specific_ss.add(when: 'CONFIG_MULTIPROCESS', if_true: files('proxy-memory-listener.c'))
|
||||||
|
|
||||||
|
softmmu_ss.add_all(when: 'CONFIG_MULTIPROCESS', if_true: remote_ss)
|
230
hw/remote/message.c
Normal file
230
hw/remote/message.c
Normal file
|
@ -0,0 +1,230 @@
|
||||||
|
/*
|
||||||
|
* Copyright © 2020, 2021 Oracle and/or its affiliates.
|
||||||
|
*
|
||||||
|
* This work is licensed under the terms of the GNU GPL-v2, version 2 or later.
|
||||||
|
*
|
||||||
|
* See the COPYING file in the top-level directory.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "qemu/osdep.h"
|
||||||
|
#include "qemu-common.h"
|
||||||
|
|
||||||
|
#include "hw/remote/machine.h"
|
||||||
|
#include "io/channel.h"
|
||||||
|
#include "hw/remote/mpqemu-link.h"
|
||||||
|
#include "qapi/error.h"
|
||||||
|
#include "sysemu/runstate.h"
|
||||||
|
#include "hw/pci/pci.h"
|
||||||
|
#include "exec/memattrs.h"
|
||||||
|
#include "hw/remote/memory.h"
|
||||||
|
#include "hw/remote/iohub.h"
|
||||||
|
#include "sysemu/reset.h"
|
||||||
|
|
||||||
|
static void process_config_write(QIOChannel *ioc, PCIDevice *dev,
|
||||||
|
MPQemuMsg *msg, Error **errp);
|
||||||
|
static void process_config_read(QIOChannel *ioc, PCIDevice *dev,
|
||||||
|
MPQemuMsg *msg, Error **errp);
|
||||||
|
static void process_bar_write(QIOChannel *ioc, MPQemuMsg *msg, Error **errp);
|
||||||
|
static void process_bar_read(QIOChannel *ioc, MPQemuMsg *msg, Error **errp);
|
||||||
|
static void process_device_reset_msg(QIOChannel *ioc, PCIDevice *dev,
|
||||||
|
Error **errp);
|
||||||
|
|
||||||
|
/*
 * Coroutine entry point of the remote process' message loop.
 *
 * @data is a RemoteCommDev; it is freed automatically when this function
 * returns. Messages are received from com->ioc and dispatched by command
 * until a receive fails, an invalid or unknown message arrives, or a
 * handler reports an error.
 *
 * On exit a shutdown is requested: with SHUTDOWN_CAUSE_HOST_ERROR (after
 * reporting the error) if an error was recorded, otherwise with
 * SHUTDOWN_CAUSE_GUEST_SHUTDOWN.
 */
void coroutine_fn mpqemu_remote_msg_loop_co(void *data)
{
    g_autofree RemoteCommDev *com = (RemoteCommDev *)data;
    PCIDevice *pci_dev = NULL;
    Error *local_err = NULL;

    assert(com->ioc);

    pci_dev = com->dev;
    while (!local_err) {
        MPQemuMsg msg = {0};

        if (!mpqemu_msg_recv(&msg, com->ioc, &local_err)) {
            break;
        }

        if (!mpqemu_msg_valid(&msg)) {
            /* Note the trailing space: the two literals are concatenated */
            error_setg(&local_err, "Received invalid message from proxy "
                                   "in remote process pid="FMT_pid"",
                                   getpid());
            break;
        }

        switch (msg.cmd) {
        case MPQEMU_CMD_PCI_CFGWRITE:
            process_config_write(com->ioc, pci_dev, &msg, &local_err);
            break;
        case MPQEMU_CMD_PCI_CFGREAD:
            process_config_read(com->ioc, pci_dev, &msg, &local_err);
            break;
        case MPQEMU_CMD_BAR_WRITE:
            process_bar_write(com->ioc, &msg, &local_err);
            break;
        case MPQEMU_CMD_BAR_READ:
            process_bar_read(com->ioc, &msg, &local_err);
            break;
        case MPQEMU_CMD_SYNC_SYSMEM:
            remote_sysmem_reconfig(&msg, &local_err);
            break;
        case MPQEMU_CMD_SET_IRQFD:
            process_set_irqfd_msg(pci_dev, &msg);
            break;
        case MPQEMU_CMD_DEVICE_RESET:
            process_device_reset_msg(com->ioc, pci_dev, &local_err);
            break;
        default:
            /* Setting local_err ends the loop at the next condition check */
            error_setg(&local_err,
                       "Unknown command (%d) received for device %s"
                       " (pid="FMT_pid")",
                       msg.cmd, DEVICE(pci_dev)->id, getpid());
            break;
        }
    }

    if (local_err) {
        error_report_err(local_err);
        qemu_system_shutdown_request(SHUTDOWN_CAUSE_HOST_ERROR);
    } else {
        qemu_system_shutdown_request(SHUTDOWN_CAUSE_GUEST_SHUTDOWN);
    }
}
|
||||||
|
|
||||||
|
static void process_config_write(QIOChannel *ioc, PCIDevice *dev,
|
||||||
|
MPQemuMsg *msg, Error **errp)
|
||||||
|
{
|
||||||
|
ERRP_GUARD();
|
||||||
|
PciConfDataMsg *conf = (PciConfDataMsg *)&msg->data.pci_conf_data;
|
||||||
|
MPQemuMsg ret = { 0 };
|
||||||
|
|
||||||
|
if ((conf->addr + sizeof(conf->val)) > pci_config_size(dev)) {
|
||||||
|
error_setg(errp, "Bad address for PCI config write, pid "FMT_pid".",
|
||||||
|
getpid());
|
||||||
|
ret.data.u64 = UINT64_MAX;
|
||||||
|
} else {
|
||||||
|
pci_default_write_config(dev, conf->addr, conf->val, conf->len);
|
||||||
|
}
|
||||||
|
|
||||||
|
ret.cmd = MPQEMU_CMD_RET;
|
||||||
|
ret.size = sizeof(ret.data.u64);
|
||||||
|
|
||||||
|
if (!mpqemu_msg_send(&ret, ioc, NULL)) {
|
||||||
|
error_prepend(errp, "Error returning code to proxy, pid "FMT_pid": ",
|
||||||
|
getpid());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void process_config_read(QIOChannel *ioc, PCIDevice *dev,
|
||||||
|
MPQemuMsg *msg, Error **errp)
|
||||||
|
{
|
||||||
|
ERRP_GUARD();
|
||||||
|
PciConfDataMsg *conf = (PciConfDataMsg *)&msg->data.pci_conf_data;
|
||||||
|
MPQemuMsg ret = { 0 };
|
||||||
|
|
||||||
|
if ((conf->addr + sizeof(conf->val)) > pci_config_size(dev)) {
|
||||||
|
error_setg(errp, "Bad address for PCI config read, pid "FMT_pid".",
|
||||||
|
getpid());
|
||||||
|
ret.data.u64 = UINT64_MAX;
|
||||||
|
} else {
|
||||||
|
ret.data.u64 = pci_default_read_config(dev, conf->addr, conf->len);
|
||||||
|
}
|
||||||
|
|
||||||
|
ret.cmd = MPQEMU_CMD_RET;
|
||||||
|
ret.size = sizeof(ret.data.u64);
|
||||||
|
|
||||||
|
if (!mpqemu_msg_send(&ret, ioc, NULL)) {
|
||||||
|
error_prepend(errp, "Error returning code to proxy, pid "FMT_pid": ",
|
||||||
|
getpid());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Handle a BAR write request and send the status reply to the proxy.
 *
 * @ioc:  channel the reply is sent on.
 * @msg:  request; its data.bar_access payload holds addr, val, size and
 *        the memory/IO selector.
 * @errp: set if the write fails or if the reply cannot be sent.
 *
 * The access size must be a power of two no larger than 8 bytes; otherwise
 * no access is made and the reply carries UINT64_MAX (no error is recorded
 * for that case). A failed write also replies with UINT64_MAX.
 */
static void process_bar_write(QIOChannel *ioc, MPQemuMsg *msg, Error **errp)
{
    ERRP_GUARD();
    BarAccessMsg *bar_access = &msg->data.bar_access;
    AddressSpace *as =
        bar_access->memory ? &address_space_memory : &address_space_io;
    MPQemuMsg ret = { 0 };
    MemTxResult res;
    uint64_t val;

    if (!is_power_of_2(bar_access->size) ||
       (bar_access->size > sizeof(uint64_t))) {
        ret.data.u64 = UINT64_MAX;
        goto fail;
    }

    val = cpu_to_le64(bar_access->val);

    res = address_space_rw(as, bar_access->addr, MEMTXATTRS_UNSPECIFIED,
                           (void *)&val, bar_access->size, true);

    if (res != MEMTX_OK) {
        error_setg(errp, "Bad address %"PRIx64" for mem write, pid "FMT_pid".",
                   bar_access->addr, getpid());
        ret.data.u64 = UINT64_MAX;
    }

fail:
    ret.cmd = MPQEMU_CMD_RET;
    ret.size = sizeof(ret.data.u64);

    if (!mpqemu_msg_send(&ret, ioc, NULL)) {
        if (*errp) {
            error_prepend(errp, "Error returning code to proxy, pid "FMT_pid": ",
                          getpid());
        } else {
            /*
             * No error has been recorded yet (always the case on the
             * invalid-size path), so there is nothing to prepend to;
             * report the send failure on its own.
             */
            error_setg(errp, "Error returning code to proxy, pid "FMT_pid".",
                       getpid());
        }
    }
}
|
||||||
|
|
||||||
|
/*
 * Handle a BAR read request and send the value read to the proxy.
 *
 * @ioc:  channel the reply is sent on.
 * @msg:  request; its data.bar_access payload holds addr, size and the
 *        memory/IO selector.
 * @errp: set if the read fails or if the reply cannot be sent.
 *
 * The access size must be a power of two no larger than 8 bytes; otherwise
 * no access is made and the reply carries UINT64_MAX (no error is recorded
 * for that case). A failed read also replies with UINT64_MAX.
 */
static void process_bar_read(QIOChannel *ioc, MPQemuMsg *msg, Error **errp)
{
    ERRP_GUARD();
    BarAccessMsg *bar_access = &msg->data.bar_access;
    MPQemuMsg ret = { 0 };
    AddressSpace *as;
    MemTxResult res;
    uint64_t val = 0;

    as = bar_access->memory ? &address_space_memory : &address_space_io;

    if (!is_power_of_2(bar_access->size) ||
       (bar_access->size > sizeof(uint64_t))) {
        val = UINT64_MAX;
        goto fail;
    }

    res = address_space_rw(as, bar_access->addr, MEMTXATTRS_UNSPECIFIED,
                           (void *)&val, bar_access->size, false);

    if (res != MEMTX_OK) {
        error_setg(errp, "Bad address %"PRIx64" for mem read, pid "FMT_pid".",
                   bar_access->addr, getpid());
        val = UINT64_MAX;
    }

fail:
    ret.cmd = MPQEMU_CMD_RET;
    ret.data.u64 = le64_to_cpu(val);
    ret.size = sizeof(ret.data.u64);

    if (!mpqemu_msg_send(&ret, ioc, NULL)) {
        if (*errp) {
            error_prepend(errp, "Error returning code to proxy, pid "FMT_pid": ",
                          getpid());
        } else {
            /*
             * No error has been recorded yet (always the case on the
             * invalid-size path), so there is nothing to prepend to;
             * report the send failure on its own.
             */
            error_setg(errp, "Error returning code to proxy, pid "FMT_pid".",
                       getpid());
        }
    }
}
|
||||||
|
|
||||||
|
static void process_device_reset_msg(QIOChannel *ioc, PCIDevice *dev,
|
||||||
|
Error **errp)
|
||||||
|
{
|
||||||
|
DeviceClass *dc = DEVICE_GET_CLASS(dev);
|
||||||
|
DeviceState *s = DEVICE(dev);
|
||||||
|
MPQemuMsg ret = { 0 };
|
||||||
|
|
||||||
|
if (dc->reset) {
|
||||||
|
dc->reset(s);
|
||||||
|
}
|
||||||
|
|
||||||
|
ret.cmd = MPQEMU_CMD_RET;
|
||||||
|
|
||||||
|
mpqemu_msg_send(&ret, ioc, errp);
|
||||||
|
}
|
267
hw/remote/mpqemu-link.c
Normal file
267
hw/remote/mpqemu-link.c
Normal file
|
@ -0,0 +1,267 @@
|
||||||
|
/*
|
||||||
|
* Communication channel between QEMU and remote device process
|
||||||
|
*
|
||||||
|
* Copyright © 2018, 2021 Oracle and/or its affiliates.
|
||||||
|
*
|
||||||
|
* This work is licensed under the terms of the GNU GPL, version 2 or later.
|
||||||
|
* See the COPYING file in the top-level directory.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "qemu/osdep.h"
|
||||||
|
#include "qemu-common.h"
|
||||||
|
|
||||||
|
#include "qemu/module.h"
|
||||||
|
#include "hw/remote/mpqemu-link.h"
|
||||||
|
#include "qapi/error.h"
|
||||||
|
#include "qemu/iov.h"
|
||||||
|
#include "qemu/error-report.h"
|
||||||
|
#include "qemu/main-loop.h"
|
||||||
|
#include "io/channel.h"
|
||||||
|
#include "sysemu/iothread.h"
|
||||||
|
#include "trace.h"
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Send message over the ioc QIOChannel.
|
||||||
|
* This function is safe to call from:
|
||||||
|
* - main loop in co-routine context. Will block the main loop if not in
|
||||||
|
* co-routine context;
|
||||||
|
* - vCPU thread with no co-routine context and if the channel is not part
|
||||||
|
* of the main loop handling;
|
||||||
|
* - IOThread within co-routine context, outside of co-routine context
|
||||||
|
* will block IOThread;
|
||||||
|
* Returns true if no errors were encountered, false otherwise.
|
||||||
|
*/
|
||||||
|
bool mpqemu_msg_send(MPQemuMsg *msg, QIOChannel *ioc, Error **errp)
|
||||||
|
{
|
||||||
|
ERRP_GUARD();
|
||||||
|
bool iolock = qemu_mutex_iothread_locked();
|
||||||
|
bool iothread = qemu_in_iothread();
|
||||||
|
struct iovec send[2] = {};
|
||||||
|
int *fds = NULL;
|
||||||
|
size_t nfds = 0;
|
||||||
|
bool ret = false;
|
||||||
|
|
||||||
|
send[0].iov_base = msg;
|
||||||
|
send[0].iov_len = MPQEMU_MSG_HDR_SIZE;
|
||||||
|
|
||||||
|
send[1].iov_base = (void *)&msg->data;
|
||||||
|
send[1].iov_len = msg->size;
|
||||||
|
|
||||||
|
if (msg->num_fds) {
|
||||||
|
nfds = msg->num_fds;
|
||||||
|
fds = msg->fds;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Dont use in IOThread out of co-routine context as
|
||||||
|
* it will block IOThread.
|
||||||
|
*/
|
||||||
|
assert(qemu_in_coroutine() || !iothread);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Skip unlocking/locking iothread lock when the IOThread is running
|
||||||
|
* in co-routine context. Co-routine context is asserted above
|
||||||
|
* for IOThread case.
|
||||||
|
* Also skip lock handling while in a co-routine in the main context.
|
||||||
|
*/
|
||||||
|
if (iolock && !iothread && !qemu_in_coroutine()) {
|
||||||
|
qemu_mutex_unlock_iothread();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!qio_channel_writev_full_all(ioc, send, G_N_ELEMENTS(send),
|
||||||
|
fds, nfds, errp)) {
|
||||||
|
ret = true;
|
||||||
|
} else {
|
||||||
|
trace_mpqemu_send_io_error(msg->cmd, msg->size, nfds);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (iolock && !iothread && !qemu_in_coroutine()) {
|
||||||
|
/* See above comment why skip locking here. */
|
||||||
|
qemu_mutex_lock_iothread();
|
||||||
|
}
|
||||||
|
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Read message from the ioc QIOChannel.
|
||||||
|
* This function is safe to call from:
|
||||||
|
* - From main loop in co-routine context. Will block the main loop if not in
|
||||||
|
* co-routine context;
|
||||||
|
* - From vCPU thread with no co-routine context and if the channel is not part
|
||||||
|
* of the main loop handling;
|
||||||
|
* - From IOThread within co-routine context, outside of co-routine context
|
||||||
|
* will block IOThread;
|
||||||
|
*/
|
||||||
|
static ssize_t mpqemu_read(QIOChannel *ioc, void *buf, size_t len, int **fds,
|
||||||
|
size_t *nfds, Error **errp)
|
||||||
|
{
|
||||||
|
ERRP_GUARD();
|
||||||
|
struct iovec iov = { .iov_base = buf, .iov_len = len };
|
||||||
|
bool iolock = qemu_mutex_iothread_locked();
|
||||||
|
bool iothread = qemu_in_iothread();
|
||||||
|
int ret = -1;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Dont use in IOThread out of co-routine context as
|
||||||
|
* it will block IOThread.
|
||||||
|
*/
|
||||||
|
assert(qemu_in_coroutine() || !iothread);
|
||||||
|
|
||||||
|
if (iolock && !iothread && !qemu_in_coroutine()) {
|
||||||
|
qemu_mutex_unlock_iothread();
|
||||||
|
}
|
||||||
|
|
||||||
|
ret = qio_channel_readv_full_all_eof(ioc, &iov, 1, fds, nfds, errp);
|
||||||
|
|
||||||
|
if (iolock && !iothread && !qemu_in_coroutine()) {
|
||||||
|
qemu_mutex_lock_iothread();
|
||||||
|
}
|
||||||
|
|
||||||
|
return (ret <= 0) ? ret : iov.iov_len;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Receive one message from the @ioc QIOChannel into @msg.
 *
 * The fixed-size header is read first (file descriptors, if any, arrive
 * with it), then msg->size bytes of payload.  On success the received
 * descriptors are copied into msg->fds and msg->num_fds is set.
 *
 * Returns true on success.  Returns false on failure; @errp is set unless
 * the underlying read reported a result <= 0 without setting an error.
 * Any file descriptors received are closed on every failure path, so a
 * caller never has to clean them up.
 */
bool mpqemu_msg_recv(MPQemuMsg *msg, QIOChannel *ioc, Error **errp)
{
    ERRP_GUARD();
    g_autofree int *fds = NULL;
    size_t nfds = 0;
    ssize_t len;
    bool ret = false;

    len = mpqemu_read(ioc, msg, MPQEMU_MSG_HDR_SIZE, &fds, &nfds, errp);
    if (len <= 0) {
        goto fail;
    } else if (len != MPQEMU_MSG_HDR_SIZE) {
        error_setg(errp, "Message header corrupted");
        goto fail;
    }

    /* Never read more payload than the message structure can hold. */
    if (msg->size > sizeof(msg->data)) {
        error_setg(errp, "Invalid size for message");
        goto fail;
    }

    if (!msg->size) {
        goto copy_fds;
    }

    len = mpqemu_read(ioc, &msg->data, msg->size, NULL, NULL, errp);
    if (len <= 0) {
        goto fail;
    }
    if (len != msg->size) {
        error_setg(errp, "Unable to read full message");
        goto fail;
    }

copy_fds:
    msg->num_fds = nfds;
    if (nfds > G_N_ELEMENTS(msg->fds)) {
        error_setg(errp,
                   "Overflow error: received %zu fds, more than max of %d fds",
                   nfds, REMOTE_MAX_FDS);
        goto fail;
    }
    if (nfds) {
        memcpy(msg->fds, fds, nfds * sizeof(int));
    }

    ret = true;

fail:
    if (*errp) {
        trace_mpqemu_recv_io_error(msg->cmd, msg->size, nfds);
    }

    /*
     * Close the descriptors whenever the message is rejected, not only
     * when an error object was set: a payload read that fails without
     * setting @errp would otherwise leak the descriptors that came in
     * with the header.
     */
    while (!ret && nfds) {
        close(fds[nfds - 1]);
        nfds--;
    }

    return ret;
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Send msg and wait for a reply with command code RET_MSG.
|
||||||
|
* Returns the message received of size u64 or UINT64_MAX
|
||||||
|
* on error.
|
||||||
|
* Called from VCPU thread in non-coroutine context.
|
||||||
|
* Used by the Proxy object to communicate to remote processes.
|
||||||
|
*/
|
||||||
|
uint64_t mpqemu_msg_send_and_await_reply(MPQemuMsg *msg, PCIProxyDev *pdev,
|
||||||
|
Error **errp)
|
||||||
|
{
|
||||||
|
ERRP_GUARD();
|
||||||
|
MPQemuMsg msg_reply = {0};
|
||||||
|
uint64_t ret = UINT64_MAX;
|
||||||
|
|
||||||
|
assert(!qemu_in_coroutine());
|
||||||
|
|
||||||
|
QEMU_LOCK_GUARD(&pdev->io_mutex);
|
||||||
|
if (!mpqemu_msg_send(msg, pdev->ioc, errp)) {
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!mpqemu_msg_recv(&msg_reply, pdev->ioc, errp)) {
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!mpqemu_msg_valid(&msg_reply) || msg_reply.cmd != MPQEMU_CMD_RET) {
|
||||||
|
error_setg(errp, "ERROR: Invalid reply received for command %d",
|
||||||
|
msg->cmd);
|
||||||
|
return ret;
|
||||||
|
}
|
||||||
|
|
||||||
|
return msg_reply.data.u64;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Sanity-check a message: the command code must be in range, the file
 * descriptor count must be below REMOTE_MAX_FDS with every descriptor
 * open, and the size / fd-count combination must match what the commands
 * checked below expect.
 *
 * Returns true if the message passes every check, false otherwise.
 */
bool mpqemu_msg_valid(MPQemuMsg *msg)
{
    /*
     * A command is out of range when EITHER bound is violated.  The two
     * conditions can never hold at once, so AND-ing them would disable
     * the check entirely.
     */
    if (msg->cmd >= MPQEMU_CMD_MAX || msg->cmd < 0) {
        return false;
    }

    /*
     * Verify FDs.
     * NOTE(review): num_fds == REMOTE_MAX_FDS is rejected here although
     * mpqemu_msg_recv() accepts up to G_N_ELEMENTS(msg->fds) descriptors
     * (presumably the same constant) -- confirm which bound is intended.
     */
    if (msg->num_fds >= REMOTE_MAX_FDS) {
        return false;
    }

    /* F_GETFL fails with -1 on a descriptor that is not open. */
    for (int i = 0; i < msg->num_fds; i++) {
        if (fcntl(msg->fds[i], F_GETFL) == -1) {
            return false;
        }
    }

    /* Verify message specific fields. */
    switch (msg->cmd) {
    case MPQEMU_CMD_SYNC_SYSMEM:
        if (msg->num_fds == 0 || msg->size != sizeof(SyncSysmemMsg)) {
            return false;
        }
        break;
    case MPQEMU_CMD_PCI_CFGWRITE:
    case MPQEMU_CMD_PCI_CFGREAD:
        if (msg->size != sizeof(PciConfDataMsg)) {
            return false;
        }
        break;
    case MPQEMU_CMD_BAR_WRITE:
    case MPQEMU_CMD_BAR_READ:
        if ((msg->size != sizeof(BarAccessMsg)) || (msg->num_fds != 0)) {
            return false;
        }
        break;
    case MPQEMU_CMD_SET_IRQFD:
        if (msg->size || (msg->num_fds != 2)) {
            return false;
        }
        break;
    default:
        break;
    }

    return true;
}
|
227
hw/remote/proxy-memory-listener.c
Normal file
227
hw/remote/proxy-memory-listener.c
Normal file
|
@ -0,0 +1,227 @@
|
||||||
|
/*
|
||||||
|
* Copyright © 2018, 2021 Oracle and/or its affiliates.
|
||||||
|
*
|
||||||
|
* This work is licensed under the terms of the GNU GPL, version 2 or later.
|
||||||
|
* See the COPYING file in the top-level directory.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "qemu/osdep.h"
|
||||||
|
#include "qemu-common.h"
|
||||||
|
|
||||||
|
#include "qemu/compiler.h"
|
||||||
|
#include "qemu/int128.h"
|
||||||
|
#include "qemu/range.h"
|
||||||
|
#include "exec/memory.h"
|
||||||
|
#include "exec/cpu-common.h"
|
||||||
|
#include "cpu.h"
|
||||||
|
#include "exec/ram_addr.h"
|
||||||
|
#include "exec/address-spaces.h"
|
||||||
|
#include "qapi/error.h"
|
||||||
|
#include "hw/remote/mpqemu-link.h"
|
||||||
|
#include "hw/remote/proxy-memory-listener.h"
|
||||||
|
|
||||||
|
/*
|
||||||
|
* TODO: get_fd_from_hostaddr(), proxy_mrs_can_merge() and
|
||||||
|
* proxy_memory_listener_commit() defined below perform tasks similar to the
|
||||||
|
* functions defined in vhost-user.c. These functions are good candidates
|
||||||
|
* for refactoring.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
 * Forget every recorded memory region section: drop the reference held on
 * each section's MemoryRegion, free the array and reset the count.
 */
static void proxy_memory_listener_reset(MemoryListener *listener)
{
    ProxyMemoryListener *pl = container_of(listener, ProxyMemoryListener,
                                           listener);
    int i;

    for (i = 0; i < pl->n_mr_sections; i++) {
        MemoryRegionSection *sec = &pl->mr_sections[i];

        memory_region_unref(sec->mr);
    }

    g_free(pl->mr_sections);
    pl->n_mr_sections = 0;
    pl->mr_sections = NULL;
}
|
||||||
|
|
||||||
|
/*
 * Look up the MemoryRegion backing host address @host and return the file
 * descriptor reported for it by memory_region_get_fd().
 *
 * If @offset is not NULL it receives the offset that
 * memory_region_from_host() reports for @host.
 */
static int get_fd_from_hostaddr(uint64_t host, ram_addr_t *offset)
{
    ram_addr_t region_off;
    MemoryRegion *region;

    /**
     * The host address is assumed to be valid because it comes from the
     * MemoryListener system.  For an invalid address the lookup below
     * would return the default subregion of the "system_memory" region
     * rather than NULL, so a NULL check here cannot detect that case.
     */
    region = memory_region_from_host((void *)(uintptr_t)host, &region_off);

    if (offset != NULL) {
        *offset = region_off;
    }

    return memory_region_get_fd(region);
}
|
||||||
|
|
||||||
|
/*
 * Two host ranges can be merged when @host starts exactly @size bytes
 * after @prev_host and both addresses resolve to the same file descriptor.
 */
static bool proxy_mrs_can_merge(uint64_t host, uint64_t prev_host, size_t size)
{
    /* The fd lookups are only performed for contiguous ranges. */
    return (prev_host + size) == host &&
           get_fd_from_hostaddr(host, NULL) ==
           get_fd_from_hostaddr(prev_host, NULL);
}
|
||||||
|
|
||||||
|
static bool try_merge(ProxyMemoryListener *proxy_listener,
|
||||||
|
MemoryRegionSection *section)
|
||||||
|
{
|
||||||
|
uint64_t mrs_size, mrs_gpa, mrs_page;
|
||||||
|
MemoryRegionSection *prev_sec;
|
||||||
|
bool merged = false;
|
||||||
|
uintptr_t mrs_host;
|
||||||
|
RAMBlock *mrs_rb;
|
||||||
|
|
||||||
|
if (!proxy_listener->n_mr_sections) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
mrs_rb = section->mr->ram_block;
|
||||||
|
mrs_page = (uint64_t)qemu_ram_pagesize(mrs_rb);
|
||||||
|
mrs_size = int128_get64(section->size);
|
||||||
|
mrs_gpa = section->offset_within_address_space;
|
||||||
|
mrs_host = (uintptr_t)memory_region_get_ram_ptr(section->mr) +
|
||||||
|
section->offset_within_region;
|
||||||
|
|
||||||
|
if (get_fd_from_hostaddr(mrs_host, NULL) < 0) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
mrs_host = mrs_host & ~(mrs_page - 1);
|
||||||
|
mrs_gpa = mrs_gpa & ~(mrs_page - 1);
|
||||||
|
mrs_size = ROUND_UP(mrs_size, mrs_page);
|
||||||
|
|
||||||
|
prev_sec = proxy_listener->mr_sections +
|
||||||
|
(proxy_listener->n_mr_sections - 1);
|
||||||
|
uint64_t prev_gpa_start = prev_sec->offset_within_address_space;
|
||||||
|
uint64_t prev_size = int128_get64(prev_sec->size);
|
||||||
|
uint64_t prev_gpa_end = range_get_last(prev_gpa_start, prev_size);
|
||||||
|
uint64_t prev_host_start =
|
||||||
|
(uintptr_t)memory_region_get_ram_ptr(prev_sec->mr) +
|
||||||
|
prev_sec->offset_within_region;
|
||||||
|
uint64_t prev_host_end = range_get_last(prev_host_start, prev_size);
|
||||||
|
|
||||||
|
if (mrs_gpa <= (prev_gpa_end + 1)) {
|
||||||
|
g_assert(mrs_gpa > prev_gpa_start);
|
||||||
|
|
||||||
|
if ((section->mr == prev_sec->mr) &&
|
||||||
|
proxy_mrs_can_merge(mrs_host, prev_host_start,
|
||||||
|
(mrs_gpa - prev_gpa_start))) {
|
||||||
|
uint64_t max_end = MAX(prev_host_end, mrs_host + mrs_size);
|
||||||
|
merged = true;
|
||||||
|
prev_sec->offset_within_address_space =
|
||||||
|
MIN(prev_gpa_start, mrs_gpa);
|
||||||
|
prev_sec->offset_within_region =
|
||||||
|
MIN(prev_host_start, mrs_host) -
|
||||||
|
(uintptr_t)memory_region_get_ram_ptr(prev_sec->mr);
|
||||||
|
prev_sec->size = int128_make64(max_end - MIN(prev_host_start,
|
||||||
|
mrs_host));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return merged;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void proxy_memory_listener_region_addnop(MemoryListener *listener,
|
||||||
|
MemoryRegionSection *section)
|
||||||
|
{
|
||||||
|
ProxyMemoryListener *proxy_listener = container_of(listener,
|
||||||
|
ProxyMemoryListener,
|
||||||
|
listener);
|
||||||
|
|
||||||
|
if (!memory_region_is_ram(section->mr) ||
|
||||||
|
memory_region_is_rom(section->mr)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (try_merge(proxy_listener, section)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
++proxy_listener->n_mr_sections;
|
||||||
|
proxy_listener->mr_sections = g_renew(MemoryRegionSection,
|
||||||
|
proxy_listener->mr_sections,
|
||||||
|
proxy_listener->n_mr_sections);
|
||||||
|
proxy_listener->mr_sections[proxy_listener->n_mr_sections - 1] = *section;
|
||||||
|
proxy_listener->mr_sections[proxy_listener->n_mr_sections - 1].fv = NULL;
|
||||||
|
memory_region_ref(section->mr);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * commit callback: send one MPQEMU_CMD_SYNC_SYSMEM message describing
 * every recorded section (guest physical address, size, backing file
 * descriptor and offset) over the listener's channel.
 *
 * Nothing is sent when more than REMOTE_MAX_FDS sections are recorded;
 * send failures are reported with error_report_err().
 */
static void proxy_memory_listener_commit(MemoryListener *listener)
{
    ProxyMemoryListener *pl = container_of(listener, ProxyMemoryListener,
                                           listener);
    Error *local_err = NULL;
    MPQemuMsg msg;
    int i;

    /* Clear the whole structure, unused array slots included. */
    memset(&msg, 0, sizeof(MPQemuMsg));

    msg.cmd = MPQEMU_CMD_SYNC_SYSMEM;
    msg.size = sizeof(SyncSysmemMsg);
    msg.num_fds = pl->n_mr_sections;

    if (msg.num_fds > REMOTE_MAX_FDS) {
        error_report("Number of fds is more than %d", REMOTE_MAX_FDS);
        return;
    }

    for (i = 0; i < pl->n_mr_sections; i++) {
        MemoryRegionSection *sec = &pl->mr_sections[i];
        uintptr_t host_addr;
        ram_addr_t offset;

        msg.data.sync_sysmem.gpas[i] = sec->offset_within_address_space;
        msg.data.sync_sysmem.sizes[i] = int128_get64(sec->size);

        host_addr = (uintptr_t)memory_region_get_ram_ptr(sec->mr) +
                    sec->offset_within_region;
        msg.fds[i] = get_fd_from_hostaddr(host_addr, &offset);
        msg.data.sync_sysmem.offsets[i] = offset;
    }

    if (!mpqemu_msg_send(&msg, pl->ioc, &local_err)) {
        error_report_err(local_err);
    }
}
|
||||||
|
|
||||||
|
/*
 * Unregister the memory listener, then release every section it recorded.
 */
void proxy_memory_listener_deconfigure(ProxyMemoryListener *proxy_listener)
{
    MemoryListener *ml = &proxy_listener->listener;

    memory_listener_unregister(ml);
    proxy_memory_listener_reset(ml);
}
|
||||||
|
|
||||||
|
void proxy_memory_listener_configure(ProxyMemoryListener *proxy_listener,
|
||||||
|
QIOChannel *ioc)
|
||||||
|
{
|
||||||
|
proxy_listener->n_mr_sections = 0;
|
||||||
|
proxy_listener->mr_sections = NULL;
|
||||||
|
|
||||||
|
proxy_listener->ioc = ioc;
|
||||||
|
|
||||||
|
proxy_listener->listener.begin = proxy_memory_listener_reset;
|
||||||
|
proxy_listener->listener.commit = proxy_memory_listener_commit;
|
||||||
|
proxy_listener->listener.region_add = proxy_memory_listener_region_addnop;
|
||||||
|
proxy_listener->listener.region_nop = proxy_memory_listener_region_addnop;
|
||||||
|
proxy_listener->listener.priority = 10;
|
||||||
|
|
||||||
|
memory_listener_register(&proxy_listener->listener,
|
||||||
|
&address_space_memory);
|
||||||
|
}
|
379
hw/remote/proxy.c
Normal file
379
hw/remote/proxy.c
Normal file
|
@ -0,0 +1,379 @@
|
||||||
|
/*
|
||||||
|
* Copyright © 2018, 2021 Oracle and/or its affiliates.
|
||||||
|
*
|
||||||
|
* This work is licensed under the terms of the GNU GPL, version 2 or later.
|
||||||
|
* See the COPYING file in the top-level directory.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "qemu/osdep.h"
|
||||||
|
#include "qemu-common.h"
|
||||||
|
|
||||||
|
#include "hw/remote/proxy.h"
|
||||||
|
#include "hw/pci/pci.h"
|
||||||
|
#include "qapi/error.h"
|
||||||
|
#include "io/channel-util.h"
|
||||||
|
#include "hw/qdev-properties.h"
|
||||||
|
#include "monitor/monitor.h"
|
||||||
|
#include "migration/blocker.h"
|
||||||
|
#include "qemu/sockets.h"
|
||||||
|
#include "hw/remote/mpqemu-link.h"
|
||||||
|
#include "qemu/error-report.h"
|
||||||
|
#include "hw/remote/proxy-memory-listener.h"
|
||||||
|
#include "qom/object.h"
|
||||||
|
#include "qemu/event_notifier.h"
|
||||||
|
#include "sysemu/kvm.h"
|
||||||
|
#include "util/event_notifier-posix.c"
|
||||||
|
|
||||||
|
static void probe_pci_info(PCIDevice *dev, Error **errp);
|
||||||
|
static void proxy_device_reset(DeviceState *dev);
|
||||||
|
|
||||||
|
/*
 * Re-attach the device's interrupt and resample notifiers to the irq that
 * the INTx pin currently routes to: remove the irqfd from the previous irq
 * (if any), look the route up again, and add the irqfd for the new irq
 * unless it is -1.
 */
static void proxy_intx_update(PCIDevice *pci_dev)
{
    PCIProxyDev *proxy = PCI_PROXY_DEV(pci_dev);
    int pin = pci_get_byte(pci_dev->config + PCI_INTERRUPT_PIN) - 1;
    PCIINTxRoute route;

    if (proxy->virq != -1) {
        kvm_irqchip_remove_irqfd_notifier_gsi(kvm_state, &proxy->intr,
                                              proxy->virq);
        proxy->virq = -1;
    }

    route = pci_device_route_intx_to_irq(pci_dev, pin);
    proxy->virq = route.irq;

    if (proxy->virq == -1) {
        return;
    }

    kvm_irqchip_add_irqfd_notifier_gsi(kvm_state, &proxy->intr,
                                       &proxy->resample, proxy->virq);
}
|
||||||
|
|
||||||
|
/*
 * Create the interrupt and resample event notifiers, pass their file
 * descriptors to the remote process with MPQEMU_CMD_SET_IRQFD, then hook
 * the notifiers up to the current INTx route and register
 * proxy_intx_update() as the INTx routing notifier.
 *
 * A failure to send the message is reported but does not stop the setup.
 * NOTE(review): the results of event_notifier_init() are not checked.
 */
static void setup_irqfd(PCIProxyDev *dev)
{
    PCIDevice *pci_dev = PCI_DEVICE(dev);
    Error *local_err = NULL;
    MPQemuMsg msg;

    event_notifier_init(&dev->intr, 0);
    event_notifier_init(&dev->resample, 0);

    memset(&msg, 0, sizeof(MPQemuMsg));
    msg.cmd = MPQEMU_CMD_SET_IRQFD;
    msg.size = 0;
    msg.fds[0] = event_notifier_get_fd(&dev->intr);
    msg.fds[1] = event_notifier_get_fd(&dev->resample);
    msg.num_fds = 2;

    if (!mpqemu_msg_send(&msg, dev->ioc, &local_err)) {
        error_report_err(local_err);
    }

    /* No irq attached yet; proxy_intx_update() establishes the first one. */
    dev->virq = -1;
    proxy_intx_update(pci_dev);

    pci_device_set_intx_routing_notifier(pci_dev, proxy_intx_update);
}
|
||||||
|
|
||||||
|
/*
 * Realize the proxy PCI device.
 *
 * Resolves the "fd" property to a socket file descriptor, wraps it in a
 * QIOChannel, blocks migration, then sets up the memory listener, the
 * irqfd and the PCI information probed from the remote process.
 *
 * On failure @errp is set and every resource acquired so far (the file
 * descriptor, the channel, the migration blocker) is released.
 */
static void pci_proxy_dev_realize(PCIDevice *device, Error **errp)
{
    ERRP_GUARD();
    PCIProxyDev *dev = PCI_PROXY_DEV(device);
    uint8_t *pci_conf = device->config;
    int fd;

    if (!dev->fd) {
        error_setg(errp, "fd parameter not specified for %s",
                   DEVICE(device)->id);
        return;
    }

    fd = monitor_fd_param(monitor_cur(), dev->fd, errp);
    if (fd == -1) {
        error_prepend(errp, "proxy: unable to parse fd %s: ", dev->fd);
        return;
    }

    if (!fd_is_socket(fd)) {
        error_setg(errp, "proxy: fd %d is not a socket", fd);
        close(fd);
        return;
    }

    dev->ioc = qio_channel_new_fd(fd, errp);
    if (!dev->ioc) {
        /* The channel was not created, so the fd is still ours to close. */
        close(fd);
        return;
    }

    error_setg(&dev->migration_blocker, "%s does not support migration",
               TYPE_PCI_PROXY_DEV);
    if (migrate_add_blocker(dev->migration_blocker, errp) < 0) {
        /* The blocker was not registered; drop it and the channel. */
        error_free(dev->migration_blocker);
        dev->migration_blocker = NULL;
        object_unref(OBJECT(dev->ioc));
        dev->ioc = NULL;
        return;
    }

    qemu_mutex_init(&dev->io_mutex);
    qio_channel_set_blocking(dev->ioc, true, NULL);

    pci_conf[PCI_LATENCY_TIMER] = 0xff;
    pci_conf[PCI_INTERRUPT_PIN] = 0x01;

    proxy_memory_listener_configure(&dev->proxy_listener, dev->ioc);

    setup_irqfd(dev);

    probe_pci_info(PCI_DEVICE(dev), errp);
}
|
||||||
|
|
||||||
|
/*
 * Tear the proxy device down: close the channel if there is one, remove
 * and free the migration blocker, unregister the memory listener and
 * clean up both event notifiers.
 */
static void pci_proxy_dev_exit(PCIDevice *pdev)
{
    PCIProxyDev *proxy = PCI_PROXY_DEV(pdev);

    if (proxy->ioc != NULL) {
        qio_channel_close(proxy->ioc, NULL);
    }

    migrate_del_blocker(proxy->migration_blocker);
    error_free(proxy->migration_blocker);

    proxy_memory_listener_deconfigure(&proxy->proxy_listener);

    event_notifier_cleanup(&proxy->intr);
    event_notifier_cleanup(&proxy->resample);
}
|
||||||
|
|
||||||
|
/*
 * Forward a PCI config space access to the remote process and wait for
 * its reply.
 *
 * @op is the command to send.  For MPQEMU_CMD_PCI_CFGWRITE the value
 * written is *@val; for MPQEMU_CMD_PCI_CFGREAD the low 32 bits of the
 * reply are stored in *@val (0xffffffff when the exchange failed, since
 * the failure value is UINT64_MAX).
 *
 * Errors are reported, not propagated.
 */
static void config_op_send(PCIProxyDev *pdev, uint32_t addr, uint32_t *val,
                           int len, unsigned int op)
{
    const bool is_read = (op == MPQEMU_CMD_PCI_CFGREAD);
    Error *local_err = NULL;
    MPQemuMsg msg = { 0 };
    uint64_t reply;

    msg.cmd = op;
    msg.size = sizeof(PciConfDataMsg);
    msg.data.pci_conf_data.addr = addr;
    msg.data.pci_conf_data.len = len;
    msg.data.pci_conf_data.val = (op == MPQEMU_CMD_PCI_CFGWRITE) ? *val : 0;

    reply = mpqemu_msg_send_and_await_reply(&msg, pdev, &local_err);
    if (local_err) {
        error_report_err(local_err);
    }

    if (reply == UINT64_MAX) {
        error_report("Failed to perform PCI config %s operation",
                     is_read ? "READ" : "WRITE");
    }

    if (is_read) {
        *val = (uint32_t)reply;
    }
}
|
||||||
|
|
||||||
|
/*
 * config_read hook: fetch @len bytes at config offset @addr from the
 * remote device.
 */
static uint32_t pci_proxy_read_config(PCIDevice *d, uint32_t addr, int len)
{
    PCIProxyDev *proxy = PCI_PROXY_DEV(d);
    uint32_t result;

    /* config_op_send() always stores into 'result' for a CFGREAD. */
    config_op_send(proxy, addr, &result, len, MPQEMU_CMD_PCI_CFGREAD);

    return result;
}
|
||||||
|
|
||||||
|
/*
 * config_write hook: apply the write to the local config space copy and
 * forward it to the remote device.
 */
static void pci_proxy_write_config(PCIDevice *d, uint32_t addr, uint32_t val,
                                   int len)
{
    PCIProxyDev *proxy = PCI_PROXY_DEV(d);

    /*
     * The proxy device caches a copy of the remote device's PCI config
     * space, and some functions read that copy, so it has to be kept
     * up to date.
     */
    pci_default_write_config(d, addr, val, len);

    config_op_send(proxy, addr, &val, len, MPQEMU_CMD_PCI_CFGWRITE);
}
|
||||||
|
|
||||||
|
/*
 * Properties of the proxy device: a single string property "fd", stored
 * in PCIProxyDev.fd and resolved to a file descriptor at realize time.
 */
static Property proxy_properties[] = {
    DEFINE_PROP_STRING("fd", PCIProxyDev, fd),
    DEFINE_PROP_END_OF_LIST(),
};
|
||||||
|
|
||||||
|
/*
 * Class initializer: install the proxy's PCI hooks (realize, exit, config
 * space read/write), its reset handler and its properties.
 */
static void pci_proxy_dev_class_init(ObjectClass *klass, void *data)
{
    PCIDeviceClass *pci_class = PCI_DEVICE_CLASS(klass);
    DeviceClass *dev_class = DEVICE_CLASS(klass);

    pci_class->realize = pci_proxy_dev_realize;
    pci_class->exit = pci_proxy_dev_exit;

    /* Config space accesses are forwarded to the remote process. */
    pci_class->config_read = pci_proxy_read_config;
    pci_class->config_write = pci_proxy_write_config;

    dev_class->reset = proxy_device_reset;
    device_class_set_props(dev_class, proxy_properties);
}
|
||||||
|
|
||||||
|
static const TypeInfo pci_proxy_dev_type_info = {
|
||||||
|
.name = TYPE_PCI_PROXY_DEV,
|
||||||
|
.parent = TYPE_PCI_DEVICE,
|
||||||
|
.instance_size = sizeof(PCIProxyDev),
|
||||||
|
.class_init = pci_proxy_dev_class_init,
|
||||||
|
.interfaces = (InterfaceInfo[]) {
|
||||||
|
{ INTERFACE_CONVENTIONAL_PCI_DEVICE },
|
||||||
|
{ },
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
static void pci_proxy_dev_register_types(void)
|
||||||
|
{
|
||||||
|
type_register_static(&pci_proxy_dev_type_info);
|
||||||
|
}
|
||||||
|
|
||||||
|
type_init(pci_proxy_dev_register_types)
|
||||||
|
|
||||||
|
/*
 * Forward a BAR access to the remote process and wait for its reply.
 *
 * @pdev:   proxy device whose channel carries the message
 * @mr:     memory region of the BAR; mr->addr is added to @addr
 * @write:  true for a write (value taken from *@val), false for a read
 *          (reply stored in *@val)
 * @addr:   offset of the access within the region
 * @size:   access size in bytes
 * @memory: value of the bar_access.memory field of the message
 *
 * Errors are reported, not propagated; a failed read stores UINT64_MAX
 * in *@val.
 */
static void send_bar_access_msg(PCIProxyDev *pdev, MemoryRegion *mr,
                                bool write, hwaddr addr, uint64_t *val,
                                unsigned size, bool memory)
{
    MPQemuMsg msg = { 0 };
    /*
     * Keep the reply in its native 64-bit type: a 'long' is only 32 bits
     * wide on some hosts and would truncate 8-byte BAR reads.
     */
    uint64_t ret;
    Error *local_err = NULL;

    msg.size = sizeof(BarAccessMsg);
    msg.data.bar_access.addr = mr->addr + addr;
    msg.data.bar_access.size = size;
    msg.data.bar_access.memory = memory;

    if (write) {
        msg.cmd = MPQEMU_CMD_BAR_WRITE;
        msg.data.bar_access.val = *val;
    } else {
        msg.cmd = MPQEMU_CMD_BAR_READ;
    }

    ret = mpqemu_msg_send_and_await_reply(&msg, pdev, &local_err);
    if (local_err) {
        error_report_err(local_err);
    }

    if (!write) {
        *val = ret;
    }
}
|
||||||
|
|
||||||
|
/*
 * MemoryRegionOps write callback: forward the write to the remote process.
 * @opaque is the ProxyMemoryRegion of the BAR being accessed.
 */
static void proxy_bar_write(void *opaque, hwaddr addr, uint64_t val,
                            unsigned size)
{
    ProxyMemoryRegion *region = opaque;
    const bool is_write = true;

    send_bar_access_msg(region->dev, &region->mr, is_write, addr, &val, size,
                        region->memory);
}
|
||||||
|
|
||||||
|
/*
 * MemoryRegionOps read callback: forward the read to the remote process
 * and return the value it replied with.
 * @opaque is the ProxyMemoryRegion of the BAR being accessed.
 */
static uint64_t proxy_bar_read(void *opaque, hwaddr addr, unsigned size)
{
    ProxyMemoryRegion *region = opaque;
    const bool is_write = false;
    uint64_t result;

    /* send_bar_access_msg() always stores into 'result' for a read. */
    send_bar_access_msg(region->dev, &region->mr, is_write, addr, &result,
                        size, region->memory);

    return result;
}
|
||||||
|
|
||||||
|
const MemoryRegionOps proxy_mr_ops = {
|
||||||
|
.read = proxy_bar_read,
|
||||||
|
.write = proxy_bar_write,
|
||||||
|
.endianness = DEVICE_NATIVE_ENDIAN,
|
||||||
|
.impl = {
|
||||||
|
.min_access_size = 1,
|
||||||
|
.max_access_size = 8,
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
|
/*
 * Query the remote device's PCI config space and mirror what is found in
 * the proxy: vendor/device/class/subsystem IDs, the device category
 * derived from the base class, and one I/O memory region per implemented
 * BAR.
 *
 * BAR sizes are probed by writing all ones to the BAR, reading the value
 * back and restoring the original contents.
 *
 * NOTE(review): @errp is never set here, and the same 0xFFFFFFF0 mask is
 * applied to I/O and memory BARs alike -- confirm both are intended.
 */
static void probe_pci_info(PCIDevice *dev, Error **errp)
{
    PCIDeviceClass *pc = PCI_DEVICE_GET_CLASS(dev);
    uint32_t orig_val, new_val, base_class, val;
    PCIProxyDev *pdev = PCI_PROXY_DEV(dev);
    DeviceClass *dc = DEVICE_CLASS(pc);
    uint32_t size;
    uint8_t type;
    int i;

    config_op_send(pdev, PCI_VENDOR_ID, &val, 2, MPQEMU_CMD_PCI_CFGREAD);
    pc->vendor_id = (uint16_t)val;

    config_op_send(pdev, PCI_DEVICE_ID, &val, 2, MPQEMU_CMD_PCI_CFGREAD);
    pc->device_id = (uint16_t)val;

    config_op_send(pdev, PCI_CLASS_DEVICE, &val, 2, MPQEMU_CMD_PCI_CFGREAD);
    pc->class_id = (uint16_t)val;

    config_op_send(pdev, PCI_SUBSYSTEM_ID, &val, 2, MPQEMU_CMD_PCI_CFGREAD);
    pc->subsystem_id = (uint16_t)val;

    /*
     * The 16-bit class word holds the sub-class in its low byte and the
     * base class in its high byte, so the base class is class_id >> 8.
     */
    base_class = pc->class_id >> 8;
    switch (base_class) {
    case PCI_BASE_CLASS_BRIDGE:
        set_bit(DEVICE_CATEGORY_BRIDGE, dc->categories);
        break;
    case PCI_BASE_CLASS_STORAGE:
        set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
        break;
    case PCI_BASE_CLASS_NETWORK:
        set_bit(DEVICE_CATEGORY_NETWORK, dc->categories);
        break;
    case PCI_BASE_CLASS_INPUT:
        set_bit(DEVICE_CATEGORY_INPUT, dc->categories);
        break;
    case PCI_BASE_CLASS_DISPLAY:
        set_bit(DEVICE_CATEGORY_DISPLAY, dc->categories);
        break;
    case PCI_BASE_CLASS_PROCESSOR:
        set_bit(DEVICE_CATEGORY_CPU, dc->categories);
        break;
    default:
        set_bit(DEVICE_CATEGORY_MISC, dc->categories);
        break;
    }

    for (i = 0; i < PCI_NUM_REGIONS; i++) {
        /* Save the BAR, write all ones, read the mask back, restore. */
        config_op_send(pdev, PCI_BASE_ADDRESS_0 + (4 * i), &orig_val, 4,
                       MPQEMU_CMD_PCI_CFGREAD);
        new_val = 0xffffffff;
        config_op_send(pdev, PCI_BASE_ADDRESS_0 + (4 * i), &new_val, 4,
                       MPQEMU_CMD_PCI_CFGWRITE);
        config_op_send(pdev, PCI_BASE_ADDRESS_0 + (4 * i), &new_val, 4,
                       MPQEMU_CMD_PCI_CFGREAD);
        /*
         * Unsigned arithmetic: a signed 'int' would turn a BAR of 2 GiB
         * or more into a negative value that is sign-extended when passed
         * on as a 64-bit region size.  An unimplemented BAR reads back 0
         * and yields size 0.
         */
        size = (~(new_val & 0xFFFFFFF0)) + 1;
        config_op_send(pdev, PCI_BASE_ADDRESS_0 + (4 * i), &orig_val, 4,
                       MPQEMU_CMD_PCI_CFGWRITE);
        type = (new_val & 0x1) ?
                   PCI_BASE_ADDRESS_SPACE_IO : PCI_BASE_ADDRESS_SPACE_MEMORY;

        if (size) {
            g_autofree char *name = g_strdup_printf("bar-region-%d", i);

            pdev->region[i].dev = pdev;
            pdev->region[i].present = true;
            if (type == PCI_BASE_ADDRESS_SPACE_MEMORY) {
                pdev->region[i].memory = true;
            }
            memory_region_init_io(&pdev->region[i].mr, OBJECT(pdev),
                                  &proxy_mr_ops, &pdev->region[i],
                                  name, size);
            pci_register_bar(dev, i, type, &pdev->region[i].mr);
        }
    }
}
|
||||||
|
|
||||||
|
/*
 * Device reset handler: ask the remote process to reset the device by
 * sending MPQEMU_CMD_DEVICE_RESET and waiting for the reply.  The reply
 * value is ignored; errors are reported.
 */
static void proxy_device_reset(DeviceState *dev)
{
    PCIProxyDev *proxy = PCI_PROXY_DEV(dev);
    Error *local_err = NULL;
    MPQemuMsg reset_msg = { 0 };

    reset_msg.size = 0;
    reset_msg.cmd = MPQEMU_CMD_DEVICE_RESET;

    mpqemu_msg_send_and_await_reply(&reset_msg, proxy, &local_err);
    if (local_err != NULL) {
        error_report_err(local_err);
    }
}
|
203
hw/remote/remote-obj.c
Normal file
203
hw/remote/remote-obj.c
Normal file
|
@ -0,0 +1,203 @@
|
||||||
|
/*
|
||||||
|
* Copyright © 2020, 2021 Oracle and/or its affiliates.
|
||||||
|
*
|
||||||
|
* This work is licensed under the terms of the GNU GPL-v2, version 2 or later.
|
||||||
|
*
|
||||||
|
* See the COPYING file in the top-level directory.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "qemu/osdep.h"
|
||||||
|
#include "qemu-common.h"
|
||||||
|
|
||||||
|
#include "qemu/error-report.h"
|
||||||
|
#include "qemu/notify.h"
|
||||||
|
#include "qom/object_interfaces.h"
|
||||||
|
#include "hw/qdev-core.h"
|
||||||
|
#include "io/channel.h"
|
||||||
|
#include "hw/qdev-core.h"
|
||||||
|
#include "hw/remote/machine.h"
|
||||||
|
#include "io/channel-util.h"
|
||||||
|
#include "qapi/error.h"
|
||||||
|
#include "sysemu/sysemu.h"
|
||||||
|
#include "hw/pci/pci.h"
|
||||||
|
#include "qemu/sockets.h"
|
||||||
|
#include "monitor/monitor.h"
|
||||||
|
|
||||||
|
/* QOM type name of the remote object. */
#define TYPE_REMOTE_OBJECT "x-remote-object"
OBJECT_DECLARE_TYPE(RemoteObject, RemoteObjectClass, REMOTE_OBJECT)

/* Class-wide state of the remote object type. */
struct RemoteObjectClass {
    ObjectClass parent_class;

    /* Device count and its upper limit, compared in remote_object_init() */
    unsigned int nr_devs;
    unsigned int max_devs;
};

/* Per-instance state of a remote object. */
struct RemoteObject {
    /* private */
    Object parent;

    /* Notifier whose callback is remote_object_machine_done() */
    Notifier machine_done;

    /* Socket file descriptor set through remote_object_set_fd() */
    int32_t fd;
    /* Id of the device to serve, set through remote_object_set_devid() */
    char *devid;

    QIOChannel *ioc;

    /* Device found by id in remote_object_machine_done() */
    DeviceState *dev;
    /* Listener used to notice when that device is unrealized */
    DeviceListener listener;
};
|
||||||
|
|
||||||
|
/*
 * Setter for the object's fd: resolve @str to a file descriptor through
 * the current monitor and store it, provided it refers to a socket.
 * A descriptor that is not a socket is closed and rejected.
 */
static void remote_object_set_fd(Object *obj, const char *str, Error **errp)
{
    RemoteObject *o = REMOTE_OBJECT(obj);
    int sock = monitor_fd_param(monitor_cur(), str, errp);

    if (sock == -1) {
        error_prepend(errp, "Could not parse remote object fd %s:", str);
        return;
    }

    if (fd_is_socket(sock)) {
        o->fd = sock;
        return;
    }

    error_setg(errp, "File descriptor '%s' is not a socket", str);
    close(sock);
}
|
||||||
|
|
||||||
|
/*
 * Setter for the object's device id: replace any previously stored id
 * with a copy of @str.
 */
static void remote_object_set_devid(Object *obj, const char *str, Error **errp)
{
    RemoteObject *o = REMOTE_OBJECT(obj);
    char *previous = o->devid;

    g_free(previous);
    o->devid = g_strdup(str);
}
|
||||||
|
|
||||||
|
static void remote_object_unrealize_listener(DeviceListener *listener,
|
||||||
|
DeviceState *dev)
|
||||||
|
{
|
||||||
|
RemoteObject *o = container_of(listener, RemoteObject, listener);
|
||||||
|
|
||||||
|
if (o->dev == dev) {
|
||||||
|
object_unref(OBJECT(o));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Machine-init-done callback: bind the object to its PCI device.
 *
 * Looks up the device named by the "devid" property, wraps the "fd"
 * descriptor in a non-blocking QIOChannel, registers the unrealize
 * listener and enters the message-loop coroutine for that device.
 * Every failure is reported and the function returns without starting
 * the coroutine.
 *
 * @notifier: the machine_done member embedded in the RemoteObject
 * @data: unused
 */
static void remote_object_machine_done(Notifier *notifier, void *data)
{
    RemoteObject *o = container_of(notifier, RemoteObject, machine_done);
    DeviceState *dev = NULL;
    QIOChannel *ioc = NULL;
    Coroutine *co = NULL;
    RemoteCommDev *comdev = NULL;
    Error *err = NULL;

    /*
     * devid stays NULL unless the property setter ran; do not hand a NULL
     * string to the lookup or to the "%s" in the message below.
     */
    if (!o->devid) {
        error_report("remote object: 'devid' property is not set");
        return;
    }

    dev = qdev_find_recursive(sysbus_get_default(), o->devid);
    if (!dev || !object_dynamic_cast(OBJECT(dev), TYPE_PCI_DEVICE)) {
        error_report("%s is not a PCI device", o->devid);
        return;
    }

    ioc = qio_channel_new_fd(o->fd, &err);
    if (!ioc) {
        error_report_err(err);
        return;
    }
    qio_channel_set_blocking(ioc, false, NULL);

    o->dev = dev;

    o->listener.unrealize = remote_object_unrealize_listener;
    device_listener_register(&o->listener);

    /* co-routine should free this. */
    comdev = g_new0(RemoteCommDev, 1);
    *comdev = (RemoteCommDev) {
        .ioc = ioc,
        .dev = PCI_DEVICE(dev),
    };

    co = qemu_coroutine_create(mpqemu_remote_msg_loop_co, comdev);
    qemu_coroutine_enter(co);
}
|
||||||
|
|
||||||
|
/*
 * Instance initializer.
 *
 * When the class-wide device limit has been reached, only an error is
 * reported and nothing else is set up; the function returns void, so it has
 * no way to signal the failure to its caller.  Otherwise the instance is
 * counted, its fields get their "unset" values, and the machine-init-done
 * notifier is registered.
 */
static void remote_object_init(Object *obj)
{
    RemoteObject *o = REMOTE_OBJECT(obj);
    RemoteObjectClass *k = REMOTE_OBJECT_GET_CLASS(obj);

    if (k->nr_devs >= k->max_devs) {
        error_report("Reached maximum number of devices: %u", k->max_devs);
        return;
    }

    o->devid = NULL;
    o->fd = -1;
    o->ioc = NULL;

    k->nr_devs++;

    o->machine_done.notify = remote_object_machine_done;
    qemu_add_machine_init_done_notifier(&o->machine_done);
}
|
||||||
|
|
||||||
|
/*
 * Instance finalizer: undo what the initializer and the machine-done
 * callback set up, and release the memory owned by the instance.
 */
static void remote_object_finalize(Object *obj)
{
    RemoteObjectClass *k = REMOTE_OBJECT_GET_CLASS(obj);
    RemoteObject *o = REMOTE_OBJECT(obj);

    /*
     * The listener is registered only by remote_object_machine_done(), which
     * fills in the unrealize callback immediately before registering.  Skip
     * the unregister when that never happened, so that a listener which was
     * never put on the list is not removed from it.
     */
    if (o->listener.unrealize) {
        device_listener_unregister(&o->listener);
    }

    if (o->ioc) {
        qio_channel_shutdown(o->ioc, QIO_CHANNEL_SHUTDOWN_BOTH, NULL);
        qio_channel_close(o->ioc, NULL);
    }

    object_unref(OBJECT(o->ioc));

    /*
     * remote_object_init() counts an instance and installs the machine-done
     * callback together, and does neither when the device limit was hit.
     * Give back a slot only if this instance actually took one, and never
     * let the unsigned counter wrap.
     * NOTE(review): this relies on instance memory being zeroed before
     * instance_init runs -- confirm against the QOM allocation path.
     */
    if (o->machine_done.notify && k->nr_devs > 0) {
        k->nr_devs--;
    }
    g_free(o->devid);
}
|
||||||
|
|
||||||
|
/*
 * Class initializer: seed the device counters and declare the write-only
 * string properties "fd" and "devid" (both are added without a getter).
 */
static void remote_object_class_init(ObjectClass *klass, void *data)
{
    RemoteObjectClass *k = REMOTE_OBJECT_CLASS(klass);

    k->nr_devs = 0;

    /*
     * Only one device is supported for now, so that devices from one VM
     * cannot access the RAM of another VM.  The limit stays until
     * individual devices get separate address spaces.
     */
    k->max_devs = 1;

    object_class_property_add_str(klass, "fd", NULL, remote_object_set_fd);
    object_class_property_add_str(klass, "devid", NULL,
                                  remote_object_set_devid);
}
|
||||||
|
|
||||||
|
static const TypeInfo remote_object_info = {
|
||||||
|
.name = TYPE_REMOTE_OBJECT,
|
||||||
|
.parent = TYPE_OBJECT,
|
||||||
|
.instance_size = sizeof(RemoteObject),
|
||||||
|
.instance_init = remote_object_init,
|
||||||
|
.instance_finalize = remote_object_finalize,
|
||||||
|
.class_size = sizeof(RemoteObjectClass),
|
||||||
|
.class_init = remote_object_class_init,
|
||||||
|
.interfaces = (InterfaceInfo[]) {
|
||||||
|
{ TYPE_USER_CREATABLE },
|
||||||
|
{ }
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
static void register_types(void)
|
||||||
|
{
|
||||||
|
type_register_static(&remote_object_info);
|
||||||
|
}
|
||||||
|
|
||||||
|
type_init(register_types);
|
4
hw/remote/trace-events
Normal file
4
hw/remote/trace-events
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
# multi-process trace events
|
||||||
|
|
||||||
|
mpqemu_send_io_error(int cmd, int size, int nfds) "send command %d size %d, %d file descriptors to remote process"
|
||||||
|
mpqemu_recv_io_error(int cmd, int size, int nfds) "failed to receive %d size %d, %d file descriptors to remote process"
|
1
hw/remote/trace.h
Normal file
1
hw/remote/trace.h
Normal file
|
@ -0,0 +1 @@
|
||||||
|
#include "trace/trace-hw_remote.h"
|
|
@ -998,6 +998,7 @@ void memory_region_init_ram_from_file(MemoryRegion *mr,
|
||||||
* @size: size of the region.
|
* @size: size of the region.
|
||||||
* @share: %true if memory must be mmaped with the MAP_SHARED flag
|
* @share: %true if memory must be mmaped with the MAP_SHARED flag
|
||||||
* @fd: the fd to mmap.
|
* @fd: the fd to mmap.
|
||||||
|
* @offset: offset within the file referenced by fd
|
||||||
* @errp: pointer to Error*, to store an error if it happens.
|
* @errp: pointer to Error*, to store an error if it happens.
|
||||||
*
|
*
|
||||||
* Note that this function does not do anything to cause the data in the
|
* Note that this function does not do anything to cause the data in the
|
||||||
|
@ -1009,6 +1010,7 @@ void memory_region_init_ram_from_fd(MemoryRegion *mr,
|
||||||
uint64_t size,
|
uint64_t size,
|
||||||
bool share,
|
bool share,
|
||||||
int fd,
|
int fd,
|
||||||
|
ram_addr_t offset,
|
||||||
Error **errp);
|
Error **errp);
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -121,8 +121,8 @@ RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
|
||||||
uint32_t ram_flags, const char *mem_path,
|
uint32_t ram_flags, const char *mem_path,
|
||||||
bool readonly, Error **errp);
|
bool readonly, Error **errp);
|
||||||
RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
|
RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
|
||||||
uint32_t ram_flags, int fd, bool readonly,
|
uint32_t ram_flags, int fd, off_t offset,
|
||||||
Error **errp);
|
bool readonly, Error **errp);
|
||||||
|
|
||||||
RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
|
RAMBlock *qemu_ram_alloc_from_ptr(ram_addr_t size, void *host,
|
||||||
MemoryRegion *mr, Error **errp);
|
MemoryRegion *mr, Error **errp);
|
||||||
|
|
30
include/hw/pci-host/remote.h
Normal file
30
include/hw/pci-host/remote.h
Normal file
|
@ -0,0 +1,30 @@
|
||||||
|
/*
|
||||||
|
* PCI Host for remote device
|
||||||
|
*
|
||||||
|
* Copyright © 2018, 2021 Oracle and/or its affiliates.
|
||||||
|
*
|
||||||
|
* This work is licensed under the terms of the GNU GPL, version 2 or later.
|
||||||
|
* See the COPYING file in the top-level directory.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef REMOTE_PCIHOST_H
|
||||||
|
#define REMOTE_PCIHOST_H
|
||||||
|
|
||||||
|
#include "exec/memory.h"
|
||||||
|
#include "hw/pci/pcie_host.h"
|
||||||
|
|
||||||
|
#define TYPE_REMOTE_PCIHOST "remote-pcihost"
|
||||||
|
OBJECT_DECLARE_SIMPLE_TYPE(RemotePCIHost, REMOTE_PCIHOST)
|
||||||
|
|
||||||
|
struct RemotePCIHost {
|
||||||
|
/*< private >*/
|
||||||
|
PCIExpressHost parent_obj;
|
||||||
|
/*< public >*/
|
||||||
|
|
||||||
|
MemoryRegion *mr_pci_mem;
|
||||||
|
MemoryRegion *mr_sys_io;
|
||||||
|
MemoryRegion *mr_sys_mem;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif
|
|
@ -192,6 +192,9 @@
|
||||||
#define PCI_DEVICE_ID_SUN_SIMBA 0x5000
|
#define PCI_DEVICE_ID_SUN_SIMBA 0x5000
|
||||||
#define PCI_DEVICE_ID_SUN_SABRE 0xa000
|
#define PCI_DEVICE_ID_SUN_SABRE 0xa000
|
||||||
|
|
||||||
|
#define PCI_VENDOR_ID_ORACLE 0x108e
|
||||||
|
#define PCI_DEVICE_ID_REMOTE_IOHUB 0xb000
|
||||||
|
|
||||||
#define PCI_VENDOR_ID_CMD 0x1095
|
#define PCI_VENDOR_ID_CMD 0x1095
|
||||||
#define PCI_DEVICE_ID_CMD_646 0x0646
|
#define PCI_DEVICE_ID_CMD_646 0x0646
|
||||||
|
|
||||||
|
|
42
include/hw/remote/iohub.h
Normal file
42
include/hw/remote/iohub.h
Normal file
|
@ -0,0 +1,42 @@
|
||||||
|
/*
|
||||||
|
* IO Hub for remote device
|
||||||
|
*
|
||||||
|
* Copyright © 2018, 2021 Oracle and/or its affiliates.
|
||||||
|
*
|
||||||
|
* This work is licensed under the terms of the GNU GPL, version 2 or later.
|
||||||
|
* See the COPYING file in the top-level directory.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef REMOTE_IOHUB_H
|
||||||
|
#define REMOTE_IOHUB_H
|
||||||
|
|
||||||
|
#include "hw/pci/pci.h"
|
||||||
|
#include "qemu/event_notifier.h"
|
||||||
|
#include "qemu/thread-posix.h"
|
||||||
|
#include "hw/remote/mpqemu-link.h"
|
||||||
|
|
||||||
|
#define REMOTE_IOHUB_NB_PIRQS PCI_DEVFN_MAX
|
||||||
|
|
||||||
|
/*
 * Pairs an opaque IOHUB pointer with a pin number.
 * NOTE(review): presumably the cookie handed to a resample handler --
 * confirm against the IOHUB implementation.
 */
typedef struct ResampleToken {
    void *iohub;
    int pirq;
} ResampleToken;
|
||||||
|
|
||||||
|
typedef struct RemoteIOHubState {
|
||||||
|
PCIDevice d;
|
||||||
|
EventNotifier irqfds[REMOTE_IOHUB_NB_PIRQS];
|
||||||
|
EventNotifier resamplefds[REMOTE_IOHUB_NB_PIRQS];
|
||||||
|
unsigned int irq_level[REMOTE_IOHUB_NB_PIRQS];
|
||||||
|
ResampleToken token[REMOTE_IOHUB_NB_PIRQS];
|
||||||
|
QemuMutex irq_level_lock[REMOTE_IOHUB_NB_PIRQS];
|
||||||
|
} RemoteIOHubState;
|
||||||
|
|
||||||
|
int remote_iohub_map_irq(PCIDevice *pci_dev, int intx);
|
||||||
|
void remote_iohub_set_irq(void *opaque, int pirq, int level);
|
||||||
|
void process_set_irqfd_msg(PCIDevice *pci_dev, MPQemuMsg *msg);
|
||||||
|
|
||||||
|
void remote_iohub_init(RemoteIOHubState *iohub);
|
||||||
|
void remote_iohub_finalize(RemoteIOHubState *iohub);
|
||||||
|
|
||||||
|
#endif
|
38
include/hw/remote/machine.h
Normal file
38
include/hw/remote/machine.h
Normal file
|
@ -0,0 +1,38 @@
|
||||||
|
/*
|
||||||
|
* Remote machine configuration
|
||||||
|
*
|
||||||
|
* Copyright © 2018, 2021 Oracle and/or its affiliates.
|
||||||
|
*
|
||||||
|
* This work is licensed under the terms of the GNU GPL, version 2 or later.
|
||||||
|
* See the COPYING file in the top-level directory.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef REMOTE_MACHINE_H
|
||||||
|
#define REMOTE_MACHINE_H
|
||||||
|
|
||||||
|
#include "qom/object.h"
|
||||||
|
#include "hw/boards.h"
|
||||||
|
#include "hw/pci-host/remote.h"
|
||||||
|
#include "io/channel.h"
|
||||||
|
#include "hw/remote/iohub.h"
|
||||||
|
|
||||||
|
struct RemoteMachineState {
|
||||||
|
MachineState parent_obj;
|
||||||
|
|
||||||
|
RemotePCIHost *host;
|
||||||
|
RemoteIOHubState iohub;
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Used to pass to co-routine device and ioc. */
|
||||||
|
typedef struct RemoteCommDev {
|
||||||
|
PCIDevice *dev;
|
||||||
|
QIOChannel *ioc;
|
||||||
|
} RemoteCommDev;
|
||||||
|
|
||||||
|
#define TYPE_REMOTE_MACHINE "x-remote-machine"
|
||||||
|
OBJECT_DECLARE_SIMPLE_TYPE(RemoteMachineState, REMOTE_MACHINE)
|
||||||
|
|
||||||
|
void coroutine_fn mpqemu_remote_msg_loop_co(void *data);
|
||||||
|
|
||||||
|
#endif
|
19
include/hw/remote/memory.h
Normal file
19
include/hw/remote/memory.h
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
/*
|
||||||
|
* Memory manager for remote device
|
||||||
|
*
|
||||||
|
* Copyright © 2018, 2021 Oracle and/or its affiliates.
|
||||||
|
*
|
||||||
|
* This work is licensed under the terms of the GNU GPL, version 2 or later.
|
||||||
|
* See the COPYING file in the top-level directory.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef REMOTE_MEMORY_H
|
||||||
|
#define REMOTE_MEMORY_H
|
||||||
|
|
||||||
|
#include "exec/hwaddr.h"
|
||||||
|
#include "hw/remote/mpqemu-link.h"
|
||||||
|
|
||||||
|
void remote_sysmem_reconfig(MPQemuMsg *msg, Error **errp);
|
||||||
|
|
||||||
|
#endif
|
99
include/hw/remote/mpqemu-link.h
Normal file
99
include/hw/remote/mpqemu-link.h
Normal file
|
@ -0,0 +1,99 @@
|
||||||
|
/*
|
||||||
|
* Communication channel between QEMU and remote device process
|
||||||
|
*
|
||||||
|
* Copyright © 2018, 2021 Oracle and/or its affiliates.
|
||||||
|
*
|
||||||
|
* This work is licensed under the terms of the GNU GPL, version 2 or later.
|
||||||
|
* See the COPYING file in the top-level directory.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef MPQEMU_LINK_H
|
||||||
|
#define MPQEMU_LINK_H
|
||||||
|
|
||||||
|
#include "qom/object.h"
|
||||||
|
#include "qemu/thread.h"
|
||||||
|
#include "io/channel.h"
|
||||||
|
#include "exec/hwaddr.h"
|
||||||
|
#include "io/channel-socket.h"
|
||||||
|
#include "hw/remote/proxy.h"
|
||||||
|
|
||||||
|
#define REMOTE_MAX_FDS 8
|
||||||
|
|
||||||
|
#define MPQEMU_MSG_HDR_SIZE offsetof(MPQemuMsg, data.u64)
|
||||||
|
|
||||||
|
/**
 * MPQemuCmd:
 *
 * Commands to be executed on the remote device.  MPQEMU_CMD_MAX comes
 * last, so its value equals the number of commands defined before it.
 *
 * The protocol between QEMU and the remote process is private; the
 * vfio-user protocol would supersede it in the future.
 */
typedef enum {
    MPQEMU_CMD_SYNC_SYSMEM = 0,
    MPQEMU_CMD_RET,
    MPQEMU_CMD_PCI_CFGWRITE,
    MPQEMU_CMD_PCI_CFGREAD,
    MPQEMU_CMD_BAR_WRITE,
    MPQEMU_CMD_BAR_READ,
    MPQEMU_CMD_SET_IRQFD,
    MPQEMU_CMD_DEVICE_RESET,
    MPQEMU_CMD_MAX,
} MPQemuCmd;
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
hwaddr gpas[REMOTE_MAX_FDS];
|
||||||
|
uint64_t sizes[REMOTE_MAX_FDS];
|
||||||
|
off_t offsets[REMOTE_MAX_FDS];
|
||||||
|
} SyncSysmemMsg;
|
||||||
|
|
||||||
|
/*
 * Message payload carrying an address, a value and a length.
 * NOTE(review): presumably one PCI config-space access, used with the
 * MPQEMU_CMD_PCI_CFGWRITE / MPQEMU_CMD_PCI_CFGREAD commands -- confirm
 * against the message handlers.
 */
typedef struct {
    uint32_t addr;
    uint32_t val;
    int len;
} PciConfDataMsg;
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
hwaddr addr;
|
||||||
|
uint64_t val;
|
||||||
|
unsigned size;
|
||||||
|
bool memory;
|
||||||
|
} BarAccessMsg;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* MPQemuMsg:
|
||||||
|
* @cmd: The remote command
|
||||||
|
* @size: Size of the data to be shared
|
||||||
|
* @data: Structured data
|
||||||
|
* @fds: File descriptors to be shared with remote device
|
||||||
|
*
|
||||||
|
* MPQemuMsg Format of the message sent to the remote device from QEMU.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
typedef struct {
|
||||||
|
int cmd;
|
||||||
|
size_t size;
|
||||||
|
|
||||||
|
union {
|
||||||
|
uint64_t u64;
|
||||||
|
PciConfDataMsg pci_conf_data;
|
||||||
|
SyncSysmemMsg sync_sysmem;
|
||||||
|
BarAccessMsg bar_access;
|
||||||
|
} data;
|
||||||
|
|
||||||
|
int fds[REMOTE_MAX_FDS];
|
||||||
|
int num_fds;
|
||||||
|
} MPQemuMsg;
|
||||||
|
|
||||||
|
bool mpqemu_msg_send(MPQemuMsg *msg, QIOChannel *ioc, Error **errp);
|
||||||
|
bool mpqemu_msg_recv(MPQemuMsg *msg, QIOChannel *ioc, Error **errp);
|
||||||
|
|
||||||
|
uint64_t mpqemu_msg_send_and_await_reply(MPQemuMsg *msg, PCIProxyDev *pdev,
|
||||||
|
Error **errp);
|
||||||
|
bool mpqemu_msg_valid(MPQemuMsg *msg);
|
||||||
|
|
||||||
|
#endif
|
28
include/hw/remote/proxy-memory-listener.h
Normal file
28
include/hw/remote/proxy-memory-listener.h
Normal file
|
@ -0,0 +1,28 @@
|
||||||
|
/*
|
||||||
|
* Copyright © 2018, 2021 Oracle and/or its affiliates.
|
||||||
|
*
|
||||||
|
* This work is licensed under the terms of the GNU GPL, version 2 or later.
|
||||||
|
* See the COPYING file in the top-level directory.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef PROXY_MEMORY_LISTENER_H
|
||||||
|
#define PROXY_MEMORY_LISTENER_H
|
||||||
|
|
||||||
|
#include "exec/memory.h"
|
||||||
|
#include "io/channel.h"
|
||||||
|
|
||||||
|
typedef struct ProxyMemoryListener {
|
||||||
|
MemoryListener listener;
|
||||||
|
|
||||||
|
int n_mr_sections;
|
||||||
|
MemoryRegionSection *mr_sections;
|
||||||
|
|
||||||
|
QIOChannel *ioc;
|
||||||
|
} ProxyMemoryListener;
|
||||||
|
|
||||||
|
void proxy_memory_listener_configure(ProxyMemoryListener *proxy_listener,
|
||||||
|
QIOChannel *ioc);
|
||||||
|
void proxy_memory_listener_deconfigure(ProxyMemoryListener *proxy_listener);
|
||||||
|
|
||||||
|
#endif
|
48
include/hw/remote/proxy.h
Normal file
48
include/hw/remote/proxy.h
Normal file
|
@ -0,0 +1,48 @@
|
||||||
|
/*
|
||||||
|
* Copyright © 2018, 2021 Oracle and/or its affiliates.
|
||||||
|
*
|
||||||
|
* This work is licensed under the terms of the GNU GPL, version 2 or later.
|
||||||
|
* See the COPYING file in the top-level directory.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef PROXY_H
|
||||||
|
#define PROXY_H
|
||||||
|
|
||||||
|
#include "hw/pci/pci.h"
|
||||||
|
#include "io/channel.h"
|
||||||
|
#include "hw/remote/proxy-memory-listener.h"
|
||||||
|
#include "qemu/event_notifier.h"
|
||||||
|
|
||||||
|
#define TYPE_PCI_PROXY_DEV "x-pci-proxy-dev"
|
||||||
|
OBJECT_DECLARE_SIMPLE_TYPE(PCIProxyDev, PCI_PROXY_DEV)
|
||||||
|
|
||||||
|
typedef struct ProxyMemoryRegion {
|
||||||
|
PCIProxyDev *dev;
|
||||||
|
MemoryRegion mr;
|
||||||
|
bool memory;
|
||||||
|
bool present;
|
||||||
|
uint8_t type;
|
||||||
|
} ProxyMemoryRegion;
|
||||||
|
|
||||||
|
struct PCIProxyDev {
|
||||||
|
PCIDevice parent_dev;
|
||||||
|
char *fd;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Mutex used to protect the QIOChannel fd from
|
||||||
|
* the concurrent access by the VCPUs since proxy
|
||||||
|
* blocks while awaiting for the replies from the
|
||||||
|
* process remote.
|
||||||
|
*/
|
||||||
|
QemuMutex io_mutex;
|
||||||
|
QIOChannel *ioc;
|
||||||
|
Error *migration_blocker;
|
||||||
|
ProxyMemoryListener proxy_listener;
|
||||||
|
int virq;
|
||||||
|
EventNotifier intr;
|
||||||
|
EventNotifier resample;
|
||||||
|
ProxyMemoryRegion region[PCI_NUM_REGIONS];
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif /* PROXY_H */
|
|
@ -777,4 +777,82 @@ void qio_channel_set_aio_fd_handler(QIOChannel *ioc,
|
||||||
IOHandler *io_write,
|
IOHandler *io_write,
|
||||||
void *opaque);
|
void *opaque);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* qio_channel_readv_full_all_eof:
|
||||||
|
* @ioc: the channel object
|
||||||
|
* @iov: the array of memory regions to read data to
|
||||||
|
* @niov: the length of the @iov array
|
||||||
|
* @fds: an array of file handles to read
|
||||||
|
* @nfds: number of file handles in @fds
|
||||||
|
* @errp: pointer to a NULL-initialized error object
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Performs same function as qio_channel_readv_all_eof.
|
||||||
|
* Additionally, attempts to read file descriptors shared
|
||||||
|
* over the channel. The function will wait for all
|
||||||
|
* requested data to be read, yielding from the current
|
||||||
|
* coroutine if required. data refers to both file
|
||||||
|
* descriptors and the iovs.
|
||||||
|
*
|
||||||
|
* Returns: 1 if all bytes were read, 0 if end-of-file
|
||||||
|
* occurs without data, or -1 on error
|
||||||
|
*/
|
||||||
|
|
||||||
|
int qio_channel_readv_full_all_eof(QIOChannel *ioc,
|
||||||
|
const struct iovec *iov,
|
||||||
|
size_t niov,
|
||||||
|
int **fds, size_t *nfds,
|
||||||
|
Error **errp);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* qio_channel_readv_full_all:
|
||||||
|
* @ioc: the channel object
|
||||||
|
* @iov: the array of memory regions to read data to
|
||||||
|
* @niov: the length of the @iov array
|
||||||
|
* @fds: an array of file handles to read
|
||||||
|
* @nfds: number of file handles in @fds
|
||||||
|
* @errp: pointer to a NULL-initialized error object
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Performs same function as qio_channel_readv_all.
|
||||||
|
* Additionally, attempts to read file descriptors shared
|
||||||
|
* over the channel. The function will wait for all
|
||||||
|
* requested data to be read, yielding from the current
|
||||||
|
* coroutine if required. data refers to both file
|
||||||
|
* descriptors and the iovs.
|
||||||
|
*
|
||||||
|
* Returns: 0 if all bytes were read, or -1 on error
|
||||||
|
*/
|
||||||
|
|
||||||
|
int qio_channel_readv_full_all(QIOChannel *ioc,
|
||||||
|
const struct iovec *iov,
|
||||||
|
size_t niov,
|
||||||
|
int **fds, size_t *nfds,
|
||||||
|
Error **errp);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* qio_channel_writev_full_all:
|
||||||
|
* @ioc: the channel object
|
||||||
|
* @iov: the array of memory regions to write data from
|
||||||
|
* @niov: the length of the @iov array
|
||||||
|
* @fds: an array of file handles to send
|
||||||
|
* @nfds: number of file handles in @fds
|
||||||
|
* @errp: pointer to a NULL-initialized error object
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Behaves like qio_channel_writev_full but will attempt
|
||||||
|
* to send all data passed (file handles and memory regions).
|
||||||
|
* The function will wait for all requested data
|
||||||
|
* to be written, yielding from the current coroutine
|
||||||
|
* if required.
|
||||||
|
*
|
||||||
|
* Returns: 0 if all bytes were written, or -1 on error
|
||||||
|
*/
|
||||||
|
|
||||||
|
int qio_channel_writev_full_all(QIOChannel *ioc,
|
||||||
|
const struct iovec *iov,
|
||||||
|
size_t niov,
|
||||||
|
int *fds, size_t nfds,
|
||||||
|
Error **errp);
|
||||||
|
|
||||||
#endif /* QIO_CHANNEL_H */
|
#endif /* QIO_CHANNEL_H */
|
||||||
|
|
|
@ -17,6 +17,7 @@ size_t qemu_mempath_getpagesize(const char *mem_path);
|
||||||
* @readonly: true for a read-only mapping, false for read/write.
|
* @readonly: true for a read-only mapping, false for read/write.
|
||||||
* @shared: map has RAM_SHARED flag.
|
* @shared: map has RAM_SHARED flag.
|
||||||
* @is_pmem: map has RAM_PMEM flag.
|
* @is_pmem: map has RAM_PMEM flag.
|
||||||
|
* @map_offset: map starts at offset of map_offset from the start of fd
|
||||||
*
|
*
|
||||||
* Return:
|
* Return:
|
||||||
* On success, return a pointer to the mapped area.
|
* On success, return a pointer to the mapped area.
|
||||||
|
@ -27,7 +28,8 @@ void *qemu_ram_mmap(int fd,
|
||||||
size_t align,
|
size_t align,
|
||||||
bool readonly,
|
bool readonly,
|
||||||
bool shared,
|
bool shared,
|
||||||
bool is_pmem);
|
bool is_pmem,
|
||||||
|
off_t map_offset);
|
||||||
|
|
||||||
void qemu_ram_munmap(int fd, void *ptr, size_t size);
|
void qemu_ram_munmap(int fd, void *ptr, size_t size);
|
||||||
|
|
||||||
|
|
|
@ -57,4 +57,10 @@ IOThread *iothread_create(const char *id, Error **errp);
|
||||||
void iothread_stop(IOThread *iothread);
|
void iothread_stop(IOThread *iothread);
|
||||||
void iothread_destroy(IOThread *iothread);
|
void iothread_destroy(IOThread *iothread);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Returns true if executing within IOThread context,
|
||||||
|
* false otherwise.
|
||||||
|
*/
|
||||||
|
bool qemu_in_iothread(void);
|
||||||
|
|
||||||
#endif /* IOTHREAD_H */
|
#endif /* IOTHREAD_H */
|
||||||
|
|
116
io/channel.c
116
io/channel.c
|
@ -91,20 +91,48 @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
|
||||||
const struct iovec *iov,
|
const struct iovec *iov,
|
||||||
size_t niov,
|
size_t niov,
|
||||||
Error **errp)
|
Error **errp)
|
||||||
|
{
|
||||||
|
return qio_channel_readv_full_all_eof(ioc, iov, niov, NULL, NULL, errp);
|
||||||
|
}
|
||||||
|
|
||||||
|
int qio_channel_readv_all(QIOChannel *ioc,
|
||||||
|
const struct iovec *iov,
|
||||||
|
size_t niov,
|
||||||
|
Error **errp)
|
||||||
|
{
|
||||||
|
return qio_channel_readv_full_all(ioc, iov, niov, NULL, NULL, errp);
|
||||||
|
}
|
||||||
|
|
||||||
|
int qio_channel_readv_full_all_eof(QIOChannel *ioc,
|
||||||
|
const struct iovec *iov,
|
||||||
|
size_t niov,
|
||||||
|
int **fds, size_t *nfds,
|
||||||
|
Error **errp)
|
||||||
{
|
{
|
||||||
int ret = -1;
|
int ret = -1;
|
||||||
struct iovec *local_iov = g_new(struct iovec, niov);
|
struct iovec *local_iov = g_new(struct iovec, niov);
|
||||||
struct iovec *local_iov_head = local_iov;
|
struct iovec *local_iov_head = local_iov;
|
||||||
unsigned int nlocal_iov = niov;
|
unsigned int nlocal_iov = niov;
|
||||||
|
int **local_fds = fds;
|
||||||
|
size_t *local_nfds = nfds;
|
||||||
bool partial = false;
|
bool partial = false;
|
||||||
|
|
||||||
|
if (nfds) {
|
||||||
|
*nfds = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (fds) {
|
||||||
|
*fds = NULL;
|
||||||
|
}
|
||||||
|
|
||||||
nlocal_iov = iov_copy(local_iov, nlocal_iov,
|
nlocal_iov = iov_copy(local_iov, nlocal_iov,
|
||||||
iov, niov,
|
iov, niov,
|
||||||
0, iov_size(iov, niov));
|
0, iov_size(iov, niov));
|
||||||
|
|
||||||
while (nlocal_iov > 0) {
|
while ((nlocal_iov > 0) || local_fds) {
|
||||||
ssize_t len;
|
ssize_t len;
|
||||||
len = qio_channel_readv(ioc, local_iov, nlocal_iov, errp);
|
len = qio_channel_readv_full(ioc, local_iov, nlocal_iov, local_fds,
|
||||||
|
local_nfds, errp);
|
||||||
if (len == QIO_CHANNEL_ERR_BLOCK) {
|
if (len == QIO_CHANNEL_ERR_BLOCK) {
|
||||||
if (qemu_in_coroutine()) {
|
if (qemu_in_coroutine()) {
|
||||||
qio_channel_yield(ioc, G_IO_IN);
|
qio_channel_yield(ioc, G_IO_IN);
|
||||||
|
@ -112,20 +140,50 @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
|
||||||
qio_channel_wait(ioc, G_IO_IN);
|
qio_channel_wait(ioc, G_IO_IN);
|
||||||
}
|
}
|
||||||
continue;
|
continue;
|
||||||
} else if (len < 0) {
|
}
|
||||||
goto cleanup;
|
|
||||||
} else if (len == 0) {
|
if (len == 0) {
|
||||||
if (partial) {
|
if (local_nfds && *local_nfds) {
|
||||||
error_setg(errp,
|
/*
|
||||||
"Unexpected end-of-file before all bytes were read");
|
* Got some FDs, but no data yet. This isn't an EOF
|
||||||
} else {
|
* scenario (yet), so carry on to try to read data
|
||||||
|
* on next loop iteration
|
||||||
|
*/
|
||||||
|
goto next_iter;
|
||||||
|
} else if (!partial) {
|
||||||
|
/* No fds and no data - EOF before any data read */
|
||||||
ret = 0;
|
ret = 0;
|
||||||
|
goto cleanup;
|
||||||
|
} else {
|
||||||
|
len = -1;
|
||||||
|
error_setg(errp,
|
||||||
|
"Unexpected end-of-file before all data were read");
|
||||||
|
/* Fallthrough into len < 0 handling */
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len < 0) {
|
||||||
|
/* Close any FDs we previously received */
|
||||||
|
if (nfds && fds) {
|
||||||
|
size_t i;
|
||||||
|
for (i = 0; i < (*nfds); i++) {
|
||||||
|
close((*fds)[i]);
|
||||||
|
}
|
||||||
|
g_free(*fds);
|
||||||
|
*fds = NULL;
|
||||||
|
*nfds = 0;
|
||||||
}
|
}
|
||||||
goto cleanup;
|
goto cleanup;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (nlocal_iov) {
|
||||||
|
iov_discard_front(&local_iov, &nlocal_iov, len);
|
||||||
|
}
|
||||||
|
|
||||||
|
next_iter:
|
||||||
partial = true;
|
partial = true;
|
||||||
iov_discard_front(&local_iov, &nlocal_iov, len);
|
local_fds = NULL;
|
||||||
|
local_nfds = NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
ret = 1;
|
ret = 1;
|
||||||
|
@ -135,20 +193,23 @@ int qio_channel_readv_all_eof(QIOChannel *ioc,
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
int qio_channel_readv_all(QIOChannel *ioc,
|
int qio_channel_readv_full_all(QIOChannel *ioc,
|
||||||
const struct iovec *iov,
|
const struct iovec *iov,
|
||||||
size_t niov,
|
size_t niov,
|
||||||
Error **errp)
|
int **fds, size_t *nfds,
|
||||||
|
Error **errp)
|
||||||
{
|
{
|
||||||
int ret = qio_channel_readv_all_eof(ioc, iov, niov, errp);
|
int ret = qio_channel_readv_full_all_eof(ioc, iov, niov, fds, nfds, errp);
|
||||||
|
|
||||||
if (ret == 0) {
|
if (ret == 0) {
|
||||||
ret = -1;
|
error_prepend(errp,
|
||||||
error_setg(errp,
|
"Unexpected end-of-file before all data were read.");
|
||||||
"Unexpected end-of-file before all bytes were read");
|
return -1;
|
||||||
} else if (ret == 1) {
|
|
||||||
ret = 0;
|
|
||||||
}
|
}
|
||||||
|
if (ret == 1) {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -156,6 +217,15 @@ int qio_channel_writev_all(QIOChannel *ioc,
|
||||||
const struct iovec *iov,
|
const struct iovec *iov,
|
||||||
size_t niov,
|
size_t niov,
|
||||||
Error **errp)
|
Error **errp)
|
||||||
|
{
|
||||||
|
return qio_channel_writev_full_all(ioc, iov, niov, NULL, 0, errp);
|
||||||
|
}
|
||||||
|
|
||||||
|
int qio_channel_writev_full_all(QIOChannel *ioc,
|
||||||
|
const struct iovec *iov,
|
||||||
|
size_t niov,
|
||||||
|
int *fds, size_t nfds,
|
||||||
|
Error **errp)
|
||||||
{
|
{
|
||||||
int ret = -1;
|
int ret = -1;
|
||||||
struct iovec *local_iov = g_new(struct iovec, niov);
|
struct iovec *local_iov = g_new(struct iovec, niov);
|
||||||
|
@ -168,7 +238,8 @@ int qio_channel_writev_all(QIOChannel *ioc,
|
||||||
|
|
||||||
while (nlocal_iov > 0) {
|
while (nlocal_iov > 0) {
|
||||||
ssize_t len;
|
ssize_t len;
|
||||||
len = qio_channel_writev(ioc, local_iov, nlocal_iov, errp);
|
len = qio_channel_writev_full(ioc, local_iov, nlocal_iov, fds, nfds,
|
||||||
|
errp);
|
||||||
if (len == QIO_CHANNEL_ERR_BLOCK) {
|
if (len == QIO_CHANNEL_ERR_BLOCK) {
|
||||||
if (qemu_in_coroutine()) {
|
if (qemu_in_coroutine()) {
|
||||||
qio_channel_yield(ioc, G_IO_OUT);
|
qio_channel_yield(ioc, G_IO_OUT);
|
||||||
|
@ -182,6 +253,9 @@ int qio_channel_writev_all(QIOChannel *ioc,
|
||||||
}
|
}
|
||||||
|
|
||||||
iov_discard_front(&local_iov, &nlocal_iov, len);
|
iov_discard_front(&local_iov, &nlocal_iov, len);
|
||||||
|
|
||||||
|
fds = NULL;
|
||||||
|
nfds = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
ret = 0;
|
ret = 0;
|
||||||
|
|
|
@ -369,3 +369,9 @@ IOThread *iothread_by_id(const char *id)
|
||||||
{
|
{
|
||||||
return IOTHREAD(object_resolve_path_type(id, TYPE_IOTHREAD, NULL));
|
return IOTHREAD(object_resolve_path_type(id, TYPE_IOTHREAD, NULL));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Returns true when the current AioContext differs from the one returned
 * by qemu_get_aio_context(), false when they are the same.
 */
bool qemu_in_iothread(void)
{
    return qemu_get_current_aio_context() != qemu_get_aio_context();
}
|
||||||
|
|
|
@ -1226,7 +1226,8 @@ host_kconfig = \
|
||||||
('CONFIG_VHOST_KERNEL' in config_host ? ['CONFIG_VHOST_KERNEL=y'] : []) + \
|
('CONFIG_VHOST_KERNEL' in config_host ? ['CONFIG_VHOST_KERNEL=y'] : []) + \
|
||||||
(have_virtfs ? ['CONFIG_VIRTFS=y'] : []) + \
|
(have_virtfs ? ['CONFIG_VIRTFS=y'] : []) + \
|
||||||
('CONFIG_LINUX' in config_host ? ['CONFIG_LINUX=y'] : []) + \
|
('CONFIG_LINUX' in config_host ? ['CONFIG_LINUX=y'] : []) + \
|
||||||
('CONFIG_PVRDMA' in config_host ? ['CONFIG_PVRDMA=y'] : [])
|
('CONFIG_PVRDMA' in config_host ? ['CONFIG_PVRDMA=y'] : []) + \
|
||||||
|
('CONFIG_MULTIPROCESS_ALLOWED' in config_host ? ['CONFIG_MULTIPROCESS_ALLOWED=y'] : [])
|
||||||
|
|
||||||
ignored = [ 'TARGET_XML_FILES', 'TARGET_ABI_DIR', 'TARGET_ARCH' ]
|
ignored = [ 'TARGET_XML_FILES', 'TARGET_ABI_DIR', 'TARGET_ARCH' ]
|
||||||
|
|
||||||
|
@ -1817,6 +1818,7 @@ if have_system
|
||||||
'net',
|
'net',
|
||||||
'softmmu',
|
'softmmu',
|
||||||
'ui',
|
'ui',
|
||||||
|
'hw/remote',
|
||||||
]
|
]
|
||||||
endif
|
endif
|
||||||
if have_system or have_user
|
if have_system or have_user
|
||||||
|
@ -2652,6 +2654,7 @@ summary_info += {'libpmem support': config_host.has_key('CONFIG_LIBPMEM')}
|
||||||
summary_info += {'libdaxctl support': config_host.has_key('CONFIG_LIBDAXCTL')}
|
summary_info += {'libdaxctl support': config_host.has_key('CONFIG_LIBDAXCTL')}
|
||||||
summary_info += {'libudev': libudev.found()}
|
summary_info += {'libudev': libudev.found()}
|
||||||
summary_info += {'FUSE lseek': fuse_lseek.found()}
|
summary_info += {'FUSE lseek': fuse_lseek.found()}
|
||||||
|
summary_info += {'Multiprocess QEMU': config_host.has_key('CONFIG_MULTIPROCESS_ALLOWED')}
|
||||||
summary(summary_info, bool_yn: true, section: 'Dependencies')
|
summary(summary_info, bool_yn: true, section: 'Dependencies')
|
||||||
|
|
||||||
if not supported_cpus.contains(cpu)
|
if not supported_cpus.contains(cpu)
|
||||||
|
|
|
@ -20,7 +20,7 @@
|
||||||
legacy x86 software to communicate with an attached serial console as
|
legacy x86 software to communicate with an attached serial console as
|
||||||
if a video card were attached. The master sources reside in a subversion
|
if a video card were attached. The master sources reside in a subversion
|
||||||
repository at http://sgabios.googlecode.com/svn/trunk. A git mirror is
|
repository at http://sgabios.googlecode.com/svn/trunk. A git mirror is
|
||||||
available at https://git.qemu.org/git/sgabios.git.
|
available at https://gitlab.com/qemu-project/sgabios.git.
|
||||||
|
|
||||||
- The PXE roms come from the iPXE project. Built with BANNER_TIME 0.
|
- The PXE roms come from the iPXE project. Built with BANNER_TIME 0.
|
||||||
Sources available at http://ipxe.org. Vendor:Device ID -> ROM mapping:
|
Sources available at http://ipxe.org. Vendor:Device ID -> ROM mapping:
|
||||||
|
@ -37,7 +37,7 @@
|
||||||
|
|
||||||
- The u-boot binary for e500 comes from the upstream denx u-boot project where
|
- The u-boot binary for e500 comes from the upstream denx u-boot project where
|
||||||
it was compiled using the qemu-ppce500 target.
|
it was compiled using the qemu-ppce500 target.
|
||||||
A git mirror is available at: https://git.qemu.org/git/u-boot.git
|
A git mirror is available at: https://gitlab.com/qemu-project/u-boot.git
|
||||||
The hash used to compile the current version is: 2072e72
|
The hash used to compile the current version is: 2072e72
|
||||||
|
|
||||||
- Skiboot (https://github.com/open-power/skiboot/) is an OPAL
|
- Skiboot (https://github.com/open-power/skiboot/) is an OPAL
|
||||||
|
|
|
@ -1377,7 +1377,7 @@ sub vcs_exists {
|
||||||
warn("$P: No supported VCS found. Add --nogit to options?\n");
|
warn("$P: No supported VCS found. Add --nogit to options?\n");
|
||||||
warn("Using a git repository produces better results.\n");
|
warn("Using a git repository produces better results.\n");
|
||||||
warn("Try latest git repository using:\n");
|
warn("Try latest git repository using:\n");
|
||||||
warn("git clone https://git.qemu.org/git/qemu.git\n");
|
warn("git clone https://gitlab.com/qemu-project/qemu.git\n");
|
||||||
$printed_novcs = 1;
|
$printed_novcs = 1;
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
@ -1612,6 +1612,7 @@ void memory_region_init_ram_from_fd(MemoryRegion *mr,
|
||||||
uint64_t size,
|
uint64_t size,
|
||||||
bool share,
|
bool share,
|
||||||
int fd,
|
int fd,
|
||||||
|
ram_addr_t offset,
|
||||||
Error **errp)
|
Error **errp)
|
||||||
{
|
{
|
||||||
Error *err = NULL;
|
Error *err = NULL;
|
||||||
|
@ -1621,7 +1622,7 @@ void memory_region_init_ram_from_fd(MemoryRegion *mr,
|
||||||
mr->destructor = memory_region_destructor_ram;
|
mr->destructor = memory_region_destructor_ram;
|
||||||
mr->ram_block = qemu_ram_alloc_from_fd(size, mr,
|
mr->ram_block = qemu_ram_alloc_from_fd(size, mr,
|
||||||
share ? RAM_SHARED : 0,
|
share ? RAM_SHARED : 0,
|
||||||
fd, false, &err);
|
fd, offset, false, &err);
|
||||||
if (err) {
|
if (err) {
|
||||||
mr->size = int128_zero();
|
mr->size = int128_zero();
|
||||||
object_unparent(OBJECT(mr));
|
object_unparent(OBJECT(mr));
|
||||||
|
|
|
@ -1543,6 +1543,7 @@ static void *file_ram_alloc(RAMBlock *block,
|
||||||
int fd,
|
int fd,
|
||||||
bool readonly,
|
bool readonly,
|
||||||
bool truncate,
|
bool truncate,
|
||||||
|
off_t offset,
|
||||||
Error **errp)
|
Error **errp)
|
||||||
{
|
{
|
||||||
void *area;
|
void *area;
|
||||||
|
@ -1593,7 +1594,8 @@ static void *file_ram_alloc(RAMBlock *block,
|
||||||
}
|
}
|
||||||
|
|
||||||
area = qemu_ram_mmap(fd, memory, block->mr->align, readonly,
|
area = qemu_ram_mmap(fd, memory, block->mr->align, readonly,
|
||||||
block->flags & RAM_SHARED, block->flags & RAM_PMEM);
|
block->flags & RAM_SHARED, block->flags & RAM_PMEM,
|
||||||
|
offset);
|
||||||
if (area == MAP_FAILED) {
|
if (area == MAP_FAILED) {
|
||||||
error_setg_errno(errp, errno,
|
error_setg_errno(errp, errno,
|
||||||
"unable to map backing store for guest RAM");
|
"unable to map backing store for guest RAM");
|
||||||
|
@ -2024,8 +2026,8 @@ static void ram_block_add(RAMBlock *new_block, Error **errp, bool shared)
|
||||||
|
|
||||||
#ifdef CONFIG_POSIX
|
#ifdef CONFIG_POSIX
|
||||||
RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
|
RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
|
||||||
uint32_t ram_flags, int fd, bool readonly,
|
uint32_t ram_flags, int fd, off_t offset,
|
||||||
Error **errp)
|
bool readonly, Error **errp)
|
||||||
{
|
{
|
||||||
RAMBlock *new_block;
|
RAMBlock *new_block;
|
||||||
Error *local_err = NULL;
|
Error *local_err = NULL;
|
||||||
|
@ -2079,7 +2081,7 @@ RAMBlock *qemu_ram_alloc_from_fd(ram_addr_t size, MemoryRegion *mr,
|
||||||
new_block->max_length = size;
|
new_block->max_length = size;
|
||||||
new_block->flags = ram_flags;
|
new_block->flags = ram_flags;
|
||||||
new_block->host = file_ram_alloc(new_block, size, fd, readonly,
|
new_block->host = file_ram_alloc(new_block, size, fd, readonly,
|
||||||
!file_size, errp);
|
!file_size, offset, errp);
|
||||||
if (!new_block->host) {
|
if (!new_block->host) {
|
||||||
g_free(new_block);
|
g_free(new_block);
|
||||||
return NULL;
|
return NULL;
|
||||||
|
@ -2110,7 +2112,7 @@ RAMBlock *qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
|
||||||
return NULL;
|
return NULL;
|
||||||
}
|
}
|
||||||
|
|
||||||
block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, readonly, errp);
|
block = qemu_ram_alloc_from_fd(size, mr, ram_flags, fd, 0, readonly, errp);
|
||||||
if (!block) {
|
if (!block) {
|
||||||
if (created) {
|
if (created) {
|
||||||
unlink(mem_path);
|
unlink(mem_path);
|
||||||
|
|
|
@ -87,7 +87,8 @@ void *qemu_ram_mmap(int fd,
|
||||||
size_t align,
|
size_t align,
|
||||||
bool readonly,
|
bool readonly,
|
||||||
bool shared,
|
bool shared,
|
||||||
bool is_pmem)
|
bool is_pmem,
|
||||||
|
off_t map_offset)
|
||||||
{
|
{
|
||||||
int prot;
|
int prot;
|
||||||
int flags;
|
int flags;
|
||||||
|
@ -150,7 +151,8 @@ void *qemu_ram_mmap(int fd,
|
||||||
|
|
||||||
prot = PROT_READ | (readonly ? 0 : PROT_WRITE);
|
prot = PROT_READ | (readonly ? 0 : PROT_WRITE);
|
||||||
|
|
||||||
ptr = mmap(guardptr + offset, size, prot, flags | map_sync_flags, fd, 0);
|
ptr = mmap(guardptr + offset, size, prot,
|
||||||
|
flags | map_sync_flags, fd, map_offset);
|
||||||
|
|
||||||
if (ptr == MAP_FAILED && map_sync_flags) {
|
if (ptr == MAP_FAILED && map_sync_flags) {
|
||||||
if (errno == ENOTSUP) {
|
if (errno == ENOTSUP) {
|
||||||
|
@ -174,7 +176,7 @@ void *qemu_ram_mmap(int fd,
|
||||||
* if map failed with MAP_SHARED_VALIDATE | MAP_SYNC,
|
* if map failed with MAP_SHARED_VALIDATE | MAP_SYNC,
|
||||||
* we will remove these flags to handle compatibility.
|
* we will remove these flags to handle compatibility.
|
||||||
*/
|
*/
|
||||||
ptr = mmap(guardptr + offset, size, prot, flags, fd, 0);
|
ptr = mmap(guardptr + offset, size, prot, flags, fd, map_offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (ptr == MAP_FAILED) {
|
if (ptr == MAP_FAILED) {
|
||||||
|
|
|
@ -230,7 +230,7 @@ void *qemu_memalign(size_t alignment, size_t size)
|
||||||
void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment, bool shared)
|
void *qemu_anon_ram_alloc(size_t size, uint64_t *alignment, bool shared)
|
||||||
{
|
{
|
||||||
size_t align = QEMU_VMALLOC_ALIGN;
|
size_t align = QEMU_VMALLOC_ALIGN;
|
||||||
void *ptr = qemu_ram_mmap(-1, size, align, false, shared, false);
|
void *ptr = qemu_ram_mmap(-1, size, align, false, shared, false, 0);
|
||||||
|
|
||||||
if (ptr == MAP_FAILED) {
|
if (ptr == MAP_FAILED) {
|
||||||
return NULL;
|
return NULL;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue