mirror of
https://github.com/Motorhead1991/qemu.git
synced 2025-07-29 05:13:54 -06:00
* target/i386/kvm: support for reading RAPL MSRs using a helper program
* hpet: emulation improvements -----BEGIN PGP SIGNATURE----- iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmaelL4UHHBib256aW5p QHJlZGhhdC5jb20ACgkQv/vSX3jHroMXoQf+K77lNlHLETSgeeP3dr7yZPOmXjjN qFY/18jiyLw7MK1rZC09fF+n9SoaTH8JDKupt0z9M1R10HKHLIO04f8zDE+dOxaE Rou3yKnlTgFPGSoPPFr1n1JJfxtYlLZRoUzaAcHUaa4W7JR/OHJX90n1Rb9MXeDk jV6P0v1FWtIDdM6ERm9qBGoQdYhj6Ra2T4/NZKJFXwIhKEkxgu4yO7WXv8l0dxQz jE4fKotqAvrkYW1EsiVZm30lw/19duhvGiYeQXoYhk8KKXXjAbJMblLITSNWsCio 3l6Uud/lOxekkJDAq5nH3H9hCBm0WwvwL+0vRf3Mkr+/xRGvrhtmUdp8NQ== =00mB -----END PGP SIGNATURE----- Merge tag 'for-upstream' of https://gitlab.com/bonzini/qemu into staging * target/i386/kvm: support for reading RAPL MSRs using a helper program * hpet: emulation improvements # -----BEGIN PGP SIGNATURE----- # # iQFIBAABCAAyFiEE8TM4V0tmI4mGbHaCv/vSX3jHroMFAmaelL4UHHBib256aW5p # QHJlZGhhdC5jb20ACgkQv/vSX3jHroMXoQf+K77lNlHLETSgeeP3dr7yZPOmXjjN # qFY/18jiyLw7MK1rZC09fF+n9SoaTH8JDKupt0z9M1R10HKHLIO04f8zDE+dOxaE # Rou3yKnlTgFPGSoPPFr1n1JJfxtYlLZRoUzaAcHUaa4W7JR/OHJX90n1Rb9MXeDk # jV6P0v1FWtIDdM6ERm9qBGoQdYhj6Ra2T4/NZKJFXwIhKEkxgu4yO7WXv8l0dxQz # jE4fKotqAvrkYW1EsiVZm30lw/19duhvGiYeQXoYhk8KKXXjAbJMblLITSNWsCio # 3l6Uud/lOxekkJDAq5nH3H9hCBm0WwvwL+0vRf3Mkr+/xRGvrhtmUdp8NQ== # =00mB # -----END PGP SIGNATURE----- # gpg: Signature made Tue 23 Jul 2024 03:19:58 AM AEST # gpg: using RSA key F13338574B662389866C7682BFFBD25F78C7AE83 # gpg: issuer "pbonzini@redhat.com" # gpg: Good signature from "Paolo Bonzini <bonzini@gnu.org>" [full] # gpg: aka "Paolo Bonzini <pbonzini@redhat.com>" [full] * tag 'for-upstream' of https://gitlab.com/bonzini/qemu: hpet: avoid timer storms on periodic timers hpet: store full 64-bit target value of the counter hpet: accept 64-bit reads and writes hpet: place read-only bits directly in "new_val" hpet: remove unnecessary variable "index" hpet: ignore high bits of comparator in 32-bit mode hpet: fix and cleanup persistence of interrupt status Add support for RAPL MSRs in KVM/Qemu tools: build qemu-vmsr-helper qio: add 
support for SO_PEERCRED for socket channel target/i386: do not crash if microvm guest uses SGX CPUID leaves Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
This commit is contained in:
commit
43f59bf765
23 changed files with 1995 additions and 185 deletions
|
@ -140,6 +140,7 @@ F: docs/system/target-i386*
|
||||||
F: target/i386/*.[ch]
|
F: target/i386/*.[ch]
|
||||||
F: target/i386/Kconfig
|
F: target/i386/Kconfig
|
||||||
F: target/i386/meson.build
|
F: target/i386/meson.build
|
||||||
|
F: tools/i386/
|
||||||
|
|
||||||
Guest CPU cores (TCG)
|
Guest CPU cores (TCG)
|
||||||
---------------------
|
---------------------
|
||||||
|
|
|
@ -3776,6 +3776,21 @@ static void kvm_set_device(Object *obj,
|
||||||
s->device = g_strdup(value);
|
s->device = g_strdup(value);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void kvm_set_kvm_rapl(Object *obj, bool value, Error **errp)
|
||||||
|
{
|
||||||
|
KVMState *s = KVM_STATE(obj);
|
||||||
|
s->msr_energy.enable = value;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void kvm_set_kvm_rapl_socket_path(Object *obj,
|
||||||
|
const char *str,
|
||||||
|
Error **errp)
|
||||||
|
{
|
||||||
|
KVMState *s = KVM_STATE(obj);
|
||||||
|
g_free(s->msr_energy.socket_path);
|
||||||
|
s->msr_energy.socket_path = g_strdup(str);
|
||||||
|
}
|
||||||
|
|
||||||
static void kvm_accel_instance_init(Object *obj)
|
static void kvm_accel_instance_init(Object *obj)
|
||||||
{
|
{
|
||||||
KVMState *s = KVM_STATE(obj);
|
KVMState *s = KVM_STATE(obj);
|
||||||
|
@ -3795,6 +3810,7 @@ static void kvm_accel_instance_init(Object *obj)
|
||||||
s->xen_gnttab_max_frames = 64;
|
s->xen_gnttab_max_frames = 64;
|
||||||
s->xen_evtchn_max_pirq = 256;
|
s->xen_evtchn_max_pirq = 256;
|
||||||
s->device = NULL;
|
s->device = NULL;
|
||||||
|
s->msr_energy.enable = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -3839,6 +3855,17 @@ static void kvm_accel_class_init(ObjectClass *oc, void *data)
|
||||||
object_class_property_set_description(oc, "device",
|
object_class_property_set_description(oc, "device",
|
||||||
"Path to the device node to use (default: /dev/kvm)");
|
"Path to the device node to use (default: /dev/kvm)");
|
||||||
|
|
||||||
|
object_class_property_add_bool(oc, "rapl",
|
||||||
|
NULL,
|
||||||
|
kvm_set_kvm_rapl);
|
||||||
|
object_class_property_set_description(oc, "rapl",
|
||||||
|
"Allow energy related MSRs for RAPL interface in Guest");
|
||||||
|
|
||||||
|
object_class_property_add_str(oc, "rapl-helper-socket", NULL,
|
||||||
|
kvm_set_kvm_rapl_socket_path);
|
||||||
|
object_class_property_set_description(oc, "rapl-helper-socket",
|
||||||
|
"Socket Path for comminucating with the Virtual MSR helper daemon");
|
||||||
|
|
||||||
kvm_arch_accel_class_init(oc);
|
kvm_arch_accel_class_init(oc);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
15
contrib/systemd/qemu-vmsr-helper.service
Normal file
15
contrib/systemd/qemu-vmsr-helper.service
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
[Unit]
|
||||||
|
Description=Virtual RAPL MSR Daemon for QEMU
|
||||||
|
|
||||||
|
[Service]
|
||||||
|
WorkingDirectory=/tmp
|
||||||
|
Type=simple
|
||||||
|
ExecStart=/usr/bin/qemu-vmsr-helper
|
||||||
|
PrivateTmp=yes
|
||||||
|
ProtectSystem=strict
|
||||||
|
ReadWritePaths=/var/run
|
||||||
|
RestrictAddressFamilies=AF_UNIX
|
||||||
|
Restart=always
|
||||||
|
RestartSec=0
|
||||||
|
|
||||||
|
[Install]
|
9
contrib/systemd/qemu-vmsr-helper.socket
Normal file
9
contrib/systemd/qemu-vmsr-helper.socket
Normal file
|
@ -0,0 +1,9 @@
|
||||||
|
[Unit]
|
||||||
|
Description=Virtual RAPL MSR helper for QEMU
|
||||||
|
|
||||||
|
[Socket]
|
||||||
|
ListenStream=/run/qemu-vmsr-helper.sock
|
||||||
|
SocketMode=0600
|
||||||
|
|
||||||
|
[Install]
|
||||||
|
WantedBy=multi-user.target
|
|
@ -34,3 +34,4 @@ guest hardware that is specific to QEMU.
|
||||||
virt-ctlr
|
virt-ctlr
|
||||||
vmcoreinfo
|
vmcoreinfo
|
||||||
vmgenid
|
vmgenid
|
||||||
|
rapl-msr
|
||||||
|
|
155
docs/specs/rapl-msr.rst
Normal file
155
docs/specs/rapl-msr.rst
Normal file
|
@ -0,0 +1,155 @@
|
||||||
|
================
|
||||||
|
RAPL MSR support
|
||||||
|
================
|
||||||
|
|
||||||
|
The RAPL interface (Running Average Power Limit) advertises the accumulated
|
||||||
|
energy consumption of various power domains (e.g. CPU packages, DRAM, etc.).
|
||||||
|
|
||||||
|
The consumption is reported via MSRs (model specific registers) like
|
||||||
|
MSR_PKG_ENERGY_STATUS for the CPU package power domain. These MSRs are 64-bit
|
||||||
|
registers that represent the accumulated energy consumption in micro Joules.
|
||||||
|
|
||||||
|
Thanks to the MSR Filtering patch [#a]_ not all MSRs are handled by KVM. Some
|
||||||
|
of them can now be handled by the userspace (QEMU). It uses a mechanism called
|
||||||
|
"MSR filtering" where a list of MSRs is given at init time of a VM to KVM so
|
||||||
|
that a callback is put in place. The design of this patch uses only this
|
||||||
|
mechanism for handling the MSRs between guest/host.
|
||||||
|
|
||||||
|
At the moment the following MSRs are involved:
|
||||||
|
|
||||||
|
.. code:: C
|
||||||
|
|
||||||
|
#define MSR_RAPL_POWER_UNIT 0x00000606
|
||||||
|
#define MSR_PKG_POWER_LIMIT 0x00000610
|
||||||
|
#define MSR_PKG_ENERGY_STATUS 0x00000611
|
||||||
|
#define MSR_PKG_POWER_INFO 0x00000614
|
||||||
|
|
||||||
|
The ``*_POWER_UNIT``, ``*_POWER_LIMIT``, ``*_POWER_INFO`` are part of the RAPL
|
||||||
|
spec and specify the power limit of the package, provide a range of parameters (min
|
||||||
|
power, max power, ...) and also the information of the multiplier for the energy
|
||||||
|
counter to calculate the power. Those MSRs are populated once at the beginning
|
||||||
|
by reading the host CPU MSRs and are given back to the guest 1:1 when
|
||||||
|
requested.
|
||||||
|
|
||||||
|
The MSR_PKG_ENERGY_STATUS is a counter; it represents the total amount of
|
||||||
|
energy consumed since the last time the register was cleared. If you multiply
|
||||||
|
it with the UNIT provided above you'll get the energy in micro-joules. This
|
||||||
|
counter is always increasing and it increases more or less quickly depending on
|
||||||
|
the consumption of the package. This counter is supposed to overflow at some
|
||||||
|
point.
|
||||||
|
|
||||||
|
Each core belonging to the same Package reading the MSR_PKG_ENERGY_STATUS (i.e.
|
||||||
|
"rdmsr 0x611") will retrieve the same value. The value represents the energy
|
||||||
|
for the whole package. Whichever core reads it will get the same value and a
|
||||||
|
core that belongs to PKG-0 will not be able to get the value of PKG-1 and
|
||||||
|
vice-versa.
|
||||||
|
|
||||||
|
High level implementation
|
||||||
|
-------------------------
|
||||||
|
|
||||||
|
In order to update the value of the virtual MSR, a QEMU thread is created.
|
||||||
|
The thread is basically just an infinite loop that does:
|
||||||
|
|
||||||
|
1. Snapshot of the time metrics of all QEMU threads (Time spent scheduled in
|
||||||
|
Userspace and System)
|
||||||
|
|
||||||
|
2. Snapshot of the actual MSR_PKG_ENERGY_STATUS counter of all packages where
|
||||||
|
the QEMU threads are running on.
|
||||||
|
|
||||||
|
3. Sleep for 1 second - During this pause the vcpu and other non-vcpu threads
|
||||||
|
will do what they have to do and so the energy counter will increase.
|
||||||
|
|
||||||
|
4. Repeat 2. and 3. and calculate the delta of every metric representing the
|
||||||
|
time spent scheduled for each QEMU thread *and* the energy spent by the
|
||||||
|
packages during the pause.
|
||||||
|
|
||||||
|
5. Filter the vcpu threads and the non-vcpu threads.
|
||||||
|
|
||||||
|
6. Retrieve the topology of the Virtual Machine. This helps identify which
|
||||||
|
vCPU is running on which virtual package.
|
||||||
|
|
||||||
|
7. The total energy spent by the non-vcpu threads is divided by the number
|
||||||
|
of vcpu threads so that each vcpu thread will get an equal part of the
|
||||||
|
energy spent by the QEMU workers.
|
||||||
|
|
||||||
|
8. Calculate the ratio of energy spent per vcpu thread.
|
||||||
|
|
||||||
|
9. Calculate the energy for each virtual package.
|
||||||
|
|
||||||
|
10. The virtual MSRs are updated for each virtual package. Each vCPU that
|
||||||
|
belongs to the same package will return the same value when accessing the
|
||||||
|
MSR.
|
||||||
|
|
||||||
|
11. Loop back to 1.
|
||||||
|
|
||||||
|
Ratio calculation
|
||||||
|
-----------------
|
||||||
|
|
||||||
|
In Linux, a process has an execution time associated with it. The scheduler is
|
||||||
|
dividing the time in clock ticks. The number of clock ticks per second can be
|
||||||
|
found with the sysconf() function. A typical value of clock ticks per second is
|
||||||
|
100. So a core can run a process at the maximum of 100 ticks per second. If a
|
||||||
|
package has 4 cores, 400 ticks maximum can be scheduled on all the cores
|
||||||
|
of the package for a period of 1 second.
|
||||||
|
|
||||||
|
The /proc/[pid]/stat [#b]_ is a procfs file that can give the executed time of a
|
||||||
|
process with the [pid] as the process ID. It gives the amount of ticks the
|
||||||
|
process has been scheduled in userspace (utime) and kernel space (stime).
|
||||||
|
|
||||||
|
By reading those metrics for a thread, one can calculate the ratio of time the
|
||||||
|
package has spent executing the thread.
|
||||||
|
|
||||||
|
Example:
|
||||||
|
|
||||||
|
A 4 cores package can schedule a maximum of 400 ticks per second with 100 ticks
|
||||||
|
per second per core. If a thread was scheduled for 100 ticks within a second
|
||||||
|
on this package, that means the thread has been scheduled for 1/4 of the whole
|
||||||
|
package. With that, the calculation of the energy spent by the thread on this
|
||||||
|
package during this whole second is 1/4 of the total energy spent by the
|
||||||
|
package.
|
||||||
|
|
||||||
|
Usage
|
||||||
|
-----
|
||||||
|
|
||||||
|
Currently this feature is only working on an Intel CPU that has the RAPL driver
|
||||||
|
mounted and available in the sysfs. If not, QEMU fails at start-up.
|
||||||
|
|
||||||
|
This feature is activated with -accel
|
||||||
|
kvm,rapl=true,rapl-helper-socket=/path/sock.sock
|
||||||
|
|
||||||
|
It is important that the socket path is the same as the one
|
||||||
|
:program:`qemu-vmsr-helper` is listening to.
|
||||||
|
|
||||||
|
qemu-vmsr-helper
|
||||||
|
----------------
|
||||||
|
|
||||||
|
The qemu-vmsr-helper is working very much like the qemu-pr-helper. Instead of
|
||||||
|
making persistent reservation, qemu-vmsr-helper is here to overcome the
|
||||||
|
CVE-2020-8694 mitigation, which removed user access to the RAPL MSR attributes.
|
||||||
|
|
||||||
|
A socket communication is established between QEMU processes that have the RAPL
|
||||||
|
MSR support activated and the qemu-vmsr-helper. A systemd service and socket
|
||||||
|
activation is provided in contrib/systemd/qemu-vmsr-helper.(service/socket).
|
||||||
|
|
||||||
|
The systemd socket uses 600, like contrib/systemd/qemu-pr-helper.socket. The
|
||||||
|
socket can be passed via SCM_RIGHTS by libvirt, or its permissions can be
|
||||||
|
changed (e.g. 660 and root:kvm for a Debian system for example). Libvirt could
|
||||||
|
also start a separate helper if needed. All in all, the policy is left to the
|
||||||
|
user.
|
||||||
|
|
||||||
|
See the qemu-pr-helper documentation or manpage for further details.
|
||||||
|
|
||||||
|
Current Limitations
|
||||||
|
-------------------
|
||||||
|
|
||||||
|
- Works only on Intel host CPUs because AMD CPUs are using different MSR
|
||||||
|
addresses.
|
||||||
|
|
||||||
|
- Only the Package Power-Plane (MSR_PKG_ENERGY_STATUS) is reported at the
|
||||||
|
moment.
|
||||||
|
|
||||||
|
References
|
||||||
|
----------
|
||||||
|
|
||||||
|
.. [#a] https://patchwork.kernel.org/project/kvm/patch/20200916202951.23760-7-graf@amazon.com/
|
||||||
|
.. [#b] https://man7.org/linux/man-pages/man5/proc.5.html
|
|
@ -16,3 +16,4 @@ command line utilities and other standalone programs.
|
||||||
qemu-pr-helper
|
qemu-pr-helper
|
||||||
qemu-trace-stap
|
qemu-trace-stap
|
||||||
virtfs-proxy-helper
|
virtfs-proxy-helper
|
||||||
|
qemu-vmsr-helper
|
||||||
|
|
89
docs/tools/qemu-vmsr-helper.rst
Normal file
89
docs/tools/qemu-vmsr-helper.rst
Normal file
|
@ -0,0 +1,89 @@
|
||||||
|
==================================
|
||||||
|
QEMU virtual RAPL MSR helper
|
||||||
|
==================================
|
||||||
|
|
||||||
|
Synopsis
|
||||||
|
--------
|
||||||
|
|
||||||
|
**qemu-vmsr-helper** [*OPTION*]
|
||||||
|
|
||||||
|
Description
|
||||||
|
-----------
|
||||||
|
|
||||||
|
Implements the virtual RAPL MSR helper for QEMU.
|
||||||
|
|
||||||
|
Accessing the RAPL (Running Average Power Limit) MSR enables the RAPL powercap
|
||||||
|
driver to advertise and monitor the power consumption or accumulated energy
|
||||||
|
consumption of different power domains, such as CPU packages, DRAM, and other
|
||||||
|
components when available.
|
||||||
|
|
||||||
|
However, those registers are only accessible with privileged access (CAP_SYS_RAWIO).
|
||||||
|
QEMU can use an external helper to access those privileged registers.
|
||||||
|
|
||||||
|
:program:`qemu-vmsr-helper` is that external helper; it creates a listener
|
||||||
|
socket which will accept incoming connections for communication with QEMU.
|
||||||
|
|
||||||
|
If you want to run VMs in a setup like this, this helper should be started as a
|
||||||
|
system service, and you should read the QEMU manual section on "RAPL MSR
|
||||||
|
support" to find out how to configure QEMU to connect to the socket created by
|
||||||
|
:program:`qemu-vmsr-helper`.
|
||||||
|
|
||||||
|
After connecting to the socket, :program:`qemu-vmsr-helper` can
|
||||||
|
optionally drop root privileges, except for those capabilities that
|
||||||
|
are needed for its operation.
|
||||||
|
|
||||||
|
:program:`qemu-vmsr-helper` can also use the systemd socket activation
|
||||||
|
protocol. In this case, the systemd socket unit should specify a
|
||||||
|
Unix stream socket, like this::
|
||||||
|
|
||||||
|
[Socket]
|
||||||
|
ListenStream=/var/run/qemu-vmsr-helper.sock
|
||||||
|
|
||||||
|
Options
|
||||||
|
-------
|
||||||
|
|
||||||
|
.. program:: qemu-vmsr-helper
|
||||||
|
|
||||||
|
.. option:: -d, --daemon
|
||||||
|
|
||||||
|
run in the background (and create a PID file)
|
||||||
|
|
||||||
|
.. option:: -q, --quiet
|
||||||
|
|
||||||
|
decrease verbosity
|
||||||
|
|
||||||
|
.. option:: -v, --verbose
|
||||||
|
|
||||||
|
increase verbosity
|
||||||
|
|
||||||
|
.. option:: -f, --pidfile=PATH
|
||||||
|
|
||||||
|
PID file when running as a daemon. By default the PID file
|
||||||
|
is created in the system runtime state directory, for example
|
||||||
|
:file:`/var/run/qemu-vmsr-helper.pid`.
|
||||||
|
|
||||||
|
.. option:: -k, --socket=PATH
|
||||||
|
|
||||||
|
path to the socket. By default the socket is created in
|
||||||
|
the system runtime state directory, for example
|
||||||
|
:file:`/var/run/qemu-vmsr-helper.sock`.
|
||||||
|
|
||||||
|
.. option:: -T, --trace [[enable=]PATTERN][,events=FILE][,file=FILE]
|
||||||
|
|
||||||
|
.. include:: ../qemu-option-trace.rst.inc
|
||||||
|
|
||||||
|
.. option:: -u, --user=USER
|
||||||
|
|
||||||
|
user to drop privileges to
|
||||||
|
|
||||||
|
.. option:: -g, --group=GROUP
|
||||||
|
|
||||||
|
group to drop privileges to
|
||||||
|
|
||||||
|
.. option:: -h, --help
|
||||||
|
|
||||||
|
Display a help message and exit.
|
||||||
|
|
||||||
|
.. option:: -V, --version
|
||||||
|
|
||||||
|
Display version information and exit.
|
|
@ -268,10 +268,12 @@ void hmp_info_sgx(Monitor *mon, const QDict *qdict)
|
||||||
|
|
||||||
bool sgx_epc_get_section(int section_nr, uint64_t *addr, uint64_t *size)
|
bool sgx_epc_get_section(int section_nr, uint64_t *addr, uint64_t *size)
|
||||||
{
|
{
|
||||||
PCMachineState *pcms = PC_MACHINE(qdev_get_machine());
|
PCMachineState *pcms =
|
||||||
|
(PCMachineState *)object_dynamic_cast(qdev_get_machine(),
|
||||||
|
TYPE_PC_MACHINE);
|
||||||
SGXEPCDevice *epc;
|
SGXEPCDevice *epc;
|
||||||
|
|
||||||
if (pcms->sgx_epc.size == 0 || pcms->sgx_epc.nr_sections <= section_nr) {
|
if (!pcms || pcms->sgx_epc.size == 0 || pcms->sgx_epc.nr_sections <= section_nr) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
329
hw/timer/hpet.c
329
hw/timer/hpet.c
|
@ -54,10 +54,12 @@ typedef struct HPETTimer { /* timers */
|
||||||
uint64_t cmp; /* comparator */
|
uint64_t cmp; /* comparator */
|
||||||
uint64_t fsb; /* FSB route */
|
uint64_t fsb; /* FSB route */
|
||||||
/* Hidden register state */
|
/* Hidden register state */
|
||||||
|
uint64_t cmp64; /* comparator (extended to counter width) */
|
||||||
uint64_t period; /* Last value written to comparator */
|
uint64_t period; /* Last value written to comparator */
|
||||||
uint8_t wrap_flag; /* timer pop will indicate wrap for one-shot 32-bit
|
uint8_t wrap_flag; /* timer pop will indicate wrap for one-shot 32-bit
|
||||||
* mode. Next pop will be actual timer expiration.
|
* mode. Next pop will be actual timer expiration.
|
||||||
*/
|
*/
|
||||||
|
uint64_t last; /* last value armed, to avoid timer storms */
|
||||||
} HPETTimer;
|
} HPETTimer;
|
||||||
|
|
||||||
struct HPETState {
|
struct HPETState {
|
||||||
|
@ -115,11 +117,6 @@ static uint32_t timer_enabled(HPETTimer *t)
|
||||||
}
|
}
|
||||||
|
|
||||||
static uint32_t hpet_time_after(uint64_t a, uint64_t b)
|
static uint32_t hpet_time_after(uint64_t a, uint64_t b)
|
||||||
{
|
|
||||||
return ((int32_t)(b - a) < 0);
|
|
||||||
}
|
|
||||||
|
|
||||||
static uint32_t hpet_time_after64(uint64_t a, uint64_t b)
|
|
||||||
{
|
{
|
||||||
return ((int64_t)(b - a) < 0);
|
return ((int64_t)(b - a) < 0);
|
||||||
}
|
}
|
||||||
|
@ -156,29 +153,34 @@ static uint64_t hpet_get_ticks(HPETState *s)
|
||||||
return ns_to_ticks(qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + s->hpet_offset);
|
return ns_to_ticks(qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + s->hpet_offset);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
static uint64_t hpet_get_ns(HPETState *s, uint64_t tick)
|
||||||
* calculate diff between comparator value and current ticks
|
|
||||||
*/
|
|
||||||
static inline uint64_t hpet_calculate_diff(HPETTimer *t, uint64_t current)
|
|
||||||
{
|
{
|
||||||
|
return ticks_to_ns(tick) - s->hpet_offset;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* calculate next value of the general counter that matches the
|
||||||
|
* target (either entirely, or the low 32-bit only depending on
|
||||||
|
* the timer mode).
|
||||||
|
*/
|
||||||
|
static uint64_t hpet_calculate_cmp64(HPETTimer *t, uint64_t cur_tick, uint64_t target)
|
||||||
|
{
|
||||||
if (t->config & HPET_TN_32BIT) {
|
if (t->config & HPET_TN_32BIT) {
|
||||||
uint32_t diff, cmp;
|
uint64_t result = deposit64(cur_tick, 0, 32, target);
|
||||||
|
if (result < cur_tick) {
|
||||||
cmp = (uint32_t)t->cmp;
|
result += 0x100000000ULL;
|
||||||
diff = cmp - (uint32_t)current;
|
}
|
||||||
diff = (int32_t)diff > 0 ? diff : (uint32_t)1;
|
return result;
|
||||||
return (uint64_t)diff;
|
|
||||||
} else {
|
} else {
|
||||||
uint64_t diff, cmp;
|
return target;
|
||||||
|
|
||||||
cmp = t->cmp;
|
|
||||||
diff = cmp - current;
|
|
||||||
diff = (int64_t)diff > 0 ? diff : (uint64_t)1;
|
|
||||||
return diff;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static uint64_t hpet_next_wrap(uint64_t cur_tick)
|
||||||
|
{
|
||||||
|
return (cur_tick | 0xffffffffU) + 1;
|
||||||
|
}
|
||||||
|
|
||||||
static void update_irq(struct HPETTimer *timer, int set)
|
static void update_irq(struct HPETTimer *timer, int set)
|
||||||
{
|
{
|
||||||
uint64_t mask;
|
uint64_t mask;
|
||||||
|
@ -196,21 +198,31 @@ static void update_irq(struct HPETTimer *timer, int set)
|
||||||
}
|
}
|
||||||
s = timer->state;
|
s = timer->state;
|
||||||
mask = 1 << timer->tn;
|
mask = 1 << timer->tn;
|
||||||
if (!set || !timer_enabled(timer) || !hpet_enabled(timer->state)) {
|
|
||||||
|
if (set && (timer->config & HPET_TN_TYPE_LEVEL)) {
|
||||||
|
/*
|
||||||
|
* If HPET_TN_ENABLE bit is 0, "the timer will still operate and
|
||||||
|
* generate appropriate status bits, but will not cause an interrupt"
|
||||||
|
*/
|
||||||
|
s->isr |= mask;
|
||||||
|
} else {
|
||||||
s->isr &= ~mask;
|
s->isr &= ~mask;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (set && timer_enabled(timer) && hpet_enabled(s)) {
|
||||||
|
if (timer_fsb_route(timer)) {
|
||||||
|
address_space_stl_le(&address_space_memory, timer->fsb >> 32,
|
||||||
|
timer->fsb & 0xffffffff, MEMTXATTRS_UNSPECIFIED,
|
||||||
|
NULL);
|
||||||
|
} else if (timer->config & HPET_TN_TYPE_LEVEL) {
|
||||||
|
qemu_irq_raise(s->irqs[route]);
|
||||||
|
} else {
|
||||||
|
qemu_irq_pulse(s->irqs[route]);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
if (!timer_fsb_route(timer)) {
|
if (!timer_fsb_route(timer)) {
|
||||||
qemu_irq_lower(s->irqs[route]);
|
qemu_irq_lower(s->irqs[route]);
|
||||||
}
|
}
|
||||||
} else if (timer_fsb_route(timer)) {
|
|
||||||
address_space_stl_le(&address_space_memory, timer->fsb >> 32,
|
|
||||||
timer->fsb & 0xffffffff, MEMTXATTRS_UNSPECIFIED,
|
|
||||||
NULL);
|
|
||||||
} else if (timer->config & HPET_TN_TYPE_LEVEL) {
|
|
||||||
s->isr |= mask;
|
|
||||||
qemu_irq_raise(s->irqs[route]);
|
|
||||||
} else {
|
|
||||||
s->isr &= ~mask;
|
|
||||||
qemu_irq_pulse(s->irqs[route]);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -250,7 +262,13 @@ static bool hpet_validate_num_timers(void *opaque, int version_id)
|
||||||
static int hpet_post_load(void *opaque, int version_id)
|
static int hpet_post_load(void *opaque, int version_id)
|
||||||
{
|
{
|
||||||
HPETState *s = opaque;
|
HPETState *s = opaque;
|
||||||
|
int i;
|
||||||
|
|
||||||
|
for (i = 0; i < s->num_timers; i++) {
|
||||||
|
HPETTimer *t = &s->timer[i];
|
||||||
|
t->cmp64 = hpet_calculate_cmp64(t, s->hpet_counter, t->cmp);
|
||||||
|
t->last = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) - NANOSECONDS_PER_SECOND;
|
||||||
|
}
|
||||||
/* Recalculate the offset between the main counter and guest time */
|
/* Recalculate the offset between the main counter and guest time */
|
||||||
if (!s->hpet_offset_saved) {
|
if (!s->hpet_offset_saved) {
|
||||||
s->hpet_offset = ticks_to_ns(s->hpet_counter)
|
s->hpet_offset = ticks_to_ns(s->hpet_counter)
|
||||||
|
@ -346,14 +364,17 @@ static const VMStateDescription vmstate_hpet = {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
static void hpet_arm(HPETTimer *t, uint64_t ticks)
|
static void hpet_arm(HPETTimer *t, uint64_t tick)
|
||||||
{
|
{
|
||||||
if (ticks < ns_to_ticks(INT64_MAX / 2)) {
|
uint64_t ns = hpet_get_ns(t->state, tick);
|
||||||
timer_mod(t->qemu_timer,
|
|
||||||
qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + ticks_to_ns(ticks));
|
/* Clamp period to reasonable min value (1 us) */
|
||||||
} else {
|
if (timer_is_periodic(t) && ns - t->last < 1000) {
|
||||||
timer_del(t->qemu_timer);
|
ns = t->last + 1000;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
t->last = ns;
|
||||||
|
timer_mod(t->qemu_timer, ns);
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -362,72 +383,68 @@ static void hpet_arm(HPETTimer *t, uint64_t ticks)
|
||||||
static void hpet_timer(void *opaque)
|
static void hpet_timer(void *opaque)
|
||||||
{
|
{
|
||||||
HPETTimer *t = opaque;
|
HPETTimer *t = opaque;
|
||||||
uint64_t diff;
|
|
||||||
|
|
||||||
uint64_t period = t->period;
|
uint64_t period = t->period;
|
||||||
uint64_t cur_tick = hpet_get_ticks(t->state);
|
uint64_t cur_tick = hpet_get_ticks(t->state);
|
||||||
|
|
||||||
if (timer_is_periodic(t) && period != 0) {
|
if (timer_is_periodic(t) && period != 0) {
|
||||||
|
while (hpet_time_after(cur_tick, t->cmp64)) {
|
||||||
|
t->cmp64 += period;
|
||||||
|
}
|
||||||
if (t->config & HPET_TN_32BIT) {
|
if (t->config & HPET_TN_32BIT) {
|
||||||
while (hpet_time_after(cur_tick, t->cmp)) {
|
t->cmp = (uint32_t)t->cmp64;
|
||||||
t->cmp = (uint32_t)(t->cmp + t->period);
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
while (hpet_time_after64(cur_tick, t->cmp)) {
|
t->cmp = t->cmp64;
|
||||||
t->cmp += period;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
diff = hpet_calculate_diff(t, cur_tick);
|
|
||||||
hpet_arm(t, diff);
|
|
||||||
} else if (t->config & HPET_TN_32BIT && !timer_is_periodic(t)) {
|
|
||||||
if (t->wrap_flag) {
|
|
||||||
diff = hpet_calculate_diff(t, cur_tick);
|
|
||||||
hpet_arm(t, diff);
|
|
||||||
t->wrap_flag = 0;
|
|
||||||
}
|
}
|
||||||
|
hpet_arm(t, t->cmp64);
|
||||||
|
} else if (t->wrap_flag) {
|
||||||
|
t->wrap_flag = 0;
|
||||||
|
hpet_arm(t, t->cmp64);
|
||||||
}
|
}
|
||||||
update_irq(t, 1);
|
update_irq(t, 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void hpet_set_timer(HPETTimer *t)
|
static void hpet_set_timer(HPETTimer *t)
|
||||||
{
|
{
|
||||||
uint64_t diff;
|
|
||||||
uint32_t wrap_diff; /* how many ticks until we wrap? */
|
|
||||||
uint64_t cur_tick = hpet_get_ticks(t->state);
|
uint64_t cur_tick = hpet_get_ticks(t->state);
|
||||||
|
|
||||||
/* whenever new timer is being set up, make sure wrap_flag is 0 */
|
|
||||||
t->wrap_flag = 0;
|
t->wrap_flag = 0;
|
||||||
diff = hpet_calculate_diff(t, cur_tick);
|
t->cmp64 = hpet_calculate_cmp64(t, cur_tick, t->cmp);
|
||||||
|
if (t->config & HPET_TN_32BIT) {
|
||||||
|
|
||||||
/* hpet spec says in one-shot 32-bit mode, generate an interrupt when
|
/* hpet spec says in one-shot 32-bit mode, generate an interrupt when
|
||||||
* counter wraps in addition to an interrupt with comparator match.
|
* counter wraps in addition to an interrupt with comparator match.
|
||||||
*/
|
*/
|
||||||
if (t->config & HPET_TN_32BIT && !timer_is_periodic(t)) {
|
if (!timer_is_periodic(t) && t->cmp64 > hpet_next_wrap(cur_tick)) {
|
||||||
wrap_diff = 0xffffffff - (uint32_t)cur_tick;
|
|
||||||
if (wrap_diff < (uint32_t)diff) {
|
|
||||||
diff = wrap_diff;
|
|
||||||
t->wrap_flag = 1;
|
t->wrap_flag = 1;
|
||||||
|
hpet_arm(t, hpet_next_wrap(cur_tick));
|
||||||
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
hpet_arm(t, diff);
|
hpet_arm(t, t->cmp64);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void hpet_del_timer(HPETTimer *t)
|
static void hpet_del_timer(HPETTimer *t)
|
||||||
{
|
{
|
||||||
|
HPETState *s = t->state;
|
||||||
timer_del(t->qemu_timer);
|
timer_del(t->qemu_timer);
|
||||||
update_irq(t, 0);
|
|
||||||
|
if (s->isr & (1 << t->tn)) {
|
||||||
|
/* For level-triggered interrupt, this leaves ISR set but lowers irq. */
|
||||||
|
update_irq(t, 1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static uint64_t hpet_ram_read(void *opaque, hwaddr addr,
|
static uint64_t hpet_ram_read(void *opaque, hwaddr addr,
|
||||||
unsigned size)
|
unsigned size)
|
||||||
{
|
{
|
||||||
HPETState *s = opaque;
|
HPETState *s = opaque;
|
||||||
uint64_t cur_tick, index;
|
int shift = (addr & 4) * 8;
|
||||||
|
uint64_t cur_tick;
|
||||||
|
|
||||||
trace_hpet_ram_read(addr);
|
trace_hpet_ram_read(addr);
|
||||||
index = addr;
|
|
||||||
/*address range of all TN regs*/
|
/*address range of all TN regs*/
|
||||||
if (index >= 0x100 && index <= 0x3ff) {
|
if (addr >= 0x100 && addr <= 0x3ff) {
|
||||||
uint8_t timer_id = (addr - 0x100) / 0x20;
|
uint8_t timer_id = (addr - 0x100) / 0x20;
|
||||||
HPETTimer *timer = &s->timer[timer_id];
|
HPETTimer *timer = &s->timer[timer_id];
|
||||||
|
|
||||||
|
@ -436,52 +453,33 @@ static uint64_t hpet_ram_read(void *opaque, hwaddr addr,
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
switch ((addr - 0x100) % 0x20) {
|
switch (addr & 0x18) {
|
||||||
case HPET_TN_CFG:
|
case HPET_TN_CFG: // including interrupt capabilities
|
||||||
return timer->config;
|
return timer->config >> shift;
|
||||||
case HPET_TN_CFG + 4: // Interrupt capabilities
|
|
||||||
return timer->config >> 32;
|
|
||||||
case HPET_TN_CMP: // comparator register
|
case HPET_TN_CMP: // comparator register
|
||||||
return timer->cmp;
|
return timer->cmp >> shift;
|
||||||
case HPET_TN_CMP + 4:
|
|
||||||
return timer->cmp >> 32;
|
|
||||||
case HPET_TN_ROUTE:
|
case HPET_TN_ROUTE:
|
||||||
return timer->fsb;
|
return timer->fsb >> shift;
|
||||||
case HPET_TN_ROUTE + 4:
|
|
||||||
return timer->fsb >> 32;
|
|
||||||
default:
|
default:
|
||||||
trace_hpet_ram_read_invalid();
|
trace_hpet_ram_read_invalid();
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
switch (index) {
|
switch (addr & ~4) {
|
||||||
case HPET_ID:
|
case HPET_ID: // including HPET_PERIOD
|
||||||
return s->capability;
|
return s->capability >> shift;
|
||||||
case HPET_PERIOD:
|
|
||||||
return s->capability >> 32;
|
|
||||||
case HPET_CFG:
|
case HPET_CFG:
|
||||||
return s->config;
|
return s->config >> shift;
|
||||||
case HPET_CFG + 4:
|
|
||||||
trace_hpet_invalid_hpet_cfg(4);
|
|
||||||
return 0;
|
|
||||||
case HPET_COUNTER:
|
case HPET_COUNTER:
|
||||||
if (hpet_enabled(s)) {
|
if (hpet_enabled(s)) {
|
||||||
cur_tick = hpet_get_ticks(s);
|
cur_tick = hpet_get_ticks(s);
|
||||||
} else {
|
} else {
|
||||||
cur_tick = s->hpet_counter;
|
cur_tick = s->hpet_counter;
|
||||||
}
|
}
|
||||||
trace_hpet_ram_read_reading_counter(0, cur_tick);
|
trace_hpet_ram_read_reading_counter(addr & 4, cur_tick);
|
||||||
return cur_tick;
|
return cur_tick >> shift;
|
||||||
case HPET_COUNTER + 4:
|
|
||||||
if (hpet_enabled(s)) {
|
|
||||||
cur_tick = hpet_get_ticks(s);
|
|
||||||
} else {
|
|
||||||
cur_tick = s->hpet_counter;
|
|
||||||
}
|
|
||||||
trace_hpet_ram_read_reading_counter(4, cur_tick);
|
|
||||||
return cur_tick >> 32;
|
|
||||||
case HPET_STATUS:
|
case HPET_STATUS:
|
||||||
return s->isr;
|
return s->isr >> shift;
|
||||||
default:
|
default:
|
||||||
trace_hpet_ram_read_invalid();
|
trace_hpet_ram_read_invalid();
|
||||||
break;
|
break;
|
||||||
|
@ -495,15 +493,14 @@ static void hpet_ram_write(void *opaque, hwaddr addr,
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
HPETState *s = opaque;
|
HPETState *s = opaque;
|
||||||
uint64_t old_val, new_val, val, index;
|
int shift = (addr & 4) * 8;
|
||||||
|
int len = MIN(size * 8, 64 - shift);
|
||||||
|
uint64_t old_val, new_val, cleared;
|
||||||
|
|
||||||
trace_hpet_ram_write(addr, value);
|
trace_hpet_ram_write(addr, value);
|
||||||
index = addr;
|
|
||||||
old_val = hpet_ram_read(opaque, addr, 4);
|
|
||||||
new_val = value;
|
|
||||||
|
|
||||||
/*address range of all TN regs*/
|
/*address range of all TN regs*/
|
||||||
if (index >= 0x100 && index <= 0x3ff) {
|
if (addr >= 0x100 && addr <= 0x3ff) {
|
||||||
uint8_t timer_id = (addr - 0x100) / 0x20;
|
uint8_t timer_id = (addr - 0x100) / 0x20;
|
||||||
HPETTimer *timer = &s->timer[timer_id];
|
HPETTimer *timer = &s->timer[timer_id];
|
||||||
|
|
||||||
|
@ -512,71 +509,49 @@ static void hpet_ram_write(void *opaque, hwaddr addr,
|
||||||
trace_hpet_timer_id_out_of_range(timer_id);
|
trace_hpet_timer_id_out_of_range(timer_id);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
switch ((addr - 0x100) % 0x20) {
|
switch (addr & 0x18) {
|
||||||
case HPET_TN_CFG:
|
case HPET_TN_CFG:
|
||||||
trace_hpet_ram_write_tn_cfg();
|
trace_hpet_ram_write_tn_cfg(addr & 4);
|
||||||
if (activating_bit(old_val, new_val, HPET_TN_FSB_ENABLE)) {
|
old_val = timer->config;
|
||||||
|
new_val = deposit64(old_val, shift, len, value);
|
||||||
|
new_val = hpet_fixup_reg(new_val, old_val, HPET_TN_CFG_WRITE_MASK);
|
||||||
|
if (deactivating_bit(old_val, new_val, HPET_TN_TYPE_LEVEL)) {
|
||||||
|
/*
|
||||||
|
* Do this before changing timer->config; otherwise, if
|
||||||
|
* HPET_TN_FSB is set, update_irq will not lower the qemu_irq.
|
||||||
|
*/
|
||||||
update_irq(timer, 0);
|
update_irq(timer, 0);
|
||||||
}
|
}
|
||||||
val = hpet_fixup_reg(new_val, old_val, HPET_TN_CFG_WRITE_MASK);
|
timer->config = new_val;
|
||||||
timer->config = (timer->config & 0xffffffff00000000ULL) | val;
|
if (activating_bit(old_val, new_val, HPET_TN_ENABLE)
|
||||||
|
&& (s->isr & (1 << timer_id))) {
|
||||||
|
update_irq(timer, 1);
|
||||||
|
}
|
||||||
if (new_val & HPET_TN_32BIT) {
|
if (new_val & HPET_TN_32BIT) {
|
||||||
timer->cmp = (uint32_t)timer->cmp;
|
timer->cmp = (uint32_t)timer->cmp;
|
||||||
timer->period = (uint32_t)timer->period;
|
timer->period = (uint32_t)timer->period;
|
||||||
}
|
}
|
||||||
if (activating_bit(old_val, new_val, HPET_TN_ENABLE) &&
|
|
||||||
hpet_enabled(s)) {
|
|
||||||
hpet_set_timer(timer);
|
|
||||||
} else if (deactivating_bit(old_val, new_val, HPET_TN_ENABLE)) {
|
|
||||||
hpet_del_timer(timer);
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
case HPET_TN_CFG + 4: // Interrupt capabilities
|
|
||||||
trace_hpet_ram_write_invalid_tn_cfg(4);
|
|
||||||
break;
|
|
||||||
case HPET_TN_CMP: // comparator register
|
|
||||||
trace_hpet_ram_write_tn_cmp(0);
|
|
||||||
if (timer->config & HPET_TN_32BIT) {
|
|
||||||
new_val = (uint32_t)new_val;
|
|
||||||
}
|
|
||||||
if (!timer_is_periodic(timer)
|
|
||||||
|| (timer->config & HPET_TN_SETVAL)) {
|
|
||||||
timer->cmp = (timer->cmp & 0xffffffff00000000ULL) | new_val;
|
|
||||||
}
|
|
||||||
if (timer_is_periodic(timer)) {
|
|
||||||
/*
|
|
||||||
* FIXME: Clamp period to reasonable min value?
|
|
||||||
* Clamp period to reasonable max value
|
|
||||||
*/
|
|
||||||
if (timer->config & HPET_TN_32BIT) {
|
|
||||||
new_val = MIN(new_val, ~0u >> 1);
|
|
||||||
}
|
|
||||||
timer->period =
|
|
||||||
(timer->period & 0xffffffff00000000ULL) | new_val;
|
|
||||||
}
|
|
||||||
/*
|
|
||||||
* FIXME: on a 64-bit write, HPET_TN_SETVAL should apply to the
|
|
||||||
* high bits part as well.
|
|
||||||
*/
|
|
||||||
timer->config &= ~HPET_TN_SETVAL;
|
|
||||||
if (hpet_enabled(s)) {
|
if (hpet_enabled(s)) {
|
||||||
hpet_set_timer(timer);
|
hpet_set_timer(timer);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case HPET_TN_CMP + 4: // comparator register high order
|
case HPET_TN_CMP: // comparator register
|
||||||
trace_hpet_ram_write_tn_cmp(4);
|
if (timer->config & HPET_TN_32BIT) {
|
||||||
|
/* High 32-bits are zero, leave them untouched. */
|
||||||
|
if (shift) {
|
||||||
|
trace_hpet_ram_write_invalid_tn_cmp();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
len = 64;
|
||||||
|
value = (uint32_t) value;
|
||||||
|
}
|
||||||
|
trace_hpet_ram_write_tn_cmp(addr & 4);
|
||||||
if (!timer_is_periodic(timer)
|
if (!timer_is_periodic(timer)
|
||||||
|| (timer->config & HPET_TN_SETVAL)) {
|
|| (timer->config & HPET_TN_SETVAL)) {
|
||||||
timer->cmp = (timer->cmp & 0xffffffffULL) | new_val << 32;
|
timer->cmp = deposit64(timer->cmp, shift, len, value);
|
||||||
}
|
}
|
||||||
if (timer_is_periodic(timer)) {
|
if (timer_is_periodic(timer)) {
|
||||||
/*
|
timer->period = deposit64(timer->period, shift, len, value);
|
||||||
* FIXME: Clamp period to reasonable min value?
|
|
||||||
* Clamp period to reasonable max value
|
|
||||||
*/
|
|
||||||
new_val = MIN(new_val, ~0u >> 1);
|
|
||||||
timer->period =
|
|
||||||
(timer->period & 0xffffffffULL) | new_val << 32;
|
|
||||||
}
|
}
|
||||||
timer->config &= ~HPET_TN_SETVAL;
|
timer->config &= ~HPET_TN_SETVAL;
|
||||||
if (hpet_enabled(s)) {
|
if (hpet_enabled(s)) {
|
||||||
|
@ -584,10 +559,7 @@ static void hpet_ram_write(void *opaque, hwaddr addr,
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case HPET_TN_ROUTE:
|
case HPET_TN_ROUTE:
|
||||||
timer->fsb = (timer->fsb & 0xffffffff00000000ULL) | new_val;
|
timer->fsb = deposit64(timer->fsb, shift, len, value);
|
||||||
break;
|
|
||||||
case HPET_TN_ROUTE + 4:
|
|
||||||
timer->fsb = (new_val << 32) | (timer->fsb & 0xffffffff);
|
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
trace_hpet_ram_write_invalid();
|
trace_hpet_ram_write_invalid();
|
||||||
|
@ -595,20 +567,23 @@ static void hpet_ram_write(void *opaque, hwaddr addr,
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
} else {
|
} else {
|
||||||
switch (index) {
|
switch (addr & ~4) {
|
||||||
case HPET_ID:
|
case HPET_ID:
|
||||||
return;
|
return;
|
||||||
case HPET_CFG:
|
case HPET_CFG:
|
||||||
val = hpet_fixup_reg(new_val, old_val, HPET_CFG_WRITE_MASK);
|
old_val = s->config;
|
||||||
s->config = (s->config & 0xffffffff00000000ULL) | val;
|
new_val = deposit64(old_val, shift, len, value);
|
||||||
|
new_val = hpet_fixup_reg(new_val, old_val, HPET_CFG_WRITE_MASK);
|
||||||
|
s->config = new_val;
|
||||||
if (activating_bit(old_val, new_val, HPET_CFG_ENABLE)) {
|
if (activating_bit(old_val, new_val, HPET_CFG_ENABLE)) {
|
||||||
/* Enable main counter and interrupt generation. */
|
/* Enable main counter and interrupt generation. */
|
||||||
s->hpet_offset =
|
s->hpet_offset =
|
||||||
ticks_to_ns(s->hpet_counter) - qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
|
ticks_to_ns(s->hpet_counter) - qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
|
||||||
for (i = 0; i < s->num_timers; i++) {
|
for (i = 0; i < s->num_timers; i++) {
|
||||||
if ((&s->timer[i])->cmp != ~0ULL) {
|
if (timer_enabled(&s->timer[i]) && (s->isr & (1 << i))) {
|
||||||
hpet_set_timer(&s->timer[i]);
|
update_irq(&s->timer[i], 1);
|
||||||
}
|
}
|
||||||
|
hpet_set_timer(&s->timer[i]);
|
||||||
}
|
}
|
||||||
} else if (deactivating_bit(old_val, new_val, HPET_CFG_ENABLE)) {
|
} else if (deactivating_bit(old_val, new_val, HPET_CFG_ENABLE)) {
|
||||||
/* Halt main counter and disable interrupt generation. */
|
/* Halt main counter and disable interrupt generation. */
|
||||||
|
@ -629,13 +604,11 @@ static void hpet_ram_write(void *opaque, hwaddr addr,
|
||||||
qemu_set_irq(s->irqs[RTC_ISA_IRQ], s->rtc_irq_level);
|
qemu_set_irq(s->irqs[RTC_ISA_IRQ], s->rtc_irq_level);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case HPET_CFG + 4:
|
|
||||||
trace_hpet_invalid_hpet_cfg(4);
|
|
||||||
break;
|
|
||||||
case HPET_STATUS:
|
case HPET_STATUS:
|
||||||
val = new_val & s->isr;
|
new_val = value << shift;
|
||||||
|
cleared = new_val & s->isr;
|
||||||
for (i = 0; i < s->num_timers; i++) {
|
for (i = 0; i < s->num_timers; i++) {
|
||||||
if (val & (1 << i)) {
|
if (cleared & (1 << i)) {
|
||||||
update_irq(&s->timer[i], 0);
|
update_irq(&s->timer[i], 0);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -644,15 +617,7 @@ static void hpet_ram_write(void *opaque, hwaddr addr,
|
||||||
if (hpet_enabled(s)) {
|
if (hpet_enabled(s)) {
|
||||||
trace_hpet_ram_write_counter_write_while_enabled();
|
trace_hpet_ram_write_counter_write_while_enabled();
|
||||||
}
|
}
|
||||||
s->hpet_counter =
|
s->hpet_counter = deposit64(s->hpet_counter, shift, len, value);
|
||||||
(s->hpet_counter & 0xffffffff00000000ULL) | value;
|
|
||||||
trace_hpet_ram_write_counter_written(0, value, s->hpet_counter);
|
|
||||||
break;
|
|
||||||
case HPET_COUNTER + 4:
|
|
||||||
trace_hpet_ram_write_counter_write_while_enabled();
|
|
||||||
s->hpet_counter =
|
|
||||||
(s->hpet_counter & 0xffffffffULL) | (((uint64_t)value) << 32);
|
|
||||||
trace_hpet_ram_write_counter_written(4, value, s->hpet_counter);
|
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
trace_hpet_ram_write_invalid();
|
trace_hpet_ram_write_invalid();
|
||||||
|
@ -666,7 +631,11 @@ static const MemoryRegionOps hpet_ram_ops = {
|
||||||
.write = hpet_ram_write,
|
.write = hpet_ram_write,
|
||||||
.valid = {
|
.valid = {
|
||||||
.min_access_size = 4,
|
.min_access_size = 4,
|
||||||
.max_access_size = 4,
|
.max_access_size = 8,
|
||||||
|
},
|
||||||
|
.impl = {
|
||||||
|
.min_access_size = 4,
|
||||||
|
.max_access_size = 8,
|
||||||
},
|
},
|
||||||
.endianness = DEVICE_NATIVE_ENDIAN,
|
.endianness = DEVICE_NATIVE_ENDIAN,
|
||||||
};
|
};
|
||||||
|
|
|
@ -108,9 +108,9 @@ hpet_ram_read_reading_counter(uint8_t reg_off, uint64_t cur_tick) "reading count
|
||||||
hpet_ram_read_invalid(void) "invalid hpet_ram_readl"
|
hpet_ram_read_invalid(void) "invalid hpet_ram_readl"
|
||||||
hpet_ram_write(uint64_t addr, uint64_t value) "enter hpet_ram_writel at 0x%" PRIx64 " = 0x%" PRIx64
|
hpet_ram_write(uint64_t addr, uint64_t value) "enter hpet_ram_writel at 0x%" PRIx64 " = 0x%" PRIx64
|
||||||
hpet_ram_write_timer_id(uint64_t timer_id) "hpet_ram_writel timer_id = 0x%" PRIx64
|
hpet_ram_write_timer_id(uint64_t timer_id) "hpet_ram_writel timer_id = 0x%" PRIx64
|
||||||
hpet_ram_write_tn_cfg(void) "hpet_ram_writel HPET_TN_CFG"
|
hpet_ram_write_tn_cfg(uint8_t reg_off) "hpet_ram_writel HPET_TN_CFG + %" PRIu8
|
||||||
hpet_ram_write_invalid_tn_cfg(uint8_t reg_off) "invalid HPET_TN_CFG + %" PRIu8 " write"
|
|
||||||
hpet_ram_write_tn_cmp(uint8_t reg_off) "hpet_ram_writel HPET_TN_CMP + %" PRIu8
|
hpet_ram_write_tn_cmp(uint8_t reg_off) "hpet_ram_writel HPET_TN_CMP + %" PRIu8
|
||||||
|
hpet_ram_write_invalid_tn_cmp(void) "invalid HPET_TN_CMP + 4 write"
|
||||||
hpet_ram_write_invalid(void) "invalid hpet_ram_writel"
|
hpet_ram_write_invalid(void) "invalid hpet_ram_writel"
|
||||||
hpet_ram_write_counter_write_while_enabled(void) "Writing counter while HPET enabled!"
|
hpet_ram_write_counter_write_while_enabled(void) "Writing counter while HPET enabled!"
|
||||||
hpet_ram_write_counter_written(uint8_t reg_off, uint64_t value, uint64_t counter) "HPET counter + %" PRIu8 "written. crt = 0x%" PRIx64 " -> 0x%" PRIx64
|
hpet_ram_write_counter_written(uint8_t reg_off, uint64_t value, uint64_t counter) "HPET counter + %" PRIu8 "written. crt = 0x%" PRIx64 " -> 0x%" PRIx64
|
||||||
|
|
|
@ -160,6 +160,9 @@ struct QIOChannelClass {
|
||||||
void *opaque);
|
void *opaque);
|
||||||
int (*io_flush)(QIOChannel *ioc,
|
int (*io_flush)(QIOChannel *ioc,
|
||||||
Error **errp);
|
Error **errp);
|
||||||
|
int (*io_peerpid)(QIOChannel *ioc,
|
||||||
|
unsigned int *pid,
|
||||||
|
Error **errp);
|
||||||
};
|
};
|
||||||
|
|
||||||
/* General I/O handling functions */
|
/* General I/O handling functions */
|
||||||
|
@ -981,4 +984,22 @@ int coroutine_mixed_fn qio_channel_writev_full_all(QIOChannel *ioc,
|
||||||
int qio_channel_flush(QIOChannel *ioc,
|
int qio_channel_flush(QIOChannel *ioc,
|
||||||
Error **errp);
|
Error **errp);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* qio_channel_get_peerpid:
|
||||||
|
* @ioc: the channel object
|
||||||
|
* @pid: pointer to pid
|
||||||
|
* @errp: pointer to a NULL-initialized error object
|
||||||
|
*
|
||||||
|
* Returns the pid of the peer process connected to this socket.
|
||||||
|
*
|
||||||
|
* The use of this function is possible only for connected
|
||||||
|
* AF_UNIX stream sockets and for AF_UNIX stream and datagram
|
||||||
|
* socket pairs on Linux.
|
||||||
|
* Return -1 on error with pid -1 for the non-Linux OS.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
int qio_channel_get_peerpid(QIOChannel *ioc,
|
||||||
|
unsigned int *pid,
|
||||||
|
Error **errp);
|
||||||
|
|
||||||
#endif /* QIO_CHANNEL_H */
|
#endif /* QIO_CHANNEL_H */
|
||||||
|
|
|
@ -14,6 +14,9 @@
|
||||||
#include "qemu/accel.h"
|
#include "qemu/accel.h"
|
||||||
#include "qemu/queue.h"
|
#include "qemu/queue.h"
|
||||||
#include "sysemu/kvm.h"
|
#include "sysemu/kvm.h"
|
||||||
|
#include "hw/boards.h"
|
||||||
|
#include "hw/i386/topology.h"
|
||||||
|
#include "io/channel-socket.h"
|
||||||
|
|
||||||
typedef struct KVMSlot
|
typedef struct KVMSlot
|
||||||
{
|
{
|
||||||
|
@ -50,6 +53,34 @@ typedef struct KVMMemoryListener {
|
||||||
|
|
||||||
#define KVM_MSI_HASHTAB_SIZE 256
|
#define KVM_MSI_HASHTAB_SIZE 256
|
||||||
|
|
||||||
|
/*
 * Host CPU topology as seen by the RAPL MSR emulation.
 *
 * The two pointer members are arrays with one entry per host package
 * (i.e. maxpkgs entries each).
 */
typedef struct KVMHostTopoInfo {
    unsigned int maxpkgs;        /* how many packages the host has */
    unsigned int maxcpus;        /* how many CPUs the host has */
    unsigned int *pkg_cpu_count; /* per package: number of CPUs in it */
    unsigned int *maxticks;      /* per package: max ticks, may differ */
} KVMHostTopoInfo;
|
||||||
|
|
||||||
|
struct KVMMsrEnergy {
|
||||||
|
pid_t pid;
|
||||||
|
bool enable;
|
||||||
|
char *socket_path;
|
||||||
|
QIOChannelSocket *sioc;
|
||||||
|
QemuThread msr_thr;
|
||||||
|
unsigned int guest_vcpus;
|
||||||
|
unsigned int guest_vsockets;
|
||||||
|
X86CPUTopoInfo guest_topo_info;
|
||||||
|
KVMHostTopoInfo host_topo;
|
||||||
|
const CPUArchIdList *guest_cpu_list;
|
||||||
|
uint64_t *msr_value;
|
||||||
|
uint64_t msr_unit;
|
||||||
|
uint64_t msr_limit;
|
||||||
|
uint64_t msr_info;
|
||||||
|
};
|
||||||
|
|
||||||
enum KVMDirtyRingReaperState {
|
enum KVMDirtyRingReaperState {
|
||||||
KVM_DIRTY_RING_REAPER_NONE = 0,
|
KVM_DIRTY_RING_REAPER_NONE = 0,
|
||||||
/* The reaper is sleeping */
|
/* The reaper is sleeping */
|
||||||
|
@ -117,6 +148,7 @@ struct KVMState
|
||||||
bool kvm_dirty_ring_with_bitmap;
|
bool kvm_dirty_ring_with_bitmap;
|
||||||
uint64_t kvm_eager_split_size; /* Eager Page Splitting chunk size */
|
uint64_t kvm_eager_split_size; /* Eager Page Splitting chunk size */
|
||||||
struct KVMDirtyRingReaper reaper;
|
struct KVMDirtyRingReaper reaper;
|
||||||
|
struct KVMMsrEnergy msr_energy;
|
||||||
NotifyVmexitOption notify_vmexit;
|
NotifyVmexitOption notify_vmexit;
|
||||||
uint32_t notify_window;
|
uint32_t notify_window;
|
||||||
uint32_t xen_version;
|
uint32_t xen_version;
|
||||||
|
|
|
@ -841,6 +841,33 @@ qio_channel_socket_set_cork(QIOChannel *ioc,
|
||||||
socket_set_cork(sioc->fd, v);
|
socket_set_cork(sioc->fd, v);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int
|
||||||
|
qio_channel_socket_get_peerpid(QIOChannel *ioc,
|
||||||
|
unsigned int *pid,
|
||||||
|
Error **errp)
|
||||||
|
{
|
||||||
|
#ifdef CONFIG_LINUX
|
||||||
|
QIOChannelSocket *sioc = QIO_CHANNEL_SOCKET(ioc);
|
||||||
|
Error *err = NULL;
|
||||||
|
socklen_t len = sizeof(struct ucred);
|
||||||
|
|
||||||
|
struct ucred cred;
|
||||||
|
if (getsockopt(sioc->fd,
|
||||||
|
SOL_SOCKET, SO_PEERCRED,
|
||||||
|
&cred, &len) == -1) {
|
||||||
|
error_setg_errno(&err, errno, "Unable to get peer credentials");
|
||||||
|
error_propagate(errp, err);
|
||||||
|
*pid = -1;
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
*pid = (unsigned int)cred.pid;
|
||||||
|
return 0;
|
||||||
|
#else
|
||||||
|
error_setg(errp, "Unsupported feature");
|
||||||
|
*pid = -1;
|
||||||
|
return -1;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
static int
|
static int
|
||||||
qio_channel_socket_close(QIOChannel *ioc,
|
qio_channel_socket_close(QIOChannel *ioc,
|
||||||
|
@ -938,6 +965,7 @@ static void qio_channel_socket_class_init(ObjectClass *klass,
|
||||||
#ifdef QEMU_MSG_ZEROCOPY
|
#ifdef QEMU_MSG_ZEROCOPY
|
||||||
ioc_klass->io_flush = qio_channel_socket_flush;
|
ioc_klass->io_flush = qio_channel_socket_flush;
|
||||||
#endif
|
#endif
|
||||||
|
ioc_klass->io_peerpid = qio_channel_socket_get_peerpid;
|
||||||
}
|
}
|
||||||
|
|
||||||
static const TypeInfo qio_channel_socket_info = {
|
static const TypeInfo qio_channel_socket_info = {
|
||||||
|
|
13
io/channel.c
13
io/channel.c
|
@ -548,6 +548,19 @@ void qio_channel_set_cork(QIOChannel *ioc,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
int qio_channel_get_peerpid(QIOChannel *ioc,
|
||||||
|
unsigned int *pid,
|
||||||
|
Error **errp)
|
||||||
|
{
|
||||||
|
QIOChannelClass *klass = QIO_CHANNEL_GET_CLASS(ioc);
|
||||||
|
|
||||||
|
if (!klass->io_peerpid) {
|
||||||
|
error_setg(errp, "Channel does not support peer pid");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
klass->io_peerpid(ioc, pid, errp);
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
off_t qio_channel_io_seek(QIOChannel *ioc,
|
off_t qio_channel_io_seek(QIOChannel *ioc,
|
||||||
off_t offset,
|
off_t offset,
|
||||||
|
|
|
@ -4089,6 +4089,13 @@ if have_tools
|
||||||
dependencies: [authz, crypto, io, qom, qemuutil,
|
dependencies: [authz, crypto, io, qom, qemuutil,
|
||||||
libcap_ng, mpathpersist],
|
libcap_ng, mpathpersist],
|
||||||
install: true)
|
install: true)
|
||||||
|
|
||||||
|
if cpu in ['x86', 'x86_64']
|
||||||
|
executable('qemu-vmsr-helper', files('tools/i386/qemu-vmsr-helper.c'),
|
||||||
|
dependencies: [authz, crypto, io, qom, qemuutil,
|
||||||
|
libcap_ng, mpathpersist],
|
||||||
|
install: true)
|
||||||
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
if have_ivshmem
|
if have_ivshmem
|
||||||
|
|
|
@ -414,6 +414,10 @@ typedef enum X86Seg {
|
||||||
#define MSR_IA32_TSX_CTRL 0x122
|
#define MSR_IA32_TSX_CTRL 0x122
|
||||||
#define MSR_IA32_TSCDEADLINE 0x6e0
|
#define MSR_IA32_TSCDEADLINE 0x6e0
|
||||||
#define MSR_IA32_PKRS 0x6e1
|
#define MSR_IA32_PKRS 0x6e1
|
||||||
|
#define MSR_RAPL_POWER_UNIT 0x00000606
|
||||||
|
#define MSR_PKG_POWER_LIMIT 0x00000610
|
||||||
|
#define MSR_PKG_ENERGY_STATUS 0x00000611
|
||||||
|
#define MSR_PKG_POWER_INFO 0x00000614
|
||||||
#define MSR_ARCH_LBR_CTL 0x000014ce
|
#define MSR_ARCH_LBR_CTL 0x000014ce
|
||||||
#define MSR_ARCH_LBR_DEPTH 0x000014cf
|
#define MSR_ARCH_LBR_DEPTH 0x000014cf
|
||||||
#define MSR_ARCH_LBR_FROM_0 0x00001500
|
#define MSR_ARCH_LBR_FROM_0 0x00001500
|
||||||
|
@ -1880,6 +1884,10 @@ typedef struct CPUArchState {
|
||||||
|
|
||||||
uintptr_t retaddr;
|
uintptr_t retaddr;
|
||||||
|
|
||||||
|
/* RAPL MSR */
|
||||||
|
uint64_t msr_rapl_power_unit;
|
||||||
|
uint64_t msr_pkg_energy_status;
|
||||||
|
|
||||||
/* Fields up to this point are cleared by a CPU reset */
|
/* Fields up to this point are cleared by a CPU reset */
|
||||||
struct {} end_reset_fields;
|
struct {} end_reset_fields;
|
||||||
|
|
||||||
|
|
|
@ -16,9 +16,12 @@
|
||||||
#include "qapi/qapi-events-run-state.h"
|
#include "qapi/qapi-events-run-state.h"
|
||||||
#include "qapi/error.h"
|
#include "qapi/error.h"
|
||||||
#include "qapi/visitor.h"
|
#include "qapi/visitor.h"
|
||||||
|
#include <math.h>
|
||||||
#include <sys/ioctl.h>
|
#include <sys/ioctl.h>
|
||||||
#include <sys/utsname.h>
|
#include <sys/utsname.h>
|
||||||
#include <sys/syscall.h>
|
#include <sys/syscall.h>
|
||||||
|
#include <sys/resource.h>
|
||||||
|
#include <sys/time.h>
|
||||||
|
|
||||||
#include <linux/kvm.h>
|
#include <linux/kvm.h>
|
||||||
#include <linux/kvm_para.h>
|
#include <linux/kvm_para.h>
|
||||||
|
@ -27,6 +30,7 @@
|
||||||
|
|
||||||
#include "cpu.h"
|
#include "cpu.h"
|
||||||
#include "host-cpu.h"
|
#include "host-cpu.h"
|
||||||
|
#include "vmsr_energy.h"
|
||||||
#include "sysemu/sysemu.h"
|
#include "sysemu/sysemu.h"
|
||||||
#include "sysemu/hw_accel.h"
|
#include "sysemu/hw_accel.h"
|
||||||
#include "sysemu/kvm_int.h"
|
#include "sysemu/kvm_int.h"
|
||||||
|
@ -2559,7 +2563,8 @@ static int kvm_get_supported_msrs(KVMState *s)
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
static bool kvm_rdmsr_core_thread_count(X86CPU *cpu, uint32_t msr,
|
static bool kvm_rdmsr_core_thread_count(X86CPU *cpu,
|
||||||
|
uint32_t msr,
|
||||||
uint64_t *val)
|
uint64_t *val)
|
||||||
{
|
{
|
||||||
CPUState *cs = CPU(cpu);
|
CPUState *cs = CPU(cpu);
|
||||||
|
@ -2570,6 +2575,53 @@ static bool kvm_rdmsr_core_thread_count(X86CPU *cpu, uint32_t msr,
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
 * Store the msr_unit value kept in the KVM state's msr_energy into *@val.
 * @msr is not used.  Always returns true.
 * NOTE(review): presumably installed as the read handler for
 * MSR_RAPL_POWER_UNIT - the registration is not visible here.
 */
static bool kvm_rdmsr_rapl_power_unit(X86CPU *cpu,
                                      uint32_t msr,
                                      uint64_t *val)
{
    *val = CPU(cpu)->kvm_state->msr_energy.msr_unit;
    return true;
}
|
||||||
|
|
||||||
|
/*
 * Store the msr_limit value kept in the KVM state's msr_energy into *@val.
 * @msr is not used.  Always returns true.
 * NOTE(review): presumably installed as the read handler for
 * MSR_PKG_POWER_LIMIT - the registration is not visible here.
 */
static bool kvm_rdmsr_pkg_power_limit(X86CPU *cpu,
                                      uint32_t msr,
                                      uint64_t *val)
{
    *val = CPU(cpu)->kvm_state->msr_energy.msr_limit;
    return true;
}
|
||||||
|
|
||||||
|
/*
 * Store the msr_info value kept in the KVM state's msr_energy into *@val.
 * @msr is not used.  Always returns true.
 * NOTE(review): presumably installed as the read handler for
 * MSR_PKG_POWER_INFO - the registration is not visible here.
 */
static bool kvm_rdmsr_pkg_power_info(X86CPU *cpu,
                                     uint32_t msr,
                                     uint64_t *val)
{
    *val = CPU(cpu)->kvm_state->msr_energy.msr_info;
    return true;
}
|
||||||
|
|
||||||
|
/*
 * Store this vCPU's entry of msr_energy.msr_value[] (indexed by the
 * vCPU's cpu_index) into *@val.  @msr is not used.  Always returns true.
 * NOTE(review): presumably installed as the read handler for
 * MSR_PKG_ENERGY_STATUS - the registration is not visible here.
 */
static bool kvm_rdmsr_pkg_energy_status(X86CPU *cpu,
                                        uint32_t msr,
                                        uint64_t *val)
{
    CPUState *vcpu = CPU(cpu);
    uint64_t *per_vcpu_energy = vcpu->kvm_state->msr_energy.msr_value;

    *val = per_vcpu_energy[vcpu->cpu_index];
    return true;
}
|
||||||
|
|
||||||
static Notifier smram_machine_done;
|
static Notifier smram_machine_done;
|
||||||
static KVMMemoryListener smram_listener;
|
static KVMMemoryListener smram_listener;
|
||||||
static AddressSpace smram_address_space;
|
static AddressSpace smram_address_space;
|
||||||
|
@ -2604,6 +2656,340 @@ static void register_smram_listener(Notifier *n, void *unused)
|
||||||
&smram_address_space, 1, "kvm-smram");
|
&smram_address_space, 1, "kvm-smram");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void *kvm_msr_energy_thread(void *data)
|
||||||
|
{
|
||||||
|
KVMState *s = data;
|
||||||
|
struct KVMMsrEnergy *vmsr = &s->msr_energy;
|
||||||
|
|
||||||
|
g_autofree vmsr_package_energy_stat *pkg_stat = NULL;
|
||||||
|
g_autofree vmsr_thread_stat *thd_stat = NULL;
|
||||||
|
g_autofree CPUState *cpu = NULL;
|
||||||
|
g_autofree unsigned int *vpkgs_energy_stat = NULL;
|
||||||
|
unsigned int num_threads = 0;
|
||||||
|
|
||||||
|
X86CPUTopoIDs topo_ids;
|
||||||
|
|
||||||
|
rcu_register_thread();
|
||||||
|
|
||||||
|
/* Allocate memory for each package energy status */
|
||||||
|
pkg_stat = g_new0(vmsr_package_energy_stat, vmsr->host_topo.maxpkgs);
|
||||||
|
|
||||||
|
/* Allocate memory for thread stats */
|
||||||
|
thd_stat = g_new0(vmsr_thread_stat, 1);
|
||||||
|
|
||||||
|
/* Allocate memory for holding virtual package energy counter */
|
||||||
|
vpkgs_energy_stat = g_new0(unsigned int, vmsr->guest_vsockets);
|
||||||
|
|
||||||
|
/* Populate the max tick of each packages */
|
||||||
|
for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) {
|
||||||
|
/*
|
||||||
|
* Max numbers of ticks per package
|
||||||
|
* Time in second * Number of ticks/second * Number of cores/package
|
||||||
|
* ex: 100 ticks/second/CPU, 12 CPUs per Package gives 1200 ticks max
|
||||||
|
*/
|
||||||
|
vmsr->host_topo.maxticks[i] = (MSR_ENERGY_THREAD_SLEEP_US / 1000000)
|
||||||
|
* sysconf(_SC_CLK_TCK)
|
||||||
|
* vmsr->host_topo.pkg_cpu_count[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
/* Get all qemu threads id */
|
||||||
|
g_autofree pid_t *thread_ids =
|
||||||
|
thread_ids = vmsr_get_thread_ids(vmsr->pid, &num_threads);
|
||||||
|
|
||||||
|
if (thread_ids == NULL) {
|
||||||
|
goto clean;
|
||||||
|
}
|
||||||
|
|
||||||
|
thd_stat = g_renew(vmsr_thread_stat, thd_stat, num_threads);
|
||||||
|
/* Unlike g_new0, g_renew0 function doesn't exist yet... */
|
||||||
|
memset(thd_stat, 0, num_threads * sizeof(vmsr_thread_stat));
|
||||||
|
|
||||||
|
/* Populate all the thread stats */
|
||||||
|
for (int i = 0; i < num_threads; i++) {
|
||||||
|
thd_stat[i].utime = g_new0(unsigned long long, 2);
|
||||||
|
thd_stat[i].stime = g_new0(unsigned long long, 2);
|
||||||
|
thd_stat[i].thread_id = thread_ids[i];
|
||||||
|
vmsr_read_thread_stat(vmsr->pid,
|
||||||
|
thd_stat[i].thread_id,
|
||||||
|
thd_stat[i].utime,
|
||||||
|
thd_stat[i].stime,
|
||||||
|
&thd_stat[i].cpu_id);
|
||||||
|
thd_stat[i].pkg_id =
|
||||||
|
vmsr_get_physical_package_id(thd_stat[i].cpu_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Retrieve all packages power plane energy counter */
|
||||||
|
for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) {
|
||||||
|
for (int j = 0; j < num_threads; j++) {
|
||||||
|
/*
|
||||||
|
* Use the first thread we found that ran on the CPU
|
||||||
|
* of the package to read the packages energy counter
|
||||||
|
*/
|
||||||
|
if (thd_stat[j].pkg_id == i) {
|
||||||
|
pkg_stat[i].e_start =
|
||||||
|
vmsr_read_msr(MSR_PKG_ENERGY_STATUS,
|
||||||
|
thd_stat[j].cpu_id,
|
||||||
|
thd_stat[j].thread_id,
|
||||||
|
s->msr_energy.sioc);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Sleep a short period while the other threads are working */
|
||||||
|
usleep(MSR_ENERGY_THREAD_SLEEP_US);
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Retrieve all packages power plane energy counter
|
||||||
|
* Calculate the delta of all packages
|
||||||
|
*/
|
||||||
|
for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) {
|
||||||
|
for (int j = 0; j < num_threads; j++) {
|
||||||
|
/*
|
||||||
|
* Use the first thread we found that ran on the CPU
|
||||||
|
* of the package to read the packages energy counter
|
||||||
|
*/
|
||||||
|
if (thd_stat[j].pkg_id == i) {
|
||||||
|
pkg_stat[i].e_end =
|
||||||
|
vmsr_read_msr(MSR_PKG_ENERGY_STATUS,
|
||||||
|
thd_stat[j].cpu_id,
|
||||||
|
thd_stat[j].thread_id,
|
||||||
|
s->msr_energy.sioc);
|
||||||
|
/*
|
||||||
|
* Prevent the case we have migrate the VM
|
||||||
|
* during the sleep period or any other cases
|
||||||
|
* were energy counter might be lower after
|
||||||
|
* the sleep period.
|
||||||
|
*/
|
||||||
|
if (pkg_stat[i].e_end > pkg_stat[i].e_start) {
|
||||||
|
pkg_stat[i].e_delta =
|
||||||
|
pkg_stat[i].e_end - pkg_stat[i].e_start;
|
||||||
|
} else {
|
||||||
|
pkg_stat[i].e_delta = 0;
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Delta of ticks spend by each thread between the sample */
|
||||||
|
for (int i = 0; i < num_threads; i++) {
|
||||||
|
vmsr_read_thread_stat(vmsr->pid,
|
||||||
|
thd_stat[i].thread_id,
|
||||||
|
thd_stat[i].utime,
|
||||||
|
thd_stat[i].stime,
|
||||||
|
&thd_stat[i].cpu_id);
|
||||||
|
|
||||||
|
if (vmsr->pid < 0) {
|
||||||
|
/*
|
||||||
|
* We don't count the dead thread
|
||||||
|
* i.e threads that existed before the sleep
|
||||||
|
* and not anymore
|
||||||
|
*/
|
||||||
|
thd_stat[i].delta_ticks = 0;
|
||||||
|
} else {
|
||||||
|
vmsr_delta_ticks(thd_stat, i);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Identify the vcpu threads
|
||||||
|
* Calculate the number of vcpu per package
|
||||||
|
*/
|
||||||
|
CPU_FOREACH(cpu) {
|
||||||
|
for (int i = 0; i < num_threads; i++) {
|
||||||
|
if (cpu->thread_id == thd_stat[i].thread_id) {
|
||||||
|
thd_stat[i].is_vcpu = true;
|
||||||
|
thd_stat[i].vcpu_id = cpu->cpu_index;
|
||||||
|
pkg_stat[thd_stat[i].pkg_id].nb_vcpu++;
|
||||||
|
thd_stat[i].acpi_id = kvm_arch_vcpu_id(cpu);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Retrieve the virtual package number of each vCPU */
|
||||||
|
for (int i = 0; i < vmsr->guest_cpu_list->len; i++) {
|
||||||
|
for (int j = 0; j < num_threads; j++) {
|
||||||
|
if ((thd_stat[j].acpi_id ==
|
||||||
|
vmsr->guest_cpu_list->cpus[i].arch_id)
|
||||||
|
&& (thd_stat[j].is_vcpu == true)) {
|
||||||
|
x86_topo_ids_from_apicid(thd_stat[j].acpi_id,
|
||||||
|
&vmsr->guest_topo_info, &topo_ids);
|
||||||
|
thd_stat[j].vpkg_id = topo_ids.pkg_id;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Calculate the total energy of all non-vCPU thread */
|
||||||
|
for (int i = 0; i < num_threads; i++) {
|
||||||
|
if ((thd_stat[i].is_vcpu != true) &&
|
||||||
|
(thd_stat[i].delta_ticks > 0)) {
|
||||||
|
double temp;
|
||||||
|
temp = vmsr_get_ratio(pkg_stat[thd_stat[i].pkg_id].e_delta,
|
||||||
|
thd_stat[i].delta_ticks,
|
||||||
|
vmsr->host_topo.maxticks[thd_stat[i].pkg_id]);
|
||||||
|
pkg_stat[thd_stat[i].pkg_id].e_ratio
|
||||||
|
+= (uint64_t)lround(temp);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Calculate the ratio per non-vCPU thread of each package */
|
||||||
|
for (int i = 0; i < vmsr->host_topo.maxpkgs; i++) {
|
||||||
|
if (pkg_stat[i].nb_vcpu > 0) {
|
||||||
|
pkg_stat[i].e_ratio = pkg_stat[i].e_ratio / pkg_stat[i].nb_vcpu;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Calculate the energy for each Package:
|
||||||
|
* Energy Package = sum of each vCPU energy that belongs to the package
|
||||||
|
*/
|
||||||
|
for (int i = 0; i < num_threads; i++) {
|
||||||
|
if ((thd_stat[i].is_vcpu == true) && \
|
||||||
|
(thd_stat[i].delta_ticks > 0)) {
|
||||||
|
double temp;
|
||||||
|
temp = vmsr_get_ratio(pkg_stat[thd_stat[i].pkg_id].e_delta,
|
||||||
|
thd_stat[i].delta_ticks,
|
||||||
|
vmsr->host_topo.maxticks[thd_stat[i].pkg_id]);
|
||||||
|
vpkgs_energy_stat[thd_stat[i].vpkg_id] +=
|
||||||
|
(uint64_t)lround(temp);
|
||||||
|
vpkgs_energy_stat[thd_stat[i].vpkg_id] +=
|
||||||
|
pkg_stat[thd_stat[i].pkg_id].e_ratio;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Finally populate the vmsr register of each vCPU with the total
|
||||||
|
* package value to emulate the real hardware where each CPU return the
|
||||||
|
* value of the package it belongs.
|
||||||
|
*/
|
||||||
|
for (int i = 0; i < num_threads; i++) {
|
||||||
|
if ((thd_stat[i].is_vcpu == true) && \
|
||||||
|
(thd_stat[i].delta_ticks > 0)) {
|
||||||
|
vmsr->msr_value[thd_stat[i].vcpu_id] = \
|
||||||
|
vpkgs_energy_stat[thd_stat[i].vpkg_id];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Freeing memory before zeroing the pointer */
|
||||||
|
for (int i = 0; i < num_threads; i++) {
|
||||||
|
g_free(thd_stat[i].utime);
|
||||||
|
g_free(thd_stat[i].stime);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rcu_unregister_thread();
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Check the host requirements for the virtual RAPL feature, collect the
 * guest and host topology, connect to the vmsr helper socket, read the
 * constant RAPL MSRs and finally start the "kvm-msr" sampling thread.
 *
 * Returns 0 on success and 1 as soon as one requirement is not met.
 */
static int kvm_msr_energy_thread_init(KVMState *s, MachineState *ms)
{
    MachineClass *mc = MACHINE_GET_CLASS(ms);
    struct KVMMsrEnergy *r = &s->msr_energy;
    int ret = 0;

    /*
     * Sanity check
     * 1. Host cpu must be Intel cpu
     * 2. RAPL must be enabled on the Host
     *
     * NOTE: is_host_cpu_intel() returns the strcmp() result against the
     * Intel vendor string, so a true value means "not an Intel host".
     */
    if (is_host_cpu_intel()) {
        /*
         * Adjacent string literals are used instead of a backslash line
         * continuation so the source indentation does not end up inside
         * the message.
         */
        error_report("The RAPL feature can only be enabled on hosts "
                     "with Intel CPU models");
        ret = 1;
        goto out;
    }

    if (!is_rapl_enabled()) {
        ret = 1;
        goto out;
    }

    /* Retrieve the virtual topology */
    vmsr_init_topo_info(&r->guest_topo_info, ms);

    /* Retrieve the number of vcpu */
    r->guest_vcpus = ms->smp.cpus;

    /* Retrieve the number of virtual sockets */
    r->guest_vsockets = ms->smp.sockets;

    /* Allocate register memory (MSR_PKG_STATUS) for each vcpu */
    r->msr_value = g_new0(uint64_t, r->guest_vcpus);

    /* Retrieve the CPUArchIDlist */
    r->guest_cpu_list = mc->possible_cpu_arch_ids(ms);

    /* Max number of cpus on the Host */
    r->host_topo.maxcpus = vmsr_get_maxcpus();
    if (r->host_topo.maxcpus == 0) {
        error_report("host max cpus = 0");
        ret = 1;
        goto out;
    }

    /* Max number of packages on the host */
    r->host_topo.maxpkgs = vmsr_get_max_physical_package(r->host_topo.maxcpus);
    if (r->host_topo.maxpkgs == 0) {
        error_report("host max pkgs = 0");
        ret = 1;
        goto out;
    }

    /* Allocate memory for each package on the host */
    r->host_topo.pkg_cpu_count = g_new0(unsigned int, r->host_topo.maxpkgs);
    r->host_topo.maxticks = g_new0(unsigned int, r->host_topo.maxpkgs);

    vmsr_count_cpus_per_package(r->host_topo.pkg_cpu_count,
                                r->host_topo.maxpkgs);
    for (int i = 0; i < r->host_topo.maxpkgs; i++) {
        if (r->host_topo.pkg_cpu_count[i] == 0) {
            error_report("cpu per packages = 0 on package_%d", i);
            ret = 1;
            goto out;
        }
    }

    /* Get QEMU PID */
    r->pid = getpid();

    /* Compute the socket path if necessary */
    if (r->socket_path == NULL) {
        r->socket_path = vmsr_compute_default_paths();
    }

    /* Open socket with vmsr helper */
    r->sioc = vmsr_open_socket(r->socket_path);
    if (r->sioc == NULL) {
        error_report("vmsr socket opening failed");
        ret = 1;
        goto out;
    }

    /* Those MSR values should not change */
    r->msr_unit = vmsr_read_msr(MSR_RAPL_POWER_UNIT, 0, r->pid, r->sioc);
    r->msr_limit = vmsr_read_msr(MSR_PKG_POWER_LIMIT, 0, r->pid, r->sioc);
    r->msr_info = vmsr_read_msr(MSR_PKG_POWER_INFO, 0, r->pid, r->sioc);
    if (r->msr_unit == 0 || r->msr_limit == 0 || r->msr_info == 0) {
        error_report("can't read any virtual msr");
        ret = 1;
        goto out;
    }

    qemu_thread_create(&r->msr_thr, "kvm-msr",
                       kvm_msr_energy_thread,
                       s, QEMU_THREAD_JOINABLE);
out:
    return ret;
}
|
||||||
|
|
||||||
int kvm_arch_get_default_type(MachineState *ms)
|
int kvm_arch_get_default_type(MachineState *ms)
|
||||||
{
|
{
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -2804,6 +3190,49 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
|
||||||
strerror(-ret));
|
strerror(-ret));
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (s->msr_energy.enable == true) {
|
||||||
|
r = kvm_filter_msr(s, MSR_RAPL_POWER_UNIT,
|
||||||
|
kvm_rdmsr_rapl_power_unit, NULL);
|
||||||
|
if (!r) {
|
||||||
|
error_report("Could not install MSR_RAPL_POWER_UNIT \
|
||||||
|
handler: %s",
|
||||||
|
strerror(-ret));
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
r = kvm_filter_msr(s, MSR_PKG_POWER_LIMIT,
|
||||||
|
kvm_rdmsr_pkg_power_limit, NULL);
|
||||||
|
if (!r) {
|
||||||
|
error_report("Could not install MSR_PKG_POWER_LIMIT \
|
||||||
|
handler: %s",
|
||||||
|
strerror(-ret));
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
r = kvm_filter_msr(s, MSR_PKG_POWER_INFO,
|
||||||
|
kvm_rdmsr_pkg_power_info, NULL);
|
||||||
|
if (!r) {
|
||||||
|
error_report("Could not install MSR_PKG_POWER_INFO \
|
||||||
|
handler: %s",
|
||||||
|
strerror(-ret));
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
r = kvm_filter_msr(s, MSR_PKG_ENERGY_STATUS,
|
||||||
|
kvm_rdmsr_pkg_energy_status, NULL);
|
||||||
|
if (!r) {
|
||||||
|
error_report("Could not install MSR_PKG_ENERGY_STATUS \
|
||||||
|
handler: %s",
|
||||||
|
strerror(-ret));
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
r = kvm_msr_energy_thread_init(s, ms);
|
||||||
|
if (r) {
|
||||||
|
error_report("kvm : error RAPL feature requirement not meet");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
@ -3,6 +3,7 @@ i386_kvm_ss = ss.source_set()
|
||||||
i386_kvm_ss.add(files(
|
i386_kvm_ss.add(files(
|
||||||
'kvm.c',
|
'kvm.c',
|
||||||
'kvm-cpu.c',
|
'kvm-cpu.c',
|
||||||
|
'vmsr_energy.c',
|
||||||
))
|
))
|
||||||
|
|
||||||
i386_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files('xen-emu.c'))
|
i386_kvm_ss.add(when: 'CONFIG_XEN_EMU', if_true: files('xen-emu.c'))
|
||||||
|
|
345
target/i386/kvm/vmsr_energy.c
Normal file
345
target/i386/kvm/vmsr_energy.c
Normal file
|
@ -0,0 +1,345 @@
|
||||||
|
/*
|
||||||
|
* QEMU KVM support -- x86 virtual RAPL msr
|
||||||
|
*
|
||||||
|
* Copyright 2024 Red Hat, Inc. 2024
|
||||||
|
*
|
||||||
|
* Author:
|
||||||
|
* Anthony Harivel <aharivel@redhat.com>
|
||||||
|
*
|
||||||
|
* This work is licensed under the terms of the GNU GPL, version 2 or later.
|
||||||
|
* See the COPYING file in the top-level directory.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "qemu/osdep.h"
|
||||||
|
#include "qemu/error-report.h"
|
||||||
|
#include "vmsr_energy.h"
|
||||||
|
#include "io/channel.h"
|
||||||
|
#include "io/channel-socket.h"
|
||||||
|
#include "hw/boards.h"
|
||||||
|
#include "cpu.h"
|
||||||
|
#include "host-cpu.h"
|
||||||
|
|
||||||
|
char *vmsr_compute_default_paths(void)
|
||||||
|
{
|
||||||
|
g_autofree char *state = qemu_get_local_state_dir();
|
||||||
|
|
||||||
|
return g_build_filename(state, "run", "qemu-vmsr-helper.sock", NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool is_host_cpu_intel(void)
|
||||||
|
{
|
||||||
|
int family, model, stepping;
|
||||||
|
char vendor[CPUID_VENDOR_SZ + 1];
|
||||||
|
|
||||||
|
host_cpu_vendor_fms(vendor, &family, &model, &stepping);
|
||||||
|
|
||||||
|
return strcmp(vendor, CPUID_VENDOR_INTEL);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Read the integer stored in /sys/class/powercap/intel-rapl/enabled.
 *
 * Returns the value read, or 0 when the file cannot be opened or does
 * not contain an integer (an error is reported in both cases).
 */
int is_rapl_enabled(void)
{
    const char *path = "/sys/class/powercap/intel-rapl/enabled";
    int enabled = 0;
    FILE *fp = fopen(path, "r");

    if (fp == NULL) {
        error_report("Error opening %s", path);
        return enabled;
    }

    if (fscanf(fp, "%d", &enabled) != 1) {
        error_report("INTEL RAPL not enabled");
    }
    fclose(fp);

    return enabled;
}
|
||||||
|
|
||||||
|
QIOChannelSocket *vmsr_open_socket(const char *path)
|
||||||
|
{
|
||||||
|
g_autofree char *socket_path = NULL;
|
||||||
|
|
||||||
|
socket_path = g_strdup(path);
|
||||||
|
|
||||||
|
SocketAddress saddr = {
|
||||||
|
.type = SOCKET_ADDRESS_TYPE_UNIX,
|
||||||
|
.u.q_unix.path = socket_path
|
||||||
|
};
|
||||||
|
|
||||||
|
QIOChannelSocket *sioc = qio_channel_socket_new();
|
||||||
|
Error *local_err = NULL;
|
||||||
|
|
||||||
|
qio_channel_set_name(QIO_CHANNEL(sioc), "vmsr-helper");
|
||||||
|
qio_channel_socket_connect_sync(sioc,
|
||||||
|
&saddr,
|
||||||
|
&local_err);
|
||||||
|
if (local_err) {
|
||||||
|
/* Close socket. */
|
||||||
|
qio_channel_close(QIO_CHANNEL(sioc), NULL);
|
||||||
|
object_unref(OBJECT(sioc));
|
||||||
|
sioc = NULL;
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
qio_channel_set_delay(QIO_CHANNEL(sioc), false);
|
||||||
|
out:
|
||||||
|
return sioc;
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t vmsr_read_msr(uint32_t reg, uint32_t cpu_id, uint32_t tid,
|
||||||
|
QIOChannelSocket *sioc)
|
||||||
|
{
|
||||||
|
uint64_t data = 0;
|
||||||
|
int r = 0;
|
||||||
|
Error *local_err = NULL;
|
||||||
|
uint32_t buffer[3];
|
||||||
|
/*
|
||||||
|
* Send the required arguments:
|
||||||
|
* 1. RAPL MSR register to read
|
||||||
|
* 2. On which CPU ID
|
||||||
|
* 3. From which vCPU (Thread ID)
|
||||||
|
*/
|
||||||
|
buffer[0] = reg;
|
||||||
|
buffer[1] = cpu_id;
|
||||||
|
buffer[2] = tid;
|
||||||
|
|
||||||
|
r = qio_channel_write_all(QIO_CHANNEL(sioc),
|
||||||
|
(char *)buffer, sizeof(buffer),
|
||||||
|
&local_err);
|
||||||
|
if (r < 0) {
|
||||||
|
goto out_close;
|
||||||
|
}
|
||||||
|
|
||||||
|
r = qio_channel_read(QIO_CHANNEL(sioc),
|
||||||
|
(char *)&data, sizeof(data),
|
||||||
|
&local_err);
|
||||||
|
if (r < 0) {
|
||||||
|
data = 0;
|
||||||
|
goto out_close;
|
||||||
|
}
|
||||||
|
|
||||||
|
out_close:
|
||||||
|
return data;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Retrieve the max number of physical package */
|
||||||
|
unsigned int vmsr_get_max_physical_package(unsigned int max_cpus)
|
||||||
|
{
|
||||||
|
const char *dir = "/sys/devices/system/cpu/";
|
||||||
|
const char *topo_path = "topology/physical_package_id";
|
||||||
|
g_autofree int *uniquePackages = g_new0(int, max_cpus);
|
||||||
|
unsigned int packageCount = 0;
|
||||||
|
FILE *file = NULL;
|
||||||
|
|
||||||
|
for (int i = 0; i < max_cpus; i++) {
|
||||||
|
g_autofree char *filePath = NULL;
|
||||||
|
g_autofree char *cpuid = g_strdup_printf("cpu%d", i);
|
||||||
|
|
||||||
|
filePath = g_build_filename(dir, cpuid, topo_path, NULL);
|
||||||
|
|
||||||
|
file = fopen(filePath, "r");
|
||||||
|
|
||||||
|
if (file == NULL) {
|
||||||
|
error_report("Error opening physical_package_id file");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
char packageId[10];
|
||||||
|
if (fgets(packageId, sizeof(packageId), file) == NULL) {
|
||||||
|
packageCount = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
fclose(file);
|
||||||
|
|
||||||
|
int currentPackageId = atoi(packageId);
|
||||||
|
|
||||||
|
bool isUnique = true;
|
||||||
|
for (int j = 0; j < packageCount; j++) {
|
||||||
|
if (uniquePackages[j] == currentPackageId) {
|
||||||
|
isUnique = false;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (isUnique) {
|
||||||
|
uniquePackages[packageCount] = currentPackageId;
|
||||||
|
packageCount++;
|
||||||
|
|
||||||
|
if (packageCount >= max_cpus) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return (packageCount == 0) ? 1 : packageCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Retrieve the max number of physical cpu on the host */
|
||||||
|
unsigned int vmsr_get_maxcpus(void)
|
||||||
|
{
|
||||||
|
GDir *dir;
|
||||||
|
const gchar *entry_name;
|
||||||
|
unsigned int cpu_count = 0;
|
||||||
|
const char *path = "/sys/devices/system/cpu/";
|
||||||
|
|
||||||
|
dir = g_dir_open(path, 0, NULL);
|
||||||
|
if (dir == NULL) {
|
||||||
|
error_report("Unable to open cpu directory");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
|
||||||
|
while ((entry_name = g_dir_read_name(dir)) != NULL) {
|
||||||
|
if (g_ascii_strncasecmp(entry_name, "cpu", 3) == 0 &&
|
||||||
|
isdigit(entry_name[3])) {
|
||||||
|
cpu_count++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
g_dir_close(dir);
|
||||||
|
|
||||||
|
return cpu_count;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Count the number of physical cpu on each packages */
|
||||||
|
unsigned int vmsr_count_cpus_per_package(unsigned int *package_count,
|
||||||
|
unsigned int max_pkgs)
|
||||||
|
{
|
||||||
|
g_autofree char *file_contents = NULL;
|
||||||
|
g_autofree char *path = NULL;
|
||||||
|
g_autofree char *path_name = NULL;
|
||||||
|
gsize length;
|
||||||
|
|
||||||
|
/* Iterate over cpus and count cpus in each package */
|
||||||
|
for (int cpu_id = 0; ; cpu_id++) {
|
||||||
|
path_name = g_strdup_printf("/sys/devices/system/cpu/cpu%d/"
|
||||||
|
"topology/physical_package_id", cpu_id);
|
||||||
|
|
||||||
|
path = g_build_filename(path_name, NULL);
|
||||||
|
|
||||||
|
if (!g_file_get_contents(path, &file_contents, &length, NULL)) {
|
||||||
|
break; /* No more cpus */
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Get the physical package ID for this CPU */
|
||||||
|
int package_id = atoi(file_contents);
|
||||||
|
|
||||||
|
/* Check if the package ID is within the known number of packages */
|
||||||
|
if (package_id >= 0 && package_id < max_pkgs) {
|
||||||
|
/* If yes, count the cpu for this package*/
|
||||||
|
package_count[package_id]++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Get the physical package id from a given cpu id */
|
||||||
|
int vmsr_get_physical_package_id(int cpu_id)
|
||||||
|
{
|
||||||
|
g_autofree char *file_contents = NULL;
|
||||||
|
g_autofree char *file_path = NULL;
|
||||||
|
int package_id = -1;
|
||||||
|
gsize length;
|
||||||
|
|
||||||
|
file_path = g_strdup_printf("/sys/devices/system/cpu/cpu%d"
|
||||||
|
"/topology/physical_package_id", cpu_id);
|
||||||
|
|
||||||
|
if (!g_file_get_contents(file_path, &file_contents, &length, NULL)) {
|
||||||
|
goto out;
|
||||||
|
}
|
||||||
|
|
||||||
|
package_id = atoi(file_contents);
|
||||||
|
|
||||||
|
out:
|
||||||
|
return package_id;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Read the scheduled time for a given thread of a give pid */
|
||||||
|
void vmsr_read_thread_stat(pid_t pid,
|
||||||
|
unsigned int thread_id,
|
||||||
|
unsigned long long *utime,
|
||||||
|
unsigned long long *stime,
|
||||||
|
unsigned int *cpu_id)
|
||||||
|
{
|
||||||
|
g_autofree char *path = NULL;
|
||||||
|
g_autofree char *path_name = NULL;
|
||||||
|
|
||||||
|
path_name = g_strdup_printf("/proc/%u/task/%d/stat", pid, thread_id);
|
||||||
|
|
||||||
|
path = g_build_filename(path_name, NULL);
|
||||||
|
|
||||||
|
FILE *file = fopen(path, "r");
|
||||||
|
if (file == NULL) {
|
||||||
|
pid = -1;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (fscanf(file, "%*d (%*[^)]) %*c %*d %*d %*d %*d %*d %*u %*u %*u %*u %*u"
|
||||||
|
" %llu %llu %*d %*d %*d %*d %*d %*d %*u %*u %*d %*u %*u"
|
||||||
|
" %*u %*u %*u %*u %*u %*u %*u %*u %*u %*d %*u %*u %u",
|
||||||
|
utime, stime, cpu_id) != 3)
|
||||||
|
{
|
||||||
|
pid = -1;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
fclose(file);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Read QEMU stat task folder to retrieve all QEMU threads ID */
|
||||||
|
pid_t *vmsr_get_thread_ids(pid_t pid, unsigned int *num_threads)
|
||||||
|
{
|
||||||
|
g_autofree char *task_path = g_strdup_printf("%d/task", pid);
|
||||||
|
g_autofree char *path = g_build_filename("/proc", task_path, NULL);
|
||||||
|
|
||||||
|
DIR *dir = opendir(path);
|
||||||
|
if (dir == NULL) {
|
||||||
|
error_report("Error opening /proc/qemu/task");
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
pid_t *thread_ids = NULL;
|
||||||
|
unsigned int thread_count = 0;
|
||||||
|
|
||||||
|
g_autofree struct dirent *ent = NULL;
|
||||||
|
while ((ent = readdir(dir)) != NULL) {
|
||||||
|
if (ent->d_name[0] == '.') {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
pid_t tid = atoi(ent->d_name);
|
||||||
|
if (pid != tid) {
|
||||||
|
thread_ids = g_renew(pid_t, thread_ids, (thread_count + 1));
|
||||||
|
thread_ids[thread_count] = tid;
|
||||||
|
thread_count++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
closedir(dir);
|
||||||
|
|
||||||
|
*num_threads = thread_count;
|
||||||
|
return thread_ids;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Store in thd_stat[i].delta_ticks the difference between the second
 * sample (index 1) and the first sample (index 0) of utime + stime.
 */
void vmsr_delta_ticks(vmsr_thread_stat *thd_stat, int i)
{
    vmsr_thread_stat *t = &thd_stat[i];
    unsigned long long after = t->utime[1] + t->stime[1];
    unsigned long long before = t->utime[0] + t->stime[0];

    t->delta_ticks = after - before;
}
|
||||||
|
|
||||||
|
/*
 * Scale an energy delta by the share of ticks used.
 *
 * Computes (e_delta / 100) * ((100 / maxticks) * delta_ticks) in double
 * precision. @maxticks is not checked against zero.
 */
double vmsr_get_ratio(uint64_t e_delta,
                      unsigned long long delta_ticks,
                      unsigned int maxticks)
{
    /* Same operations and evaluation order as the one-line expression */
    double energy_per_percent = e_delta / 100.0;
    double ticks_percent = (100.0 / maxticks) * delta_ticks;

    return energy_per_percent * ticks_percent;
}
|
||||||
|
|
||||||
|
void vmsr_init_topo_info(X86CPUTopoInfo *topo_info,
|
||||||
|
const MachineState *ms)
|
||||||
|
{
|
||||||
|
topo_info->dies_per_pkg = ms->smp.dies;
|
||||||
|
topo_info->modules_per_die = ms->smp.modules;
|
||||||
|
topo_info->cores_per_module = ms->smp.cores;
|
||||||
|
topo_info->threads_per_core = ms->smp.threads;
|
||||||
|
}
|
||||||
|
|
99
target/i386/kvm/vmsr_energy.h
Normal file
99
target/i386/kvm/vmsr_energy.h
Normal file
|
@ -0,0 +1,99 @@
|
||||||
|
/*
|
||||||
|
* QEMU KVM support -- x86 virtual energy-related MSR.
|
||||||
|
*
|
||||||
|
* Copyright 2024 Red Hat, Inc. 2024
|
||||||
|
*
|
||||||
|
* Author:
|
||||||
|
* Anthony Harivel <aharivel@redhat.com>
|
||||||
|
*
|
||||||
|
* This work is licensed under the terms of the GNU GPL, version 2 or later.
|
||||||
|
* See the COPYING file in the top-level directory.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifndef VMSR_ENERGY_H
|
||||||
|
#define VMSR_ENERGY_H
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
#include "qemu/osdep.h"
|
||||||
|
#include "io/channel-socket.h"
|
||||||
|
#include "hw/i386/topology.h"
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Define the interval time in micro seconds between 2 samples of
|
||||||
|
* energy related MSRs
|
||||||
|
*/
|
||||||
|
#define MSR_ENERGY_THREAD_SLEEP_US 1000000.0
|
||||||
|
|
||||||
|
/*
 * Thread statistic
 * @ thread_id:   TID (thread ID)
 * @ is_vcpu:     true if TID is vCPU thread
 * @ cpu_id:      CPU number last executed on
 * @ pkg_id:      package number of the CPU
 * @ vpkg_id:     virtual package number
 * @ vcpu_id:     vCPU ID
 * @ acpi_id:     APIC id of the vCPU
 * @ utime:       amount of clock ticks the thread
 *                has been scheduled in User mode
 *                (array of samples, one per measurement)
 * @ stime:       amount of clock ticks the thread
 *                has been scheduled in System mode
 *                (array of samples, one per measurement)
 * @ delta_ticks: delta of utime+stime between
 *                the two samples (before/after sleep)
 */
struct vmsr_thread_stat {
    unsigned int thread_id;
    bool is_vcpu;
    unsigned int cpu_id;
    unsigned int pkg_id;
    unsigned int vpkg_id;
    unsigned int vcpu_id;
    unsigned long acpi_id;
    unsigned long long *utime;
    unsigned long long *stime;
    unsigned long long delta_ticks;
};
|
||||||
|
|
||||||
|
/*
 * Package statistic
 * @ e_start: package energy counter before the sleep
 * @ e_end:   package energy counter after the sleep
 * @ e_delta: delta of package energy counter
 * @ e_ratio: store the energy ratio of non-vCPU thread
 * @ nb_vcpu: number of vCPU running on this package
 */
struct vmsr_package_energy_stat {
    uint64_t e_start;
    uint64_t e_end;
    uint64_t e_delta;
    uint64_t e_ratio;
    unsigned int nb_vcpu;
};
|
||||||
|
|
||||||
|
typedef struct vmsr_thread_stat vmsr_thread_stat;
|
||||||
|
typedef struct vmsr_package_energy_stat vmsr_package_energy_stat;
|
||||||
|
|
||||||
|
char *vmsr_compute_default_paths(void);
|
||||||
|
void vmsr_read_thread_stat(pid_t pid,
|
||||||
|
unsigned int thread_id,
|
||||||
|
unsigned long long *utime,
|
||||||
|
unsigned long long *stime,
|
||||||
|
unsigned int *cpu_id);
|
||||||
|
|
||||||
|
QIOChannelSocket *vmsr_open_socket(const char *path);
|
||||||
|
uint64_t vmsr_read_msr(uint32_t reg, uint32_t cpu_id,
|
||||||
|
uint32_t tid, QIOChannelSocket *sioc);
|
||||||
|
void vmsr_delta_ticks(vmsr_thread_stat *thd_stat, int i);
|
||||||
|
unsigned int vmsr_get_maxcpus(void);
|
||||||
|
unsigned int vmsr_get_max_physical_package(unsigned int max_cpus);
|
||||||
|
unsigned int vmsr_count_cpus_per_package(unsigned int *package_count,
|
||||||
|
unsigned int max_pkgs);
|
||||||
|
int vmsr_get_physical_package_id(int cpu_id);
|
||||||
|
pid_t *vmsr_get_thread_ids(pid_t pid, unsigned int *num_threads);
|
||||||
|
double vmsr_get_ratio(uint64_t e_delta,
|
||||||
|
unsigned long long delta_ticks,
|
||||||
|
unsigned int maxticks);
|
||||||
|
void vmsr_init_topo_info(X86CPUTopoInfo *topo_info, const MachineState *ms);
|
||||||
|
bool is_host_cpu_intel(void);
|
||||||
|
int is_rapl_enabled(void);
|
||||||
|
#endif /* VMSR_ENERGY_H */
|
530
tools/i386/qemu-vmsr-helper.c
Normal file
530
tools/i386/qemu-vmsr-helper.c
Normal file
|
@ -0,0 +1,530 @@
|
||||||
|
/*
|
||||||
|
* Privileged RAPL MSR helper commands for QEMU
|
||||||
|
*
|
||||||
|
* Copyright (C) 2024 Red Hat, Inc. <aharivel@redhat.com>
|
||||||
|
*
|
||||||
|
* Author: Anthony Harivel <aharivel@redhat.com>
|
||||||
|
*
|
||||||
|
* This program is free software; you can redistribute it and/or modify
|
||||||
|
* it under the terms of the GNU General Public License as published by
|
||||||
|
* the Free Software Foundation; under version 2 of the License.
|
||||||
|
*
|
||||||
|
* This program is distributed in the hope that it will be useful,
|
||||||
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||||
|
* GNU General Public License for more details.
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License
|
||||||
|
* along with this program; if not, see <http://www.gnu.org/licenses/>.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include "qemu/osdep.h"
|
||||||
|
#include <getopt.h>
|
||||||
|
#include <stdbool.h>
|
||||||
|
#include <sys/ioctl.h>
|
||||||
|
#ifdef CONFIG_LIBCAP_NG
|
||||||
|
#include <cap-ng.h>
|
||||||
|
#endif
|
||||||
|
#include <pwd.h>
|
||||||
|
#include <grp.h>
|
||||||
|
|
||||||
|
#include "qemu/help-texts.h"
|
||||||
|
#include "qapi/error.h"
|
||||||
|
#include "qemu/cutils.h"
|
||||||
|
#include "qemu/main-loop.h"
|
||||||
|
#include "qemu/module.h"
|
||||||
|
#include "qemu/error-report.h"
|
||||||
|
#include "qemu/config-file.h"
|
||||||
|
#include "qemu-version.h"
|
||||||
|
#include "qapi/error.h"
|
||||||
|
#include "qemu/error-report.h"
|
||||||
|
#include "qemu/log.h"
|
||||||
|
#include "qemu/systemd.h"
|
||||||
|
#include "io/channel.h"
|
||||||
|
#include "io/channel-socket.h"
|
||||||
|
#include "trace/control.h"
|
||||||
|
#include "qemu-version.h"
|
||||||
|
#include "rapl-msr-index.h"
|
||||||
|
|
||||||
|
#define MSR_PATH_TEMPLATE "/dev/cpu/%u/msr"
|
||||||
|
|
||||||
|
static char *socket_path;
|
||||||
|
static char *pidfile;
|
||||||
|
static enum { RUNNING, TERMINATE, TERMINATING } state;
|
||||||
|
static QIOChannelSocket *server_ioc;
|
||||||
|
static int server_watch;
|
||||||
|
static int num_active_sockets = 1;
|
||||||
|
|
||||||
|
#ifdef CONFIG_LIBCAP_NG
|
||||||
|
static int uid = -1;
|
||||||
|
static int gid = -1;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
static void compute_default_paths(void)
|
||||||
|
{
|
||||||
|
g_autofree char *state = qemu_get_local_state_dir();
|
||||||
|
|
||||||
|
socket_path = g_build_filename(state, "run", "qemu-vmsr-helper.sock", NULL);
|
||||||
|
pidfile = g_build_filename(state, "run", "qemu-vmsr-helper.pid", NULL);
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
 * Tell whether the CPUID vendor string is "GenuineIntel".
 *
 * Returns 1 when it is, 0 otherwise.
 */
static int is_intel_processor(void)
{
    unsigned int eax = 0; /* leaf 0: basic identification */
    unsigned int ebx, ecx, edx;

    /*
     * CPUID writes EAX as well as EBX/ECX/EDX, so EAX must be declared
     * as a read-write operand; listing it as input only lets the compiler
     * assume the register still holds 0 afterwards.
     */
    __asm__ volatile (
        "cpuid"
        : "+a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
    );

    /*
     * Check if processor is "GenuineIntel"
     * 0x756e6547 = "Genu"
     * 0x49656e69 = "ineI"
     * 0x6c65746e = "ntel"
     */
    return (ebx == 0x756e6547) && (edx == 0x49656e69) && (ecx == 0x6c65746e);
}
|
||||||
|
|
||||||
|
/*
 * Read the integer stored in /sys/class/powercap/intel-rapl/enabled.
 *
 * Returns the value read, or 0 when the file cannot be opened or does
 * not contain an integer (an error is reported in both cases).
 */
static int is_rapl_enabled(void)
{
    const char *path = "/sys/class/powercap/intel-rapl/enabled";
    int enabled = 0;
    FILE *fp = fopen(path, "r");

    if (fp == NULL) {
        error_report("Error opening %s", path);
        return enabled;
    }

    if (fscanf(fp, "%d", &enabled) != 1) {
        error_report("INTEL RAPL not enabled");
    }
    fclose(fp);

    return enabled;
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Check if the TID that request the MSR read
|
||||||
|
* belongs to the peer. It be should a TID of a vCPU.
|
||||||
|
*/
|
||||||
|
static bool is_tid_present(pid_t pid, pid_t tid)
|
||||||
|
{
|
||||||
|
g_autofree char *tidPath = g_strdup_printf("/proc/%d/task/%d", pid, tid);
|
||||||
|
|
||||||
|
/* Check if the TID directory exists within the PID directory */
|
||||||
|
if (access(tidPath, F_OK) == 0) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
error_report("Failed to open /proc at %s", tidPath);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Only the RAPL MSR in target/i386/cpu.h are allowed
|
||||||
|
*/
|
||||||
|
static bool is_msr_allowed(uint32_t reg)
|
||||||
|
{
|
||||||
|
switch (reg) {
|
||||||
|
case MSR_RAPL_POWER_UNIT:
|
||||||
|
case MSR_PKG_POWER_LIMIT:
|
||||||
|
case MSR_PKG_ENERGY_STATUS:
|
||||||
|
case MSR_PKG_POWER_INFO:
|
||||||
|
return true;
|
||||||
|
default:
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static uint64_t vmsr_read_msr(uint32_t msr_register, unsigned int cpu_id)
|
||||||
|
{
|
||||||
|
int fd;
|
||||||
|
uint64_t result = 0;
|
||||||
|
|
||||||
|
g_autofree char *path = g_strdup_printf(MSR_PATH_TEMPLATE, cpu_id);
|
||||||
|
|
||||||
|
fd = open(path, O_RDONLY);
|
||||||
|
if (fd < 0) {
|
||||||
|
error_report("Failed to open MSR file at %s", path);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (pread(fd, &result, sizeof(result), msr_register) != sizeof(result)) {
|
||||||
|
error_report("Failed to read MSR");
|
||||||
|
result = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
close(fd);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void usage(const char *name)
|
||||||
|
{
|
||||||
|
(printf) (
|
||||||
|
"Usage: %s [OPTIONS] FILE\n"
|
||||||
|
"Virtual RAPL MSR helper program for QEMU\n"
|
||||||
|
"\n"
|
||||||
|
" -h, --help display this help and exit\n"
|
||||||
|
" -V, --version output version information and exit\n"
|
||||||
|
"\n"
|
||||||
|
" -d, --daemon run in the background\n"
|
||||||
|
" -f, --pidfile=PATH PID file when running as a daemon\n"
|
||||||
|
" (default '%s')\n"
|
||||||
|
" -k, --socket=PATH path to the unix socket\n"
|
||||||
|
" (default '%s')\n"
|
||||||
|
" -T, --trace [[enable=]<pattern>][,events=<file>][,file=<file>]\n"
|
||||||
|
" specify tracing options\n"
|
||||||
|
#ifdef CONFIG_LIBCAP_NG
|
||||||
|
" -u, --user=USER user to drop privileges to\n"
|
||||||
|
" -g, --group=GROUP group to drop privileges to\n"
|
||||||
|
#endif
|
||||||
|
"\n"
|
||||||
|
QEMU_HELP_BOTTOM "\n"
|
||||||
|
, name, pidfile, socket_path);
|
||||||
|
}
|
||||||
|
|
||||||
|
static void version(const char *name)
|
||||||
|
{
|
||||||
|
printf(
|
||||||
|
"%s " QEMU_FULL_VERSION "\n"
|
||||||
|
"Written by Anthony Harivel.\n"
|
||||||
|
"\n"
|
||||||
|
QEMU_COPYRIGHT "\n"
|
||||||
|
"This is free software; see the source for copying conditions. There is NO\n"
|
||||||
|
"warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.\n"
|
||||||
|
, name);
|
||||||
|
}
|
||||||
|
|
||||||
|
typedef struct VMSRHelperClient {
|
||||||
|
QIOChannelSocket *ioc;
|
||||||
|
Coroutine *co;
|
||||||
|
} VMSRHelperClient;
|
||||||
|
|
||||||
|
/*
 * Coroutine serving one client connection.
 *
 * @opaque is the VMSRHelperClient of the connection. After the peer PID
 * has been obtained, requests of three uint32_t (MSR register, CPU ID,
 * TID) are read in a loop and answered with one uint64_t. The loop stops
 * on the first I/O error or on the first request for an MSR that is not
 * allowed. The channel reference and the client are released on exit.
 */
static void coroutine_fn vh_co_entry(void *opaque)
{
    VMSRHelperClient *client = opaque;
    Error *local_err = NULL;
    unsigned int peer_pid;
    uint32_t request[3];
    uint64_t vmsr;
    int r;

    qio_channel_set_blocking(QIO_CHANNEL(client->ioc),
                             false, NULL);

    qio_channel_set_follow_coroutine_ctx(QIO_CHANNEL(client->ioc), true);

    /*
     * Check peer credentials
     */
    r = qio_channel_get_peerpid(QIO_CHANNEL(client->ioc),
                                &peer_pid,
                                &local_err);
    if (r < 0) {
        error_report_err(local_err);
        goto out;
    }

    /*
     * Serve requests until an error occurs. The previous condition,
     * "while (r < 0)", could never be true here because r < 0 has just
     * been handled above, so no request was ever answered.
     */
    for (;;) {
        /*
         * Read the requested MSR
         * Only RAPL MSR in rapl-msr-index.h is allowed
         */
        r = qio_channel_read_all(QIO_CHANNEL(client->ioc),
                                 (char *) &request, sizeof(request),
                                 &local_err);
        if (r < 0) {
            error_report_err(local_err);
            break;
        }

        if (!is_msr_allowed(request[0])) {
            error_report("Requested unallowed msr: %d", request[0]);
            break;
        }

        /*
         * The TID is validated before the MSR device is touched; a
         * request with a TID that does not belong to the peer is
         * answered with 0, as before.
         */
        if (is_tid_present(peer_pid, request[2])) {
            vmsr = vmsr_read_msr(request[0], request[1]);
        } else {
            error_report("Requested TID not in peer PID: %d %d",
                         peer_pid, request[2]);
            vmsr = 0;
        }

        r = qio_channel_write_all(QIO_CHANNEL(client->ioc),
                                  (char *) &vmsr,
                                  sizeof(vmsr),
                                  &local_err);
        if (r < 0) {
            error_report_err(local_err);
            break;
        }
    }
out:
    object_unref(OBJECT(client->ioc));
    g_free(client);
}
|
||||||
|
|
||||||
|
static gboolean accept_client(QIOChannel *ioc,
|
||||||
|
GIOCondition cond,
|
||||||
|
gpointer opaque)
|
||||||
|
{
|
||||||
|
QIOChannelSocket *cioc;
|
||||||
|
VMSRHelperClient *vmsrh;
|
||||||
|
|
||||||
|
cioc = qio_channel_socket_accept(QIO_CHANNEL_SOCKET(ioc),
|
||||||
|
NULL);
|
||||||
|
if (!cioc) {
|
||||||
|
return TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
vmsrh = g_new(VMSRHelperClient, 1);
|
||||||
|
vmsrh->ioc = cioc;
|
||||||
|
vmsrh->co = qemu_coroutine_create(vh_co_entry, vmsrh);
|
||||||
|
qemu_coroutine_enter(vmsrh->co);
|
||||||
|
|
||||||
|
return TRUE;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void termsig_handler(int signum)
|
||||||
|
{
|
||||||
|
qatomic_cmpxchg(&state, RUNNING, TERMINATE);
|
||||||
|
qemu_notify_event();
|
||||||
|
}
|
||||||
|
|
||||||
|
static void close_server_socket(void)
|
||||||
|
{
|
||||||
|
assert(server_ioc);
|
||||||
|
|
||||||
|
g_source_remove(server_watch);
|
||||||
|
server_watch = -1;
|
||||||
|
object_unref(OBJECT(server_ioc));
|
||||||
|
num_active_sockets--;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef CONFIG_LIBCAP_NG
/*
 * Clear every capability in the libcap-ng state, then add CAP_SYS_RAWIO
 * to the effective and permitted sets.
 *
 * Returns 0 on success, -1 when capng_update() fails.
 */
static int drop_privileges(void)
{
    int rc;

    /* clear all capabilities */
    capng_clear(CAPNG_SELECT_BOTH);

    rc = capng_update(CAPNG_ADD, CAPNG_EFFECTIVE | CAPNG_PERMITTED,
                      CAP_SYS_RAWIO);

    return (rc < 0) ? -1 : 0;
}
#endif
|
||||||
|
|
||||||
|
int main(int argc, char **argv)
|
||||||
|
{
|
||||||
|
const char *sopt = "hVk:f:dT:u:g:vq";
|
||||||
|
struct option lopt[] = {
|
||||||
|
{ "help", no_argument, NULL, 'h' },
|
||||||
|
{ "version", no_argument, NULL, 'V' },
|
||||||
|
{ "socket", required_argument, NULL, 'k' },
|
||||||
|
{ "pidfile", required_argument, NULL, 'f' },
|
||||||
|
{ "daemon", no_argument, NULL, 'd' },
|
||||||
|
{ "trace", required_argument, NULL, 'T' },
|
||||||
|
{ "verbose", no_argument, NULL, 'v' },
|
||||||
|
{ NULL, 0, NULL, 0 }
|
||||||
|
};
|
||||||
|
int opt_ind = 0;
|
||||||
|
int ch;
|
||||||
|
Error *local_err = NULL;
|
||||||
|
bool daemonize = false;
|
||||||
|
bool pidfile_specified = false;
|
||||||
|
bool socket_path_specified = false;
|
||||||
|
unsigned socket_activation;
|
||||||
|
|
||||||
|
struct sigaction sa_sigterm;
|
||||||
|
memset(&sa_sigterm, 0, sizeof(sa_sigterm));
|
||||||
|
sa_sigterm.sa_handler = termsig_handler;
|
||||||
|
sigaction(SIGTERM, &sa_sigterm, NULL);
|
||||||
|
sigaction(SIGINT, &sa_sigterm, NULL);
|
||||||
|
sigaction(SIGHUP, &sa_sigterm, NULL);
|
||||||
|
|
||||||
|
signal(SIGPIPE, SIG_IGN);
|
||||||
|
|
||||||
|
error_init(argv[0]);
|
||||||
|
module_call_init(MODULE_INIT_TRACE);
|
||||||
|
module_call_init(MODULE_INIT_QOM);
|
||||||
|
qemu_add_opts(&qemu_trace_opts);
|
||||||
|
qemu_init_exec_dir(argv[0]);
|
||||||
|
|
||||||
|
compute_default_paths();
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Sanity check
|
||||||
|
* 1. cpu must be Intel cpu
|
||||||
|
* 2. RAPL must be enabled
|
||||||
|
*/
|
||||||
|
if (!is_intel_processor()) {
|
||||||
|
error_report("error: CPU is not INTEL cpu");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!is_rapl_enabled()) {
|
||||||
|
error_report("error: RAPL driver not enable");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
while ((ch = getopt_long(argc, argv, sopt, lopt, &opt_ind)) != -1) {
|
||||||
|
switch (ch) {
|
||||||
|
case 'k':
|
||||||
|
g_free(socket_path);
|
||||||
|
socket_path = g_strdup(optarg);
|
||||||
|
socket_path_specified = true;
|
||||||
|
if (socket_path[0] != '/') {
|
||||||
|
error_report("socket path must be absolute");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
case 'f':
|
||||||
|
g_free(pidfile);
|
||||||
|
pidfile = g_strdup(optarg);
|
||||||
|
pidfile_specified = true;
|
||||||
|
break;
|
||||||
|
#ifdef CONFIG_LIBCAP_NG
|
||||||
|
case 'u': {
|
||||||
|
unsigned long res;
|
||||||
|
struct passwd *userinfo = getpwnam(optarg);
|
||||||
|
if (userinfo) {
|
||||||
|
uid = userinfo->pw_uid;
|
||||||
|
} else if (qemu_strtoul(optarg, NULL, 10, &res) == 0 &&
|
||||||
|
(uid_t)res == res) {
|
||||||
|
uid = res;
|
||||||
|
} else {
|
||||||
|
error_report("invalid user '%s'", optarg);
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
case 'g': {
|
||||||
|
unsigned long res;
|
||||||
|
struct group *groupinfo = getgrnam(optarg);
|
||||||
|
if (groupinfo) {
|
||||||
|
gid = groupinfo->gr_gid;
|
||||||
|
} else if (qemu_strtoul(optarg, NULL, 10, &res) == 0 &&
|
||||||
|
(gid_t)res == res) {
|
||||||
|
gid = res;
|
||||||
|
} else {
|
||||||
|
error_report("invalid group '%s'", optarg);
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
case 'u':
|
||||||
|
case 'g':
|
||||||
|
error_report("-%c not supported by this %s", ch, argv[0]);
|
||||||
|
exit(1);
|
||||||
|
#endif
|
||||||
|
case 'd':
|
||||||
|
daemonize = true;
|
||||||
|
break;
|
||||||
|
case 'T':
|
||||||
|
trace_opt_parse(optarg);
|
||||||
|
break;
|
||||||
|
case 'V':
|
||||||
|
version(argv[0]);
|
||||||
|
exit(EXIT_SUCCESS);
|
||||||
|
break;
|
||||||
|
case 'h':
|
||||||
|
usage(argv[0]);
|
||||||
|
exit(EXIT_SUCCESS);
|
||||||
|
break;
|
||||||
|
case '?':
|
||||||
|
error_report("Try `%s --help' for more information.", argv[0]);
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!trace_init_backends()) {
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
trace_init_file();
|
||||||
|
qemu_set_log(LOG_TRACE, &error_fatal);
|
||||||
|
|
||||||
|
socket_activation = check_socket_activation();
|
||||||
|
if (socket_activation == 0) {
|
||||||
|
SocketAddress saddr;
|
||||||
|
saddr = (SocketAddress){
|
||||||
|
.type = SOCKET_ADDRESS_TYPE_UNIX,
|
||||||
|
.u.q_unix.path = socket_path,
|
||||||
|
};
|
||||||
|
server_ioc = qio_channel_socket_new();
|
||||||
|
if (qio_channel_socket_listen_sync(server_ioc, &saddr,
|
||||||
|
1, &local_err) < 0) {
|
||||||
|
object_unref(OBJECT(server_ioc));
|
||||||
|
error_report_err(local_err);
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
/* Using socket activation - check user didn't use -p etc. */
|
||||||
|
if (socket_path_specified) {
|
||||||
|
error_report("Unix socket can't be set when"
|
||||||
|
"using socket activation");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Can only listen on a single socket. */
|
||||||
|
if (socket_activation > 1) {
|
||||||
|
error_report("%s does not support socket activation"
|
||||||
|
"with LISTEN_FDS > 1",
|
||||||
|
argv[0]);
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
server_ioc = qio_channel_socket_new_fd(FIRST_SOCKET_ACTIVATION_FD,
|
||||||
|
&local_err);
|
||||||
|
if (server_ioc == NULL) {
|
||||||
|
error_reportf_err(local_err,
|
||||||
|
"Failed to use socket activation: ");
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
qemu_init_main_loop(&error_fatal);
|
||||||
|
|
||||||
|
server_watch = qio_channel_add_watch(QIO_CHANNEL(server_ioc),
|
||||||
|
G_IO_IN,
|
||||||
|
accept_client,
|
||||||
|
NULL, NULL);
|
||||||
|
|
||||||
|
if (daemonize) {
|
||||||
|
if (daemon(0, 0) < 0) {
|
||||||
|
error_report("Failed to daemonize: %s", strerror(errno));
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (daemonize || pidfile_specified) {
|
||||||
|
qemu_write_pidfile(pidfile, &error_fatal);
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef CONFIG_LIBCAP_NG
|
||||||
|
if (drop_privileges() < 0) {
|
||||||
|
error_report("Failed to drop privileges: %s", strerror(errno));
|
||||||
|
exit(EXIT_FAILURE);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
info_report("Listening on %s", socket_path);
|
||||||
|
|
||||||
|
state = RUNNING;
|
||||||
|
do {
|
||||||
|
main_loop_wait(false);
|
||||||
|
if (state == TERMINATE) {
|
||||||
|
state = TERMINATING;
|
||||||
|
close_server_socket();
|
||||||
|
}
|
||||||
|
} while (num_active_sockets > 0);
|
||||||
|
|
||||||
|
exit(EXIT_SUCCESS);
|
||||||
|
}
|
28
tools/i386/rapl-msr-index.h
Normal file
28
tools/i386/rapl-msr-index.h
Normal file
|
@ -0,0 +1,28 @@
|
||||||
|
/*
 * Allowed list of MSR for Privileged RAPL MSR helper commands for QEMU
 *
 * Copyright (C) 2023 Red Hat, Inc. <aharivel@redhat.com>
 *
 * Author: Anthony Harivel <aharivel@redhat.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; under version 2 of the License.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#ifndef RAPL_MSR_INDEX_H
#define RAPL_MSR_INDEX_H

/*
 * Should stay in sync with the RAPL MSR
 * in target/i386/cpu.h
 */
#define MSR_RAPL_POWER_UNIT             0x00000606
#define MSR_PKG_POWER_LIMIT             0x00000610
#define MSR_PKG_ENERGY_STATUS           0x00000611
#define MSR_PKG_POWER_INFO              0x00000614

#endif /* RAPL_MSR_INDEX_H */
|
Loading…
Add table
Add a link
Reference in a new issue