mirror of
https://github.com/Motorhead1991/qemu.git
synced 2025-07-29 21:33:53 -06:00

Like macOS we have similar issue on Linux. For TCP socket the send buffer size is 2626560 bytes (~2.5 MiB) and we get good performance. However for unix socket the default and maximum buffer size is 212992 bytes (208 KiB) and we see poor performance when using one NBD connection, up to 4 times slower than macOS on the same machine. Tracing shows that for every 2 MiB payload (qemu uses 2 MiB io size), we do 1 recvmsg call with TCP socket, and 10 recvmsg calls with unix socket. Fixing this issue requires changing the maximum send buffer size (the receive buffer size is ignored). This can be done using: $ cat /etc/sysctl.d/net-mem-max.conf net.core.wmem_max = 2097152 $ sudo sysctl -p /etc/sysctl.d/net-mem-max.conf With this we can set the socket buffer size to 2 MiB. With the defaults the value requested by qemu is clipped to the maximum size and has no effect. I tested on 2 machines: - Fedora 42 VM on MacBook Pro M2 Max - Dell PowerEdge R640 (Intel(R) Xeon(R) Gold 6230 CPU @ 2.10GHz) On the older Dell machine we see very little improvement, up to 1.03 higher throughput. On the M2 machine we see up to 2.67 times higher throughput. The following results are from the M2 machine. Reading from qemu-nbd with qemu-img convert. In this test buffer size of 4m is optimal (2.28 times faster). | buffer size | time | user | system | |-------------|---------|---------|---------| | default | 4.292 | 0.243 | 1.604 | | 524288 | 2.167 | 0.058 | 1.288 | | 1048576 | 2.041 | 0.060 | 1.238 | | 2097152 | 1.884 | 0.060 | 1.191 | | 4194304 | 1.881 | 0.054 | 1.196 | Writing to qemu-nbd with qemu-img convert. In this test buffer size of 1m is optimal (2.67 times faster). | buffer size | time | user | system | |-------------|---------|---------|---------| | default | 3.113 | 0.334 | 1.094 | | 524288 | 1.173 | 0.179 | 0.654 | | 1048576 | 1.164 | 0.164 | 0.670 | | 2097152 | 1.227 | 0.197 | 0.663 | | 4194304 | 1.227 | 0.198 | 0.666 | Computing a blkhash with nbdcopy. In this test buffer size of 512k is optimal (1.19 times faster). | buffer size | time | user | system | |-------------|---------|---------|---------| | default | 2.140 | 4.483 | 2.681 | | 524288 | 1.794 | 4.467 | 2.572 | | 1048576 | 1.807 | 4.447 | 2.644 | | 2097152 | 1.822 | 4.461 | 2.698 | | 4194304 | 1.827 | 4.465 | 2.700 | Computing a blkhash with blksum. In this test buffer size of 4m is optimal (2.65 times faster). | buffer size | time | user | system | |-------------|---------|---------|---------| | default | 3.582 | 4.595 | 2.392 | | 524288 | 1.499 | 4.384 | 1.482 | | 1048576 | 1.377 | 4.381 | 1.345 | | 2097152 | 1.388 | 4.389 | 1.354 | | 4194304 | 1.352 | 4.395 | 1.302 | Signed-off-by: Nir Soffer <nirsof@gmail.com> Message-ID: <20250517201154.88456-4-nirsof@gmail.com> Reviewed-by: Daniel P. Berrangé <berrange@redhat.com> Signed-off-by: Eric Blake <eblake@redhat.com>
292 lines
6.9 KiB
C
292 lines
6.9 KiB
C
/*
|
|
* Copyright (C) 2005 Anthony Liguori <anthony@codemonkey.ws>
|
|
*
|
|
* Network Block Device Common Code
|
|
*
|
|
* This program is free software; you can redistribute it and/or modify
|
|
* it under the terms of the GNU General Public License as published by
|
|
* the Free Software Foundation; under version 2 of the License.
|
|
*
|
|
* This program is distributed in the hope that it will be useful,
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
* GNU General Public License for more details.
|
|
*
|
|
* You should have received a copy of the GNU General Public License
|
|
* along with this program; if not, see <http://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
#include "qemu/osdep.h"
|
|
#include "trace.h"
|
|
#include "io/channel-socket.h"
|
|
#include "qapi/error.h"
|
|
#include "qemu/units.h"
|
|
#include "nbd-internal.h"
|
|
|
|
/* Discard length bytes from channel. Return -errno on failure and 0 on
|
|
* success */
|
|
int nbd_drop(QIOChannel *ioc, size_t size, Error **errp)
|
|
{
|
|
ssize_t ret = 0;
|
|
char small[1024];
|
|
char *buffer;
|
|
|
|
buffer = sizeof(small) >= size ? small : g_malloc(MIN(65536, size));
|
|
while (size > 0) {
|
|
ssize_t count = MIN(65536, size);
|
|
ret = nbd_read(ioc, buffer, MIN(65536, size), NULL, errp);
|
|
|
|
if (ret < 0) {
|
|
goto cleanup;
|
|
}
|
|
size -= count;
|
|
}
|
|
|
|
cleanup:
|
|
if (buffer != small) {
|
|
g_free(buffer);
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
|
|
const char *nbd_opt_lookup(uint32_t opt)
|
|
{
|
|
switch (opt) {
|
|
case NBD_OPT_EXPORT_NAME:
|
|
return "export name";
|
|
case NBD_OPT_ABORT:
|
|
return "abort";
|
|
case NBD_OPT_LIST:
|
|
return "list";
|
|
case NBD_OPT_STARTTLS:
|
|
return "starttls";
|
|
case NBD_OPT_INFO:
|
|
return "info";
|
|
case NBD_OPT_GO:
|
|
return "go";
|
|
case NBD_OPT_STRUCTURED_REPLY:
|
|
return "structured reply";
|
|
case NBD_OPT_LIST_META_CONTEXT:
|
|
return "list meta context";
|
|
case NBD_OPT_SET_META_CONTEXT:
|
|
return "set meta context";
|
|
case NBD_OPT_EXTENDED_HEADERS:
|
|
return "extended headers";
|
|
default:
|
|
return "<unknown>";
|
|
}
|
|
}
|
|
|
|
|
|
const char *nbd_rep_lookup(uint32_t rep)
|
|
{
|
|
switch (rep) {
|
|
case NBD_REP_ACK:
|
|
return "ack";
|
|
case NBD_REP_SERVER:
|
|
return "server";
|
|
case NBD_REP_INFO:
|
|
return "info";
|
|
case NBD_REP_META_CONTEXT:
|
|
return "meta context";
|
|
case NBD_REP_ERR_UNSUP:
|
|
return "unsupported";
|
|
case NBD_REP_ERR_POLICY:
|
|
return "denied by policy";
|
|
case NBD_REP_ERR_INVALID:
|
|
return "invalid";
|
|
case NBD_REP_ERR_PLATFORM:
|
|
return "platform lacks support";
|
|
case NBD_REP_ERR_TLS_REQD:
|
|
return "TLS required";
|
|
case NBD_REP_ERR_UNKNOWN:
|
|
return "export unknown";
|
|
case NBD_REP_ERR_SHUTDOWN:
|
|
return "server shutting down";
|
|
case NBD_REP_ERR_BLOCK_SIZE_REQD:
|
|
return "block size required";
|
|
case NBD_REP_ERR_TOO_BIG:
|
|
return "option payload too big";
|
|
case NBD_REP_ERR_EXT_HEADER_REQD:
|
|
return "extended headers required";
|
|
default:
|
|
return "<unknown>";
|
|
}
|
|
}
|
|
|
|
|
|
const char *nbd_info_lookup(uint16_t info)
|
|
{
|
|
switch (info) {
|
|
case NBD_INFO_EXPORT:
|
|
return "export";
|
|
case NBD_INFO_NAME:
|
|
return "name";
|
|
case NBD_INFO_DESCRIPTION:
|
|
return "description";
|
|
case NBD_INFO_BLOCK_SIZE:
|
|
return "block size";
|
|
default:
|
|
return "<unknown>";
|
|
}
|
|
}
|
|
|
|
|
|
const char *nbd_cmd_lookup(uint16_t cmd)
|
|
{
|
|
switch (cmd) {
|
|
case NBD_CMD_READ:
|
|
return "read";
|
|
case NBD_CMD_WRITE:
|
|
return "write";
|
|
case NBD_CMD_DISC:
|
|
return "disconnect";
|
|
case NBD_CMD_FLUSH:
|
|
return "flush";
|
|
case NBD_CMD_TRIM:
|
|
return "trim";
|
|
case NBD_CMD_CACHE:
|
|
return "cache";
|
|
case NBD_CMD_WRITE_ZEROES:
|
|
return "write zeroes";
|
|
case NBD_CMD_BLOCK_STATUS:
|
|
return "block status";
|
|
default:
|
|
return "<unknown>";
|
|
}
|
|
}
|
|
|
|
|
|
const char *nbd_reply_type_lookup(uint16_t type)
|
|
{
|
|
switch (type) {
|
|
case NBD_REPLY_TYPE_NONE:
|
|
return "none";
|
|
case NBD_REPLY_TYPE_OFFSET_DATA:
|
|
return "data";
|
|
case NBD_REPLY_TYPE_OFFSET_HOLE:
|
|
return "hole";
|
|
case NBD_REPLY_TYPE_BLOCK_STATUS:
|
|
return "block status (32-bit)";
|
|
case NBD_REPLY_TYPE_BLOCK_STATUS_EXT:
|
|
return "block status (64-bit)";
|
|
case NBD_REPLY_TYPE_ERROR:
|
|
return "generic error";
|
|
case NBD_REPLY_TYPE_ERROR_OFFSET:
|
|
return "error at offset";
|
|
default:
|
|
if (type & (1 << 15)) {
|
|
return "<unknown error>";
|
|
}
|
|
return "<unknown>";
|
|
}
|
|
}
|
|
|
|
|
|
const char *nbd_err_lookup(int err)
|
|
{
|
|
switch (err) {
|
|
case NBD_SUCCESS:
|
|
return "success";
|
|
case NBD_EPERM:
|
|
return "EPERM";
|
|
case NBD_EIO:
|
|
return "EIO";
|
|
case NBD_ENOMEM:
|
|
return "ENOMEM";
|
|
case NBD_EINVAL:
|
|
return "EINVAL";
|
|
case NBD_ENOSPC:
|
|
return "ENOSPC";
|
|
case NBD_EOVERFLOW:
|
|
return "EOVERFLOW";
|
|
case NBD_ENOTSUP:
|
|
return "ENOTSUP";
|
|
case NBD_ESHUTDOWN:
|
|
return "ESHUTDOWN";
|
|
default:
|
|
return "<unknown>";
|
|
}
|
|
}
|
|
|
|
|
|
int nbd_errno_to_system_errno(int err)
|
|
{
|
|
int ret;
|
|
switch (err) {
|
|
case NBD_SUCCESS:
|
|
ret = 0;
|
|
break;
|
|
case NBD_EPERM:
|
|
ret = EPERM;
|
|
break;
|
|
case NBD_EIO:
|
|
ret = EIO;
|
|
break;
|
|
case NBD_ENOMEM:
|
|
ret = ENOMEM;
|
|
break;
|
|
case NBD_ENOSPC:
|
|
ret = ENOSPC;
|
|
break;
|
|
case NBD_EOVERFLOW:
|
|
ret = EOVERFLOW;
|
|
break;
|
|
case NBD_ENOTSUP:
|
|
ret = ENOTSUP;
|
|
break;
|
|
case NBD_ESHUTDOWN:
|
|
ret = ESHUTDOWN;
|
|
break;
|
|
default:
|
|
trace_nbd_unknown_error(err);
|
|
/* fallthrough */
|
|
case NBD_EINVAL:
|
|
ret = EINVAL;
|
|
break;
|
|
}
|
|
return ret;
|
|
}
|
|
|
|
|
|
const char *nbd_mode_lookup(NBDMode mode)
|
|
{
|
|
switch (mode) {
|
|
case NBD_MODE_OLDSTYLE:
|
|
return "oldstyle";
|
|
case NBD_MODE_EXPORT_NAME:
|
|
return "export name only";
|
|
case NBD_MODE_SIMPLE:
|
|
return "simple headers";
|
|
case NBD_MODE_STRUCTURED:
|
|
return "structured replies";
|
|
case NBD_MODE_EXTENDED:
|
|
return "extended headers";
|
|
default:
|
|
return "<unknown>";
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Testing shows that 2m send buffer is optimal. Changing the receive buffer
|
|
* size has no effect on performance.
|
|
* On Linux we need to increase net.core.wmem_max to make this effective.
|
|
*/
|
|
#if defined(__APPLE__) || defined(__linux__)
|
|
#define UNIX_STREAM_SOCKET_SEND_BUFFER_SIZE (2 * MiB)
|
|
#endif
|
|
|
|
void nbd_set_socket_send_buffer(QIOChannelSocket *sioc)
|
|
{
|
|
#ifdef UNIX_STREAM_SOCKET_SEND_BUFFER_SIZE
|
|
if (sioc->localAddr.ss_family == AF_UNIX) {
|
|
size_t size = UNIX_STREAM_SOCKET_SEND_BUFFER_SIZE;
|
|
Error *errp = NULL;
|
|
|
|
if (qio_channel_socket_set_send_buffer(sioc, size, &errp) < 0) {
|
|
warn_report_err(errp);
|
|
}
|
|
}
|
|
#endif /* UNIX_STREAM_SOCKET_SEND_BUFFER_SIZE */
|
|
}
|