fix: overhaul socket buffer handling for far higher stability and blocking boot

This commit is contained in:
Lance Vick 2025-04-30 17:30:18 -07:00
parent bb2f87d471
commit 24a254a914
Signed by: lrvick
GPG Key ID: 8E47A1EC35A1551D
1 changed file with 183 additions and 140 deletions

View File

@ -2,59 +2,72 @@
# Abort on unhandled command failures (-e) and on use of unset variables (-u).
set -eu
# Full command line as an array. "$@" is quoted so that arguments containing
# whitespace or glob characters reach the cmd_* functions unchanged.
COMMAND=("$@")
# Legacy name for the guest-agent socket path (same value as QGA_SOCK_PATH),
# kept for any caller that still uses the old name.
QGA_SOCKET=/var/run/netvm_qga.sock
# File holding the pid of the running qemu process.
LOCKFILE=/var/run/netvm.pid
# UNIX socket on which qemu exposes the guest agent (QGA) channel.
QGA_SOCK_PATH=/var/run/netvm_qga.sock
# Placeholder value; `coproc QGA_SOCK_FDS` reuses this name for the socat
# coprocess file descriptors once a connection is opened.
QGA_SOCK_FDS=false
#######################################
# Open a connection to the guest-agent socket, reusing one if already open.
# socat runs as a coprocess named QGA_SOCK_FDS; its pipe ends are published
# as QGA_SOCK_IN / QGA_SOCK_OUT for the other qga_* helpers.
# Globals:
#   QGA_SOCK_PATH    (read)    path of the UNIX socket to connect to
#   QGA_SOCK_FDS     (written) coprocess fd array, set by `coproc`
#   QGA_SOCK_FDS_PID (written) coprocess pid, set by `coproc`
#   QGA_SOCK_IN      (written) fd to read agent replies from
#   QGA_SOCK_OUT     (written) fd to write agent requests to
# Returns:
#   0 when a connection is available, 1 when it could not be established.
#######################################
qga_connect() {
  local pid

  # A live coprocess means we are already connected.
  [[ -v QGA_SOCK_FDS_PID ]] && return 0

  # `coproc` itself always reports success, so connection failures have to be
  # detected explicitly; otherwise a retry loop around this function would
  # never retry.
  [[ -S "${QGA_SOCK_PATH}" ]] || return 1

  coproc QGA_SOCK_FDS (
    exec socat - UNIX-CONNECT:"${QGA_SOCK_PATH}"
  )

  # Best-effort liveness probe: give socat a moment to fail, then make sure
  # the coprocess is still around before handing out its descriptors.
  sleep 0.1
  pid="${QGA_SOCK_FDS_PID-}"
  [[ -n "$pid" ]] || return 1
  kill -0 "$pid" 2>/dev/null || return 1

  QGA_SOCK_IN="${QGA_SOCK_FDS[0]-}"
  QGA_SOCK_OUT="${QGA_SOCK_FDS[1]-}"
  [[ -n "$QGA_SOCK_IN" && -n "$QGA_SOCK_OUT" ]] || return 1
}
#######################################
# Send one guest-agent command over the open connection and read its reply.
# Globals:
#   QGA_SOCK_OUT (read)    fd the JSON request is written to
#   QGA_SOCK_IN  (read)    fd the one-line JSON reply is read from
#   GA_RETURN    (written) compact JSON of the reply's "return" member;
#                          reset to "" at the start of every call
# Arguments:
#   $1 - command name, e.g. guest-ping
#   $2 - JSON value for "arguments" (optional, defaults to {})
# Outputs:
#   The agent's error description on stderr when the reply carries one.
# Returns:
#   0 on success; 1 when the request cannot be built, no reply arrives within
#   5 seconds, the reply is not valid JSON, or the agent reports an error.
#######################################
qga_execute() {
  local cmd args request line error
  cmd="$1"
  args="${2:-}"
  # An empty string is not valid JSON for --argjson; treat it as "no arguments".
  [[ -n "$args" ]] || args='{}'
  GA_RETURN=""

  # Build the request first so a malformed argument fails immediately instead
  # of sending nothing and then waiting for a reply that never comes.
  request=$(
    jq \
      -ncM \
      --arg cmd "$cmd" \
      --argjson args "$args" \
      '{"execute": $cmd, "arguments": $args}'
  ) || return 1
  printf '%s\n' "$request" >&"${QGA_SOCK_OUT}"

  read -t 5 -r -u "${QGA_SOCK_IN}" line || return 1

  # Declaration and assignment are separate so a jq failure is not masked.
  error=$(jq -r '.error.desc // empty' <<< "$line") || return 1
  if [[ -n "$error" ]]; then
    echo "$error" >&2
    return 1
  fi

  GA_RETURN=$(jq -cM .return <<< "$line")
}
# Drain replies that are still queued on the guest-agent connection.
# Lines are read from fd QGA_SOCK_IN with a 1 second timeout each, pausing
# 0.1s after every line; draining stops at the first read that fails.
# Globals: QGA_SOCK_IN (read), LINE (written with the most recent read).
qga_flush() {
  # The documented sentinel-byte flush only produced parse errors here, so it
  # stays disabled:
  #LC_ALL= LC_CTYPE=en_US.UTF-8 printf '%b' "\uff" >&$QGA_SOCK_OUT
  #read -t 5 -r -u $QGA_SOCK_IN LINE
  while read -t 1 -r -u $QGA_SOCK_IN LINE; do
    sleep 0.1
  done
}
#######################################
# Run one guest-agent command end to end: connect, verify the channel with a
# guest-sync round trip, execute the command, print its result, disconnect.
# Globals:
#   GA_RETURN        (read/written) reply payload filled in by qga_execute
#   QGA_SOCK_FDS_PID (read)         pid of the socat coprocess
# Arguments:
#   $1 - command name
#   $2 - JSON value for the command's arguments (optional, defaults to {})
# Outputs:
#   The command's "return" payload as compact JSON on stdout.
# Returns:
#   0 on success; 1 on connect, sync or command failure; otherwise the exit
#   status of the socat coprocess when it did not end cleanly.
#######################################
qga() {
  local cmd args id coproc_pid rc
  cmd="$1"
  args="${2:-}"
  [[ -n "$args" ]] || args='{}'

  qga_connect || return 1

  # Round-trip a random id so that a stale reply left in the buffer is not
  # mistaken for the answer to the real command.
  id=$(( 1 + RANDOM % 10000000 ))
  qga_execute guest-sync "$(jq -ncM --argjson id "$id" '{"id": $id}')" || return 1
  # A `return` inside ( ... ) would only leave the subshell, so the check is
  # written as a plain `if` to really stop on a mismatch.
  if [[ "$(jq -re . <<< "$GA_RETURN")" != "$id" ]]; then
    echo "Error: guest-sync mismatch" >&2
    return 1
  fi
  unset GA_RETURN

  qga_execute "$cmd" "$args" || return 1
  echo "$GA_RETURN"

  # Snapshot the pid: bash may unset QGA_SOCK_FDS_PID as soon as the
  # coprocess is reaped, which would trip `set -u` further down.
  coproc_pid="${QGA_SOCK_FDS_PID-}"
  [[ -n "$coproc_pid" ]] || return 0
  # The coprocess may already be gone; that is not an error.
  kill -INT "$coproc_pid" 2>/dev/null || true
  rc=0
  wait "$coproc_pid" || rc=$?
  # 130 is the status of a process ended by the SIGINT sent above.
  if [[ "$rc" != 0 && "$rc" != 130 ]]; then
    return "$rc"
  fi
  return 0
}
# Start the netvm guest under qemu-system-x86_64, record its pid in LOCKFILE,
# then wait for the QGA socket and the guest agent to respond.
# NOTE(review): this listing appears to come from a commit view that shows
# removed and added lines together without +/- markers, and the hunk marker on
# the next line hides part of the body (including where net_args is assigned),
# so the function is not runnable exactly as shown.
function cmd_start(){
@ -71,120 +84,150 @@ function cmd_start(){
# When net_args is non-empty, allow unsafe interrupts for vfio_iommu_type1.
if [[ -n "$net_args" ]]; then
echo Y > /sys/module/vfio_iommu_type1/parameters/allow_unsafe_interrupts
fi
echo "Starting netvm";
printf "Starting netvm...";
qemu-system-x86_64 \
-m 512M \
--machine q35 \
-nographic \
-serial none \
-monitor none \
-net none \
-cdrom /guest.img \
-boot order=d \
-chardev socket,path=${QGA_SOCKET},server=on,wait=off,id=qga0 \
--machine q35 \
-nographic \
-serial none \
-monitor none \
-net none \
-cdrom /guest.img \
-boot order=d \
-chardev socket,path=/var/run/netvm_qga.sock,server=on,wait=off,id=qga0 \
$net_args \
-device qemu-xhci \
-device virtio-serial \
-device virtserialport,chardev=qga0,name=org.qemu.guest_agent.0 &
pid=$!
-device virtio-serial \
-device virtserialport,chardev=qga0,name=org.qemu.guest_agent.0 \
>/dev/null 2>&1 &!
# NOTE(review): `&!` above looks like zsh's background-and-disown operator;
# confirm it behaves as intended under the shell that runs this script.
pid=$!
printf "done\n"
# Persist the qemu pid; cmd_stop reads it back from LOCKFILE.
echo "$pid" > "${LOCKFILE}"
until [ -S "${QGA_SOCKET}" ]; do sleep 1; done
until qemu_ga guest-ping "{}"; do
ps -p $pid >/dev/null || {
echo "Error: netvm exited unexpectedly";
# NOTE(review): `exit 1` runs before `rm`, so this lockfile removal is never
# reached.
exit 1;
rm "${LOCKFILE}"
}
sleep 1
done
echo "NetVM is booted"
# Block until qemu has created the guest-agent socket.
printf "QGA Socket starting... "
until [ -S "${QGA_SOCK_PATH}" ]; do sleep 1; done
printf "done\n"
printf "Connecting to QGA socket... "
until qga_connect; do sleep 1; done
printf "done\n"
# A missing /proc/<pid>/status entry is treated as qemu having exited.
[ -f "/proc/${pid}/status" ] || {
echo "Error: netvm exited unexpectedly";
rm "${LOCKFILE}"
exit 1;
}
local id;
local spin='-\|/'
local i=0;
# Ping the guest agent until it answers, drawing a spinner meanwhile.
while true; do
i=$(( (i+1) %4 ));
printf "\rConnecting to QGA agent... ${spin:$i:1}"
# NOTE(review): `A || continue && break` either continues or breaks, so the
# `sleep 1` below is never reached; retries are paced only by the 5 second
# read timeout inside qga_execute.
qga_execute guest-ping "{}" > /dev/null || continue && break
sleep 1
done;
printf "\rConnecting to guest agent... done\n"
# Drop whatever replies are still queued before syncing.
printf "Flushing buffers..."
qga_flush
printf "done\n"
local i=0;
# Repeat guest-sync with a fresh random id until the agent echoes it back.
while true; do
i=$(( (i+1) %4 ));
printf "\rSyncing with guest... ${spin:$i:1}"
id=$((1 + $RANDOM % 10000000))
# NOTE(review): `|| continue` also skips the `sleep 1` at the end of the loop.
qga_execute guest-sync "$(jq -ncM --argjson id "$id" '{"id": $id}')" || continue
if [[ "$(jq -re . <<< "$GA_RETURN")" = "$id" ]]; then
printf "\rSyncing with guest... done\n"
break
fi;
sleep 1
done;
echo "NetVM boot complete"
}
#######################################
# Stop the netvm: signal the pid stored in LOCKFILE and remove the lockfile.
# Globals:
#   LOCKFILE (read) file holding the qemu pid
# Outputs:
#   Diagnostics on stderr.
# Returns:
#   0 once the lockfile is removed, 1 when there is no lockfile.
#######################################
function cmd_stop(){
  local pid

  if [[ ! -f "${LOCKFILE}" ]]; then
    echo "Error: Netvm is not running" >&2
    return 1
  fi

  pid=$(<"${LOCKFILE}")
  # A stale pid must not abort the script under `set -e`, otherwise the
  # lockfile would never be cleaned up.
  if ! kill "$pid" 2>/dev/null; then
    echo "Warning: could not signal netvm process '${pid}'" >&2
  fi
  rm -f -- "${LOCKFILE}"
}
#######################################
# Print the guest's host name followed by the output of `uptime` run inside
# the guest.
# Outputs:
#   Host name, then the decoded uptime output, on stdout.
# Returns:
#   0 on success, 1 when uptime has not finished after 10 status polls.
#######################################
function cmd_status(){
  local pid status_request status attempt

  qga guest-get-host-name "{}" | jq -r '."host-name"'

  pid=$(qga guest-exec '{"path": "uptime", "capture-output": true}' | jq -r '.pid')
  status_request=$(jq -n --argjson pid "$pid" '{pid: $pid }')

  # guest-exec returns before the process has finished, so poll until the
  # agent reports it as exited instead of reading the status only once.
  for (( attempt = 0; attempt < 10; attempt++ )); do
    status=$(qga guest-exec-status "$status_request")
    if [[ "$(jq -r '.exited' <<< "$status")" == "true" ]]; then
      jq -r '."out-data" // empty' <<< "$status" | base64 -d
      return 0
    fi
    sleep 1
  done

  echo "Error: uptime did not finish in the guest" >&2
  return 1
}
#######################################
# Copy a local file into the guest through the guest agent's file API.
# The whole file is sent base64-encoded in a single guest-file-write request.
# Globals:
#   COMMAND (read) COMMAND[1] is the local source, COMMAND[2] the guest path
# Outputs:
#   The agent's replies to the write, flush and close requests on stdout;
#   diagnostics on stderr.
# Returns:
#   0 on success, non-zero on bad arguments or when a request fails.
#######################################
function cmd_push(){
  local source dest fo_request handle bufb64 count fw_request fh_request
  local rc=0
  source="${COMMAND[1]-}"
  dest="${COMMAND[2]-}"

  if [[ -z "$source" || -z "$dest" ]]; then
    echo "Error: missing source or destination" >&2
    return 1
  fi
  if [[ ! -r "$source" ]]; then
    echo "Error: cannot read ${source}" >&2
    return 1
  fi

  fo_request=$(jq -n --arg dest "$dest" '{"path": $dest, "mode": "w" }')
  handle=$(qga guest-file-open "$fo_request")

  # Strip the line wrapping some base64 implementations add.
  bufb64=$(base64 < "$source" | tr -d '\n')
  count=$(wc -c < "$source")
  fw_request=$(jq -n \
    --argjson handle "$handle" \
    --argjson count "$count" \
    --arg bufb64 "$bufb64" \
    '{handle: $handle, "buf-b64": $bufb64, count: $count }' \
  )
  fh_request=$(jq -n --argjson handle "$handle" '{handle: $handle}')

  # Always close the guest-side handle, even when write or flush fails.
  {
    qga guest-file-write "$fw_request" \
      && qga guest-file-flush "$fh_request"
  } || rc=$?
  qga guest-file-close "$fh_request" || rc=$?
  return "$rc"
}
#######################################
# Copy a file out of the guest through the guest agent's file API.
# Globals:
#   COMMAND (read) COMMAND[1] is the guest path, COMMAND[2] the local target
# Outputs:
#   Writes the decoded file content to the local target; diagnostics on stderr.
# Returns:
#   0 on success, non-zero on bad arguments or when a request fails.
#######################################
function cmd_pull(){
  local source dest fo_request handle fr_request fh_request chunk
  local rc=0
  source="${COMMAND[1]-}"
  dest="${COMMAND[2]-}"

  if [[ -z "$source" || -z "$dest" ]]; then
    echo "Error: missing source or destination" >&2
    return 1
  fi

  fo_request=$(jq -n --arg source "$source" '{"path": $source}')
  handle=$(qga guest-file-open "$fo_request")
  fr_request=$(jq -n \
    --argjson handle "$handle" \
    '{handle: $handle, count: 48000000 }' \
  )
  fh_request=$(jq -n --argjson handle "$handle" '{handle: $handle}')

  : > "$dest"
  # Keep reading until the agent reports end-of-file. An empty read or an
  # unparsable reply also ends the loop, so it cannot spin forever.
  while :; do
    chunk=$(qga guest-file-read "$fr_request") || { rc=$?; break; }
    jq -r '."buf-b64" // empty' <<< "$chunk" | base64 -d >> "$dest" \
      || { rc=$?; break; }
    if [[ "$(jq -r '(.eof == true) or ((.count // 0) == 0)' <<< "$chunk")" != "false" ]]; then
      break
    fi
  done

  # Release the guest-side handle; its reply is not part of this command's
  # output.
  qga guest-file-close "$fh_request" >/dev/null || rc=$?
  return "$rc"
}
#######################################
# Run a command inside the guest via guest-exec and print its captured stdout.
# Globals:
#   COMMAND       (read) COMMAND[1] is the program, COMMAND[2..] its arguments
#   LOCKFILE      (read) must exist, i.e. the netvm was started
#   QGA_SOCK_PATH (read) must be an existing socket
# Outputs:
#   The guest command's decoded stdout, or an error message, on stdout.
# Returns:
#   0 once the guest process has exited; exits the script with 1 when the
#   command is missing, the netvm is not running, or the socket is absent.
#######################################
function cmd_run(){
  local cmd args_json request status_request pid out

  # ${COMMAND[1]-} keeps a missing argument from tripping `set -u` before the
  # friendly error can be printed.
  if [ -z "${COMMAND[1]-}" ]; then
    echo "Error: missing command"
    exit 1
  fi
  [ -f "${LOCKFILE}" ] || { echo "Error: Netvm is not running"; exit 1; }
  [ -S "${QGA_SOCK_PATH}" ] || { echo "Error: Netvm QGA socket is missing"; exit 1; }

  cmd="${COMMAND[1]}"
  args_json="[]"
  # One JSON string per argument. Joining the arguments into a single scalar
  # first would hand the guest program all of them as one argument.
  if (( ${#COMMAND[@]} > 2 )); then
    args_json=$(printf '%s\n' "${COMMAND[@]:2}" | jq -R . | jq -s .)
  fi

  request=$( \
    jq -n \
      --arg path "$cmd" \
      --argjson args "$args_json" \
      '{
        path: $path,
        arg: $args,
        "capture-output": true
      }' \
  )
  pid=$(qga guest-exec "$request" | jq -r '.pid')
  status_request=$(jq -n --argjson pid "$pid" '{pid: $pid }')

  # Poll once per second until the guest process has exited.
  while :; do
    out=$(qga guest-exec-status "$status_request")
    if [[ "$(jq -r '.exited' <<< "$out")" == "true" ]]; then
      # -e makes jq's exit status reflect has(); without it the check always
      # passed and a missing "out-data" was fed to base64 as "null".
      if jq -e 'has("out-data")' <<< "$out" >/dev/null; then
        jq -r '."out-data"' <<< "$out" | base64 -d
      fi
      break
    fi
    sleep 1
  done
}
# Print the usage/help text for the netvm command from a here-document.
# NOTE(review): the rest of the help text and the here-document terminator are
# hidden behind the hunk marker below, so only the opening lines are visible.
cmd_usage() {
cat <<-_EOF
netvm
Control network vm headlessly via QMP protocol
Usage:
@ -204,12 +247,12 @@ cmd_usage() {
}
# Dispatch on the first command-line argument; anything unrecognised, or no
# argument at all, prints the usage text. ${1-} keeps `set -u` from aborting
# when the script is run without arguments.
case "${1-}" in
  status) shift; cmd_status "$@" ;;
  start)  shift; cmd_start "$@" ;;
  stop)   shift; cmd_stop "$@" ;;
  push)   shift; cmd_push "$@" ;;
  pull)   shift; cmd_pull "$@" ;;
  run)    shift; cmd_run "$@" ;;
  help)   shift; cmd_usage "$@" ;;
  *)      cmd_usage "$@" ;;
esac