fix: overhaul socket buffer handling for far higher stability and blocking boot

This commit is contained in:
Lance Vick 2025-04-30 17:30:18 -07:00
parent bb2f87d471
commit 24a254a914
Signed by: lrvick
GPG Key ID: 8E47A1EC35A1551D
1 changed file with 183 additions and 140 deletions

View File

@ -2,23 +2,34 @@
set -eu set -eu
COMMAND=($@) COMMAND=($@)
QGA_SOCKET=/var/run/netvm_qga.sock
LOCKFILE=/var/run/netvm.pid LOCKFILE=/var/run/netvm.pid
QGA_SOCK_PATH=/var/run/netvm_qga.sock
QGA_SOCK_FDS=false
qemu_execute() { qga_connect() {
local COMMAND ARGS [[ -v "QGA_SOCK_FDS_PID" ]] && return 0
COMMAND="$1" coproc QGA_SOCK_FDS (
ARGS="${2-}" exec socat - UNIX-CONNECT:"${QGA_SOCK_PATH}"
) || return 1
QGA_SOCK_IN=${QGA_SOCK_FDS[0]}
QGA_SOCK_OUT=${QGA_SOCK_FDS[1]}
}
qga_execute() {
local cmd args
cmd="$1"
args="${2-}"
GA_RETURN=""
jq \ jq \
-ncM \ -ncM \
--arg cmd "$COMMAND" \ --arg cmd "$cmd" \
--argjson args "$ARGS" \ --argjson args "$args" \
'{"execute": $cmd, "arguments": $args}' \ '{"execute": $cmd, "arguments": $args}' \
>&$FD_SOCKET_OUT >&$QGA_SOCK_OUT
local LINE local LINE
read -t 5 -r -u $FD_SOCKET_IN LINE read -t 5 -r -u $QGA_SOCK_IN LINE || return 1
local ERROR=$(jq -r '.error.desc // empty' <<< "$LINE") local ERROR=$(jq -r '.error.desc // empty' <<< "$LINE")
if [[ -n "$ERROR" ]]; then if [[ -n "$ERROR" ]]; then
@ -29,29 +40,31 @@ qemu_execute() {
GA_RETURN=$(jq -cM .return <<< "$LINE") GA_RETURN=$(jq -cM .return <<< "$LINE")
} }
qemu_ga() { qga_flush() {
local COMMAND ARGS #Docs say this should work, but it just gets parse errors
COMMAND="$1" #LC_ALL= LC_CTYPE=en_US.UTF-8 printf '%b' "\uff" >&$QGA_SOCK_OUT
ARGS="$2" #read -t 5 -r -u $QGA_SOCK_IN LINE
until ! read -t 1 -r -u $QGA_SOCK_IN LINE; do sleep 0.1; done
}
coproc FDS ( qga() {
exec socat - UNIX-CONNECT:"${QGA_SOCKET}" local cmd args
) cmd="$1"
args="$2"
FD_SOCKET_IN=${FDS[0]} qga_connect
FD_SOCKET_OUT=${FDS[1]}
local PID=$$ local id=$((1 + $RANDOM % 10000000))
qemu_execute guest-sync "$(jq -ncM --argjson pid "$PID" '{"id": $pid}')" qga_execute guest-sync "$(jq -ncM --argjson id "$id" '{"id": $id}')";
[[ "$(jq -re . <<< "$GA_RETURN")" = "$$" ]] \ [[ "$(jq -re . <<< "$GA_RETURN")" = "$id" ]] || (echo "Error: guest-sync mismatch" >&2 && return 1)
|| (echo "guest-sync mismatch" >&2 && return 1)
qemu_execute "$COMMAND" "$ARGS" unset GA_RETURN
qga_execute "$cmd" "$args"
echo "$GA_RETURN" 2>&1 echo "$GA_RETURN" 2>&1
local RETURN local RETURN
kill -INT "$FDS_PID" 2>/dev/null kill -INT "$QGA_SOCK_FDS_PID" 2>/dev/null
wait "$FDS_PID" || RETURN=$? wait "$QGA_SOCK_FDS_PID" || RETURN=$?
if [[ $RETURN != 130 ]]; then if [[ $RETURN != 130 ]]; then
return $RETURN return $RETURN
fi fi
@ -71,7 +84,7 @@ function cmd_start(){
if [[ -n "$net_args" ]]; then if [[ -n "$net_args" ]]; then
echo Y > /sys/module/vfio_iommu_type1/parameters/allow_unsafe_interrupts echo Y > /sys/module/vfio_iommu_type1/parameters/allow_unsafe_interrupts
fi fi
echo "Starting netvm"; printf "Starting netvm...";
qemu-system-x86_64 \ qemu-system-x86_64 \
-m 512M \ -m 512M \
--machine q35 \ --machine q35 \
@ -81,34 +94,64 @@ function cmd_start(){
-net none \ -net none \
-cdrom /guest.img \ -cdrom /guest.img \
-boot order=d \ -boot order=d \
-chardev socket,path=${QGA_SOCKET},server=on,wait=off,id=qga0 \ -chardev socket,path=/var/run/netvm_qga.sock,server=on,wait=off,id=qga0 \
$net_args \ $net_args \
-device qemu-xhci \ -device qemu-xhci \
-device virtio-serial \ -device virtio-serial \
-device virtserialport,chardev=qga0,name=org.qemu.guest_agent.0 & -device virtserialport,chardev=qga0,name=org.qemu.guest_agent.0 \
>/dev/null 2>&1 &!
pid=$! pid=$!
printf "done\n"
echo "$pid" > "${LOCKFILE}" echo "$pid" > "${LOCKFILE}"
until [ -S "${QGA_SOCKET}" ]; do sleep 1; done printf "QGA Socket starting... "
until qemu_ga guest-ping "{}"; do until [ -S "${QGA_SOCK_PATH}" ]; do sleep 1; done
ps -p $pid >/dev/null || { printf "done\n"
printf "Connecting to QGA socket... "
until qga_connect; do sleep 1; done
printf "done\n"
[ -f "/proc/${pid}/status" ] || {
echo "Error: netvm exited unexpectedly"; echo "Error: netvm exited unexpectedly";
exit 1;
rm "${LOCKFILE}" rm "${LOCKFILE}"
exit 1;
} }
local id;
local spin='-\|/'
local i=0;
while true; do
i=$(( (i+1) %4 ));
printf "\rConnecting to QGA agent... ${spin:$i:1}"
qga_execute guest-ping "{}" > /dev/null || continue && break
sleep 1 sleep 1
done done;
echo "NetVM is booted" printf "\rConnecting to guest agent... done\n"
printf "Flushing buffers..."
qga_flush
printf "done\n"
local i=0;
while true; do
i=$(( (i+1) %4 ));
printf "\rSyncing with guest... ${spin:$i:1}"
id=$((1 + $RANDOM % 10000000))
qga_execute guest-sync "$(jq -ncM --argjson id "$id" '{"id": $id}')" || continue
if [[ "$(jq -re . <<< "$GA_RETURN")" = "$id" ]]; then
printf "\rSyncing with guest... done\n"
break
fi;
sleep 1
done;
echo "NetVM boot complete"
} }
function cmd_stop(){ function cmd_stop(){
pkill $(cat "${LOCKFILE}") kill $(cat "${LOCKFILE}")
rm "${LOCKFILE}" rm "${LOCKFILE}"
} }
function cmd_status(){ function cmd_status(){
qemu_ga guest-get-host-name "{}" | jq -r '."host-name"' qga guest-get-host-name "{}" | jq -r '."host-name"'
pid=$(qemu_ga guest-exec '{"path": "uptime", "capture-output": true}' | jq -r '.pid') pid=$(qga guest-exec '{"path": "uptime", "capture-output": true}' | jq -r '.pid')
out=$(qemu_ga guest-exec-status "$(jq -n --argjson pid "$pid" '{pid: $pid }')" \ out=$(qga guest-exec-status "$(jq -n --argjson pid "$pid" '{pid: $pid }')" \
| jq -r '."out-data"' \ | jq -r '."out-data"' \
| base64 -d \ | base64 -d \
) )
@ -119,7 +162,7 @@ function cmd_push(){
local source="${COMMAND[1]}" local source="${COMMAND[1]}"
local dest="${COMMAND[2]}" local dest="${COMMAND[2]}"
fo_request=$(jq -n --arg dest "$dest" '{"path": $dest, "mode": "w" }') fo_request=$(jq -n --arg dest "$dest" '{"path": $dest, "mode": "w" }')
handle=$(qemu_ga guest-file-open "$fo_request") handle=$(qga guest-file-open "$fo_request")
bufb64=$(base64 "$source") bufb64=$(base64 "$source")
count=$(cat "$source" | wc -c) count=$(cat "$source" | wc -c)
fw_request=$(jq -n \ fw_request=$(jq -n \
@ -128,29 +171,29 @@ function cmd_push(){
--arg bufb64 "$bufb64" \ --arg bufb64 "$bufb64" \
'{handle: $handle, "buf-b64": $bufb64, count: $count }' \ '{handle: $handle, "buf-b64": $bufb64, count: $count }' \
) )
qemu_ga guest-file-write "$fw_request" qga guest-file-write "$fw_request"
fh_request=$(jq -n --argjson handle $handle '{handle: $handle}' ) fh_request=$(jq -n --argjson handle $handle '{handle: $handle}' )
qemu_ga guest-file-flush "$fh_request" qga guest-file-flush "$fh_request"
qemu_ga guest-file-close "$fh_request" qga guest-file-close "$fh_request"
} }
function cmd_pull(){ function cmd_pull(){
local source="${COMMAND[1]}" local source="${COMMAND[1]}"
local dest="${COMMAND[2]}" local dest="${COMMAND[2]}"
fo_request=$(jq -n --arg source "$source" '{"path": $source}') fo_request=$(jq -n --arg source "$source" '{"path": $source}')
handle=$(qemu_ga guest-file-open "$fo_request") handle=$(qga guest-file-open "$fo_request")
fr_request=$(jq -n \ fr_request=$(jq -n \
--argjson handle $handle \ --argjson handle $handle \
'{handle: $handle, count: 48000000 }' \ '{handle: $handle, count: 48000000 }' \
) )
out=$(qemu_ga guest-file-read "$fr_request") out=$(qga guest-file-read "$fr_request")
echo $out | jq -r '."buf-b64"' | base64 -d > $dest echo $out | jq -r '."buf-b64"' | base64 -d > $dest
} }
function cmd_run(){ function cmd_run(){
[ -z "${COMMAND[1]}" ] && { echo "Error: missing command"; exit 1; } [ -z "${COMMAND[1]}" ] && { echo "Error: missing command"; exit 1; }
[ -f "${LOCKFILE}" ] || { echo "Error: Netvm is not running"; exit 1; } [ -f "${LOCKFILE}" ] || { echo "Error: Netvm is not running"; exit 1; }
[ -S "${QGA_SOCKET}" ] || { echo "Error: Netvm QGA socket is missing"; exit 1; } [ -S "${QGA_SOCK_PATH}" ] || { echo "Error: Netvm QGA socket is missing"; exit 1; }
local cmd="${COMMAND[1]}" local cmd="${COMMAND[1]}"
local args="${COMMAND[@]:2}" local args="${COMMAND[@]:2}"
local args_json="[]" local args_json="[]"
@ -168,10 +211,10 @@ function cmd_run(){
"capture-output": true "capture-output": true
}' \ }' \
) )
pid=$(qemu_ga guest-exec "$request" | jq -r '.pid') pid=$(qga guest-exec "$request" | jq -r '.pid')
local exited=false local exited=false
until [ "$exited" == "true" ]; do \ until [ "$exited" == "true" ]; do \
out=$(qemu_ga guest-exec-status "$(jq -n --argjson pid "$pid" '{pid: $pid }')" ) out=$(qga guest-exec-status "$(jq -n --argjson pid "$pid" '{pid: $pid }')" )
exited=$(echo $out | jq -r '.exited') exited=$(echo $out | jq -r '.exited')
if $exited && jq -r 'has("out-data")' >/dev/null < <(echo $out); then if $exited && jq -r 'has("out-data")' >/dev/null < <(echo $out); then
echo "$out" | jq -r '."out-data"' | base64 -d echo "$out" | jq -r '."out-data"' | base64 -d