[master] bada81bf6 Avoid hang when child does not come up, add startup_timeout
Nils Goroll
nils.goroll at uplex.de
Mon Jul 3 08:55:09 UTC 2023
commit bada81bf65c69a0120f57e3b2742015d052412fb
Author: Nils Goroll <nils.goroll at uplex.de>
Date: Fri Jun 16 21:50:31 2023 +0200
Avoid hang when child does not come up, add startup_timeout
When a child did not come up within cli_timeout, varnishd startup
would hang indefinitely.
We add startup_timeout specifically for child startup.
To facilitate the transition, we use the maximum of cli_timeout and
start_timeout (suggested by Dridi, thank you) and add a tip if
startup_timeout is not used.
We avoid the previous harsh exit(1), primarily to make the vtc_varnish
facility work.
The test case uses both vtc_varnish and vtc_process to exercise the
different code paths for implicit startup vs. cli "start".
Fixes #3940
diff --git a/bin/varnishd/mgt/mgt_child.c b/bin/varnishd/mgt/mgt_child.c
index 28e7b4570..188cad7bc 100644
--- a/bin/varnishd/mgt/mgt_child.c
+++ b/bin/varnishd/mgt/mgt_child.c
@@ -90,6 +90,7 @@ static struct vlu *child_std_vlu;
static struct vsb *child_panic = NULL;
static void mgt_reap_child(void);
+static int kill_child(void);
/*=====================================================================
* Panic string evacuation and handling
@@ -302,12 +303,14 @@ child_poker(const struct vev *e, int what)
static void
mgt_launch_child(struct cli *cli)
{
- pid_t pid, pidr;
+ pid_t pid;
unsigned u;
char *p;
struct vev *e;
int i, cp[2];
struct rlimit rl[1];
+ vtim_dur dstart;
+ vtim_mono t0;
if (child_state != CH_STOPPED && child_state != CH_DIED)
return;
@@ -436,15 +439,23 @@ mgt_launch_child(struct cli *cli)
AN(child_std_vlu);
/* Wait for cache/cache_cli.c::CLI_Run() to check in */
- if (VCLI_ReadResult(child_cli_in, &u, NULL, mgt_param.cli_timeout)) {
+ dstart = vmax(mgt_param.startup_timeout, mgt_param.cli_timeout);
+ t0 = VTIM_mono();
+ if (VCLI_ReadResult(child_cli_in, &u, NULL, dstart)) {
+ int bstart = mgt_param.startup_timeout >= mgt_param.cli_timeout;
assert(u == CLIS_COMMS);
- pidr = waitpid(pid, &i, 0);
- assert(pidr == pid);
- do {
- i = VLU_Fd(child_std_vlu, child_output);
- } while (i == 0);
- MGT_Complain(C_ERR, "Child failed on launch");
- exit(1); // XXX Harsh ?
+ if (VTIM_mono() - t0 < dstart)
+ mgt_launch_err(cli, u, "Child failed on launch ");
+ else
+ mgt_launch_err(cli, u, "Child failed on launch "
+ "within %s_timeout=%.2fs%s",
+ bstart ? "startup" : "cli", dstart,
+ bstart ? "" : " (tip: set startup_timeout)");
+ child_pid = pid;
+ kill_child();
+ mgt_reap_child();
+ child_state = CH_STOPPED;
+ return;
} else {
assert(u == CLIS_OK);
fprintf(stderr, "Child launched OK\n");
diff --git a/bin/varnishtest/tests/p00000.vtc b/bin/varnishtest/tests/p00000.vtc
index 8b4b6493d..e7a4219b4 100644
--- a/bin/varnishtest/tests/p00000.vtc
+++ b/bin/varnishtest/tests/p00000.vtc
@@ -9,7 +9,8 @@ process p1 -dump {
process p1 -expect-text 0 0 {to launch}
process p1 -write "start\n"
process p1 -expect-text 0 0 {-spersistent has been deprecated}
-process p1 -wait
+process p1 -write "quit\n"
+process p1 -expect-exit 0x20 -wait
server s1 {
rxreq
diff --git a/bin/varnishtest/tests/r03940.vtc b/bin/varnishtest/tests/r03940.vtc
new file mode 100644
index 000000000..1213330c5
--- /dev/null
+++ b/bin/varnishtest/tests/r03940.vtc
@@ -0,0 +1,45 @@
+varnishtest "test startup_timeout vs. stevedore init / open"
+
+# we test with vtc_varnish and vtc_process because of different code
+# paths in mgr for implicit start vs. cli start
+
+####
+# startup_timeout used, delay in stevedore init
+varnish v1 -arg "-sdebug=debug,dinit=5s -pstartup_timeout=3s -pcli_timeout=2s" \
+ -arg "-p feature=+no_coredump" \
+ -vcl "backend none none;"
+varnish v1 -cliexpect \
+ "Child failed on launch within startup_timeout=3.00s" \
+ "start"
+varnish v1 -expectexit 0x40
+
+process p1 { varnishd \
+ -sdebug=debug,dinit=5s \
+ -pstartup_timeout=3s -pcli_timeout=2s \
+ -n ${tmpdir}/p1 -a :0 -b none 2>&1 } -start
+process p1 -expect-exit 0x2 -wait
+process p1 -expect-text 0 0 \
+ "Child failed on launch within startup_timeout=3.00s"
+
+####
+# cli_timeout used, delay in stevedore open
+
+varnish v2 -arg "-sdebug=debug,dopen=5s -pstartup_timeout=2s -pcli_timeout=3s" \
+ -arg "-p feature=+no_coredump" \
+ -vcl "backend none none;"
+varnish v2 -cliexpect \
+ "launch within cli_timeout=3.00s .tip: set startup_" \
+ "start"
+varnish v2 -cliok "panic.clear"
+varnish v2 -expectexit 0x40
+
+process p2 { varnishd \
+ -sdebug=debug,dopen=5s \
+ -pstartup_timeout=2s -pcli_timeout=3s \
+ -n ${tmpdir}/p2 -a :0 -b none} -start
+process p2 -expect-exit 0x2 -wait
+
+# expect-text does not work here because the panic info pushes the
+# error out of the emulated terminal's view.
+# And I do not want to rely on any x lines to be enough
+shell {grep -q "launch within cli_timeout=3.00s (tip: set startup_" ${p2_err}}
diff --git a/bin/varnishtest/tests/s00003.vtc b/bin/varnishtest/tests/s00003.vtc
index d1077dd93..f186d8956 100644
--- a/bin/varnishtest/tests/s00003.vtc
+++ b/bin/varnishtest/tests/s00003.vtc
@@ -47,13 +47,13 @@ varnish v1 -cliok "ban obj.http.date ~ ."
process p1 {
varnishd -sTransient=file,${tmpdir}/foo,xxx -blocalhost -a:0 -n ${tmpdir} 2>&1
-} -expect-exit 255 -dump -start -expect-text 0 0 "Invalid number" -wait -screen_dump
+} -expect-exit 0x2 -dump -start -expect-text 0 0 "Invalid number" -wait -screen_dump
process p1 {
varnishd -sTransient=file,${tmpdir}/foo,10M,xxx -blocalhost -a:0 -n ${tmpdir} 2>&1
-} -expect-exit 255 -dump -start -expect-text 0 0 "granularity" -wait -screen_dump
+} -expect-exit 0x2 -dump -start -expect-text 0 0 "granularity" -wait -screen_dump
process p1 {
varnishd -sTransient=file,${tmpdir}/foo,10m,,foo -blocalhost -a:0 -n ${tmpdir} 2>&1
-} -expect-exit 255 -dump -start -expect-text 0 0 "invalid advice" -wait
+} -expect-exit 0x2 -dump -start -expect-text 0 0 "invalid advice" -wait
diff --git a/include/tbl/params.h b/include/tbl/params.h
index 8a066361f..83054eef3 100644
--- a/include/tbl/params.h
+++ b/include/tbl/params.h
@@ -335,6 +335,18 @@ PARAM_SIMPLE(
"Timeout for the child's replies to CLI requests."
)
+PARAM_SIMPLE(
+ /* name */ startup_timeout,
+ /* type */ timeout,
+ /* min */ "0.000",
+ /* max */ NULL,
+ /* def */ "0.000",
+ /* units */ "seconds",
+ /* descr */
+ "Alternative timeout for the initial worker process startup.\n"
+ "If cli_timeout is longer than startup_timeout, it is used instead."
+)
+
PARAM_SIMPLE(
/* name */ clock_skew,
/* type */ uint,
More information about the varnish-commit
mailing list