[master] bada81bf6 Avoid hang when child does not come up, add startup_timeout

Nils Goroll nils.goroll at uplex.de
Mon Jul 3 08:55:09 UTC 2023


commit bada81bf65c69a0120f57e3b2742015d052412fb
Author: Nils Goroll <nils.goroll at uplex.de>
Date:   Fri Jun 16 21:50:31 2023 +0200

    Avoid hang when child does not come up, add startup_timeout
    
    When a child did not come up within cli_timeout, varnishd startup
    would hang indefinitely.
    
    We add startup_timeout specifically for child startup.
    
    To facilitate the transition, we use the maximum of cli_timeout and
    start_timeout (suggested by Dridi, thank you) and add a tip if
    startup_timeout is not used.
    
    We avoid the previous harsh exit(1), primarily to make the vtc_varnish
    facility work.
    
    The test case uses both vtc_varnish and vtc_process to exercise the
    different code paths for implicit startup vs. cli "start".
    
    Fixes #3940

diff --git a/bin/varnishd/mgt/mgt_child.c b/bin/varnishd/mgt/mgt_child.c
index 28e7b4570..188cad7bc 100644
--- a/bin/varnishd/mgt/mgt_child.c
+++ b/bin/varnishd/mgt/mgt_child.c
@@ -90,6 +90,7 @@ static struct vlu	*child_std_vlu;
 static struct vsb *child_panic = NULL;
 
 static void mgt_reap_child(void);
+static int kill_child(void);
 
 /*=====================================================================
  * Panic string evacuation and handling
@@ -302,12 +303,14 @@ child_poker(const struct vev *e, int what)
 static void
 mgt_launch_child(struct cli *cli)
 {
-	pid_t pid, pidr;
+	pid_t pid;
 	unsigned u;
 	char *p;
 	struct vev *e;
 	int i, cp[2];
 	struct rlimit rl[1];
+	vtim_dur dstart;
+	vtim_mono t0;
 
 	if (child_state != CH_STOPPED && child_state != CH_DIED)
 		return;
@@ -436,15 +439,23 @@ mgt_launch_child(struct cli *cli)
 	AN(child_std_vlu);
 
 	/* Wait for cache/cache_cli.c::CLI_Run() to check in */
-	if (VCLI_ReadResult(child_cli_in, &u, NULL, mgt_param.cli_timeout)) {
+	dstart = vmax(mgt_param.startup_timeout, mgt_param.cli_timeout);
+	t0 = VTIM_mono();
+	if (VCLI_ReadResult(child_cli_in, &u, NULL, dstart)) {
+		int bstart = mgt_param.startup_timeout >= mgt_param.cli_timeout;
 		assert(u == CLIS_COMMS);
-		pidr = waitpid(pid, &i, 0);
-		assert(pidr == pid);
-		do {
-			i = VLU_Fd(child_std_vlu, child_output);
-		} while (i == 0);
-		MGT_Complain(C_ERR, "Child failed on launch");
-		exit(1);		// XXX Harsh ?
+		if (VTIM_mono() - t0 < dstart)
+			mgt_launch_err(cli, u, "Child failed on launch ");
+		else
+			mgt_launch_err(cli, u, "Child failed on launch "
+			    "within %s_timeout=%.2fs%s",
+			    bstart ? "startup" : "cli", dstart,
+			    bstart ? "" : " (tip: set startup_timeout)");
+		child_pid = pid;
+		kill_child();
+		mgt_reap_child();
+		child_state = CH_STOPPED;
+		return;
 	} else {
 		assert(u == CLIS_OK);
 		fprintf(stderr, "Child launched OK\n");
diff --git a/bin/varnishtest/tests/p00000.vtc b/bin/varnishtest/tests/p00000.vtc
index 8b4b6493d..e7a4219b4 100644
--- a/bin/varnishtest/tests/p00000.vtc
+++ b/bin/varnishtest/tests/p00000.vtc
@@ -9,7 +9,8 @@ process p1 -dump {
 process p1 -expect-text 0 0 {to launch}
 process p1 -write "start\n"
 process p1 -expect-text 0 0 {-spersistent has been deprecated}
-process p1 -wait
+process p1 -write "quit\n"
+process p1 -expect-exit 0x20 -wait
 
 server s1 {
 	rxreq
diff --git a/bin/varnishtest/tests/r03940.vtc b/bin/varnishtest/tests/r03940.vtc
new file mode 100644
index 000000000..1213330c5
--- /dev/null
+++ b/bin/varnishtest/tests/r03940.vtc
@@ -0,0 +1,45 @@
+varnishtest "test startup_timeout vs. stevedore init / open"
+
+# we test with vtc_varnish and vtc_process because of different code
+# paths in mgr for implicit start vs. cli start
+
+####
+# startup_timeout used, delay in stevedore init
+varnish v1 -arg "-sdebug=debug,dinit=5s -pstartup_timeout=3s -pcli_timeout=2s" \
+	-arg "-p feature=+no_coredump" \
+	-vcl "backend none none;"
+varnish v1 -cliexpect \
+	"Child failed on launch within startup_timeout=3.00s" \
+	"start"
+varnish v1 -expectexit 0x40
+
+process p1 { varnishd \
+	-sdebug=debug,dinit=5s \
+	-pstartup_timeout=3s -pcli_timeout=2s \
+	-n ${tmpdir}/p1 -a :0 -b none 2>&1 } -start
+process p1 -expect-exit 0x2 -wait
+process p1 -expect-text 0 0 \
+	"Child failed on launch within startup_timeout=3.00s"
+
+####
+# cli_timeout used, delay in stevedore open
+
+varnish v2 -arg "-sdebug=debug,dopen=5s -pstartup_timeout=2s -pcli_timeout=3s" \
+	-arg "-p feature=+no_coredump" \
+	-vcl "backend none none;"
+varnish v2 -cliexpect \
+	"launch within cli_timeout=3.00s .tip: set startup_" \
+	"start"
+varnish v2 -cliok "panic.clear"
+varnish v2 -expectexit 0x40
+
+process p2 { varnishd \
+	-sdebug=debug,dopen=5s \
+	-pstartup_timeout=2s -pcli_timeout=3s \
+	-n ${tmpdir}/p2 -a :0 -b none} -start
+process p2 -expect-exit 0x2 -wait
+
+# expect-text does not work here because the panic info pushes the
+# error out of the emulated terminal's view.
+# And I do not want to rely on any x lines to be enough
+shell {grep -q "launch within cli_timeout=3.00s (tip: set startup_" ${p2_err}}
diff --git a/bin/varnishtest/tests/s00003.vtc b/bin/varnishtest/tests/s00003.vtc
index d1077dd93..f186d8956 100644
--- a/bin/varnishtest/tests/s00003.vtc
+++ b/bin/varnishtest/tests/s00003.vtc
@@ -47,13 +47,13 @@ varnish v1 -cliok "ban obj.http.date ~ ."
 
 process p1 {
 	varnishd -sTransient=file,${tmpdir}/foo,xxx -blocalhost -a:0 -n ${tmpdir} 2>&1
-} -expect-exit 255 -dump -start -expect-text 0 0 "Invalid number" -wait -screen_dump
+} -expect-exit 0x2 -dump -start -expect-text 0 0 "Invalid number" -wait -screen_dump
 
 process p1 {
 	varnishd -sTransient=file,${tmpdir}/foo,10M,xxx -blocalhost -a:0 -n ${tmpdir} 2>&1
-} -expect-exit 255 -dump -start -expect-text 0 0 "granularity" -wait -screen_dump
+} -expect-exit 0x2 -dump -start -expect-text 0 0 "granularity" -wait -screen_dump
 
 process p1 {
 	varnishd -sTransient=file,${tmpdir}/foo,10m,,foo -blocalhost -a:0 -n ${tmpdir} 2>&1
-} -expect-exit 255 -dump -start -expect-text 0 0 "invalid advice" -wait
+} -expect-exit 0x2 -dump -start -expect-text 0 0 "invalid advice" -wait
 
diff --git a/include/tbl/params.h b/include/tbl/params.h
index 8a066361f..83054eef3 100644
--- a/include/tbl/params.h
+++ b/include/tbl/params.h
@@ -335,6 +335,18 @@ PARAM_SIMPLE(
 	"Timeout for the child's replies to CLI requests."
 )
 
+PARAM_SIMPLE(
+	/* name */	startup_timeout,
+	/* type */	timeout,
+	/* min */	"0.000",
+	/* max */	NULL,
+	/* def */	"0.000",
+	/* units */	"seconds",
+	/* descr */
+	"Alternative timeout for the initial worker process startup.\n"
+	"If cli_timeout is longer than startup_timeout, it is used instead."
+)
+
 PARAM_SIMPLE(
 	/* name */	clock_skew,
 	/* type */	uint,


More information about the varnish-commit mailing list