server: tests - slow inference causes timeout on the CI (ggml-org#5715)

phymbert · jordankanter · commit 5ef6fa494291 · 2024-03-12T20:44:06.000-05:00
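* server: tests - longer health-check timeout for CI: wait up to 10 s (instead of 3 s) when the expected health status is 'ok', since slow inference on the CI runners caused spurious timeouts; the timeout assertion message now includes the expected status, the counter, and the timeout
* common: comment out the per-token "sampled token" LOG call in sampling.cpp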
diff --git a/common/sampling.cpp b/common/sampling.cpp
@@ -266,7 +266,7 @@ static llama_token llama_sampling_sample_impl(
             //    }
             //}
 
-            LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
+            //LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());
         }
     }
 
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
@@ -699,6 +699,8 @@ async def wait_for_health_status(context,
     if context.debug:
         print(f"Starting checking for health for expected_health_status={expected_health_status}")
     timeout = 3  # seconds
+    if expected_health_status == 'ok':
+        timeout = 10 # CI slow inference
     interval = 0.5
     counter = 0
     async with aiohttp.ClientSession() as session:
@@ -736,7 +738,7 @@ async def wait_for_health_status(context,
                         if n_completions > 0:
                             return
 
-                assert False, 'timeout exceeded'
+                assert False, f'{expected_health_status} timeout exceeded {counter}s>={timeout}'
 
 
 def assert_embeddings(embeddings):

Original file line number	Diff line number	Diff line change
`@@ -266,7 +266,7 @@ static llama_token llama_sampling_sample_impl(`
`266`	`266`	`// }`
`267`	`267`	`//}`
`268`	`268`
`269`		`- LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());`
	`269`	`+ //LOG("sampled token: %5d: '%s'\n", id, llama_token_to_piece(ctx_main, id).c_str());`
`270`	`270`	`}`
`271`	`271`	`}`
`272`	`272`