
Commit acbed3e

Use monotonic time where appropriate (#1249)
1 parent 66d18a7 commit acbed3e
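
Background for the change: time.time() reads the system wall clock, which can jump forwards or backwards when the clock is adjusted (NTP sync, manual changes), so the difference between two readings is not a trustworthy measure of elapsed time. time.monotonic() and time.perf_counter() are guaranteed never to go backwards, which makes them the right clocks for latencies and durations. A minimal standalone sketch of the distinction (not part of the commit):

    import time

    # Wall-clock time: suitable for timestamps, but deltas can be distorted
    # if the system clock is adjusted between the two calls.
    wall_start = time.time()

    # Monotonic clocks: the absolute values are meaningless (arbitrary epoch),
    # but deltas are reliable measures of elapsed time.
    mono_start = time.monotonic()
    perf_start = time.perf_counter()  # monotonic, highest available resolution

    time.sleep(0.5)  # stand-in for the work being measured

    print(f"wall-clock delta:   {time.time() - wall_start:.3f} s")
    print(f"monotonic delta:    {time.monotonic() - mono_start:.3f} s")
    print(f"perf_counter delta: {time.perf_counter() - perf_start:.3f} s")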

File tree

benchmarks/benchmark_latency.py
benchmarks/benchmark_serving.py
benchmarks/benchmark_throughput.py
vllm/core/scheduler.py
vllm/engine/async_llm_engine.py
vllm/engine/llm_engine.py
vllm/entrypoints/openai/api_server.py

7 files changed (+18 −17 lines)


benchmarks/benchmark_latency.py

Lines changed: 2 additions & 2 deletions

@@ -40,13 +40,13 @@ def main(args: argparse.Namespace):
     def run_to_completion(profile: bool = False):
         if profile:
             torch.cuda.cudart().cudaProfilerStart()
-        start_time = time.time()
+        start_time = time.perf_counter()

         llm.generate(prompt_token_ids=dummy_prompt_token_ids,
                      sampling_params=sampling_params,
                      use_tqdm=False)

-        end_time = time.time()
+        end_time = time.perf_counter()
         latency = end_time - start_time
         if profile:
             torch.cuda.cudart().cudaProfilerStop()
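
In the benchmarks the fix is purely mechanical: bracket the measured call with time.perf_counter() instead of time.time(). A generic sketch of that pattern (the timed helper below is illustrative, not part of the repo):

    import time
    from typing import Any, Callable, Tuple

    def timed(fn: Callable[..., Any], *args, **kwargs) -> Tuple[Any, float]:
        """Run fn and return (result, elapsed_seconds), measured with a
        monotonic, high-resolution clock so clock adjustments cannot skew it."""
        start = time.perf_counter()
        result = fn(*args, **kwargs)
        elapsed = time.perf_counter() - start
        return result, elapsed

    # Example usage with a stand-in workload:
    _, latency = timed(sum, range(1_000_000))
    print(f"latency: {latency * 1000:.2f} ms")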

benchmarks/benchmark_serving.py

Lines changed: 4 additions & 4 deletions

@@ -105,7 +105,7 @@ async def send_request(
     best_of: int,
     use_beam_search: bool,
 ) -> None:
-    request_start_time = time.time()
+    request_start_time = time.perf_counter()

     headers = {"User-Agent": "Benchmark Client"}
     if backend == "vllm":
@@ -148,7 +148,7 @@ async def send_request(
             if "error" not in output:
                 break

-    request_end_time = time.time()
+    request_end_time = time.perf_counter()
     request_latency = request_end_time - request_start_time
     REQUEST_LATENCY.append((prompt_len, output_len, request_latency))

@@ -180,10 +180,10 @@ def main(args: argparse.Namespace):
     tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)
     input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)

-    benchmark_start_time = time.time()
+    benchmark_start_time = time.perf_counter()
     asyncio.run(benchmark(args.backend, api_url, input_requests, args.best_of,
                           args.use_beam_search, args.request_rate))
-    benchmark_end_time = time.time()
+    benchmark_end_time = time.perf_counter()
     benchmark_time = benchmark_end_time - benchmark_start_time
     print(f"Total time: {benchmark_time:.2f} s")
     print(f"Throughput: {args.num_prompts / benchmark_time:.2f} requests/s")

benchmarks/benchmark_throughput.py

Lines changed: 4 additions & 4 deletions

@@ -93,10 +93,10 @@ def run_vllm(
             sampling_params=sampling_params,
         )

-    start = time.time()
+    start = time.perf_counter()
     # FIXME(woosuk): Do use internal method.
     llm._run_engine(use_tqdm=True)
-    end = time.time()
+    end = time.perf_counter()
     return end - start


@@ -118,7 +118,7 @@ def run_hf(
     llm = llm.cuda()

     pbar = tqdm(total=len(requests))
-    start = time.time()
+    start = time.perf_counter()
     batch: List[str] = []
     max_prompt_len = 0
     max_output_len = 0
@@ -156,7 +156,7 @@ def run_hf(
             batch = []
             max_prompt_len = 0
             max_output_len = 0
-    end = time.time()
+    end = time.perf_counter()
     return end - start


vllm/core/scheduler.py

Lines changed: 1 addition & 1 deletion

@@ -121,7 +121,7 @@ def _schedule(self) -> SchedulerOutputs:
         blocks_to_copy: Dict[int, List[int]] = {}

         # Fix the current time.
-        now = time.time()
+        now = time.monotonic()

         # Join waiting sequences if possible.
         if not self.swapped:
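
The scheduler "fixes" the current time once per scheduling step and compares it against each request's monotonic arrival time. A minimal sketch of why both readings must come from the same monotonic clock (class and function names here are illustrative, not the scheduler's actual API):

    import time

    class Request:
        def __init__(self) -> None:
            # Recorded with the same monotonic clock the scheduler reads,
            # so the subtraction below is meaningful.
            self.arrival_time = time.monotonic()

    def waiting_time(req: Request, now: float) -> float:
        """How long the request has been queued, in seconds."""
        return now - req.arrival_time

    req = Request()
    time.sleep(0.1)            # stand-in for time spent in the queue
    now = time.monotonic()     # "fix" the current time once per step
    print(f"waited {waiting_time(req, now):.3f} s")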

vllm/engine/async_llm_engine.py

Lines changed: 2 additions & 1 deletion

@@ -417,7 +417,8 @@ async def generate(
             request.
         """
         # Preprocess the request.
-        arrival_time = time.time()
+        # This should not be used for logging, as it is monotonic time.
+        arrival_time = time.monotonic()

         try:
             stream = await self.add_request(request_id,
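
The new comment warns that arrival_time is now a monotonic reading with an arbitrary epoch, so it cannot be printed as a timestamp. A hedged sketch of how durations and log timestamps would be kept separate under that assumption (not code from the repo):

    import time
    from datetime import datetime, timezone

    arrival_time = time.monotonic()   # good only for computing elapsed time
    # ... request is processed ...
    elapsed = time.monotonic() - arrival_time

    # For human-readable logs, take a wall-clock reading separately;
    # the monotonic value itself has an arbitrary, unlogged epoch.
    logged_at = datetime.now(timezone.utc).isoformat()
    print(f"[{logged_at}] request finished after {elapsed:.3f} s")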

vllm/engine/llm_engine.py

Lines changed: 3 additions & 3 deletions

@@ -256,10 +256,10 @@ def add_request(
             prompt_token_ids: The token IDs of the prompt. If None, we
                 use the tokenizer to convert the prompts to token IDs.
             arrival_time: The arrival time of the request. If None, we use
-                the current time.
+                the current monotonic time.
         """
         if arrival_time is None:
-            arrival_time = time.time()
+            arrival_time = time.monotonic()
         if prompt_token_ids is None:
             assert prompt is not None
             prompt_token_ids = self.tokenizer.encode(prompt)
@@ -568,7 +568,7 @@ def _log_system_stats(
         prompt_run: bool,
         num_batched_tokens: int,
     ) -> None:
-        now = time.time()
+        now = time.monotonic()
         # Log the number of batched input tokens.
         if prompt_run:
             self.num_prompt_tokens.append((now, num_batched_tokens))
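
_log_system_stats now stores (now, num_batched_tokens) pairs taken from the monotonic clock, and rates can be derived from deltas between those timestamps. A minimal sketch of that kind of windowed calculation (function and parameter names are illustrative, not vLLM's implementation):

    import time
    from typing import List, Tuple

    def tokens_per_second(samples: List[Tuple[float, int]], window: float = 10.0) -> float:
        """Average token throughput over the trailing `window` seconds.

        Each sample is (monotonic_timestamp, num_tokens). Because the
        timestamps come from a monotonic clock, the elapsed time below
        can never be negative, even if the wall clock was adjusted.
        """
        now = time.monotonic()
        recent = [(t, n) for t, n in samples if now - t <= window]
        if len(recent) < 2:
            return 0.0
        elapsed = recent[-1][0] - recent[0][0]
        total = sum(n for _, n in recent)
        return total / elapsed if elapsed > 0 else 0.0

    samples = [(time.monotonic(), 256)]
    time.sleep(0.05)
    samples.append((time.monotonic(), 512))
    print(f"{tokens_per_second(samples):.1f} tokens/s")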

vllm/entrypoints/openai/api_server.py

Lines changed: 2 additions & 2 deletions

@@ -210,7 +210,7 @@ async def create_chat_completion(request: ChatCompletionRequest,

     model_name = request.model
     request_id = f"cmpl-{random_uuid()}"
-    created_time = int(time.time())
+    created_time = int(time.monotonic())
     try:
         sampling_params = SamplingParams(
             n=request.n,
@@ -411,7 +411,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
     if error_check_ret is not None:
         return error_check_ret

-    created_time = int(time.time())
+    created_time = int(time.monotonic())
     try:
         sampling_params = SamplingParams(
             n=request.n,

0 commit comments
