
Commit acbed3e

Use monotonic time where appropriate (#1249)
1 parent 66d18a7 commit acbed3e
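
Background for the change: time.time() reads the system wall clock, which can jump forwards or backwards when the clock is adjusted (NTP sync, manual changes), so the difference between two readings is not a trustworthy measure of elapsed time. time.monotonic() and time.perf_counter() are guaranteed never to go backwards, which makes them the right clocks for latencies and durations. A minimal standalone sketch of the distinction (not part of the commit):

    import time

    # Wall-clock time: suitable for timestamps, but deltas can be distorted
    # if the system clock is adjusted between the two calls.
    wall_start = time.time()

    # Monotonic clocks: the absolute values are meaningless (arbitrary epoch),
    # but deltas are reliable measures of elapsed time.
    mono_start = time.monotonic()
    perf_start = time.perf_counter()  # monotonic, highest available resolution

    time.sleep(0.5)  # stand-in for the work being measured

    print(f"wall-clock delta:   {time.time() - wall_start:.3f} s")
    print(f"monotonic delta:    {time.monotonic() - mono_start:.3f} s")
    print(f"perf_counter delta: {time.perf_counter() - perf_start:.3f} s")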

File tree

benchmarks/benchmark_latency.py
benchmarks/benchmark_serving.py
benchmarks/benchmark_throughput.py
vllm/core/scheduler.py
vllm/engine/async_llm_engine.py
vllm/engine/llm_engine.py
vllm/entrypoints/openai/api_server.py

7 files changed (+18 −17 lines)


benchmarks/benchmark_latency.py

Lines changed: 2 additions & 2 deletions

@@ -40,13 +40,13 @@ def main(args: argparse.Namespace):
     def run_to_completion(profile: bool = False):
         if profile:
             torch.cuda.cudart().cudaProfilerStart()
-        start_time = time.time()
+        start_time = time.perf_counter()

         llm.generate(prompt_token_ids=dummy_prompt_token_ids,
                      sampling_params=sampling_params,
                      use_tqdm=False)

-        end_time = time.time()
+        end_time = time.perf_counter()
         latency = end_time - start_time
         if profile:
             torch.cuda.cudart().cudaProfilerStop()
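
In the benchmarks the fix is purely mechanical: bracket the measured call with time.perf_counter() instead of time.time(). A generic sketch of that pattern (the timed helper below is illustrative, not part of the repo):

    import time
    from typing import Any, Callable, Tuple

    def timed(fn: Callable[..., Any], *args, **kwargs) -> Tuple[Any, float]:
        """Run fn and return (result, elapsed_seconds), measured with a
        monotonic, high-resolution clock so clock adjustments cannot skew it."""
        start = time.perf_counter()
        result = fn(*args, **kwargs)
        elapsed = time.perf_counter() - start
        return result, elapsed

    # Example usage with a stand-in workload:
    _, latency = timed(sum, range(1_000_000))
    print(f"latency: {latency * 1000:.2f} ms")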

benchmarks/benchmark_serving.py

Lines changed: 4 additions & 4 deletions

@@ -105,7 +105,7 @@ async def send_request(
     best_of: int,
     use_beam_search: bool,
 ) -> None:
-    request_start_time = time.time()
+    request_start_time = time.perf_counter()

     headers = {"User-Agent": "Benchmark Client"}
     if backend == "vllm":
@@ -148,7 +148,7 @@ async def send_request(
             if "error" not in output:
                 break

-    request_end_time = time.time()
+    request_end_time = time.perf_counter()
     request_latency = request_end_time - request_start_time
     REQUEST_LATENCY.append((prompt_len, output_len, request_latency))

@@ -180,10 +180,10 @@ def main(args: argparse.Namespace):
     tokenizer = get_tokenizer(args.tokenizer, trust_remote_code=args.trust_remote_code)
     input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)

-    benchmark_start_time = time.time()
+    benchmark_start_time = time.perf_counter()
     asyncio.run(benchmark(args.backend, api_url, input_requests, args.best_of,
                           args.use_beam_search, args.request_rate))
-    benchmark_end_time = time.time()
+    benchmark_end_time = time.perf_counter()
     benchmark_time = benchmark_end_time - benchmark_start_time
     print(f"Total time: {benchmark_time:.2f} s")
     print(f"Throughput: {args.num_prompts / benchmark_time:.2f} requests/s")

benchmarks/benchmark_throughput.py

Lines changed: 4 additions & 4 deletions

@@ -93,10 +93,10 @@ def run_vllm(
             sampling_params=sampling_params,
         )

-    start = time.time()
+    start = time.perf_counter()
     # FIXME(woosuk): Do use internal method.
     llm._run_engine(use_tqdm=True)
-    end = time.time()
+    end = time.perf_counter()
     return end - start


@@ -118,7 +118,7 @@ def run_hf(
     llm = llm.cuda()

     pbar = tqdm(total=len(requests))
-    start = time.time()
+    start = time.perf_counter()
     batch: List[str] = []
     max_prompt_len = 0
     max_output_len = 0
@@ -156,7 +156,7 @@ def run_hf(
             batch = []
             max_prompt_len = 0
             max_output_len = 0
-    end = time.time()
+    end = time.perf_counter()
     return end - start


vllm/core/scheduler.py

Lines changed: 1 addition & 1 deletion

@@ -121,7 +121,7 @@ def _schedule(self) -> SchedulerOutputs:
         blocks_to_copy: Dict[int, List[int]] = {}

         # Fix the current time.
-        now = time.time()
+        now = time.monotonic()

         # Join waiting sequences if possible.
         if not self.swapped:
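
The scheduler "fixes" the current time once per scheduling step and compares it against each request's monotonic arrival time. A minimal sketch of why both readings must come from the same monotonic clock (class and function names here are illustrative, not the scheduler's actual API):

    import time

    class Request:
        def __init__(self) -> None:
            # Recorded with the same monotonic clock the scheduler reads,
            # so the subtraction below is meaningful.
            self.arrival_time = time.monotonic()

    def waiting_time(req: Request, now: float) -> float:
        """How long the request has been queued, in seconds."""
        return now - req.arrival_time

    req = Request()
    time.sleep(0.1)            # stand-in for time spent in the queue
    now = time.monotonic()     # "fix" the current time once per step
    print(f"waited {waiting_time(req, now):.3f} s")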

vllm/engine/async_llm_engine.py

Lines changed: 2 additions & 1 deletion

@@ -417,7 +417,8 @@ async def generate(
             request.
         """
         # Preprocess the request.
-        arrival_time = time.time()
+        # This should not be used for logging, as it is monotonic time.
+        arrival_time = time.monotonic()

         try:
             stream = await self.add_request(request_id,
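
The new comment warns that arrival_time is now a monotonic reading with an arbitrary epoch, so it cannot be printed as a timestamp. A hedged sketch of how durations and log timestamps would be kept separate under that assumption (not code from the repo):

    import time
    from datetime import datetime, timezone

    arrival_time = time.monotonic()   # good only for computing elapsed time
    # ... request is processed ...
    elapsed = time.monotonic() - arrival_time

    # For human-readable logs, take a wall-clock reading separately;
    # the monotonic value itself has an arbitrary, unlogged epoch.
    logged_at = datetime.now(timezone.utc).isoformat()
    print(f"[{logged_at}] request finished after {elapsed:.3f} s")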

vllm/engine/llm_engine.py

Lines changed: 3 additions & 3 deletions

@@ -256,10 +256,10 @@ def add_request(
             prompt_token_ids: The token IDs of the prompt. If None, we
                 use the tokenizer to convert the prompts to token IDs.
             arrival_time: The arrival time of the request. If None, we use
-                the current time.
+                the current monotonic time.
         """
         if arrival_time is None:
-            arrival_time = time.time()
+            arrival_time = time.monotonic()
         if prompt_token_ids is None:
             assert prompt is not None
             prompt_token_ids = self.tokenizer.encode(prompt)
@@ -568,7 +568,7 @@ def _log_system_stats(
         prompt_run: bool,
         num_batched_tokens: int,
     ) -> None:
-        now = time.time()
+        now = time.monotonic()
         # Log the number of batched input tokens.
         if prompt_run:
             self.num_prompt_tokens.append((now, num_batched_tokens))
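
_log_system_stats now stores (now, num_batched_tokens) pairs taken from the monotonic clock, and rates can be derived from deltas between those timestamps. A minimal sketch of that kind of windowed calculation (function and parameter names are illustrative, not vLLM's implementation):

    import time
    from typing import List, Tuple

    def tokens_per_second(samples: List[Tuple[float, int]], window: float = 10.0) -> float:
        """Average token throughput over the trailing `window` seconds.

        Each sample is (monotonic_timestamp, num_tokens). Because the
        timestamps come from a monotonic clock, the elapsed time below
        can never be negative, even if the wall clock was adjusted.
        """
        now = time.monotonic()
        recent = [(t, n) for t, n in samples if now - t <= window]
        if len(recent) < 2:
            return 0.0
        elapsed = recent[-1][0] - recent[0][0]
        total = sum(n for _, n in recent)
        return total / elapsed if elapsed > 0 else 0.0

    samples = [(time.monotonic(), 256)]
    time.sleep(0.05)
    samples.append((time.monotonic(), 512))
    print(f"{tokens_per_second(samples):.1f} tokens/s")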

vllm/entrypoints/openai/api_server.py

Lines changed: 2 additions & 2 deletions

@@ -210,7 +210,7 @@ async def create_chat_completion(request: ChatCompletionRequest,

     model_name = request.model
     request_id = f"cmpl-{random_uuid()}"
-    created_time = int(time.time())
+    created_time = int(time.monotonic())
     try:
         sampling_params = SamplingParams(
             n=request.n,
@@ -411,7 +411,7 @@ async def create_completion(request: CompletionRequest, raw_request: Request):
     if error_check_ret is not None:
         return error_check_ret

-    created_time = int(time.time())
+    created_time = int(time.monotonic())
     try:
         sampling_params = SamplingParams(
             n=request.n,

0 commit comments
