2 changes: 2 additions & 0 deletions docs/design/torch_compile.md
@@ -27,6 +27,8 @@ With all these factors taken into consideration, usually we can guarantee that t

A unique aspect of vLLM's `torch.compile` integration is that we guarantee all compilation finishes before we serve any requests. No request will trigger new compilation. Otherwise, the engine would be blocked on that request, and the response time would spike unexpectedly.

By default, the cache saves compiled artifacts as binary files. If you would like to interact with the generated code for debugging purposes, set `VLLM_COMPILE_CACHE_SAVE_FORMAT=unpacked`.
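
For example, here is a minimal sketch of switching to the unpacked format for a debugging session (assuming the offline `LLM` entry point; the model name is a placeholder):

```python
import os

# Select the unpacked format before vLLM reads its environment configuration,
# so compiled artifacts are written as a directory of generated Inductor code
# that you can open and set breakpoints in.
os.environ["VLLM_COMPILE_CACHE_SAVE_FORMAT"] = "unpacked"

from vllm import LLM

llm = LLM(model="facebook/opt-125m")
```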
@zou3519 (Collaborator) commented on Oct 31, 2025:

Why do we want the binary option to be the default over the unpacked option?

@youkaichao I think you had strong opinions over this before, so would like to get your opinion.
We're deciding between having vLLM save compiled artifacts in "unpacked" mode or "binary" mode by default.

In "unpacked mode", you are able to put breakpoints into the inductor output code and look at it.
In "binary mode", you're not able to put breakpoints into the inductor output code or look at it. However, you gain the ability to launch vLLM processes at the same time to compile the same exact model without clobbering each other.

So I think the question is how much we value the ability to launch multiple vLLM processes performing compilation at the same time.

@ahao-anyscale (Contributor, PR author) commented on Oct 31, 2025:

I think it's a bit less confusing to have the binary option as the default, since I assume most users will not be putting breakpoints in Inductor output. On the flip side, I think it is reasonable that people may be spawning multiple model replicas simultaneously, and they would be very confused to see strange race-condition errors with no warning.
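
To make the clobbering concern concrete, here is an illustrative sketch (not vLLM's actual code; the helper name and layout are hypothetical) of why a single binary artifact can be written safely by concurrent processes, while an unpacked directory of many files cannot:

```python
import os
import tempfile

def save_binary_atomically(cache_dir: str, key: str, payload: bytes) -> None:
    """Write one artifact file via a temp file + rename.

    Concurrent savers of the same key each produce a complete file, and the
    final rename is atomic, so readers never observe a half-written artifact.
    An "unpacked" save has to create many files inside one directory, which
    two processes can interleave and clobber.
    """
    os.makedirs(cache_dir, exist_ok=True)
    fd, tmp_path = tempfile.mkstemp(dir=cache_dir)
    with os.fdopen(fd, "wb") as f:
        f.write(payload)
    os.replace(tmp_path, os.path.join(cache_dir, key))
```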


## Python Code Compilation

In the very verbose logs, we can see:
4 changes: 2 additions & 2 deletions vllm/compilation/compiler_interface.py
@@ -220,7 +220,7 @@ def compile(
         assert key is not None
         path = os.path.join(self.cache_dir, key)
         if not envs.VLLM_DISABLE_COMPILE_CACHE:
-            compiled_graph.save(path=path, format="unpacked")
+            compiled_graph.save(path=path, format=envs.VLLM_COMPILE_CACHE_SAVE_FORMAT)
             compilation_counter.num_compiled_artifacts_saved += 1
         return compiled_graph, (key, path)

@@ -237,7 +237,7 @@ def load(
         assert isinstance(handle[1], str)
         path = handle[1]
         inductor_compiled_graph = torch._inductor.CompiledArtifact.load(
-            path=path, format="unpacked"
+            path=path, format=envs.VLLM_COMPILE_CACHE_SAVE_FORMAT
         )
         from torch._inductor.compile_fx import graph_returns_tuple

10 changes: 10 additions & 0 deletions vllm/envs.py
@@ -218,6 +218,7 @@
     VLLM_USE_FBGEMM: bool = False
     VLLM_GC_DEBUG: str = ""
     VLLM_DISABLE_SHARED_EXPERTS_STREAM: bool = False
+    VLLM_COMPILE_CACHE_SAVE_FORMAT: Literal["binary", "unpacked"] = "binary"


 def get_default_cache_root():
@@ -1408,6 +1409,15 @@ def get_vllm_port() -> int | None:
"VLLM_DISABLE_SHARED_EXPERTS_STREAM": lambda: os.getenv(
"VLLM_DISABLE_SHARED_EXPERTS_STREAM", False
),
# Format for saving torch.compile cache artifacts
# - "binary": saves as binary file
# Safe for multiple vllm serve processes accessing the same torch compile cache.
# - "unpacked": saves as directory structure (for inspection/debugging)
# NOT multiprocess safe - race conditions may occur with multiple processes.
# Allows viewing and setting breakpoints in Inductor's code output files.
"VLLM_COMPILE_CACHE_SAVE_FORMAT": env_with_choices(
"VLLM_COMPILE_CACHE_SAVE_FORMAT", "binary", ["binary", "unpacked"]
),
}

# --8<-- [end:env-vars-definition]
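
For reference, a rough sketch of what an `env_with_choices`-style reader could look like; this is an assumption for illustration, and vLLM's actual helper may differ in its details:

```python
import os
from typing import Callable

def env_with_choices(
    name: str, default: str, choices: list[str]
) -> Callable[[], str]:
    """Return a lazy reader that validates the variable against an allow-list."""

    def read() -> str:
        value = os.getenv(name, default)
        if value not in choices:
            raise ValueError(f"{name} must be one of {choices}, got {value!r}")
        return value

    return read

# Usage mirroring the entry above:
read_format = env_with_choices(
    "VLLM_COMPILE_CACHE_SAVE_FORMAT", "binary", ["binary", "unpacked"]
)
print(read_format())  # "binary" unless the variable is set to "unpacked"
```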