@@ -35,13 +35,10 @@ import modal
 vllm_image = (
     modal.Image.from_registry("nvidia/cuda:12.8.0-devel-ubuntu22.04", add_python="3.12")
-    .uv_pip_install(
-        "torch==2.8.0+cu128",
-        index_url="https://download.pytorch.org/whl/cu128",
-    )
     .uv_pip_install(
-        "vllm>=0.10.1.1",
-        "huggingface_hub[hf_transfer]>=0.32.0",
+        "vllm==0.10.1.1",
+        "huggingface_hub[hf_transfer]==0.34.4",
+        "flashinfer-python==0.2.8",
+        "torch==2.7.1",
     )
     .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # faster model transfers
 )
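For reference, here is how the image definition reads once this hunk is applied. This is a sketch assembled from the context and "+" lines above; everything outside the hunk is assumed unchanged.

import modal

vllm_image = (
    modal.Image.from_registry("nvidia/cuda:12.8.0-devel-ubuntu22.04", add_python="3.12")
    .uv_pip_install(
        "vllm==0.10.1.1",
        "huggingface_hub[hf_transfer]==0.34.4",
        "flashinfer-python==0.2.8",
        "torch==2.7.1",
    )
    .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # faster model transfers
)

Exact "==" pins trade easy upgrades for reproducibility: the ">=" ranges they replace can silently pull in new releases on every build, while pins produce the same image each time, in the same spirit as the MODEL_REVISION pin in the next hunk.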
@@ -59,11 +56,10 @@ vllm_image = (
 # like those of Modal's [Hopper H100/H200 and Blackwell B200 GPUs](https://modal.com/blog/announcing-h200-b200).
 
 # You can swap this model out for another by changing the strings below.
-# A single H100 GPU has enough VRAM to store a 70,000,000,000 parameter model,
-# like Llama 3.3, in eight bit precision, along with a very large KV cache.
+# A single H100 GPU has enough VRAM to store an 8,000,000,000 parameter model,
+# like Llama 3.1, in eight bit precision, along with a very large KV cache.
 
 MODEL_NAME = "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"
 
 MODEL_REVISION = "12fd6884d2585dd4d020373e7f39f74507b31866"  # avoid nasty surprises when repos update!
 
 # Although vLLM will download weights from Hugging Face on-demand,
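The updated comment and the revision pin point at the same concern: reproducible serving. At one byte per parameter in FP8, an 8,000,000,000-parameter model occupies roughly 8 GB of an H100's 80 GB of VRAM, which is why the comment can promise room for a very large KV cache. And just as the package versions in the first hunk are pinned, MODEL_REVISION pins the weights to a specific commit of the Hugging Face repo. Below is a minimal sketch of a revision-pinned download using the names above; the standalone-script framing is an assumption, not part of this diff.

from huggingface_hub import snapshot_download

# Fetch a fixed snapshot of the weights: with the revision pinned,
# later updates or force-pushes to the repo cannot change what is served.
snapshot_download(
    repo_id="RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8",
    revision="12fd6884d2585dd4d020373e7f39f74507b31866",
)

With HF_HUB_ENABLE_HF_TRANSFER=1 set in the image, huggingface_hub routes this download through the Rust-based hf_transfer package, which is the "faster model transfers" the comment in the first hunk refers to.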