Small fixes for VLLM example (#1347)

* small cleanups

* small cleanups
Lucy Zhang
2025-08-29 13:23:58 -04:00
committed by GitHub
parent 281379b06e
commit 60016b822e


@@ -35,13 +35,10 @@ import modal
 vllm_image = (
     modal.Image.from_registry("nvidia/cuda:12.8.0-devel-ubuntu22.04", add_python="3.12")
     .uv_pip_install(
-        "torch==2.8.0+cu128",
-        index_url="https://download.pytorch.org/whl/cu128",
-    )
-    .uv_pip_install(
-        "vllm>=0.10.1.1",
-        "huggingface_hub[hf_transfer]>=0.32.0",
+        "vllm==0.10.1.1",
+        "huggingface_hub[hf_transfer]==0.34.4",
         "flashinfer-python==0.2.8",
+        "torch==2.7.1",
     )
     .env({"HF_HUB_ENABLE_HF_TRANSFER": "1"})  # faster model transfers
 )
@@ -59,11 +56,10 @@ vllm_image = (
 # like those of Modal's [Hopper H100/H200 and Blackwell B200 GPUs](https://modal.com/blog/announcing-h200-b200).
 # You can swap this model out for another by changing the strings below.
-# A single H100 GPU has enough VRAM to store a 70,000,000,000 parameter model,
-# like Llama 3.3, in eight bit precision, along with a very large KV cache.
+# A single H100 GPU has enough VRAM to store an 8,000,000,000 parameter model,
+# like Llama 3.1, in eight bit precision, along with a very large KV cache.
 MODEL_NAME = "RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"
 MODEL_REVISION = "12fd6884d2585dd4d020373e7f39f74507b31866"  # avoid nasty surprises when repos update!
 # Although vLLM will download weights from Hugging Face on-demand,