# ---
# deploy: true
# ---

# # Run state-of-the-art RLMs on Blackwell GPUs with TensorRT-LLM (DeepSeek-R1-0528-FP4)

# In this example, we demonstrate how to use the TensorRT-LLM framework to serve NVIDIA's DeepSeek-R1-0528-FP4 model,
# a [state-of-the-art reasoning language model](https://lmarena.ai/leaderboard),
# on Modal's Blackwell GPUs (8 x B200s).

# Because this model is so large, our focus will be on optimizing the cold start and the model's inference latencies.
# We use [NVIDIA's recommendations](https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4#minimum-latency-server-deployment)
# for matching expected performance and minimizing inference latency.

# ## Overview

# This guide is intended to document two things:
# the general process for building TensorRT-LLM on Modal
# and a specific configuration for serving the DeepSeek-R1-0528-FP4 model.

# ## Installing TensorRT-LLM

# To run TensorRT-LLM, we must first install it. Easier said than done!

# To run code on Modal, we define [container images](https://modal.com/docs/guide/images).
# All Modal containers have access to GPU drivers via the underlying host environment,
# but we still need to install the software stack on top of the drivers, from the CUDA runtime up.

# We start from an official `nvidia/cuda` container image,
# which includes the CUDA runtime & development libraries
# and the environment configuration necessary to run them.

import os
import webbrowser  # for opening generated HTML files in browser
from pathlib import Path

import modal

# We first install PyTorch with CUDA 12.8 support (required for Blackwell).
# We also add some system dependencies of TensorRT-LLM,
# including OpenMPI for distributed communication, some core software like `git`,
# and install packages using [uv](https://docs.astral.sh/uv/)
# to speed up the installation process.

tensorrt_image = (
    modal.Image.from_registry(
        "nvidia/cuda:12.8.1-devel-ubuntu22.04",
        add_python="3.12",  # TRT-LLM requires Python 3.12
    )
    .entrypoint([])  # silence base-image entrypoint
    .apt_install(
        "git",
        "openmpi-bin",
        "libopenmpi-dev",
    )
    .uv_pip_install(
        "torch==2.7.1",
        "torchvision",
        "torchaudio",
        index_url="https://download.pytorch.org/whl/cu128",
    )
    .uv_pip_install(
        "mpi4py",
        "tensorrt_llm==1.0.0rc0",
    )
)

# Note that we're doing this by [method-chaining](https://quanticdev.com/articles/method-chaining/)
# a number of calls to methods on the `modal.Image`. If you're familiar with
# Dockerfiles, you can think of this as a Pythonic interface to instructions like `RUN` and `CMD`.

# End-to-end, this step takes a few minutes on first run.
# If you're reading this from top to bottom,
# you might want to stop here and execute the example
# with `modal run` so that it runs in the background while you read the rest.

# ## Downloading the model

# Next, we'll set up a few things to download the model to persistent storage and do it quickly.
# For persistent, distributed storage, we use
# [Modal Volumes](https://modal.com/docs/guide/volumes), which can be accessed from any container
# with read speeds in excess of a gigabyte per second.

# We also set the `HF_HOME` environment variable to point to the Volume so that the model
# is cached there. And we install `hf-transfer` to get maximum download throughput from
# the Hugging Face Hub, in the hundreds of megabytes per second.

app_name = "example-trtllm-deepseek"

hf_cache_vol = modal.Volume.from_name(f"{app_name}-hf-cache", create_if_missing=True)
HF_CACHE_PATH = Path("/hf_cache")
volumes = {HF_CACHE_PATH: hf_cache_vol}

MODEL_NAME = "nvidia/DeepSeek-R1-0528-FP4-v2"
MODEL_REVISION = "d12ff8db9876124d533b26bc24523c27907ce386"  # in case repo updates!
MODELS_PATH = HF_CACHE_PATH / "models"
MODEL_PATH = MODELS_PATH / MODEL_NAME


# We use the function below to download the model from the Hugging Face Hub.


def download_model():
    from huggingface_hub import snapshot_download

    print(f"downloading base model to {MODEL_PATH} if necessary")
    snapshot_download(
        MODEL_NAME,
        local_dir=MODEL_PATH,
        ignore_patterns=["*.pt", "*.bin"],  # using safetensors
        revision=MODEL_REVISION,
    )


# Just defining that function doesn't actually download the model, though.
# We can run it by adding it to the image's build process with `run_function`.
# The download process has its own dependencies, which we add here.

MINUTES = 60  # seconds
tensorrt_image = (
    tensorrt_image.uv_pip_install("hf-transfer==0.1.9", "huggingface_hub==0.33.0")
    .env(
        {
            "HF_HUB_ENABLE_HF_TRANSFER": "1",
            "HF_HOME": str(MODELS_PATH),
        }
    )
    .run_function(download_model, volumes=volumes, timeout=40 * MINUTES)
)

with tensorrt_image.imports():
    from tensorrt_llm import SamplingParams
    from tensorrt_llm._tensorrt_engine import LLM


# ## Setting up the engine

# ### Configure plugins

# TensorRT-LLM is an LLM inference framework built on top of NVIDIA's TensorRT,
# which is a generic inference framework for neural networks.

# TensorRT includes a "plugin" extension system that allows you to adjust behavior,
# like configuring the [CUDA kernels](https://modal.com/gpu-glossary/device-software/kernel)
# used by the engine.
# The [General Matrix Multiply (GEMM)](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html)
# plugin, for instance, adds heavily-optimized matrix multiplication kernels
# from NVIDIA's [cuBLAS library of linear algebra routines](https://docs.nvidia.com/cuda/cublas/).

# We'll specify the `paged_kv_cache` plugin which enables a
# [paged attention algorithm](https://arxiv.org/abs/2309.06180)
# for the key-value (KV) cache.


def get_plugin_config():
    from tensorrt_llm.plugin.plugin import PluginConfig

    return PluginConfig.from_dict(
        {
            "paged_kv_cache": True,
        }
    )


# ### Configure speculative decoding

# Speculative decoding is a technique for generating multiple tokens per step,
# avoiding the auto-regressive bottleneck in the Transformer architecture.
# Generating multiple tokens in parallel exposes more parallelism to the GPU.
# It works best for text that has predicable patterns, like code,
# but it's worth testing for any workload where latency is critical.

# Speculative decoding can use any technique to guess tokens, including running another,
# smaller language model. Here, we'll use a simple, but popular and effective
# speculative decoding strategy called "multi-token prediction (MTP) decoding",
# which essentially uses a smaller model to generate the next token.


def get_speculative_config():
    from tensorrt_llm.llmapi import MTPDecodingConfig

    return MTPDecodingConfig(
        num_nextn_predict_layers=3,  # number of layers to predict next n tokens
        use_relaxed_acceptance_for_thinking=True,  # draft token accepted when it's a candidate
        relaxed_topk=10,  # first k candidates are considered
        relaxed_delta=0.6,  # delta for relaxed acceptance
    )


# ### Set the build config

# Finally, we'll specify the overall build configuration for the engine. This includes
# more obvious parameters such as the maximum input length, the maximum number of tokens
# to process at once before queueing occurs, and the maximum number of sequences
# to process at once before queueing occurs.

# To minimize latency, we set the maximum number of sequences (the "batch size")
# to 4. We enforce this maximum by setting the number of inputs that the
# Modal Function is allowed to process at once -- `max_concurrent_inputs`.

MAX_BATCH_SIZE = MAX_CONCURRENT_INPUTS = 4


def get_build_config():
    from tensorrt_llm import BuildConfig

    return BuildConfig(
        plugin_config=get_plugin_config(),
        max_input_len=8192,
        max_num_tokens=16384,
        max_batch_size=MAX_BATCH_SIZE,
    )


# ## Serving inference

# Now that we have written the code to compile the engine, we can
# serve it with Modal!

# We start by creating an `App`.

app = modal.App(app_name)

# Thanks to our [custom container runtime system](https://modal.com/blog/jono-containers-talk),
# even this large container boots in seconds.

# On the first container start, we mount the Volume and build the engine,
# which takes a few minutes. Subsequent starts will be much faster,
# as the engine is cached in the Volume and loaded in seconds.

# Container starts are triggered when Modal scales up your Function,
# like the first time you run this code or the first time a request comes in after a period of inactivity.
# For details on optimizing container start latency, see
# [this guide](https://modal.com/docs/guide/cold-start).

# Container lifecycles in Modal are managed via our `Cls` interface, so we define one below
# to separate out the engine startup (`enter`) and engine execution (`generate`).
# For details, see [this guide](https://modal.com/docs/guide/lifecycle-functions).


N_GPU = 8


@app.cls(
    image=tensorrt_image,
    gpu=f"B200:{N_GPU}",
    scaledown_window=60 * MINUTES,
    timeout=60 * MINUTES,
    volumes=volumes,
)
@modal.concurrent(max_inputs=MAX_CONCURRENT_INPUTS)
class Model:
    def build_engine(self, engine_path, engine_kwargs):
        llm = LLM(model=MODEL_PATH, **engine_kwargs)
        # llm.save(engine_path)
        return llm

    @modal.enter()
    def enter(self):
        from transformers import AutoTokenizer

        self.tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

        engine_kwargs = {
            "build_config": get_build_config(),
            "speculative_config": get_speculative_config(),
            "tensor_parallel_size": N_GPU,
            "moe_backend": "TRTLLM",
            "use_cuda_graph": True,
            "backend": "pytorch",
            "max_batch_size": MAX_BATCH_SIZE,
            "trust_remote_code": True,
        }

        self.sampling_params = SamplingParams(
            temperature=0.6,
            top_p=0.95,
            max_tokens=32768,  # max generated tokens
        )

        engine_path = MODEL_PATH / "trtllm_engine"
        if not os.path.exists(engine_path):
            print(f"building new engine at {engine_path}")
            self.llm = self.build_engine(engine_path, engine_kwargs)
        else:
            print(f"loading engine from {engine_path}")
            self.llm = LLM(model=engine_path, **engine_kwargs)

    @modal.method()
    async def generate_async(self, prompt):
        text = self.text_from_prompt(prompt)
        async for output in self.llm.generate_async(
            text, self.sampling_params, streaming=True
        ):
            yield output.outputs[0].text_diff

    def text_from_prompt(self, prompt):
        messages = [{"role": "user", "content": prompt}]

        return self.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )

    @modal.method()
    def boot(self):
        pass  # no-op to start up containers

    @modal.exit()
    def shutdown(self):
        self.llm.shutdown()
        del self.llm


# ## Calling our inference function

# To run our `Model`'s `.generate` method from Python, we just need to call it --
# with `.remote` appended to run it on Modal.

# We wrap that logic in a `local_entrypoint` so you can run it from the command line with

# ```bash
# modal run trtllm_deepseek.py
# ```

# For simplicity, we ask the model to generate a game of tic-tac-toe in HTML and open it in the browser.
# But the code in the `local_entrypoint` is just regular Python code
# that runs on your machine -- we wrap it in a CLI automatically --
# so feel free to customize it to your liking.


@app.local_entrypoint()
def main():
    print("🏎️  creating container")
    model = Model()

    print("🏎️  cold booting container")
    model.boot.remote()

    prompt = """
    Create an HTML page implementing a simple game of tic-tac-toe.
    Only output the HTML in English, no other text or language.
    """

    print("🏎️ creating game of tic-tac-toe")
    resp = ""
    for out in model.generate_async.remote_gen(prompt):
        print(out, end="", flush=True)
        resp += out
    print("\n")

    # post-process
    html_content = (
        resp.split("</think>")[-1].split("```html")[-1].split("```")[0].strip()
    )

    html_filename = Path(__file__).parent / "tic_tac_toe.html"
    with open(html_filename, "w") as f:
        f.write(html_content)

    file_path = html_filename.absolute()
    file_url = f"file://{file_path}"
    print(f"\nHTML saved to: {file_path}")
    print(f"Opening in browser: {file_url}")

    print("🏎️  opening in browser")
    success = webbrowser.open(file_url)
    if not success:
        print(f"Failed to open browser, please manually open: {file_url}")


# Once deployed with `modal deploy`, this `Model.generate` function
# can be called from other Python code. It can also be converted to an HTTP endpoint
# for invocation over the Internet by any client.
# For details, see [this guide](https://modal.com/docs/guide/trigger-deployed-functions).

# As a quick demo, we've included some sample chat client code in the
# Python main entrypoint below. To use it, first deploy with

# ```bash
# modal deploy trtllm_deepseek.py
# ```

# and then run the client with

# ```python notest
# python trtllm_deepseek.py
# ```


if __name__ == "__main__":
    import sys

    try:
        Model = modal.Cls.from_name(app_name, "Model")
        print("🏎️  connecting to model")
        model = Model()
        model.boot.remote()
    except modal.exception.NotFoundError as e:
        raise SystemError("Deploy this app first with modal deploy") from e

    print("🏎️  starting chat. exit with :q, ctrl+C, or ctrl+D")
    try:
        prompt = []
        while (nxt := input("🏎️  > ")) != ":q":
            prompt.append({"role": "user", "content": nxt})
            resp = ""
            for out in model.generate_async.remote_gen(prompt):
                print(out, end="", flush=True)
                resp += out
            print("\n")
            prompt.append({"role": "assistant", "content": resp})
    except KeyboardInterrupt:
        pass
    except SystemExit:
        pass
    finally:
        print("\n")
        sys.exit(0)