upstreaming-to-vllm/examples/offline_inference/neuron_speculation.py at neuron-2.26 · aws-neuron/upstreaming-to-vllm · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# SPDX-License-Identifier: Apache-2.0
"""
This example shows how to run offline inference with a speculative
decoding model on neuron.
"""

import os

from vllm import LLM, SamplingParams

# Sample prompts.
prompts = [
    "Hello, I am a language model and I can help",
    "The president of the United States is",
    "The capital of France is",
]


def config_buckets():
    """Configure context length and token gen buckets."""
    # creates XLA hlo graphs for all the context length buckets.
    os.environ["NEURON_CONTEXT_LENGTH_BUCKETS"] = "128,512,1024,2048"
    # creates XLA hlo graphs for all the token gen buckets.
    os.environ["NEURON_TOKEN_GEN_BUCKETS"] = "128,512,1024,2048"


def initialize_model():
    """Create an LLM with speculative decoding."""
    return LLM(
        model="openlm-research/open_llama_7b",
        speculative_config={
            "model": "openlm-research/open_llama_3b",
            "num_speculative_tokens": 4,
            "max_model_len": 2048,
        },
        max_num_seqs=4,
        max_model_len=2048,
        block_size=2048,
        use_v2_block_manager=True,
        device="neuron",
        tensor_parallel_size=32,
    )


def process_requests(model: LLM, sampling_params: SamplingParams):
    """Generate texts from prompts and print them."""
    outputs = model.generate(prompts, sampling_params)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")


def main():
    """Main function that sets up the model and processes prompts."""
    config_buckets()
    model = initialize_model()
    # Create a sampling params object.
    sampling_params = SamplingParams(max_tokens=100, top_k=1)
    process_requests(model, sampling_params)


if __name__ == "__main__":
    main()