forked from vllm-project/vllm
-
Notifications
You must be signed in to change notification settings - Fork 12
Expand file tree
/
Copy pathneuron_di.py
More file actions
130 lines (108 loc) · 3.88 KB
/
neuron_di.py
File metadata and controls
130 lines (108 loc) · 3.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
# SPDX-License-Identifier: Apache-2.0
# This example can be run by having two terminals
# For the prefill terminal:
# SEND=1 python neuron_di.py
# For the decode terminal:
# python neuron_di.py
# This example assumes the llama-3.2-3b model has been downloaded into a
# folder called "models" under the current working directory
import logging
import os
import subprocess
from pathlib import Path
from vllm import LLM, SamplingParams
from vllm.config import KVTransferConfig
# Emit DEBUG-level log records for the whole run.
logging.basicConfig(level=logging.DEBUG)

# Model name and on-disk locations, all resolved against the directory the
# script is launched from.
MODEL_NAME = "llama-3.2-3b"
CURRENT_DIR = Path.cwd()
LOCAL_MODEL = CURRENT_DIR.joinpath("models", MODEL_NAME)
COMPILED_MODEL = CURRENT_DIR.joinpath("compiled_models/llama32di_compiled")

os.environ.update({
    # Use neuronx-distributed-inference framework over transformers-neuronx
    'VLLM_NEURON_FRAMEWORK': "neuronx-distributed-inference",
    # NOTE(review): presumably these two configure the Neuron runtime's
    # async send/recv path used for the KV transfer -- confirm against the
    # Neuron runtime documentation.
    'NEURON_RT_ASYNC_SENDRECV_BOOTSTRAP_PORT': '45645',
    'NEURON_RT_ASYNC_SENDRECV_EXPERIMENTAL_ENABLED': '1',
})
def sender_routine():
    """Run the prefill side of the example as the KV producer.

    Sets NEURON_RT_VISIBLE_CORES to '0-15', builds a KVTransferConfig with
    kv_role "kv_producer" and kv_rank 0, and passes it to common_routine.

    Returns:
        The value returned by common_routine for this configuration.
    """
    # use first 16 cores for prefill
    os.environ['NEURON_RT_VISIBLE_CORES'] = '0-15'

    producer_settings = {
        'kv_connector': 'NeuronConnector',
        'kv_buffer_device': 'cpu',
        'kv_buffer_size': 1e9,
        'kv_rank': 0,
        'kv_role': "kv_producer",
        'kv_parallel_size': 2,
        'kv_ip': "127.0.0.1",
        'kv_port': 8199,
        'neuron_core_offset': 0,
    }
    return common_routine(KVTransferConfig(**producer_settings))
def recver_routine():
    """Run the decode side of the example as the KV consumer.

    Sets NEURON_RT_VISIBLE_CORES to '16-31', builds a KVTransferConfig with
    kv_role "kv_consumer" and kv_rank 1, and passes it to common_routine.

    Returns:
        The value returned by common_routine for this configuration.
    """
    # use next 16 cores for decode
    os.environ['NEURON_RT_VISIBLE_CORES'] = '16-31'

    consumer_settings = {
        'kv_connector': 'NeuronConnector',
        'kv_buffer_device': 'cpu',
        'kv_buffer_size': 1e9,
        'kv_rank': 1,
        'kv_role': "kv_consumer",
        'kv_parallel_size': 2,
        'kv_ip': "127.0.0.1",
        'kv_port': 8199,
        'neuron_core_offset': 16,
    }
    return common_routine(KVTransferConfig(**consumer_settings))
def common_routine(kv_transfer_config):
    """Generate completions for a fixed prompt set with the given KV config.

    Args:
        kv_transfer_config: forwarded unchanged as the ``kv_transfer_config``
            argument of ``LLM``.

    Returns:
        A list with the text of the first completion of every output, in the
        order ``llm.generate`` produced them. Each prompt/completion pair is
        also printed to stdout.
    """
    # Fixed demo prompts.
    demo_prompts = [
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
        "Answer in five words: First five planets from sun are",
        "sushi tastes good with",
        "Ethiopia is a country in",
        "two plus two is",
    ]

    # Sampling settings: top_k=1 with at most 20 new tokens per prompt.
    decoding = SamplingParams(
        top_k=1,
        temperature=1.0,
        top_p=1.0,
        max_tokens=20,
    )

    # Engine construction. Update config as per the case.
    # The device can be detected automatically when the AWS Neuron SDK is
    # installed; it may be left unspecified or, as here, set explicitly.
    engine = LLM(
        model=str(LOCAL_MODEL),
        max_num_seqs=2,
        max_model_len=256,
        device="neuron",
        tensor_parallel_size=16,
        kv_transfer_config=kv_transfer_config,
    )

    # generate() yields RequestOutput objects carrying the prompt, the
    # generated text, and other information.
    results = []
    for output in engine.generate(demo_prompts, decoding):
        prompt = output.prompt
        generated_text = output.outputs[0].text
        results.append(generated_text)
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
    return results
def compile():
    """Run the model compile script unless compiled artifacts already exist.

    Returns immediately when ``COMPILED_MODEL`` exists. Otherwise executes
    ``compile_di_single_instance.sh`` from the directory containing this
    file and waits for it to finish.

    NOTE: the function name shadows the ``compile`` builtin within this
    module; it is kept unchanged so existing callers keep working.

    Raises:
        RuntimeError: if the compile script exits with a non-zero status.
    """
    if COMPILED_MODEL.exists():
        return

    script = Path(__file__).parent / "compile_di_single_instance.sh"
    result = subprocess.run([str(script)], check=False)
    # Explicit check instead of `assert`: assertions are removed when Python
    # runs with -O, which would let a failed compile go unnoticed.
    if result.returncode != 0:
        raise RuntimeError(
            f"compile script failed with exit code {result.returncode}: "
            f"{script}")
def run_neuron_di_example():
    """Run one side of the example, chosen by the SEND environment variable.

    Sets NEURON_COMPILED_ARTIFACTS to the COMPILED_MODEL path. When SEND is
    exactly '1', calls compile() and then sender_routine(); for any other
    value, or when SEND is unset, calls recver_routine().
    """
    os.environ["NEURON_COMPILED_ARTIFACTS"] = str(COMPILED_MODEL)

    acts_as_sender = os.environ.get("SEND") == '1'
    if not acts_as_sender:
        recver_routine()
        return

    compile()
    sender_routine()
# Script entry point: run the example only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    run_neuron_di_example()