Using vLLM on the MetaX Cluster


## 1. Download the files in advance

Download site: https://pub-docstore.metax-tech.com:7001/

  • Account: wuluo

Files can also be shared and accessed through a share link, e.g.: https://pub-docstore.metax-tech.com:7001/sharing/PsE45bzdD

I wrote a scraper script for this (not working yet; the browser keeps crashing):

```python
from selenium import webdriver
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from pyvirtualdisplay import Display
import os
import time

download_dir = os.path.join(os.getcwd(), "firefox_downloads")
driver_path = "/root/yjk/softwares/geckodriver"
url = "https://pub-docstore.metax-tech.com:7001/sharing/PsE45bzdD"

def start_download():
    # Create the download directory if it does not exist
    if not os.path.exists(download_dir):
        os.makedirs(download_dir)

    # Start a virtual display
    display = Display(visible=0, size=(1280, 768))
    display.start()

    # Configure Firefox options
    firefox_options = Options()
    # Download-related preferences
    firefox_options.set_preference("browser.download.folderList", 2)  # 0 = desktop; 1 = default "Downloads"; 2 = custom folder
    firefox_options.set_preference("browser.download.dir", download_dir)  # custom download path
    firefox_options.set_preference("browser.download.useDownloadDir", True)
    # Disable the save dialog for these MIME types;
    # "application/octet-stream" is the generic binary type
    firefox_options.set_preference("browser.helperApps.neverAsk.saveToDisk",
        "application/zip, application/octet-stream, application/x-zip-compressed, application/x-tar, application/pdf")
    firefox_options.headless = True  # run in headless mode

    # Create the Firefox driver
    service = Service(driver_path)
    browser = webdriver.Firefox(options=firefox_options, service=service)
    browser.get(url)

    # --- Wait for the page to load ---
    print("Waiting for the page to load...")
    browser.implicitly_wait(10)  # poll up to 10 seconds when locating elements

    button = browser.find_element(By.XPATH, '//*[@id="ext-gen43"]')
    button.click()

    # --- Wait for the download to finish (browser-independent logic) ---
    print("Waiting for the download to finish...")
    timeout = 60 * 30  # 30-minute timeout
    start_time = time.time()
    download_complete = False
    new_file_path = None

    files_before = set(os.listdir(download_dir))
    while time.time() - start_time < timeout:
        files_after = set(os.listdir(download_dir))
        new_files = files_after - files_before

        if time.time() - start_time > 10:
            if not browser.service.process:
                print('Browser has quit unexpectedly')
            browser.save_screenshot('1.png')

        if new_files:
            filename = new_files.pop()
            # Firefox writes a .part file while downloading; the .part suffix
            # disappears once the download completes
            if not filename.endswith('.part'):
                new_file_path = os.path.join(download_dir, filename)
                print(f"✅ Download complete! File at: {new_file_path}")
                download_complete = True
                break

        time.sleep(0.5)

    if not download_complete:
        print(f"❌ Download timed out: not finished within {timeout} seconds.")

    # --- Use the downloaded file ---
    if new_file_path:
        file_size = os.path.getsize(new_file_path)
        print(f"File size: {file_size / 1024 / 1024:.2f} MB")

    browser.quit()
    display.stop()

if __name__ == "__main__":
    start_download()
```
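
Since the crash seems to happen around page load and the click, one thing worth trying is an explicit wait, so the click only fires once the element is actually clickable. A minimal sketch against the same `browser` and XPath as above:

```python
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Block for up to 30 seconds until the download button is clickable,
# instead of relying on the global implicit wait.
wait = WebDriverWait(browser, 30)
button = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="ext-gen43"]')))
button.click()
```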

For now the files are instead made available through an NFS mount: `sudo mount -t nfs 10.118.14.133:/zion0/nfsdir /nfsdir0`

## 2. Load and run the image

The images live under /nfsdir0/WT-X201/2.32.0.x/images. The vLLM image can be imported from /nfsdir0/WT-X201/2.32.0.x/images/AI/py310/vllm_2.32.0.11-torch2.4-py310-ubuntu20.04-amd64.tar, and the PyTorch image from /nfsdir0/WT-X201/2.32.0.x/images/pytorch/hpcc-x201-pytorch_2.32.0.3-torch2.4-py310-ubuntu20.04-amd64.tar.

Import the vLLM image with:

```shell
docker load < /nfsdir0/WT-X201/2.32.0.x/images/AI/py310/vllm_2.32.0.11-torch2.4-py310-ubuntu20.04-amd64.tar
```

Running with `set -x` reveals that the `docker` command here is actually a wrapper for `nerdctl -n k8s.io --address /data/containerd/run/containerd.sock`.

The image can be run with the following script:

```bash
#!/bin/bash

# Default Docker image
DEFAULT_IMAGE="vllm:2.32.0.11-torch2.4-py310-ubuntu20.04-amd64"

# Use the image name passed as the first argument, if any
if [ -n "$1" ]; then
    DOCKER_IMAGE="$1"
else
    DOCKER_IMAGE="$DEFAULT_IMAGE"
fi

# Start building the docker run command
DOCKER_COMMAND="nerdctl -n k8s.io --address /data/containerd/run/containerd.sock run"

# Add the always-included device if it exists
if [ -e "/dev/htcd" ]; then
    DOCKER_COMMAND+=" --device=/dev/htcd"
else
    echo "Warning: /dev/htcd not found on the host."
fi

# Add every character device under /dev/dri to the command
if [ -d "/dev/dri" ]; then
    for device in /dev/dri/*; do
        if [ -c "$device" ]; then  # character device?
            DOCKER_COMMAND+=" --device=$device"
        fi
    done
else
    echo "Error: /dev/dri directory not found on the host. Graphics devices might not be available."
    exit 1
fi

# Finish the command with the chosen image
DOCKER_COMMAND+=" -it ${DOCKER_IMAGE} /bin/bash"

# Print the generated command
echo "Generated Docker command:"
echo "$DOCKER_COMMAND"

# Execute the command
eval "$DOCKER_COMMAND"
```
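
For reference, the device-discovery half of this script can be expressed in a few lines of Python, which is handy when building the same command from another tool. This is a sketch with a hypothetical helper, not part of the cluster tooling:

```python
import glob
import os
import stat

def device_flags():
    """Build --device flags for /dev/htcd plus every character device under /dev/dri."""
    flags = []
    if os.path.exists("/dev/htcd"):
        flags.append("--device=/dev/htcd")
    for path in sorted(glob.glob("/dev/dri/*")):
        if stat.S_ISCHR(os.stat(path).st_mode):  # keep character devices only
            flags.append(f"--device={path}")
    return flags

print(" ".join(device_flags()))
```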

Script version 2:

```bash
#!/bin/bash

# Enable shell tracing for debugging (optional, can be removed once confident)
# set -x

# --- IMPORTANT: Find your DOCKER/NERDCTL executable path ---
# 1. Deactivate conda: `conda deactivate`
# 2. Run: `which docker`
# 3. If it points to a script/symlink that calls nerdctl, use that script's path.
#    Otherwise, use the direct binary path shown by `which docker`.
#    Example: DOCKER_BIN="/usr/bin/docker"
# If `which docker` gives no output outside conda, find it manually:
#    `find / -name docker -type f 2>/dev/null | grep -E "(bin/docker|sbin/docker)"`
# Once found, replace the placeholder below with the actual path.
DOCKER_BIN="nerdctl -n k8s.io --address /data/containerd/run/containerd.sock"  # <--- REPLACE WITH YOUR ACTUAL DOCKER/NERDCTL PATH

# Default Docker image (used if not provided as the second argument)
DEFAULT_IMAGE="vllm:2.32.0.11-torch2.4-py310-ubuntu20.04-amd64"

# --- Argument Parsing ---
# Require at least one argument (the directory mapping)
if [ -z "$1" ]; then
    echo "Usage: $0 <host_dir_to_map> [docker_image_name]"
    echo "  <host_dir_to_map>:   The host directory to mount into the container (e.g., /home/user/data:/app/data)"
    echo "  [docker_image_name]: Optional. The Docker image to use. Defaults to ${DEFAULT_IMAGE}"
    exit 1
fi

MAPPED_DIR="$1"  # first argument is the directory mapping

# Second argument (Docker image name), if any
if [ -n "$2" ]; then
    DOCKER_IMAGE="$2"
else
    DOCKER_IMAGE="$DEFAULT_IMAGE"
fi

# Start building the docker run command
DOCKER_COMMAND="${DOCKER_BIN} run"

# Add the directory mapping
DOCKER_COMMAND+=" -v ${MAPPED_DIR}"

# Add the always-included device if it exists
if [ -e "/dev/htcd" ]; then
    DOCKER_COMMAND+=" --device=/dev/htcd"
else
    echo "Warning: /dev/htcd not found on the host."
fi

# Add every character device under /dev/dri to the command
if [ -d "/dev/dri" ]; then
    for device in /dev/dri/*; do
        if [ -c "$device" ]; then  # character device?
            DOCKER_COMMAND+=" --device=$device"
        fi
    done
else
    echo "Error: /dev/dri directory not found on the host. Graphics devices might not be available."
    exit 1
fi

# Finish the command with the chosen image
DOCKER_COMMAND+=" -it ${DOCKER_IMAGE} /bin/bash"

# Print the generated command before execution
echo "-------------------------------------"
echo "Generated command string (before eval):"
echo "$DOCKER_COMMAND"
echo "-------------------------------------"

# Execute the command
eval "$DOCKER_COMMAND"

# Disable shell tracing (if enabled)
# set +x
```

Script version 3:

```bash
#!/bin/bash

# Enable shell tracing for debugging (optional, can be removed once confident)
# set -x

# --- IMPORTANT: Find your DOCKER/NERDCTL executable path ---
# (same discovery steps as in version 2)
DOCKER_BIN="nerdctl -n k8s.io --address /data/containerd/run/containerd.sock"  # <--- REPLACE WITH YOUR ACTUAL DOCKER/NERDCTL PATH

# Default Docker image (used if DOCKER_IMAGE is not set in the environment)
DEFAULT_IMAGE="vllm:2.32.0.11-torch2.4-py310-ubuntu20.04-amd64"

DOCKER_IMAGE="${DOCKER_IMAGE:-$DEFAULT_IMAGE}"

# Require at least one volume-mapping argument.
# "$#" holds the total number of command-line arguments.
if [ "$#" -eq 0 ]; then
    echo "Error: At least one volume mapping is required."
    echo
    echo "Usage: [DOCKER_IMAGE=<image_name>] $0 <host_dir1:container_dir1> [<host_dir2:container_dir2> ...]"
    echo
    echo "Description:"
    echo "  This script starts a Docker container and maps one or more host directories"
    echo "  into the container."
    echo
    echo "Arguments:"
    echo "  <host_dir:container_dir>  A directory to mount, with host and container paths separated by a colon."
    echo "                            Multiple mapping arguments can be provided."
    echo
    echo "Environment Variable:"
    echo "  DOCKER_IMAGE  The Docker image to use. Defaults to: '${DEFAULT_IMAGE}'"
    echo
    echo "Examples:"
    echo "  # Map a single directory using the default image (${DEFAULT_IMAGE})"
    echo "  $0 /home/user/data:/app/data"
    echo
    echo "  # Map multiple directories with a specific image"
    echo "  DOCKER_IMAGE=python:3.9-slim $0 /home/user/project:/app /home/user/logs:/logs"
    exit 1
fi

# Start building the docker run command
DOCKER_COMMAND="${DOCKER_BIN} run -it --rm"

# Iterate over all command-line arguments.
# "$@" treats each command-line argument as a separate, quoted string.
for mapping in "$@"; do
    # For each argument, append a "-v" flag and the mapping to the command string.
    DOCKER_COMMAND+=" -v ${mapping}"
done

# Add the always-included device if it exists
if [ -e "/dev/htcd" ]; then
    DOCKER_COMMAND+=" --device=/dev/htcd"
else
    echo "Warning: /dev/htcd not found on the host."
fi

# Add every character device under /dev/dri to the command
if [ -d "/dev/dri" ]; then
    for device in /dev/dri/*; do
        if [ -c "$device" ]; then  # character device?
            DOCKER_COMMAND+=" --device=$device"
        fi
    done
else
    echo "Error: /dev/dri directory not found on the host. Graphics devices might not be available."
    exit 1
fi

# Finish the command with the chosen image
DOCKER_COMMAND+=" ${DOCKER_IMAGE} /bin/bash"

# Print the generated command before execution
echo "-------------------------------------"
echo "Generated command string (before eval):"
echo "$DOCKER_COMMAND"
echo "-------------------------------------"

# Execute the command
eval "$DOCKER_COMMAND"

# Disable shell tracing (if enabled)
# set +x
```

Smoke tests inside the container: run `ht-smi`, then in Python run `import vllm`, `import torch`, and `print(torch.cuda.is_available())`.
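
These checks can be kept as a small script. A sketch; it assumes the MetaX torch build exposes the standard `torch.cuda` API, which the `is_available()` check above implies:

```python
# sanity_check.py -- quick smoke test inside the container
import torch
import vllm

print("vllm version:", vllm.__version__)
print("cuda available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("device count:", torch.cuda.device_count())
    print("device 0:", torch.cuda.get_device_name(0))
```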

Running vLLM:

```python
# vllm_model.py
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
import os

# Download models from ModelScope automatically; without this setting,
# vLLM downloads from Hugging Face instead
os.environ['VLLM_USE_MODELSCOPE'] = 'True'

def get_completion(prompts, model, tokenizer=None, max_tokens=512, temperature=0.8, top_p=0.95, max_model_len=2048):
    stop_token_ids = [151329, 151336, 151338]
    # Sampling parameters: temperature controls diversity, top_p controls nucleus sampling
    sampling_params = SamplingParams(temperature=temperature, top_p=top_p, max_tokens=max_tokens, stop_token_ids=stop_token_ids)
    # Initialize the vLLM inference engine
    llm = LLM(model=model, tokenizer=tokenizer, max_model_len=max_model_len, trust_remote_code=True)
    outputs = llm.generate(prompts, sampling_params)
    return outputs


if __name__ == "__main__":
    # Initialize the vLLM inference engine
    model = '/root/autodl-tmp/qwen/Qwen2-7B-Instruct'  # local model path
    # model = "qwen/Qwen2-7B-Instruct"  # model name; downloads automatically
    tokenizer = None
    # Optionally load a tokenizer and pass it to vLLM (not required)
    # tokenizer = AutoTokenizer.from_pretrained(model, use_fast=False)

    text = ["你好,帮我介绍一下什么是大语言模型。",
            "可以给我讲一个有趣的童话故事吗?"]
    # messages = [
    #     {"role": "system", "content": "你是一个有用的助手。"},
    #     {"role": "user", "content": prompt}
    # ]
    # Optionally render the messages with the chat template (not required)
    # text = tokenizer.apply_chat_template(
    #     messages,
    #     tokenize=False,
    #     add_generation_prompt=True
    # )

    outputs = get_completion(text, model, tokenizer=tokenizer, max_tokens=512, temperature=1, top_p=1, max_model_len=2048)

    # The output is a list of RequestOutput objects containing the prompt,
    # the generated text, and other information.
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
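
To use the commented-out chat-template path above, the tokenizer renders the message list into a plain prompt string before it is handed to `llm.generate`. A minimal sketch, assuming the same Qwen2 model path:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('/root/autodl-tmp/qwen/Qwen2-7B-Instruct')
messages = [
    {"role": "system", "content": "你是一个有用的助手。"},
    {"role": "user", "content": "你好,帮我介绍一下什么是大语言模型。"},
]
# Render the chat messages into a single prompt string, ending with the
# assistant turn so the model knows to generate a reply.
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
```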

Using `vllm serve`:

In my tests, only the Qwen2.5-1.5B-Instruct model could be served:

```shell
pip install modelscope
modelscope download --model Qwen/Qwen2.5-1.5B-Instruct
cp -r ~/.cache/modelscope/hub/models/Qwen/ ./model/
vllm serve /root/python_venv/llm/model/Qwen/Qwen2.5-1.5B-Instruct
```
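
For reference, the download step can also be done from Python. A sketch using modelscope's `snapshot_download`; the `cache_dir` value is just an example:

```python
from modelscope import snapshot_download

# Download Qwen2.5-1.5B-Instruct into ./model instead of the default cache
model_dir = snapshot_download('Qwen/Qwen2.5-1.5B-Instruct', cache_dir='./model')
print(model_dir)
```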

The server then listens on port 8000:

```shell
curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{
  "model": "/root/python_venv/llm/model/Qwen/Qwen2.5-1.5B-Instruct",
  "prompt": ["<|begin▁of▁sentence|>你好,DeepSeek!<|end▁of▁sentence|>"],
  "max_tokens": 100,
  "temperature": 0.6
}'
```
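
Because `vllm serve` exposes an OpenAI-compatible API, the same endpoint can be called with the `openai` Python client. A minimal sketch; a default vLLM server accepts any placeholder API key:

```python
from openai import OpenAI

# vLLM's server speaks the OpenAI API; point the client at it.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.completions.create(
    model="/root/python_venv/llm/model/Qwen/Qwen2.5-1.5B-Instruct",
    prompt="你好!",
    max_tokens=100,
    temperature=0.6,
)
print(completion.choices[0].text)
```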

lmcache cannot be installed in this environment yet because `nvcc` is missing.
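
A quick way to confirm the missing dependency before attempting the install; a trivial sketch:

```python
import shutil

# lmcache's build needs the CUDA compiler; None here means nvcc is not on PATH
print("nvcc:", shutil.which("nvcc"))
```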


## 3. Test results

```shell
Average latency: 0.22ms
P50 latency: 0.15ms
P90 latency: 0.16ms
P99 latency: 1.97ms
Average throughput: 84.45 GB/s
```
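
For context, the P50/P90/P99 figures are percentiles over the per-request latency samples. A sketch of how such a summary is computed; the sample list below is a placeholder, not the real measurements:

```python
import numpy as np

# latencies_ms would be collected per request during the benchmark
latencies_ms = [0.15, 0.16, 0.14, 0.15, 1.97, 0.15]  # placeholder samples

print(f"Average latency: {np.mean(latencies_ms):.2f}ms")
for p in (50, 90, 99):
    print(f"P{p} latency: {np.percentile(latencies_ms, p):.2f}ms")
```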

