简介:
这篇文章部署的是对话模型 Qwen3-0.6B,如有需要,可以换成向量模型 Qwen3-Embedding-0.6B。
1.检查
WSL 里不要安装 Linux 版 NVIDIA 驱动,只要 Windows 端的驱动正常,并且在 WSL 中执行下面的命令能看到显卡信息即可。
nvidia-smi
或
nvidia-smi || /usr/lib/wsl/lib/nvidia-smi
2.换国内 apt 源(可选,即下面的 cp 和 sed 命令)并安装基础依赖(必做)
sudo cp /etc/apt/sources.list /etc/apt/sources.list.bak
sudo sed -i 's|http://archive.ubuntu.com/ubuntu|https://mirrors.aliyun.com/ubuntu|g' /etc/apt/sources.list
sudo sed -i 's|http://security.ubuntu.com/ubuntu|https://mirrors.aliyun.com/ubuntu|g' /etc/apt/sources.list
sudo apt update
sudo apt install -y wget curl git git-lfs build-essential ca-certificates \
python3-pip python3-venv python-is-python3
git lfs install
3.创建 Python 虚拟环境
mkdir -p ~/ai/qwen3
cd ~/ai/qwen3
python3 -m venv .venv
source .venv/bin/activate
4.配置国内 pip 源(可选)
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
pip config set global.extra-index-url https://download.pytorch.org/whl/cu121
pip config set global.trusted-host "pypi.tuna.tsinghua.edu.cn download.pytorch.org"
python -m pip install -U pip setuptools wheel
5.安装 PyTorch CUDA 版
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
6.验证 CUDA
python - <<'PY'
import torch
print("cuda:", torch.cuda.is_available())
print("gpu:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else None)
print("torch:", torch.__version__)
PY
7.安装 transformers
因为 WSL + RTX 3060 Laptop 上不适合使用最新版的 vllm,而最新版的 transformers 又不兼容 vllm 0.10.2,所以这里只能指定版本
pip install "transformers==4.55.4" "tokenizers==0.21.4" -i https://pypi.tuna.tsinghua.edu.cn/simple
8.安装 vLLM 和 huggingface 下载工具
因为 WSL + RTX 3060 Laptop 上不适合使用最新版的 vllm,所以只能指定 0.10.2 版本
pip install -U "vllm==0.10.2" "huggingface_hub[cli]" modelscope
9.配置 HuggingFace 国内镜像源
export HF_ENDPOINT=https://hf-mirror.com
export HF_HOME=$HOME/.cache/huggingface
export HUGGINGFACE_HUB_CACHE=$HF_HOME/hub
10.下载 Qwen3-0.6B
mkdir -p ~/ai/qwen3/models
hf download Qwen/Qwen3-0.6B --local-dir ~/ai/qwen3/models/Qwen3-0.6B
11.启动
unset VLLM_USE_V1
export VLLM_USE_V1=0
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
vllm serve ~/ai/qwen3/models/Qwen3-0.6B \
--served-model-name qwen3-0.6b \
--host 0.0.0.0 \
--port 6006 \
--dtype float16 \
--gpu-memory-utilization 0.75 \
--max-model-len 4096 \
--max-num-seqs 1 \
--api-key 123456
12.测试
文档地址:http://localhost:6006/docs
curl http://127.0.0.1:6006/v1/chat/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer 123456" \
-d '{
"model": "qwen3-0.6b",
"messages": [
{"role": "user", "content": "用三句话介绍一下 Qwen3-0.6B。"}
],
"temperature": 0.6,
"max_tokens": 512
}'
13.备注
1.如果显存充足,可以把启动参数里的:
--max-model-len 4096
# 改成:
--max-model-len 8192
2.需要换成向量模型的,按以下操作:
# 切换环境
source ~/ai/qwen3/.venv/bin/activate
# 下载模型
hf download Qwen/Qwen3-Embedding-0.6B --local-dir ~/ai/qwen3/models/Qwen3-Embedding-0.6B
# 安装 sentence-transformers
pip install \
"fastapi" \
"uvicorn[standard]" \
"sentence-transformers>=2.7.0" \
"accelerate" \
-i https://pypi.tuna.tsinghua.edu.cn/simple
# 创建接口服务(供外部调用,也可以不用创建,用程序直连)
cat > ~/ai/qwen3/embedding_server.py <<'PY'
import os
import secrets
import time
import uuid
from typing import List, Union, Optional

import torch
from fastapi import FastAPI, Header, HTTPException
from pydantic import BaseModel
from sentence_transformers import SentenceTransformer
# Shared secret checked by the /v1/embeddings endpoint
# ("Authorization: Bearer <API_KEY>").
API_KEY = "123456"
# Model id reported in API responses.
MODEL_NAME = "qwen3-embedding-0.6b"
# Resolve the model directory under the current user's home rather than
# hard-coding one user's /home/<name>, so the script works for any account.
MODEL_PATH = os.path.expanduser("~/ai/qwen3/models/Qwen3-Embedding-0.6B")

app = FastAPI(title="Qwen3 Embedding OpenAI Compatible API")

# Load the model once at import time, with float16 weights and left-side
# tokenizer padding.
model = SentenceTransformer(
    MODEL_PATH,
    model_kwargs={
        "torch_dtype": torch.float16,
        "device_map": "auto",
    },
    tokenizer_kwargs={
        "padding_side": "left",
    },
)
# Request body for POST /v1/embeddings.
class EmbeddingsRequest(BaseModel):
    # A single text or a batch of texts to embed.
    input: Union[str, List[str]]
    # Model id sent by the client; defaults to MODEL_NAME when omitted.
    model: Optional[str] = MODEL_NAME
    # Requested embedding length; None leaves the vectors untruncated.
    dimensions: Optional[int] = None
# Model listing endpoint: reports the single model served by this process.
@app.get("/v1/models")
def list_models():
    served_model = {
        "id": MODEL_NAME,
        "object": "model",
        "created": int(time.time()),
        "owned_by": "local",
    }
    return {"object": "list", "data": [served_model]}
@app.post("/v1/embeddings")
def embeddings(req: EmbeddingsRequest, authorization: Optional[str] = Header(None)):
    """Embed the requested text(s) and return an OpenAI-style embeddings payload.

    Args:
        req: Parsed request body (input text or list of texts, optional model
            id, optional output ``dimensions``).
        authorization: Value of the ``Authorization`` header; it must equal
            ``Bearer <API_KEY>``.

    Returns:
        A dict whose ``data`` list holds one ``embedding`` entry per input
        text, in input order. An empty input list yields an empty ``data``.

    Raises:
        HTTPException: 401 when the header is missing or does not match;
            400 when ``dimensions`` is below 32 or above the size of the
            vectors returned by the model.
    """
    expected = f"Bearer {API_KEY}"
    # Compare as bytes in constant time: avoids leaking the key through
    # timing and avoids a TypeError on non-ASCII header values.
    if authorization is None or not secrets.compare_digest(
        authorization.encode("utf-8"), expected.encode("utf-8")
    ):
        raise HTTPException(status_code=401, detail="Invalid API key")

    texts = [req.input] if isinstance(req.input, str) else req.input

    data = []
    # Skip the model entirely for an empty batch; there is nothing to encode
    # and the result would have no second axis to validate against.
    if texts:
        vectors = model.encode(
            texts,
            normalize_embeddings=True,
            convert_to_numpy=True,
            batch_size=8,
            show_progress_bar=False,
        )

        if req.dimensions is not None:
            if req.dimensions < 32 or req.dimensions > vectors.shape[1]:
                raise HTTPException(status_code=400, detail=f"dimensions must be between 32 and {vectors.shape[1]}")
            # Truncate, then re-normalise so the shortened vectors are unit
            # length again. The model is loaded in float16, so do the norm
            # arithmetic in float32 to limit rounding error.
            vectors = vectors[:, :req.dimensions].astype("float32")
            norms = (vectors ** 2).sum(axis=1, keepdims=True) ** 0.5
            # Guard against division by zero for an all-zero truncated row.
            norms[norms == 0] = 1.0
            vectors = vectors / norms

        data = [
            {
                "object": "embedding",
                "index": i,
                "embedding": vec.tolist(),
            }
            for i, vec in enumerate(vectors)
        ]

    return {
        "object": "list",
        "model": MODEL_NAME,
        "data": data,
        # Token accounting is not implemented; zeros are placeholders.
        "usage": {
            "prompt_tokens": 0,
            "total_tokens": 0,
        },
    }
PY
# 启动
cd ~/ai/qwen3
source .venv/bin/activate
export CUDA_VISIBLE_DEVICES=0
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
uvicorn embedding_server:app \
--host 0.0.0.0 \
--port 6007
# 测试
curl http://127.0.0.1:6007/v1/embeddings \
-H "Content-Type: application/json" \
-H "Authorization: Bearer 123456" \
-d '{
"model": "qwen3-embedding-0.6b",
"input": [
"你好,世界",
"Qwen3-Embedding-0.6B 是一个文本向量模型"
]
}'