Supported platforms: Linux, macOS
git clone https://github.com/OpenBMB/llama.cpp.git
cd llama.cpp
git checkout minicpm3
make
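# Alternatively, newer llama.cpp revisions build with CMake instead of the Makefile.
# A rough sketch (assuming cmake is installed; binaries then land under build/bin/ rather than the repo root):
cmake -B build
cmake --build build --config Release -j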
# Create a directory under llama.cpp/models to hold the MiniCPM3 model files
mkdir models/Minicpm3
python3 -m pip install -r requirements.txt
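# The original MiniCPM3 Hugging Face checkpoint must be present in models/Minicpm3/ before conversion.
# A sketch of one way to fetch it (assuming the repo id is openbmb/MiniCPM3-4B and the huggingface_hub CLI is installed):
huggingface-cli download openbmb/MiniCPM3-4B --local-dir models/Minicpm3/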
# Convert the PyTorch model to an fp16 GGUF file
python3 convert_hf_to_gguf.py models/Minicpm3/ --outfile /your/path/llama.cpp/models/Minicpm3/CPM-4B-F16.gguf
# After completing the steps above, a CPM-4B-F16.gguf model file will exist under llama.cpp/models/Minicpm3
# After the following command succeeds, a 4-bit quantized file ggml-model-Q4_K_M.gguf will exist under models/Minicpm3/
./llama-quantize ./models/Minicpm3/CPM-4B-F16.gguf ./models/Minicpm3/ggml-model-Q4_K_M.gguf Q4_K_M
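# Other quantization types can be produced the same way, e.g. an 8-bit variant
# (a sketch; Q8_0 is one of llama.cpp's standard quantization types):
./llama-quantize ./models/Minicpm3/CPM-4B-F16.gguf ./models/Minicpm3/ggml-model-Q8_0.gguf Q8_0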
# If llama-quantize cannot be found, try building it explicitly:
cd llama.cpp
make llama-quantize
# Run inference with the quantized model
./llama-cli -c 1024 -m ./models/Minicpm3/ggml-model-Q4_K_M.gguf -n 1024 --top-p 0.7 --temp 0.7 --prompt "<your prompt here>"
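# For a back-and-forth session instead of a single prompt, llama-cli also has an interactive mode
# (a sketch, assuming this build exposes the long-standing -i / --interactive-first flags):
./llama-cli -c 1024 -m ./models/Minicpm3/ggml-model-Q4_K_M.gguf -n 1024 --top-p 0.7 --temp 0.7 -i --interactive-first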
# Start the llama.cpp HTTP server with the converted model (listens on port 8080 by default)
./llama-server -m ./models/Minicpm3/CPM-4B-F16.gguf -c 2048
import requests

# With llama-server running, send a completion request to its /completion endpoint
url = "http://localhost:8080/completion"
headers = {
    "Content-Type": "application/json"
}
data = {
    "prompt": "Which company released MiniCPM3?",
    "n_predict": 128
}

response = requests.post(url, json=data, headers=headers)

if response.status_code == 200:
    result = response.json()
    print(result["content"])
else:
    print(f"Request failed with status code {response.status_code}: {response.text}")