parent
e0de1bac36
commit
30ab48e314
|
|
@ -0,0 +1,51 @@
|
|||
additionalProperties:
|
||||
formFields:
|
||||
- default: 8800
|
||||
edit: true
|
||||
envKey: PANEL_APP_PORT_HTTP
|
||||
labelEn: Port
|
||||
labelZh: 端口
|
||||
required: true
|
||||
rule: paramPort
|
||||
type: number
|
||||
label:
|
||||
en: Port
|
||||
ja: ポート
|
||||
ms: Port
|
||||
pt-br: Porta
|
||||
ru: Порт
|
||||
zh-Hant: 端口
|
||||
zh: 端口
|
||||
ko: 포트
|
||||
- default: ""
|
||||
edit: true
|
||||
envKey: HUGGING_FACE_HUB_TOKEN
|
||||
labelEn: Hugging Face Token
|
||||
labelZh: Hugging Face Token
|
||||
required: true
|
||||
type: text
|
||||
label:
|
||||
en: Hugging Face Token
|
||||
ja: Hugging Face Token
|
||||
ms: Hugging Face Token
|
||||
pt-br: Hugging Face Token
|
||||
ru: Hugging Face Token
|
||||
zh-Hant: Hugging Face Token
|
||||
zh: Hugging Face Token
|
||||
ko: Hugging Face Token
|
||||
- default: facebook/opt-125m
|
||||
edit: true
|
||||
envKey: MODEL
|
||||
labelEn: Model
|
||||
labelZh: 模型
|
||||
required: true
|
||||
type: text
|
||||
label:
|
||||
en: Model
|
||||
ja: モデル
|
||||
ms: Model
|
||||
pt-br: Modelo
|
||||
ru: Модель
|
||||
zh-Hant: 模型
|
||||
zh: 模型
|
||||
ko: 모델
|
||||
|
|
@ -0,0 +1,27 @@
|
|||
services:
|
||||
vllm:
|
||||
image: vllm/vllm-openai:v0.7.2
|
||||
container_name: ${CONTAINER_NAME}
|
||||
restart: always
|
||||
runtime: nvidia
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
count: all
|
||||
capabilities: [gpu]
|
||||
networks:
|
||||
- 1panel-network
|
||||
volumes:
|
||||
- ./cache/huggingface:/root/.cache/huggingface
|
||||
environment:
|
||||
HUGGING_FACE_HUB_TOKEN: ${HUGGING_FACE_HUB_TOKEN}
|
||||
HF_ENDPOINT: https://hf-mirror.com
|
||||
ports:
|
||||
- "${PANEL_APP_PORT_HTTP}:8000"
|
||||
ipc: host
|
||||
command: --model ${MODEL}
|
||||
networks:
|
||||
1panel-network:
|
||||
external: true
|
||||
|
|
@ -0,0 +1,49 @@
|
|||
## 使用说明
|
||||
1. 在 https://huggingface.co/ 注册账号并获取模型权限创建 token
|
||||
2. 机器上有 Nvidia GPU
|
||||
3. 修改 /etc/docker/daemon.json 并增加
|
||||
```
|
||||
"runtimes": {
|
||||
"nvidia": {
|
||||
"path": "nvidia-container-runtime",
|
||||
"runtimeArgs": []
|
||||
}
|
||||
}
|
||||
```
|
||||
4. 安装 nvidia-container-runtime 和 nvidia-docker2 组件
|
||||
|
||||
|
||||
## About
|
||||
vLLM is a fast and easy-to-use library for LLM inference and serving.
|
||||
|
||||
vLLM is fast with:
|
||||
|
||||
- State-of-the-art serving throughput
|
||||
- Efficient management of attention key and value memory with **PagedAttention**
|
||||
- Continuous batching of incoming requests
|
||||
- Fast model execution with CUDA/HIP graph
|
||||
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8.
|
||||
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
|
||||
- Speculative decoding
|
||||
- Chunked prefill
|
||||
|
||||
**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/4068) that compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)).
|
||||
|
||||
vLLM is flexible and easy to use with:
|
||||
|
||||
- Seamless integration with popular Hugging Face models
|
||||
- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
|
||||
- Tensor parallelism and pipeline parallelism support for distributed inference
|
||||
- Streaming outputs
|
||||
- OpenAI-compatible API server
|
||||
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron.
|
||||
- Prefix caching support
|
||||
- Multi-lora support
|
||||
|
||||
vLLM seamlessly supports most popular open-source models on HuggingFace, including:
|
||||
- Transformer-like LLMs (e.g., Llama)
|
||||
- Mixture-of-Expert LLMs (e.g., Mixtral)
|
||||
- Embedding Models (e.g. E5-Mistral)
|
||||
- Multi-modal LLMs (e.g., LLaVA)
|
||||
|
||||
Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
|
||||
|
|
@ -0,0 +1,49 @@
|
|||
## Instructions
|
||||
1. Register an account at https://huggingface.co/ and get model access to create a token.
|
||||
2. Ensure the machine has an Nvidia GPU.
|
||||
3. Modify the /etc/docker/daemon.json file and add:
|
||||
```
|
||||
"runtimes": {
|
||||
"nvidia": {
|
||||
"path": "nvidia-container-runtime",
|
||||
"runtimeArgs": []
|
||||
}
|
||||
}
|
||||
```
|
||||
4. Install the nvidia-container-runtime and nvidia-docker2 components.
|
||||
|
||||
|
||||
## About
|
||||
vLLM is a fast and easy-to-use library for LLM inference and serving.
|
||||
|
||||
vLLM is fast with:
|
||||
|
||||
- State-of-the-art serving throughput
|
||||
- Efficient management of attention key and value memory with **PagedAttention**
|
||||
- Continuous batching of incoming requests
|
||||
- Fast model execution with CUDA/HIP graph
|
||||
- Quantizations: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), INT4, INT8, and FP8.
|
||||
- Optimized CUDA kernels, including integration with FlashAttention and FlashInfer.
|
||||
- Speculative decoding
|
||||
- Chunked prefill
|
||||
|
||||
**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/4068) that compares the performance of vLLM against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)).
|
||||
|
||||
vLLM is flexible and easy to use with:
|
||||
|
||||
- Seamless integration with popular Hugging Face models
|
||||
- High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
|
||||
- Tensor parallelism and pipeline parallelism support for distributed inference
|
||||
- Streaming outputs
|
||||
- OpenAI-compatible API server
|
||||
- Support NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, TPU, and AWS Neuron.
|
||||
- Prefix caching support
|
||||
- Multi-lora support
|
||||
|
||||
vLLM seamlessly supports most popular open-source models on HuggingFace, including:
|
||||
- Transformer-like LLMs (e.g., Llama)
|
||||
- Mixture-of-Expert LLMs (e.g., Mixtral)
|
||||
- Embedding Models (e.g. E5-Mistral)
|
||||
- Multi-modal LLMs (e.g., LLaVA)
|
||||
|
||||
Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
|
||||
|
|
@ -0,0 +1,28 @@
|
|||
name: vLLM
|
||||
tags:
|
||||
- AI / 大模型
|
||||
title: 用于大语言模型的高吞吐量和内存高效的推理和服务引擎
|
||||
description: 用于大语言模型的高吞吐量和内存高效的推理和服务引擎
|
||||
additionalProperties:
|
||||
key: vllm
|
||||
name: vLLM
|
||||
tags:
|
||||
- AI
|
||||
shortDescZh: 用于大语言模型的高吞吐量和内存高效的推理和服务引擎
|
||||
shortDescEn: A high-throughput and memory-efficient inference and serving engine for LLMs
|
||||
description:
|
||||
en: A high-throughput and memory-efficient inference and serving engine for LLMs
|
||||
ja: 大規模言語モデル向けの高スループットでメモリ効率の良い推論およびサービスエンジン
|
||||
ms: Enjin inferens dan perkhidmatan yang cekap memori dan berkapasiti tinggi untuk LLM
|
||||
pt-br: Motor de inferência e serviço eficiente em memória e de alto rendimento para LLMs
|
||||
ru: Высокопроизводительный и эффективный по памяти движок вывода и обслуживания для LLM
|
||||
zh-Hant: 用於大語言模型的高吞吐量和內存高效的推理和服務引擎
|
||||
zh: 用于大语言模型的高吞吐量和内存高效的推理和服务引擎
|
||||
ko: 대형 언어 모델을 위한 고 처리량 및 메모리 효율적 추론 및 서비스 엔진
|
||||
type: tool
|
||||
crossVersionUpdate: true
|
||||
limit: 0
|
||||
recommend: 13
|
||||
website: https://github.com/vllm-project/vllm
|
||||
github: https://github.com/vllm-project/vllm
|
||||
document: https://docs.vllm.ai/en/latest/
|
||||
Binary file not shown.
|
After Width: | Height: | Size: 10 KiB |
Loading…
Reference in New Issue