import os
import time
import uuid

import requests
import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import HTMLResponse
from llama_cpp import Llama
from pydantic import BaseModel, Field
|
|
| |
# --- Model acquisition -------------------------------------------------------
MODEL_URL = "https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
MODEL_NAME = "DeepSeek-R1-Distill-Qwen-1.5B-Q5_K_M.gguf"
MODEL_DIR = "model"
MODEL_PATH = os.path.join(MODEL_DIR, MODEL_NAME)

os.makedirs(MODEL_DIR, exist_ok=True)


def _download_model(url: str, dest_path: str) -> None:
    """Stream-download the GGUF model file to *dest_path*.

    Writes to a temporary ``.part`` file and atomically renames on success,
    so an interrupted download never leaves a truncated file at *dest_path*
    that the next startup would mistake for a complete model.

    Raises:
        RuntimeError: if the server answers with a non-200 status.
        requests.RequestException: on connect/read failures or timeout.
    """
    print(f"Downloading model from {url}...")
    tmp_path = dest_path + ".part"
    # (connect, read) timeout: without it a stalled connection hangs startup forever.
    with requests.get(url, stream=True, timeout=(10, 60)) as response:
        if response.status_code != 200:
            raise RuntimeError(f"Failed to download model: HTTP {response.status_code}")
        with open(tmp_path, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
    os.replace(tmp_path, dest_path)  # atomic on both POSIX and Windows
    print("Model downloaded successfully!")


if not os.path.exists(MODEL_PATH):
    _download_model(MODEL_URL, MODEL_PATH)
else:
    print("Model already exists. Skipping download.")
|
|
| |
# --- Application wiring ------------------------------------------------------
# FastAPI app metadata (shown on /docs and /redoc) plus a fully permissive
# CORS policy so browser clients on any origin may call the API directly.
_API_META = {
    "title": "DeepSeek-R1 OpenAI-Compatible API",
    "description": "OpenAI-compatible API for DeepSeek-R1-Distill-Qwen-1.5B",
    "version": "1.0.0",
}
_CORS_POLICY = {
    "allow_origins": ["*"],
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}

app = FastAPI(**_API_META)
app.add_middleware(CORSMiddleware, **_CORS_POLICY)
|
|
| |
# --- Model loading -----------------------------------------------------------
print("Loading model...")
try:
    llm = Llama(
        model_path=MODEL_PATH,
        n_ctx=2048,       # context window shared by prompt + completion
        n_threads=4,      # CPU inference threads
        n_gpu_layers=0,   # CPU-only inference
        verbose=False
    )
    print("Model loaded successfully!")
except Exception as e:
    # Chain the original exception so the underlying llama.cpp failure
    # (bad path, corrupt GGUF, out-of-memory, ...) stays in the traceback.
    raise RuntimeError(f"Failed to load model: {e}") from e
|
|
| |
@app.get("/", response_class=HTMLResponse)
async def root() -> str:
    """Serve a static HTML landing page with usage notes and a curl example.

    The example request URL embeds the SPACE_HOST environment variable
    (presumably set by the Hugging Face Spaces runtime — confirm), falling
    back to http://localhost:7860. Literal CSS/JSON braces inside this
    f-string are escaped as '{{' / '}}'.
    """
    return f"""
    <html>
    <head>
        <title>DeepSeek-R1 OpenAI API</title>
        <style>
            body {{ font-family: Arial, sans-serif; max-width: 800px; margin: 20px auto; padding: 0 20px; }}
            .warning {{ color: #dc3545; background: #ffeef0; padding: 15px; border-radius: 5px; }}
            a {{ color: #007bff; text-decoration: none; }}
            code {{ background: #f8f9fa; padding: 2px 4px; border-radius: 4px; }}
        </style>
    </head>
    <body>
        <h1>DeepSeek-R1 OpenAI-Compatible API</h1>

        <div class="warning">
            <h3>⚠️ Important Notice</h3>
            <p>For private use, please duplicate this space:<br>
            1. Click your profile picture in the top-right<br>
            2. Select "Duplicate Space"<br>
            3. Set visibility to Private</p>
        </div>

        <h2>API Documentation</h2>
        <ul>
            <li><a href="/docs">Interactive Swagger Documentation</a></li>
            <li><a href="/redoc">ReDoc Documentation</a></li>
        </ul>

        <h2>API Endpoints</h2>
        <h3>Chat Completion</h3>
        <p><code>POST /v1/chat/completions</code></p>
        <p>Parameters:</p>
        <ul>
            <li><strong>messages</strong>: List of message objects</li>
            <li><strong>max_tokens</strong>: Maximum response length (default: 128)</li>
            <li><strong>temperature</strong>: Sampling temperature (default: 0.7)</li>
            <li><strong>top_p</strong>: Nucleus sampling threshold (default: 0.9)</li>
        </ul>

        <h2>Example Request</h2>
        <pre>
curl -X POST "{os.environ.get('SPACE_HOST', 'http://localhost:7860')}/v1/chat/completions" \\
-H "Content-Type: application/json" \\
-d '{{
    "messages": [{{"role": "user", "content": "Explain quantum computing"}}],
    "max_tokens": 150
}}'
        </pre>
    </body>
    </html>
    """
|
|
| |
class ChatCompletionRequest(BaseModel):
    """Request body for POST /v1/chat/completions (OpenAI-compatible subset)."""

    model: str = "DeepSeek-R1-Distill-Qwen-1.5B"  # accepted for compatibility; not used to select a model
    messages: list[dict]  # chat turns; the handler reads msg['role'] and msg['content']
    max_tokens: int = 128
    temperature: float = 0.7
    top_p: float = 0.9
    stream: bool = False  # accepted but never consulted by the handler in this file
|
|
| |
class ChatCompletionResponse(BaseModel):
    """OpenAI-style chat completion response envelope.

    ``id`` and ``created`` use default factories so every response gets a
    fresh unique id and the actual creation time, instead of the previous
    hard-coded "chatcmpl-12345" / fixed 2023 timestamp on every call.
    """

    id: str = Field(default_factory=lambda: f"chatcmpl-{uuid.uuid4().hex}")
    object: str = "chat.completion"
    created: int = Field(default_factory=lambda: int(time.time()))  # Unix seconds
    model: str = "DeepSeek-R1-Distill-Qwen-1.5B"
    choices: list[dict]
    usage: dict
|
|
@app.post("/v1/chat/completions")
async def chat_completion(request: ChatCompletionRequest):
    """Run one chat completion and return an OpenAI-shaped response.

    Flattens the message list into a plain "role: content" transcript,
    runs a single non-streaming completion, and reports the token usage
    computed by llama.cpp itself rather than raw character counts.

    Raises:
        HTTPException: 500 with the underlying error message on any failure.
    """
    try:
        # NOTE: request.stream is accepted but ignored; the reply is
        # always returned in one piece.
        prompt = "\n".join(f"{msg['role']}: {msg['content']}" for msg in request.messages)
        prompt += "\nassistant:"

        response = llm(
            prompt=prompt,
            max_tokens=request.max_tokens,
            temperature=request.temperature,
            top_p=request.top_p,
            stop=["</s>"]
        )

        completion_text = response['choices'][0]['text']
        # Prefer the real token counts llama.cpp returns; fall back to
        # character lengths (the previous approximation) if absent.
        usage = response.get('usage') or {
            "prompt_tokens": len(prompt),
            "completion_tokens": len(completion_text),
            "total_tokens": len(prompt) + len(completion_text)
        }

        return ChatCompletionResponse(
            id=f"chatcmpl-{uuid.uuid4().hex}",  # unique per response
            created=int(time.time()),           # actual creation time
            choices=[{
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": completion_text.strip()
                },
                "finish_reason": "stop"
            }],
            usage=usage
        )
    except Exception as e:
        # Chain the cause so logs show the original model/formatting error.
        raise HTTPException(status_code=500, detail=str(e)) from e
|
|
@app.get("/health")
def health_check():
    """Liveness probe: unconditionally report the service as healthy."""
    return dict(status="healthy")
|
|
# Script entry point: serve the ASGI app on all interfaces, port 7860
# (presumably the port the hosting Space expects — confirm with deployment config).
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)