jeeltcraft committed
Commit 5e7aadc · verified · 1 Parent(s): 8e2d3ce

Update main.py

Files changed (1)
  1. main.py +74 -133
main.py CHANGED
@@ -1,7 +1,7 @@
  from fastapi import FastAPI, HTTPException
  from pydantic import BaseModel
  from typing import List, Optional
- from ctransformers import AutoModelForCausalLM
+ from llama_cpp import Llama
  import os
  import uuid
  import time
@@ -56,15 +56,16 @@ app = FastAPI(
  OpenAI-compatible API powered by Qwen2.5-Coder-7B for code generation and assistance.

  ### Features
- * 🤖 AI-powered code generation with Qwen2.5-Coder
+ * 🤖 AI-powered code generation with Qwen2.5-Coder (GGUF quantized)
  * 🔌 OpenAI-compatible endpoints for Cursor IDE integration
  * 💰 ETH unit conversion utilities (Wei ↔ Gwei ↔ ETH)
  * 💻 Optimized for coding tasks and assistance
+ * ⚡ Fast inference with llama.cpp

  ### Integration with Cursor IDE
  1. Go to Cursor Settings → Models → Override OpenAI Base URL
  2. Set Base URL: `https://jeeltcraft-luminous.hf.space/v1`
- 3. Model name: `qwen2.5-coder-7b`
+ 3. Model name: `qwen2.5-coder-7b` or `gpt-4`
  4. Add any dummy API key
  """,
  version="1.0.0",
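The Cursor steps above work because the Space exposes standard OpenAI-style routes under `/v1`, so any OpenAI-compatible client can be pointed at the same base URL as a quick sanity check outside Cursor. The sketch below assumes the official `openai` Python package is installed and uses a placeholder key, since the Space only requires that some key be present.

```python
# Minimal sketch: talk to the Space with the stock OpenAI client (pip install openai).
# Base URL and model name come from the integration steps above; the key is a dummy.
from openai import OpenAI

client = OpenAI(
    base_url="https://jeeltcraft-luminous.hf.space/v1",
    api_key="dummy-key",  # any non-empty string, per step 4 above
)

resp = client.chat.completions.create(
    model="qwen2.5-coder-7b",
    messages=[{"role": "user", "content": "Write a Python function that reverses a string."}],
)
print(resp.choices[0].message.content)
```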
@@ -105,47 +106,62 @@ _llm_model = None

  def get_llm():
  """
- Lazy load the model on first use.
+ Lazy load the GGUF model using llama-cpp-python.
  Model files are preloaded by Hugging Face Spaces during build time.
  """
  global _llm_model

  if _llm_model is None:
  try:
- # Files are preloaded to ~/.cache/huggingface/hub/
- model_path = "CISCai/Qwen2.5-Coder-7B-Instruct-SOTA-GGUF"
+ # Model path after preload_from_hub
+ model_repo = "CISCai/Qwen2.5-Coder-7B-Instruct-SOTA-GGUF"
  model_file = "Qwen2.5-Coder-7B-Instruct.IQ4_XS.gguf"

- _llm_model = AutoModelForCausalLM.from_pretrained(
- model_path,
- model_file=model_file,
- model_type="qwen2",
- gpu_layers=0, # CPU only
- context_length=2048,
- max_new_tokens=512,
+ # Build full path
+ from huggingface_hub import hf_hub_download
+ model_path = hf_hub_download(
+ repo_id=model_repo,
+ filename=model_file,
+ cache_dir="/root/.cache/huggingface/hub"
  )
- print("✓ Model loaded successfully")
+
+ print(f"📦 Loading model from: {model_path}")
+
+ _llm_model = Llama(
+ model_path=model_path,
+ n_ctx=2048, # Context window
+ n_threads=4, # CPU threads to use
+ n_gpu_layers=0, # 0 for CPU only
+ verbose=False, # Reduce logging
+ seed=42 # For reproducibility
+ )
+
+ print("✓ Model loaded successfully with llama.cpp")
  except Exception as e:
  print(f"✗ Error loading model: {e}")
  raise

  return _llm_model

- def call_llm(prompt: str) -> str:
+ def call_llm(prompt: str, max_tokens: int = 512, temperature: float = 0.7) -> str:
  """
- Generate response using the preloaded ctransformers model.
+ Generate response using the preloaded GGUF model via llama.cpp.
  """
  try:
  llm = get_llm()
+
  response = llm(
  prompt,
- max_new_tokens=512,
- temperature=0.7,
+ max_tokens=max_tokens,
+ temperature=temperature,
  top_p=0.95,
- repetition_penalty=1.1,
- stop=["</s>", "<|user|>", "<|im_end|>"]
+ repeat_penalty=1.1,
+ stop=["</s>", "<|user|>", "<|im_end|>", "<|im_start|>"],
+ echo=False # Don't include the prompt in output
  )
- return response.strip()
+
+ # Extract text from llama.cpp response
+ return response['choices'][0]['text'].strip()
  except Exception as e:
  return f"Error during inference: {str(e)}"
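For readers who want to try the new loading path outside the FastAPI app, here is a self-contained sketch of what `get_llm()` and `call_llm()` now do, assuming `llama-cpp-python` and `huggingface_hub` are installed; it uses the default Hugging Face cache directory rather than the hard-coded `/root/.cache` path.

```python
# Standalone sketch of the llama.cpp path introduced above (not the app code itself).
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Download (or reuse from cache) the quantized GGUF file.
model_path = hf_hub_download(
    repo_id="CISCai/Qwen2.5-Coder-7B-Instruct-SOTA-GGUF",
    filename="Qwen2.5-Coder-7B-Instruct.IQ4_XS.gguf",
)

# CPU-only load with the same context size as the app.
llm = Llama(model_path=model_path, n_ctx=2048, n_threads=4, n_gpu_layers=0, verbose=False)

# llama.cpp returns a dict; the generated text lives under choices[0]["text"].
out = llm(
    "<|im_start|>user\nWrite a haiku about code review.<|im_end|>\n<|im_start|>assistant\n",
    max_tokens=128,
    stop=["<|im_end|>"],
    echo=False,
)
print(out["choices"][0]["text"].strip())
```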
 
@@ -193,6 +209,21 @@ def convert_eth_units(value: float, from_unit: str = "eth") -> dict:
  }
  }

+ # ============== Startup Event ==============
+
+ @app.on_event("startup")
+ async def startup_event():
+ """
+ Pre-load the model during startup to avoid timeout on first request.
+ """
+ print("🚀 Starting up Luminous API...")
+ try:
+ get_llm() # This will load the model
+ print("✅ Model loaded and ready!")
+ except Exception as e:
+ print(f"⚠️ Warning: Could not pre-load model: {e}")
+ print("Model will be loaded on first request.")
+
  # ============== OpenAI-Compatible Endpoints ==============

  @app.post(
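The startup hook added above warms the model so the first request does not pay the full load cost. On newer FastAPI releases the same effect can be had with a lifespan context manager; the sketch below is a hypothetical alternative, not part of this commit, with a placeholder loader standing in for `get_llm()`.

```python
# Hypothetical alternative to @app.on_event("startup") using FastAPI's lifespan hook.
# Not part of this commit; load_model() stands in for the get_llm() defined in main.py.
from contextlib import asynccontextmanager
from fastapi import FastAPI

def load_model() -> None:
    """Placeholder for the real get_llm() loader."""
    print("loading model...")

@asynccontextmanager
async def lifespan(app: FastAPI):
    try:
        load_model()  # warm the model before serving traffic
    except Exception as e:
        print(f"Warning: could not pre-load model: {e}")  # fall back to lazy loading
    yield  # the application serves requests while suspended here

app = FastAPI(lifespan=lifespan)
```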
@@ -211,7 +242,7 @@ async def chat_completions(request: ChatCompletionRequest):
  the model's response.

  ## Parameters
- **model**: Model identifier (use `qwen2.5-coder-7b` for this API)
+ - **model**: Model identifier (use `qwen2.5-coder-7b` or `gpt-4` for this API)
  - **messages**: Array of conversation messages with role and content
  - Role can be: `system`, `user`, or `assistant`
  - **temperature**: Controls randomness (0.0 = deterministic, 2.0 = very random)
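Concretely, a request to this endpoint carries the fields documented above in the usual OpenAI JSON shape. The sketch below uses the `requests` library against the public Space URL; the field values are illustrative, and the response access assumes the standard OpenAI `choices[0].message.content` layout that "OpenAI-compatible" implies here.

```python
# Illustrative request to POST /v1/chat/completions (values are examples only).
import requests

payload = {
    "model": "qwen2.5-coder-7b",  # the "gpt-4" alias also works per the docs above
    "messages": [
        {"role": "system", "content": "You are a helpful coding assistant."},
        {"role": "user", "content": "Explain list comprehensions in one sentence."},
    ],
    "temperature": 0.7,
    "max_tokens": 256,
}

r = requests.post(
    "https://jeeltcraft-luminous.hf.space/v1/chat/completions",
    json=payload,
    headers={"Authorization": "Bearer dummy-key"},  # any key is accepted
)
r.raise_for_status()
print(r.json()["choices"][0]["message"]["content"])
```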
@@ -242,11 +273,21 @@ async def chat_completions(request: ChatCompletionRequest):
  ""
  )

- # Format prompt for Qwen2.5-Coder
- formatted_prompt = f"<|im_start|>system\nYou are a helpful coding assistant.<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant\n"
+ # Get system message if provided
+ system_message = next(
+ (msg.content for msg in request.messages if msg.role == "system"),
+ "You are a helpful coding assistant."
+ )
+
+ # Format prompt for Qwen2.5-Coder using ChatML format
+ formatted_prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant\n"

- # Call your LLM
- response_text = call_llm(formatted_prompt)
+ # Call LLM
+ response_text = call_llm(
+ formatted_prompt,
+ max_tokens=request.max_tokens,
+ temperature=request.temperature
+ )

  # Simple token counting (word-based estimation)
  prompt_tokens = len(user_message.split())
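For readers unfamiliar with ChatML, this is roughly what the assembled prompt and the rough usage estimate look like for one sample exchange; the values are illustrative, and the real code takes them from `request.messages`.

```python
# What the ChatML prompt built above looks like for a sample exchange (illustrative values).
system_message = "You are a helpful coding assistant."
user_message = "Write a function that checks if a number is prime."

formatted_prompt = (
    f"<|im_start|>system\n{system_message}<|im_end|>\n"
    f"<|im_start|>user\n{user_message}<|im_end|>\n"
    f"<|im_start|>assistant\n"
)

# The word-based estimate used for usage reporting is deliberately rough:
prompt_tokens = len(user_message.split())
print(prompt_tokens)  # 10 words, not a true tokenizer count
```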
@@ -293,6 +334,12 @@ async def list_models():
  "object": "model",
  "created": int(time.time()),
  "owned_by": "jeeltcraft"
+ },
+ {
+ "id": "gpt-4", # Alias for better Cursor compatibility
+ "object": "model",
+ "created": int(time.time()),
+ "owned_by": "jeeltcraft"
  }
  ]
  }
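With the alias added, clients that list models see two entries pointing at the same backend. A sketch of the expected `GET /v1/models` payload follows; the outer `object`/`data` wrapper is an assumption based on the usual OpenAI schema (the hunk only shows the entries), and `created` varies per call.

```python
# Assumed shape of GET /v1/models after this change. The wrapper keys are an assumption;
# the two entries mirror the diff above, with "created" varying per call.
expected = {
    "object": "list",
    "data": [
        {"id": "qwen2.5-coder-7b", "object": "model", "created": 1735689600, "owned_by": "jeeltcraft"},
        {"id": "gpt-4", "object": "model", "created": 1735689600, "owned_by": "jeeltcraft"},
    ],
}
```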
@@ -363,110 +410,4 @@ async def convert_units(request: EthConversionRequest):
  Conversions to Wei, Gwei, and ETH with both numeric and formatted values.
  """
  try:
- result = convert_eth_units(request.value, request.from_unit)
- return result
- except ValueError as e:
- raise HTTPException(status_code=400, detail=str(e))
- except Exception as e:
- raise HTTPException(status_code=500, detail=f"Conversion error: {str(e)}")
-
- @app.post(
- "/eth_to_units",
- tags=["Utilities"],
- summary="Quick convert: ETH to Wei/Gwei",
- response_description="Returns Wei and Gwei values"
- )
- async def eth_to_units(item: Validation):
- """
- Quick converter: Extract a number from text and convert from ETH to Wei and Gwei.
-
- This endpoint extracts the first number found in the prompt and treats it as ETH,
- then converts it to Wei and Gwei. Useful for quick conversions in chat interfaces.
-
- ## Example
- Send prompt: `"Convert 0.5 ETH"` or just `"0.5"`
-
- Returns the value in Wei and Gwei.
-
- - **prompt**: Text containing an ETH amount (number will be extracted)
- """
- try:
- # Extract number from prompt
- match = re.search(r'\d+\.?\d*', item.prompt)
- if match:
- eth_value = float(match.group())
- result = convert_eth_units(eth_value, "eth")
- return result
- else:
- raise HTTPException(status_code=400, detail="No numeric value found in prompt")
- except ValueError as e:
- raise HTTPException(status_code=400, detail=str(e))
- except Exception as e:
- raise HTTPException(status_code=500, detail=f"Conversion error: {str(e)}")
-
- @app.get(
- "/quick_convert/{value}/{unit}",
- tags=["Utilities"],
- summary="Quick URL-based ETH conversion",
- response_description="Returns conversions to all units"
- )
- async def quick_convert(value: float, unit: str = "eth"):
- """
- Quick conversion via URL path parameters.
-
- ## Usage Examples
- `/quick_convert/1/eth` - Convert 1 ETH to Wei and Gwei
- `/quick_convert/50/gwei` - Convert 50 Gwei to ETH and Wei
- `/quick_convert/1000000000/wei` - Convert 1,000,000,000 Wei to ETH and Gwei
-
- ## Parameters
- **value**: Numeric amount to convert
- **unit**: Source unit (`eth`, `gwei`, or `wei`)
- """
- try:
- result = convert_eth_units(value, unit)
- return result
- except ValueError as e:
- raise HTTPException(status_code=400, detail=str(e))
- except Exception as e:
- raise HTTPException(status_code=500, detail=f"Conversion error: {str(e)}")
-
- # ============== Health & Info Endpoints ==============
-
- @app.get(
- "/",
- tags=["Utilities"],
- summary="API root information",
- response_description="Returns API status and information"
- )
- async def root():
- """
- Get basic information about the API.
-
- Returns the API name, status, and current model being used.
- """
- return {
- "message": "Luminous API - OpenAI Compatible Coding Assistant",
- "status": "active",
- "model": "Qwen2.5-Coder-7B-Instruct",
- "docs": "/docs",
- "openapi": "/openapi.json"
- }
-
- @app.get(
- "/health",
- tags=["Utilities"],
- summary="Health check",
- response_description="Returns health status and diagnostics"
- )
- async def health_check():
- """
- Check the health status of the API.
-
- Returns information about model loading status.
- """
- return {
- "status": "healthy",
- "model_loaded": _llm_model is not None,
- "api_version": "1.0.0"
- }
+ result = convert_eth_units(request.value, request.from_unit)
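The helper endpoints removed above, and the conversion endpoint that survives, all rest on the same fixed base-10 factors between Wei, Gwei, and ETH. A minimal sketch of that arithmetic is given below for reference; the real `convert_eth_units` also returns formatted string values alongside the numbers.

```python
# Sketch of the unit arithmetic behind convert_eth_units (not the app's exact code).
WEI_PER_ETH = 10**18   # 1 ETH  = 1e18 Wei
WEI_PER_GWEI = 10**9   # 1 Gwei = 1e9 Wei

def to_all_units(value: float, from_unit: str = "eth") -> dict:
    # Normalise to Wei first, then derive the other two units.
    if from_unit == "eth":
        wei = value * WEI_PER_ETH
    elif from_unit == "gwei":
        wei = value * WEI_PER_GWEI
    elif from_unit == "wei":
        wei = value
    else:
        raise ValueError(f"Unknown unit: {from_unit}")
    return {"wei": wei, "gwei": wei / WEI_PER_GWEI, "eth": wei / WEI_PER_ETH}

print(to_all_units(0.5, "eth"))  # {'wei': 5e+17, 'gwei': 500000000.0, 'eth': 0.5}
```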