jeeltcraft committed
Commit 5e7aadc · verified · 1 Parent(s): 8e2d3ce

Update main.py

Files changed (1)
  1. main.py +74 -133
main.py CHANGED
@@ -1,7 +1,7 @@
  from fastapi import FastAPI, HTTPException
  from pydantic import BaseModel
  from typing import List, Optional
- from ctransformers import AutoModelForCausalLM
+ from llama_cpp import Llama
  import os
  import uuid
  import time
@@ -56,15 +56,16 @@ app = FastAPI(
  OpenAI-compatible API powered by Qwen2.5-Coder-7B for code generation and assistance.

  ### Features
- * 🤖 AI-powered code generation with Qwen2.5-Coder
+ * 🤖 AI-powered code generation with Qwen2.5-Coder (GGUF quantized)
  * 🔌 OpenAI-compatible endpoints for Cursor IDE integration
  * 💰 ETH unit conversion utilities (Wei ↔ Gwei ↔ ETH)
  * 💻 Optimized for coding tasks and assistance
+ * ⚡ Fast inference with llama.cpp

  ### Integration with Cursor IDE
  1. Go to Cursor Settings → Models → Override OpenAI Base URL
  2. Set Base URL: `https://jeeltcraft-luminous.hf.space/v1`
- 3. Model name: `qwen2.5-coder-7b`
+ 3. Model name: `qwen2.5-coder-7b` or `gpt-4`
  4. Add any dummy API key
  """,
  version="1.0.0",
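The Cursor steps above work because the Space exposes standard OpenAI-style routes under `/v1`, so any OpenAI-compatible client can be pointed at the same base URL as a quick sanity check outside Cursor. The sketch below assumes the official `openai` Python package is installed and uses a placeholder key, since the Space only requires that some key be present.

```python
# Minimal sketch: talk to the Space with the stock OpenAI client (pip install openai).
# Base URL and model name come from the integration steps above; the key is a dummy.
from openai import OpenAI

client = OpenAI(
    base_url="https://jeeltcraft-luminous.hf.space/v1",
    api_key="dummy-key",  # any non-empty string, per step 4 above
)

resp = client.chat.completions.create(
    model="qwen2.5-coder-7b",
    messages=[{"role": "user", "content": "Write a Python function that reverses a string."}],
)
print(resp.choices[0].message.content)
```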
@@ -105,47 +106,62 @@ _llm_model = None

  def get_llm():
  """
- Lazy load the model on first use.
+ Lazy load the GGUF model using llama-cpp-python.
  Model files are preloaded by Hugging Face Spaces during build time.
  """
  global _llm_model

  if _llm_model is None:
  try:
- # Files are preloaded to ~/.cache/huggingface/hub/
- model_path = "CISCai/Qwen2.5-Coder-7B-Instruct-SOTA-GGUF"
+ # Model path after preload_from_hub
+ model_repo = "CISCai/Qwen2.5-Coder-7B-Instruct-SOTA-GGUF"
  model_file = "Qwen2.5-Coder-7B-Instruct.IQ4_XS.gguf"

- _llm_model = AutoModelForCausalLM.from_pretrained(
- model_path,
- model_file=model_file,
- model_type="qwen2",
- gpu_layers=0, # CPU only
- context_length=2048,
- max_new_tokens=512,
+ # Build full path
+ from huggingface_hub import hf_hub_download
+ model_path = hf_hub_download(
+ repo_id=model_repo,
+ filename=model_file,
+ cache_dir="/root/.cache/huggingface/hub"
  )
- print("✓ Model loaded successfully")
+
+ print(f"📦 Loading model from: {model_path}")
+
+ _llm_model = Llama(
+ model_path=model_path,
+ n_ctx=2048, # Context window
+ n_threads=4, # CPU threads to use
+ n_gpu_layers=0, # 0 for CPU only
+ verbose=False, # Reduce logging
+ seed=42 # For reproducibility
+ )
+
+ print("✓ Model loaded successfully with llama.cpp")
  except Exception as e:
  print(f"✗ Error loading model: {e}")
  raise

  return _llm_model

- def call_llm(prompt: str) -> str:
+ def call_llm(prompt: str, max_tokens: int = 512, temperature: float = 0.7) -> str:
  """
- Generate response using the preloaded ctransformers model.
+ Generate response using the preloaded GGUF model via llama.cpp.
  """
  try:
  llm = get_llm()
+
  response = llm(
  prompt,
- max_new_tokens=512,
- temperature=0.7,
+ max_tokens=max_tokens,
+ temperature=temperature,
  top_p=0.95,
- repetition_penalty=1.1,
- stop=["</s>", "<|user|>", "<|im_end|>"]
+ repeat_penalty=1.1,
+ stop=["</s>", "<|user|>", "<|im_end|>", "<|im_start|>"],
+ echo=False # Don't include the prompt in output
  )
- return response.strip()
+
+ # Extract text from llama.cpp response
+ return response['choices'][0]['text'].strip()
  except Exception as e:
  return f"Error during inference: {str(e)}"
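For readers who want to try the new loading path outside the FastAPI app, here is a self-contained sketch of what `get_llm()` and `call_llm()` now do, assuming `llama-cpp-python` and `huggingface_hub` are installed; it uses the default Hugging Face cache directory rather than the hard-coded `/root/.cache` path.

```python
# Standalone sketch of the llama.cpp path introduced above (not the app code itself).
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Download (or reuse from cache) the quantized GGUF file.
model_path = hf_hub_download(
    repo_id="CISCai/Qwen2.5-Coder-7B-Instruct-SOTA-GGUF",
    filename="Qwen2.5-Coder-7B-Instruct.IQ4_XS.gguf",
)

# CPU-only load with the same context size as the app.
llm = Llama(model_path=model_path, n_ctx=2048, n_threads=4, n_gpu_layers=0, verbose=False)

# llama.cpp returns a dict; the generated text lives under choices[0]["text"].
out = llm(
    "<|im_start|>user\nWrite a haiku about code review.<|im_end|>\n<|im_start|>assistant\n",
    max_tokens=128,
    stop=["<|im_end|>"],
    echo=False,
)
print(out["choices"][0]["text"].strip())
```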
 
@@ -193,6 +209,21 @@ def convert_eth_units(value: float, from_unit: str = "eth") -> dict:
  }
  }

+ # ============== Startup Event ==============
+
+ @app.on_event("startup")
+ async def startup_event():
+ """
+ Pre-load the model during startup to avoid timeout on first request.
+ """
+ print("🚀 Starting up Luminous API...")
+ try:
+ get_llm() # This will load the model
+ print("✅ Model loaded and ready!")
+ except Exception as e:
+ print(f"⚠️ Warning: Could not pre-load model: {e}")
+ print("Model will be loaded on first request.")
+
  # ============== OpenAI-Compatible Endpoints ==============

  @app.post(
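The startup hook added above warms the model so the first request does not pay the full load cost. On newer FastAPI releases the same effect can be had with a lifespan context manager; the sketch below is a hypothetical alternative, not part of this commit, with a placeholder loader standing in for `get_llm()`.

```python
# Hypothetical alternative to @app.on_event("startup") using FastAPI's lifespan hook.
# Not part of this commit; load_model() stands in for the get_llm() defined in main.py.
from contextlib import asynccontextmanager
from fastapi import FastAPI

def load_model() -> None:
    """Placeholder for the real get_llm() loader."""
    print("loading model...")

@asynccontextmanager
async def lifespan(app: FastAPI):
    try:
        load_model()  # warm the model before serving traffic
    except Exception as e:
        print(f"Warning: could not pre-load model: {e}")  # fall back to lazy loading
    yield  # the application serves requests while suspended here

app = FastAPI(lifespan=lifespan)
```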
@@ -211,7 +242,7 @@ async def chat_completions(request: ChatCompletionRequest):
  the model's response.

  ## Parameters
- **model**: Model identifier (use `qwen2.5-coder-7b` for this API)
+ - **model**: Model identifier (use `qwen2.5-coder-7b` or `gpt-4` for this API)
  - **messages**: Array of conversation messages with role and content
  - Role can be: `system`, `user`, or `assistant`
  - **temperature**: Controls randomness (0.0 = deterministic, 2.0 = very random)
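Concretely, a request to this endpoint carries the fields documented above in the usual OpenAI JSON shape. The sketch below uses the `requests` library against the public Space URL; the field values are illustrative, and the response access assumes the standard OpenAI `choices[0].message.content` layout that "OpenAI-compatible" implies here.

```python
# Illustrative request to POST /v1/chat/completions (values are examples only).
import requests

payload = {
    "model": "qwen2.5-coder-7b",  # the "gpt-4" alias also works per the docs above
    "messages": [
        {"role": "system", "content": "You are a helpful coding assistant."},
        {"role": "user", "content": "Explain list comprehensions in one sentence."},
    ],
    "temperature": 0.7,
    "max_tokens": 256,
}

r = requests.post(
    "https://jeeltcraft-luminous.hf.space/v1/chat/completions",
    json=payload,
    headers={"Authorization": "Bearer dummy-key"},  # any key is accepted
)
r.raise_for_status()
print(r.json()["choices"][0]["message"]["content"])
```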
@@ -242,11 +273,21 @@ async def chat_completions(request: ChatCompletionRequest):
  ""
  )

- # Format prompt for Qwen2.5-Coder
- formatted_prompt = f"<|im_start|>system\nYou are a helpful coding assistant.<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant\n"
+ # Get system message if provided
+ system_message = next(
+ (msg.content for msg in request.messages if msg.role == "system"),
+ "You are a helpful coding assistant."
+ )
+
+ # Format prompt for Qwen2.5-Coder using ChatML format
+ formatted_prompt = f"<|im_start|>system\n{system_message}<|im_end|>\n<|im_start|>user\n{user_message}<|im_end|>\n<|im_start|>assistant\n"

- # Call your LLM
- response_text = call_llm(formatted_prompt)
+ # Call LLM
+ response_text = call_llm(
+ formatted_prompt,
+ max_tokens=request.max_tokens,
+ temperature=request.temperature
+ )

  # Simple token counting (word-based estimation)
  prompt_tokens = len(user_message.split())
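For readers unfamiliar with ChatML, this is roughly what the assembled prompt and the rough usage estimate look like for one sample exchange; the values are illustrative, and the real code takes them from `request.messages`.

```python
# What the ChatML prompt built above looks like for a sample exchange (illustrative values).
system_message = "You are a helpful coding assistant."
user_message = "Write a function that checks if a number is prime."

formatted_prompt = (
    f"<|im_start|>system\n{system_message}<|im_end|>\n"
    f"<|im_start|>user\n{user_message}<|im_end|>\n"
    f"<|im_start|>assistant\n"
)

# The word-based estimate used for usage reporting is deliberately rough:
prompt_tokens = len(user_message.split())
print(prompt_tokens)  # 10 words, not a true tokenizer count
```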
@@ -293,6 +334,12 @@ async def list_models():
  "object": "model",
  "created": int(time.time()),
  "owned_by": "jeeltcraft"
+ },
+ {
+ "id": "gpt-4", # Alias for better Cursor compatibility
+ "object": "model",
+ "created": int(time.time()),
+ "owned_by": "jeeltcraft"
  }
  ]
  }
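With the alias added, clients that list models see two entries pointing at the same backend. A sketch of the expected `GET /v1/models` payload follows; the outer `object`/`data` wrapper is an assumption based on the usual OpenAI schema (the hunk only shows the entries), and `created` varies per call.

```python
# Assumed shape of GET /v1/models after this change. The wrapper keys are an assumption;
# the two entries mirror the diff above, with "created" varying per call.
expected = {
    "object": "list",
    "data": [
        {"id": "qwen2.5-coder-7b", "object": "model", "created": 1735689600, "owned_by": "jeeltcraft"},
        {"id": "gpt-4", "object": "model", "created": 1735689600, "owned_by": "jeeltcraft"},
    ],
}
```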
@@ -363,110 +410,4 @@ async def convert_units(request: EthConversionRequest):
  Conversions to Wei, Gwei, and ETH with both numeric and formatted values.
  """
  try:
- result = convert_eth_units(request.value, request.from_unit)
- return result
- except ValueError as e:
- raise HTTPException(status_code=400, detail=str(e))
- except Exception as e:
- raise HTTPException(status_code=500, detail=f"Conversion error: {str(e)}")
-
- @app.post(
- "/eth_to_units",
- tags=["Utilities"],
- summary="Quick convert: ETH to Wei/Gwei",
- response_description="Returns Wei and Gwei values"
- )
- async def eth_to_units(item: Validation):
- """
- Quick converter: Extract a number from text and convert from ETH to Wei and Gwei.
-
- This endpoint extracts the first number found in the prompt and treats it as ETH,
- then converts it to Wei and Gwei. Useful for quick conversions in chat interfaces.
-
- ## Example
- Send prompt: `"Convert 0.5 ETH"` or just `"0.5"`
-
- Returns the value in Wei and Gwei.
-
- - **prompt**: Text containing an ETH amount (number will be extracted)
- """
- try:
- # Extract number from prompt
- match = re.search(r'\d+\.?\d*', item.prompt)
- if match:
- eth_value = float(match.group())
- result = convert_eth_units(eth_value, "eth")
- return result
- else:
- raise HTTPException(status_code=400, detail="No numeric value found in prompt")
- except ValueError as e:
- raise HTTPException(status_code=400, detail=str(e))
- except Exception as e:
- raise HTTPException(status_code=500, detail=f"Conversion error: {str(e)}")
-
- @app.get(
- "/quick_convert/{value}/{unit}",
- tags=["Utilities"],
- summary="Quick URL-based ETH conversion",
- response_description="Returns conversions to all units"
- )
- async def quick_convert(value: float, unit: str = "eth"):
- """
- Quick conversion via URL path parameters.
-
- ## Usage Examples
- `/quick_convert/1/eth` - Convert 1 ETH to Wei and Gwei
- `/quick_convert/50/gwei` - Convert 50 Gwei to ETH and Wei
- `/quick_convert/1000000000/wei` - Convert 1,000,000,000 Wei to ETH and Gwei
-
- ## Parameters
- **value**: Numeric amount to convert
- **unit**: Source unit (`eth`, `gwei`, or `wei`)
- """
- try:
- result = convert_eth_units(value, unit)
- return result
- except ValueError as e:
- raise HTTPException(status_code=400, detail=str(e))
- except Exception as e:
- raise HTTPException(status_code=500, detail=f"Conversion error: {str(e)}")
-
- # ============== Health & Info Endpoints ==============
-
- @app.get(
- "/",
- tags=["Utilities"],
- summary="API root information",
- response_description="Returns API status and information"
- )
- async def root():
- """
- Get basic information about the API.
-
- Returns the API name, status, and current model being used.
- """
- return {
- "message": "Luminous API - OpenAI Compatible Coding Assistant",
- "status": "active",
- "model": "Qwen2.5-Coder-7B-Instruct",
- "docs": "/docs",
- "openapi": "/openapi.json"
- }
-
- @app.get(
- "/health",
- tags=["Utilities"],
- summary="Health check",
- response_description="Returns health status and diagnostics"
- )
- async def health_check():
- """
- Check the health status of the API.
-
- Returns information about model loading status.
- """
- return {
- "status": "healthy",
- "model_loaded": _llm_model is not None,
- "api_version": "1.0.0"
- }
+ result = convert_eth_units(request.value, request.from_unit)
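The helper endpoints removed above, and the conversion endpoint that survives, all rest on the same fixed base-10 factors between Wei, Gwei, and ETH. A minimal sketch of that arithmetic is given below for reference; the real `convert_eth_units` also returns formatted string values alongside the numbers.

```python
# Sketch of the unit arithmetic behind convert_eth_units (not the app's exact code).
WEI_PER_ETH = 10**18   # 1 ETH  = 1e18 Wei
WEI_PER_GWEI = 10**9   # 1 Gwei = 1e9 Wei

def to_all_units(value: float, from_unit: str = "eth") -> dict:
    # Normalise to Wei first, then derive the other two units.
    if from_unit == "eth":
        wei = value * WEI_PER_ETH
    elif from_unit == "gwei":
        wei = value * WEI_PER_GWEI
    elif from_unit == "wei":
        wei = value
    else:
        raise ValueError(f"Unknown unit: {from_unit}")
    return {"wei": wei, "gwei": wei / WEI_PER_GWEI, "eth": wei / WEI_PER_ETH}

print(to_all_units(0.5, "eth"))  # {'wei': 5e+17, 'gwei': 500000000.0, 'eth': 0.5}
```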