import gradio as gr
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch

# Inference is pinned to the CPU; the processor and the model weights are
# both loaded from the same BLIP base captioning checkpoint.
device = torch.device("cpu")
processor = BlipProcessor.from_pretrained(
    "Salesforce/blip-image-captioning-base"
)
# Module.to() returns the module itself, so loading and device placement
# are chained into one expression.
model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base"
).to(device)

# Dedicated template for the explicit natural-language description style;
# it is passed to the BLIP processor as the text input.
INSTRUCTION = " ".join(
    (
        "Describe this NSFW image in highly explicit, natural language.",
        "Include specific details about the subject's pose, nudity, sexual context, facial expression, body parts, hairstyle, clothing (if any), background, lighting, and emotional tone.",
        "Use clear, uncensored language, and write it as a vivid and realistic scene description.",
    )
)

def generate_detailed_prompt(image):
    """Generate a text description of ``image`` with the BLIP model.

    The module-level ``INSTRUCTION`` is given to the processor as the text
    input, and up to 150 new tokens are generated from the image and that
    text.

    Args:
        image: PIL image supplied by the Gradio image input, or ``None``
            when the form is submitted without an upload.

    Returns:
        The decoded generated text with the echoed instruction prefix
        removed, or an empty string when no image was provided.
    """
    # Nothing to caption when the form is submitted without an image;
    # return an empty result instead of failing inside the model call.
    if image is None:
        return ""

    inputs = processor(image, text=INSTRUCTION, return_tensors="pt").to(device)
    output = model.generate(**inputs, max_new_tokens=150)
    caption = processor.decode(output[0], skip_special_tokens=True)

    # The generated sequence begins with the prompt tokens, so the decoded
    # text starts with the instruction itself. Decode the prompt through the
    # same tokenizer (so normalization matches) and drop it when present.
    prompt_text = processor.decode(inputs["input_ids"][0], skip_special_tokens=True)
    if prompt_text and caption.startswith(prompt_text):
        caption = caption[len(prompt_text):].strip()
    return caption

# Single-image-in, text-out web UI around generate_detailed_prompt.
demo = gr.Interface(
    fn=generate_detailed_prompt,
    inputs=gr.Image(type="pil"),  # the callback receives the upload as a PIL image
    outputs="text",
    title="🔞 NSFW Image to Natural Prompt (BLIP)",
    description="Upload NSFW image to get a detailed, uncensored, natural-language prompt. Based on BLIP. CPU-only.",
    allow_flagging="never",
)

# Start the server only when executed as a script, so importing this module
# (e.g. to reuse `demo`) does not launch a blocking web server.
if __name__ == "__main__":
    demo.launch()