Spaces:

DawnC
/

VividFlow

Running on Zero

App Files Files Community

VividFlow / VideoEngine_optimized.py

DawnC

Upload VideoEngine_optimized.py

eb66bbc verified about 1 month ago

raw

history blame contribute delete

12.5 kB

	import warnings
	warnings.filterwarnings('ignore', category=FutureWarning)
	warnings.filterwarnings('ignore', category=DeprecationWarning)

	import gc
	import os
	import tempfile
	import traceback
	from typing import Optional

	import torch
	import numpy as np
	from PIL import Image

	# Critical dependencies
	import ftfy
	import sentencepiece

	# Diffusers imports
	from diffusers.pipelines.wan.pipeline_wan_i2v import WanImageToVideoPipeline
	from diffusers.models.transformers.transformer_wan import WanTransformer3DModel
	from diffusers.utils.export_utils import export_to_video


	class VideoEngine:
	"""
	Ultra-fast video generation with FP8 quantization.
	70-90s inference time (compared to 150s baseline).
	"""

	MODEL_ID = "Wan-AI/Wan2.2-I2V-A14B-Diffusers"
	TRANSFORMER_REPO = "cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers"
	LORA_REPO = "Kijai/WanVideo_comfy"
	LORA_WEIGHT = "Lightx2v/lightx2v_I2V_14B_480p_cfg_step_distill_rank128_bf16.safetensors"

	# Model parameters
	MAX_DIM = 832
	MIN_DIM = 480
	SQUARE_DIM = 640
	MULTIPLE_OF = 16
	FIXED_FPS = 16
	MIN_FRAMES = 8
	MAX_FRAMES = 81

	def __init__(self):
	"""Initialize VideoEngine."""
	self.is_spaces = os.environ.get('SPACE_ID') is not None
	self.device = "cuda" if torch.cuda.is_available() else "cpu"
	self.pipeline: Optional[WanImageToVideoPipeline] = None
	self.is_loaded = False
	self.use_aoti = False

	print(f"✓ VideoEngine initialized ({self.device})")

	def _check_xformers_available(self) -> bool:
	"""Check if xFormers is available."""
	try:
	import xformers
	return True
	except ImportError:
	return False

	def load_model(self) -> None:
	"""Load model with FP8 quantization and AOTI compilation."""
	if self.is_loaded:
	print("⚠ VideoEngine already loaded")
	return

	try:
	print("=" * 60)
	print("Loading Wan2.2 I2V Engine with FP8 Quantization")
	print("=" * 60)

	# Stage 1: Load base pipeline to CPU
	print("→ [1/5] Loading base pipeline to CPU...")
	self.pipeline = WanImageToVideoPipeline.from_pretrained(
	self.MODEL_ID,
	transformer=WanTransformer3DModel.from_pretrained(
	self.TRANSFORMER_REPO,
	subfolder='transformer',
	torch_dtype=torch.bfloat16,
	),
	transformer_2=WanTransformer3DModel.from_pretrained(
	self.TRANSFORMER_REPO,
	subfolder='transformer_2',
	torch_dtype=torch.bfloat16,
	),
	torch_dtype=torch.bfloat16,
	)
	print("✓ Base pipeline loaded to CPU")

	# Stage 2: Load and fuse Lightning LoRA
	print("→ [2/5] Loading Lightning LoRA...")
	self.pipeline.load_lora_weights(
	self.LORA_REPO, weight_name=self.LORA_WEIGHT,
	adapter_name="lightx2v"
	)
	kwargs_lora = {"load_into_transformer_2": True}
	self.pipeline.load_lora_weights(
	self.LORA_REPO, weight_name=self.LORA_WEIGHT,
	adapter_name="lightx2v_2", **kwargs_lora
	)
	self.pipeline.set_adapters(
	["lightx2v", "lightx2v_2"],
	adapter_weights=[1., 1.]
	)
	self.pipeline.fuse_lora(
	adapter_names=["lightx2v"], lora_scale=3.,
	components=["transformer"]
	)
	self.pipeline.fuse_lora(
	adapter_names=["lightx2v_2"], lora_scale=1.,
	components=["transformer_2"]
	)
	self.pipeline.unload_lora_weights()
	print("✓ Lightning LoRA fused")

	# Stage 3: FP8 Quantization
	print("→ [3/5] Applying FP8 quantization...")
	try:
	from torchao.quantization import quantize_
	from torchao.quantization import Float8DynamicActivationFloat8WeightConfig

	# int8_weight_only 在 torchao 0.16+ 改名為 Int8WeightOnlyConfig
	try:
	from torchao.quantization import int8_weight_only
	int8_config = int8_weight_only()
	except ImportError:
	from torchao.quantization import Int8WeightOnlyConfig
	int8_config = Int8WeightOnlyConfig()

	# Quantize text encoder (INT8)
	quantize_(self.pipeline.text_encoder, int8_config)

	# Quantize transformers (FP8)
	quantize_(
	self.pipeline.transformer,
	Float8DynamicActivationFloat8WeightConfig()
	)
	quantize_(
	self.pipeline.transformer_2,
	Float8DynamicActivationFloat8WeightConfig()
	)

	print("✓ FP8 quantization applied (50% memory reduction)")
	except Exception as e:
	print(f"⚠ Quantization failed: {e}")
	raise RuntimeError("FP8 quantization required for this optimized version")

	# Stage 4: AOTI compilation (disabled for stability)
	print("→ [4/5] Skipping AOTI compilation...")
	self.use_aoti = False
	print("✓ Using FP8 quantization only")

	# Stage 5: Move to GPU and enable optimizations
	print("→ [5/5] Moving to GPU...")
	gc.collect()
	if torch.cuda.is_available():
	torch.cuda.empty_cache()

	self.pipeline = self.pipeline.to('cuda')

	# Enable VAE optimizations (if available)
	try:
	if hasattr(self.pipeline, 'enable_vae_tiling'):
	self.pipeline.enable_vae_tiling()
	if hasattr(self.pipeline, 'enable_vae_slicing'):
	self.pipeline.enable_vae_slicing()
	print(" • VAE tiling/slicing enabled")
	except Exception as e:
	print(f" ⚠ VAE optimizations not available: {e}")

	# Enable TF32
	if torch.cuda.is_available():
	torch.backends.cuda.matmul.allow_tf32 = True
	torch.backends.cudnn.allow_tf32 = True

	# Enable xFormers
	try:
	if self._check_xformers_available():
	self.pipeline.enable_xformers_memory_efficient_attention()
	print(" • xFormers enabled")
	except:
	pass

	self.is_loaded = True
	print("=" * 60)
	print("✓ VideoEngine Ready")
	print(f" • Device: {self.device}")
	print(f" • Quantization: FP8 (50% memory reduction)")
	print("=" * 60)

	except Exception as e:
	print(f"\n{'='*60}")
	print("✗ FATAL ERROR LOADING VIDEO ENGINE")
	print(f"{'='*60}")
	print(f"Error Type: {type(e).__name__}")
	print(f"Error Message: {str(e)}")
	print(f"\nFull Traceback:")
	print(traceback.format_exc())
	print(f"{'='*60}")
	raise

	def resize_image(self, image: Image.Image) -> Image.Image:
	"""Resize image to fit model constraints while preserving aspect ratio."""
	width, height = image.size

	if width == height:
	return image.resize((self.SQUARE_DIM, self.SQUARE_DIM), Image.LANCZOS)

	aspect_ratio = width / height
	MAX_ASPECT_RATIO = self.MAX_DIM / self.MIN_DIM
	MIN_ASPECT_RATIO = self.MIN_DIM / self.MAX_DIM

	image_to_resize = image

	if aspect_ratio > MAX_ASPECT_RATIO:
	target_w, target_h = self.MAX_DIM, self.MIN_DIM
	crop_width = int(round(height * MAX_ASPECT_RATIO))
	left = (width - crop_width) // 2
	image_to_resize = image.crop((left, 0, left + crop_width, height))
	elif aspect_ratio < MIN_ASPECT_RATIO:
	target_w, target_h = self.MIN_DIM, self.MAX_DIM
	crop_height = int(round(width / MIN_ASPECT_RATIO))
	top = (height - crop_height) // 2
	image_to_resize = image.crop((0, top, width, top + crop_height))
	else:
	if width > height:
	target_w = self.MAX_DIM
	target_h = int(round(target_w / aspect_ratio))
	else:
	target_h = self.MAX_DIM
	target_w = int(round(target_h * aspect_ratio))

	final_w = round(target_w / self.MULTIPLE_OF) * self.MULTIPLE_OF
	final_h = round(target_h / self.MULTIPLE_OF) * self.MULTIPLE_OF
	final_w = max(self.MIN_DIM, min(self.MAX_DIM, final_w))
	final_h = max(self.MIN_DIM, min(self.MAX_DIM, final_h))

	return image_to_resize.resize((final_w, final_h), Image.LANCZOS)

	def get_num_frames(self, duration_seconds: float) -> int:
	"""Calculate frame count from duration."""
	return 1 + int(np.clip(
	int(round(duration_seconds * self.FIXED_FPS)),
	self.MIN_FRAMES,
	self.MAX_FRAMES,
	))

	def generate_video(
	self,
	image: Image.Image,
	prompt: str,
	duration_seconds: float = 3.0,
	num_inference_steps: int = 4,
	guidance_scale: float = 1.0,
	guidance_scale_2: float = 1.0,
	seed: int = 42,
	) -> str:
	"""Generate video from image with FP8 quantization."""
	if not self.is_loaded:
	raise RuntimeError("VideoEngine not loaded. Call load_model() first.")

	try:
	resized_image = self.resize_image(image)
	num_frames = self.get_num_frames(duration_seconds)

	print(f"\n→ Generating video:")
	print(f" • Prompt: {prompt}")
	print(f" • Resolution: {resized_image.width}x{resized_image.height}")
	print(f" • Frames: {num_frames} ({duration_seconds}s @ {self.FIXED_FPS}fps)")
	print(f" • Steps: {num_inference_steps}")

	# Memory cleanup
	gc.collect()
	if torch.cuda.is_available():
	torch.cuda.empty_cache()
	torch.cuda.synchronize()

	with torch.no_grad():
	# Use CUDA generator for optimized version
	generator = torch.Generator(device="cuda").manual_seed(seed)

	output_frames = self.pipeline(
	image=resized_image,
	prompt=prompt,
	height=resized_image.height,
	width=resized_image.width,
	num_frames=num_frames,
	guidance_scale=float(guidance_scale),
	guidance_scale_2=float(guidance_scale_2),
	num_inference_steps=int(num_inference_steps),
	generator=generator,
	).frames[0]

	# Cleanup after generation
	gc.collect()
	if torch.cuda.is_available():
	torch.cuda.empty_cache()

	# Export video
	temp_dir = tempfile.gettempdir()
	output_path = os.path.join(temp_dir, f"deltaflow_{seed}.mp4")
	export_to_video(output_frames, output_path, fps=self.FIXED_FPS)

	print(f"✓ Video generated: {output_path}")
	return output_path

	except Exception as e:
	print(f"\n{'='*60}")
	print("✗ FATAL ERROR DURING VIDEO GENERATION")
	print(f"{'='*60}")
	print(f"Error Type: {type(e).__name__}")
	print(f"Error Message: {str(e)}")
	print(f"\nFull Traceback:")
	print(traceback.format_exc())
	print(f"{'='*60}")
	raise

	def unload_model(self) -> None:
	"""Unload pipeline and free memory."""
	if not self.is_loaded:
	return

	try:
	if self.pipeline is not None:
	del self.pipeline
	self.pipeline = None

	gc.collect()
	if torch.cuda.is_available():
	torch.cuda.empty_cache()

	self.is_loaded = False
	print("✓ VideoEngine unloaded")

	except Exception as e:
	print(f"⚠ Error during unload: {str(e)}")