"""
Discord Scraper Module for Code Similarity Checker.

Scrapes URLs from Discord channels for specific #tags.
Supports incremental scraping (only fetches new messages since last scrape).
"""

import asyncio
import concurrent.futures
import os
import re
from typing import Iterable, List, Optional, Tuple

# Discord.py for bot interactions
try:
    import discord
except ImportError as exc:
    raise ImportError("discord.py not installed. Discord scraping disabled.") from exc

# os.environ.get() returns None for a missing variable (it never raises
# KeyError), so the token is validated when a scrape is requested instead.
DISCORD_BOT_TOKEN = os.environ.get("DISCORD_BOT_TOKEN")

# URL regex pattern
URL_PATTERN = re.compile(
    r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+[^\s]*",
    re.IGNORECASE,
)

# Upper bound, in seconds, for a single channel scrape.
_SCRAPE_TIMEOUT_SECONDS = 60


def _require_bot_token() -> str:
    """Return the configured bot token or raise RuntimeError if it is missing."""
    if not DISCORD_BOT_TOKEN:
        raise RuntimeError("DISCORD_BOT_TOKEN not set in environment")
    return DISCORD_BOT_TOKEN


def _dedupe_preserving_order(items: Iterable[str]) -> List[str]:
    """Return the items with duplicates removed, keeping first occurrences."""
    return list(dict.fromkeys(items))


def _extract_tagged_urls(contents: Iterable[str], tag: str) -> List[str]:
    """Collect every URL from the message texts that mention the tag.

    Matching is a case-insensitive substring test on the tag name without
    its leading "#". That single test also covers the "#tag" and "# tag"
    spellings, because both contain the bare tag name.
    """
    tag_lower = tag.lower().lstrip("#")
    urls: List[str] = []
    for content in contents:
        if tag_lower in content.lower():
            urls.extend(URL_PATTERN.findall(content))
    return urls


async def _scrape_channel_messages(
    channel_id: str,
    tag: str,
    after_message_id: Optional[str] = None,
    limit: int = 500,
) -> Tuple[List[str], Optional[str]]:
    """
    Internal async function to scrape messages from a Discord channel.

    Starts a temporary Discord client, reads the channel history once the
    client is ready, then closes the client again. Errors are printed and
    result in whatever was collected so far (possibly nothing).

    Args:
        channel_id: Discord channel ID
        tag: Tag to search for (e.g., "#nlp-classification")
        after_message_id: If set, only fetch messages after this ID (incremental)
        limit: Maximum number of messages to fetch

    Returns:
        (urls, last_message_id) - de-duplicated list of URLs found and ID of
        the newest message fetched (None if no message was fetched)
    """
    intents = discord.Intents.default()
    intents.message_content = True
    client = discord.Client(intents=intents)

    urls: List[str] = []
    newest_message_id: Optional[str] = None
    tag_lower = tag.lower().lstrip("#")

    @client.event
    async def on_ready():
        nonlocal newest_message_id
        try:
            channel = client.get_channel(int(channel_id))
            if not channel:
                channel = await client.fetch_channel(int(channel_id))
            if not channel:
                print(f"❌ Channel {channel_id} not found")
                return  # the finally clause below closes the client

            # Prepare fetch parameters
            after = None
            if after_message_id:
                try:
                    after = discord.Object(id=int(after_message_id))
                except (TypeError, ValueError):
                    # An unusable cursor falls back to a non-incremental fetch.
                    print(
                        f"⚠️ Ignoring invalid after_message_id "
                        f"{after_message_id!r}; scraping without it"
                    )

            # History is requested newest-first, so index 0 is the newest message.
            messages = [
                message
                async for message in channel.history(
                    limit=limit,
                    after=after,
                    oldest_first=False,
                )
            ]

            if messages:
                # Track newest message for incremental scraping
                newest_message_id = str(messages[0].id)

            urls.extend(
                _extract_tagged_urls((message.content for message in messages), tag_lower)
            )

            print(f"✅ Scraped {len(messages)} messages, found {len(urls)} URLs for #{tag_lower}")

        except Exception as e:
            print(f"❌ Error scraping channel: {e}")
        finally:
            # Closing the client makes client.start() below return.
            await client.close()

    # Run the client
    try:
        await client.start(DISCORD_BOT_TOKEN)
    except Exception as e:
        print(f"❌ Discord client error: {e}")
    finally:
        # Also runs when this coroutine is cancelled (e.g. by a timeout),
        # so the connection is not left open.
        if not client.is_closed():
            await client.close()

    return _dedupe_preserving_order(urls), newest_message_id


def scrape_tag_from_channel(
    channel_id: str,
    tag: str,
    after_message_id: Optional[str] = None,
) -> Tuple[List[str], Optional[str]]:
    """
    Synchronous wrapper for Discord channel scraping.

    Uses a separate thread with its own event loop to avoid conflicts with
    FastAPI's uvloop.

    Args:
        channel_id: Discord channel ID
        tag: Tag to search for (e.g., "nlp-classification" or "#nlp-classification")
        after_message_id: For incremental scraping

    Returns:
        (urls, last_message_id)

    Raises:
        RuntimeError: If DISCORD_BOT_TOKEN is not configured
        concurrent.futures.TimeoutError: If the scrape exceeds 60 seconds
    """
    _require_bot_token()

    # Normalize tag
    tag = tag.lstrip("#")

    async def scrape_with_timeout():
        # The timeout is enforced inside the worker's event loop so the scrape
        # is actually cancelled. A timeout on future.result() alone would not
        # help: leaving the executor's with-block waits for the worker anyway.
        return await asyncio.wait_for(
            _scrape_channel_messages(channel_id, tag, after_message_id),
            timeout=_SCRAPE_TIMEOUT_SECONDS,
        )

    def run_in_thread():
        """Run the async scraper in a new thread with its own event loop."""
        return asyncio.run(scrape_with_timeout())

    # Run in a separate thread to avoid event loop conflicts
    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
        future = executor.submit(run_in_thread)
        try:
            return future.result()
        except asyncio.TimeoutError as exc:
            # Keep raising the exception type callers saw from future.result(timeout=...).
            raise concurrent.futures.TimeoutError(
                f"Discord scrape exceeded {_SCRAPE_TIMEOUT_SECONDS} seconds"
            ) from exc


def get_or_scrape_urls(
    channel_id: str,
    tag: str,
    force_refresh: bool = False,
) -> List[str]:
    """
    Main entry point. Uses cache with incremental updates.

    1. Check cache for channel_id + tag
    2. If cached and not force_refresh, fetch only NEW messages
    3. Merge new URLs with cached URLs
    4. Update cache

    Args:
        channel_id: Discord channel ID
        tag: Tag name (with or without #)
        force_refresh: If True, re-scrape everything ignoring cache

    Returns:
        List of URLs

    Raises:
        RuntimeError: If Discord is not configured
    """
    import db

    # Fail before touching the cache when no scrape could be performed.
    _require_bot_token()

    # Run async db functions in the current thread's event loop
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

    try:
        # Check cache
        cached = loop.run_until_complete(db.get_cached_discord_urls(channel_id, tag))

        if not cached or force_refresh:
            # Full scrape
            urls, last_id = scrape_tag_from_channel(channel_id, tag)
            if urls and last_id:
                loop.run_until_complete(
                    db.update_discord_cache(channel_id, tag, urls, last_id)
                )
            return urls

        # Incremental scrape - only get new messages
        after_id = cached.get("last_message_id")
        cached_urls = cached.get("urls") or []
        new_urls, new_last_id = scrape_tag_from_channel(channel_id, tag, after_id)

        if new_urls:
            # Merge: new URLs first, then cached (preserving order, no duplicates)
            fresh = set(new_urls)
            all_urls = new_urls + [u for u in cached_urls if u not in fresh]
        else:
            all_urls = cached_urls

        last_id = new_last_id or after_id
        # Also persist when only the cursor moved (new messages without URLs),
        # so the same messages are not fetched again on the next call.
        if new_urls or last_id != after_id:
            loop.run_until_complete(
                db.update_discord_cache(channel_id, tag, all_urls, last_id)
            )

        if not new_urls:
            print(f"📋 Returning {len(cached_urls)} cached URLs for #{tag}")
        return all_urls

    finally:
        # Do not leave a closed loop installed as this thread's current loop.
        asyncio.set_event_loop(None)
        loop.close()