import os
import requests
from firecrawl import Firecrawl
from dotenv import load_dotenv
from xml.etree import ElementTree
import time

load_dotenv()

FIRECRAWL_API_KEY = os.getenv("FIRECRAWL_API_KEY")
WEBSITE_URL_TO_CRAWL = os.getenv("WEBSITE_URL_TO_CRAWL")

firecrawl = Firecrawl(api_key=FIRECRAWL_API_KEY)

def fetch_sitemap(url):
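    """Download a sitemap URL and return its parsed XML root element."""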
    response = requests.get(url)
    response.raise_for_status()
    # Remove leading whitespace to avoid XML declaration errors
    content = response.content.lstrip()
    return ElementTree.fromstring(content)

def extract_urls(sitemap_xml):
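    """Collect every <loc> URL from a parsed sitemap (sitemaps.org 0.9 namespace)."""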
    urls = []
    for url in sitemap_xml.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}url"):
        loc = url.find("{http://www.sitemaps.org/schemas/sitemap/0.9}loc")
        if loc is not None:
            urls.append(loc.text)
    return urls

def save_markdown(markdown, directory, filename):
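    """Write markdown to markdown-data/<directory>/<filename>, creating directories as needed."""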
    # Build the output path under the base markdown-data directory
    base_dir = "markdown-data"
    full_dir = os.path.join(base_dir, directory)
    
    # Create directory if it doesn't exist
    os.makedirs(full_dir, exist_ok=True)
    
    # Save markdown to file
    with open(os.path.join(full_dir, filename), 'w', encoding='utf-8') as f:
        f.write(markdown)

def process_sitemap(sitemap_url, directory):
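    """Scrape every URL listed in a sitemap and save each page as a markdown file."""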
    print(f"Fetching sitemap: {sitemap_url}")
    try:
        sitemap_xml = fetch_sitemap(sitemap_url)
    except Exception as e:
        print(f"Error fetching sitemap {sitemap_url}: {e}")
        return
    urls = extract_urls(sitemap_xml)
    print(f"Found {len(urls)} URLs in {sitemap_url}")
    for url in urls:
        print(f"Processing: {url}")
        try:
            # Scrape the page and request markdown output
            result = firecrawl.scrape(url, formats=["markdown"])
            # The SDK may return a document object with a `markdown` attribute,
            # a plain dict, or a raw string depending on version; handle each case
            if isinstance(result, dict):
                markdown = result.get('markdown')
            elif isinstance(result, str):
                markdown = result
            else:
                markdown = getattr(result, 'markdown', None)
            if not markdown:
                print(f"No markdown returned for {url}")
                continue
            # Use page name from URL for filename
            page_name = url.rstrip("/").split("/")[-1] or "index"
            save_markdown(markdown, directory, f"{page_name}.md")
            print(f"Saved markdown for {url} as {page_name}.md")
            # Add delay to avoid rate limit errors
            time.sleep(5)
        except Exception as e:
            print(f"Error crawling {url}: {e}")

def get_sub_sitemaps(main_sitemap_url):
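    """Return the child sitemap URLs listed in a sitemap index."""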
    xml = fetch_sitemap(main_sitemap_url)
    sub_sitemaps = []
    for sitemap in xml.findall(".//{http://www.sitemaps.org/schemas/sitemap/0.9}sitemap"):
        loc = sitemap.find("{http://www.sitemaps.org/schemas/sitemap/0.9}loc")
        if loc is not None:
            sub_sitemaps.append(loc.text)
    return sub_sitemaps

def create_markdown_files():
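    """Fetch the sitemap index and scrape the WordPress post and page sitemaps it lists."""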
    print(f"Main sitemap: {WEBSITE_URL_TO_CRAWL}")
    sub_sitemaps = get_sub_sitemaps(WEBSITE_URL_TO_CRAWL)
    print(f"Found {len(sub_sitemaps)} sub-sitemaps:")
    for i, sub_url in enumerate(sub_sitemaps, start=1):
        print(f" {i}.  {sub_url}")
        if "post-sitemap" in sub_url:
            print(f"Processing posts sitemap: {sub_url}")
            process_sitemap(sub_url, "posts_markdown")
        elif "page-sitemap" in sub_url:
            print(f"Processing pages sitemap: {sub_url}")
            process_sitemap(sub_url, "pages_markdown")

if __name__ == "__main__":
    create_markdown_files()

