"""
One-time setup: prepare tokenizer and upload to HF bucket.
Run on a cheap CPU job:

    hf jobs uv run \
        --flavor cpu-basic \
        --timeout 20m \
        --namespace mishig \
        --secrets HF_TOKEN \
        setup_cache.py
"""
import os
import subprocess
import sys
import tarfile
import urllib.request

# Tarball of the autoresearch repo's master branch, and the directory the
# script changes into once the tarball has been unpacked.
REPO_URL = "https://github.com/karpathy/autoresearch/archive/refs/heads/master.tar.gz"
REPO_DIR = "autoresearch-master"
# Bucket location reported as the tokenizer cache at the end of the run.
BUCKET = "hf://buckets/mishig/autoresearch-cache"

# Fetch the repo tarball into the current directory, unpack it there, and
# continue from inside the extracted checkout.
print("Downloading repo...")
archive_name = "repo.tar.gz"
urllib.request.urlretrieve(REPO_URL, archive_name)
with tarfile.open(archive_name) as archive:
    # The "data" filter is tarfile's safe extraction mode.
    archive.extractall(filter="data")
os.chdir(REPO_DIR)

# Prepare data (download shards + train tokenizer); check=True aborts the
# script if prepare.py exits with a non-zero status.
print("Running prepare.py...")
prepare_cmd = ["uv", "run", "prepare.py", "--num-shards", "10"]
subprocess.run(prepare_cmd, check=True)

# Upload tokenizer to bucket using Python API
# NOTE(review): presumably prepare.py writes the tokenizer to this directory;
# confirm against prepare.py if the path ever changes.
tokenizer_dir = os.path.expanduser("~/.cache/autoresearch/tokenizer")
# Derive the bucket id from BUCKET so the upload target and the location
# reported at the end of the script cannot drift apart.
bucket_id = BUCKET.removeprefix("hf://buckets/")
print("Uploading tokenizer to bucket...")

from huggingface_hub import batch_bucket_files

# Collect (local_path, remote_path) pairs for every regular file directly
# inside tokenizer_dir (no recursion). Sorting keeps the log output and the
# upload order deterministic across runs.
files_to_upload = [
    (os.path.join(tokenizer_dir, fname), f"tokenizer/{fname}")
    for fname in sorted(os.listdir(tokenizer_dir))
    if os.path.isfile(os.path.join(tokenizer_dir, fname))
]

# Fail loudly instead of reporting success when there is nothing to upload.
if not files_to_upload:
    sys.exit(f"No tokenizer files found in {tokenizer_dir}; nothing was uploaded.")

for local_path, _remote_path in files_to_upload:
    print(f"  Uploading {os.path.basename(local_path)}")

batch_bucket_files(bucket_id, add=files_to_upload)

print("Done! Tokenizer cached at", f"{BUCKET}/tokenizer")