# healthcare-slm-1m / config.py
# Author: sathishphdai — "Upload Healthcare-SLM v2" (commit 8cf7869, verified)
"""
Configuration for Healthcare-SLM: A Small Language Model for Healthcare & Medical domain.
LLaMA-style architecture with RoPE β€” supports up to 1M token context.
"""
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional
@dataclass
class SLMConfig:
    """All hyperparameters and paths in one place.

    Path fields left as ``None`` are resolved relative to ``project_dir``
    (and the directories are created) in ``__post_init__``.
    """
    # ── Project paths ──────────────────────────────────────────────
    project_dir: Path = Path(__file__).resolve().parent
    # None means "derive from project_dir" — resolved in __post_init__.
    data_dir: Optional[Path] = None        # defaults to <project_dir>/data
    tokenizer_dir: Optional[Path] = None   # defaults to <project_dir>/tokenizer
    checkpoint_dir: Optional[Path] = None  # defaults to <project_dir>/checkpoints
    # ── Domain ─────────────────────────────────────────────────────
    domain_name: str = "Healthcare"
    domain_slug: str = "healthcare"
    tokenizer_filename: str = "healthcare_tokenizer.json"
    # ── Tokenizer ──────────────────────────────────────────────────
    vocab_size: int = 16_000
    min_frequency: int = 2       # BPE merge threshold for the tokenizer trainer
    special_tokens: list = field(
        default_factory=lambda: [
            "<pad>", "<unk>", "<bos>", "<eos>",
            "<|system|>", "<|user|>", "<|assistant|>",
        ]
    )
    # ── Model (LLaMA-style with RoPE) ─────────────────────────────
    n_layer: int = 8
    n_head: int = 8
    n_embd: int = 512
    block_size: int = 512        # training sequence length (tokens per sample)
    dropout: float = 0.1
    bias: bool = False           # no bias terms in linear layers (LLaMA-style)
    ffn_multiplier: float = 2.667  # ≈ 8/3; presumably the SwiGLU hidden-size ratio — confirm in model code
    # ── RoPE ───────────────────────────────────────────────────────
    # NOTE(review): these values advertise a 100B-token positional range, but
    # the module docstring and repo name ("1m") claim a 1M-token context —
    # confirm the intended scale before relying on either number.
    max_position_embeddings: int = 100_000_000_000  # 100B tokens via RoPE
    rope_theta: float = 50_000_000_000.0  # Scaled for 100B context
    # ── Sliding Window ─────────────────────────────────────────────
    sliding_window: Optional[int] = None  # None → full (non-windowed) attention
    # ── Training ───────────────────────────────────────────────────
    batch_size: int = 4
    gradient_accumulation_steps: int = 4  # effective batch = 4 * 4 = 16
    learning_rate: float = 3e-4
    weight_decay: float = 0.1
    max_epochs: int = 20
    dataset_stride: int = 256    # overlap stride when chunking the corpus
    warmup_steps: int = 20
    grad_clip: float = 1.0
    eval_interval: int = 50      # steps between evaluations
    eval_samples: int = 20
    log_interval: int = 10
    device: str = "auto"         # "auto" resolves to cuda/mps/cpu in __post_init__
    # ── Generation ─────────────────────────────────────────────────
    max_new_tokens: int = 1_000_000  # 1M output tokens
    temperature: float = 0.8
    top_k: int = 50
    top_p: float = 0.9
    # ── Hugging Face ───────────────────────────────────────────────
    hf_repo_name: str = "healthcare-slm-1m"
    hf_model_card_tags: list = field(
        default_factory=lambda: [
            "healthcare", "medical", "clinical", "slm",
            "llama-style", "rope", "1m-context", "from-scratch",
        ]
    )

    def __post_init__(self):
        """Resolve default paths, create the directories, and pick a device."""
        if self.data_dir is None:
            self.data_dir = self.project_dir / "data"
        if self.tokenizer_dir is None:
            self.tokenizer_dir = self.project_dir / "tokenizer"
        if self.checkpoint_dir is None:
            self.checkpoint_dir = self.project_dir / "checkpoints"
        # Side effect: directories are created at construction time.
        for d in (self.data_dir, self.tokenizer_dir, self.checkpoint_dir):
            d.mkdir(parents=True, exist_ok=True)
        if self.device == "auto":
            # Imported lazily so the config works without torch when an
            # explicit device string is supplied.
            import torch
            if torch.cuda.is_available():
                self.device = "cuda"
            elif torch.backends.mps.is_available():
                self.device = "mps"
            else:
                self.device = "cpu"
# Module-level singleton config. NOTE: constructing this at import time has
# side effects — it creates the data/tokenizer/checkpoint directories and,
# with device="auto", imports torch to probe for cuda/mps.
cfg = SLMConfig()