"""
Pradhya · Nanoclaude · Unit 12
==============================

A minimal router that picks Claude (cloud) or Llama (local Ollama) for
each request based on the task type. The shape of consumer-grade AI
infrastructure five years from now — just built earlier in your closet.

Install:
    pip install anthropic openai

Run (after Ollama is up on the Mac mini, or with OLLAMA_BASE_URL pointing
at wherever it is running, e.g. http://localhost:11434/v1):
    export ANTHROPIC_API_KEY="sk-ant-..."
    python hybrid.py
"""

from __future__ import annotations

import os

from typing import Literal, TypedDict

from anthropic import Anthropic
from openai import OpenAI


# ---------- Backends ----------
# Model identifiers passed verbatim as `model=` to the matching client.
CLOUD_MODEL = "claude-sonnet-4-6"
LOCAL_MODEL = "llama3.1:8b-instruct-q4_K_M"

# Cloud client, built with no explicit arguments (the module docstring has
# you export ANTHROPIC_API_KEY before running).
cloud = Anthropic()

# Local client: Ollama reached through the OpenAI-compatible client.
# The endpoint comes from OLLAMA_BASE_URL when set (for example
# http://localhost:11434/v1 if Ollama runs on this machine) and otherwise
# falls back to a Mac mini on the LAN.
local = OpenAI(
    api_key="ollama",  # placeholder; any string works, Ollama doesn't check it
    base_url=os.getenv("OLLAMA_BASE_URL", "http://mac-mini.local:11434/v1"),
)


# ---------- Task taxonomy ----------
# The task labels route() knows about. route() sends "reason", "analyze",
# "long_context" and "tool_use" to the cloud backend and keeps the rest local.
TaskType = Literal[
    "classify",        # short text classification
    "summarize_short", # < 2k tokens of input
    "embed",           # produce an embedding (use a real embed model in real life)
    "voice",           # voice agent reply, latency-sensitive
    "reason",          # multi-step reasoning
    "analyze",         # complex analysis, long context
    "long_context",    # large doc(s) in the prompt
    "tool_use",        # tool-calling agent
    "draft",           # first-pass writing
]


class Task(TypedDict, total=False):
    """Routing metadata that accompanies a prompt.

    Declared with ``total=False``, so every key is optional; ``route()``
    reads the keys with ``.get`` and tolerates their absence.
    """

    type: TaskType    # task label used by route() to pick a backend
    sensitive: bool   # if True, never send to cloud
    user_id: str      # not read by route() or ask(); presumably for callers -- confirm


# ---------- Router ----------
def route(task: Task) -> Literal["local", "cloud"]:
    """Choose the backend that should serve *task*.

    The policy is the table below; edit it to change the routing.

    * A truthy ``sensitive`` flag pins the task to the local model,
      regardless of its type.
    * Otherwise the task's ``type`` is looked up in the table.
    * A missing or unlisted type falls back to ``"local"``: try the local
      model first and escalate manually if the quality is too low.
    """
    # Privacy override comes before any type-based decision.
    if task.get("sensitive"):
        return "local"

    policy: dict[str, Literal["local", "cloud"]] = {
        # cheap / latency-sensitive work stays on the local model
        "classify": "local",
        "summarize_short": "local",
        "embed": "local",
        "voice": "local",
        "draft": "local",
        # heavier work goes to the cloud model
        "reason": "cloud",
        "analyze": "cloud",
        "long_context": "cloud",
        "tool_use": "cloud",
    }
    return policy.get(task.get("type"), "local")


# ---------- Unified ask() ----------
def ask(task: Task, prompt: str, system: str | None = None) -> str:
    """Send *prompt* to the backend chosen by ``route(task)`` and return the reply text.

    Args:
        task: Routing metadata; it is only passed to ``route``.
        prompt: The user message, sent as a single ``user`` turn.
        system: Optional system prompt. A falsy value (``None`` or ``""``)
            is left out of the request entirely.

    Returns:
        The reply text. This is always a ``str``: a local reply with no
        content, or a cloud reply with no text blocks, yields ``""``.

    Raises:
        Nothing is caught here; any exception raised by the underlying
        client call propagates to the caller.
    """
    backend = route(task)
    if backend == "local":
        messages = []
        if system:
            messages.append({"role": "system", "content": system})
        messages.append({"role": "user", "content": prompt})
        resp = local.chat.completions.create(model=LOCAL_MODEL, messages=messages)
        # `message.content` is nullable in the chat-completions schema;
        # normalise None to "" so the declared `-> str` holds.
        return resp.choices[0].message.content or ""

    # cloud
    kwargs = {"model": CLOUD_MODEL, "max_tokens": 1024,
              "messages": [{"role": "user", "content": prompt}]}
    if system:
        kwargs["system"] = system
    resp = cloud.messages.create(**kwargs)
    # The response body is a list of typed content blocks. Join every text
    # block rather than assuming the first block exists and is text.
    return "".join(
        block.text
        for block in resp.content
        if getattr(block, "type", None) == "text"
    )


# ---------- Demo ----------
if __name__ == "__main__":
    # Three sample requests: a cheap classification, a summary flagged as
    # sensitive, and a multi-step reasoning question.
    triage_email = (
        {"type": "classify"},
        "Tag this email: REPLY / FYI / ACTION / SKIP\n\n"
        "'Are you free Thursday at 11 to walk through the renewal?'",
    )
    private_journal = (
        {"type": "summarize_short", "sensitive": True},
        "Summarize my journal entry from yesterday in three sentences:\n\n"
        "I felt off all day. Three meetings ran long, the new product...",
    )
    hardware_question = (
        {"type": "reason"},
        "Explain how Apple Silicon's unified memory architecture affects "
        "the design of local LLM inference engines. Give two trade-offs.",
    )

    for spec, text in (triage_email, private_journal, hardware_question):
        # Show the routing decision before making the (possibly failing) call.
        chosen = route(spec)
        kind = spec.get("type")
        is_sensitive = spec.get("sensitive", False)
        print(f"\n--- [{chosen:>5s}] type={kind} sensitive={is_sensitive}")
        try:
            reply = ask(spec, text)
            print(reply)
        except Exception as err:
            # Demo only: report the failure and carry on with the next example.
            print(f"  (skipped) {type(err).__name__}: {err}")