#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
ChatGPT export (conversations.json) -> SIREP JSONL

Output, one JSON object per line:
{
  "conv_id": "...",
  "title": "...",
  "ts": "...",
  "role": "user|assistant",
  "text": "...",
  "urls": [...],
  "files": [...],
  "project_guess": "...",
  "keywords": [...]
}
"""
import json, re, argparse, datetime
from collections import Counter

# Token pattern used by keywords(): runs of ASCII letters, Spanish accented
# letters (plus ñ/ü), digits and underscores.
WORD = re.compile(r"[a-zA-ZáéíóúñüÁÉÍÓÚÑÜ0-9_]+")
# http/https URLs; a match stops at whitespace or at the first `)`, `>` or `]`.
URL = re.compile(r"https?://[^\s)>\]]+")
# File references (case-insensitive): either a `sandbox:/mnt/data/...` path or
# a path-like token ending in one of the listed extensions.
# NOTE: the pattern has no trailing boundary, so a listed extension can match
# as the prefix of a longer one (e.g. the `.js` part of `.jsx`).
FILE = re.compile(r"(?i)(sandbox:/mnt/data/[^\s)>\]]+|[A-Za-z0-9_\-./]+\.(?:zip|html|png|webp|jpg|jpeg|pdf|pptx|docx|xlsx|json|js|css|php|md|mp3|wav))")

# Tokens that keywords() discards: English and Spanish function words plus
# URL fragments and file-extension tokens.
STOP = set(("the a an and or to of in on for with without from by is are was were be been being "
            "this that those these as at it its i you we they he she them his her our your "
            "de del la las el los y o en con por para sin un una unos unas lo al se que "
            "http https www com mx php html css js png jpg jpeg webp zip pdf pptx docx xlsx json").split())

# Project label -> keywords. guess_project() counts substring occurrences of
# each keyword in the lowercased title + text, so keywords are written in
# lowercase and may match inside longer words.
PROJECTS = {
  "MK Asistente / Entrevista MK": ["entrevista","maskapital","indexeddb","stt","analyzer","pwa","panel_agent","openai"],
  "MAP HUB / Digital Twin": ["map hub","digital twin","zonas","rejilla","muros","cámara","cytoscape","d3"],
  "IoT / ESP32": ["esp32","iot","sensor","pir","bluetooth","relé","incubadora"],
  "WISP / Ciberseguridad": ["wisp","vpn","dvr","dns","glasswire","netstat","seguridad","server"],
  "Minecraft": ["minecraft","denizen","paper","geyser","bedrock","realms","npc"],
  "Trading": ["scalp","long","short","pnl","apalanc","derivados","futuros","stop"],
}

def guess_project(text: str, title: str = "") -> str:
    """Return the project label whose keywords occur most often.

    The title and text are joined with a newline and lowercased, then each
    project's score is the total number of (possibly overlapping-word)
    substring occurrences of its keywords in that string.

    Args:
        text: Message body.
        title: Conversation title; searched together with ``text``.

    Returns:
        The highest-scoring label from ``PROJECTS``. On a tie the project
        listed first wins; ``"Otros"`` is returned when no keyword occurs.
    """
    haystack = "\n".join((title, text)).lower()
    winner, top_score = "Otros", 0
    for label, needles in PROJECTS.items():
        score = sum(haystack.count(needle) for needle in needles)
        # Strict comparison: an equal score never displaces the current winner.
        if score > top_score:
            winner, top_score = label, score
    return winner

def keywords(text: str, k=8):
    """Return up to ``k`` of the most frequent meaningful tokens in ``text``.

    Tokens are ``WORD`` matches, lowercased. A token is ignored when it has
    two characters or fewer, appears in ``STOP``, or consists only of digits.

    Args:
        text: Text to analyse; ``None`` or empty yields an empty list.
        k: Maximum number of keywords to return.

    Returns:
        Tokens ordered by descending frequency; equally frequent tokens keep
        their order of first appearance.
    """
    counts = Counter()
    for match in WORD.finditer(text or ""):
        token = match.group(0).lower()
        if len(token) <= 2 or token in STOP or token.isdigit():
            continue
        counts[token] += 1
    return [token for token, _ in counts.most_common(k)]

def iter_messages(conv):
    """Yield ``(role, text)`` for every mapping node that carries a message.

    Only the ``text`` and ``multimodal_text`` content types contribute text;
    any other content type yields an empty string. For ``multimodal_text``,
    string parts and the ``"text"`` value of dict parts are kept. Parts are
    joined with newlines.

    Malformed entries are tolerated instead of raising: nodes or messages
    that are not dicts are skipped, a non-dict author gives a ``None`` role,
    and a non-dict content or non-list ``parts`` gives empty text.

    Args:
        conv: One conversation dict; its ``"mapping"`` value is expected to
            be a dict of node id -> node.

    Yields:
        Tuples ``(role, text)`` where ``role`` is the author's role (or
        ``None`` when absent) and ``text`` may be empty.
    """
    mapping = conv.get("mapping") or {}
    if not isinstance(mapping, dict):
        return
    for node in mapping.values():
        if not isinstance(node, dict):
            continue
        msg = node.get("message")
        # Nodes without a message (null or empty) carry nothing to emit.
        if not msg or not isinstance(msg, dict):
            continue

        author = msg.get("author")
        role = author.get("role") if isinstance(author, dict) else None

        content = msg.get("content")
        if not isinstance(content, dict):
            content = {}
        raw_parts = content.get("parts")
        # A bare string here would otherwise be iterated character by character.
        if not isinstance(raw_parts, (list, tuple)):
            raw_parts = ()

        ct = content.get("content_type")
        parts = []
        if ct == "text":
            parts = [p for p in raw_parts if isinstance(p, str)]
        elif ct == "multimodal_text":
            for part in raw_parts:
                if isinstance(part, str):
                    parts.append(part)
                elif isinstance(part, dict) and isinstance(part.get("text"), str):
                    parts.append(part["text"])
        yield role, "\n".join(parts)

def main():
    """Convert a ChatGPT ``conversations.json`` export into a JSONL file.

    Command-line arguments:
        --conversations: Path to the exported ``conversations.json``.
        --out: Path of the JSONL file to write (overwritten if it exists).

    One JSON object is written per message with non-blank text. Every record
    of a conversation carries that conversation's ``create_time`` as ``ts``
    (naive-UTC ISO format with a ``Z`` suffix, or ``None`` when missing).
    URL and file lists are capped at 25 entries each.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--conversations", required=True, help="Ruta a conversations.json (export ChatGPT)")
    ap.add_argument("--out", required=True, help="Archivo JSONL de salida")
    args = ap.parse_args()

    # Context managers guarantee both handles are closed, even on errors.
    with open(args.conversations, "r", encoding="utf-8") as src:
        convs = json.load(src)

    with open(args.out, "w", encoding="utf-8") as out:
        for conv in convs:
            conv_id = conv.get("id")
            title = conv.get("title") or ""
            create = conv.get("create_time")
            ts = None
            if create:
                # utcfromtimestamp() is deprecated since Python 3.12; convert
                # via an aware UTC datetime and drop tzinfo so the serialized
                # form stays "<naive ISO>Z" exactly as before.
                moment = datetime.datetime.fromtimestamp(create, tz=datetime.timezone.utc)
                ts = moment.replace(tzinfo=None).isoformat() + "Z"
            for role, text in iter_messages(conv):
                if not text.strip():
                    continue
                # Trim trailing punctuation that the patterns pick up from prose.
                urls = [u.rstrip('.,;\'"') for u in URL.findall(text)]
                files = [f.strip().strip('",.);') for f in FILE.findall(text)]
                rec = {
                    "conv_id": conv_id,
                    "title": title,
                    "ts": ts,
                    "role": role,
                    "text": text,
                    "urls": urls[:25],
                    "files": files[:25],
                    "project_guess": guess_project(text, title),
                    "keywords": keywords(title + "\n" + text, 10),
                }
                out.write(json.dumps(rec, ensure_ascii=False) + "\n")

    print("[OK] wrote", args.out)

# Run the conversion only when this file is executed as a script, not on import.
if __name__=="__main__":
    main()
