#!/usr/bin/env python3
"""
Rebuild v0 — re-genera tablas/gráficos con overrides manuales.

Workflow recomendado:
1) Edita data/project_overrides.csv con columnas: id, project
2) Ejecuta: python monark/scripts/rebuild_catalog.py
3) Abre dashboard/monark_dashboard.html

Nota: esto NO re-clasifica con ML; solo aplica overrides y recalcula agregados.
"""
import datetime
import json
import math
import os
from collections import Counter, defaultdict

import numpy as np
import pandas as pd

BASE=os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
DATA=os.path.join(BASE,"data")
DASH=os.path.join(BASE,"dashboard")

def entropy(counter):
    """Shannon entropy (nats) of a Counter; the epsilon guards log(0)."""
    total = sum(counter.values())
    if total == 0:
        return 0.0
    ent = 0.0
    for v in counter.values():
        p = v / total
        ent -= p * math.log(p + 1e-12)
    return ent
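
# Illustrative values: a uniform two-way Counter gives ~log(2) nats and a
# single outcome gives ~0:
#   entropy(Counter({"a": 1, "b": 1}))  # ~= 0.693
#   entropy(Counter({"a": 3}))          # ~= 0.0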

def norm(s):
    """Min-max scale a Series to [0, 100]; constant series map to 50."""
    s = s.astype(float)
    if s.max() == s.min():
        return s * 0 + 50
    return 100 * (s - s.min()) / (s.max() - s.min())
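
# e.g. norm(pd.Series([1, 2, 3])) -> [0.0, 50.0, 100.0].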

def main():
    conv = pd.read_csv(os.path.join(DATA, "conversations_index.csv"))

    # Apply manual project overrides (id -> project); ids without an override
    # keep their existing proj_final label.
    ov_path = os.path.join(DATA, "project_overrides.csv")
    if os.path.exists(ov_path):
        ov = pd.read_csv(ov_path)
        m = dict(zip(ov["id"], ov["project"]))
        conv["proj_final"] = conv["id"].map(m).fillna(conv["proj_final"])
    # Parse the "themes" column: values may be a real list, a list-style
    # string like "['a','b']", or a plain comma-separated string "a, b".
    def parse_list(x):
        if isinstance(x, list):  # check before pd.isna: isna on a list is ambiguous
            return x
        if pd.isna(x):
            return []
        s = str(x)
        if s.startswith("[") and s.endswith("]"):
            s = s.strip("[]")
            return [t.strip().strip("'\"") for t in s.split(",") if t.strip()]
        return [t.strip() for t in s.split(",") if t.strip()]

    conv["themes"]=conv["themes"].map(parse_list)

    # aggregate
    conv["date"]=pd.to_datetime(conv["date"], errors="coerce")
    grp=conv.groupby("proj_final").agg(
        convs=("id","count"),
        start=("date","min"),
        end=("date","max"),
        msgs=("msg_count","sum"),
        user_chars=("user_chars","sum"),
        assistant_chars=("assistant_chars","sum"),
        urls=("urls","sum"),
        files=("files","sum"),
        code_blocks=("code_blocks","sum"),
    ).reset_index().rename(columns={"proj_final":"project"})

    theme_counts=defaultdict(Counter)
    for _,r in conv.iterrows():
        for th in r["themes"]:
            theme_counts[r["proj_final"]][th]+=1

    grp["days_span"]=(grp["end"]-grp["start"]).dt.days.fillna(0).astype(int)
    grp["complexity_raw"]=pd.Series([math.log1p(a+b+c) for a,b,c in zip(grp["code_blocks"], grp["urls"], grp["files"])]) + 0.2*np.log1p(grp["assistant_chars"]/1000)
    grp["time_raw"]=np.log1p(grp["user_chars"]+grp["assistant_chars"])
    grp["theme_entropy"]=grp["project"].map(lambda p: entropy(theme_counts[p]))
    grp["theme_variety"]=grp["project"].map(lambda p: len(theme_counts[p]))

    grp["complexity"]=norm(grp["complexity_raw"])
    grp["time_spent"]=norm(grp["time_raw"])
    grp["dispersion"]=norm(grp["theme_entropy"] + 0.2*grp["theme_variety"])
    grp["maturity"]=norm(np.log1p(grp["days_span"]+1) + 0.3*np.log1p(grp["convs"]))

    grp["projectness"]=norm(0.45*np.log1p(grp["convs"]) + 0.35*np.log1p(grp["files"]+grp["urls"]+grp["code_blocks"]) + 0.2*np.log1p(grp["days_span"]+1))

    # save
    out=grp.sort_values(["projectness","time_spent"], ascending=False)
    out["start_date"]=out["start"].dt.date.astype(str)
    out["end_date"]=out["end"].dt.date.astype(str)
    out.drop(columns=["start","end"]).to_csv(os.path.join(DATA,"projects_index_rebuilt.csv"), index=False)

    cat = {
        "generated_utc": datetime.datetime.now(datetime.timezone.utc).isoformat().replace("+00:00", "Z"),
        "conversations": conv.to_dict(orient="records"),
        "projects": out.drop(columns=["start", "end"]).to_dict(orient="records"),
    }
    with open(os.path.join(DATA, "sirep_catalog_rebuilt.json"), "w", encoding="utf-8") as f:
        # default=str covers values json can't encode natively (Timestamps, NaT).
        json.dump(cat, f, ensure_ascii=False, indent=2, default=str)

    print("OK: data/projects_index_rebuilt.csv + data/sirep_catalog_rebuilt.json")

if __name__=="__main__":
    import numpy as np
    main()
