#!/usr/bin/env python3
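"""Scrape likers + commenters from keith's last N posts.

By default the scraped set is the N_POSTS most recent posts, plus the
highest-engagement post among the 20 most recent, plus any post listed in
post_descriptors.json. If engager_posts.json lists post codes, only those
posts are scraped instead.

For each post, a short descriptor is taken from post_descriptors.json or
generated via the claude CLI, for use in the prefab outreach template
"Thanks so much for the love on {descriptor}".

Filters out: existing CSV handles, EXCLUDED/PARTNER bucket, self.

Output: engagers.json (posts, engagers, and DM thread_ids where resolved)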
"""
import csv
import json
import os
import subprocess
import time
from datetime import datetime
from pathlib import Path
from urllib.parse import unquote

import rookiepy
from instagrapi import Client
from instagrapi.exceptions import ClientError

from _buckets_lib import load_skip_handles

# ── Config ────────────────────────────────────────────────────────────────────
# Scrape limits
N_POSTS = 3              # how many of the most recent posts to scrape
LIKERS_LIMIT = 250       # likers kept per post (IG can return thousands)
COMMENTS_LIMIT = 100     # comments requested per post

# Lower/upper bound assigned to the instagrapi client's delay_range
DELAY_MIN = 1.5
DELAY_MAX = 2.5

# Own account; always part of the skip set
SELF_HANDLE = "@shakingmedicine"

# Input/output files sit next to this script
BASE = Path(__file__).parent
DM_CSV = BASE.joinpath("follow_up_FULL.csv")
OUT = BASE.joinpath("engagers.json")

# claude CLI binary and the environment it is launched with: a copy of the
# current environment with ANTHROPIC_API_KEY removed.
CLAUDE_BIN = "/Users/daichi/.local/bin/claude"
ENV_NO_KEY = dict(os.environ)
ENV_NO_KEY.pop("ANTHROPIC_API_KEY", None)

# Hand-curated post descriptor overrides, keyed by post code. Each value is
# either a descriptor phrase (str) or a dict with "descriptor" / "template"
# keys. Used instead of claude.
_overrides_path = Path(__file__).parent / "post_descriptors.json"
POST_OVERRIDES = {}
if _overrides_path.exists():
    try:
        _loaded_overrides = json.loads(_overrides_path.read_text())
    except (OSError, ValueError) as _e:
        # Best-effort: a broken overrides file must not abort the scrape, but
        # it should not go unnoticed either (JSONDecodeError is a ValueError).
        print(f"  warning: could not load {_overrides_path.name}: {_e}", flush=True)
    else:
        if isinstance(_loaded_overrides, dict):
            POST_OVERRIDES = _loaded_overrides
        else:
            # Later code calls POST_OVERRIDES.get(code), so only a JSON object is usable.
            print(
                f"  warning: {_overrides_path.name} must contain a JSON object, "
                f"got {type(_loaded_overrides).__name__}; ignoring",
                flush=True,
            )
print(f"loaded {len(POST_OVERRIDES)} hand-curated post descriptors", flush=True)

# ── Login ─────────────────────────────────────────────────────────────────────
# Reuses the Instagram session cookie from the local Chrome profile.
print("logging in via chrome session...", flush=True)
cl = Client()
cl.delay_range = [DELAY_MIN, DELAY_MAX]
cookies    = rookiepy.chrome(domains=["instagram.com"])
_cookie_values = {c["name"]: c["value"] for c in cookies}
if not _cookie_values.get("sessionid"):
    # Without this guard the failure is a bare KeyError('sessionid').
    raise SystemExit(
        "no instagram.com 'sessionid' cookie found in Chrome; "
        "log in to Instagram in Chrome and run again"
    )
# Cookie values are URL-encoded; decode before handing to the login helper.
session_id = unquote(_cookie_values["sessionid"])
from _auth_lib import safe_login
safe_login(cl, BASE / "session.json", session_id)
my_id = str(cl.user_id)
print(f"logged in as: {cl.account_info().username}", flush=True)

# ── Load skip set ─────────────────────────────────────────────────────────────
# Handles already present in the DM CSV are never added as engagers again.
existing_handles = set()
if DM_CSV.exists():
    # newline="" is the csv-module recommended way to open CSV files.
    with open(DM_CSV, encoding="utf-8", newline="") as f:
        for row in csv.DictReader(f):
            # DictReader fills fields missing from a short row with None, so
            # row.get("handle", "") can still return None; guard before .lower().
            h = (row.get("handle") or "").lower().strip()
            if h:
                existing_handles.add(h)
# Union of: bucket handles from load_skip_handles(), CSV handles, and self.
skip_handles = load_skip_handles() | existing_handles | {SELF_HANDLE.lower()}
print(f"skip set: {len(skip_handles)} handles (existing CSV + buckets + self)", flush=True)

# ── Whitelist mode: if engager_posts.json has entries, scrape ONLY those ─────
_wl_path = Path(__file__).parent / "engager_posts.json"
_whitelist = []
if _wl_path.exists():
    try:
        _loaded_whitelist = json.loads(_wl_path.read_text())
    except (OSError, ValueError) as _e:
        # Falling back to pool mode changes which posts get scraped, so make
        # the fallback visible instead of swallowing the error.
        print(
            f"  warning: could not load {_wl_path.name}, whitelist ignored: {_e}",
            flush=True,
        )
    else:
        if isinstance(_loaded_whitelist, (list, dict)):
            # Iterating a dict yields its keys, same as the post-code loop would see.
            _whitelist = list(_loaded_whitelist)
        else:
            # A bare string/number would break len() or iterate characters as codes.
            print(
                f"  warning: {_wl_path.name} must contain a JSON array of post codes, "
                f"got {type(_loaded_whitelist).__name__}; whitelist ignored",
                flush=True,
            )
WHITELIST_MODE = bool(_whitelist)

def engagement(m):
    """Score a media object as plays + 5 * likes + 10 * comments.

    ``play_count`` may be absent from the object; any counter that is missing
    or None contributes 0.
    """
    plays = getattr(m, "play_count", None) or 0
    likes = m.like_count or 0
    comments = m.comment_count or 0
    return plays + 5 * likes + 10 * comments

# Posts to scrape, keyed by media pk (str) so the same post is never added twice.
medias_dict = {}
if WHITELIST_MODE:
    print(f"whitelist mode: {len(_whitelist)} post(s) only — bypassing pool/evergreen", flush=True)
    for _code in _whitelist:
        try:
            _pk = cl.media_pk_from_code(_code)
            _m = cl.media_info(_pk)
            medias_dict[str(_m.pk)] = _m
            print(f"  whitelist fetched: {_code}", flush=True)
        except Exception as _e:
            # Best-effort: one bad code must not stop the remaining fetches.
            print(f"  whitelist fetch err for {_code}: {_e}", flush=True)
else:
    POOL = 20
    print(f"fetching top {POOL} posts to pick evergreen #1...", flush=True)
    pool = cl.user_medias(my_id, amount=POOL)
    print(f"got {len(pool)} posts in pool", flush=True)
    # "Evergreen" = highest engagement score within the pool.
    top = max(pool, key=engagement) if pool else None
    recent = pool[:N_POSTS]
    medias_dict = {str(m.pk): m for m in recent}
    if top and str(top.pk) not in medias_dict:
        medias_dict[str(top.pk)] = top
    # Also pull in every hand-curated override post that is not selected yet.
    # The set of selected codes is built once and kept in sync, rather than
    # being rebuilt on every iteration.
    _selected_codes = {m.code for m in medias_dict.values()}
    for _code in POST_OVERRIDES:
        if _code in _selected_codes:
            continue
        try:
            _pk = cl.media_pk_from_code(_code)
            _m = cl.media_info(_pk)
            medias_dict[str(_m.pk)] = _m
            _selected_codes.add(_m.code)
            print(f"  + override post fetched: {_code}", flush=True)
        except Exception as _e:
            print(f"  override fetch err for {_code}: {_e}", flush=True)

medias = list(medias_dict.values())
print(f"final medias count: {len(medias)}", flush=True)
for m in medias:
    # Single quotes inside the replacement field: reusing the outer quote
    # character in an f-string is a SyntaxError before Python 3.12.
    print(f"  {m.code}: likes={m.like_count} comments={m.comment_count} plays={getattr(m, 'play_count', None)}", flush=True)

# ── Generate descriptors via claude CLI ───────────────────────────────────────
def gen_descriptor(caption: str) -> str:
    """Generate a short post descriptor from a caption via the claude CLI.

    The descriptor is meant to fit "Thanks so much for the love on {X}"; the
    prompt asks the model for 4-8 words.

    Args:
        caption: Post caption text. Only the first 1200 characters are sent.

    Returns:
        The cleaned first line of the CLI output, or "your recent post" when
        the caption is empty, the CLI cannot be run, times out, exits with a
        non-zero status, or produces no usable text.
    """
    fallback = "your recent post"
    if not caption:
        return fallback
    prompt = (
        "describe this Instagram post in 4-8 words. it fits in this sentence: "
        "\"Thanks so much for the love on our X\"\n\n"
        "examples of good descriptors (use this exact tone, no quotes around output):\n"
        "- neurogenic training reel with Floki\n"
        "- pandiculation tutorial\n"
        "- shaking medicine teacher training launch\n"
        "- somatic healing process post\n"
        "- step-by-step release guide\n"
        "- nervous system regulation reel\n\n"
        "post caption:\n"
        f"{caption[:1200]}\n\n"
        "output the descriptor only, no preamble, no quotes, no leading dash, no leading \"your\" or \"the\". "
        "start with the topic noun. no em-dashes."
    )
    try:
        r = subprocess.run([CLAUDE_BIN, "-p", prompt],
                           capture_output=True, text=True, timeout=45, env=ENV_NO_KEY)
    except (OSError, ValueError, subprocess.SubprocessError):
        # OSError: binary missing / not executable; SubprocessError: timeout;
        # ValueError: un-passable argument (e.g. NUL byte) or undecodable output.
        return fallback
    if r.returncode != 0:
        # A failed CLI run may still print something; don't use it as a descriptor.
        return fallback
    lines = r.stdout.strip().splitlines()
    text = lines[0].strip() if lines else ""
    # The prompt's examples are bulleted, so drop a leading dash the model may echo.
    text = text.lstrip("- ")
    # strip quotes if any
    text = text.strip('"\'')
    # strip em-dashes defensively, without leaving a space before the comma
    text = text.replace(" —", ",").replace("—", ",")
    # collapse every whitespace run to a single space
    text = " ".join(text.split())
    return text if text else fallback

# One record per selected post: identifiers, a truncated caption, and the
# descriptor/template the outreach step will use.
posts_data = []
for m in medias:
    _override = POST_OVERRIDES.get(m.code)
    _from_override = True
    if isinstance(_override, dict):
        # "or ''" also covers a JSON null, which .get() would return as None.
        descriptor = _override.get("descriptor") or ""
        custom_template = _override.get("template") or ""
    elif isinstance(_override, str):
        descriptor = _override
        custom_template = ""
    else:
        # No usable override for this post: generate one from the caption.
        _from_override = False
        descriptor = gen_descriptor(m.caption_text or "")
        custom_template = ""
    # Announce an override only when one was actually applied; previously this
    # line also fired for claude-generated descriptors.
    if _from_override and descriptor:
        print(f"  override for {m.code}: {descriptor}", flush=True)
    posts_data.append({
        "id":              str(m.pk),
        "code":            m.code,
        "url":             f"https://www.instagram.com/p/{m.code}/",
        "caption":         (m.caption_text or "")[:300],
        "descriptor":      descriptor,
        "custom_template": custom_template,
        "ts":              m.taken_at.strftime("%Y-%m-%d %H:%M") if m.taken_at else "",
        "type":            m.media_type,  # 1=photo, 2=video, 8=carousel
    })
    print(f"  post {m.code}: descriptor='{descriptor}'", flush=True)

# ── Collect engagers from each post ───────────────────────────────────────────
# lowercased @handle -> {handle, name, post_id, action, ts} (+ "comment" for commenters)
engagers_by_handle = {}

for post in posts_data:
    pid = post["id"]
    print(f"\nfetching engagers for post {post['code']}...", flush=True)

    # Likers: first sighting wins; a handle already recorded is left untouched.
    try:
        likers = cl.media_likers(pid)
        likers = likers[:LIKERS_LIMIT]
        print(f"  {len(likers)} likers", flush=True)
        for u in likers:
            h = f"@{u.username}".lower()
            if h in skip_handles or h in engagers_by_handle:
                continue
            engagers_by_handle[h] = {
                "handle":   f"@{u.username}",
                "name":     u.full_name or u.username,
                "post_id":  pid,
                "action":   "liked",
                "ts":       post["ts"],
            }
    except Exception as e:
        # Best-effort: keep going with comments and the remaining posts.
        print(f"  likers err: {e}", flush=True)

    # Commenters
    try:
        comments = cl.media_comments(pid, amount=COMMENTS_LIMIT)
        print(f"  {len(comments)} comments", flush=True)
        for c in comments:
            u = c.user
            h = f"@{u.username}".lower()
            if h in skip_handles:
                continue
            # The timestamp attribute can exist but be None; hasattr() alone
            # would then crash on .strftime and drop the rest of this post's
            # comments. Fall back to the post timestamp instead.
            _created = getattr(c, "created_at_utc", None)
            # commenter takes precedence over liker (richer signal)
            engagers_by_handle[h] = {
                "handle":   f"@{u.username}",
                "name":     u.full_name or u.username,
                "post_id":  pid,
                "action":   "commented",
                "comment":  (c.text or "")[:200],
                "ts":       _created.strftime("%Y-%m-%d %H:%M") if _created else post["ts"],
            }
    except Exception as e:
        print(f"  comments err: {e}", flush=True)

    time.sleep(2)  # gentle pacing between posts

# ── Carry over thread_ids from prior engagers.json so we don't re-resolve ──
_engager_list = list(engagers_by_handle.values())
_tid_cache = {}  # lowercased @handle -> thread_id from the previous run
if OUT.exists():
    try:
        _prev = json.loads(OUT.read_text(encoding="utf-8"))
    except (OSError, ValueError) as _e:
        # Best-effort cache: an unreadable prior file only costs re-resolution,
        # but report it rather than failing silently.
        print(f"  warning: could not read prior {OUT.name}: {_e}", flush=True)
        _prev = {}
    _prev_engagers = _prev.get("engagers") if isinstance(_prev, dict) else None
    if not isinstance(_prev_engagers, list):
        _prev_engagers = []
    for _p in _prev_engagers:
        # Skip malformed entries individually so one bad record does not
        # discard the whole cache.
        if not isinstance(_p, dict):
            continue
        _ph = str(_p.get("handle") or "").lower().strip()
        _ptid = _p.get("thread_id") or ""
        if _ph and _ptid:
            _tid_cache[_ph] = _ptid
_carried = 0
for _eng in _engager_list:
    _h = _eng.get("handle", "").lower().strip()
    if _h in _tid_cache and not _eng.get("thread_id"):
        _eng["thread_id"] = _tid_cache[_h]
        _carried += 1
print(f"carried {_carried} thread_ids from prior engagers.json", flush=True)

# ── Resolve thread_ids for engagers that don't have one yet ──────────────────
# Each remaining engager costs one user lookup plus one direct-thread lookup;
# a failure is logged and that engager is simply left without a thread_id.
_to_resolve = [_row for _row in _engager_list if not _row.get("thread_id")]
print(f"resolving thread_ids for {len(_to_resolve)} new engagers...", flush=True)
_resolved_e = 0
for _eng in _to_resolve:
    _h = _eng.get("handle", "").lstrip("@")
    if _h:
        try:
            _info = cl.user_info_by_username_v1(_h)
            _lookup = cl.direct_thread_by_participants([int(_info.pk)])
            # Only a dict response is inspected; anything else yields no thread_id.
            _thread = (_lookup.get("thread") or {}) if isinstance(_lookup, dict) else {}
            _tid = str(_thread.get("thread_id") or "")
            if _tid:
                _eng["thread_id"] = _tid
                _resolved_e += 1
        except Exception as _e:
            print(f"  thread_id err for {_h}: {_e}", flush=True)
print(f"  resolved {_resolved_e}/{len(_to_resolve)} new thread_ids", flush=True)

# ── Save ──────────────────────────────────────────────────────────────────────
out = {
    "scraped_at": datetime.now().strftime("%Y-%m-%d %H:%M"),  # local time, naive
    "posts":      posts_data,
    "engagers":   _engager_list,
}
# ensure_ascii=False writes names/comments as raw non-ASCII text, so pin the
# file encoding; the platform default may be unable to encode them.
OUT.write_text(json.dumps(out, indent=2, ensure_ascii=False), encoding="utf-8")
print(f"\ndone. {len(out['engagers'])} unique engagers saved to {OUT.name}", flush=True)
