#!/usr/bin/env python3
"""Scrape likers + commenters from keith's last N posts.

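Posts scraped: the N_POSTS most recent posts, plus the highest-engagement post
in the fetched pool and any post listed in post_descriptors.json. When
engager_posts.json is non-empty, only the posts listed there are scraped.

For each post without a hand-curated descriptor, generates a short one via the
claude CLI for use in the prefab outreach template
"Thanks so much for the love on {descriptor}".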

Filters out: existing CSV handles, EXCLUDED/PARTNER bucket, self.

Output: engagers.json
"""
import csv
import json
import os
import subprocess
import time
from datetime import datetime
from pathlib import Path
from urllib.parse import unquote

import rookiepy
from instagrapi import Client
from instagrapi.exceptions import ClientError

from _buckets_lib import load_skip_handles

# ── Config ────────────────────────────────────────────────────────────────────
N_POSTS         = 3      # how many of the most recent posts are always scraped
LIKERS_LIMIT    = 250    # cap per post (IG can return thousands)
COMMENTS_LIMIT  = 100    # max comments requested per post
DELAY_MIN       = 1.5    # lower bound handed to the instagrapi client's delay_range
DELAY_MAX       = 2.5    # upper bound handed to the instagrapi client's delay_range
SELF_HANDLE     = "@shakingmedicine"   # always part of the skip set
BASE            = Path(__file__).parent
DM_CSV          = BASE / "follow_up_FULL.csv"   # its "handle" column feeds the skip set
OUT             = BASE / "engagers.json"
CLAUDE_BIN      = "/Users/daichi/.local/bin/claude"
# Copy of the current environment without ANTHROPIC_API_KEY, used for the claude subprocess.
ENV_NO_KEY      = {k: v for k, v in os.environ.items() if k != "ANTHROPIC_API_KEY"}

# Hand-curated post descriptor overrides (post code -> phrase). Used instead of claude.
_overrides_path = Path(__file__).parent / "post_descriptors.json"
POST_OVERRIDES = {}
if _overrides_path.exists():
    try:
        _loaded_overrides = json.loads(_overrides_path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as _overrides_err:
        # Best effort: keep running without overrides, but say so — a typo in the
        # JSON would otherwise silently discard every hand-curated descriptor.
        print(f"warning: could not load {_overrides_path.name}: {_overrides_err}", flush=True)
    else:
        if isinstance(_loaded_overrides, dict):
            POST_OVERRIDES = _loaded_overrides
        else:
            # Later code looks descriptors up by post code, which needs a mapping.
            print(f"warning: {_overrides_path.name} is not a JSON object; ignoring it", flush=True)
print(f"loaded {len(POST_OVERRIDES)} hand-curated post descriptors", flush=True)

# ── Login ─────────────────────────────────────────────────────────────────────
# Reuses the instagram.com session cookie from the local Chrome profile rather
# than logging in with credentials.
print("logging in via chrome session...", flush=True)
cl = Client()
cl.delay_range = [DELAY_MIN, DELAY_MAX]
cookies    = rookiepy.chrome(domains=["instagram.com"])
_cookie_values = {c["name"]: c["value"] for c in cookies}
if not _cookie_values.get("sessionid"):
    # Without this check the lookup below fails with a bare KeyError: 'sessionid'.
    raise SystemExit(
        "no instagram.com 'sessionid' cookie found in Chrome; "
        "log in to Instagram in Chrome and run again"
    )
session_id = unquote(_cookie_values["sessionid"])
cl.login_by_sessionid(session_id)
my_id = str(cl.user_id)
print(f"logged in as: {cl.account_info().username}", flush=True)

# ── Load skip set ─────────────────────────────────────────────────────────────
# Handles already present in the DM CSV, plus the bucket handles and our own
# account, are never emitted as engagers.
existing_handles = set()
if DM_CSV.exists():
    # newline="" is what the csv module expects for files it parses itself.
    with open(DM_CSV, encoding="utf-8", newline="") as f:
        for row in csv.DictReader(f):
            # DictReader fills columns missing from a short row with None, so
            # .get("handle", "") alone can still hand back None.
            h = (row.get("handle") or "").lower().strip()
            if not h:
                continue
            # Engagers are matched below as "@username"; store CSV handles in
            # the same form so an entry written without "@" still matches.
            existing_handles.add(h if h.startswith("@") else f"@{h}")
skip_handles = load_skip_handles() | existing_handles | {SELF_HANDLE.lower()}
print(f"skip set: {len(skip_handles)} handles (existing CSV + buckets + self)", flush=True)

# ── Whitelist mode: if engager_posts.json has entries, scrape ONLY those ─────
_wl_path = Path(__file__).parent / "engager_posts.json"
_whitelist = []
if _wl_path.exists():
    try:
        _loaded_whitelist = json.loads(_wl_path.read_text(encoding="utf-8"))
    except (OSError, ValueError) as _wl_err:
        # Best effort, but loud: an unreadable whitelist means the run falls
        # back to the normal post selection, which the operator should know.
        print(f"warning: could not load {_wl_path.name}: {_wl_err}; whitelist ignored", flush=True)
    else:
        if isinstance(_loaded_whitelist, str):
            # A bare string would otherwise be iterated character by character.
            _whitelist = [_loaded_whitelist]
        elif isinstance(_loaded_whitelist, (list, dict)):
            # Iterating a dict yields its keys, as before.
            _whitelist = _loaded_whitelist
        else:
            print(f"warning: {_wl_path.name} must hold a JSON list of post codes; whitelist ignored", flush=True)
WHITELIST_MODE = bool(_whitelist)

def engagement(m):
    """Score a post as plays + 5 x likes + 10 x comments.

    Any counter that is ``None`` counts as zero, and ``play_count`` may be
    missing from *m* altogether.
    """
    plays = getattr(m, "play_count", None) or 0
    weighted_likes = (m.like_count or 0) * 5
    weighted_comments = (m.comment_count or 0) * 10
    return plays + weighted_likes + weighted_comments

# Select the posts to scrape. Entries are keyed by str(pk) so a post reached by
# more than one route (recent, top engagement, override) is only kept once.
medias_dict = {}
if WHITELIST_MODE:
    print(f"whitelist mode: {len(_whitelist)} post(s) only — bypassing pool/evergreen", flush=True)
    for _code in _whitelist:
        try:
            _pk = cl.media_pk_from_code(_code)
            _m = cl.media_info(_pk)
            medias_dict[str(_m.pk)] = _m
            print(f"  whitelist fetched: {_code}", flush=True)
        except Exception as _e:
            # One bad code must not abort the rest of the whitelist.
            print(f"  whitelist fetch err for {_code}: {_e}", flush=True)
else:
    POOL = 20
    print(f"fetching top {POOL} posts to pick evergreen #1...", flush=True)
    pool = cl.user_medias(my_id, amount=POOL)
    print(f"got {len(pool)} posts in pool", flush=True)
    # "Evergreen" = the highest-scoring post in the pool, added on top of the
    # N_POSTS entries at the front of the pool.
    top = max(pool, key=engagement) if pool else None
    recent = pool[:N_POSTS]
    medias_dict = {str(m.pk): m for m in recent}
    if top and str(top.pk) not in medias_dict:
        medias_dict[str(top.pk)] = top
    # Also pull in every hand-curated post that is not selected yet. The set of
    # known codes is built once and kept in step instead of being rebuilt on
    # every iteration.
    _known_codes = {m.code for m in medias_dict.values()}
    for _code in POST_OVERRIDES:
        if _code in _known_codes:
            continue
        try:
            _pk = cl.media_pk_from_code(_code)
            _m = cl.media_info(_pk)
            medias_dict[str(_m.pk)] = _m
            _known_codes.add(_m.code)
            print(f"  + override post fetched: {_code}", flush=True)
        except Exception as _e:
            print(f"  override fetch err for {_code}: {_e}", flush=True)

medias = list(medias_dict.values())
print(f"final medias count: {len(medias)}", flush=True)
for m in medias:
    # Looked up outside the f-string: reusing the enclosing quote character
    # inside a replacement field is a SyntaxError before Python 3.12.
    _plays = getattr(m, "play_count", None)
    print(f"  {m.code}: likes={m.like_count} comments={m.comment_count} plays={_plays}", flush=True)

# ── Generate descriptors via claude CLI ───────────────────────────────────────
def _clean_descriptor(raw: str) -> str:
    """Reduce raw CLI output to one tidy descriptor line ("" if nothing usable)."""
    lines = raw.strip().splitlines()
    if not lines:
        return ""
    # Only the first line is used; strip wrapping quotes and, because the
    # prompt's examples are dash-prefixed, any list marker the reply copied.
    text = lines[0].strip().strip('"\'').lstrip("-•* ").strip('"\'')
    # Em-dashes are unwanted in the outreach copy: turn each into ", " without
    # leaving a stray space in front of the comma.
    text = ", ".join(piece.strip() for piece in text.split("—") if piece.strip())
    # Collapse any run of whitespace to a single space.
    return " ".join(text.split())


def gen_descriptor(caption: str) -> str:
    """Return a short phrase that fits 'Thanks so much for the love on {X}'.

    Asks the claude CLI (45 s timeout, run with ENV_NO_KEY) for a 4-8 word
    description of *caption*; only the first 1200 characters of the caption
    are sent.

    Args:
        caption: The post caption; may be empty.

    Returns:
        The cleaned first line of the CLI's reply, or "your recent post" when
        the caption is empty, the CLI cannot be run, times out, exits with a
        non-zero status, or replies with nothing usable.
    """
    fallback = "your recent post"
    if not caption:
        return fallback
    prompt = (
        "describe this Instagram post in 4-8 words. it fits in this sentence: "
        "\"Thanks so much for the love on our X\"\n\n"
        "examples of good descriptors (use this exact tone, no quotes around output):\n"
        "- neurogenic training reel with Floki\n"
        "- pandiculation tutorial\n"
        "- shaking medicine teacher training launch\n"
        "- somatic healing process post\n"
        "- step-by-step release guide\n"
        "- nervous system regulation reel\n\n"
        "post caption:\n"
        f"{caption[:1200]}\n\n"
        "output the descriptor only, no preamble, no quotes, no leading dash, no leading \"your\" or \"the\". "
        "start with the topic noun. no em-dashes."
    )
    try:
        r = subprocess.run([CLAUDE_BIN, "-p", prompt],
                           capture_output=True, text=True, timeout=45, env=ENV_NO_KEY)
    except Exception as exc:
        # Deliberately best effort (missing binary, timeout, ...): one failed
        # descriptor must not stop the scrape, but leave a trace of the reason.
        print(f"  descriptor err: {exc}", flush=True)
        return fallback
    if r.returncode != 0:
        # On failure stdout may carry an error message; never use it as a descriptor.
        print(f"  descriptor err: claude exited with status {r.returncode}", flush=True)
        return fallback
    return _clean_descriptor(r.stdout) or fallback

# Build one JSON-serialisable record per selected post; a hand-curated
# descriptor wins over a generated one.
posts_data = []
for m in medias:
    _caption = m.caption_text or ""
    if m.code not in POST_OVERRIDES:
        descriptor = gen_descriptor(_caption)
    else:
        descriptor = POST_OVERRIDES[m.code]
        print(f"  override for {m.code}: {descriptor}", flush=True)
    # Posts without a timestamp get an empty string.
    _taken = ""
    if m.taken_at:
        _taken = m.taken_at.strftime("%Y-%m-%d %H:%M")
    _record = dict(
        id=str(m.pk),
        code=m.code,
        url=f"https://www.instagram.com/p/{m.code}/",
        caption=_caption[:300],
        descriptor=descriptor,
        ts=_taken,
        type=m.media_type,  # 1=photo, 2=video, 8=carousel
    )
    posts_data.append(_record)
    print(f"  post {m.code}: descriptor='{descriptor}'", flush=True)

# ── Collect engagers from each post ───────────────────────────────────────────
# Lower-cased "@handle" -> engager record. A liker is recorded once (first
# post wins); a comment always replaces whatever record the handle had.
engagers_by_handle = {}

for post in posts_data:
    pid = post["id"]
    print(f"\nfetching engagers for post {post['code']}...", flush=True)

    # Likers
    try:
        likers = cl.media_likers(pid)
        likers = likers[:LIKERS_LIMIT]
        print(f"  {len(likers)} likers", flush=True)
        for u in likers:
            h = f"@{u.username}".lower()
            if h in skip_handles or h in engagers_by_handle: continue
            engagers_by_handle[h] = {
                "handle":   f"@{u.username}",
                "name":     u.full_name or u.username,
                "post_id":  pid,
                "action":   "liked",
                "ts":       post["ts"],  # likes carry no timestamp here; use the post's
            }
    except Exception as e:
        # Best effort: a failure on one post must not lose the other posts.
        print(f"  likers err: {e}", flush=True)

    # Commenters
    try:
        comments = cl.media_comments(pid, amount=COMMENTS_LIMIT)
        print(f"  {len(comments)} comments", flush=True)
        for c in comments:
            u = c.user
            h = f"@{u.username}".lower()
            if h in skip_handles: continue
            # The attribute can be present but None; hasattr() alone would then
            # let .strftime raise and drop every remaining comment of this post.
            _created = getattr(c, "created_at_utc", None)
            # commenter takes precedence over liker (richer signal)
            engagers_by_handle[h] = {
                "handle":   f"@{u.username}",
                "name":     u.full_name or u.username,
                "post_id":  pid,
                "action":   "commented",
                "comment":  (c.text or "")[:200],
                "ts":       _created.strftime("%Y-%m-%d %H:%M") if _created else post["ts"],
            }
    except Exception as e:
        print(f"  comments err: {e}", flush=True)

    time.sleep(2)  # gentle pacing between posts

# ── Save ──────────────────────────────────────────────────────────────────────
# Single JSON document: scrape time (local, naive), the post records and the
# de-duplicated engager records.
out = {
    "scraped_at": datetime.now().strftime("%Y-%m-%d %H:%M"),
    "posts":      posts_data,
    "engagers":   list(engagers_by_handle.values()),
}
# ensure_ascii=False leaves non-ASCII characters (names, captions) unescaped,
# so pin the file encoding instead of relying on the locale default.
OUT.write_text(json.dumps(out, indent=2, ensure_ascii=False), encoding="utf-8")
print(f"\ndone. {len(out['engagers'])} unique engagers saved to {OUT.name}", flush=True)
