"""
Build viewer-data.js from conversation.json.

  - filters out tapbacks (Liked/Loved/Emphasized/etc.)
  - converts UTC timestamps to a local timezone (default America/New_York)
  - buckets messages by local date
  - computes stats (busiest day, top emojis, streaks, hour/weekday histograms)

Usage:
    python3 build_viewer.py
    python3 build_viewer.py --src ./out/conversation.json --dst ./viewer/viewer-data.js
    python3 build_viewer.py --tz America/Los_Angeles
"""

import argparse
import json
import re
from collections import Counter, defaultdict
from datetime import datetime, timedelta
from pathlib import Path
from zoneinfo import ZoneInfo

# Plain-text form of iMessage tapbacks (reactions), e.g. 'Liked "msg"' or
# 'Removed a heart from "msg"'.  Matched messages are dropped rather than
# counted.  Only fires when the echoed text opens with a curly or straight
# double quote, which anchors the match to real tapback lines.
TAPBACK_RE = re.compile(
    r'^(Liked|Loved|Disliked|Emphasized|Questioned|Laughed at|Removed a [a-z]+ from|Removed an? [a-z]+ from) [“"]',
)
# Rough emoji matcher: the main pictograph planes (U+1F300–U+1FAFF), the
# misc-symbols/dingbats range (U+2600–U+27BF), and regional-indicator
# letters (flag halves).  NOTE(review): matches per code point, so ZWJ
# sequences and flag pairs count as multiple "emojis".
EMOJI_RE = re.compile(
    "[\U0001F300-\U0001FAFF\U00002600-\U000027BF\U0001F1E6-\U0001F1FF]"
)
# Case-insensitive whole-phrase match used for the "love yous" stat.
LOVE_RE = re.compile(r"\bi love you\b", re.I)

# Common filler words excluded from the top-words stat.  Built by splitting
# one long space-separated string; duplicate tokens in the string (e.g.
# "not", "have", "out") are harmless because this is a set.
STOPWORDS = set(
    "the and you for that this with have what just are not but get can will all out one when "
    "been was your they not how its from then about there here some have had has like got "
    "because know think need right back over really very out where with our them their would "
    "could should make made way down into too much more than only well even some such still "
    "being were our who didn doesn isn aren wasn weren haven hasn hadn don doesnt isnt arent "
    "wasnt werent havent hasnt hadnt dont".split()
)


def _longest_streak(dates: list[str]) -> tuple[str, str, int]:
    """Find the longest run of consecutive calendar days.

    ``dates`` is a non-empty, ascending list of ISO date strings
    (``YYYY-MM-DD``).  Returns ``(start_iso, end_iso, length_in_days)``;
    when several streaks tie, the earliest one wins.
    """
    longest = current = 1
    longest_end = dates[0]
    for prev, now in zip(dates, dates[1:]):
        # datetime subtraction handles month/year boundaries correctly.
        if (datetime.fromisoformat(now) - datetime.fromisoformat(prev)).days == 1:
            current += 1
            if current > longest:
                longest = current
                longest_end = now
        else:
            current = 1
    start = (datetime.fromisoformat(longest_end) -
             timedelta(days=longest - 1)).date().isoformat()
    return start, longest_end, longest


def main() -> None:
    """Read conversation.json, aggregate it, and write viewer-data.js.

    Drops tapbacks and empty messages, converts each timestamp to the
    requested timezone, buckets by local date, computes summary stats,
    and writes ``window.DATA = {...};`` as UTF-8.  Exits with a message
    (SystemExit) if no messages survive filtering.
    """
    ap = argparse.ArgumentParser(description="Build viewer-data.js from conversation.json.")
    ap.add_argument("--src", default="./out/conversation.json")
    ap.add_argument("--dst", default="./viewer/viewer-data.js")
    ap.add_argument("--tz", default="America/New_York",
                    help="Timezone for bucketing (default: America/New_York)")
    args = ap.parse_args()

    src = Path(args.src).expanduser()
    dst = Path(args.dst).expanduser()
    tz = ZoneInfo(args.tz)

    # Explicit UTF-8: the platform default encoding (e.g. cp1252 on Windows)
    # cannot round-trip emoji-laden message text.
    raw = json.loads(src.read_text(encoding="utf-8"))
    print(f"Loaded {len(raw):,} raw messages from {src}.")

    by_day = defaultdict(list)  # local ISO date -> list of compact message dicts
    day_meta = defaultdict(
        lambda: {"count": 0, "sent": 0, "received": 0,
                 "attachments": 0, "emojis": 0, "love_yous": 0}
    )
    sent = received = photos = emoji_total = love_total = 0
    hour_hist = Counter()
    weekday_hist = Counter()
    emoji_counter = Counter()
    word_counter = Counter()
    tapbacks_dropped = 0
    empties_dropped = 0

    for m in raw:
        text = m.get("text") or ""
        if text and TAPBACK_RE.match(text):
            tapbacks_dropped += 1
            continue
        # Tolerate a missing "attachments" key the same way a missing
        # "text" is tolerated above (the old code hard-indexed it).
        atts = m.get("attachments") or 0
        if not text and not atts:
            empties_dropped += 1
            continue

        # Timestamps are ISO-8601 with offset; bucket by *local* date.
        dt = datetime.fromisoformat(m["date"]).astimezone(tz)
        date = dt.date().isoformat()
        time_str = dt.strftime("%H:%M")

        from_me = m["from_me"]
        emojis_found = EMOJI_RE.findall(text)  # scan once, reuse below
        love_here = 1 if LOVE_RE.search(text) else 0

        by_day[date].append({"t": time_str, "me": from_me, "x": text, "a": atts})
        meta = day_meta[date]
        meta["count"] += 1
        meta["sent" if from_me else "received"] += 1
        meta["attachments"] += atts
        meta["emojis"] += len(emojis_found)
        meta["love_yous"] += love_here

        if from_me:
            sent += 1
        else:
            received += 1
        photos += atts
        emoji_total += len(emojis_found)
        love_total += love_here
        hour_hist[dt.hour] += 1
        weekday_hist[dt.weekday()] += 1
        emoji_counter.update(emojis_found)
        word_counter.update(re.findall(r"[A-Za-z']{3,}", text.lower()))

    total = sent + received
    print(f"After filtering: {total:,} messages "
          f"({tapbacks_dropped:,} tapbacks, {empties_dropped:,} empty dropped)")
    if total == 0:
        # Without this guard the code below raises IndexError on the empty
        # date list and ZeroDivisionError in the percentage stats.
        raise SystemExit("No messages left after filtering; nothing to write.")

    sorted_dates = sorted(by_day)
    longest_start, longest_end, longest = _longest_streak(sorted_dates)

    busiest = max(day_meta.items(), key=lambda kv: kv[1]["count"])

    # Walk the full frequency ranking so stopword-heavy traffic cannot
    # starve the list (the old top-120 prefilter could yield < 30 words).
    top_words = []
    for w, c in word_counter.most_common():
        if w not in STOPWORDS:
            top_words.append((w, c))
            if len(top_words) == 30:
                break

    span_days = (datetime.fromisoformat(sorted_dates[-1]) -
                 datetime.fromisoformat(sorted_dates[0])).days + 1

    stats = {
        "total": total,
        "sent": sent,
        "received": received,
        "photos": photos,
        "emojis": emoji_total,
        "love_yous": love_total,
        "first_date": sorted_dates[0],
        "last_date": sorted_dates[-1],
        "span_days": span_days,
        "active_days": len(sorted_dates),
        "years": round(span_days / 365.25, 2),
        "busiest_day": {"date": busiest[0], "count": busiest[1]["count"]},
        "longest_streak": {"start": longest_start, "end": longest_end, "days": longest},
        "avg_per_day": round(total / span_days, 1),
        "share_me": round(sent / total * 100, 1),
        "share_them": round(received / total * 100, 1),
        "hour_hist": [hour_hist[h] for h in range(24)],
        "weekday_hist": [weekday_hist[d] for d in range(7)],
        "top_emojis": emoji_counter.most_common(15),
        "top_words": top_words,
    }
    payload = {"stats": stats, "day_meta": day_meta, "messages": by_day}

    dst.parent.mkdir(parents=True, exist_ok=True)
    # UTF-8 output to match ensure_ascii=False (raw emoji in the JSON).
    dst.write_text("window.DATA = " + json.dumps(payload, ensure_ascii=False) + ";",
                   encoding="utf-8")
    print(f"Wrote {dst}  ({dst.stat().st_size / 1e6:.1f} MB)")


if __name__ == "__main__":  # run only when executed as a script, not on import
    main()
