#!/usr/bin/env python3
"""
Search the USR Total Control mailing list archive.

Usage:
    ./search.py <query>                     # Full-text search
    ./search.py <query> --from <email>      # Filter by sender
    ./search.py <query> --year <year>       # Filter by year
    ./search.py <query> --subject-only      # Search subjects only
    ./search.py --list-threads <query>      # Show matching threads (grouped by subject)
    ./search.py --top-posters              # Show most active posters
    ./search.py --top-posters --year 1998  # Most active posters in a year
    ./search.py --stats                    # Archive statistics

Examples:
    ./search.py ascend
    ./search.py "v.90 firmware"
    ./search.py radius --year 1998
    ./search.py "K56flex" --subject-only
    ./search.py --list-threads "ISDN PRI"

FTS5 query syntax:
    ascend                  # Messages containing "ascend"
    ascend max              # Messages containing "ascend" AND "max"
    "ascend max"            # Messages containing the exact phrase "ascend max"
    ascend OR livingston    # Messages containing either term
    ascend NOT pipeline     # Messages with "ascend" but not "pipeline"
"""

import argparse
import os
import re
import sqlite3
import sys
import textwrap

# SQLite needs a local filesystem (archive is on CIFS mount)
DB_PATH = os.path.expanduser("~/.local/share/usr-tc-archive/usr-tc-archive.db")
TERM_WIDTH = min(os.get_terminal_size().columns, 120) if sys.stdout.isatty() else 120


def highlight(text, terms):
    """Highlight search terms in text for terminal output."""
    if not terms or not sys.stdout.isatty():
        return text
    for term in terms:
        if len(term) < 2:
            continue
        pattern = re.compile(re.escape(term), re.IGNORECASE)
        text = pattern.sub(lambda m: f"\033[1;33m{m.group()}\033[0m", text)
    return text


def extract_terms(query):
    """Extract individual search terms from an FTS query for highlighting."""
    # Remove FTS operators
    cleaned = re.sub(r'\b(AND|OR|NOT|NEAR)\b', ' ', query)
    # Remove quotes but keep the words
    cleaned = cleaned.replace('"', '')
    terms = [t.strip() for t in cleaned.split() if len(t.strip()) >= 2]
    return terms


def snippet(body, terms, max_lines=8):
    """Extract a relevant snippet from the message body around matching terms."""
    lines = body.split("\n")

    if not terms:
        return "\n".join(lines[:max_lines])

    # Find the first line containing a search term
    best_line = 0
    for i, line in enumerate(lines):
        for term in terms:
            if term.lower() in line.lower():
                best_line = i
                break
        if best_line > 0:
            break

    # Show context around the best match
    start = max(0, best_line - 1)
    end = min(len(lines), start + max_lines)

    result = lines[start:end]
    if start > 0:
        result = ["..."] + result
    if end < len(lines):
        result = result + ["..."]

    return "\n".join(result)


def format_message(msg, terms=None, full=False):
    """Format a message for display."""
    sep = "=" * TERM_WIDTH
    date_str = msg["date_parsed"] or msg["date_raw"] or "unknown"
    from_str = msg["from_raw"] or "unknown"
    subject_str = msg["subject"] or "(no subject)"

    header = (
        f"{sep}\n"
        f"From:    {highlight(from_str, terms)}\n"
        f"Subject: {highlight(subject_str, terms)}\n"
        f"Date:    {date_str}\n"
        f"Source:  {msg['source_file']}\n"
        f"{'-' * TERM_WIDTH}"
    )

    if full:
        body = highlight(msg["body"], terms)
    else:
        body = highlight(snippet(msg["body"], terms), terms)

    return f"{header}\n{body}\n"


def search(conn, query, from_filter=None, year=None, year_end=None,
           subject_only=False, limit=25, offset=0, full=False):
    """Full-text search with optional filters."""
    terms = extract_terms(query)

    if subject_only:
        fts_query = f"subject:({query})"
    else:
        fts_query = query

    sql = """
        SELECT m.*, messages_fts.rank
        FROM messages_fts
        JOIN messages m ON m.id = messages_fts.rowid
        WHERE messages_fts MATCH ?
    """
    params = [fts_query]

    if from_filter:
        sql += " AND m.from_email LIKE ?"
        params.append(f"%{from_filter}%")

    if year:
        sql += " AND m.archive_year = ?"
        params.append(year)

    if year_end:
        sql += " AND m.archive_year <= ?"
        params.append(year_end)

    # Count total matches
    count_sql = sql.replace("SELECT m.*, messages_fts.rank", "SELECT COUNT(*)")
    c = conn.cursor()
    c.execute(count_sql, params)
    total = c.fetchone()[0]

    # Get results ordered by relevance
    sql += " ORDER BY messages_fts.rank LIMIT ? OFFSET ?"
    params.extend([limit, offset])

    c.execute(sql, params)
    columns = [d[0] for d in c.description]
    results = [dict(zip(columns, row)) for row in c.fetchall()]

    return results, total, terms


def list_threads(conn, query, year=None, limit=30):
    """Show threads (grouped by subject) matching a query."""
    terms = extract_terms(query)

    sql = """
        SELECT m.subject,
               COUNT(*) as msg_count,
               MIN(m.date_parsed) as first_date,
               MAX(m.date_parsed) as last_date,
               GROUP_CONCAT(DISTINCT m.from_name) as participants
        FROM messages_fts
        JOIN messages m ON m.id = messages_fts.rowid
        WHERE messages_fts MATCH ?
    """
    params = [query]

    if year:
        sql += " AND m.archive_year = ?"
        params.append(year)

    # Normalize subjects by removing Re: and (usr-tc) prefixes for grouping
    sql += """
        GROUP BY REPLACE(REPLACE(REPLACE(LOWER(m.subject),
            're: ', ''), '(usr-tc) ', ''), 're:(usr-tc) ', '')
        ORDER BY msg_count DESC
        LIMIT ?
    """
    params.append(limit)

    c = conn.cursor()
    c.execute(sql, params)

    print(f"\nThreads matching '{query}':\n")
    print(f"{'Messages':>8}  {'First seen':<12} {'Subject'}")
    print(f"{'-'*8}  {'-'*12} {'-'*60}")

    for row in c.fetchall():
        subject = highlight(row[0] or "(no subject)", terms)
        participants = row[4] or ""
        # Truncate participant list
        if len(participants) > 50:
            participants = participants[:50] + "..."
        first = (row[2] or "")[:10]
        print(f"{row[1]:>8}  {first:<12} {subject}")
        if participants:
            print(f"{'':>8}  {'':>12} participants: {participants}")


def top_posters(conn, year=None, limit=30):
    """Show most active posters."""
    sql = "SELECT from_email, from_name, COUNT(*) as cnt FROM messages"
    params = []
    if year:
        sql += " WHERE archive_year = ?"
        params.append(year)
    sql += " GROUP BY from_email ORDER BY cnt DESC LIMIT ?"
    params.append(limit)

    c = conn.cursor()
    c.execute(sql, params)

    period = f" in {year}" if year else ""
    print(f"\nTop posters{period}:\n")
    print(f"{'Count':>6}  {'Email':<40} {'Name'}")
    print(f"{'-'*6}  {'-'*40} {'-'*30}")
    for row in c.fetchall():
        print(f"{row[2]:>6}  {row[0]:<40} {row[1] or ''}")


def show_stats(conn):
    """Show archive statistics."""
    c = conn.cursor()

    c.execute("SELECT COUNT(*) FROM messages")
    total = c.fetchone()[0]

    c.execute("SELECT COUNT(DISTINCT from_email) FROM messages")
    senders = c.fetchone()[0]

    c.execute("SELECT MIN(date_parsed), MAX(date_parsed) FROM messages WHERE date_parsed IS NOT NULL")
    date_range = c.fetchone()

    c.execute("SELECT MIN(archive_year), MAX(archive_year) FROM messages")
    year_range = c.fetchone()

    print(f"\nUSR Total Control Mailing List Archive")
    print(f"{'='*40}")
    print(f"Total messages: {total:,}")
    print(f"Unique senders: {senders:,}")
    print(f"Date range:     {date_range[0]} to {date_range[1]}")
    print(f"Year range:     {year_range[0]} to {year_range[1]}")

    print(f"\nMessages per year:")
    c.execute("""
        SELECT archive_year, COUNT(*) FROM messages
        GROUP BY archive_year ORDER BY archive_year
    """)
    for row in c.fetchall():
        bar = "#" * (row[1] // 50)
        print(f"  {row[0]}: {row[1]:>5}  {bar}")

    print(f"\nTop 15 subjects (normalized):")
    c.execute("""
        SELECT REPLACE(REPLACE(LOWER(subject), 're: ', ''), '(usr-tc) ', '') as subj,
               COUNT(*) as cnt
        FROM messages
        GROUP BY subj
        ORDER BY cnt DESC
        LIMIT 15
    """)
    for row in c.fetchall():
        print(f"  {row[1]:>5}  {row[0]}")


def main():
    parser = argparse.ArgumentParser(
        description="Search the USR Total Control mailing list archive",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=textwrap.dedent("""\
            FTS5 query examples:
              ascend                  Messages containing "ascend"
              ascend max              Messages containing both terms
              "ascend max"            Exact phrase match
              ascend OR livingston    Either term
              ascend NOT pipeline     "ascend" but not "pipeline"
        """)
    )
    parser.add_argument("query", nargs="*", help="Search terms (FTS5 syntax)")
    parser.add_argument("--from", dest="from_filter", help="Filter by sender email")
    parser.add_argument("--year", type=int, help="Filter by year")
    parser.add_argument("--subject-only", action="store_true", help="Search subjects only")
    parser.add_argument("--list-threads", action="store_true", help="Group results by thread")
    parser.add_argument("--top-posters", action="store_true", help="Show most active posters")
    parser.add_argument("--stats", action="store_true", help="Show archive statistics")
    parser.add_argument("--full", action="store_true", help="Show full message bodies")
    parser.add_argument("--limit", "-n", type=int, default=25, help="Max results (default 25)")
    parser.add_argument("--offset", type=int, default=0, help="Skip first N results")
    parser.add_argument("--id", type=int, help="Show a specific message by ID")

    args = parser.parse_args()

    if not os.path.exists(DB_PATH):
        print(f"Database not found at {DB_PATH}", file=sys.stderr)
        print("Run parse_archive.py first to build the index.", file=sys.stderr)
        sys.exit(1)

    conn = sqlite3.connect(DB_PATH)

    if args.stats:
        show_stats(conn)
        conn.close()
        return

    if args.top_posters:
        top_posters(conn, year=args.year)
        conn.close()
        return

    if args.id:
        c = conn.cursor()
        c.execute("SELECT * FROM messages WHERE id = ?", (args.id,))
        columns = [d[0] for d in c.description]
        row = c.fetchone()
        if row:
            msg = dict(zip(columns, row))
            print(format_message(msg, full=True))
        else:
            print(f"Message ID {args.id} not found.")
        conn.close()
        return

    query = " ".join(args.query)
    if not query:
        parser.print_help()
        conn.close()
        return

    if args.list_threads:
        list_threads(conn, query, year=args.year)
        conn.close()
        return

    results, total, terms = search(
        conn, query,
        from_filter=args.from_filter,
        year=args.year,
        subject_only=args.subject_only,
        limit=args.limit,
        offset=args.offset,
        full=args.full,
    )

    print(f"\n{total} results for '{query}'", end="")
    if args.from_filter:
        print(f" from '{args.from_filter}'", end="")
    if args.year:
        print(f" in {args.year}", end="")
    print(f" (showing {args.offset+1}-{args.offset+len(results)})\n")

    for msg in results:
        print(format_message(msg, terms, full=args.full))

    if total > args.offset + args.limit:
        next_offset = args.offset + args.limit
        print(f"\n... {total - next_offset} more results. Use --offset {next_offset} to see more.")

    conn.close()


if __name__ == "__main__":
    main()
