#!/usr/bin/env python3 """ Search the USR Total Control mailing list archive. Usage: ./search.py # Full-text search ./search.py --from # Filter by sender ./search.py --year # Filter by year ./search.py --subject-only # Search subjects only ./search.py --list-threads # Show matching threads (grouped by subject) ./search.py --top-posters # Show most active posters ./search.py --top-posters --year 1998 # Most active posters in a year ./search.py --stats # Archive statistics Examples: ./search.py ascend ./search.py "v.90 firmware" ./search.py radius --year 1998 ./search.py "K56flex" --subject-only ./search.py --list-threads "ISDN PRI" FTS5 query syntax: ascend # Messages containing "ascend" ascend max # Messages containing "ascend" AND "max" "ascend max" # Messages containing the exact phrase "ascend max" ascend OR livingston # Messages containing either term ascend NOT pipeline # Messages with "ascend" but not "pipeline" """ import argparse import os import re import sqlite3 import sys import textwrap # SQLite needs a local filesystem (archive is on CIFS mount) DB_PATH = os.path.expanduser("~/.local/share/usr-tc-archive/usr-tc-archive.db") TERM_WIDTH = min(os.get_terminal_size().columns, 120) if sys.stdout.isatty() else 120 def highlight(text, terms): """Highlight search terms in text for terminal output.""" if not terms or not sys.stdout.isatty(): return text for term in terms: if len(term) < 2: continue pattern = re.compile(re.escape(term), re.IGNORECASE) text = pattern.sub(lambda m: f"\033[1;33m{m.group()}\033[0m", text) return text def extract_terms(query): """Extract individual search terms from an FTS query for highlighting.""" # Remove FTS operators cleaned = re.sub(r'\b(AND|OR|NOT|NEAR)\b', ' ', query) # Remove quotes but keep the words cleaned = cleaned.replace('"', '') terms = [t.strip() for t in cleaned.split() if len(t.strip()) >= 2] return terms def snippet(body, terms, max_lines=8): """Extract a relevant snippet from the message body around matching terms.""" lines = body.split("\n") if not terms: return "\n".join(lines[:max_lines]) # Find the first line containing a search term best_line = 0 for i, line in enumerate(lines): for term in terms: if term.lower() in line.lower(): best_line = i break if best_line > 0: break # Show context around the best match start = max(0, best_line - 1) end = min(len(lines), start + max_lines) result = lines[start:end] if start > 0: result = ["..."] + result if end < len(lines): result = result + ["..."] return "\n".join(result) def format_message(msg, terms=None, full=False): """Format a message for display.""" sep = "=" * TERM_WIDTH date_str = msg["date_parsed"] or msg["date_raw"] or "unknown" from_str = msg["from_raw"] or "unknown" subject_str = msg["subject"] or "(no subject)" header = ( f"{sep}\n" f"From: {highlight(from_str, terms)}\n" f"Subject: {highlight(subject_str, terms)}\n" f"Date: {date_str}\n" f"Source: {msg['source_file']}\n" f"{'-' * TERM_WIDTH}" ) if full: body = highlight(msg["body"], terms) else: body = highlight(snippet(msg["body"], terms), terms) return f"{header}\n{body}\n" def search(conn, query, from_filter=None, year=None, year_end=None, subject_only=False, limit=25, offset=0, full=False): """Full-text search with optional filters.""" terms = extract_terms(query) if subject_only: fts_query = f"subject:({query})" else: fts_query = query sql = """ SELECT m.*, messages_fts.rank FROM messages_fts JOIN messages m ON m.id = messages_fts.rowid WHERE messages_fts MATCH ? """ params = [fts_query] if from_filter: sql += " AND m.from_email LIKE ?" params.append(f"%{from_filter}%") if year: sql += " AND m.archive_year = ?" params.append(year) if year_end: sql += " AND m.archive_year <= ?" params.append(year_end) # Count total matches count_sql = sql.replace("SELECT m.*, messages_fts.rank", "SELECT COUNT(*)") c = conn.cursor() c.execute(count_sql, params) total = c.fetchone()[0] # Get results ordered by relevance sql += " ORDER BY messages_fts.rank LIMIT ? OFFSET ?" params.extend([limit, offset]) c.execute(sql, params) columns = [d[0] for d in c.description] results = [dict(zip(columns, row)) for row in c.fetchall()] return results, total, terms def list_threads(conn, query, year=None, limit=30): """Show threads (grouped by subject) matching a query.""" terms = extract_terms(query) sql = """ SELECT m.subject, COUNT(*) as msg_count, MIN(m.date_parsed) as first_date, MAX(m.date_parsed) as last_date, GROUP_CONCAT(DISTINCT m.from_name) as participants FROM messages_fts JOIN messages m ON m.id = messages_fts.rowid WHERE messages_fts MATCH ? """ params = [query] if year: sql += " AND m.archive_year = ?" params.append(year) # Normalize subjects by removing Re: and (usr-tc) prefixes for grouping sql += """ GROUP BY REPLACE(REPLACE(REPLACE(LOWER(m.subject), 're: ', ''), '(usr-tc) ', ''), 're:(usr-tc) ', '') ORDER BY msg_count DESC LIMIT ? """ params.append(limit) c = conn.cursor() c.execute(sql, params) print(f"\nThreads matching '{query}':\n") print(f"{'Messages':>8} {'First seen':<12} {'Subject'}") print(f"{'-'*8} {'-'*12} {'-'*60}") for row in c.fetchall(): subject = highlight(row[0] or "(no subject)", terms) participants = row[4] or "" # Truncate participant list if len(participants) > 50: participants = participants[:50] + "..." first = (row[2] or "")[:10] print(f"{row[1]:>8} {first:<12} {subject}") if participants: print(f"{'':>8} {'':>12} participants: {participants}") def top_posters(conn, year=None, limit=30): """Show most active posters.""" sql = "SELECT from_email, from_name, COUNT(*) as cnt FROM messages" params = [] if year: sql += " WHERE archive_year = ?" params.append(year) sql += " GROUP BY from_email ORDER BY cnt DESC LIMIT ?" params.append(limit) c = conn.cursor() c.execute(sql, params) period = f" in {year}" if year else "" print(f"\nTop posters{period}:\n") print(f"{'Count':>6} {'Email':<40} {'Name'}") print(f"{'-'*6} {'-'*40} {'-'*30}") for row in c.fetchall(): print(f"{row[2]:>6} {row[0]:<40} {row[1] or ''}") def show_stats(conn): """Show archive statistics.""" c = conn.cursor() c.execute("SELECT COUNT(*) FROM messages") total = c.fetchone()[0] c.execute("SELECT COUNT(DISTINCT from_email) FROM messages") senders = c.fetchone()[0] c.execute("SELECT MIN(date_parsed), MAX(date_parsed) FROM messages WHERE date_parsed IS NOT NULL") date_range = c.fetchone() c.execute("SELECT MIN(archive_year), MAX(archive_year) FROM messages") year_range = c.fetchone() print(f"\nUSR Total Control Mailing List Archive") print(f"{'='*40}") print(f"Total messages: {total:,}") print(f"Unique senders: {senders:,}") print(f"Date range: {date_range[0]} to {date_range[1]}") print(f"Year range: {year_range[0]} to {year_range[1]}") print(f"\nMessages per year:") c.execute(""" SELECT archive_year, COUNT(*) FROM messages GROUP BY archive_year ORDER BY archive_year """) for row in c.fetchall(): bar = "#" * (row[1] // 50) print(f" {row[0]}: {row[1]:>5} {bar}") print(f"\nTop 15 subjects (normalized):") c.execute(""" SELECT REPLACE(REPLACE(LOWER(subject), 're: ', ''), '(usr-tc) ', '') as subj, COUNT(*) as cnt FROM messages GROUP BY subj ORDER BY cnt DESC LIMIT 15 """) for row in c.fetchall(): print(f" {row[1]:>5} {row[0]}") def main(): parser = argparse.ArgumentParser( description="Search the USR Total Control mailing list archive", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=textwrap.dedent("""\ FTS5 query examples: ascend Messages containing "ascend" ascend max Messages containing both terms "ascend max" Exact phrase match ascend OR livingston Either term ascend NOT pipeline "ascend" but not "pipeline" """) ) parser.add_argument("query", nargs="*", help="Search terms (FTS5 syntax)") parser.add_argument("--from", dest="from_filter", help="Filter by sender email") parser.add_argument("--year", type=int, help="Filter by year") parser.add_argument("--subject-only", action="store_true", help="Search subjects only") parser.add_argument("--list-threads", action="store_true", help="Group results by thread") parser.add_argument("--top-posters", action="store_true", help="Show most active posters") parser.add_argument("--stats", action="store_true", help="Show archive statistics") parser.add_argument("--full", action="store_true", help="Show full message bodies") parser.add_argument("--limit", "-n", type=int, default=25, help="Max results (default 25)") parser.add_argument("--offset", type=int, default=0, help="Skip first N results") parser.add_argument("--id", type=int, help="Show a specific message by ID") args = parser.parse_args() if not os.path.exists(DB_PATH): print(f"Database not found at {DB_PATH}", file=sys.stderr) print("Run parse_archive.py first to build the index.", file=sys.stderr) sys.exit(1) conn = sqlite3.connect(DB_PATH) if args.stats: show_stats(conn) conn.close() return if args.top_posters: top_posters(conn, year=args.year) conn.close() return if args.id: c = conn.cursor() c.execute("SELECT * FROM messages WHERE id = ?", (args.id,)) columns = [d[0] for d in c.description] row = c.fetchone() if row: msg = dict(zip(columns, row)) print(format_message(msg, full=True)) else: print(f"Message ID {args.id} not found.") conn.close() return query = " ".join(args.query) if not query: parser.print_help() conn.close() return if args.list_threads: list_threads(conn, query, year=args.year) conn.close() return results, total, terms = search( conn, query, from_filter=args.from_filter, year=args.year, subject_only=args.subject_only, limit=args.limit, offset=args.offset, full=args.full, ) print(f"\n{total} results for '{query}'", end="") if args.from_filter: print(f" from '{args.from_filter}'", end="") if args.year: print(f" in {args.year}", end="") print(f" (showing {args.offset+1}-{args.offset+len(results)})\n") for msg in results: print(format_message(msg, terms, full=args.full)) if total > args.offset + args.limit: next_offset = args.offset + args.limit print(f"\n... {total - next_offset} more results. Use --offset {next_offset} to see more.") conn.close() if __name__ == "__main__": main()