wip major changes

Nick Sweeting
2025-12-24 20:09:51 -08:00
parent c1335fed37
commit 1915333b81
450 changed files with 35814 additions and 19015 deletions


@@ -0,0 +1,140 @@
#!/usr/bin/env python3
"""
Parse RSS/Atom feeds and extract URLs.
This is a standalone extractor that can run without ArchiveBox.
It reads feed content from a URL and extracts article URLs.
Usage: ./on_Snapshot__51_parse_rss_urls.py --url=<url>
Output: Writes discovered URLs to urls.jsonl in the current directory
Examples:
./on_Snapshot__51_parse_rss_urls.py --url=https://example.com/feed.rss
./on_Snapshot__51_parse_rss_urls.py --url=file:///path/to/feed.xml
"""
import json
import os
import sys
from calendar import timegm
from datetime import datetime, timezone
from html import unescape
from urllib.parse import urlparse

import rich_click as click

EXTRACTOR_NAME = 'parse_rss_urls'

try:
    import feedparser
except ImportError:
    feedparser = None


def fetch_content(url: str) -> str:
    """Fetch content from a URL (supports file:// and https://)."""
    parsed = urlparse(url)
    if parsed.scheme == 'file':
        file_path = parsed.path
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            return f.read()
    else:
        timeout = int(os.environ.get('TIMEOUT', '60'))
        user_agent = os.environ.get('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
        import urllib.request
        req = urllib.request.Request(url, headers={'User-Agent': user_agent})
        with urllib.request.urlopen(req, timeout=timeout) as response:
            return response.read().decode('utf-8', errors='replace')


@click.command()
@click.option('--url', required=True, help='RSS/Atom feed URL to parse')
def main(url: str):
    """Parse RSS/Atom feed and extract article URLs."""
    if feedparser is None:
        click.echo('feedparser library not installed', err=True)
        sys.exit(1)

    try:
        content = fetch_content(url)
    except Exception as e:
        click.echo(f'Failed to fetch {url}: {e}', err=True)
        sys.exit(1)

    # Parse the feed
    feed = feedparser.parse(content)
    if not feed.entries:
        click.echo('No entries found in feed', err=True)
        sys.exit(1)

    urls_found = []
    for item in feed.entries:
        item_url = getattr(item, 'link', None)
        if not item_url:
            continue
        title = getattr(item, 'title', None)

        # Get bookmarked_at (published/updated date as ISO 8601).
        # feedparser returns *_parsed fields as UTC struct_times, so convert
        # with calendar.timegm (time.mktime would wrongly assume local time).
        bookmarked_at = None
        if hasattr(item, 'published_parsed') and item.published_parsed:
            bookmarked_at = datetime.fromtimestamp(timegm(item.published_parsed), tz=timezone.utc).isoformat()
        elif hasattr(item, 'updated_parsed') and item.updated_parsed:
            bookmarked_at = datetime.fromtimestamp(timegm(item.updated_parsed), tz=timezone.utc).isoformat()

        # Get tags
        tags = ''
        if hasattr(item, 'tags') and item.tags:
            try:
                tags = ','.join(tag.term for tag in item.tags if hasattr(tag, 'term'))
            except (AttributeError, TypeError):
                pass

        entry = {
            'type': 'Snapshot',
            'url': unescape(item_url),
            'via_extractor': EXTRACTOR_NAME,
        }
        if title:
            entry['title'] = unescape(title)
        if bookmarked_at:
            entry['bookmarked_at'] = bookmarked_at
        if tags:
            entry['tags'] = tags
        urls_found.append(entry)

    if not urls_found:
        click.echo('No valid URLs found in feed entries', err=True)
        sys.exit(1)

    # Collect unique tags across all entries
    all_tags = set()
    for entry in urls_found:
        if entry.get('tags'):
            for tag in entry['tags'].split(','):
                tag = tag.strip()
                if tag:
                    all_tags.add(tag)

    # Write urls.jsonl: Tag records first, then Snapshot records
    with open('urls.jsonl', 'w') as f:
        for tag_name in sorted(all_tags):
            f.write(json.dumps({
                'type': 'Tag',
                'name': tag_name,
            }) + '\n')
        for entry in urls_found:
            f.write(json.dumps(entry) + '\n')

    click.echo(f'Found {len(urls_found)} URLs, {len(all_tags)} tags')
    sys.exit(0)


if __name__ == '__main__':
    main()
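
For reference, a run over a feed with two tagged posts would produce a urls.jsonl along these lines, matching the write order in main() above: Tag records first, then one Snapshot record per entry (all values here are illustrative, not taken from a real feed):

{"type": "Tag", "name": "python"}
{"type": "Tag", "name": "rss"}
{"type": "Snapshot", "url": "https://example.com/post/1", "via_extractor": "parse_rss_urls", "title": "First Post", "bookmarked_at": "2024-01-01T12:00:00+00:00", "tags": "python,rss"}
{"type": "Snapshot", "url": "https://example.com/post/2", "via_extractor": "parse_rss_urls", "tags": "rss"}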


@@ -0,0 +1,213 @@
#!/usr/bin/env python3
"""Unit tests for parse_rss_urls extractor."""
import json
import subprocess
import sys
from pathlib import Path

import pytest

PLUGIN_DIR = Path(__file__).parent.parent
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_rss_urls.py'), None)
assert SCRIPT_PATH is not None, f'parse_rss_urls extractor script not found in {PLUGIN_DIR}'


class TestParseRssUrls:
    """Test the parse_rss_urls extractor CLI."""

    def test_parses_real_rss_feed(self, tmp_path):
        """Test parsing a real RSS feed from the web."""
        # Use the Hacker News RSS feed as a live sample
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', 'https://news.ycombinator.com/rss'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
            timeout=30
        )
        # Only assert on success so transient network failures don't fail the suite
        if result.returncode == 0:
            output_file = tmp_path / 'urls.jsonl'
            assert output_file.exists(), "Output file not created"
            content = output_file.read_text()
            assert len(content) > 0, "No URLs extracted from real RSS feed"
            # Verify at least one entry was extracted
            lines = content.strip().split('\n')
            assert len(lines) > 0, "No entries found in RSS feed"

    def test_extracts_urls_from_rss_feed(self, tmp_path):
        """Test extracting URLs from an RSS 2.0 feed."""
        input_file = tmp_path / 'feed.rss'
        input_file.write_text('''<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>Test Feed</title>
<link>https://example.com</link>
<item>
<title>First Post</title>
<link>https://example.com/post/1</link>
<pubDate>Mon, 01 Jan 2024 12:00:00 GMT</pubDate>
</item>
<item>
<title>Second Post</title>
<link>https://example.com/post/2</link>
<pubDate>Tue, 02 Jan 2024 12:00:00 GMT</pubDate>
</item>
</channel>
</rss>
''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        assert 'Found 2 URLs' in result.stdout
        output_file = tmp_path / 'urls.jsonl'
        assert output_file.exists()
        lines = output_file.read_text().strip().split('\n')
        assert len(lines) == 2
        entries = [json.loads(line) for line in lines]
        urls = {e['url'] for e in entries}
        titles = {e.get('title') for e in entries}
        assert 'https://example.com/post/1' in urls
        assert 'https://example.com/post/2' in urls
        assert 'First Post' in titles
        assert 'Second Post' in titles

    def test_extracts_urls_from_atom_feed(self, tmp_path):
        """Test extracting URLs from an Atom feed."""
        input_file = tmp_path / 'feed.atom'
        input_file.write_text('''<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Test Atom Feed</title>
<entry>
<title>Atom Post 1</title>
<link href="https://atom.example.com/entry/1"/>
<updated>2024-01-01T12:00:00Z</updated>
</entry>
<entry>
<title>Atom Post 2</title>
<link href="https://atom.example.com/entry/2"/>
<updated>2024-01-02T12:00:00Z</updated>
</entry>
</feed>
''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text().strip().split('\n')
        urls = {json.loads(line)['url'] for line in lines}
        assert 'https://atom.example.com/entry/1' in urls
        assert 'https://atom.example.com/entry/2' in urls

    def test_exits_1_when_no_entries(self, tmp_path):
        """Test that script exits with code 1 when feed has no entries."""
        input_file = tmp_path / 'empty.rss'
        input_file.write_text('''<?xml version="1.0"?>
<rss version="2.0">
<channel>
<title>Empty Feed</title>
</channel>
</rss>
''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 1
        assert 'No entries found' in result.stderr

    def test_exits_1_when_file_not_found(self, tmp_path):
        """Test that script exits with code 1 when file doesn't exist."""
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', 'file:///nonexistent/feed.rss'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 1
        assert 'Failed to fetch' in result.stderr

    def test_handles_html_entities_in_urls(self, tmp_path):
        """Test that HTML entities in URLs are decoded."""
        input_file = tmp_path / 'feed.rss'
        input_file.write_text('''<?xml version="1.0"?>
<rss version="2.0">
<channel>
<item>
<title>Entity Test</title>
<link>https://example.com/page?a=1&amp;b=2</link>
</item>
</channel>
</rss>
''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        assert entry['url'] == 'https://example.com/page?a=1&b=2'

    def test_includes_optional_metadata(self, tmp_path):
        """Test that title and timestamp are included when present."""
        input_file = tmp_path / 'feed.rss'
        input_file.write_text('''<?xml version="1.0"?>
<rss version="2.0">
<channel>
<item>
<title>Test Title</title>
<link>https://example.com/test</link>
<pubDate>Wed, 15 Jan 2020 10:30:00 GMT</pubDate>
</item>
</channel>
</rss>
''')
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        assert entry['url'] == 'https://example.com/test'
        assert entry['title'] == 'Test Title'
        # Parser converts pubDate to an ISO 8601 bookmarked_at timestamp
        assert 'bookmarked_at' in entry


if __name__ == '__main__':
    pytest.main([__file__, '-v'])
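
Because of the pytest.main() hook above, the suite can be run either through pytest or by executing the test file directly (the path below is illustrative; the diff does not show the test file's actual name):

pytest -v tests/test_parse_rss_urls.py
python tests/test_parse_rss_urls.py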