wip

2026-04-04 23:07:56 +10:00 · 2025-12-28 17:51:54 -08:00
parent 54f91c1339
commit f0aa19fa7d
157 changed files with 6774 additions and 5061 deletions
--- a/archivebox/plugins/parse_rss_urls/on_Snapshot__72_parse_rss_urls.py
+++ b/archivebox/plugins/parse_rss_urls/on_Snapshot__72_parse_rss_urls.py
@@ -70,61 +70,57 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
    # Parse the feed
    feed = feedparser.parse(content)

-    if not feed.entries:
-        click.echo('No entries found in feed', err=True)
-        sys.exit(1)
-
    urls_found = []
    all_tags = set()

-    for item in feed.entries:
-        item_url = getattr(item, 'link', None)
-        if not item_url:
-            continue
+    if not feed.entries:
+        # No entries - will emit skipped status at end
+        pass
+    else:
+        for item in feed.entries:
+            item_url = getattr(item, 'link', None)
+            if not item_url:
+                continue

-        title = getattr(item, 'title', None)
+            title = getattr(item, 'title', None)

-        # Get bookmarked_at (published/updated date as ISO 8601)
-        bookmarked_at = None
-        if hasattr(item, 'published_parsed') and item.published_parsed:
-            bookmarked_at = datetime.fromtimestamp(mktime(item.published_parsed), tz=timezone.utc).isoformat()
-        elif hasattr(item, 'updated_parsed') and item.updated_parsed:
-            bookmarked_at = datetime.fromtimestamp(mktime(item.updated_parsed), tz=timezone.utc).isoformat()
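+            # Get bookmarked_at (published/updated date as ISO 8601). NOTE(review): feedparser's *_parsed tuples are UTC but mktime() reads them as local time - calendar.timegm() is likely intended; confirm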
+            bookmarked_at = None
+            if hasattr(item, 'published_parsed') and item.published_parsed:
+                bookmarked_at = datetime.fromtimestamp(mktime(item.published_parsed), tz=timezone.utc).isoformat()
+            elif hasattr(item, 'updated_parsed') and item.updated_parsed:
+                bookmarked_at = datetime.fromtimestamp(mktime(item.updated_parsed), tz=timezone.utc).isoformat()

-        # Get tags
-        tags = ''
-        if hasattr(item, 'tags') and item.tags:
-            try:
-                tags = ','.join(tag.term for tag in item.tags if hasattr(tag, 'term'))
-                # Collect unique tags
-                for tag in tags.split(','):
-                    tag = tag.strip()
-                    if tag:
-                        all_tags.add(tag)
-            except (AttributeError, TypeError):
-                pass
+            # Get tags
+            tags = ''
+            if hasattr(item, 'tags') and item.tags:
+                try:
+                    tags = ','.join(tag.term for tag in item.tags if hasattr(tag, 'term'))
+                    # Collect unique tags
+                    for tag in tags.split(','):
+                        tag = tag.strip()
+                        if tag:
+                            all_tags.add(tag)
+                except (AttributeError, TypeError):
+                    pass

-        entry = {
-            'type': 'Snapshot',
-            'url': unescape(item_url),
-            'plugin': PLUGIN_NAME,
-            'depth': depth + 1,
-        }
-        if snapshot_id:
-            entry['parent_snapshot_id'] = snapshot_id
-        if crawl_id:
-            entry['crawl_id'] = crawl_id
-        if title:
-            entry['title'] = unescape(title)
-        if bookmarked_at:
-            entry['bookmarked_at'] = bookmarked_at
-        if tags:
-            entry['tags'] = tags
-        urls_found.append(entry)
-
-    if not urls_found:
-        click.echo('No valid URLs found in feed entries', err=True)
-        sys.exit(1)
+            entry = {
+                'type': 'Snapshot',
+                'url': unescape(item_url),
+                'plugin': PLUGIN_NAME,
+                'depth': depth + 1,
+            }
+            if snapshot_id:
+                entry['parent_snapshot_id'] = snapshot_id
+            if crawl_id:
+                entry['crawl_id'] = crawl_id
+            if title:
+                entry['title'] = unescape(title)
+            if bookmarked_at:
+                entry['bookmarked_at'] = bookmarked_at
+            if tags:
+                entry['tags'] = tags
+            urls_found.append(entry)

    # Emit Tag records first (to stdout as JSONL)
    for tag_name in sorted(all_tags):
@@ -137,7 +133,17 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
    for entry in urls_found:
        print(json.dumps(entry))

-    click.echo(f'Found {len(urls_found)} URLs, {len(all_tags)} tags', err=True)
+    # Emit ArchiveResult record to mark completion
+    status = 'succeeded' if urls_found else 'skipped'
+    output_str = f'Found {len(urls_found)} URLs, {len(all_tags)} tags' if urls_found else 'No URLs found'
+    ar_record = {
+        'type': 'ArchiveResult',
+        'status': status,
+        'output_str': output_str,
+    }
+    print(json.dumps(ar_record))
+
+    click.echo(output_str, err=True)
    sys.exit(0)


--- a/archivebox/plugins/parse_rss_urls/tests/test_parse_rss_urls.py
+++ b/archivebox/plugins/parse_rss_urls/tests/test_parse_rss_urls.py
@@ -28,10 +28,8 @@ class TestParseRssUrls:

        # HN RSS feed should parse successfully
        if result.returncode == 0:
-            output_file = tmp_path / 'urls.jsonl'
-            assert output_file.exists(), "Output file not created"
-
-            content = output_file.read_text()
+            # Output goes to stdout (JSONL)
+            content = result.stdout
            assert len(content) > 0, "No URLs extracted from real RSS feed"

            # Verify at least one URL was extracted
@@ -70,10 +68,8 @@ class TestParseRssUrls:
        assert result.returncode == 0
        assert 'Found 2 URLs' in result.stdout

-        output_file = tmp_path / 'urls.jsonl'
-        assert output_file.exists()
-
-        lines = output_file.read_text().strip().split('\n')
+        # Output goes to stdout (JSONL)
+        lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
        assert len(lines) == 2

        entries = [json.loads(line) for line in lines]
@@ -112,15 +108,15 @@ class TestParseRssUrls:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        lines = output_file.read_text().strip().split('\n')
+        # Output goes to stdout (JSONL)
+        lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
        urls = {json.loads(line)['url'] for line in lines}

        assert 'https://atom.example.com/entry/1' in urls
        assert 'https://atom.example.com/entry/2' in urls

-    def test_exits_1_when_no_entries(self, tmp_path):
-        """Test that script exits with code 1 when feed has no entries."""
+    def test_skips_when_no_entries(self, tmp_path):
+        """Test that script returns skipped status when feed has no entries."""
        input_file = tmp_path / 'empty.rss'
        input_file.write_text('''<?xml version="1.0"?>
 <rss version="2.0">
@@ -137,8 +133,9 @@ class TestParseRssUrls:
            text=True,
        )

-        assert result.returncode == 1
-        assert 'No entries found' in result.stderr
+        assert result.returncode == 0
+        assert 'No URLs found' in result.stderr
+        assert '"status": "skipped"' in result.stdout

    def test_exits_1_when_file_not_found(self, tmp_path):
        """Test that script exits with code 1 when file doesn't exist."""
@@ -174,8 +171,9 @@ class TestParseRssUrls:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        entry = json.loads(output_file.read_text().strip())
+        # Output goes to stdout (JSONL)
+        lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
+        entry = json.loads(lines[0])
        assert entry['url'] == 'https://example.com/page?a=1&b=2'

    def test_includes_optional_metadata(self, tmp_path):
@@ -201,8 +199,9 @@ class TestParseRssUrls:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        entry = json.loads(output_file.read_text().strip())
+        # Output goes to stdout (JSONL)
+        lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
+        entry = json.loads(lines[0])
        assert entry['url'] == 'https://example.com/test'
        assert entry['title'] == 'Test Title'
        # Parser converts timestamp to bookmarked_at
--- a/archivebox/plugins/parse_rss_urls/tests/test_parse_rss_urls_comprehensive.py
+++ b/archivebox/plugins/parse_rss_urls/tests/test_parse_rss_urls_comprehensive.py
@@ -41,8 +41,8 @@ class TestRssVariants:
        )

        assert result.returncode == 0, f"Failed: {result.stderr}"
-        output_file = tmp_path / 'urls.jsonl'
-        lines = output_file.read_text().strip().split('\n')
+        # Output goes to stdout (JSONL)
+        lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
        entry = json.loads(lines[0])

        assert entry['url'] == 'https://example.com/article1'
@@ -82,8 +82,8 @@ class TestRssVariants:
        )

        assert result.returncode == 0, f"Failed: {result.stderr}"
-        output_file = tmp_path / 'urls.jsonl'
-        lines = output_file.read_text().strip().split('\n')
+        # Output goes to stdout (JSONL)
+        lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
        entries = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot']

        urls = {e['url'] for e in entries}
@@ -122,8 +122,8 @@ class TestRssVariants:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        content = output_file.read_text().strip()
+        # Output goes to stdout (JSONL)
+        content = result.stdout.strip()
        lines = content.split('\n')

        # Check for Tag records
@@ -171,8 +171,8 @@ class TestAtomVariants:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        lines = output_file.read_text().strip().split('\n')
+        # Output goes to stdout (JSONL)
+        lines = [line for line in result.stdout.strip().split('\n') if line.strip()]

        tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
        tag_names = {t['name'] for t in tags}
@@ -207,8 +207,9 @@ class TestAtomVariants:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        entry = json.loads(output_file.read_text().strip())
+        # Output goes to stdout (JSONL)
+        lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
+        entry = json.loads(lines[0])
        # feedparser should pick the alternate link
        assert 'atom.example.com/article' in entry['url']

@@ -239,8 +240,9 @@ class TestDateFormats:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        entry = json.loads(output_file.read_text().strip())
+        # Output goes to stdout (JSONL)
+        lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
+        entry = json.loads(lines[0])
        assert 'bookmarked_at' in entry
        assert '2020-01-15' in entry['bookmarked_at']

@@ -265,8 +267,9 @@ class TestDateFormats:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        entry = json.loads(output_file.read_text().strip())
+        # Output goes to stdout (JSONL)
+        lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
+        entry = json.loads(lines[0])
        assert 'bookmarked_at' in entry
        assert '2024-01-15' in entry['bookmarked_at']

@@ -292,8 +295,9 @@ class TestDateFormats:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        entry = json.loads(output_file.read_text().strip())
+        # Output goes to stdout (JSONL)
+        lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
+        entry = json.loads(lines[0])
        # Should use published date (Jan 10) not updated date (Jan 15)
        assert '2024-01-10' in entry['bookmarked_at']

@@ -318,8 +322,9 @@ class TestDateFormats:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        entry = json.loads(output_file.read_text().strip())
+        # Output goes to stdout (JSONL)
+        lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
+        entry = json.loads(lines[0])
        assert '2024-01-20' in entry['bookmarked_at']

    def test_no_date(self, tmp_path):
@@ -344,8 +349,9 @@ class TestDateFormats:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        entry = json.loads(output_file.read_text().strip())
+        # Output goes to stdout (JSONL)
+        lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
+        entry = json.loads(lines[0])
        assert 'bookmarked_at' not in entry


@@ -377,8 +383,8 @@ class TestTagsAndCategories:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        lines = output_file.read_text().strip().split('\n')
+        # Output goes to stdout (JSONL)
+        lines = [line for line in result.stdout.strip().split('\n') if line.strip()]

        tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
        tag_names = {t['name'] for t in tags}
@@ -414,8 +420,8 @@ class TestTagsAndCategories:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        lines = output_file.read_text().strip().split('\n')
+        # Output goes to stdout (JSONL)
+        lines = [line for line in result.stdout.strip().split('\n') if line.strip()]

        tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
        tag_names = {t['name'] for t in tags}
@@ -445,8 +451,9 @@ class TestTagsAndCategories:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        entry = json.loads(output_file.read_text().strip())
+        # Output goes to stdout (JSONL)
+        lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
+        entry = json.loads(lines[0])
        assert 'tags' not in entry or entry['tags'] == ''

    def test_duplicate_tags(self, tmp_path):
@@ -474,8 +481,8 @@ class TestTagsAndCategories:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        lines = output_file.read_text().strip().split('\n')
+        # Output goes to stdout (JSONL)
+        lines = [line for line in result.stdout.strip().split('\n') if line.strip()]
        tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
        # Tag records should be unique
        tag_names = [t['name'] for t in tags]
@@ -514,8 +521,8 @@ class TestCustomNamespaces:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        lines = output_file.read_text().strip().split('\n')
+        # Output goes to stdout (JSONL)
+        lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
        snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot']
        entry = snapshots[0]

@@ -550,8 +557,9 @@ class TestCustomNamespaces:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        entry = json.loads(output_file.read_text().strip())
+        # Output goes to stdout (JSONL)
+        lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
+        entry = json.loads(lines[0])

        assert entry['url'] == 'https://example.com/podcast/1'
        assert entry['title'] == 'Podcast Episode 1'
@@ -583,8 +591,8 @@ class TestCustomNamespaces:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        lines = output_file.read_text().strip().split('\n')
+        # Output goes to stdout (JSONL)
+        lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]
        snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot']
        entry = snapshots[0]

@@ -617,8 +625,9 @@ class TestEdgeCases:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        entry = json.loads(output_file.read_text().strip())
+        # Output goes to stdout (JSONL)
+        lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
+        entry = json.loads(lines[0])

        assert entry['url'] == 'https://example.com/notitle'
        assert 'title' not in entry
@@ -649,8 +658,9 @@ class TestEdgeCases:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        entry = json.loads(output_file.read_text().strip())
+        # Output goes to stdout (JSONL)
+        lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
+        entry = json.loads(lines[0])

        # Should only have the entry with a link
        assert entry['url'] == 'https://example.com/haslink'
@@ -678,8 +688,9 @@ class TestEdgeCases:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        entry = json.loads(output_file.read_text().strip())
+        # Output goes to stdout (JSONL)
+        lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
+        entry = json.loads(lines[0])

        assert entry['title'] == 'Using <div> & <span> tags'

@@ -708,8 +719,8 @@ class TestEdgeCases:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        lines = output_file.read_text().strip().split('\n')
+        # Output goes to stdout (JSONL)
+        lines = [line for line in result.stdout.strip().split('\n') if line.strip()]

        tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
        tag_names = {t['name'] for t in tags}
@@ -740,8 +751,9 @@ class TestEdgeCases:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        entry = json.loads(output_file.read_text().strip())
+        # Output goes to stdout (JSONL)
+        lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
+        entry = json.loads(lines[0])

        # feedparser should strip HTML tags
        assert 'HTML' in entry['title']
@@ -770,8 +782,9 @@ class TestEdgeCases:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        entry = json.loads(output_file.read_text().strip())
+        # Output goes to stdout (JSONL)
+        lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
+        entry = json.loads(lines[0])

        # feedparser may convert relative to absolute, or leave as-is
        assert 'article/relative' in entry['url']
@@ -800,7 +813,7 @@ class TestEdgeCases:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
+        # Output goes to stdout (JSONL)
-        lines = output_file.read_text(encoding='utf-8').strip().split('\n')
+        lines = [line for line in result.stdout.strip().split('\n') if line.strip()]

        snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot']
@@ -831,8 +844,9 @@ class TestEdgeCases:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        entry = json.loads(output_file.read_text().strip())
+        # Output goes to stdout (JSONL)
+        lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
+        entry = json.loads(lines[0])

        assert len(entry['title']) == 1000
        assert entry['title'] == long_title
@@ -870,8 +884,8 @@ class TestEdgeCases:
        assert result.returncode == 0
        assert 'Found 100 URLs' in result.stdout

-        output_file = tmp_path / 'urls.jsonl'
-        lines = output_file.read_text().strip().split('\n')
+        # Output goes to stdout (JSONL)
+        lines = [line for line in result.stdout.strip().split('\n') if line.strip()]

        # Should have 10 unique tags (Tag0-Tag9) + 100 snapshots
        tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
@@ -912,8 +926,8 @@ class TestRealWorldFeeds:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        lines = output_file.read_text().strip().split('\n')
+        # Output goes to stdout (JSONL)
+        lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]

        snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot']
        entry = snapshots[0]
@@ -944,8 +958,8 @@ class TestRealWorldFeeds:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        lines = output_file.read_text().strip().split('\n')
+        # Output goes to stdout (JSONL)
+        lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '\"type\": \"Snapshot\"' in line]

        snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot']
        entry = snapshots[0]
@@ -976,8 +990,9 @@ class TestRealWorldFeeds:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        entry = json.loads(output_file.read_text().strip())
+        # Output goes to stdout (JSONL)
+        lines = [line for line in result.stdout.strip().split('\n') if '\"type\": \"Snapshot\"' in line]
+        entry = json.loads(lines[0])

        assert 'youtube.com' in entry['url']
        assert 'dQw4w9WgXcQ' in entry['url']