more tests and migrations fixes

2026-04-04 23:07:56 +10:00 · 2025-12-26 18:22:48 -08:00
parent 0fbcbd2616
commit e2cbcd17f6
26 changed files with 3608 additions and 1792 deletions
--- a/archivebox/plugins/parse_rss_urls/tests/test_parse_rss_urls_comprehensive.py
+++ b/archivebox/plugins/parse_rss_urls/tests/test_parse_rss_urls_comprehensive.py
@@ -0,0 +1,987 @@
+#!/usr/bin/env python3
+"""Comprehensive tests for parse_rss_urls extractor covering various RSS/Atom variants."""
+
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+import pytest
+
+PLUGIN_DIR = Path(__file__).parent.parent
+SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_rss_urls.py'), None)
+
+
+class TestRssVariants:
+    """Test various RSS format variants."""
+
+    def test_rss_091(self, tmp_path):
+        """Test RSS 0.91 format (oldest RSS version)."""
+        input_file = tmp_path / 'feed.rss'
+        input_file.write_text('''<?xml version="1.0" encoding="UTF-8"?>
+<rss version="0.91">
+  <channel>
+    <title>RSS 0.91 Feed</title>
+    <link>https://example.com</link>
+    <description>Test RSS 0.91</description>
+    <item>
+      <title>RSS 0.91 Article</title>
+      <link>https://example.com/article1</link>
+      <description>An article in RSS 0.91 format</description>
+    </item>
+  </channel>
+</rss>
+        ''')
+
+        result = subprocess.run(
+            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+        )
+
+        assert result.returncode == 0, f"Failed: {result.stderr}"
+        output_file = tmp_path / 'urls.jsonl'
+        lines = output_file.read_text().strip().split('\n')
+        entry = json.loads(lines[0])
+
+        assert entry['url'] == 'https://example.com/article1'
+        assert entry['title'] == 'RSS 0.91 Article'
+        assert entry['via_extractor'] == 'parse_rss_urls'
+
+    def test_rss_10_rdf(self, tmp_path):
+        """Test RSS 1.0 (RDF) format."""
+        input_file = tmp_path / 'feed.rdf'
+        input_file.write_text('''<?xml version="1.0" encoding="UTF-8"?>
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+         xmlns="http://purl.org/rss/1.0/"
+         xmlns:dc="http://purl.org/dc/elements/1.1/">
+  <channel rdf:about="https://example.com">
+    <title>RSS 1.0 Feed</title>
+    <link>https://example.com</link>
+  </channel>
+  <item rdf:about="https://example.com/rdf1">
+    <title>RDF Item 1</title>
+    <link>https://example.com/rdf1</link>
+    <dc:date>2024-01-15T10:30:00Z</dc:date>
+    <dc:subject>Technology</dc:subject>
+  </item>
+  <item rdf:about="https://example.com/rdf2">
+    <title>RDF Item 2</title>
+    <link>https://example.com/rdf2</link>
+    <dc:date>2024-01-16T14:20:00Z</dc:date>
+  </item>
+</rdf:RDF>
+        ''')
+
+        result = subprocess.run(
+            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+        )
+
+        assert result.returncode == 0, f"Failed: {result.stderr}"
+        output_file = tmp_path / 'urls.jsonl'
+        lines = output_file.read_text().strip().split('\n')
+        entries = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot']
+
+        urls = {e['url'] for e in entries}
+        assert 'https://example.com/rdf1' in urls
+        assert 'https://example.com/rdf2' in urls
+        assert any(e.get('bookmarked_at') for e in entries)
+
+    def test_rss_20_with_full_metadata(self, tmp_path):
+        """Test RSS 2.0 with all standard metadata fields."""
+        input_file = tmp_path / 'feed.rss'
+        input_file.write_text('''<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0">
+  <channel>
+    <title>Full RSS 2.0</title>
+    <link>https://example.com</link>
+    <description>Complete RSS 2.0 feed</description>
+    <item>
+      <title>Complete Article</title>
+      <link>https://example.com/complete</link>
+      <description>Full description here</description>
+      <author>author@example.com</author>
+      <category>Technology</category>
+      <category>Programming</category>
+      <guid>https://example.com/complete</guid>
+      <pubDate>Mon, 15 Jan 2024 10:30:00 GMT</pubDate>
+    </item>
+  </channel>
+</rss>
+        ''')
+
+        result = subprocess.run(
+            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+        )
+
+        assert result.returncode == 0
+        output_file = tmp_path / 'urls.jsonl'
+        content = output_file.read_text().strip()
+        lines = content.split('\n')
+
+        # Check for Tag records
+        tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
+        tag_names = {t['name'] for t in tags}
+        assert 'Technology' in tag_names
+        assert 'Programming' in tag_names
+
+        # Check Snapshot record
+        snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot']
+        entry = snapshots[0]
+        assert entry['url'] == 'https://example.com/complete'
+        assert entry['title'] == 'Complete Article'
+        assert 'bookmarked_at' in entry
+        assert entry['tags'] == 'Technology,Programming' or entry['tags'] == 'Programming,Technology'
+
+
+class TestAtomVariants:
+    """Test various Atom format variants."""
+
+    def test_atom_10_full(self, tmp_path):
+        """Test Atom 1.0 with full metadata."""
+        input_file = tmp_path / 'feed.atom'
+        input_file.write_text('''<?xml version="1.0" encoding="UTF-8"?>
+<feed xmlns="http://www.w3.org/2005/Atom">
+  <title>Atom 1.0 Feed</title>
+  <updated>2024-01-15T00:00:00Z</updated>
+  <entry>
+    <title>Atom Entry 1</title>
+    <link href="https://atom.example.com/1"/>
+    <id>urn:uuid:1234-5678</id>
+    <updated>2024-01-15T10:30:00Z</updated>
+    <published>2024-01-14T08:00:00Z</published>
+    <category term="science"/>
+    <category term="research"/>
+  </entry>
+</feed>
+        ''')
+
+        result = subprocess.run(
+            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+        )
+
+        assert result.returncode == 0
+        output_file = tmp_path / 'urls.jsonl'
+        lines = output_file.read_text().strip().split('\n')
+
+        tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
+        tag_names = {t['name'] for t in tags}
+        assert 'science' in tag_names
+        assert 'research' in tag_names
+
+        snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot']
+        entry = snapshots[0]
+        assert entry['url'] == 'https://atom.example.com/1'
+        assert 'bookmarked_at' in entry
+
+    def test_atom_with_alternate_link(self, tmp_path):
+        """Test Atom feed with alternate link types."""
+        input_file = tmp_path / 'feed.atom'
+        input_file.write_text('''<?xml version="1.0" encoding="UTF-8"?>
+<feed xmlns="http://www.w3.org/2005/Atom">
+  <title>Atom Alternate Links</title>
+  <entry>
+    <title>Entry with alternate</title>
+    <link rel="alternate" type="text/html" href="https://atom.example.com/article"/>
+    <link rel="self" href="https://atom.example.com/feed"/>
+    <updated>2024-01-15T10:30:00Z</updated>
+  </entry>
+</feed>
+        ''')
+
+        result = subprocess.run(
+            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+        )
+
+        assert result.returncode == 0
+        output_file = tmp_path / 'urls.jsonl'
+        entry = json.loads(output_file.read_text().strip())
+        # feedparser should pick the alternate link
+        assert 'atom.example.com/article' in entry['url']
+
+
+class TestDateFormats:
+    """Test various date format handling."""
+
+    def test_rfc822_date(self, tmp_path):
+        """Test RFC 822 date format (RSS 2.0 standard)."""
+        input_file = tmp_path / 'feed.rss'
+        input_file.write_text('''<?xml version="1.0"?>
+<rss version="2.0">
+  <channel>
+    <item>
+      <title>RFC 822 Date</title>
+      <link>https://example.com/rfc822</link>
+      <pubDate>Wed, 15 Jan 2020 10:30:45 GMT</pubDate>
+    </item>
+  </channel>
+</rss>
+        ''')
+
+        result = subprocess.run(
+            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+        )
+
+        assert result.returncode == 0
+        output_file = tmp_path / 'urls.jsonl'
+        entry = json.loads(output_file.read_text().strip())
+        assert 'bookmarked_at' in entry
+        assert '2020-01-15' in entry['bookmarked_at']
+
+    def test_iso8601_date(self, tmp_path):
+        """Test ISO 8601 date format (Atom standard)."""
+        input_file = tmp_path / 'feed.atom'
+        input_file.write_text('''<?xml version="1.0"?>
+<feed xmlns="http://www.w3.org/2005/Atom">
+  <entry>
+    <title>ISO 8601 Date</title>
+    <link href="https://example.com/iso"/>
+    <published>2024-01-15T10:30:45.123Z</published>
+  </entry>
+</feed>
+        ''')
+
+        result = subprocess.run(
+            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+        )
+
+        assert result.returncode == 0
+        output_file = tmp_path / 'urls.jsonl'
+        entry = json.loads(output_file.read_text().strip())
+        assert 'bookmarked_at' in entry
+        assert '2024-01-15' in entry['bookmarked_at']
+
+    def test_updated_vs_published_date(self, tmp_path):
+        """Test that published date is preferred over updated date."""
+        input_file = tmp_path / 'feed.atom'
+        input_file.write_text('''<?xml version="1.0"?>
+<feed xmlns="http://www.w3.org/2005/Atom">
+  <entry>
+    <title>Date Priority Test</title>
+    <link href="https://example.com/dates"/>
+    <published>2024-01-10T10:00:00Z</published>
+    <updated>2024-01-15T10:00:00Z</updated>
+  </entry>
+</feed>
+        ''')
+
+        result = subprocess.run(
+            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+        )
+
+        assert result.returncode == 0
+        output_file = tmp_path / 'urls.jsonl'
+        entry = json.loads(output_file.read_text().strip())
+        # Should use published date (Jan 10) not updated date (Jan 15)
+        assert '2024-01-10' in entry['bookmarked_at']
+
+    def test_only_updated_date(self, tmp_path):
+        """Test fallback to updated date when published is missing."""
+        input_file = tmp_path / 'feed.atom'
+        input_file.write_text('''<?xml version="1.0"?>
+<feed xmlns="http://www.w3.org/2005/Atom">
+  <entry>
+    <title>Only Updated</title>
+    <link href="https://example.com/updated"/>
+    <updated>2024-01-20T10:00:00Z</updated>
+  </entry>
+</feed>
+        ''')
+
+        result = subprocess.run(
+            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+        )
+
+        assert result.returncode == 0
+        output_file = tmp_path / 'urls.jsonl'
+        entry = json.loads(output_file.read_text().strip())
+        assert '2024-01-20' in entry['bookmarked_at']
+
+    def test_no_date(self, tmp_path):
+        """Test entries without any date."""
+        input_file = tmp_path / 'feed.rss'
+        input_file.write_text('''<?xml version="1.0"?>
+<rss version="2.0">
+  <channel>
+    <item>
+      <title>No Date</title>
+      <link>https://example.com/nodate</link>
+    </item>
+  </channel>
+</rss>
+        ''')
+
+        result = subprocess.run(
+            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+        )
+
+        assert result.returncode == 0
+        output_file = tmp_path / 'urls.jsonl'
+        entry = json.loads(output_file.read_text().strip())
+        assert 'bookmarked_at' not in entry
+
+
+class TestTagsAndCategories:
+    """Test various tag and category formats."""
+
+    def test_rss_categories(self, tmp_path):
+        """Test RSS 2.0 category elements."""
+        input_file = tmp_path / 'feed.rss'
+        input_file.write_text('''<?xml version="1.0"?>
+<rss version="2.0">
+  <channel>
+    <item>
+      <title>Multi Category</title>
+      <link>https://example.com/cats</link>
+      <category>Tech</category>
+      <category>Web</category>
+      <category>Programming</category>
+    </item>
+  </channel>
+</rss>
+        ''')
+
+        result = subprocess.run(
+            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+        )
+
+        assert result.returncode == 0
+        output_file = tmp_path / 'urls.jsonl'
+        lines = output_file.read_text().strip().split('\n')
+
+        tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
+        tag_names = {t['name'] for t in tags}
+        assert 'Tech' in tag_names
+        assert 'Web' in tag_names
+        assert 'Programming' in tag_names
+
+        snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot']
+        entry = snapshots[0]
+        tags_list = entry['tags'].split(',')
+        assert len(tags_list) == 3
+
+    def test_atom_categories(self, tmp_path):
+        """Test Atom category elements with various attributes."""
+        input_file = tmp_path / 'feed.atom'
+        input_file.write_text('''<?xml version="1.0"?>
+<feed xmlns="http://www.w3.org/2005/Atom">
+  <entry>
+    <title>Atom Categories</title>
+    <link href="https://example.com/atomcats"/>
+    <category term="python" scheme="http://example.com/categories" label="Python Programming"/>
+    <category term="django" label="Django Framework"/>
+    <updated>2024-01-15T10:00:00Z</updated>
+  </entry>
+</feed>
+        ''')
+
+        result = subprocess.run(
+            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+        )
+
+        assert result.returncode == 0
+        output_file = tmp_path / 'urls.jsonl'
+        lines = output_file.read_text().strip().split('\n')
+
+        tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
+        tag_names = {t['name'] for t in tags}
+        # feedparser extracts the 'term' attribute
+        assert 'python' in tag_names
+        assert 'django' in tag_names
+
+    def test_no_tags(self, tmp_path):
+        """Test entries without tags."""
+        input_file = tmp_path / 'feed.rss'
+        input_file.write_text('''<?xml version="1.0"?>
+<rss version="2.0">
+  <channel>
+    <item>
+      <title>No Tags</title>
+      <link>https://example.com/notags</link>
+    </item>
+  </channel>
+</rss>
+        ''')
+
+        result = subprocess.run(
+            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+        )
+
+        assert result.returncode == 0
+        output_file = tmp_path / 'urls.jsonl'
+        entry = json.loads(output_file.read_text().strip())
+        assert 'tags' not in entry or entry['tags'] == ''
+
+    def test_duplicate_tags(self, tmp_path):
+        """Test that duplicate tags are handled properly."""
+        input_file = tmp_path / 'feed.rss'
+        input_file.write_text('''<?xml version="1.0"?>
+<rss version="2.0">
+  <channel>
+    <item>
+      <title>Duplicate Tags</title>
+      <link>https://example.com/dups</link>
+      <category>Python</category>
+      <category>Python</category>
+      <category>Web</category>
+    </item>
+  </channel>
+</rss>
+        ''')
+
+        result = subprocess.run(
+            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+        )
+
+        assert result.returncode == 0
+        output_file = tmp_path / 'urls.jsonl'
+        lines = output_file.read_text().strip().split('\n')
+        tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
+        # Tag records should be unique
+        tag_names = [t['name'] for t in tags]
+        assert tag_names.count('Python') == 1
+
+
+class TestCustomNamespaces:
+    """Test custom namespace handling (Dublin Core, Media RSS, etc.)."""
+
+    def test_dublin_core_metadata(self, tmp_path):
+        """Test Dublin Core namespace fields."""
+        input_file = tmp_path / 'feed.rdf'
+        input_file.write_text('''<?xml version="1.0" encoding="UTF-8"?>
+<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+         xmlns="http://purl.org/rss/1.0/"
+         xmlns:dc="http://purl.org/dc/elements/1.1/">
+  <channel rdf:about="https://example.com">
+    <title>Dublin Core Feed</title>
+  </channel>
+  <item rdf:about="https://example.com/dc1">
+    <title>Dublin Core Article</title>
+    <link>https://example.com/dc1</link>
+    <dc:creator>John Doe</dc:creator>
+    <dc:subject>Technology</dc:subject>
+    <dc:date>2024-01-15T10:30:00Z</dc:date>
+    <dc:rights>Copyright 2024</dc:rights>
+  </item>
+</rdf:RDF>
+        ''')
+
+        result = subprocess.run(
+            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+        )
+
+        assert result.returncode == 0
+        output_file = tmp_path / 'urls.jsonl'
+        lines = output_file.read_text().strip().split('\n')
+        snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot']
+        entry = snapshots[0]
+
+        assert entry['url'] == 'https://example.com/dc1'
+        assert entry['title'] == 'Dublin Core Article'
+        # feedparser should parse dc:date as bookmarked_at
+        assert 'bookmarked_at' in entry
+
+    def test_media_rss_namespace(self, tmp_path):
+        """Test Media RSS namespace (common in podcast feeds)."""
+        input_file = tmp_path / 'feed.rss'
+        input_file.write_text('''<?xml version="1.0"?>
+<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">
+  <channel>
+    <title>Media RSS Feed</title>
+    <item>
+      <title>Podcast Episode 1</title>
+      <link>https://example.com/podcast/1</link>
+      <media:content url="https://example.com/audio.mp3" type="audio/mpeg"/>
+      <media:thumbnail url="https://example.com/thumb.jpg"/>
+      <pubDate>Mon, 15 Jan 2024 10:00:00 GMT</pubDate>
+    </item>
+  </channel>
+</rss>
+        ''')
+
+        result = subprocess.run(
+            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+        )
+
+        assert result.returncode == 0
+        output_file = tmp_path / 'urls.jsonl'
+        entry = json.loads(output_file.read_text().strip())
+
+        assert entry['url'] == 'https://example.com/podcast/1'
+        assert entry['title'] == 'Podcast Episode 1'
+
+    def test_itunes_namespace(self, tmp_path):
+        """Test iTunes namespace (common in podcast feeds)."""
+        input_file = tmp_path / 'feed.rss'
+        input_file.write_text('''<?xml version="1.0"?>
+<rss version="2.0" xmlns:itunes="http://www.itunes.com/dtds/podcast-1.0.dtd">
+  <channel>
+    <title>iTunes Podcast</title>
+    <item>
+      <title>Episode 1: Getting Started</title>
+      <link>https://example.com/ep1</link>
+      <itunes:author>Jane Smith</itunes:author>
+      <itunes:duration>45:30</itunes:duration>
+      <itunes:keywords>programming, tutorial, beginner</itunes:keywords>
+      <pubDate>Tue, 16 Jan 2024 08:00:00 GMT</pubDate>
+    </item>
+  </channel>
+</rss>
+        ''')
+
+        result = subprocess.run(
+            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+        )
+
+        assert result.returncode == 0
+        output_file = tmp_path / 'urls.jsonl'
+        lines = output_file.read_text().strip().split('\n')
+        snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot']
+        entry = snapshots[0]
+
+        assert entry['url'] == 'https://example.com/ep1'
+        assert entry['title'] == 'Episode 1: Getting Started'
+
+
+class TestEdgeCases:
+    """Test edge cases and malformed data."""
+
+    def test_missing_title(self, tmp_path):
+        """Test entries without title."""
+        input_file = tmp_path / 'feed.rss'
+        input_file.write_text('''<?xml version="1.0"?>
+<rss version="2.0">
+  <channel>
+    <item>
+      <link>https://example.com/notitle</link>
+      <pubDate>Mon, 15 Jan 2024 10:00:00 GMT</pubDate>
+    </item>
+  </channel>
+</rss>
+        ''')
+
+        result = subprocess.run(
+            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+        )
+
+        assert result.returncode == 0
+        output_file = tmp_path / 'urls.jsonl'
+        entry = json.loads(output_file.read_text().strip())
+
+        assert entry['url'] == 'https://example.com/notitle'
+        assert 'title' not in entry
+
+    def test_missing_link(self, tmp_path):
+        """Test entries without link (should be skipped)."""
+        input_file = tmp_path / 'feed.rss'
+        input_file.write_text('''<?xml version="1.0"?>
+<rss version="2.0">
+  <channel>
+    <item>
+      <title>No Link</title>
+      <description>This entry has no link</description>
+    </item>
+    <item>
+      <title>Has Link</title>
+      <link>https://example.com/haslink</link>
+    </item>
+  </channel>
+</rss>
+        ''')
+
+        result = subprocess.run(
+            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+        )
+
+        assert result.returncode == 0
+        output_file = tmp_path / 'urls.jsonl'
+        entry = json.loads(output_file.read_text().strip())
+
+        # Should only have the entry with a link
+        assert entry['url'] == 'https://example.com/haslink'
+        assert '1 URL' in result.stdout
+
+    def test_html_entities_in_title(self, tmp_path):
+        """Test HTML entities in titles are properly decoded."""
+        input_file = tmp_path / 'feed.rss'
+        input_file.write_text('''<?xml version="1.0"?>
+<rss version="2.0">
+  <channel>
+    <item>
+      <title>Using &lt;div&gt; &amp; &lt;span&gt; tags</title>
+      <link>https://example.com/html</link>
+    </item>
+  </channel>
+</rss>
+        ''')
+
+        result = subprocess.run(
+            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+        )
+
+        assert result.returncode == 0
+        output_file = tmp_path / 'urls.jsonl'
+        entry = json.loads(output_file.read_text().strip())
+
+        assert entry['title'] == 'Using <div> & <span> tags'
+
+    def test_special_characters_in_tags(self, tmp_path):
+        """Test special characters in tags."""
+        input_file = tmp_path / 'feed.rss'
+        input_file.write_text('''<?xml version="1.0"?>
+<rss version="2.0">
+  <channel>
+    <item>
+      <title>Special Tags</title>
+      <link>https://example.com/special</link>
+      <category>C++</category>
+      <category>Node.js</category>
+      <category>Web/Mobile</category>
+    </item>
+  </channel>
+</rss>
+        ''')
+
+        result = subprocess.run(
+            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+        )
+
+        assert result.returncode == 0
+        output_file = tmp_path / 'urls.jsonl'
+        lines = output_file.read_text().strip().split('\n')
+
+        tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
+        tag_names = {t['name'] for t in tags}
+        assert 'C++' in tag_names
+        assert 'Node.js' in tag_names
+        assert 'Web/Mobile' in tag_names
+
+    def test_cdata_sections(self, tmp_path):
+        """Test CDATA sections in titles and descriptions."""
+        input_file = tmp_path / 'feed.rss'
+        input_file.write_text('''<?xml version="1.0"?>
+<rss version="2.0">
+  <channel>
+    <item>
+      <title><![CDATA[Using <strong>HTML</strong> in titles]]></title>
+      <link>https://example.com/cdata</link>
+      <description><![CDATA[Content with <em>markup</em>]]></description>
+    </item>
+  </channel>
+</rss>
+        ''')
+
+        result = subprocess.run(
+            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+        )
+
+        assert result.returncode == 0
+        output_file = tmp_path / 'urls.jsonl'
+        entry = json.loads(output_file.read_text().strip())
+
+        # feedparser should strip HTML tags
+        assert 'HTML' in entry['title']
+        assert entry['url'] == 'https://example.com/cdata'
+
+    def test_relative_urls(self, tmp_path):
+        """Test that relative URLs are preserved (feedparser handles them)."""
+        input_file = tmp_path / 'feed.rss'
+        input_file.write_text('''<?xml version="1.0"?>
+<rss version="2.0">
+  <channel>
+    <link>https://example.com</link>
+    <item>
+      <title>Relative URL</title>
+      <link>/article/relative</link>
+    </item>
+  </channel>
+</rss>
+        ''')
+
+        result = subprocess.run(
+            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+        )
+
+        assert result.returncode == 0
+        output_file = tmp_path / 'urls.jsonl'
+        entry = json.loads(output_file.read_text().strip())
+
+        # feedparser may convert relative to absolute, or leave as-is
+        assert 'article/relative' in entry['url']
+
+    def test_unicode_characters(self, tmp_path):
+        """Test Unicode characters in feed content."""
+        input_file = tmp_path / 'feed.rss'
+        input_file.write_text('''<?xml version="1.0" encoding="UTF-8"?>
+<rss version="2.0">
+  <channel>
+    <item>
+      <title>Unicode: 日本語 Français 中文 العربية</title>
+      <link>https://example.com/unicode</link>
+      <category>日本語</category>
+      <category>Français</category>
+    </item>
+  </channel>
+</rss>
+        ''', encoding='utf-8')
+
+        result = subprocess.run(
+            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+        )
+
+        assert result.returncode == 0
+        output_file = tmp_path / 'urls.jsonl'
+        lines = output_file.read_text(encoding='utf-8').strip().split('\n')
+
+        snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot']
+        entry = snapshots[0]
+        assert '日本語' in entry['title']
+        assert 'Français' in entry['title']
+
+    def test_very_long_title(self, tmp_path):
+        """Test handling of very long titles."""
+        long_title = 'A' * 1000
+        input_file = tmp_path / 'feed.rss'
+        input_file.write_text(f'''<?xml version="1.0"?>
+<rss version="2.0">
+  <channel>
+    <item>
+      <title>{long_title}</title>
+      <link>https://example.com/long</link>
+    </item>
+  </channel>
+</rss>
+        ''')
+
+        result = subprocess.run(
+            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+        )
+
+        assert result.returncode == 0
+        output_file = tmp_path / 'urls.jsonl'
+        entry = json.loads(output_file.read_text().strip())
+
+        assert len(entry['title']) == 1000
+        assert entry['title'] == long_title
+
+    def test_multiple_entries_batch(self, tmp_path):
+        """Test processing a large batch of entries."""
+        items = []
+        for i in range(100):
+            items.append(f'''
+    <item>
+      <title>Article {i}</title>
+      <link>https://example.com/article/{i}</link>
+      <category>Tag{i % 10}</category>
+      <pubDate>Mon, {15 + (i % 15)} Jan 2024 10:00:00 GMT</pubDate>
+    </item>
+            ''')
+
+        input_file = tmp_path / 'feed.rss'
+        input_file.write_text(f'''<?xml version="1.0"?>
+<rss version="2.0">
+  <channel>
+    <title>Large Feed</title>
+    {''.join(items)}
+  </channel>
+</rss>
+        ''')
+
+        result = subprocess.run(
+            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+        )
+
+        assert result.returncode == 0
+        assert 'Found 100 URLs' in result.stdout
+
+        output_file = tmp_path / 'urls.jsonl'
+        lines = output_file.read_text().strip().split('\n')
+
+        # Should have 10 unique tags (Tag0-Tag9) + 100 snapshots
+        tags = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Tag']
+        snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot']
+
+        assert len(tags) == 10
+        assert len(snapshots) == 100
+
+
+class TestRealWorldFeeds:
+    """Test patterns from real-world RSS feeds."""
+
+    def test_medium_style_feed(self, tmp_path):
+        """Test Medium-style feed structure."""
+        input_file = tmp_path / 'feed.rss'
+        input_file.write_text('''<?xml version="1.0"?>
+<rss version="2.0">
+  <channel>
+    <title>Medium Feed</title>
+    <item>
+      <title>Article Title</title>
+      <link>https://medium.com/@user/article-slug-123abc</link>
+      <guid isPermaLink="false">https://medium.com/p/123abc</guid>
+      <pubDate>Wed, 15 Jan 2024 10:30:00 GMT</pubDate>
+      <category>Programming</category>
+      <category>JavaScript</category>
+      <dc:creator xmlns:dc="http://purl.org/dc/elements/1.1/">Author Name</dc:creator>
+    </item>
+  </channel>
+</rss>
+        ''')
+
+        result = subprocess.run(
+            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+        )
+
+        assert result.returncode == 0
+        output_file = tmp_path / 'urls.jsonl'
+        lines = output_file.read_text().strip().split('\n')
+
+        snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot']
+        entry = snapshots[0]
+        assert 'medium.com' in entry['url']
+        assert entry['title'] == 'Article Title'
+
+    def test_reddit_style_feed(self, tmp_path):
+        """Test Reddit-style feed structure."""
+        input_file = tmp_path / 'feed.rss'
+        input_file.write_text('''<?xml version="1.0"?>
+<feed xmlns="http://www.w3.org/2005/Atom">
+  <title>Reddit Feed</title>
+  <entry>
+    <title>Post Title</title>
+    <link href="https://www.reddit.com/r/programming/comments/abc123/post_title/"/>
+    <updated>2024-01-15T10:30:00+00:00</updated>
+    <category term="programming" label="r/programming"/>
+    <id>t3_abc123</id>
+  </entry>
+</feed>
+        ''')
+
+        result = subprocess.run(
+            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+        )
+
+        assert result.returncode == 0
+        output_file = tmp_path / 'urls.jsonl'
+        lines = output_file.read_text().strip().split('\n')
+
+        snapshots = [json.loads(line) for line in lines if json.loads(line)['type'] == 'Snapshot']
+        entry = snapshots[0]
+        assert 'reddit.com' in entry['url']
+
+    def test_youtube_style_feed(self, tmp_path):
+        """Test YouTube-style feed structure."""
+        input_file = tmp_path / 'feed.atom'
+        input_file.write_text('''<?xml version="1.0"?>
+<feed xmlns:yt="http://www.youtube.com/xml/schemas/2015"
+      xmlns="http://www.w3.org/2005/Atom">
+  <title>YouTube Channel</title>
+  <entry>
+    <title>Video Title</title>
+    <link rel="alternate" href="https://www.youtube.com/watch?v=dQw4w9WgXcQ"/>
+    <published>2024-01-15T10:30:00+00:00</published>
+    <yt:videoId>dQw4w9WgXcQ</yt:videoId>
+    <yt:channelId>UCxxxxxxxx</yt:channelId>
+  </entry>
+</feed>
+        ''')
+
+        result = subprocess.run(
+            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
+            cwd=tmp_path,
+            capture_output=True,
+            text=True,
+        )
+
+        assert result.returncode == 0
+        output_file = tmp_path / 'urls.jsonl'
+        entry = json.loads(output_file.read_text().strip())
+
+        assert 'youtube.com' in entry['url']
+        assert 'dQw4w9WgXcQ' in entry['url']
+
+
+if __name__ == '__main__':
+    pytest.main([__file__, '-v'])