mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 23:37:58 +10:00
wip major changes
This commit is contained in:
140
archivebox/plugins/parse_rss_urls/on_Snapshot__61_parse_rss_urls.py
Executable file
140
archivebox/plugins/parse_rss_urls/on_Snapshot__61_parse_rss_urls.py
Executable file
@@ -0,0 +1,140 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Parse RSS/Atom feeds and extract URLs.
|
||||
|
||||
This is a standalone extractor that can run without ArchiveBox.
|
||||
It reads feed content from a URL and extracts article URLs.
|
||||
|
||||
Usage: ./on_Snapshot__61_parse_rss_urls.py --url=<url>
|
||||
Output: Appends discovered URLs to urls.jsonl in current directory
|
||||
|
||||
Examples:
|
||||
./on_Snapshot__61_parse_rss_urls.py --url=https://example.com/feed.rss
|
||||
./on_Snapshot__61_parse_rss_urls.py --url=file:///path/to/feed.xml
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from html import unescape
|
||||
from time import mktime
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import rich_click as click
|
||||
|
||||
EXTRACTOR_NAME = 'parse_rss_urls'
|
||||
|
||||
try:
|
||||
import feedparser
|
||||
except ImportError:
|
||||
feedparser = None
|
||||
|
||||
|
||||
def fetch_content(url: str) -> str:
    """Fetch content from a URL (supports file:// and https://).

    Local ``file://`` URLs are read directly from disk; anything else is
    fetched over HTTP(S), honoring the TIMEOUT (seconds, default 60) and
    USER_AGENT environment variables. Undecodable bytes are replaced.
    """
    scheme = urlparse(url).scheme

    # Guard clause: local files need no network machinery at all.
    if scheme == 'file':
        local_path = urlparse(url).path
        with open(local_path, 'r', encoding='utf-8', errors='replace') as fh:
            return fh.read()

    # Remote fetch — import lazily so the file:// path stays dependency-free.
    import urllib.request

    request = urllib.request.Request(
        url,
        headers={
            'User-Agent': os.environ.get('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)'),
        },
    )
    timeout_seconds = int(os.environ.get('TIMEOUT', '60'))
    with urllib.request.urlopen(request, timeout=timeout_seconds) as response:
        return response.read().decode('utf-8', errors='replace')
|
||||
|
||||
|
||||
def _entry_timestamp(item):
    """Return the entry's published (or updated) time as an ISO 8601 UTC string.

    Prefers ``published_parsed`` and falls back to ``updated_parsed``;
    returns None when neither struct_time is present on the feedparser entry.
    """
    for attr in ('published_parsed', 'updated_parsed'):
        parsed_time = getattr(item, attr, None)
        if parsed_time:
            return datetime.fromtimestamp(mktime(parsed_time), tz=timezone.utc).isoformat()
    return None


def _entry_tags(item):
    """Return the entry's tag terms joined by commas ('' when untagged)."""
    tags = getattr(item, 'tags', None)
    if not tags:
        return ''
    try:
        return ','.join(tag.term for tag in tags if hasattr(tag, 'term'))
    except (AttributeError, TypeError):
        # Malformed tag objects from feedparser: treat the entry as untagged.
        return ''


def _entry_to_record(item):
    """Convert one feedparser entry into a Snapshot record dict.

    Returns None when the entry has no link. Optional fields (title,
    bookmarked_at, tags) are included only when present and non-empty;
    key insertion order matches the emitted JSONL schema.
    """
    item_url = getattr(item, 'link', None)
    if not item_url:
        return None

    record = {
        'type': 'Snapshot',
        'url': unescape(item_url),
        'via_extractor': EXTRACTOR_NAME,
    }
    title = getattr(item, 'title', None)
    if title:
        record['title'] = unescape(title)
    bookmarked_at = _entry_timestamp(item)
    if bookmarked_at:
        record['bookmarked_at'] = bookmarked_at
    tags = _entry_tags(item)
    if tags:
        record['tags'] = tags
    return record


def _collect_tags(records):
    """Return the set of unique, stripped tag names across all Snapshot records."""
    all_tags = set()
    for record in records:
        for tag in record.get('tags', '').split(','):
            tag = tag.strip()
            if tag:
                all_tags.add(tag)
    return all_tags


@click.command()
@click.option('--url', required=True, help='RSS/Atom feed URL to parse')
def main(url: str):
    """Parse RSS/Atom feed and extract article URLs.

    Writes Tag records followed by Snapshot records to ./urls.jsonl.
    Exits 0 on success; exits 1 when feedparser is missing, the fetch
    fails, or the feed yields no usable entries.
    """
    if feedparser is None:
        click.echo('feedparser library not installed', err=True)
        sys.exit(1)

    try:
        content = fetch_content(url)
    except Exception as e:
        click.echo(f'Failed to fetch {url}: {e}', err=True)
        sys.exit(1)

    # Parse the feed
    feed = feedparser.parse(content)

    if not feed.entries:
        click.echo('No entries found in feed', err=True)
        sys.exit(1)

    # Build one Snapshot record per entry that has a link.
    urls_found = [record for record in map(_entry_to_record, feed.entries) if record]

    if not urls_found:
        click.echo('No valid URLs found in feed entries', err=True)
        sys.exit(1)

    all_tags = _collect_tags(urls_found)

    # Write urls.jsonl
    with open('urls.jsonl', 'w') as f:
        # Write Tag records first so consumers can create tags before snapshots.
        for tag_name in sorted(all_tags):
            f.write(json.dumps({
                'type': 'Tag',
                'name': tag_name,
            }) + '\n')
        # Write Snapshot records
        for record in urls_found:
            f.write(json.dumps(record) + '\n')

    click.echo(f'Found {len(urls_found)} URLs, {len(all_tags)} tags')
    sys.exit(0)


if __name__ == '__main__':
    main()
|
||||
213
archivebox/plugins/parse_rss_urls/tests/test_parse_rss_urls.py
Normal file
213
archivebox/plugins/parse_rss_urls/tests/test_parse_rss_urls.py
Normal file
@@ -0,0 +1,213 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Unit tests for parse_rss_urls extractor."""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
# Plugin root directory (this test file lives in <plugin>/tests/).
PLUGIN_DIR = Path(__file__).parent.parent
# Extractor script under test, located by glob so the numeric hook priority
# in the filename can change without breaking tests; None if not found.
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_rss_urls.py'), None)
|
||||
|
||||
|
||||
class TestParseRssUrls:
    """Test the parse_rss_urls extractor CLI.

    Each test runs the extractor script as a subprocess (mirroring how the
    plugin hook is invoked) with cwd set to a pytest tmp_path, then checks
    the exit code and the urls.jsonl file written into that directory.
    """

    def test_parses_real_rss_feed(self, tmp_path):
        """Test parsing a real RSS feed from the web."""
        # Fetch the live Hacker News RSS feed (requires network access).
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', 'https://news.ycombinator.com/rss'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
            timeout=30
        )

        # HN RSS feed should parse successfully
        # NOTE(review): a non-zero exit (e.g. offline CI) silently skips every
        # assertion below — consider pytest.skip() instead; confirm intent.
        if result.returncode == 0:
            output_file = tmp_path / 'urls.jsonl'
            assert output_file.exists(), "Output file not created"

            content = output_file.read_text()
            assert len(content) > 0, "No URLs extracted from real RSS feed"

            # Verify at least one URL was extracted
            lines = content.strip().split('\n')
            assert len(lines) > 0, "No entries found in RSS feed"

    def test_extracts_urls_from_rss_feed(self, tmp_path):
        """Test extracting URLs from an RSS 2.0 feed."""
        # Two-item RSS 2.0 fixture served via a file:// URL.
        input_file = tmp_path / 'feed.rss'
        input_file.write_text('''<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>Test Feed</title>
<link>https://example.com</link>
<item>
<title>First Post</title>
<link>https://example.com/post/1</link>
<pubDate>Mon, 01 Jan 2024 12:00:00 GMT</pubDate>
</item>
<item>
<title>Second Post</title>
<link>https://example.com/post/2</link>
<pubDate>Tue, 02 Jan 2024 12:00:00 GMT</pubDate>
</item>
</channel>
</rss>
''')

        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )

        assert result.returncode == 0
        assert 'Found 2 URLs' in result.stdout

        output_file = tmp_path / 'urls.jsonl'
        assert output_file.exists()

        lines = output_file.read_text().strip().split('\n')
        assert len(lines) == 2

        # One JSON object per line (jsonl); entry order is not asserted.
        entries = [json.loads(line) for line in lines]
        urls = {e['url'] for e in entries}
        titles = {e.get('title') for e in entries}

        assert 'https://example.com/post/1' in urls
        assert 'https://example.com/post/2' in urls
        assert 'First Post' in titles
        assert 'Second Post' in titles

    def test_extracts_urls_from_atom_feed(self, tmp_path):
        """Test extracting URLs from an Atom feed."""
        # Atom uses <link href="..."/> rather than RSS's <link> text content.
        input_file = tmp_path / 'feed.atom'
        input_file.write_text('''<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>Test Atom Feed</title>
<entry>
<title>Atom Post 1</title>
<link href="https://atom.example.com/entry/1"/>
<updated>2024-01-01T12:00:00Z</updated>
</entry>
<entry>
<title>Atom Post 2</title>
<link href="https://atom.example.com/entry/2"/>
<updated>2024-01-02T12:00:00Z</updated>
</entry>
</feed>
''')

        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )

        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text().strip().split('\n')
        urls = {json.loads(line)['url'] for line in lines}

        assert 'https://atom.example.com/entry/1' in urls
        assert 'https://atom.example.com/entry/2' in urls

    def test_exits_1_when_no_entries(self, tmp_path):
        """Test that script exits with code 1 when feed has no entries."""
        # Valid RSS shell with a channel but zero <item> elements.
        input_file = tmp_path / 'empty.rss'
        input_file.write_text('''<?xml version="1.0"?>
<rss version="2.0">
<channel>
<title>Empty Feed</title>
</channel>
</rss>
''')

        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )

        assert result.returncode == 1
        assert 'No entries found' in result.stderr

    def test_exits_1_when_file_not_found(self, tmp_path):
        """Test that script exits with code 1 when file doesn't exist."""
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', 'file:///nonexistent/feed.rss'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )

        assert result.returncode == 1
        assert 'Failed to fetch' in result.stderr

    def test_handles_html_entities_in_urls(self, tmp_path):
        """Test that HTML entities in URLs are decoded."""
        # NOTE(review): the fixture link below likely contained '&amp;' in the
        # original source (this view appears to have HTML-decoded it) — the
        # assertion expects the decoded form either way; confirm against repo.
        input_file = tmp_path / 'feed.rss'
        input_file.write_text('''<?xml version="1.0"?>
<rss version="2.0">
<channel>
<item>
<title>Entity Test</title>
<link>https://example.com/page?a=1&b=2</link>
</item>
</channel>
</rss>
''')

        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )

        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        assert entry['url'] == 'https://example.com/page?a=1&b=2'

    def test_includes_optional_metadata(self, tmp_path):
        """Test that title and timestamp are included when present."""
        input_file = tmp_path / 'feed.rss'
        input_file.write_text('''<?xml version="1.0"?>
<rss version="2.0">
<channel>
<item>
<title>Test Title</title>
<link>https://example.com/test</link>
<pubDate>Wed, 15 Jan 2020 10:30:00 GMT</pubDate>
</item>
</channel>
</rss>
''')

        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )

        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        assert entry['url'] == 'https://example.com/test'
        assert entry['title'] == 'Test Title'
        # Parser converts timestamp to bookmarked_at
        assert 'bookmarked_at' in entry
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__, '-v'])
|
||||
Reference in New Issue
Block a user