mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 07:17:52 +10:00
wip major changes
This commit is contained in:
184
archivebox/plugins/parse_jsonl_urls/on_Snapshot__64_parse_jsonl_urls.py
Executable file
184
archivebox/plugins/parse_jsonl_urls/on_Snapshot__64_parse_jsonl_urls.py
Executable file
@@ -0,0 +1,184 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Parse JSONL bookmark files and extract URLs.
|
||||
|
||||
This is a standalone extractor that can run without ArchiveBox.
|
||||
It reads JSONL-format bookmark exports (one JSON object per line).
|
||||
|
||||
Usage: ./on_Snapshot__64_parse_jsonl_urls.py --url=<url>
|
||||
Output: Appends discovered URLs to urls.jsonl in current directory
|
||||
|
||||
Expected JSONL format (one object per line):
|
||||
{"url": "https://example.com", "title": "Example", "tags": "tag1,tag2"}
|
||||
{"href": "https://other.com", "description": "Other Site"}
|
||||
|
||||
Supports various field names for URL, title, timestamp, and tags.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from html import unescape
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import rich_click as click
|
||||
|
||||
# Identifier recorded in each output entry's 'via_extractor' field.
EXTRACTOR_NAME = 'parse_jsonl_urls'
def parse_bookmarked_at(link: dict) -> str | None:
|
||||
"""Parse timestamp from various JSON formats, return ISO 8601."""
|
||||
from datetime import timezone
|
||||
|
||||
def json_date(s: str) -> datetime:
|
||||
# Try ISO 8601 format
|
||||
return datetime.strptime(s.split(',', 1)[0], '%Y-%m-%dT%H:%M:%S%z')
|
||||
|
||||
def to_iso(dt: datetime) -> str:
|
||||
if dt.tzinfo is None:
|
||||
dt = dt.replace(tzinfo=timezone.utc)
|
||||
return dt.isoformat()
|
||||
|
||||
try:
|
||||
if link.get('bookmarked_at'):
|
||||
# Already in our format, pass through
|
||||
return link['bookmarked_at']
|
||||
elif link.get('timestamp'):
|
||||
# Chrome/Firefox histories use microseconds
|
||||
return to_iso(datetime.fromtimestamp(link['timestamp'] / 1000000, tz=timezone.utc))
|
||||
elif link.get('time'):
|
||||
return to_iso(json_date(link['time']))
|
||||
elif link.get('created_at'):
|
||||
return to_iso(json_date(link['created_at']))
|
||||
elif link.get('created'):
|
||||
return to_iso(json_date(link['created']))
|
||||
elif link.get('date'):
|
||||
return to_iso(json_date(link['date']))
|
||||
elif link.get('bookmarked'):
|
||||
return to_iso(json_date(link['bookmarked']))
|
||||
elif link.get('saved'):
|
||||
return to_iso(json_date(link['saved']))
|
||||
except (ValueError, TypeError, KeyError):
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def json_object_to_entry(link: dict) -> dict | None:
    """Convert one parsed JSON bookmark object into a Snapshot entry dict.

    Args:
        link: One parsed JSONL bookmark object.

    Returns:
        A dict with 'type', 'url', 'via_extractor' and optional 'title',
        'bookmarked_at', 'tags' keys, or None when no URL field is present.
    """
    # The URL may live under several different keys depending on the exporter.
    url = link.get('href') or link.get('url') or link.get('URL')
    if not url:
        return None

    entry = {
        'type': 'Snapshot',
        'url': unescape(url),
        'via_extractor': EXTRACTOR_NAME,
    }

    # Pick the first title-like field, in priority order.
    if link.get('title'):
        raw_title = link['title'].strip()
    elif link.get('description'):
        raw_title = link['description'].replace(' — Readability', '').strip()
    elif link.get('name'):
        raw_title = link['name'].strip()
    else:
        raw_title = None
    if raw_title:
        entry['title'] = unescape(raw_title)

    # Attach the bookmark timestamp (ISO 8601) when one can be parsed.
    saved_at = parse_bookmarked_at(link)
    if saved_at:
        entry['bookmarked_at'] = saved_at

    # Normalize tags to a single comma-separated string.
    raw_tags = link.get('tags', '')
    if isinstance(raw_tags, list):
        raw_tags = ','.join(raw_tags)
    elif isinstance(raw_tags, str) and raw_tags and ',' not in raw_tags:
        # A bare string without commas is treated as space-separated tags.
        raw_tags = raw_tags.replace(' ', ',')
    if raw_tags:
        entry['tags'] = unescape(raw_tags)

    return entry
def fetch_content(url: str) -> str:
    """Return the text content at *url* (supports file:// and http(s):// URLs).

    Args:
        url: The source URL; file:// paths are read from disk, anything else
            is fetched over the network via urllib.

    Returns:
        The content decoded as UTF-8, with undecodable bytes replaced.
    """
    parsed = urlparse(url)

    if parsed.scheme == 'file':
        # Local file: read directly from the path component.
        with open(parsed.path, 'r', encoding='utf-8', errors='replace') as fh:
            return fh.read()

    # Remote fetch: TIMEOUT and USER_AGENT env vars override the defaults.
    timeout = int(os.environ.get('TIMEOUT', '60'))
    user_agent = os.environ.get('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')

    import urllib.request
    request = urllib.request.Request(url, headers={'User-Agent': user_agent})
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.read().decode('utf-8', errors='replace')
@click.command()
@click.option('--url', required=True, help='JSONL file URL to parse')
def main(url: str):
    """Parse JSONL bookmark file and extract URLs."""

    # Fetch the source document; any failure is fatal.
    try:
        content = fetch_content(url)
    except Exception as e:
        click.echo(f'Failed to fetch {url}: {e}', err=True)
        sys.exit(1)

    # Parse one JSON object per non-empty line; malformed lines are skipped.
    entries = []
    for raw_line in content.splitlines():
        raw_line = raw_line.strip()
        if not raw_line:
            continue

        try:
            record = json.loads(raw_line)
            parsed_entry = json_object_to_entry(record)
            if parsed_entry:
                entries.append(parsed_entry)
        except json.JSONDecodeError:
            # Skip malformed lines
            continue

    if not entries:
        click.echo('No URLs found', err=True)
        sys.exit(1)

    # Gather the unique, stripped tag names across all entries.
    all_tags = {
        tag.strip()
        for parsed_entry in entries
        for tag in parsed_entry.get('tags', '').split(',')
        if tag.strip()
    }

    # Emit urls.jsonl: Tag records first, then Snapshot records.
    with open('urls.jsonl', 'w') as out:
        for tag_name in sorted(all_tags):
            out.write(json.dumps({
                'type': 'Tag',
                'name': tag_name,
            }) + '\n')
        for parsed_entry in entries:
            out.write(json.dumps(parsed_entry) + '\n')

    click.echo(f'Found {len(entries)} URLs, {len(all_tags)} tags')
    sys.exit(0)


if __name__ == '__main__':
    main()
@@ -0,0 +1,272 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Unit tests for parse_jsonl_urls extractor."""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
# Locate the extractor script relative to this test file. The numeric
# priority prefix in the filename may change, so glob for it instead of
# hard-coding the exact name; None if no matching script exists.
PLUGIN_DIR = Path(__file__).parent.parent
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_jsonl_urls.py'), None)
class TestParseJsonlUrls:
    """Test the parse_jsonl_urls extractor CLI.

    The extractor is exercised end-to-end as a subprocess: each test writes a
    JSONL fixture, runs the script with --url=file://..., and inspects the
    urls.jsonl it writes into the working directory.
    """

    # --- helpers (the subprocess invocation was previously duplicated in
    # --- every test; factored out here) ---------------------------------

    def _run(self, tmp_path, source_url):
        """Run the extractor CLI on *source_url* with cwd=tmp_path."""
        return subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', source_url],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )

    def _run_on_jsonl(self, tmp_path, jsonl_text, filename='bookmarks.jsonl'):
        """Write *jsonl_text* to tmp_path/filename and run the extractor on it."""
        input_file = tmp_path / filename
        input_file.write_text(jsonl_text)
        return self._run(tmp_path, f'file://{input_file}')

    # --- tests ----------------------------------------------------------

    def test_extracts_urls_from_jsonl(self, tmp_path):
        """Test extracting URLs from JSONL bookmark file."""
        result = self._run_on_jsonl(
            tmp_path,
            '{"url": "https://example.com", "title": "Example"}\n'
            '{"url": "https://foo.bar/page", "title": "Foo Bar"}\n'
            '{"url": "https://test.org", "title": "Test Org"}\n',
        )

        assert result.returncode == 0
        assert 'Found 3 URLs' in result.stdout

        output_file = tmp_path / 'urls.jsonl'
        assert output_file.exists()

        lines = output_file.read_text().strip().split('\n')
        assert len(lines) == 3

        entries = [json.loads(line) for line in lines]
        urls = {e['url'] for e in entries}
        titles = {e.get('title') for e in entries}

        assert 'https://example.com' in urls
        assert 'https://foo.bar/page' in urls
        assert 'https://test.org' in urls
        assert 'Example' in titles
        assert 'Foo Bar' in titles
        assert 'Test Org' in titles

    def test_supports_href_field(self, tmp_path):
        """Test that 'href' field is recognized as URL."""
        result = self._run_on_jsonl(tmp_path, '{"href": "https://example.com", "title": "Test"}\n')

        assert result.returncode == 0
        entry = json.loads((tmp_path / 'urls.jsonl').read_text().strip())
        assert entry['url'] == 'https://example.com'

    def test_supports_description_as_title(self, tmp_path):
        """Test that 'description' field is used as title fallback."""
        result = self._run_on_jsonl(
            tmp_path,
            '{"url": "https://example.com", "description": "A description"}\n',
        )

        assert result.returncode == 0
        entry = json.loads((tmp_path / 'urls.jsonl').read_text().strip())
        assert entry['title'] == 'A description'

    def test_parses_various_timestamp_formats(self, tmp_path):
        """Test parsing of different timestamp field names."""
        result = self._run_on_jsonl(
            tmp_path,
            '{"url": "https://example.com", "timestamp": 1609459200000000}\n',
        )

        assert result.returncode == 0
        entry = json.loads((tmp_path / 'urls.jsonl').read_text().strip())
        # Parser converts timestamp to bookmarked_at
        assert 'bookmarked_at' in entry

    def test_parses_tags_as_string(self, tmp_path):
        """Test parsing tags as comma-separated string."""
        result = self._run_on_jsonl(
            tmp_path,
            '{"url": "https://example.com", "tags": "tech,news,reading"}\n',
        )

        assert result.returncode == 0
        # Parser converts tags to separate Tag objects in the output
        content = (tmp_path / 'urls.jsonl').read_text()
        assert 'tech' in content or 'news' in content or 'Tag' in content

    def test_parses_tags_as_list(self, tmp_path):
        """Test parsing tags as JSON array."""
        result = self._run_on_jsonl(
            tmp_path,
            '{"url": "https://example.com", "tags": ["tech", "news"]}\n',
        )

        assert result.returncode == 0
        # Parser converts tags to separate Tag objects in the output
        content = (tmp_path / 'urls.jsonl').read_text()
        assert 'tech' in content or 'news' in content or 'Tag' in content

    def test_skips_malformed_lines(self, tmp_path):
        """Test that malformed JSON lines are skipped."""
        result = self._run_on_jsonl(
            tmp_path,
            '{"url": "https://valid.com"}\n'
            'not valid json\n'
            '{"url": "https://also-valid.com"}\n',
        )

        assert result.returncode == 0
        lines = (tmp_path / 'urls.jsonl').read_text().strip().split('\n')
        assert len(lines) == 2

    def test_skips_entries_without_url(self, tmp_path):
        """Test that entries without URL field are skipped."""
        result = self._run_on_jsonl(
            tmp_path,
            '{"url": "https://valid.com"}\n'
            '{"title": "No URL here"}\n'
            '{"url": "https://also-valid.com"}\n',
        )

        assert result.returncode == 0
        lines = (tmp_path / 'urls.jsonl').read_text().strip().split('\n')
        assert len(lines) == 2

    def test_exits_1_when_no_urls_found(self, tmp_path):
        """Test that script exits with code 1 when no URLs found."""
        result = self._run_on_jsonl(tmp_path, '{"title": "No URL"}\n', filename='empty.jsonl')

        assert result.returncode == 1
        assert 'No URLs found' in result.stderr

    def test_exits_1_when_file_not_found(self, tmp_path):
        """Test that script exits with code 1 when file doesn't exist."""
        result = self._run(tmp_path, 'file:///nonexistent/bookmarks.jsonl')

        assert result.returncode == 1
        assert 'Failed to fetch' in result.stderr

    def test_handles_html_entities(self, tmp_path):
        """Test that HTML entities in URLs and titles are decoded."""
        result = self._run_on_jsonl(
            tmp_path,
            '{"url": "https://example.com/page?a=1&b=2", "title": "Test & Title"}\n',
        )

        assert result.returncode == 0
        entry = json.loads((tmp_path / 'urls.jsonl').read_text().strip())
        assert entry['url'] == 'https://example.com/page?a=1&b=2'
        assert entry['title'] == 'Test & Title'

    def test_skips_empty_lines(self, tmp_path):
        """Test that empty lines are skipped."""
        result = self._run_on_jsonl(
            tmp_path,
            '{"url": "https://example.com"}\n'
            '\n'
            '   \n'
            '{"url": "https://other.com"}\n',
        )

        assert result.returncode == 0
        lines = (tmp_path / 'urls.jsonl').read_text().strip().split('\n')
        assert len(lines) == 2

    def test_output_includes_required_fields(self, tmp_path):
        """Test that output includes required fields."""
        result = self._run_on_jsonl(tmp_path, '{"url": "https://example.com"}\n')

        assert result.returncode == 0
        entry = json.loads((tmp_path / 'urls.jsonl').read_text().strip())
        assert entry['url'] == 'https://example.com'
        assert 'type' in entry
        assert 'via_extractor' in entry
if __name__ == '__main__':
    # Allow running this test file directly (outside a pytest invocation).
    pytest.main([__file__, '-v'])
Reference in New Issue
Block a user