WIP: major changes — add standalone parse_jsonl_urls extractor plugin and its test suite

This commit is contained in:
Nick Sweeting
2025-12-24 20:09:51 -08:00
parent c1335fed37
commit 1915333b81
450 changed files with 35814 additions and 19015 deletions

View File

@@ -0,0 +1,184 @@
#!/usr/bin/env python3
"""
Parse JSONL bookmark files and extract URLs.
This is a standalone extractor that can run without ArchiveBox.
It reads JSONL-format bookmark exports (one JSON object per line).
Usage: ./on_Snapshot__54_parse_jsonl_urls.py --url=<url>
Output: Appends discovered URLs to urls.jsonl in current directory
Expected JSONL format (one object per line):
{"url": "https://example.com", "title": "Example", "tags": "tag1,tag2"}
{"href": "https://other.com", "description": "Other Site"}
Supports various field names for URL, title, timestamp, and tags.
"""
import json
import os
import sys
from datetime import datetime
from html import unescape
from urllib.parse import urlparse
import rich_click as click
# Identifier stamped into every output record's `via_extractor` field
EXTRACTOR_NAME = 'parse_jsonl_urls'
def parse_bookmarked_at(link: dict) -> str | None:
"""Parse timestamp from various JSON formats, return ISO 8601."""
from datetime import timezone
def json_date(s: str) -> datetime:
# Try ISO 8601 format
return datetime.strptime(s.split(',', 1)[0], '%Y-%m-%dT%H:%M:%S%z')
def to_iso(dt: datetime) -> str:
if dt.tzinfo is None:
dt = dt.replace(tzinfo=timezone.utc)
return dt.isoformat()
try:
if link.get('bookmarked_at'):
# Already in our format, pass through
return link['bookmarked_at']
elif link.get('timestamp'):
# Chrome/Firefox histories use microseconds
return to_iso(datetime.fromtimestamp(link['timestamp'] / 1000000, tz=timezone.utc))
elif link.get('time'):
return to_iso(json_date(link['time']))
elif link.get('created_at'):
return to_iso(json_date(link['created_at']))
elif link.get('created'):
return to_iso(json_date(link['created']))
elif link.get('date'):
return to_iso(json_date(link['date']))
elif link.get('bookmarked'):
return to_iso(json_date(link['bookmarked']))
elif link.get('saved'):
return to_iso(json_date(link['saved']))
except (ValueError, TypeError, KeyError):
pass
return None
def json_object_to_entry(link: dict) -> dict | None:
    """Convert a JSON bookmark object to a URL entry.

    Returns None when the object carries no usable URL. Non-string values
    in any field are tolerated (skipped or coerced) instead of raising, so
    a single malformed-but-valid-JSON record cannot abort the whole run —
    the caller only catches JSONDecodeError, not AttributeError/TypeError.
    """
    # Parse URL (try various field names); reject non-string values,
    # which would otherwise crash unescape() below.
    url = link.get('href') or link.get('url') or link.get('URL')
    if not url or not isinstance(url, str):
        return None
    entry = {
        'type': 'Snapshot',
        'url': unescape(url),
        'via_extractor': EXTRACTOR_NAME,
    }
    # Parse title from the first non-empty string field available
    title = None
    if link.get('title') and isinstance(link['title'], str):
        title = link['title'].strip()
    elif link.get('description') and isinstance(link['description'], str):
        # Strip the suffix some Readability exports append to descriptions
        title = link['description'].replace(' — Readability', '').strip()
    elif link.get('name') and isinstance(link['name'], str):
        title = link['name'].strip()
    if title:
        entry['title'] = unescape(title)
    # Parse bookmarked_at (ISO 8601)
    bookmarked_at = parse_bookmarked_at(link)
    if bookmarked_at:
        entry['bookmarked_at'] = bookmarked_at
    # Parse tags: accept a list (items coerced to str) or a string;
    # any other type is ignored rather than crashing.
    tags = link.get('tags', '')
    if isinstance(tags, list):
        tags = ','.join(str(t) for t in tags)
    elif isinstance(tags, str) and ',' not in tags and tags:
        # If no comma, assume space-separated
        tags = tags.replace(' ', ',')
    elif not isinstance(tags, str):
        tags = ''
    if tags:
        entry['tags'] = unescape(tags)
    return entry
def fetch_content(url: str) -> str:
    """Fetch text content from a URL (supports file:// and http(s)://).

    Raises OSError/URLError on unreadable files or unreachable hosts;
    callers are expected to handle failures.
    """
    parsed = urlparse(url)
    if parsed.scheme == 'file':
        # file:// URL paths are percent-encoded (spaces as %20, etc.) —
        # decode before touching the filesystem, otherwise valid files
        # with special characters in their names fail to open.
        from urllib.parse import unquote
        file_path = unquote(parsed.path)
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            return f.read()
    else:
        # Honor the same environment knobs used elsewhere in the project.
        timeout = int(os.environ.get('TIMEOUT', '60'))
        user_agent = os.environ.get('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
        import urllib.request
        req = urllib.request.Request(url, headers={'User-Agent': user_agent})
        with urllib.request.urlopen(req, timeout=timeout) as response:
            return response.read().decode('utf-8', errors='replace')
@click.command()
@click.option('--url', required=True, help='JSONL file URL to parse')
def main(url: str):
    """Parse JSONL bookmark file and extract URLs."""
    try:
        content = fetch_content(url)
    except Exception as e:
        click.echo(f'Failed to fetch {url}: {e}', err=True)
        sys.exit(1)

    # One JSON object per line; malformed or URL-less lines are ignored.
    entries = []
    for raw_line in content.splitlines():
        raw_line = raw_line.strip()
        if not raw_line:
            continue
        try:
            record = json.loads(raw_line)
        except json.JSONDecodeError:
            # Skip malformed lines
            continue
        parsed = json_object_to_entry(record)
        if parsed:
            entries.append(parsed)

    if not entries:
        click.echo('No URLs found', err=True)
        sys.exit(1)

    # Collect every distinct tag name across all entries
    tag_names = {
        tag.strip()
        for entry in entries
        for tag in entry.get('tags', '').split(',')
        if tag.strip()
    }

    # Write urls.jsonl: Tag records first, then Snapshot records
    with open('urls.jsonl', 'w') as f:
        for tag_name in sorted(tag_names):
            f.write(json.dumps({'type': 'Tag', 'name': tag_name}) + '\n')
        for entry in entries:
            f.write(json.dumps(entry) + '\n')

    click.echo(f'Found {len(entries)} URLs, {len(tag_names)} tags')
    sys.exit(0)
# Standalone CLI entrypoint — the script is executed directly, not imported.
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,272 @@
#!/usr/bin/env python3
"""Unit tests for parse_jsonl_urls extractor."""
import json
import subprocess
import sys
from pathlib import Path
import pytest
# Locate the extractor script relative to this test file; the glob
# tolerates the numeric priority prefix in the filename
# (on_Snapshot__NN_parse_jsonl_urls.py). None if the script is missing.
PLUGIN_DIR = Path(__file__).parent.parent
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_jsonl_urls.py'), None)
class TestParseJsonlUrls:
    """Exercise the parse_jsonl_urls extractor through its CLI."""

    def _run(self, workdir, src):
        # Launch the extractor as a subprocess, the same way ArchiveBox would.
        return subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{src}'],
            cwd=workdir,
            capture_output=True,
            text=True,
        )

    def _output_lines(self, workdir):
        # Lines of the urls.jsonl the extractor wrote into its cwd.
        return (workdir / 'urls.jsonl').read_text().strip().split('\n')

    def _single_entry(self, workdir):
        # Parse a urls.jsonl expected to contain exactly one record.
        return json.loads((workdir / 'urls.jsonl').read_text().strip())

    def test_extracts_urls_from_jsonl(self, tmp_path):
        """Three well-formed records all come back as output entries."""
        src = tmp_path / 'bookmarks.jsonl'
        src.write_text(
            '{"url": "https://example.com", "title": "Example"}\n'
            '{"url": "https://foo.bar/page", "title": "Foo Bar"}\n'
            '{"url": "https://test.org", "title": "Test Org"}\n'
        )
        proc = self._run(tmp_path, src)
        assert proc.returncode == 0
        assert 'Found 3 URLs' in proc.stdout
        assert (tmp_path / 'urls.jsonl').exists()
        lines = self._output_lines(tmp_path)
        assert len(lines) == 3
        records = [json.loads(line) for line in lines]
        seen_urls = {r['url'] for r in records}
        seen_titles = {r.get('title') for r in records}
        assert 'https://example.com' in seen_urls
        assert 'https://foo.bar/page' in seen_urls
        assert 'https://test.org' in seen_urls
        assert 'Example' in seen_titles
        assert 'Foo Bar' in seen_titles
        assert 'Test Org' in seen_titles

    def test_supports_href_field(self, tmp_path):
        """An 'href' key is accepted as the URL source."""
        src = tmp_path / 'bookmarks.jsonl'
        src.write_text('{"href": "https://example.com", "title": "Test"}\n')
        assert self._run(tmp_path, src).returncode == 0
        assert self._single_entry(tmp_path)['url'] == 'https://example.com'

    def test_supports_description_as_title(self, tmp_path):
        """'description' is used as the title when 'title' is absent."""
        src = tmp_path / 'bookmarks.jsonl'
        src.write_text('{"url": "https://example.com", "description": "A description"}\n')
        assert self._run(tmp_path, src).returncode == 0
        assert self._single_entry(tmp_path)['title'] == 'A description'

    def test_parses_various_timestamp_formats(self, tmp_path):
        """A numeric 'timestamp' (epoch microseconds) becomes bookmarked_at."""
        src = tmp_path / 'bookmarks.jsonl'
        src.write_text('{"url": "https://example.com", "timestamp": 1609459200000000}\n')
        assert self._run(tmp_path, src).returncode == 0
        assert 'bookmarked_at' in self._single_entry(tmp_path)

    def test_parses_tags_as_string(self, tmp_path):
        """Comma-separated tag strings produce tag output."""
        src = tmp_path / 'bookmarks.jsonl'
        src.write_text('{"url": "https://example.com", "tags": "tech,news,reading"}\n')
        assert self._run(tmp_path, src).returncode == 0
        body = (tmp_path / 'urls.jsonl').read_text()
        assert 'tech' in body or 'news' in body or 'Tag' in body

    def test_parses_tags_as_list(self, tmp_path):
        """Tags given as a JSON array produce tag output."""
        src = tmp_path / 'bookmarks.jsonl'
        src.write_text('{"url": "https://example.com", "tags": ["tech", "news"]}\n')
        assert self._run(tmp_path, src).returncode == 0
        body = (tmp_path / 'urls.jsonl').read_text()
        assert 'tech' in body or 'news' in body or 'Tag' in body

    def test_skips_malformed_lines(self, tmp_path):
        """Lines that are not valid JSON are silently skipped."""
        src = tmp_path / 'bookmarks.jsonl'
        src.write_text(
            '{"url": "https://valid.com"}\n'
            'not valid json\n'
            '{"url": "https://also-valid.com"}\n'
        )
        assert self._run(tmp_path, src).returncode == 0
        assert len(self._output_lines(tmp_path)) == 2

    def test_skips_entries_without_url(self, tmp_path):
        """Valid JSON objects lacking any URL field are skipped."""
        src = tmp_path / 'bookmarks.jsonl'
        src.write_text(
            '{"url": "https://valid.com"}\n'
            '{"title": "No URL here"}\n'
            '{"url": "https://also-valid.com"}\n'
        )
        assert self._run(tmp_path, src).returncode == 0
        assert len(self._output_lines(tmp_path)) == 2

    def test_exits_1_when_no_urls_found(self, tmp_path):
        """Exit code 1 plus a stderr message when nothing usable is found."""
        src = tmp_path / 'empty.jsonl'
        src.write_text('{"title": "No URL"}\n')
        proc = self._run(tmp_path, src)
        assert proc.returncode == 1
        assert 'No URLs found' in proc.stderr

    def test_exits_1_when_file_not_found(self, tmp_path):
        """Exit code 1 when the source file does not exist."""
        proc = self._run(tmp_path, '/nonexistent/bookmarks.jsonl')
        assert proc.returncode == 1
        assert 'Failed to fetch' in proc.stderr

    def test_handles_html_entities(self, tmp_path):
        """HTML entities in URLs and titles are decoded on output."""
        src = tmp_path / 'bookmarks.jsonl'
        src.write_text('{"url": "https://example.com/page?a=1&amp;b=2", "title": "Test &amp; Title"}\n')
        assert self._run(tmp_path, src).returncode == 0
        record = self._single_entry(tmp_path)
        assert record['url'] == 'https://example.com/page?a=1&b=2'
        assert record['title'] == 'Test & Title'

    def test_skips_empty_lines(self, tmp_path):
        """Blank and whitespace-only lines are ignored."""
        src = tmp_path / 'bookmarks.jsonl'
        src.write_text(
            '{"url": "https://example.com"}\n'
            '\n'
            ' \n'
            '{"url": "https://other.com"}\n'
        )
        assert self._run(tmp_path, src).returncode == 0
        assert len(self._output_lines(tmp_path)) == 2

    def test_output_includes_required_fields(self, tmp_path):
        """Every output record carries type / url / via_extractor."""
        src = tmp_path / 'bookmarks.jsonl'
        src.write_text('{"url": "https://example.com"}\n')
        assert self._run(tmp_path, src).returncode == 0
        record = self._single_entry(tmp_path)
        assert record['url'] == 'https://example.com'
        assert 'type' in record
        assert 'via_extractor' in record
# Allow running this test module directly without a pytest invocation.
if __name__ == '__main__':
    pytest.main([__file__, '-v'])