mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 07:17:52 +10:00
wip major changes
This commit is contained in:
184
archivebox/plugins/parse_jsonl_urls/on_Snapshot__64_parse_jsonl_urls.py
Executable file
184
archivebox/plugins/parse_jsonl_urls/on_Snapshot__64_parse_jsonl_urls.py
Executable file
@@ -0,0 +1,184 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Parse JSONL bookmark files and extract URLs.
|
||||
|
||||
This is a standalone extractor that can run without ArchiveBox.
|
||||
It reads JSONL-format bookmark exports (one JSON object per line).
|
||||
|
||||
Usage: ./on_Snapshot__64_parse_jsonl_urls.py --url=<url>
|
||||
Output: Appends discovered URLs to urls.jsonl in current directory
|
||||
|
||||
Expected JSONL format (one object per line):
|
||||
{"url": "https://example.com", "title": "Example", "tags": "tag1,tag2"}
|
||||
{"href": "https://other.com", "description": "Other Site"}
|
||||
|
||||
Supports various field names for URL, title, timestamp, and tags.
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from html import unescape
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import rich_click as click
|
||||
|
||||
# Identifier recorded in each output entry's 'via_extractor' field.
EXTRACTOR_NAME = 'parse_jsonl_urls'
def parse_bookmarked_at(link: dict) -> str | None:
|
||||
"""Parse timestamp from various JSON formats, return ISO 8601."""
|
||||
from datetime import timezone
|
||||
|
||||
def json_date(s: str) -> datetime:
|
||||
# Try ISO 8601 format
|
||||
return datetime.strptime(s.split(',', 1)[0], '%Y-%m-%dT%H:%M:%S%z')
|
||||
|
||||
def to_iso(dt: datetime) -> str:
|
||||
if dt.tzinfo is None:
|
||||
dt = dt.replace(tzinfo=timezone.utc)
|
||||
return dt.isoformat()
|
||||
|
||||
try:
|
||||
if link.get('bookmarked_at'):
|
||||
# Already in our format, pass through
|
||||
return link['bookmarked_at']
|
||||
elif link.get('timestamp'):
|
||||
# Chrome/Firefox histories use microseconds
|
||||
return to_iso(datetime.fromtimestamp(link['timestamp'] / 1000000, tz=timezone.utc))
|
||||
elif link.get('time'):
|
||||
return to_iso(json_date(link['time']))
|
||||
elif link.get('created_at'):
|
||||
return to_iso(json_date(link['created_at']))
|
||||
elif link.get('created'):
|
||||
return to_iso(json_date(link['created']))
|
||||
elif link.get('date'):
|
||||
return to_iso(json_date(link['date']))
|
||||
elif link.get('bookmarked'):
|
||||
return to_iso(json_date(link['bookmarked']))
|
||||
elif link.get('saved'):
|
||||
return to_iso(json_date(link['saved']))
|
||||
except (ValueError, TypeError, KeyError):
|
||||
pass
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def json_object_to_entry(link: dict) -> dict | None:
    """Convert one parsed JSON bookmark object into a Snapshot entry dict.

    Args:
        link: One parsed JSONL bookmark object.

    Returns:
        A dict with 'type', 'url', 'via_extractor' and optional 'title',
        'bookmarked_at', 'tags' keys, or None when no URL field is present.
    """
    # The URL may live under several different keys depending on the exporter.
    url = link.get('href') or link.get('url') or link.get('URL')
    if not url:
        return None

    entry = {
        'type': 'Snapshot',
        'url': unescape(url),
        'via_extractor': EXTRACTOR_NAME,
    }

    # Pick the first title-like field, in priority order.
    if link.get('title'):
        raw_title = link['title'].strip()
    elif link.get('description'):
        raw_title = link['description'].replace(' — Readability', '').strip()
    elif link.get('name'):
        raw_title = link['name'].strip()
    else:
        raw_title = None
    if raw_title:
        entry['title'] = unescape(raw_title)

    # Attach the bookmark timestamp (ISO 8601) when one can be parsed.
    saved_at = parse_bookmarked_at(link)
    if saved_at:
        entry['bookmarked_at'] = saved_at

    # Normalize tags to a single comma-separated string.
    raw_tags = link.get('tags', '')
    if isinstance(raw_tags, list):
        raw_tags = ','.join(raw_tags)
    elif isinstance(raw_tags, str) and raw_tags and ',' not in raw_tags:
        # A bare string without commas is treated as space-separated tags.
        raw_tags = raw_tags.replace(' ', ',')
    if raw_tags:
        entry['tags'] = unescape(raw_tags)

    return entry
def fetch_content(url: str) -> str:
    """Return the text content at *url* (supports file:// and http(s):// URLs).

    Args:
        url: The source URL; file:// paths are read from disk, anything else
            is fetched over the network via urllib.

    Returns:
        The content decoded as UTF-8, with undecodable bytes replaced.
    """
    parsed = urlparse(url)

    if parsed.scheme == 'file':
        # Local file: read directly from the path component.
        with open(parsed.path, 'r', encoding='utf-8', errors='replace') as fh:
            return fh.read()

    # Remote fetch: TIMEOUT and USER_AGENT env vars override the defaults.
    timeout = int(os.environ.get('TIMEOUT', '60'))
    user_agent = os.environ.get('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')

    import urllib.request
    request = urllib.request.Request(url, headers={'User-Agent': user_agent})
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.read().decode('utf-8', errors='replace')
@click.command()
@click.option('--url', required=True, help='JSONL file URL to parse')
def main(url: str):
    """Parse JSONL bookmark file and extract URLs."""

    # Fetch the source document; any failure is fatal.
    try:
        content = fetch_content(url)
    except Exception as e:
        click.echo(f'Failed to fetch {url}: {e}', err=True)
        sys.exit(1)

    # Parse one JSON object per non-empty line; malformed lines are skipped.
    entries = []
    for raw_line in content.splitlines():
        raw_line = raw_line.strip()
        if not raw_line:
            continue

        try:
            record = json.loads(raw_line)
            parsed_entry = json_object_to_entry(record)
            if parsed_entry:
                entries.append(parsed_entry)
        except json.JSONDecodeError:
            # Skip malformed lines
            continue

    if not entries:
        click.echo('No URLs found', err=True)
        sys.exit(1)

    # Gather the unique, stripped tag names across all entries.
    all_tags = {
        tag.strip()
        for parsed_entry in entries
        for tag in parsed_entry.get('tags', '').split(',')
        if tag.strip()
    }

    # Emit urls.jsonl: Tag records first, then Snapshot records.
    with open('urls.jsonl', 'w') as out:
        for tag_name in sorted(all_tags):
            out.write(json.dumps({
                'type': 'Tag',
                'name': tag_name,
            }) + '\n')
        for parsed_entry in entries:
            out.write(json.dumps(parsed_entry) + '\n')

    click.echo(f'Found {len(entries)} URLs, {len(all_tags)} tags')
    sys.exit(0)


if __name__ == '__main__':
    main()
@@ -0,0 +1,272 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Unit tests for parse_jsonl_urls extractor."""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
# Locate the extractor script relative to this test file. The numeric
# priority prefix in the filename may change, so glob for it instead of
# hard-coding the exact name; None if no matching script exists.
PLUGIN_DIR = Path(__file__).parent.parent
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_jsonl_urls.py'), None)
class TestParseJsonlUrls:
    """Test the parse_jsonl_urls extractor CLI.

    The extractor is exercised end-to-end as a subprocess: each test writes a
    JSONL fixture, runs the script with --url=file://..., and inspects the
    urls.jsonl it writes into the working directory.
    """

    # --- helpers (the subprocess invocation was previously duplicated in
    # --- every test; factored out here) ---------------------------------

    def _run(self, tmp_path, source_url):
        """Run the extractor CLI on *source_url* with cwd=tmp_path."""
        return subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', source_url],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )

    def _run_on_jsonl(self, tmp_path, jsonl_text, filename='bookmarks.jsonl'):
        """Write *jsonl_text* to tmp_path/filename and run the extractor on it."""
        input_file = tmp_path / filename
        input_file.write_text(jsonl_text)
        return self._run(tmp_path, f'file://{input_file}')

    # --- tests ----------------------------------------------------------

    def test_extracts_urls_from_jsonl(self, tmp_path):
        """Test extracting URLs from JSONL bookmark file."""
        result = self._run_on_jsonl(
            tmp_path,
            '{"url": "https://example.com", "title": "Example"}\n'
            '{"url": "https://foo.bar/page", "title": "Foo Bar"}\n'
            '{"url": "https://test.org", "title": "Test Org"}\n',
        )

        assert result.returncode == 0
        assert 'Found 3 URLs' in result.stdout

        output_file = tmp_path / 'urls.jsonl'
        assert output_file.exists()

        lines = output_file.read_text().strip().split('\n')
        assert len(lines) == 3

        entries = [json.loads(line) for line in lines]
        urls = {e['url'] for e in entries}
        titles = {e.get('title') for e in entries}

        assert 'https://example.com' in urls
        assert 'https://foo.bar/page' in urls
        assert 'https://test.org' in urls
        assert 'Example' in titles
        assert 'Foo Bar' in titles
        assert 'Test Org' in titles

    def test_supports_href_field(self, tmp_path):
        """Test that 'href' field is recognized as URL."""
        result = self._run_on_jsonl(tmp_path, '{"href": "https://example.com", "title": "Test"}\n')

        assert result.returncode == 0
        entry = json.loads((tmp_path / 'urls.jsonl').read_text().strip())
        assert entry['url'] == 'https://example.com'

    def test_supports_description_as_title(self, tmp_path):
        """Test that 'description' field is used as title fallback."""
        result = self._run_on_jsonl(
            tmp_path,
            '{"url": "https://example.com", "description": "A description"}\n',
        )

        assert result.returncode == 0
        entry = json.loads((tmp_path / 'urls.jsonl').read_text().strip())
        assert entry['title'] == 'A description'

    def test_parses_various_timestamp_formats(self, tmp_path):
        """Test parsing of different timestamp field names."""
        result = self._run_on_jsonl(
            tmp_path,
            '{"url": "https://example.com", "timestamp": 1609459200000000}\n',
        )

        assert result.returncode == 0
        entry = json.loads((tmp_path / 'urls.jsonl').read_text().strip())
        # Parser converts timestamp to bookmarked_at
        assert 'bookmarked_at' in entry

    def test_parses_tags_as_string(self, tmp_path):
        """Test parsing tags as comma-separated string."""
        result = self._run_on_jsonl(
            tmp_path,
            '{"url": "https://example.com", "tags": "tech,news,reading"}\n',
        )

        assert result.returncode == 0
        # Parser converts tags to separate Tag objects in the output
        content = (tmp_path / 'urls.jsonl').read_text()
        assert 'tech' in content or 'news' in content or 'Tag' in content

    def test_parses_tags_as_list(self, tmp_path):
        """Test parsing tags as JSON array."""
        result = self._run_on_jsonl(
            tmp_path,
            '{"url": "https://example.com", "tags": ["tech", "news"]}\n',
        )

        assert result.returncode == 0
        # Parser converts tags to separate Tag objects in the output
        content = (tmp_path / 'urls.jsonl').read_text()
        assert 'tech' in content or 'news' in content or 'Tag' in content

    def test_skips_malformed_lines(self, tmp_path):
        """Test that malformed JSON lines are skipped."""
        result = self._run_on_jsonl(
            tmp_path,
            '{"url": "https://valid.com"}\n'
            'not valid json\n'
            '{"url": "https://also-valid.com"}\n',
        )

        assert result.returncode == 0
        lines = (tmp_path / 'urls.jsonl').read_text().strip().split('\n')
        assert len(lines) == 2

    def test_skips_entries_without_url(self, tmp_path):
        """Test that entries without URL field are skipped."""
        result = self._run_on_jsonl(
            tmp_path,
            '{"url": "https://valid.com"}\n'
            '{"title": "No URL here"}\n'
            '{"url": "https://also-valid.com"}\n',
        )

        assert result.returncode == 0
        lines = (tmp_path / 'urls.jsonl').read_text().strip().split('\n')
        assert len(lines) == 2

    def test_exits_1_when_no_urls_found(self, tmp_path):
        """Test that script exits with code 1 when no URLs found."""
        result = self._run_on_jsonl(tmp_path, '{"title": "No URL"}\n', filename='empty.jsonl')

        assert result.returncode == 1
        assert 'No URLs found' in result.stderr

    def test_exits_1_when_file_not_found(self, tmp_path):
        """Test that script exits with code 1 when file doesn't exist."""
        result = self._run(tmp_path, 'file:///nonexistent/bookmarks.jsonl')

        assert result.returncode == 1
        assert 'Failed to fetch' in result.stderr

    def test_handles_html_entities(self, tmp_path):
        """Test that HTML entities in URLs and titles are decoded."""
        result = self._run_on_jsonl(
            tmp_path,
            '{"url": "https://example.com/page?a=1&b=2", "title": "Test & Title"}\n',
        )

        assert result.returncode == 0
        entry = json.loads((tmp_path / 'urls.jsonl').read_text().strip())
        assert entry['url'] == 'https://example.com/page?a=1&b=2'
        assert entry['title'] == 'Test & Title'

    def test_skips_empty_lines(self, tmp_path):
        """Test that empty lines are skipped."""
        result = self._run_on_jsonl(
            tmp_path,
            '{"url": "https://example.com"}\n'
            '\n'
            '   \n'
            '{"url": "https://other.com"}\n',
        )

        assert result.returncode == 0
        lines = (tmp_path / 'urls.jsonl').read_text().strip().split('\n')
        assert len(lines) == 2

    def test_output_includes_required_fields(self, tmp_path):
        """Test that output includes required fields."""
        result = self._run_on_jsonl(tmp_path, '{"url": "https://example.com"}\n')

        assert result.returncode == 0
        entry = json.loads((tmp_path / 'urls.jsonl').read_text().strip())
        assert entry['url'] == 'https://example.com'
        assert 'type' in entry
        assert 'via_extractor' in entry
if __name__ == '__main__':
    # Allow running this test file directly (outside a pytest invocation).
    pytest.main([__file__, '-v'])
Reference in New Issue
Block a user