mirror of https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 23:37:58 +10:00

wip major changes

This commit is contained in:
  archivebox/plugins/parse_netscape_urls/on_Snapshot__63_parse_netscape_urls.py (+116, new executable file)
@@ -0,0 +1,116 @@
#!/usr/bin/env python3
"""
|
||||
Parse Netscape bookmark HTML files and extract URLs.
|
||||
|
||||
This is a standalone extractor that can run without ArchiveBox.
|
||||
It reads Netscape-format bookmark exports (produced by all major browsers).
|
||||
|
||||
Usage: ./on_Snapshot__53_parse_netscape_urls.py --url=<url>
|
||||
Output: Appends discovered URLs to urls.jsonl in current directory
|
||||
|
||||
Examples:
|
||||
./on_Snapshot__53_parse_netscape_urls.py --url=file:///path/to/bookmarks.html
|
||||
"""
|
||||
|
||||
import json
import os
import re
import sys
from datetime import datetime, timezone
from html import unescape
from urllib.parse import urlparse

import rich_click as click


EXTRACTOR_NAME = 'parse_netscape_urls'


# Regex pattern for Netscape bookmark format
# Example: <DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" TAGS="tag1,tag2">example title</A>
NETSCAPE_PATTERN = re.compile(
    r'<a\s+href="([^"]+)"\s+add_date="(\d+)"(?:\s+[^>]*?tags="([^"]*)")?[^>]*>([^<]+)</a>',
    re.UNICODE | re.IGNORECASE
)
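# Note: the pattern requires an ADD_DATE attribute and treats TAGS as optional;
# main() below scans the input line-by-line, so an <A> tag split across
# multiple lines will not match.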


def fetch_content(url: str) -> str:
    """Fetch content from a URL (supports file:// and https://)."""
    parsed = urlparse(url)

    if parsed.scheme == 'file':
        file_path = parsed.path
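        # parsed.path is used as-is, which assumes a POSIX-style path;
        # Windows file:///C:/... URLs would need extra handling here.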
        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
            return f.read()
    else:
        timeout = int(os.environ.get('TIMEOUT', '60'))
        user_agent = os.environ.get('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')

        import urllib.request
        req = urllib.request.Request(url, headers={'User-Agent': user_agent})
        with urllib.request.urlopen(req, timeout=timeout) as response:
            return response.read().decode('utf-8', errors='replace')


@click.command()
@click.option('--url', required=True, help='Netscape bookmark file URL to parse')
def main(url: str):
    """Parse Netscape bookmark HTML and extract URLs."""

    try:
        content = fetch_content(url)
    except Exception as e:
        click.echo(f'Failed to fetch {url}: {e}', err=True)
        sys.exit(1)

    urls_found = []
    all_tags = set()

    for line in content.splitlines():
        match = NETSCAPE_PATTERN.search(line)
        if match:
            bookmark_url = match.group(1)
            tags_str = match.group(3) or ''
            title = match.group(4).strip()

            entry = {
                'type': 'Snapshot',
                'url': unescape(bookmark_url),
                'via_extractor': EXTRACTOR_NAME,
            }
            if title:
                entry['title'] = unescape(title)
            if tags_str:
                entry['tags'] = tags_str
                # Collect unique tags
                for tag in tags_str.split(','):
                    tag = tag.strip()
                    if tag:
                        all_tags.add(tag)
            try:
                # Convert unix timestamp to ISO 8601
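                # e.g. ADD_DATE="1609459200" -> '2021-01-01T00:00:00+00:00'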
                entry['bookmarked_at'] = datetime.fromtimestamp(float(match.group(2)), tz=timezone.utc).isoformat()
            except (ValueError, OSError):
                pass
            urls_found.append(entry)

    if not urls_found:
        click.echo('No bookmarks found', err=True)
        sys.exit(1)

    # Write urls.jsonl
    with open('urls.jsonl', 'w') as f:
        # Write Tag records first
        for tag_name in sorted(all_tags):
            f.write(json.dumps({
                'type': 'Tag',
                'name': tag_name,
            }) + '\n')
        # Write Snapshot records
        for entry in urls_found:
            f.write(json.dumps(entry) + '\n')

    click.echo(f'Found {len(urls_found)} URLs, {len(all_tags)} tags')
    sys.exit(0)


if __name__ == '__main__':
    main()

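As an aside, the regex-driven parsing above is easy to sanity-check in isolation. The following is a minimal standalone sketch (it duplicates NETSCAPE_PATTERN rather than importing the plugin, and is illustrative only, not part of the commit):

#!/usr/bin/env python3
import re

# Same pattern as in the extractor above
NETSCAPE_PATTERN = re.compile(
    r'<a\s+href="([^"]+)"\s+add_date="(\d+)"(?:\s+[^>]*?tags="([^"]*)")?[^>]*>([^<]+)</a>',
    re.UNICODE | re.IGNORECASE,
)

line = '<DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" TAGS="tag1,tag2">example title</A>'
match = NETSCAPE_PATTERN.search(line)
assert match is not None
# Captured groups: url, unix timestamp, optional comma-separated tags, title
assert match.groups() == ('https://example.com/?q=1+2', '1497562974', 'tag1,tag2', 'example title')
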
@@ -0,0 +1,205 @@
#!/usr/bin/env python3
"""Unit tests for parse_netscape_urls extractor."""

import json
import subprocess
import sys
from pathlib import Path

import pytest


PLUGIN_DIR = Path(__file__).parent.parent
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_netscape_urls.py'), None)


class TestParseNetscapeUrls:
    """Test the parse_netscape_urls extractor CLI."""

    def test_extracts_urls_from_netscape_bookmarks(self, tmp_path):
        """Test extracting URLs from Netscape bookmark HTML format."""
        input_file = tmp_path / 'bookmarks.html'
        input_file.write_text('''<!DOCTYPE NETSCAPE-Bookmark-file-1>
<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
<TITLE>Bookmarks</TITLE>
<H1>Bookmarks</H1>
<DL><p>
<DT><A HREF="https://example.com" ADD_DATE="1609459200">Example Site</A>
<DT><A HREF="https://foo.bar/page" ADD_DATE="1609545600">Foo Bar</A>
<DT><A HREF="https://test.org" ADD_DATE="1609632000">Test Org</A>
</DL><p>
''')

        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )

        assert result.returncode == 0
        assert 'Found 3 URLs' in result.stdout

        output_file = tmp_path / 'urls.jsonl'
        assert output_file.exists()

        lines = output_file.read_text().strip().split('\n')
        assert len(lines) == 3

        entries = [json.loads(line) for line in lines]
        urls = {e['url'] for e in entries}
        titles = {e.get('title') for e in entries}

        assert 'https://example.com' in urls
        assert 'https://foo.bar/page' in urls
        assert 'https://test.org' in urls
        assert 'Example Site' in titles
        assert 'Foo Bar' in titles
        assert 'Test Org' in titles
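
        # With the fixture above, each urls.jsonl line is a Snapshot record; none of
        # the bookmarks carry TAGS, so no Tag records are emitted. An illustrative
        # line (key order follows the entry dict in the extractor):
        # {"type": "Snapshot", "url": "https://example.com", "via_extractor": "parse_netscape_urls", "title": "Example Site", "bookmarked_at": "2021-01-01T00:00:00+00:00"}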

    def test_parses_add_date_timestamps(self, tmp_path):
        """Test that ADD_DATE timestamps are parsed correctly."""
        input_file = tmp_path / 'bookmarks.html'
        input_file.write_text('''
<DT><A HREF="https://example.com" ADD_DATE="1609459200">Test</A>
''')

        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )

        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        # Parser converts timestamp to bookmarked_at
        assert 'bookmarked_at' in entry

    def test_handles_query_params_in_urls(self, tmp_path):
        """Test that URLs with query parameters are preserved."""
        input_file = tmp_path / 'bookmarks.html'
        input_file.write_text('''
<DT><A HREF="https://example.com/search?q=test+query&page=1" ADD_DATE="1609459200">Search</A>
''')

        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )

        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        assert 'q=test+query' in entry['url']
        assert 'page=1' in entry['url']

    def test_handles_html_entities(self, tmp_path):
        """Test that HTML entities in URLs and titles are decoded."""
        input_file = tmp_path / 'bookmarks.html'
        input_file.write_text('''
<DT><A HREF="https://example.com/page?a=1&amp;b=2" ADD_DATE="1609459200">Test &amp; Title</A>
''')

        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )

        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        assert entry['url'] == 'https://example.com/page?a=1&b=2'
        assert entry['title'] == 'Test & Title'

    def test_exits_1_when_no_bookmarks_found(self, tmp_path):
        """Test that script exits with code 1 when no bookmarks found."""
        input_file = tmp_path / 'empty.html'
        input_file.write_text('''<!DOCTYPE NETSCAPE-Bookmark-file-1>
<TITLE>Bookmarks</TITLE>
<H1>Bookmarks</H1>
<DL><p>
</DL><p>
''')

        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )

        assert result.returncode == 1
        assert 'No bookmarks found' in result.stderr

    def test_exits_1_when_file_not_found(self, tmp_path):
        """Test that script exits with code 1 when file doesn't exist."""
        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', 'file:///nonexistent/bookmarks.html'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )

        assert result.returncode == 1
        assert 'Failed to fetch' in result.stderr

    def test_handles_nested_folders(self, tmp_path):
        """Test parsing bookmarks in nested folder structure."""
        input_file = tmp_path / 'bookmarks.html'
        input_file.write_text('''<!DOCTYPE NETSCAPE-Bookmark-file-1>
<DL><p>
<DT><H3>Folder 1</H3>
<DL><p>
<DT><A HREF="https://example.com/nested1" ADD_DATE="1609459200">Nested 1</A>
<DT><H3>Subfolder</H3>
<DL><p>
<DT><A HREF="https://example.com/nested2" ADD_DATE="1609459200">Nested 2</A>
</DL><p>
</DL><p>
<DT><A HREF="https://example.com/top" ADD_DATE="1609459200">Top Level</A>
</DL><p>
''')

        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )

        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text().strip().split('\n')
        urls = {json.loads(line)['url'] for line in lines}

        assert 'https://example.com/nested1' in urls
        assert 'https://example.com/nested2' in urls
        assert 'https://example.com/top' in urls

    def test_case_insensitive_parsing(self, tmp_path):
        """Test that parsing is case-insensitive for HTML tags."""
        input_file = tmp_path / 'bookmarks.html'
        input_file.write_text('''
<dt><a HREF="https://example.com" ADD_DATE="1609459200">Test</a>
''')

        result = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )

        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        assert entry['url'] == 'https://example.com'


if __name__ == '__main__':
    pytest.main([__file__, '-v'])
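
Since the module calls pytest.main() under __main__, the suite can be run through normal pytest discovery or invoked directly. The exact test file path is not shown in this diff, so the name below is hypothetical (PLUGIN_DIR = Path(__file__).parent.parent implies the file sits one directory below the plugin dir):

    python archivebox/plugins/parse_netscape_urls/tests/test_parse_netscape_urls.py -v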