mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 07:17:52 +10:00
more tests and migrations fixes
This commit is contained in:
@@ -24,14 +24,125 @@ import rich_click as click
|
||||
|
||||
EXTRACTOR_NAME = 'parse_netscape_urls'
|
||||
|
||||
# Constants for timestamp epoch detection
|
||||
UNIX_EPOCH = 0 # 1970-01-01 00:00:00 UTC
|
||||
MAC_COCOA_EPOCH = 978307200 # 2001-01-01 00:00:00 UTC (Mac/Cocoa/NSDate epoch)
|
||||
|
||||
# Reasonable date range for bookmarks (to detect correct epoch/unit)
|
||||
MIN_REASONABLE_YEAR = 1995 # Netscape Navigator era
|
||||
MAX_REASONABLE_YEAR = 2035 # Far enough in future
|
||||
|
||||
# Regex pattern for Netscape bookmark format
|
||||
# Example: <DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" TAGS="tag1,tag2">example title</A>
|
||||
# Make ADD_DATE optional and allow negative numbers
|
||||
NETSCAPE_PATTERN = re.compile(
|
||||
r'<a\s+href="([^"]+)"\s+add_date="(\d+)"(?:\s+[^>]*?tags="([^"]*)")?[^>]*>([^<]+)</a>',
|
||||
r'<a\s+href="([^"]+)"(?:\s+add_date="([^"]*)")?(?:\s+[^>]*?tags="([^"]*)")?[^>]*>([^<]+)</a>',
|
||||
re.UNICODE | re.IGNORECASE
|
||||
)
|
||||
|
||||
|
||||
def parse_timestamp(timestamp_str: str) -> datetime | None:
|
||||
"""
|
||||
Intelligently parse bookmark timestamp with auto-detection of format and epoch.
|
||||
|
||||
Browsers use different timestamp formats:
|
||||
- Firefox: Unix epoch (1970) in seconds (10 digits): 1609459200
|
||||
- Safari: Mac/Cocoa epoch (2001) in seconds (9-10 digits): 631152000
|
||||
- Chrome: Unix epoch in microseconds (16 digits): 1609459200000000
|
||||
- Others: Unix epoch in milliseconds (13 digits): 1609459200000
|
||||
|
||||
Strategy:
|
||||
1. Try parsing with different epoch + unit combinations
|
||||
2. Pick the one that yields a reasonable date (1995-2035)
|
||||
3. Prioritize more common formats (Unix seconds, then Mac seconds, etc.)
|
||||
"""
|
||||
if not timestamp_str or timestamp_str == '':
|
||||
return None
|
||||
|
||||
try:
|
||||
timestamp_num = float(timestamp_str)
|
||||
except (ValueError, TypeError):
|
||||
return None
|
||||
|
||||
# Detect sign and work with absolute value
|
||||
is_negative = timestamp_num < 0
|
||||
abs_timestamp = abs(timestamp_num)
|
||||
|
||||
# Determine number of digits to guess the unit
|
||||
if abs_timestamp == 0:
|
||||
num_digits = 1
|
||||
else:
|
||||
num_digits = len(str(int(abs_timestamp)))
|
||||
|
||||
# Try different interpretations in order of likelihood
|
||||
candidates = []
|
||||
|
||||
# Unix epoch seconds (10-11 digits) - Most common: Firefox, Chrome HTML export
|
||||
if 9 <= num_digits <= 11:
|
||||
try:
|
||||
dt = datetime.fromtimestamp(timestamp_num, tz=timezone.utc)
|
||||
if MIN_REASONABLE_YEAR <= dt.year <= MAX_REASONABLE_YEAR:
|
||||
candidates.append((dt, 'unix_seconds', 100)) # Highest priority
|
||||
except (ValueError, OSError, OverflowError):
|
||||
pass
|
||||
|
||||
# Mac/Cocoa epoch seconds (9-10 digits) - Safari
|
||||
# Only consider if Unix seconds didn't work or gave unreasonable date
|
||||
if 8 <= num_digits <= 11:
|
||||
try:
|
||||
dt = datetime.fromtimestamp(timestamp_num + MAC_COCOA_EPOCH, tz=timezone.utc)
|
||||
if MIN_REASONABLE_YEAR <= dt.year <= MAX_REASONABLE_YEAR:
|
||||
candidates.append((dt, 'mac_seconds', 90))
|
||||
except (ValueError, OSError, OverflowError):
|
||||
pass
|
||||
|
||||
# Unix epoch milliseconds (13 digits) - JavaScript exports
|
||||
if 12 <= num_digits <= 14:
|
||||
try:
|
||||
dt = datetime.fromtimestamp(timestamp_num / 1000, tz=timezone.utc)
|
||||
if MIN_REASONABLE_YEAR <= dt.year <= MAX_REASONABLE_YEAR:
|
||||
candidates.append((dt, 'unix_milliseconds', 95))
|
||||
except (ValueError, OSError, OverflowError):
|
||||
pass
|
||||
|
||||
# Mac/Cocoa epoch milliseconds (12-13 digits) - Rare
|
||||
if 11 <= num_digits <= 14:
|
||||
try:
|
||||
dt = datetime.fromtimestamp((timestamp_num / 1000) + MAC_COCOA_EPOCH, tz=timezone.utc)
|
||||
if MIN_REASONABLE_YEAR <= dt.year <= MAX_REASONABLE_YEAR:
|
||||
candidates.append((dt, 'mac_milliseconds', 85))
|
||||
except (ValueError, OSError, OverflowError):
|
||||
pass
|
||||
|
||||
# Unix epoch microseconds (16-17 digits) - Chrome WebKit timestamps
|
||||
if 15 <= num_digits <= 18:
|
||||
try:
|
||||
dt = datetime.fromtimestamp(timestamp_num / 1_000_000, tz=timezone.utc)
|
||||
if MIN_REASONABLE_YEAR <= dt.year <= MAX_REASONABLE_YEAR:
|
||||
candidates.append((dt, 'unix_microseconds', 98))
|
||||
except (ValueError, OSError, OverflowError):
|
||||
pass
|
||||
|
||||
# Mac/Cocoa epoch microseconds (15-16 digits) - Very rare
|
||||
if 14 <= num_digits <= 18:
|
||||
try:
|
||||
dt = datetime.fromtimestamp((timestamp_num / 1_000_000) + MAC_COCOA_EPOCH, tz=timezone.utc)
|
||||
if MIN_REASONABLE_YEAR <= dt.year <= MAX_REASONABLE_YEAR:
|
||||
candidates.append((dt, 'mac_microseconds', 80))
|
||||
except (ValueError, OSError, OverflowError):
|
||||
pass
|
||||
|
||||
# If no candidates found, return None
|
||||
if not candidates:
|
||||
return None
|
||||
|
||||
# Sort by priority (highest first) and return best match
|
||||
candidates.sort(key=lambda x: x[2], reverse=True)
|
||||
best_dt, best_format, _ = candidates[0]
|
||||
|
||||
return best_dt
|
||||
|
||||
|
||||
def fetch_content(url: str) -> str:
|
||||
"""Fetch content from a URL (supports file:// and https://)."""
|
||||
parsed = urlparse(url)
|
||||
@@ -69,6 +180,7 @@ def main(url: str, snapshot_id: str = None):
|
||||
match = NETSCAPE_PATTERN.search(line)
|
||||
if match:
|
||||
bookmark_url = match.group(1)
|
||||
timestamp_str = match.group(2)
|
||||
tags_str = match.group(3) or ''
|
||||
title = match.group(4).strip()
|
||||
|
||||
@@ -86,11 +198,13 @@ def main(url: str, snapshot_id: str = None):
|
||||
tag = tag.strip()
|
||||
if tag:
|
||||
all_tags.add(tag)
|
||||
try:
|
||||
# Convert unix timestamp to ISO 8601
|
||||
entry['bookmarked_at'] = datetime.fromtimestamp(float(match.group(2)), tz=timezone.utc).isoformat()
|
||||
except (ValueError, OSError):
|
||||
pass
|
||||
|
||||
# Parse timestamp with intelligent format detection
|
||||
if timestamp_str:
|
||||
dt = parse_timestamp(timestamp_str)
|
||||
if dt:
|
||||
entry['bookmarked_at'] = dt.isoformat()
|
||||
|
||||
urls_found.append(entry)
|
||||
|
||||
if not urls_found:
|
||||
|
||||
@@ -0,0 +1,930 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Comprehensive tests for parse_netscape_urls extractor covering various browser formats."""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
# Root directory of the plugin under test (tests live one level below it)
PLUGIN_DIR = Path(__file__).parent.parent
# The extractor script being tested, discovered by its on_Snapshot__* naming
# convention; None if no matching script is found
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_netscape_urls.py'), None)
|
||||
|
||||
|
||||
class TestFirefoxFormat:
    """Exercise the Firefox Netscape bookmark export format."""

    @staticmethod
    def _extract(tmp_path, html):
        """Write *html* as a bookmarks file, run the extractor on it via a
        file:// URL, assert it exits cleanly, and return the parsed
        urls.jsonl records in file order."""
        bookmarks = tmp_path / 'bookmarks.html'
        bookmarks.write_text(html)
        proc = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{bookmarks}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert proc.returncode == 0
        raw = (tmp_path / 'urls.jsonl').read_text().strip()
        return [json.loads(line) for line in raw.split('\n')]

    def test_firefox_basic_format(self, tmp_path):
        """Standard Firefox export with Unix-second timestamps."""
        records = self._extract(tmp_path, '''<!DOCTYPE NETSCAPE-Bookmark-file-1>
<!-- This is an automatically generated file.
It will be read and overwritten.
DO NOT EDIT! -->
<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
<TITLE>Bookmarks</TITLE>
<H1>Bookmarks Menu</H1>
<DL><p>
<DT><A HREF="https://example.com" ADD_DATE="1609459200" LAST_MODIFIED="1609545600">Example Site</A>
<DT><A HREF="https://mozilla.org" ADD_DATE="1640995200">Mozilla</A>
</DL><p>
''')
        assert len(records) == 2
        assert records[0]['url'] == 'https://example.com'
        assert records[0]['title'] == 'Example Site'
        # First bookmark parses as Jan 1, 2021; second as Jan 1, 2022
        assert '2021-01-01' in records[0]['bookmarked_at']
        assert '2022-01-01' in records[1]['bookmarked_at']

    def test_firefox_with_tags(self, tmp_path):
        """Bookmarks carrying TAGS attributes emit Tag and Snapshot records."""
        records = self._extract(tmp_path, '''<!DOCTYPE NETSCAPE-Bookmark-file-1>
<DL><p>
<DT><A HREF="https://example.com" ADD_DATE="1609459200" TAGS="coding,tutorial,python">Python Tutorial</A>
<DT><A HREF="https://rust-lang.org" ADD_DATE="1609459200" TAGS="coding,rust">Rust Lang</A>
</DL><p>
''')
        tags = [r for r in records if r['type'] == 'Tag']
        snapshots = [r for r in records if r['type'] == 'Snapshot']

        tag_names = {t['name'] for t in tags}
        assert 'coding' in tag_names
        assert 'tutorial' in tag_names
        assert 'python' in tag_names
        assert 'rust' in tag_names

        assert snapshots[0]['tags'] == 'coding,tutorial,python'
        assert snapshots[1]['tags'] == 'coding,rust'

    def test_firefox_nested_folders(self, tmp_path):
        """Bookmarks inside nested folder <DL> structures are all collected."""
        records = self._extract(tmp_path, '''<!DOCTYPE NETSCAPE-Bookmark-file-1>
<DL><p>
<DT><H3 ADD_DATE="1609459200" LAST_MODIFIED="1609545600">Toolbar</H3>
<DL><p>
<DT><A HREF="https://github.com" ADD_DATE="1609459200">GitHub</A>
<DT><H3 ADD_DATE="1609459200" LAST_MODIFIED="1609545600">Development</H3>
<DL><p>
<DT><A HREF="https://stackoverflow.com" ADD_DATE="1609459200">Stack Overflow</A>
<DT><A HREF="https://developer.mozilla.org" ADD_DATE="1609459200">MDN</A>
</DL><p>
</DL><p>
<DT><A HREF="https://news.ycombinator.com" ADD_DATE="1609459200">Hacker News</A>
</DL><p>
''')
        found = {r['url'] for r in records}
        assert 'https://github.com' in found
        assert 'https://stackoverflow.com' in found
        assert 'https://developer.mozilla.org' in found
        assert 'https://news.ycombinator.com' in found
        assert len(records) == 4

    def test_firefox_icon_and_icon_uri(self, tmp_path):
        """ICON / ICON_URI attributes do not interfere with URL extraction."""
        records = self._extract(tmp_path, '''<!DOCTYPE NETSCAPE-Bookmark-file-1>
<DL><p>
<DT><A HREF="https://example.com" ADD_DATE="1609459200" ICON="data:image/png;base64,iVBORw0K">Example</A>
<DT><A HREF="https://github.com" ADD_DATE="1609459200" ICON_URI="https://github.com/favicon.ico">GitHub</A>
</DL><p>
''')
        assert records[0]['url'] == 'https://example.com'
        assert records[1]['url'] == 'https://github.com'
||||
|
||||
|
||||
class TestChromeFormat:
    """Exercise Chrome/Chromium Netscape bookmark exports."""

    @staticmethod
    def _extract(tmp_path, html):
        """Run the extractor over *html* and return parsed urls.jsonl records."""
        bookmarks = tmp_path / 'bookmarks.html'
        bookmarks.write_text(html)
        proc = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{bookmarks}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert proc.returncode == 0
        raw = (tmp_path / 'urls.jsonl').read_text().strip()
        return [json.loads(line) for line in raw.split('\n')]

    def test_chrome_microsecond_timestamps(self, tmp_path):
        """Chrome WebKit-style ADD_DATE values in microseconds (16-17 digits)."""
        # 1609459200000000 = Jan 1, 2021 00:00:00 UTC expressed in microseconds
        records = self._extract(tmp_path, '''<!DOCTYPE NETSCAPE-Bookmark-file-1>
<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
<TITLE>Bookmarks</TITLE>
<H1>Bookmarks</H1>
<DL><p>
<DT><A HREF="https://google.com" ADD_DATE="1609459200000000">Google</A>
<DT><A HREF="https://chrome.google.com" ADD_DATE="1640995200000000">Chrome</A>
</DL><p>
''')
        assert records[0]['url'] == 'https://google.com'
        # Microseconds must be detected: the year should be ~2021, not year 52970
        if 'bookmarked_at' in records[0]:
            year = datetime.fromisoformat(records[0]['bookmarked_at']).year
            assert 2020 <= year <= 2025, f"Year should be ~2021, got {year}"

    def test_chrome_with_folders(self, tmp_path):
        """Chrome folder structure (PERSONAL_TOOLBAR_FOLDER etc.) is traversed."""
        records = self._extract(tmp_path, '''<!DOCTYPE NETSCAPE-Bookmark-file-1>
<DL><p>
<DT><H3 ADD_DATE="1609459200" LAST_MODIFIED="1609459200" PERSONAL_TOOLBAR_FOLDER="true">Bookmarks bar</H3>
<DL><p>
<DT><A HREF="https://google.com" ADD_DATE="1609459200">Google</A>
</DL><p>
<DT><H3 ADD_DATE="1609459200" LAST_MODIFIED="1609459200">Other bookmarks</H3>
<DL><p>
<DT><A HREF="https://example.com" ADD_DATE="1609459200">Example</A>
</DL><p>
</DL><p>
''')
        found = {r['url'] for r in records}
        assert 'https://google.com' in found
        assert 'https://example.com' in found
||||
|
||||
|
||||
class TestSafariFormat:
    """Exercise the Safari Netscape bookmark export format."""

    @staticmethod
    def _extract(tmp_path, html):
        """Run the extractor over *html* and return parsed urls.jsonl records."""
        bookmarks = tmp_path / 'bookmarks.html'
        bookmarks.write_text(html)
        proc = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{bookmarks}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert proc.returncode == 0
        raw = (tmp_path / 'urls.jsonl').read_text().strip()
        return [json.loads(line) for line in raw.split('\n')]

    def test_safari_basic_format(self, tmp_path):
        """Safari export format with FOLDED folder headers."""
        records = self._extract(tmp_path, '''<!DOCTYPE NETSCAPE-Bookmark-file-1>
<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
<Title>Bookmarks</Title>
<H1>Bookmarks</H1>
<DL><p>
<DT><H3 FOLDED ADD_DATE="1609459200">BookmarksBar</H3>
<DL><p>
<DT><A HREF="https://apple.com" ADD_DATE="1609459200">Apple</A>
<DT><A HREF="https://webkit.org" ADD_DATE="1609459200">WebKit</A>
</DL><p>
</DL><p>
''')
        found = {r['url'] for r in records}
        assert 'https://apple.com' in found
        assert 'https://webkit.org' in found

    def test_safari_reading_list(self, tmp_path):
        """Safari Reading List entries (with <DD> descriptions) are extracted."""
        records = self._extract(tmp_path, '''<!DOCTYPE NETSCAPE-Bookmark-file-1>
<DL><p>
<DT><H3 FOLDED ADD_DATE="1609459200">com.apple.ReadingList</H3>
<DL><p>
<DT><A HREF="https://article1.com" ADD_DATE="1609459200">Article 1</A>
<DD>Long article to read later
<DT><A HREF="https://article2.com" ADD_DATE="1609545600">Article 2</A>
<DD>Another saved article
</DL><p>
</DL><p>
''')
        found = {r['url'] for r in records}
        assert 'https://article1.com' in found
        assert 'https://article2.com' in found
|
||||
|
||||
|
||||
class TestEdgeFormat:
    """Exercise Edge/IE bookmark export formats."""

    def test_edge_chromium_format(self, tmp_path):
        """Edge (Chromium-based) exports parse like any other Netscape file."""
        bookmarks = tmp_path / 'bookmarks.html'
        bookmarks.write_text('''<!DOCTYPE NETSCAPE-Bookmark-file-1>
<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
<TITLE>Bookmarks</TITLE>
<H1>Bookmarks</H1>
<DL><p>
<DT><A HREF="https://microsoft.com" ADD_DATE="1609459200">Microsoft</A>
<DT><A HREF="https://bing.com" ADD_DATE="1609459200">Bing</A>
</DL><p>
''')
        proc = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{bookmarks}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert proc.returncode == 0
        raw = (tmp_path / 'urls.jsonl').read_text().strip()
        found = {json.loads(line)['url'] for line in raw.split('\n')}
        assert 'https://microsoft.com' in found
        assert 'https://bing.com' in found
|
||||
|
||||
|
||||
class TestTimestampFormats:
    """Exercise timestamp format handling and its edge cases."""

    @staticmethod
    def _extract(tmp_path, html):
        """Run the extractor over *html*, assert a clean exit, and return the
        parsed urls.jsonl records in file order."""
        bookmarks = tmp_path / 'bookmarks.html'
        bookmarks.write_text(html)
        proc = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{bookmarks}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert proc.returncode == 0
        raw = (tmp_path / 'urls.jsonl').read_text().strip()
        return [json.loads(line) for line in raw.split('\n')]

    def test_unix_seconds_timestamp(self, tmp_path):
        """Unix epoch seconds (10-11 digits) - Firefox, Chrome HTML export."""
        # 1609459200 = Jan 1, 2021 00:00:00 UTC (Unix epoch)
        entry, = self._extract(tmp_path, '''
<DT><A HREF="https://example.com" ADD_DATE="1609459200">Test</A>
''')
        dt = datetime.fromisoformat(entry['bookmarked_at'])
        assert (dt.year, dt.month, dt.day) == (2021, 1, 1)

    def test_mac_cocoa_seconds_timestamp(self, tmp_path):
        """Mac/Cocoa epoch seconds - Safari counts from 2001-01-01."""
        # 631152000 seconds after 2001-01-01 = Jan 1, 2021
        # (as Unix seconds it would be Feb 1990 - too old for a recent bookmark)
        entry, = self._extract(tmp_path, '''
<DT><A HREF="https://apple.com" ADD_DATE="631152000">Safari Bookmark</A>
''')
        dt = datetime.fromisoformat(entry['bookmarked_at'])
        assert 2020 <= dt.year <= 2022, f"Expected ~2021, got {dt.year}"

    def test_safari_recent_timestamp(self, tmp_path):
        """A recent Safari (Mac epoch) timestamp resolves to 2024."""
        # 725846400 seconds after 2001-01-01 = Jan 1, 2024
        entry, = self._extract(tmp_path, '''
<DT><A HREF="https://webkit.org" ADD_DATE="725846400">Recent Safari</A>
''')
        dt = datetime.fromisoformat(entry['bookmarked_at'])
        assert 2023 <= dt.year <= 2025, f"Expected ~2024, got {dt.year}"

    def test_unix_milliseconds_timestamp(self, tmp_path):
        """Unix epoch milliseconds (13 digits) - JavaScript-style exports."""
        # 1609459200000 = Jan 1, 2021 00:00:00 UTC in milliseconds
        entry, = self._extract(tmp_path, '''
<DT><A HREF="https://example.com" ADD_DATE="1609459200000">Test</A>
''')
        dt = datetime.fromisoformat(entry['bookmarked_at'])
        assert (dt.year, dt.month, dt.day) == (2021, 1, 1)

    def test_chrome_webkit_microseconds_timestamp(self, tmp_path):
        """Chrome WebKit microseconds (16-17 digits)."""
        # 1609459200000000 = Jan 1, 2021 00:00:00 UTC in microseconds
        entry, = self._extract(tmp_path, '''
<DT><A HREF="https://example.com" ADD_DATE="1609459200000000">Test</A>
''')
        dt = datetime.fromisoformat(entry['bookmarked_at'])
        assert (dt.year, dt.month, dt.day) == (2021, 1, 1)

    def test_mac_cocoa_milliseconds_timestamp(self, tmp_path):
        """Mac/Cocoa epoch in milliseconds (rare but possible)."""
        # 631152000000 milliseconds after 2001-01-01 = Jan 1, 2021
        entry, = self._extract(tmp_path, '''
<DT><A HREF="https://apple.com" ADD_DATE="631152000000">Safari Milliseconds</A>
''')
        dt = datetime.fromisoformat(entry['bookmarked_at'])
        assert 2020 <= dt.year <= 2022, f"Expected ~2021, got {dt.year}"

    def test_ambiguous_timestamp_detection(self, tmp_path):
        """Mixed timestamp formats in one file each resolve to sane dates."""
        records = self._extract(tmp_path, '''
<DT><A HREF="https://unix-seconds.com" ADD_DATE="1609459200">Unix Seconds 2021</A>
<DT><A HREF="https://mac-seconds.com" ADD_DATE="631152000">Mac Seconds 2021</A>
<DT><A HREF="https://unix-ms.com" ADD_DATE="1704067200000">Unix MS 2024</A>
''')
        for rec in records:
            dt = datetime.fromisoformat(rec['bookmarked_at'])
            assert 2020 <= dt.year <= 2025, f"Date {dt.year} out of reasonable range for {rec['url']}"

    def test_very_old_timestamp(self, tmp_path):
        """A 1990s Unix timestamp is accepted (within the reasonable range)."""
        # 820454400 = Jan 1, 1996
        entry, = self._extract(tmp_path, '''
<DT><A HREF="https://example.com" ADD_DATE="820454400">Old Bookmark</A>
''')
        dt = datetime.fromisoformat(entry['bookmarked_at'])
        assert dt.year == 1996

    def test_recent_timestamp(self, tmp_path):
        """A recent (2024) Unix timestamp parses correctly."""
        # 1704067200 = Jan 1, 2024
        entry, = self._extract(tmp_path, '''
<DT><A HREF="https://example.com" ADD_DATE="1704067200">Recent</A>
''')
        dt = datetime.fromisoformat(entry['bookmarked_at'])
        assert dt.year == 2024

    def test_invalid_timestamp(self, tmp_path):
        """A malformed timestamp still yields the URL, minus bookmarked_at."""
        entry, = self._extract(tmp_path, '''
<DT><A HREF="https://example.com" ADD_DATE="invalid">Test</A>
''')
        assert entry['url'] == 'https://example.com'
        assert 'bookmarked_at' not in entry

    def test_zero_timestamp(self, tmp_path):
        """ADD_DATE="0" (1970) predates MIN_REASONABLE_YEAR and is dropped."""
        entry, = self._extract(tmp_path, '''
<DT><A HREF="https://example.com" ADD_DATE="0">Test</A>
''')
        # The URL survives; the out-of-range timestamp is omitted
        assert entry['url'] == 'https://example.com'
        assert 'bookmarked_at' not in entry

    def test_negative_timestamp(self, tmp_path):
        """Negative (pre-Unix-epoch) timestamps are handled gracefully."""
        # -86400 = one day before the Unix epoch = Dec 31, 1969
        entry, = self._extract(tmp_path, '''
<DT><A HREF="https://example.com" ADD_DATE="-86400">Before Unix Epoch</A>
''')
        assert entry['url'] == 'https://example.com'
        # A timestamp, if emitted at all, must land near the epoch (late 1969)
        if 'bookmarked_at' in entry:
            dt = datetime.fromisoformat(entry['bookmarked_at'])
            assert 1969 <= dt.year <= 1970
|
||||
|
||||
|
||||
class TestBookmarkAttributes:
    """Exercise less-common bookmark attributes and metadata."""

    @staticmethod
    def _extract(tmp_path, html):
        """Run the extractor over *html* and return parsed urls.jsonl records."""
        bookmarks = tmp_path / 'bookmarks.html'
        bookmarks.write_text(html)
        proc = subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{bookmarks}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
        )
        assert proc.returncode == 0
        raw = (tmp_path / 'urls.jsonl').read_text().strip()
        return [json.loads(line) for line in raw.split('\n')]

    def test_private_attribute(self, tmp_path):
        """PRIVATE="1" bookmarks are extracted just like public ones."""
        records = self._extract(tmp_path, '''
<DT><A HREF="https://private.example.com" ADD_DATE="1609459200" PRIVATE="1">Private</A>
<DT><A HREF="https://public.example.com" ADD_DATE="1609459200">Public</A>
''')
        # Both the private and the public bookmark must come through
        assert len(records) == 2

    def test_shortcuturl_attribute(self, tmp_path):
        """SHORTCUTURL keyword bookmarks keep their (templated) URL."""
        entry, = self._extract(tmp_path, '''
<DT><A HREF="https://google.com/search?q=%s" ADD_DATE="1609459200" SHORTCUTURL="g">Google Search</A>
''')
        assert 'google.com' in entry['url']

    def test_post_data_attribute(self, tmp_path):
        """POST_DATA attributes do not corrupt the extracted URL."""
        entry, = self._extract(tmp_path, '''
<DT><A HREF="https://example.com/login" ADD_DATE="1609459200" POST_DATA="user=test">Login</A>
''')
        assert entry['url'] == 'https://example.com/login'
|
||||
|
||||
|
||||
class TestEdgeCases:
    """Test edge cases and malformed data."""

    @staticmethod
    def _run_extractor(tmp_path, input_file, timeout=None):
        """Run the extractor script on *input_file* and return the CompletedProcess.

        Shared by every test in this class; *timeout* (seconds) is forwarded to
        subprocess.run (None means no timeout, matching subprocess defaults).
        """
        return subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', f'file://{input_file}'],
            cwd=tmp_path,
            capture_output=True,
            text=True,
            timeout=timeout,
        )

    def test_multiline_bookmark(self, tmp_path):
        """Test bookmark spanning multiple lines."""
        input_file = tmp_path / 'bookmarks.html'
        input_file.write_text('''
<DT><A HREF="https://example.com"
ADD_DATE="1609459200"
TAGS="tag1,tag2">
Multi-line Bookmark
</A>
''')

        result = self._run_extractor(tmp_path, input_file)

        # Current regex works line-by-line, so this might not match.
        # Document current behavior: only assert on output if parsing succeeded.
        if result.returncode == 0:
            output_file = tmp_path / 'urls.jsonl'
            if output_file.exists():
                content = output_file.read_text().strip()
                if content:
                    entry = json.loads(content)
                    assert 'example.com' in entry['url']

    def test_missing_add_date(self, tmp_path):
        """Test bookmark without ADD_DATE attribute - should still extract URL."""
        input_file = tmp_path / 'bookmarks.html'
        input_file.write_text('''
<DT><A HREF="https://example.com">No Date</A>
''')

        result = self._run_extractor(tmp_path, input_file)

        # Should succeed and extract URL without timestamp
        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        assert entry['url'] == 'https://example.com'
        assert entry['title'] == 'No Date'
        assert 'bookmarked_at' not in entry

    def test_empty_title(self, tmp_path):
        """Test bookmark with empty title."""
        input_file = tmp_path / 'bookmarks.html'
        input_file.write_text('''
<DT><A HREF="https://example.com" ADD_DATE="1609459200"></A>
''')

        result = self._run_extractor(tmp_path, input_file)

        # Current regex requires non-empty title [^<]+
        # Document current behavior: no match -> script exits nonzero
        assert result.returncode == 1

    def test_special_chars_in_url(self, tmp_path):
        """Test URLs with special characters."""
        input_file = tmp_path / 'bookmarks.html'
        input_file.write_text('''
<DT><A HREF="https://example.com/path?q=test&foo=bar&baz=qux#section" ADD_DATE="1609459200">Special URL</A>
<DT><A HREF="https://example.com/path%20with%20spaces" ADD_DATE="1609459200">Encoded Spaces</A>
<DT><A HREF="https://example.com/unicode/日本語" ADD_DATE="1609459200">Unicode Path</A>
''')

        result = self._run_extractor(tmp_path, input_file)

        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text().strip().split('\n')
        entries = [json.loads(line) for line in lines]

        assert len(entries) == 3
        assert 'q=test&foo=bar' in entries[0]['url']
        assert '%20' in entries[1]['url']

    def test_javascript_url(self, tmp_path):
        """Test javascript: URLs (should still be extracted)."""
        input_file = tmp_path / 'bookmarks.html'
        input_file.write_text('''
<DT><A HREF="javascript:alert('test')" ADD_DATE="1609459200">JS Bookmarklet</A>
<DT><A HREF="https://example.com" ADD_DATE="1609459200">Normal</A>
''')

        result = self._run_extractor(tmp_path, input_file)

        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text().strip().split('\n')
        entries = [json.loads(line) for line in lines]

        # Both should be extracted
        assert len(entries) == 2
        assert entries[0]['url'].startswith('javascript:')

    def test_data_url(self, tmp_path):
        """Test data: URLs."""
        input_file = tmp_path / 'bookmarks.html'
        input_file.write_text('''
<DT><A HREF="data:text/html,<h1>Test</h1>" ADD_DATE="1609459200">Data URL</A>
''')

        result = self._run_extractor(tmp_path, input_file)

        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())

        assert entry['url'].startswith('data:')

    def test_file_url(self, tmp_path):
        """Test file:// URLs."""
        input_file = tmp_path / 'bookmarks.html'
        input_file.write_text('''
<DT><A HREF="file:///home/user/document.pdf" ADD_DATE="1609459200">Local File</A>
''')

        result = self._run_extractor(tmp_path, input_file)

        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())

        assert entry['url'].startswith('file://')

    def test_very_long_url(self, tmp_path):
        """Test very long URLs (2000+ characters)."""
        long_url = 'https://example.com/path?' + '&'.join([f'param{i}=value{i}' for i in range(100)])
        input_file = tmp_path / 'bookmarks.html'
        input_file.write_text(f'''
<DT><A HREF="{long_url}" ADD_DATE="1609459200">Long URL</A>
''')

        result = self._run_extractor(tmp_path, input_file)

        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())

        assert len(entry['url']) > 1000
        assert entry['url'].startswith('https://example.com')

    def test_unicode_in_title(self, tmp_path):
        """Test Unicode characters in titles."""
        input_file = tmp_path / 'bookmarks.html'
        input_file.write_text('''
<DT><A HREF="https://example.com" ADD_DATE="1609459200">日本語のタイトル</A>
<DT><A HREF="https://example.org" ADD_DATE="1609459200">Título en Español</A>
<DT><A HREF="https://example.net" ADD_DATE="1609459200">Заголовок на русском</A>
<DT><A HREF="https://example.biz" ADD_DATE="1609459200">عنوان بالعربية</A>
<DT><A HREF="https://example.info" ADD_DATE="1609459200">Emoji 🚀 📚 🎉</A>
''', encoding='utf-8')

        result = self._run_extractor(tmp_path, input_file)

        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text(encoding='utf-8').strip().split('\n')
        entries = [json.loads(line) for line in lines]

        assert len(entries) == 5
        assert any('日本語' in e.get('title', '') for e in entries)
        assert any('Español' in e.get('title', '') for e in entries)

    def test_large_file_many_bookmarks(self, tmp_path):
        """Test parsing large file with many bookmarks (1000+)."""
        bookmarks = [
            f'<DT><A HREF="https://example.com/page{i}" ADD_DATE="1609459200" TAGS="tag{i % 10}">Bookmark {i}</A>'
            for i in range(1000)
        ]

        input_file = tmp_path / 'bookmarks.html'
        input_file.write_text(
            '<!DOCTYPE NETSCAPE-Bookmark-file-1>\n<DL><p>\n' +
            '\n'.join(bookmarks) +
            '\n</DL><p>'
        )

        result = self._run_extractor(tmp_path, input_file, timeout=30)

        assert result.returncode == 0
        assert 'Found 1000 URLs' in result.stdout

        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text().strip().split('\n')

        # Should have 10 unique tags + 1000 snapshots.
        # Parse each line once (the original called json.loads twice per line).
        entries = [json.loads(line) for line in lines]
        tags = [e for e in entries if e['type'] == 'Tag']
        snapshots = [e for e in entries if e['type'] == 'Snapshot']

        assert len(tags) == 10
        assert len(snapshots) == 1000
# Allow running this test module directly (outside a pytest invocation).
if __name__ == '__main__':
    pytest.main([__file__, '-v'])
Reference in New Issue
Block a user