wip major changes

2026-04-05 07:17:52 +10:00 · 2025-12-24 20:09:51 -08:00
parent c1335fed37
commit 1915333b81
450 changed files with 35814 additions and 19015 deletions
--- a/archivebox/plugins/parse_html_urls/on_Snapshot__60_parse_html_urls.py
+++ b/archivebox/plugins/parse_html_urls/on_Snapshot__60_parse_html_urls.py
@@ -0,0 +1,188 @@
+#!/usr/bin/env python3
+"""
+Parse HTML files and extract href URLs.
+
+This is a standalone extractor that can run without ArchiveBox.
+It reads HTML content and extracts all <a href="..."> URLs.
+
+NOTE: If parse_dom_outlinks already ran and found URLs (parse_dom_outlinks/urls.jsonl
+exists and is non-empty), this extractor is skipped, since parse_dom_outlinks provides
+better coverage via Chrome.
+
+Usage: ./on_Snapshot__60_parse_html_urls.py --url=<url>
+Output: Writes discovered URLs to urls.jsonl in the current directory (replacing any existing file)
+
+Examples:
+    ./on_Snapshot__60_parse_html_urls.py --url=file:///path/to/page.html
+    ./on_Snapshot__60_parse_html_urls.py --url=https://example.com/page.html
+"""
+
+import json
+import os
+import re
+import sys
+from datetime import datetime, timezone
+from html import unescape
+from html.parser import HTMLParser
+from pathlib import Path
+from urllib.parse import urljoin, urlparse
+
+import rich_click as click
+
+# Name recorded in the 'via_extractor' field of every record written by main().
+EXTRACTOR_NAME = 'parse_html_urls'
+
+# Output file of the parse_dom_outlinks extractor, relative to the current
+# working directory. main() exits early when this file exists and is non-empty.
+DOM_OUTLINKS_URLS_FILE = Path('parse_dom_outlinks/urls.jsonl')
+
+
+# URL regex from archivebox/misc/util.py
+# NOTE(review): nothing in this script references URL_REGEX (main() relies on
+# HrefParser only); presumably kept for parity with util.py -- confirm before removing.
+URL_REGEX = re.compile(
+    # Zero-width lookahead wrapping a capture group, so matches may overlap.
+    r'(?=('
+    # Scheme: http:// or https://
+    r'http[s]?://'
+    # One or more of: ASCII letters, digits, ...
+    r'(?:[a-zA-Z]|[0-9]'
+    # ... a fixed set of punctuation characters, ...
+    r'|[-_$@.&+!*\(\),]'
+    # ... or any character outside the ASCII range.
+    r'|[^\u0000-\u007F])+'
+    # Followed by at least one character that is not a square/angle bracket,
+    # a quote, or whitespace.
+    r'[^\]\[<>"\'\s]+'
+    r'))',
+    re.IGNORECASE | re.UNICODE,
+)
+
+
+class HrefParser(HTMLParser):
+    """Collect the href value of every <a> start tag fed to the parser.
+
+    After ``feed()``, ``self.urls`` holds the href values in document order,
+    duplicates included.  HTMLParser lowercases tag/attribute names and decodes
+    character references in attribute values before ``handle_starttag`` is
+    called, so the collected values are already entity-decoded.
+    """
+
+    # ASCII whitespace that HTML allows around a URL inside an attribute value.
+    _HREF_PADDING = ' \t\n\r\f'
+
+    def __init__(self):
+        super().__init__()
+        self.urls = []
+
+    def handle_starttag(self, tag, attrs):
+        """Record non-empty href attributes of anchor tags."""
+        if tag != 'a':
+            return
+        for attr, value in attrs:
+            # value is None for a bare attribute such as <a href>.
+            if attr != 'href' or not value:
+                continue
+            # Markup like href=" https://example.com " is valid HTML; without
+            # stripping, the padded value fails later scheme-prefix checks.
+            href = value.strip(self._HREF_PADDING)
+            if href:
+                self.urls.append(href)
+
+
+def did_urljoin_misbehave(root_url: str, relative_path: str, final_url: str) -> bool:
+    """Report whether joining lost a nested '://' that the inputs contained.
+
+    Returns True when the root URL or the relative path embeds a sub-URL
+    (a '://' beyond the leading scheme) but the joined result no longer does.
+    """
+    path = relative_path.lower()
+    # Drop the path's own leading scheme so only an embedded '://' counts.
+    for scheme in ('http://', 'https://'):
+        if path.startswith(scheme):
+            path = path[len(scheme):]
+            break
+
+    # Skipping the first 8 characters steps over the leading 'https://'.
+    if '://' in final_url[8:]:
+        return False
+
+    return '://' in root_url[8:] or '://' in path
+
+
+# Matches "<root><separator><subscheme>:/<suburl>", i.e. an embedded sub-URL
+# whose "://" was collapsed to ":/".  No flags are needed: the character
+# classes already list both letter cases, and str patterns are Unicode-aware
+# by default.
+_COLLAPSED_SUBURL_RE = re.compile(
+    r'(?P<root>.+?)'
+    r'(?P<separator>[-=/_&+%$#@!*\(\\])'
+    r'(?P<subscheme>[a-zA-Z0-9+_-]{1,32}?):/'
+    r'(?P<suburl>[^/\\]+)'
+)
+
+
+def fix_urljoin_bug(url: str, nesting_limit=5) -> str:
+    """Fix broken sub-URLs where :// was changed to :/.
+
+    Args:
+        url: URL that may contain embedded sub-URLs with a collapsed scheme
+            separator (e.g. ``https://a.com/redirect/https:/b.com``).
+        nesting_limit: maximum number of repair passes; each pass rewrites
+            every non-overlapping match, and passes stop early once the URL
+            no longer changes.
+
+    Returns:
+        The URL with ``:/`` restored to ``://`` in the matched sub-URLs.
+    """
+    # The previous implementation passed `re.IGNORECASE | re.UNICODE` as the
+    # fourth positional argument of re.sub(), which is `count`, not `flags`.
+    # That silently capped each pass at 34 substitutions.  Using a compiled
+    # pattern's .sub() removes the accidental cap.
+    for _ in range(nesting_limit):
+        repaired = _COLLAPSED_SUBURL_RE.sub(
+            r'\g<root>\g<separator>\g<subscheme>://\g<suburl>',
+            url,
+        )
+        if repaired == url:
+            break
+        url = repaired
+    return url
+
+
+def normalize_url(url: str, root_url: str = None) -> str:
+    """Return an absolute form of *url*, using *root_url* as the base.
+
+    The URL is returned untouched when no base is given or when it already
+    starts with http:// or https:// (case-insensitive).  Otherwise it is
+    joined onto the base, and the join result is repaired if an embedded
+    sub-URL lost its '://'.
+    """
+    if not root_url:
+        return url
+
+    if url.lower().startswith(('http://', 'https://')):
+        return url
+
+    joined = urljoin(root_url, url)
+
+    # urljoin can collapse the '://' of a nested URL; patch it back in.
+    return fix_urljoin_bug(joined) if did_urljoin_misbehave(root_url, url, joined) else joined
+
+
+def fetch_content(url: str) -> str:
+    """Fetch content from a URL (supports file:// and https://).
+
+    Args:
+        url: ``file://`` URL of a local file, or any URL that
+            ``urllib.request.urlopen`` can open.
+
+    Returns:
+        The content decoded as UTF-8, with undecodable bytes replaced.
+
+    Raises:
+        OSError: the local file cannot be opened, or the request fails
+            (``urllib.error.URLError`` is an ``OSError`` subclass).
+        ValueError: the ``TIMEOUT`` environment variable is not an integer.
+
+    Environment:
+        TIMEOUT: request timeout in seconds (default ``60``).
+        USER_AGENT: User-Agent header sent with the request.
+    """
+    # Kept as a function-level import, as in the original implementation.
+    import urllib.request
+
+    parsed = urlparse(url)
+
+    if parsed.scheme == 'file':
+        # The path component of a URL is percent-encoded; decode it so that
+        # e.g. file:///tmp/my%20page.html opens "/tmp/my page.html".
+        file_path = urllib.request.url2pathname(parsed.path)
+        with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
+            return f.read()
+
+    timeout = int(os.environ.get('TIMEOUT', '60'))
+    user_agent = os.environ.get('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')
+
+    req = urllib.request.Request(url, headers={'User-Agent': user_agent})
+    with urllib.request.urlopen(req, timeout=timeout) as response:
+        return response.read().decode('utf-8', errors='replace')
+
+
+# CLI entry point.  Exit status: 0 when urls.jsonl was written or the run was
+# skipped; 1 when fetching/parsing failed or no http(s) URLs were found.
+# The docstring below is left as-is because click uses it as the --help text.
+@click.command()
+@click.option('--url', required=True, help='HTML URL to parse')
+def main(url: str):
+    """Parse HTML and extract href URLs."""
+
+    # Skip only if parse_dom_outlinks already ran AND found URLs (it uses Chrome for better coverage)
+    # If parse_dom_outlinks ran but found nothing, we still try static HTML parsing as fallback
+    if DOM_OUTLINKS_URLS_FILE.exists() and DOM_OUTLINKS_URLS_FILE.stat().st_size > 0:
+        click.echo('Skipping parse_html_urls - parse_dom_outlinks already extracted URLs')
+        sys.exit(0)
+
+    # Broad catch is deliberate: this is the top-level boundary of the script
+    # and every failure is reported on stderr with a non-zero exit status.
+    try:
+        content = fetch_content(url)
+    except Exception as e:
+        click.echo(f'Failed to fetch {url}: {e}', err=True)
+        sys.exit(1)
+
+    # Parse HTML for hrefs
+    parser = HrefParser()
+    try:
+        parser.feed(content)
+    except Exception as e:
+        click.echo(f'Failed to parse HTML: {e}', err=True)
+        sys.exit(1)
+
+    urls_found = set()
+    for href in parser.urls:
+        # Resolve relative hrefs against the page URL
+        normalized = normalize_url(href, root_url=url)
+
+        # Only include http/https URLs
+        if not normalized.lower().startswith(('http://', 'https://')):
+            continue
+
+        # Skip the source URL itself
+        if normalized == url:
+            continue
+
+        # HTMLParser has already decoded character references in attribute
+        # values.  Decoding a second time corrupts correctly-escaped URLs,
+        # e.g. href="?a=1&amp;copy=2" -> "?a=1&copy=2" -> "?a=1\u00a9=2".
+        urls_found.add(normalized)
+
+    if not urls_found:
+        click.echo('No URLs found', err=True)
+        sys.exit(1)
+
+    # Write urls.jsonl (one JSON record per line, sorted for stable output)
+    with open('urls.jsonl', 'w', encoding='utf-8') as f:
+        for found_url in sorted(urls_found):
+            f.write(json.dumps({
+                'type': 'Snapshot',
+                'url': found_url,
+                'via_extractor': EXTRACTOR_NAME,
+            }) + '\n')
+
+    click.echo(f'Found {len(urls_found)} URLs')
+    sys.exit(0)
+
+
+if __name__ == '__main__':
+    main()
--- a/archivebox/plugins/parse_html_urls/tests/test_parse_html_urls.py
+++ b/archivebox/plugins/parse_html_urls/tests/test_parse_html_urls.py
@@ -0,0 +1,240 @@
+#!/usr/bin/env python3
+"""Unit tests for parse_html_urls extractor."""
+
+import json
+import subprocess
+import sys
+from pathlib import Path
+
+import pytest
+
+# Plugin directory: the parent of the directory containing this test file.
+PLUGIN_DIR = Path(__file__).parent.parent
+# First file in the plugin directory matching the hook naming pattern,
+# or None when nothing matches.
+SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_html_urls.py'), None)
+
+
+class TestParseHtmlUrls:
+    """End-to-end tests that run the parse_html_urls script as a subprocess."""
+
+    @staticmethod
+    def _invoke(workdir, url, **run_kwargs):
+        """Run the extractor script on *url* with *workdir* as its cwd."""
+        return subprocess.run(
+            [sys.executable, str(SCRIPT_PATH), '--url', url],
+            cwd=workdir,
+            capture_output=True,
+            text=True,
+            **run_kwargs,
+        )
+
+    @classmethod
+    def _invoke_on_html(cls, workdir, html, filename='page.html'):
+        """Write *html* to a file in *workdir* and run the extractor on it."""
+        input_file = workdir / filename
+        input_file.write_text(html)
+        return cls._invoke(workdir, f'file://{input_file}')
+
+    @staticmethod
+    def _output_lines(workdir):
+        """Return the lines of urls.jsonl produced in *workdir*."""
+        return (workdir / 'urls.jsonl').read_text().strip().split('\n')
+
+    def test_parses_real_example_com(self, tmp_path):
+        """Fetch the live https://example.com page and check links are extracted."""
+        proc = self._invoke(tmp_path, 'https://example.com', timeout=30)
+
+        assert proc.returncode == 0, f"Failed to parse example.com: {proc.stderr}"
+
+        jsonl_path = tmp_path / 'urls.jsonl'
+        assert jsonl_path.exists(), "Output file not created"
+
+        # example.com links to iana.org; either marker is accepted
+        text = jsonl_path.read_text()
+        assert 'iana.org' in text or 'example' in text, "Expected links from example.com not found"
+
+    def test_extracts_href_urls(self, tmp_path):
+        """Every anchor href in the page ends up in urls.jsonl."""
+        proc = self._invoke_on_html(tmp_path, '''
+<!DOCTYPE html>
+<html>
+<body>
+    <a href="https://example.com">Example</a>
+    <a href="https://foo.bar/page">Foo</a>
+    <a href="http://test.org">Test</a>
+</body>
+</html>
+        ''')
+
+        assert proc.returncode == 0
+        assert 'Found 3 URLs' in proc.stdout
+        assert (tmp_path / 'urls.jsonl').exists()
+
+        records = [json.loads(raw) for raw in self._output_lines(tmp_path)]
+        assert len(records) == 3
+        assert all('url' in record for record in records)
+
+        extracted = {record['url'] for record in records}
+        for expected in ('https://example.com', 'https://foo.bar/page', 'http://test.org'):
+            assert expected in extracted
+
+    def test_ignores_non_http_schemes(self, tmp_path):
+        """mailto:, javascript: and tel: links are dropped."""
+        proc = self._invoke_on_html(tmp_path, '''
+<html>
+<body>
+    <a href="mailto:test@example.com">Email</a>
+    <a href="javascript:void(0)">JS</a>
+    <a href="tel:+1234567890">Phone</a>
+    <a href="https://valid.com">Valid</a>
+</body>
+</html>
+        ''')
+
+        assert proc.returncode == 0
+        raw_lines = self._output_lines(tmp_path)
+        assert len(raw_lines) == 1
+        assert json.loads(raw_lines[0])['url'] == 'https://valid.com'
+
+    def test_handles_html_entities(self, tmp_path):
+        """Character references inside href values are decoded."""
+        proc = self._invoke_on_html(tmp_path, '''
+<html>
+<body>
+    <a href="https://example.com/page?a=1&amp;b=2">Link</a>
+</body>
+</html>
+        ''')
+
+        assert proc.returncode == 0
+        record = json.loads((tmp_path / 'urls.jsonl').read_text().strip())
+        assert record['url'] == 'https://example.com/page?a=1&b=2'
+
+    def test_deduplicates_urls(self, tmp_path):
+        """The same href repeated several times is written once."""
+        proc = self._invoke_on_html(tmp_path, '''
+<html>
+<body>
+    <a href="https://example.com">Link 1</a>
+    <a href="https://example.com">Link 2</a>
+    <a href="https://example.com">Link 3</a>
+</body>
+</html>
+        ''')
+
+        assert proc.returncode == 0
+        assert len(self._output_lines(tmp_path)) == 1
+
+    def test_excludes_source_url(self, tmp_path):
+        """A link pointing back at the source URL is not reported."""
+        page = tmp_path / 'page.html'
+        source_url = f'file://{page}'
+        page.write_text(f'''
+<html>
+<body>
+    <a href="{source_url}">Self</a>
+    <a href="https://other.com">Other</a>
+</body>
+</html>
+        ''')
+
+        proc = self._invoke(tmp_path, source_url)
+
+        assert proc.returncode == 0
+        raw_lines = self._output_lines(tmp_path)
+        assert len(raw_lines) == 1
+        assert json.loads(raw_lines[0])['url'] == 'https://other.com'
+
+    def test_exits_1_when_no_urls_found(self, tmp_path):
+        """A page without links makes the script exit with status 1."""
+        proc = self._invoke_on_html(tmp_path, '<html><body>No links here</body></html>')
+
+        assert proc.returncode == 1
+        assert 'No URLs found' in proc.stderr
+
+    def test_handles_malformed_html(self, tmp_path):
+        """Unclosed tags do not prevent link extraction."""
+        proc = self._invoke_on_html(tmp_path, '''
+<html>
+<body>
+    <a href="https://example.com">Unclosed tag
+    <a href="https://other.com">Another link</a>
+</body>
+        ''', filename='malformed.html')
+
+        assert proc.returncode == 0
+        assert len(self._output_lines(tmp_path)) == 2
+
+    def test_output_is_valid_json(self, tmp_path):
+        """Each output record is JSON carrying url, type and via_extractor."""
+        proc = self._invoke_on_html(tmp_path, '<a href="https://example.com">Link</a>')
+
+        assert proc.returncode == 0
+        record = json.loads((tmp_path / 'urls.jsonl').read_text().strip())
+        assert record['url'] == 'https://example.com'
+        for required_key in ('type', 'via_extractor'):
+            assert required_key in record
+
+
+if __name__ == '__main__':
+    pytest.main([__file__, '-v'])