mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 07:17:52 +10:00
wip major changes
This commit is contained in:
188
archivebox/plugins/parse_html_urls/on_Snapshot__60_parse_html_urls.py
Executable file
188
archivebox/plugins/parse_html_urls/on_Snapshot__60_parse_html_urls.py
Executable file
@@ -0,0 +1,188 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Parse HTML files and extract href URLs.
|
||||
|
||||
This is a standalone extractor that can run without ArchiveBox.
|
||||
It reads HTML content and extracts all <a href="..."> URLs.
|
||||
|
||||
NOTE: If parse_dom_outlinks already ran (parse_dom_outlinks/urls.jsonl exists),
|
||||
this extractor will skip since parse_dom_outlinks provides better coverage via Chrome.
|
||||
|
||||
Usage: ./on_Snapshot__60_parse_html_urls.py --url=<url>
|
||||
Output: Appends discovered URLs to urls.jsonl in current directory
|
||||
|
||||
Examples:
|
||||
./on_Snapshot__60_parse_html_urls.py --url=file:///path/to/page.html
|
||||
./on_Snapshot__60_parse_html_urls.py --url=https://example.com/page.html
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from html import unescape
|
||||
from html.parser import HTMLParser
|
||||
from pathlib import Path
|
||||
from urllib.parse import urljoin, urlparse
|
||||
|
||||
import rich_click as click
|
||||
|
||||
# Identifier recorded in each output record's 'via_extractor' field.
EXTRACTOR_NAME = 'parse_html_urls'

# Output file of the parse_dom_outlinks extractor (relative to the snapshot cwd).
# If present and non-empty, this extractor skips itself, since Chrome-based DOM
# parsing gives better coverage than static HTML parsing.
DOM_OUTLINKS_URLS_FILE = Path('parse_dom_outlinks/urls.jsonl')


# URL regex from archivebox/misc/util.py.
# The outer (?=(...)) lookahead captures without consuming, so findall() reports
# every candidate URL.
# NOTE(review): URL_REGEX is not referenced anywhere else in this script —
# possibly imported by other plugins; confirm before removing.
URL_REGEX = re.compile(
    r'(?=('
    r'http[s]?://'            # scheme
    r'(?:[a-zA-Z]|[0-9]'      # host characters
    r'|[-_$@.&+!*\(\),]'
    r'|[^\u0000-\u007F])+'    # any non-ASCII codepoint
    r'[^\]\[<>"\'\s]+'        # path/query, up to a delimiter or whitespace
    r'))',
    re.IGNORECASE | re.UNICODE,
)
|
||||
|
||||
|
||||
class HrefParser(HTMLParser):
    """Collect every non-empty href value found on <a> tags, in document order."""

    def __init__(self):
        super().__init__()
        # Accumulates raw href strings exactly as they appear in the markup.
        self.urls = []

    def handle_starttag(self, tag, attrs):
        if tag != 'a':
            return
        # attrs is a list of (name, value) pairs; keep every non-empty href,
        # including duplicates on the same tag.
        self.urls.extend(value for name, value in attrs if name == 'href' and value)
|
||||
|
||||
|
||||
def did_urljoin_misbehave(root_url: str, relative_path: str, final_url: str) -> bool:
    """Detect whether urljoin() collapsed an embedded '://' down to ':/'.

    Returns True when either the root URL or the relative path contained a
    nested sub-URL (a second '://') but the joined result no longer does.
    """
    path = relative_path.lower()
    if path.startswith(('http://', 'https://')):
        path = path.split('://', 1)[-1]

    # [8:] skips the leading scheme. NOTE(review): 8 chars matches 'https://';
    # for an 'http://' root it also drops the first host char — preserved from
    # the original behavior, confirm this is intentional.
    input_had_suburl = ('://' in root_url[8:]) or ('://' in path)
    return input_had_suburl and '://' not in final_url[8:]
|
||||
|
||||
|
||||
def fix_urljoin_bug(url: str, nesting_limit: int = 5) -> str:
    """Fix broken sub-URLs where '://' was collapsed to ':/' (e.g. by urljoin).

    Repeatedly rewrites '<sep><scheme>:/<host>' to '<sep><scheme>://<host>'
    until the URL stops changing or *nesting_limit* passes are exhausted
    (guards against pathological inputs with deeply nested sub-URLs).

    Args:
        url: URL that may contain one or more collapsed sub-URLs.
        nesting_limit: maximum number of rewrite passes.

    Returns:
        The repaired URL (unchanged if nothing matched).
    """
    input_url = url
    for _ in range(nesting_limit):
        url = re.sub(
            r'(?P<root>.+?)'
            r'(?P<separator>[-=/_&+%$#@!*\(\\])'
            r'(?P<subscheme>[a-zA-Z0-9+_-]{1,32}?):/'
            r'(?P<suburl>[^/\\]+)',
            r'\1\2\3://\4',
            input_url,
            # BUG FIX: these flags were previously passed as the 4th positional
            # argument of re.sub, which is `count` (so count=34 and the flags
            # were silently ignored); positional count is also deprecated.
            flags=re.IGNORECASE | re.UNICODE,
        )
        if url == input_url:
            break
        input_url = url
    return url
|
||||
|
||||
|
||||
def normalize_url(url: str, root_url: str = None) -> str:
    """Normalize a URL, resolving relative paths against *root_url* if given.

    Absolute http(s) URLs and calls without a root_url pass through unchanged.
    Relative URLs are resolved with urljoin(), then repaired if urljoin
    mangled an embedded sub-URL ('://' -> ':/').
    """
    if not root_url:
        return url

    # Absolute URLs need no resolution.
    if url.lower().startswith(('http://', 'https://')):
        return url

    resolved = urljoin(root_url, url)

    # Work around urljoin collapsing '://' inside nested sub-URLs.
    if did_urljoin_misbehave(root_url, url, resolved):
        resolved = fix_urljoin_bug(resolved)
    return resolved
|
||||
|
||||
|
||||
def fetch_content(url: str) -> str:
    """Return the body of *url* as text (supports file:// and http(s):// URLs).

    Undecodable bytes are replaced rather than raised. Remote fetches honor
    the TIMEOUT (seconds) and USER_AGENT environment variable overrides.
    """
    parsed = urlparse(url)

    if parsed.scheme == 'file':
        with open(parsed.path, 'r', encoding='utf-8', errors='replace') as f:
            return f.read()

    timeout = int(os.environ.get('TIMEOUT', '60'))
    user_agent = os.environ.get('USER_AGENT', 'Mozilla/5.0 (compatible; ArchiveBox/1.0)')

    import urllib.request
    request = urllib.request.Request(url, headers={'User-Agent': user_agent})
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.read().decode('utf-8', errors='replace')
|
||||
|
||||
|
||||
@click.command()
@click.option('--url', required=True, help='HTML URL to parse')
def main(url: str):
    """Parse HTML and extract href URLs.

    Writes one JSON object per discovered URL to ./urls.jsonl.
    Exits 0 on success (or when skipping), 1 on fetch/parse failure
    or when no URLs were found.
    """
    # Skip only if parse_dom_outlinks already ran AND found URLs (it uses Chrome for better coverage).
    # If parse_dom_outlinks ran but found nothing, we still try static HTML parsing as fallback.
    if DOM_OUTLINKS_URLS_FILE.exists() and DOM_OUTLINKS_URLS_FILE.stat().st_size > 0:
        # FIX: was an f-string with no placeholders (lint F541).
        click.echo('Skipping parse_html_urls - parse_dom_outlinks already extracted URLs')
        sys.exit(0)

    try:
        content = fetch_content(url)
    except Exception as e:
        click.echo(f'Failed to fetch {url}: {e}', err=True)
        sys.exit(1)

    # Parse HTML for hrefs
    parser = HrefParser()
    try:
        parser.feed(content)
    except Exception as e:
        click.echo(f'Failed to parse HTML: {e}', err=True)
        sys.exit(1)

    urls_found = set()
    for href in parser.urls:
        # Resolve relative hrefs against the page URL
        normalized = normalize_url(href, root_url=url)

        # Only include http/https URLs (drops mailto:, javascript:, tel:, etc.)
        if normalized.lower().startswith(('http://', 'https://')):
            # Skip the source URL itself
            if normalized != url:
                urls_found.add(unescape(normalized))

    if not urls_found:
        click.echo('No URLs found', err=True)
        sys.exit(1)

    # Write urls.jsonl (sorted for deterministic output)
    with open('urls.jsonl', 'w') as f:
        for found_url in sorted(urls_found):
            f.write(json.dumps({
                'type': 'Snapshot',
                'url': found_url,
                'via_extractor': EXTRACTOR_NAME,
            }) + '\n')

    click.echo(f'Found {len(urls_found)} URLs')
    sys.exit(0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    main()  # click handles CLI parsing, help text, and exit codes
|
||||
240
archivebox/plugins/parse_html_urls/tests/test_parse_html_urls.py
Normal file
240
archivebox/plugins/parse_html_urls/tests/test_parse_html_urls.py
Normal file
@@ -0,0 +1,240 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Unit tests for parse_html_urls extractor."""
|
||||
|
||||
import json
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
# Plugin root (tests/ lives one level below the plugin directory).
PLUGIN_DIR = Path(__file__).parent.parent
# The extractor is named on_Snapshot__<NN>_parse_html_urls.py; glob so the
# numeric priority prefix can change without breaking the tests.
# None if the script is missing — subsequent tests would then fail on str(None).
SCRIPT_PATH = next(PLUGIN_DIR.glob('on_Snapshot__*_parse_html_urls.py'), None)
|
||||
|
||||
|
||||
class TestParseHtmlUrls:
    """Test the parse_html_urls extractor CLI."""

    def _run(self, cwd, url, timeout=None):
        """Invoke the extractor script as a subprocess with *cwd* as output dir."""
        return subprocess.run(
            [sys.executable, str(SCRIPT_PATH), '--url', url],
            cwd=cwd,
            capture_output=True,
            text=True,
            timeout=timeout,
        )

    def test_parses_real_example_com(self, tmp_path):
        """Test parsing real https://example.com and extracting its links."""
        # NOTE(review): requires network access — consider a skip/marker for offline CI.
        result = self._run(tmp_path, 'https://example.com', timeout=30)

        assert result.returncode == 0, f"Failed to parse example.com: {result.stderr}"

        output_file = tmp_path / 'urls.jsonl'
        assert output_file.exists(), "Output file not created"

        # Verify output contains IANA link (example.com links to iana.org)
        content = output_file.read_text()
        assert 'iana.org' in content or 'example' in content, "Expected links from example.com not found"

    def test_extracts_href_urls(self, tmp_path):
        """Test extracting URLs from anchor tags."""
        input_file = tmp_path / 'page.html'
        input_file.write_text('''
            <!DOCTYPE html>
            <html>
            <body>
                <a href="https://example.com">Example</a>
                <a href="https://foo.bar/page">Foo</a>
                <a href="http://test.org">Test</a>
            </body>
            </html>
        ''')

        result = self._run(tmp_path, f'file://{input_file}')

        assert result.returncode == 0
        assert 'Found 3 URLs' in result.stdout

        output_file = tmp_path / 'urls.jsonl'
        assert output_file.exists()

        lines = output_file.read_text().strip().split('\n')
        assert len(lines) == 3

        urls = set()
        for line in lines:
            entry = json.loads(line)
            assert 'url' in entry
            urls.add(entry['url'])

        assert 'https://example.com' in urls
        assert 'https://foo.bar/page' in urls
        assert 'http://test.org' in urls

    def test_ignores_non_http_schemes(self, tmp_path):
        """Test that non-http schemes are ignored."""
        input_file = tmp_path / 'page.html'
        input_file.write_text('''
            <html>
            <body>
                <a href="mailto:test@example.com">Email</a>
                <a href="javascript:void(0)">JS</a>
                <a href="tel:+1234567890">Phone</a>
                <a href="https://valid.com">Valid</a>
            </body>
            </html>
        ''')

        result = self._run(tmp_path, f'file://{input_file}')

        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text().strip().split('\n')
        assert len(lines) == 1

        entry = json.loads(lines[0])
        assert entry['url'] == 'https://valid.com'

    def test_handles_html_entities(self, tmp_path):
        """Test that HTML entities in URLs are decoded."""
        input_file = tmp_path / 'page.html'
        # The href uses '&amp;' so the extractor must decode it to '&'.
        input_file.write_text('''
            <html>
            <body>
                <a href="https://example.com/page?a=1&amp;b=2">Link</a>
            </body>
            </html>
        ''')

        result = self._run(tmp_path, f'file://{input_file}')

        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        assert entry['url'] == 'https://example.com/page?a=1&b=2'

    def test_deduplicates_urls(self, tmp_path):
        """Test that duplicate URLs are deduplicated."""
        input_file = tmp_path / 'page.html'
        input_file.write_text('''
            <html>
            <body>
                <a href="https://example.com">Link 1</a>
                <a href="https://example.com">Link 2</a>
                <a href="https://example.com">Link 3</a>
            </body>
            </html>
        ''')

        result = self._run(tmp_path, f'file://{input_file}')

        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text().strip().split('\n')
        assert len(lines) == 1

    def test_excludes_source_url(self, tmp_path):
        """Test that the source URL itself is excluded from results."""
        input_file = tmp_path / 'page.html'
        source_url = f'file://{input_file}'
        input_file.write_text(f'''
            <html>
            <body>
                <a href="{source_url}">Self</a>
                <a href="https://other.com">Other</a>
            </body>
            </html>
        ''')

        result = self._run(tmp_path, source_url)

        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text().strip().split('\n')
        assert len(lines) == 1
        entry = json.loads(lines[0])
        assert entry['url'] == 'https://other.com'

    def test_exits_1_when_no_urls_found(self, tmp_path):
        """Test that script exits with code 1 when no URLs found."""
        input_file = tmp_path / 'page.html'
        input_file.write_text('<html><body>No links here</body></html>')

        result = self._run(tmp_path, f'file://{input_file}')

        assert result.returncode == 1
        assert 'No URLs found' in result.stderr

    def test_handles_malformed_html(self, tmp_path):
        """Test handling of malformed HTML."""
        input_file = tmp_path / 'malformed.html'
        input_file.write_text('''
            <html>
            <body>
                <a href="https://example.com">Unclosed tag
                <a href="https://other.com">Another link</a>
            </body>
        ''')

        result = self._run(tmp_path, f'file://{input_file}')

        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        lines = output_file.read_text().strip().split('\n')
        assert len(lines) == 2

    def test_output_is_valid_json(self, tmp_path):
        """Test that output contains required fields."""
        input_file = tmp_path / 'page.html'
        input_file.write_text('<a href="https://example.com">Link</a>')

        result = self._run(tmp_path, f'file://{input_file}')

        assert result.returncode == 0
        output_file = tmp_path / 'urls.jsonl'
        entry = json.loads(output_file.read_text().strip())
        assert entry['url'] == 'https://example.com'
        assert 'type' in entry
        assert 'via_extractor' in entry
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # FIX: propagate pytest's exit status to the shell instead of discarding
    # the return value of pytest.main().
    raise SystemExit(pytest.main([__file__, '-v']))
|
||||
Reference in New Issue
Block a user