wip

2026-04-04 14:57:56 +10:00 · 2025-12-28 17:51:54 -08:00
parent 54f91c1339
commit f0aa19fa7d
157 changed files with 6774 additions and 5061 deletions
--- a/archivebox/plugins/parse_html_urls/on_Snapshot__70_parse_html_urls.py
+++ b/archivebox/plugins/parse_html_urls/on_Snapshot__70_parse_html_urls.py
@@ -170,10 +170,6 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0
            if normalized != url:
                urls_found.add(unescape(normalized))

-    if not urls_found:
-        click.echo('No URLs found', err=True)
-        sys.exit(1)
-
    # Emit Snapshot records to stdout (JSONL)
    for found_url in sorted(urls_found):
        record = {
@@ -189,7 +185,17 @@ def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0

        print(json.dumps(record))

-    click.echo(f'Found {len(urls_found)} URLs', err=True)
+    # Emit ArchiveResult record to mark completion
+    status = 'succeeded' if urls_found else 'skipped'
+    output_str = f'Found {len(urls_found)} URLs' if urls_found else 'No URLs found'
+    ar_record = {
+        'type': 'ArchiveResult',
+        'status': status,
+        'output_str': output_str,
+    }
+    print(json.dumps(ar_record))
+
+    click.echo(output_str, err=True)
    sys.exit(0)


--- a/archivebox/plugins/parse_html_urls/tests/test_parse_html_urls.py
+++ b/archivebox/plugins/parse_html_urls/tests/test_parse_html_urls.py
@@ -27,12 +27,13 @@ class TestParseHtmlUrls:

        assert result.returncode == 0, f"Failed to parse example.com: {result.stderr}"

-        output_file = tmp_path / 'urls.jsonl'
-        assert output_file.exists(), "Output file not created"
+        # Verify stdout contains JSONL records for discovered URLs
+        # example.com links to iana.org
+        assert 'iana.org' in result.stdout or 'example' in result.stdout, "Expected links from example.com not found"

-        # Verify output contains IANA link (example.com links to iana.org)
-        content = output_file.read_text()
-        assert 'iana.org' in content or 'example' in content, "Expected links from example.com not found"
+        # Verify ArchiveResult record is present
+        assert '"type": "ArchiveResult"' in result.stdout, "Missing ArchiveResult record"
+        assert '"status": "succeeded"' in result.stdout, "Missing success status"

    def test_extracts_href_urls(self, tmp_path):
        """Test extracting URLs from anchor tags."""
@@ -56,17 +57,16 @@ class TestParseHtmlUrls:
        )

        assert result.returncode == 0
-        assert 'Found 3 URLs' in result.stdout
+        assert 'Found 3 URLs' in result.stderr

-        output_file = tmp_path / 'urls.jsonl'
-        assert output_file.exists()
-
-        lines = output_file.read_text().strip().split('\n')
-        assert len(lines) == 3
+        # Parse Snapshot records from stdout
+        lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '"type": "Snapshot"' in line]
+        assert len(lines) == 3, f"Expected 3 Snapshot records, got {len(lines)}"

        urls = set()
        for line in lines:
            entry = json.loads(line)
+            assert entry['type'] == 'Snapshot'
            assert 'url' in entry
            urls.add(entry['url'])

@@ -74,6 +74,10 @@ class TestParseHtmlUrls:
        assert 'https://foo.bar/page' in urls
        assert 'http://test.org' in urls

+        # Verify ArchiveResult record
+        assert '"type": "ArchiveResult"' in result.stdout
+        assert '"status": "succeeded"' in result.stdout
+
    def test_ignores_non_http_schemes(self, tmp_path):
        """Test that non-http schemes are ignored."""
        input_file = tmp_path / 'page.html'
@@ -96,9 +100,10 @@ class TestParseHtmlUrls:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        lines = output_file.read_text().strip().split('\n')
-        assert len(lines) == 1
+
+        # Parse Snapshot records from stdout
+        lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '"type": "Snapshot"' in line]
+        assert len(lines) == 1, f"Expected 1 Snapshot record, got {len(lines)}"

        entry = json.loads(lines[0])
        assert entry['url'] == 'https://valid.com'
@@ -122,8 +127,8 @@ class TestParseHtmlUrls:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        entry = json.loads(output_file.read_text().strip())
+        lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
+        entry = json.loads(lines[0])
        assert entry['url'] == 'https://example.com/page?a=1&b=2'

    def test_deduplicates_urls(self, tmp_path):
@@ -147,8 +152,7 @@ class TestParseHtmlUrls:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        lines = output_file.read_text().strip().split('\n')
+        lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
        assert len(lines) == 1

    def test_excludes_source_url(self, tmp_path):
@@ -172,14 +176,13 @@ class TestParseHtmlUrls:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        lines = output_file.read_text().strip().split('\n')
+        lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
        assert len(lines) == 1
        entry = json.loads(lines[0])
        assert entry['url'] == 'https://other.com'

-    def test_exits_1_when_no_urls_found(self, tmp_path):
-        """Test that script exits with code 1 when no URLs found."""
+    def test_skips_when_no_urls_found(self, tmp_path):
+        """Test that script returns skipped status when no URLs found."""
        input_file = tmp_path / 'page.html'
        input_file.write_text('<html><body>No links here</body></html>')

@@ -190,8 +193,9 @@ class TestParseHtmlUrls:
            text=True,
        )

-        assert result.returncode == 1
+        assert result.returncode == 0
        assert 'No URLs found' in result.stderr
+        assert '"status": "skipped"' in result.stdout

    def test_handles_malformed_html(self, tmp_path):
        """Test handling of malformed HTML."""
@@ -212,8 +216,7 @@ class TestParseHtmlUrls:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        lines = output_file.read_text().strip().split('\n')
+        lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
        assert len(lines) == 2

    def test_output_is_valid_json(self, tmp_path):
@@ -229,11 +232,11 @@ class TestParseHtmlUrls:
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        entry = json.loads(output_file.read_text().strip())
+        lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
+        entry = json.loads(lines[0])
        assert entry['url'] == 'https://example.com'
-        assert 'type' in entry
-        assert 'plugin' in entry
+        assert entry['type'] == 'Snapshot'
+        assert entry['plugin'] == 'parse_html_urls'


 if __name__ == '__main__':