wip

2026-04-06 07:47:53 +10:00 · 2025-12-28 17:51:54 -08:00
parent 54f91c1339
commit f0aa19fa7d
157 changed files with 6774 additions and 5061 deletions
--- a/archivebox/plugins/parse_txt_urls/on_Snapshot__71_parse_txt_urls.py
+++ b/archivebox/plugins/parse_txt_urls/on_Snapshot__71_parse_txt_urls.py
@@ -117,20 +117,28 @@ def main(url: str, snapshot_id: str = None):
        if cleaned_url != url:
            urls_found.add(cleaned_url)

-    if not urls_found:
-        click.echo('No URLs found', err=True)
-        sys.exit(1)
+    # Emit Snapshot records to stdout (JSONL)
+    for found_url in sorted(urls_found):
+        record = {
+            'type': 'Snapshot',
+            'url': found_url,
+            'plugin': PLUGIN_NAME,
+        }
+        if snapshot_id:
+            record['parent_snapshot_id'] = snapshot_id
+        print(json.dumps(record))

-    # Write urls.jsonl
-    with open('urls.jsonl', 'w') as f:
-        for found_url in sorted(urls_found):
-            f.write(json.dumps({
-                'type': 'Snapshot',
-                'url': found_url,
-                'plugin': PLUGIN_NAME,
-            }) + '\n')
+    # Emit ArchiveResult record to mark completion
+    status = 'succeeded' if urls_found else 'skipped'
+    output_str = f'Found {len(urls_found)} URLs' if urls_found else 'No URLs found'
+    ar_record = {
+        'type': 'ArchiveResult',
+        'status': status,
+        'output_str': output_str,
+    }
+    print(json.dumps(ar_record))

-    click.echo(f'Found {len(urls_found)} URLs')
+    click.echo(output_str, err=True)
    sys.exit(0)


--- a/archivebox/plugins/parse_txt_urls/tests/test_parse_txt_urls.py
+++ b/archivebox/plugins/parse_txt_urls/tests/test_parse_txt_urls.py
@@ -32,17 +32,16 @@ https://www.iana.org/domains/reserved
        )

        assert result.returncode == 0, f"Failed: {result.stderr}"
-        assert 'Found 3 URLs' in result.stdout
+        assert 'Found 3 URLs' in result.stderr

-        output_file = tmp_path / 'urls.jsonl'
-        assert output_file.exists()
-
-        lines = output_file.read_text().strip().split('\n')
+        # Parse Snapshot records from stdout
+        lines = [line for line in result.stdout.strip().split('\n') if line.strip() and '"type": "Snapshot"' in line]
        assert len(lines) == 3

        urls = set()
        for line in lines:
            entry = json.loads(line)
+            assert entry['type'] == 'Snapshot'
            assert 'url' in entry
            urls.add(entry['url'])

@@ -51,6 +50,10 @@ https://www.iana.org/domains/reserved
        assert 'https://example.com/page' in urls
        assert 'https://www.iana.org/domains/reserved' in urls

+        # Verify ArchiveResult record
+        assert '"type": "ArchiveResult"' in result.stdout
+        assert '"status": "succeeded"' in result.stdout
+
    def test_extracts_urls_from_mixed_content(self, tmp_path):
        """Test extracting URLs embedded in prose text."""
        input_file = tmp_path / 'mixed.txt'
@@ -68,8 +71,7 @@ Also see https://github.com/user/repo for the code.
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        lines = output_file.read_text().strip().split('\n')
+        lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
        urls = {json.loads(line)['url'] for line in lines}

        assert 'https://blog.example.com/post' in urls
@@ -92,15 +94,14 @@ Also see https://github.com/user/repo for the code.
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        lines = output_file.read_text().strip().split('\n')
+        lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
        urls = {json.loads(line)['url'] for line in lines}

        assert 'https://example.com/page' in urls
        assert any('wikipedia.org' in u for u in urls)

-    def test_exits_1_when_no_urls_found(self, tmp_path):
-        """Test that script exits with code 1 when no URLs found."""
+    def test_skips_when_no_urls_found(self, tmp_path):
+        """Test that script returns skipped status when no URLs found."""
        input_file = tmp_path / 'empty.txt'
        input_file.write_text('no urls here, just plain text')

@@ -111,8 +112,9 @@ Also see https://github.com/user/repo for the code.
            text=True,
        )

-        assert result.returncode == 1
+        assert result.returncode == 0
        assert 'No URLs found' in result.stderr
+        assert '"status": "skipped"' in result.stdout

    def test_exits_1_when_file_not_found(self, tmp_path):
        """Test that script exits with code 1 when file doesn't exist."""
@@ -144,12 +146,11 @@ https://other.com
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        lines = output_file.read_text().strip().split('\n')
+        lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
        assert len(lines) == 2

-    def test_appends_to_existing_file(self, tmp_path):
-        """Test that output creates urls.jsonl with extracted URLs."""
+    def test_outputs_to_stdout(self, tmp_path):
+        """Test that output goes to stdout in JSONL format."""
        input_file = tmp_path / 'urls.txt'
        input_file.write_text('https://new.com\nhttps://other.com')

@@ -161,8 +162,7 @@ https://other.com
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        lines = output_file.read_text().strip().split('\n')
+        lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
        assert len(lines) == 2

        urls = {json.loads(line)['url'] for line in lines}
@@ -182,11 +182,11 @@ https://other.com
        )

        assert result.returncode == 0
-        output_file = tmp_path / 'urls.jsonl'
-        entry = json.loads(output_file.read_text().strip())
+        lines = [line for line in result.stdout.strip().split('\n') if '"type": "Snapshot"' in line]
+        entry = json.loads(lines[0])
        assert entry['url'] == 'https://example.com'
-        assert 'type' in entry
-        assert 'plugin' in entry
+        assert entry['type'] == 'Snapshot'
+        assert entry['plugin'] == 'parse_txt_urls'


 if __name__ == '__main__':