fix: address PR review comments from cubic-dev-ai

- Add JSONL_INDEX_FILENAME to ALLOWED_IN_DATA_DIR for consistency
- Fix fallback logic in legacy.py to try JSON when JSONL parsing fails
- Replace bare except clauses with specific exception types
- Fix stdin double-consumption in archivebox_crawl.py
- Merge CLI --tag option with crawl tags in archivebox_snapshot.py
- Remove tautological mock tests (covered by integration tests)

Co-authored-by: Nick Sweeting <pirate@users.noreply.github.com>
claude[bot]
2025-12-30 20:08:54 +00:00
parent cf387ed59f
commit 762cddc8c5
6 changed files with 23 additions and 86 deletions


@@ -37,7 +37,7 @@ import rich_click as click
 def create_crawls(
-    args: tuple,
+    records: list,
     depth: int = 0,
     tag: str = '',
     created_by_id: Optional[int] = None,
@@ -45,7 +45,7 @@ def create_crawls(
     """
     Create a single Crawl job from all input URLs.
-    Reads from args or stdin, creates one Crawl with all URLs, outputs JSONL.
+    Takes pre-read records, creates one Crawl with all URLs, outputs JSONL.
     Does NOT start the crawl - just creates the job in QUEUED state.
 
     Exit codes:
@@ -54,16 +54,13 @@
     """
     from rich import print as rprint
-    from archivebox.misc.jsonl import read_args_or_stdin, write_record
+    from archivebox.misc.jsonl import write_record
     from archivebox.base_models.models import get_or_create_system_user_pk
     from archivebox.crawls.models import Crawl
 
     created_by_id = created_by_id or get_or_create_system_user_pk()
     is_tty = sys.stdout.isatty()
 
-    # Collect all input records
-    records = list(read_args_or_stdin(args))
     if not records:
         rprint('[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr)
         return 1
@@ -188,7 +185,7 @@ def main(depth: int, tag: str, args: tuple):
         sys.exit(exit_code)
     else:
         # Default behavior: create Crawl jobs from URLs
-        sys.exit(create_crawls(args, depth=depth, tag=tag))
+        sys.exit(create_crawls(records, depth=depth, tag=tag))
 
 
 if __name__ == '__main__':

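For context, a minimal standalone sketch of the read-once pattern this change adopts, with simplified hypothetical bodies rather than the real archivebox implementation: main() consumes args/stdin exactly once and hands the resulting list to create_crawls(), so no sub-command re-reads sys.stdin.

import json
import sys
from typing import Optional

def read_args_or_stdin(args: tuple) -> list[dict]:
    # Hypothetical simplified reader: positional args become URL records,
    # otherwise one JSON record is parsed per non-empty stdin line.
    if args:
        return [{'url': url} for url in args]
    if sys.stdin.isatty():
        return []
    return [json.loads(line) for line in sys.stdin if line.strip()]

def create_crawls(records: list, depth: int = 0, tag: str = '',
                  created_by_id: Optional[int] = None) -> int:
    # Receives pre-read records instead of re-reading stdin itself.
    if not records:
        print('No URLs provided. Pass URLs as arguments or via stdin.', file=sys.stderr)
        return 1
    # ... create one Crawl containing all URLs, emit JSONL ...
    return 0

def main(depth: int, tag: str, args: tuple) -> None:
    records = read_args_or_stdin(args)   # single read of stdin
    sys.exit(create_crawls(records, depth=depth, tag=tag))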

@@ -130,9 +130,16 @@ def create_snapshots(
         # Create snapshots for each URL in the crawl
         for url in crawl.get_urls_list():
+            # Merge CLI tags with crawl tags
+            merged_tags = crawl.tags_str
+            if tag:
+                if merged_tags:
+                    merged_tags = f"{merged_tags},{tag}"
+                else:
+                    merged_tags = tag
             snapshot_record = {
                 'url': url,
-                'tags': crawl.tags_str,
+                'tags': merged_tags,
                 'crawl_id': str(crawl.id),
                 'depth': 0,
             }

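The merge rule added above can be read as: crawl tags first, then the CLI --tag value, skipping empty values so no stray commas appear. A hypothetical merge_tags helper, for illustration only:

def merge_tags(crawl_tags: str, cli_tag: str) -> str:
    # Keep crawl tags first, append the CLI tag, drop empty strings.
    return ','.join(t for t in (crawl_tags, cli_tag) if t)

assert merge_tags('news,python', 'manual') == 'news,python,manual'
assert merge_tags('', 'manual') == 'manual'
assert merge_tags('news,python', '') == 'news,python'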

@@ -178,57 +178,8 @@ class TestJSONLOutput(unittest.TestCase):
         self.assertEqual(result['urls'], 'https://example.com')
         self.assertEqual(result['status'], 'queued')
 
-    def test_snapshot_to_jsonl(self):
-        """Snapshot model should serialize to JSONL correctly."""
-        from archivebox.misc.jsonl import TYPE_SNAPSHOT
-
-        # Create a mock snapshot with to_jsonl method configured
-        mock_snapshot = MagicMock()
-        mock_snapshot.to_jsonl.return_value = {
-            'type': TYPE_SNAPSHOT,
-            'schema_version': '0.9.0',
-            'id': 'test-uuid-1234',
-            'url': 'https://example.com',
-            'title': 'Example Title',
-            'tags': 'tag1,tag2',
-            'bookmarked_at': None,
-            'created_at': None,
-            'timestamp': '1234567890',
-            'depth': 0,
-            'status': 'queued',
-        }
-
-        result = mock_snapshot.to_jsonl()
-        self.assertEqual(result['type'], TYPE_SNAPSHOT)
-        self.assertEqual(result['id'], 'test-uuid-1234')
-        self.assertEqual(result['url'], 'https://example.com')
-        self.assertEqual(result['title'], 'Example Title')
-
-    def test_archiveresult_to_jsonl(self):
-        """ArchiveResult model should serialize to JSONL correctly."""
-        from archivebox.misc.jsonl import TYPE_ARCHIVERESULT
-
-        # Create a mock result with to_jsonl method configured
-        mock_result = MagicMock()
-        mock_result.to_jsonl.return_value = {
-            'type': TYPE_ARCHIVERESULT,
-            'schema_version': '0.9.0',
-            'id': 'result-uuid-5678',
-            'snapshot_id': 'snapshot-uuid-1234',
-            'plugin': 'title',
-            'hook_name': '',
-            'status': 'succeeded',
-            'output_str': 'Example Title',
-            'start_ts': None,
-            'end_ts': None,
-        }
-
-        result = mock_result.to_jsonl()
-        self.assertEqual(result['type'], TYPE_ARCHIVERESULT)
-        self.assertEqual(result['id'], 'result-uuid-5678')
-        self.assertEqual(result['snapshot_id'], 'snapshot-uuid-1234')
-        self.assertEqual(result['plugin'], 'title')
-        self.assertEqual(result['status'], 'succeeded')
+    # Note: Snapshot and ArchiveResult serialization is tested in integration tests
+    # (TestPipingWorkflowIntegration) using real model instances, not mocks.
 
 
 class TestReadArgsOrStdin(unittest.TestCase):
@@ -395,28 +346,9 @@ class TestSnapshotCommand(unittest.TestCase):
         self.assertEqual(records[0]['tags'], 'tag1,tag2')
         self.assertEqual(records[0]['title'], 'Test')
 
-    def test_snapshot_output_format(self):
-        """snapshot output should include id and url."""
-
-        mock_snapshot = MagicMock()
-        mock_snapshot.to_jsonl.return_value = {
-            'type': 'Snapshot',
-            'schema_version': '0.9.0',
-            'id': 'test-id',
-            'url': 'https://example.com',
-            'title': 'Test',
-            'tags': '',
-            'bookmarked_at': None,
-            'created_at': None,
-            'timestamp': '123',
-            'depth': 0,
-            'status': 'queued',
-        }
-
-        output = mock_snapshot.to_jsonl()
-        self.assertIn('id', output)
-        self.assertIn('url', output)
-        self.assertEqual(output['type'], 'Snapshot')
+    # Note: Snapshot output format is tested in integration tests
+    # (TestPipingWorkflowIntegration.test_snapshot_creates_and_outputs_jsonl)
+    # using real Snapshot instances.
 
 
 class TestExtractCommand(unittest.TestCase):

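The deleted tests only asserted on values they had just configured on a MagicMock, so they could never fail regardless of what the real to_jsonl() returns. A condensed illustration of why such a test is tautological:

from unittest.mock import MagicMock

mock_snapshot = MagicMock()
mock_snapshot.to_jsonl.return_value = {'id': 'test-uuid-1234'}

# This assertion checks the mock's own configuration, not any real code path,
# so it passes even if Snapshot.to_jsonl() is broken or removed entirely.
assert mock_snapshot.to_jsonl()['id'] == 'test-uuid-1234'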

@@ -188,6 +188,7 @@ class ConstantsDict(Mapping):
         "queue.sqlite3-wal",
         "queue.sqlite3-shm",
         JSON_INDEX_FILENAME,
+        JSONL_INDEX_FILENAME,
         HTML_INDEX_FILENAME,
         ROBOTS_TXT_FILENAME,
         FAVICON_FILENAME,


@@ -600,13 +600,13 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
                         if record.get('type') == 'Snapshot':
                             data = record
                             break
-            except:
+            except (json.JSONDecodeError, OSError):
                 pass
         elif json_path.exists():
             try:
                 with open(json_path) as f:
                     data = json.load(f)
-            except:
+            except (json.JSONDecodeError, OSError):
                 pass
 
         if not data:
@@ -661,13 +661,13 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
                         if record.get('type') == 'Snapshot':
                             data = record
                             break
-            except:
+            except (json.JSONDecodeError, OSError):
                 pass
         elif json_path.exists():
             try:
                 with open(json_path) as f:
                     data = json.load(f)
-            except:
+            except (json.JSONDecodeError, OSError):
                 pass
 
         if not data:

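For context on the narrowed handlers above: a bare except: also swallows KeyboardInterrupt and SystemExit, so a Ctrl-C during index loading would be silently ignored. A standalone sketch of the narrowed pattern, using a hypothetical load_index helper for illustration:

import json

def load_index(path: str) -> dict | None:
    # Hypothetical helper illustrating the narrowed exception handling.
    try:
        with open(path) as f:
            return json.load(f)
    except (json.JSONDecodeError, OSError):
        # Corrupt or unreadable index file: treat as missing and let the
        # caller fall back to another source. KeyboardInterrupt/SystemExit
        # still propagate, unlike with a bare `except:`.
        return None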

@@ -92,7 +92,7 @@ def parse_json_links_details(out_dir: Path) -> Iterator[SnapshotDict]:
             except (json.JSONDecodeError, KeyError, TypeError):
                 pass
-        elif json_file.exists():
+        if link is None and json_file.exists():
             try:
                 with open(json_file, 'r', encoding='utf-8') as f:
                     link = json.load(f)
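The one-word change above is the actual fallback fix: with elif, a JSONL index that exists but fails to parse blocked the legacy JSON index from ever being read; checking link is None instead consults the JSON file whenever JSONL parsing produced nothing. A condensed standalone sketch of the same fallback pattern, assuming index.jsonl/index.json filenames and a hypothetical load_snapshot_record helper for illustration:

import json
from pathlib import Path

def load_snapshot_record(out_dir: Path) -> dict | None:
    record = None
    jsonl_file = out_dir / 'index.jsonl'   # assumed filename, for illustration
    json_file = out_dir / 'index.json'

    if jsonl_file.exists():
        try:
            for line in jsonl_file.read_text(encoding='utf-8').splitlines():
                if not line.strip():
                    continue
                candidate = json.loads(line)
                if candidate.get('type') == 'Snapshot':
                    record = candidate
                    break
        except (json.JSONDecodeError, OSError):
            pass  # fall through to the legacy JSON index below

    if record is None and json_file.exists():   # `if`, not `elif`: real fallback
        try:
            record = json.loads(json_file.read_text(encoding='utf-8'))
        except (json.JSONDecodeError, OSError):
            pass

    return record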