diff --git a/archivebox/cli/archivebox_crawl.py b/archivebox/cli/archivebox_crawl.py index 4e583c98..d8c3c7ad 100644 --- a/archivebox/cli/archivebox_crawl.py +++ b/archivebox/cli/archivebox_crawl.py @@ -37,7 +37,7 @@ import rich_click as click def create_crawls( - args: tuple, + records: list, depth: int = 0, tag: str = '', created_by_id: Optional[int] = None, @@ -45,7 +45,7 @@ def create_crawls( """ Create a single Crawl job from all input URLs. - Reads from args or stdin, creates one Crawl with all URLs, outputs JSONL. + Takes pre-read records, creates one Crawl with all URLs, outputs JSONL. Does NOT start the crawl - just creates the job in QUEUED state. Exit codes: @@ -54,16 +54,13 @@ def create_crawls( """ from rich import print as rprint - from archivebox.misc.jsonl import read_args_or_stdin, write_record + from archivebox.misc.jsonl import write_record from archivebox.base_models.models import get_or_create_system_user_pk from archivebox.crawls.models import Crawl created_by_id = created_by_id or get_or_create_system_user_pk() is_tty = sys.stdout.isatty() - # Collect all input records - records = list(read_args_or_stdin(args)) - if not records: rprint('[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr) return 1 @@ -188,7 +185,7 @@ def main(depth: int, tag: str, args: tuple): sys.exit(exit_code) else: # Default behavior: create Crawl jobs from URLs - sys.exit(create_crawls(args, depth=depth, tag=tag)) + sys.exit(create_crawls(records, depth=depth, tag=tag)) if __name__ == '__main__': diff --git a/archivebox/cli/archivebox_snapshot.py b/archivebox/cli/archivebox_snapshot.py index 7ef2ff4a..b9876bb1 100644 --- a/archivebox/cli/archivebox_snapshot.py +++ b/archivebox/cli/archivebox_snapshot.py @@ -130,9 +130,16 @@ def create_snapshots( # Create snapshots for each URL in the crawl for url in crawl.get_urls_list(): + # Merge CLI tags with crawl tags + merged_tags = crawl.tags_str + if tag: + if merged_tags: + merged_tags = f"{merged_tags},{tag}" + else: + merged_tags = tag snapshot_record = { 'url': url, - 'tags': crawl.tags_str, + 'tags': merged_tags, 'crawl_id': str(crawl.id), 'depth': 0, } diff --git a/archivebox/cli/tests_piping.py b/archivebox/cli/tests_piping.py index 09927d2c..f6aee426 100644 --- a/archivebox/cli/tests_piping.py +++ b/archivebox/cli/tests_piping.py @@ -178,57 +178,8 @@ class TestJSONLOutput(unittest.TestCase): self.assertEqual(result['urls'], 'https://example.com') self.assertEqual(result['status'], 'queued') - def test_snapshot_to_jsonl(self): - """Snapshot model should serialize to JSONL correctly.""" - from archivebox.misc.jsonl import TYPE_SNAPSHOT - - # Create a mock snapshot with to_jsonl method configured - mock_snapshot = MagicMock() - mock_snapshot.to_jsonl.return_value = { - 'type': TYPE_SNAPSHOT, - 'schema_version': '0.9.0', - 'id': 'test-uuid-1234', - 'url': 'https://example.com', - 'title': 'Example Title', - 'tags': 'tag1,tag2', - 'bookmarked_at': None, - 'created_at': None, - 'timestamp': '1234567890', - 'depth': 0, - 'status': 'queued', - } - - result = mock_snapshot.to_jsonl() - self.assertEqual(result['type'], TYPE_SNAPSHOT) - self.assertEqual(result['id'], 'test-uuid-1234') - self.assertEqual(result['url'], 'https://example.com') - self.assertEqual(result['title'], 'Example Title') - - def test_archiveresult_to_jsonl(self): - """ArchiveResult model should serialize to JSONL correctly.""" - from archivebox.misc.jsonl import TYPE_ARCHIVERESULT - - # Create a mock result with to_jsonl method configured - mock_result = MagicMock() - mock_result.to_jsonl.return_value = { - 'type': TYPE_ARCHIVERESULT, - 'schema_version': '0.9.0', - 'id': 'result-uuid-5678', - 'snapshot_id': 'snapshot-uuid-1234', - 'plugin': 'title', - 'hook_name': '', - 'status': 'succeeded', - 'output_str': 'Example Title', - 'start_ts': None, - 'end_ts': None, - } - - result = mock_result.to_jsonl() - self.assertEqual(result['type'], TYPE_ARCHIVERESULT) - self.assertEqual(result['id'], 'result-uuid-5678') - self.assertEqual(result['snapshot_id'], 'snapshot-uuid-1234') - self.assertEqual(result['plugin'], 'title') - self.assertEqual(result['status'], 'succeeded') + # Note: Snapshot and ArchiveResult serialization is tested in integration tests + # (TestPipingWorkflowIntegration) using real model instances, not mocks. class TestReadArgsOrStdin(unittest.TestCase): @@ -395,28 +346,9 @@ class TestSnapshotCommand(unittest.TestCase): self.assertEqual(records[0]['tags'], 'tag1,tag2') self.assertEqual(records[0]['title'], 'Test') - def test_snapshot_output_format(self): - """snapshot output should include id and url.""" - mock_snapshot = MagicMock() - mock_snapshot.to_jsonl.return_value = { - 'type': 'Snapshot', - 'schema_version': '0.9.0', - 'id': 'test-id', - 'url': 'https://example.com', - 'title': 'Test', - 'tags': '', - 'bookmarked_at': None, - 'created_at': None, - 'timestamp': '123', - 'depth': 0, - 'status': 'queued', - } - - output = mock_snapshot.to_jsonl() - - self.assertIn('id', output) - self.assertIn('url', output) - self.assertEqual(output['type'], 'Snapshot') + # Note: Snapshot output format is tested in integration tests + # (TestPipingWorkflowIntegration.test_snapshot_creates_and_outputs_jsonl) + # using real Snapshot instances. class TestExtractCommand(unittest.TestCase): diff --git a/archivebox/config/constants.py b/archivebox/config/constants.py index 30f0246b..59e64aeb 100644 --- a/archivebox/config/constants.py +++ b/archivebox/config/constants.py @@ -188,6 +188,7 @@ class ConstantsDict(Mapping): "queue.sqlite3-wal", "queue.sqlite3-shm", JSON_INDEX_FILENAME, + JSONL_INDEX_FILENAME, HTML_INDEX_FILENAME, ROBOTS_TXT_FILENAME, FAVICON_FILENAME, diff --git a/archivebox/core/models.py b/archivebox/core/models.py index e4d7a142..adfd8c02 100755 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -600,13 +600,13 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea if record.get('type') == 'Snapshot': data = record break - except: + except (json.JSONDecodeError, OSError): pass elif json_path.exists(): try: with open(json_path) as f: data = json.load(f) - except: + except (json.JSONDecodeError, OSError): pass if not data: @@ -661,13 +661,13 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea if record.get('type') == 'Snapshot': data = record break - except: + except (json.JSONDecodeError, OSError): pass elif json_path.exists(): try: with open(json_path) as f: data = json.load(f) - except: + except (json.JSONDecodeError, OSError): pass if not data: diff --git a/archivebox/misc/legacy.py b/archivebox/misc/legacy.py index 5dfb787d..e936151d 100644 --- a/archivebox/misc/legacy.py +++ b/archivebox/misc/legacy.py @@ -92,7 +92,7 @@ def parse_json_links_details(out_dir: Path) -> Iterator[SnapshotDict]: except (json.JSONDecodeError, KeyError, TypeError): pass - elif json_file.exists(): + if link is None and json_file.exists(): try: with open(json_file, 'r', encoding='utf-8') as f: link = json.load(f)