mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-03 09:25:42 +10:00
fix: address PR review comments from cubic-dev-ai
- Add JSONL_INDEX_FILENAME to ALLOWED_IN_DATA_DIR for consistency - Fix fallback logic in legacy.py to try JSON when JSONL parsing fails - Replace bare except clauses with specific exception types - Fix stdin double-consumption in archivebox_crawl.py - Merge CLI --tag option with crawl tags in archivebox_snapshot.py - Remove tautological mock tests (covered by integration tests) Co-authored-by: Nick Sweeting <pirate@users.noreply.github.com>
This commit is contained in:
@@ -37,7 +37,7 @@ import rich_click as click
|
||||
|
||||
|
||||
def create_crawls(
|
||||
args: tuple,
|
||||
records: list,
|
||||
depth: int = 0,
|
||||
tag: str = '',
|
||||
created_by_id: Optional[int] = None,
|
||||
@@ -45,7 +45,7 @@ def create_crawls(
|
||||
"""
|
||||
Create a single Crawl job from all input URLs.
|
||||
|
||||
Reads from args or stdin, creates one Crawl with all URLs, outputs JSONL.
|
||||
Takes pre-read records, creates one Crawl with all URLs, outputs JSONL.
|
||||
Does NOT start the crawl - just creates the job in QUEUED state.
|
||||
|
||||
Exit codes:
|
||||
@@ -54,16 +54,13 @@ def create_crawls(
|
||||
"""
|
||||
from rich import print as rprint
|
||||
|
||||
from archivebox.misc.jsonl import read_args_or_stdin, write_record
|
||||
from archivebox.misc.jsonl import write_record
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.crawls.models import Crawl
|
||||
|
||||
created_by_id = created_by_id or get_or_create_system_user_pk()
|
||||
is_tty = sys.stdout.isatty()
|
||||
|
||||
# Collect all input records
|
||||
records = list(read_args_or_stdin(args))
|
||||
|
||||
if not records:
|
||||
rprint('[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr)
|
||||
return 1
|
||||
@@ -188,7 +185,7 @@ def main(depth: int, tag: str, args: tuple):
|
||||
sys.exit(exit_code)
|
||||
else:
|
||||
# Default behavior: create Crawl jobs from URLs
|
||||
sys.exit(create_crawls(args, depth=depth, tag=tag))
|
||||
sys.exit(create_crawls(records, depth=depth, tag=tag))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -130,9 +130,16 @@ def create_snapshots(
|
||||
|
||||
# Create snapshots for each URL in the crawl
|
||||
for url in crawl.get_urls_list():
|
||||
# Merge CLI tags with crawl tags
|
||||
merged_tags = crawl.tags_str
|
||||
if tag:
|
||||
if merged_tags:
|
||||
merged_tags = f"{merged_tags},{tag}"
|
||||
else:
|
||||
merged_tags = tag
|
||||
snapshot_record = {
|
||||
'url': url,
|
||||
'tags': crawl.tags_str,
|
||||
'tags': merged_tags,
|
||||
'crawl_id': str(crawl.id),
|
||||
'depth': 0,
|
||||
}
|
||||
|
||||
@@ -178,57 +178,8 @@ class TestJSONLOutput(unittest.TestCase):
|
||||
self.assertEqual(result['urls'], 'https://example.com')
|
||||
self.assertEqual(result['status'], 'queued')
|
||||
|
||||
def test_snapshot_to_jsonl(self):
|
||||
"""Snapshot model should serialize to JSONL correctly."""
|
||||
from archivebox.misc.jsonl import TYPE_SNAPSHOT
|
||||
|
||||
# Create a mock snapshot with to_jsonl method configured
|
||||
mock_snapshot = MagicMock()
|
||||
mock_snapshot.to_jsonl.return_value = {
|
||||
'type': TYPE_SNAPSHOT,
|
||||
'schema_version': '0.9.0',
|
||||
'id': 'test-uuid-1234',
|
||||
'url': 'https://example.com',
|
||||
'title': 'Example Title',
|
||||
'tags': 'tag1,tag2',
|
||||
'bookmarked_at': None,
|
||||
'created_at': None,
|
||||
'timestamp': '1234567890',
|
||||
'depth': 0,
|
||||
'status': 'queued',
|
||||
}
|
||||
|
||||
result = mock_snapshot.to_jsonl()
|
||||
self.assertEqual(result['type'], TYPE_SNAPSHOT)
|
||||
self.assertEqual(result['id'], 'test-uuid-1234')
|
||||
self.assertEqual(result['url'], 'https://example.com')
|
||||
self.assertEqual(result['title'], 'Example Title')
|
||||
|
||||
def test_archiveresult_to_jsonl(self):
|
||||
"""ArchiveResult model should serialize to JSONL correctly."""
|
||||
from archivebox.misc.jsonl import TYPE_ARCHIVERESULT
|
||||
|
||||
# Create a mock result with to_jsonl method configured
|
||||
mock_result = MagicMock()
|
||||
mock_result.to_jsonl.return_value = {
|
||||
'type': TYPE_ARCHIVERESULT,
|
||||
'schema_version': '0.9.0',
|
||||
'id': 'result-uuid-5678',
|
||||
'snapshot_id': 'snapshot-uuid-1234',
|
||||
'plugin': 'title',
|
||||
'hook_name': '',
|
||||
'status': 'succeeded',
|
||||
'output_str': 'Example Title',
|
||||
'start_ts': None,
|
||||
'end_ts': None,
|
||||
}
|
||||
|
||||
result = mock_result.to_jsonl()
|
||||
self.assertEqual(result['type'], TYPE_ARCHIVERESULT)
|
||||
self.assertEqual(result['id'], 'result-uuid-5678')
|
||||
self.assertEqual(result['snapshot_id'], 'snapshot-uuid-1234')
|
||||
self.assertEqual(result['plugin'], 'title')
|
||||
self.assertEqual(result['status'], 'succeeded')
|
||||
# Note: Snapshot and ArchiveResult serialization is tested in integration tests
|
||||
# (TestPipingWorkflowIntegration) using real model instances, not mocks.
|
||||
|
||||
|
||||
class TestReadArgsOrStdin(unittest.TestCase):
|
||||
@@ -395,28 +346,9 @@ class TestSnapshotCommand(unittest.TestCase):
|
||||
self.assertEqual(records[0]['tags'], 'tag1,tag2')
|
||||
self.assertEqual(records[0]['title'], 'Test')
|
||||
|
||||
def test_snapshot_output_format(self):
|
||||
"""snapshot output should include id and url."""
|
||||
mock_snapshot = MagicMock()
|
||||
mock_snapshot.to_jsonl.return_value = {
|
||||
'type': 'Snapshot',
|
||||
'schema_version': '0.9.0',
|
||||
'id': 'test-id',
|
||||
'url': 'https://example.com',
|
||||
'title': 'Test',
|
||||
'tags': '',
|
||||
'bookmarked_at': None,
|
||||
'created_at': None,
|
||||
'timestamp': '123',
|
||||
'depth': 0,
|
||||
'status': 'queued',
|
||||
}
|
||||
|
||||
output = mock_snapshot.to_jsonl()
|
||||
|
||||
self.assertIn('id', output)
|
||||
self.assertIn('url', output)
|
||||
self.assertEqual(output['type'], 'Snapshot')
|
||||
# Note: Snapshot output format is tested in integration tests
|
||||
# (TestPipingWorkflowIntegration.test_snapshot_creates_and_outputs_jsonl)
|
||||
# using real Snapshot instances.
|
||||
|
||||
|
||||
class TestExtractCommand(unittest.TestCase):
|
||||
|
||||
@@ -188,6 +188,7 @@ class ConstantsDict(Mapping):
|
||||
"queue.sqlite3-wal",
|
||||
"queue.sqlite3-shm",
|
||||
JSON_INDEX_FILENAME,
|
||||
JSONL_INDEX_FILENAME,
|
||||
HTML_INDEX_FILENAME,
|
||||
ROBOTS_TXT_FILENAME,
|
||||
FAVICON_FILENAME,
|
||||
|
||||
@@ -600,13 +600,13 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
if record.get('type') == 'Snapshot':
|
||||
data = record
|
||||
break
|
||||
except:
|
||||
except (json.JSONDecodeError, OSError):
|
||||
pass
|
||||
elif json_path.exists():
|
||||
try:
|
||||
with open(json_path) as f:
|
||||
data = json.load(f)
|
||||
except:
|
||||
except (json.JSONDecodeError, OSError):
|
||||
pass
|
||||
|
||||
if not data:
|
||||
@@ -661,13 +661,13 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
if record.get('type') == 'Snapshot':
|
||||
data = record
|
||||
break
|
||||
except:
|
||||
except (json.JSONDecodeError, OSError):
|
||||
pass
|
||||
elif json_path.exists():
|
||||
try:
|
||||
with open(json_path) as f:
|
||||
data = json.load(f)
|
||||
except:
|
||||
except (json.JSONDecodeError, OSError):
|
||||
pass
|
||||
|
||||
if not data:
|
||||
|
||||
@@ -92,7 +92,7 @@ def parse_json_links_details(out_dir: Path) -> Iterator[SnapshotDict]:
|
||||
except (json.JSONDecodeError, KeyError, TypeError):
|
||||
pass
|
||||
|
||||
elif json_file.exists():
|
||||
if link is None and json_file.exists():
|
||||
try:
|
||||
with open(json_file, 'r', encoding='utf-8') as f:
|
||||
link = json.load(f)
|
||||
|
||||
Reference in New Issue
Block a user