fix: address PR review comments from cubic-dev-ai

- Add JSONL_INDEX_FILENAME to ALLOWED_IN_DATA_DIR for consistency
- Fix fallback logic in legacy.py to try JSON when JSONL parsing fails
- Replace bare except clauses with specific exception types
- Fix stdin double-consumption in archivebox_crawl.py
- Merge CLI --tag option with crawl tags in archivebox_snapshot.py
- Remove tautological mock tests (covered by integration tests)

Co-authored-by: Nick Sweeting <pirate@users.noreply.github.com>
claude[bot]
2025-12-30 20:08:54 +00:00
parent cf387ed59f
commit 762cddc8c5
6 changed files with 23 additions and 86 deletions


@@ -37,7 +37,7 @@ import rich_click as click
 def create_crawls(
-    args: tuple,
+    records: list,
     depth: int = 0,
     tag: str = '',
     created_by_id: Optional[int] = None,
@@ -45,7 +45,7 @@ def create_crawls(
     """
     Create a single Crawl job from all input URLs.
-    Reads from args or stdin, creates one Crawl with all URLs, outputs JSONL.
+    Takes pre-read records, creates one Crawl with all URLs, outputs JSONL.
     Does NOT start the crawl - just creates the job in QUEUED state.
 
     Exit codes:
@@ -54,16 +54,13 @@
     """
     from rich import print as rprint
-    from archivebox.misc.jsonl import read_args_or_stdin, write_record
+    from archivebox.misc.jsonl import write_record
     from archivebox.base_models.models import get_or_create_system_user_pk
     from archivebox.crawls.models import Crawl
 
     created_by_id = created_by_id or get_or_create_system_user_pk()
     is_tty = sys.stdout.isatty()
 
-    # Collect all input records
-    records = list(read_args_or_stdin(args))
     if not records:
         rprint('[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr)
         return 1
@@ -188,7 +185,7 @@ def main(depth: int, tag: str, args: tuple):
         sys.exit(exit_code)
     else:
         # Default behavior: create Crawl jobs from URLs
-        sys.exit(create_crawls(args, depth=depth, tag=tag))
+        sys.exit(create_crawls(records, depth=depth, tag=tag))
 
 
 if __name__ == '__main__':

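For context, a minimal standalone sketch of the read-once pattern this change adopts, with simplified hypothetical bodies rather than the real archivebox implementation: main() consumes args/stdin exactly once and hands the resulting list to create_crawls(), so no sub-command re-reads sys.stdin.

import json
import sys
from typing import Optional

def read_args_or_stdin(args: tuple) -> list[dict]:
    # Hypothetical simplified reader: positional args become URL records,
    # otherwise one JSON record is parsed per non-empty stdin line.
    if args:
        return [{'url': url} for url in args]
    if sys.stdin.isatty():
        return []
    return [json.loads(line) for line in sys.stdin if line.strip()]

def create_crawls(records: list, depth: int = 0, tag: str = '',
                  created_by_id: Optional[int] = None) -> int:
    # Receives pre-read records instead of re-reading stdin itself.
    if not records:
        print('No URLs provided. Pass URLs as arguments or via stdin.', file=sys.stderr)
        return 1
    # ... create one Crawl containing all URLs, emit JSONL ...
    return 0

def main(depth: int, tag: str, args: tuple) -> None:
    records = read_args_or_stdin(args)   # single read of stdin
    sys.exit(create_crawls(records, depth=depth, tag=tag))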

@@ -130,9 +130,16 @@ def create_snapshots(
         # Create snapshots for each URL in the crawl
         for url in crawl.get_urls_list():
+            # Merge CLI tags with crawl tags
+            merged_tags = crawl.tags_str
+            if tag:
+                if merged_tags:
+                    merged_tags = f"{merged_tags},{tag}"
+                else:
+                    merged_tags = tag
             snapshot_record = {
                 'url': url,
-                'tags': crawl.tags_str,
+                'tags': merged_tags,
                 'crawl_id': str(crawl.id),
                 'depth': 0,
             }

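The merge rule added above can be read as: crawl tags first, then the CLI --tag value, skipping empty values so no stray commas appear. A hypothetical merge_tags helper, for illustration only:

def merge_tags(crawl_tags: str, cli_tag: str) -> str:
    # Keep crawl tags first, append the CLI tag, drop empty strings.
    return ','.join(t for t in (crawl_tags, cli_tag) if t)

assert merge_tags('news,python', 'manual') == 'news,python,manual'
assert merge_tags('', 'manual') == 'manual'
assert merge_tags('news,python', '') == 'news,python'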

@@ -178,57 +178,8 @@ class TestJSONLOutput(unittest.TestCase):
         self.assertEqual(result['urls'], 'https://example.com')
         self.assertEqual(result['status'], 'queued')
 
-    def test_snapshot_to_jsonl(self):
-        """Snapshot model should serialize to JSONL correctly."""
-        from archivebox.misc.jsonl import TYPE_SNAPSHOT
-
-        # Create a mock snapshot with to_jsonl method configured
-        mock_snapshot = MagicMock()
-        mock_snapshot.to_jsonl.return_value = {
-            'type': TYPE_SNAPSHOT,
-            'schema_version': '0.9.0',
-            'id': 'test-uuid-1234',
-            'url': 'https://example.com',
-            'title': 'Example Title',
-            'tags': 'tag1,tag2',
-            'bookmarked_at': None,
-            'created_at': None,
-            'timestamp': '1234567890',
-            'depth': 0,
-            'status': 'queued',
-        }
-
-        result = mock_snapshot.to_jsonl()
-        self.assertEqual(result['type'], TYPE_SNAPSHOT)
-        self.assertEqual(result['id'], 'test-uuid-1234')
-        self.assertEqual(result['url'], 'https://example.com')
-        self.assertEqual(result['title'], 'Example Title')
-
-    def test_archiveresult_to_jsonl(self):
-        """ArchiveResult model should serialize to JSONL correctly."""
-        from archivebox.misc.jsonl import TYPE_ARCHIVERESULT
-
-        # Create a mock result with to_jsonl method configured
-        mock_result = MagicMock()
-        mock_result.to_jsonl.return_value = {
-            'type': TYPE_ARCHIVERESULT,
-            'schema_version': '0.9.0',
-            'id': 'result-uuid-5678',
-            'snapshot_id': 'snapshot-uuid-1234',
-            'plugin': 'title',
-            'hook_name': '',
-            'status': 'succeeded',
-            'output_str': 'Example Title',
-            'start_ts': None,
-            'end_ts': None,
-        }
-
-        result = mock_result.to_jsonl()
-        self.assertEqual(result['type'], TYPE_ARCHIVERESULT)
-        self.assertEqual(result['id'], 'result-uuid-5678')
-        self.assertEqual(result['snapshot_id'], 'snapshot-uuid-1234')
-        self.assertEqual(result['plugin'], 'title')
-        self.assertEqual(result['status'], 'succeeded')
+    # Note: Snapshot and ArchiveResult serialization is tested in integration tests
+    # (TestPipingWorkflowIntegration) using real model instances, not mocks.
 
 
 class TestReadArgsOrStdin(unittest.TestCase):
@@ -395,28 +346,9 @@ class TestSnapshotCommand(unittest.TestCase):
         self.assertEqual(records[0]['tags'], 'tag1,tag2')
         self.assertEqual(records[0]['title'], 'Test')
 
-    def test_snapshot_output_format(self):
-        """snapshot output should include id and url."""
-
-        mock_snapshot = MagicMock()
-        mock_snapshot.to_jsonl.return_value = {
-            'type': 'Snapshot',
-            'schema_version': '0.9.0',
-            'id': 'test-id',
-            'url': 'https://example.com',
-            'title': 'Test',
-            'tags': '',
-            'bookmarked_at': None,
-            'created_at': None,
-            'timestamp': '123',
-            'depth': 0,
-            'status': 'queued',
-        }
-
-        output = mock_snapshot.to_jsonl()
-        self.assertIn('id', output)
-        self.assertIn('url', output)
-        self.assertEqual(output['type'], 'Snapshot')
+    # Note: Snapshot output format is tested in integration tests
+    # (TestPipingWorkflowIntegration.test_snapshot_creates_and_outputs_jsonl)
+    # using real Snapshot instances.
 
 
 class TestExtractCommand(unittest.TestCase):

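The deleted tests only asserted on values they had just configured on a MagicMock, so they could never fail regardless of what the real to_jsonl() returns. A condensed illustration of why such a test is tautological:

from unittest.mock import MagicMock

mock_snapshot = MagicMock()
mock_snapshot.to_jsonl.return_value = {'id': 'test-uuid-1234'}

# This assertion checks the mock's own configuration, not any real code path,
# so it passes even if Snapshot.to_jsonl() is broken or removed entirely.
assert mock_snapshot.to_jsonl()['id'] == 'test-uuid-1234'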

@@ -188,6 +188,7 @@ class ConstantsDict(Mapping):
         "queue.sqlite3-wal",
         "queue.sqlite3-shm",
         JSON_INDEX_FILENAME,
+        JSONL_INDEX_FILENAME,
         HTML_INDEX_FILENAME,
         ROBOTS_TXT_FILENAME,
         FAVICON_FILENAME,


@@ -600,13 +600,13 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
                         if record.get('type') == 'Snapshot':
                             data = record
                             break
-            except:
+            except (json.JSONDecodeError, OSError):
                 pass
         elif json_path.exists():
             try:
                 with open(json_path) as f:
                     data = json.load(f)
-            except:
+            except (json.JSONDecodeError, OSError):
                 pass
 
         if not data:
@@ -661,13 +661,13 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
                         if record.get('type') == 'Snapshot':
                             data = record
                             break
-            except:
+            except (json.JSONDecodeError, OSError):
                 pass
         elif json_path.exists():
             try:
                 with open(json_path) as f:
                     data = json.load(f)
-            except:
+            except (json.JSONDecodeError, OSError):
                 pass
 
         if not data:

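For context on the narrowed handlers above: a bare except: also swallows KeyboardInterrupt and SystemExit, so a Ctrl-C during index loading would be silently ignored. A standalone sketch of the narrowed pattern, using a hypothetical load_index helper for illustration:

import json

def load_index(path: str) -> dict | None:
    # Hypothetical helper illustrating the narrowed exception handling.
    try:
        with open(path) as f:
            return json.load(f)
    except (json.JSONDecodeError, OSError):
        # Corrupt or unreadable index file: treat as missing and let the
        # caller fall back to another source. KeyboardInterrupt/SystemExit
        # still propagate, unlike with a bare `except:`.
        return None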

@@ -92,7 +92,7 @@ def parse_json_links_details(out_dir: Path) -> Iterator[SnapshotDict]:
             except (json.JSONDecodeError, KeyError, TypeError):
                 pass
-        elif json_file.exists():
+        if link is None and json_file.exists():
             try:
                 with open(json_file, 'r', encoding='utf-8') as f:
                     link = json.load(f)
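The one-word change above is the actual fallback fix: with elif, a JSONL index that exists but fails to parse blocked the legacy JSON index from ever being read; checking link is None instead consults the JSON file whenever JSONL parsing produced nothing. A condensed standalone sketch of the same fallback pattern, assuming index.jsonl/index.json filenames and a hypothetical load_snapshot_record helper for illustration:

import json
from pathlib import Path

def load_snapshot_record(out_dir: Path) -> dict | None:
    record = None
    jsonl_file = out_dir / 'index.jsonl'   # assumed filename, for illustration
    json_file = out_dir / 'index.json'

    if jsonl_file.exists():
        try:
            for line in jsonl_file.read_text(encoding='utf-8').splitlines():
                if not line.strip():
                    continue
                candidate = json.loads(line)
                if candidate.get('type') == 'Snapshot':
                    record = candidate
                    break
        except (json.JSONDecodeError, OSError):
            pass  # fall through to the legacy JSON index below

    if record is None and json_file.exists():   # `if`, not `elif`: real fallback
        try:
            record = json.loads(json_file.read_text(encoding='utf-8'))
        except (json.JSONDecodeError, OSError):
            pass

    return record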