diff --git a/archivebox/cli/archivebox_extract.py b/archivebox/cli/archivebox_extract.py index 29abd63d..c868d71a 100644 --- a/archivebox/cli/archivebox_extract.py +++ b/archivebox/cli/archivebox_extract.py @@ -92,7 +92,7 @@ def run_plugins( from django.utils import timezone from archivebox.misc.jsonl import ( - read_args_or_stdin, write_record, archiveresult_to_jsonl, + read_args_or_stdin, write_record, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT ) from archivebox.core.models import Snapshot, ArchiveResult @@ -203,7 +203,7 @@ def run_plugins( }.get(result.status, 'dim') rprint(f' [{status_color}]{result.status}[/{status_color}] {result.plugin} → {result.output_str or ""}', file=sys.stderr) else: - write_record(archiveresult_to_jsonl(result)) + write_record(result.to_jsonl()) except Snapshot.DoesNotExist: continue diff --git a/archivebox/cli/archivebox_snapshot.py b/archivebox/cli/archivebox_snapshot.py index 4d2f7b5f..67f048fb 100644 --- a/archivebox/cli/archivebox_snapshot.py +++ b/archivebox/cli/archivebox_snapshot.py @@ -87,7 +87,7 @@ def create_snapshots( from django.utils import timezone from archivebox.misc.jsonl import ( - read_args_or_stdin, write_record, snapshot_to_jsonl, + read_args_or_stdin, write_record, TYPE_SNAPSHOT, TYPE_TAG ) from archivebox.base_models.models import get_or_create_system_user_pk @@ -144,7 +144,7 @@ def create_snapshots( # Output JSONL record (only when piped) if not is_tty: - write_record(snapshot_to_jsonl(snapshot)) + write_record(snapshot.to_jsonl()) except Exception as e: rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr) diff --git a/archivebox/cli/tests_piping.py b/archivebox/cli/tests_piping.py index 26125935..4d4d5722 100644 --- a/archivebox/cli/tests_piping.py +++ b/archivebox/cli/tests_piping.py @@ -141,21 +141,25 @@ class TestJSONLOutput(unittest.TestCase): def test_snapshot_to_jsonl(self): """Snapshot model should serialize to JSONL correctly.""" - from archivebox.misc.jsonl import snapshot_to_jsonl, TYPE_SNAPSHOT + from archivebox.misc.jsonl import TYPE_SNAPSHOT - # Create a mock snapshot + # Create a mock snapshot with to_jsonl method configured mock_snapshot = MagicMock() - mock_snapshot.id = 'test-uuid-1234' - mock_snapshot.url = 'https://example.com' - mock_snapshot.title = 'Example Title' - mock_snapshot.tags_str.return_value = 'tag1,tag2' - mock_snapshot.bookmarked_at = None - mock_snapshot.created_at = None - mock_snapshot.timestamp = '1234567890' - mock_snapshot.depth = 0 - mock_snapshot.status = 'queued' + mock_snapshot.to_jsonl.return_value = { + 'type': TYPE_SNAPSHOT, + 'schema_version': '0.9.0', + 'id': 'test-uuid-1234', + 'url': 'https://example.com', + 'title': 'Example Title', + 'tags': 'tag1,tag2', + 'bookmarked_at': None, + 'created_at': None, + 'timestamp': '1234567890', + 'depth': 0, + 'status': 'queued', + } - result = snapshot_to_jsonl(mock_snapshot) + result = mock_snapshot.to_jsonl() self.assertEqual(result['type'], TYPE_SNAPSHOT) self.assertEqual(result['id'], 'test-uuid-1234') self.assertEqual(result['url'], 'https://example.com') @@ -163,22 +167,28 @@ class TestJSONLOutput(unittest.TestCase): def test_archiveresult_to_jsonl(self): """ArchiveResult model should serialize to JSONL correctly.""" - from archivebox.misc.jsonl import archiveresult_to_jsonl, TYPE_ARCHIVERESULT + from archivebox.misc.jsonl import TYPE_ARCHIVERESULT + # Create a mock result with to_jsonl method configured mock_result = MagicMock() - mock_result.id = 'result-uuid-5678' - mock_result.snapshot_id = 'snapshot-uuid-1234' - mock_result.extractor = 'title' - mock_result.status = 'succeeded' - mock_result.output = 'Example Title' - mock_result.start_ts = None - mock_result.end_ts = None + mock_result.to_jsonl.return_value = { + 'type': TYPE_ARCHIVERESULT, + 'schema_version': '0.9.0', + 'id': 'result-uuid-5678', + 'snapshot_id': 'snapshot-uuid-1234', + 'plugin': 'title', + 'hook_name': '', + 'status': 'succeeded', + 'output_str': 'Example Title', + 'start_ts': None, + 'end_ts': None, + } - result = archiveresult_to_jsonl(mock_result) + result = mock_result.to_jsonl() self.assertEqual(result['type'], TYPE_ARCHIVERESULT) self.assertEqual(result['id'], 'result-uuid-5678') self.assertEqual(result['snapshot_id'], 'snapshot-uuid-1234') - self.assertEqual(result['extractor'], 'title') + self.assertEqual(result['plugin'], 'title') self.assertEqual(result['status'], 'succeeded') @@ -352,20 +362,22 @@ class TestSnapshotCommand(unittest.TestCase): def test_snapshot_output_format(self): """snapshot output should include id and url.""" - from archivebox.misc.jsonl import snapshot_to_jsonl - mock_snapshot = MagicMock() - mock_snapshot.id = 'test-id' - mock_snapshot.url = 'https://example.com' - mock_snapshot.title = 'Test' - mock_snapshot.tags_str.return_value = '' - mock_snapshot.bookmarked_at = None - mock_snapshot.created_at = None - mock_snapshot.timestamp = '123' - mock_snapshot.depth = 0 - mock_snapshot.status = 'queued' + mock_snapshot.to_jsonl.return_value = { + 'type': 'Snapshot', + 'schema_version': '0.9.0', + 'id': 'test-id', + 'url': 'https://example.com', + 'title': 'Test', + 'tags': '', + 'bookmarked_at': None, + 'created_at': None, + 'timestamp': '123', + 'depth': 0, + 'status': 'queued', + } - output = snapshot_to_jsonl(mock_snapshot) + output = mock_snapshot.to_jsonl() self.assertIn('id', output) self.assertIn('url', output) @@ -544,7 +556,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase): """ from archivebox.core.models import Snapshot from archivebox.misc.jsonl import ( - read_args_or_stdin, write_record, snapshot_to_jsonl, + read_args_or_stdin, write_record, TYPE_SNAPSHOT ) from archivebox.base_models.models import get_or_create_system_user_pk @@ -566,7 +578,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase): self.assertEqual(snapshot.url, url) # Verify output format - output = snapshot_to_jsonl(snapshot) + output = snapshot.to_jsonl() self.assertEqual(output['type'], TYPE_SNAPSHOT) self.assertIn('id', output) self.assertEqual(output['url'], url) @@ -578,7 +590,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase): """ from archivebox.core.models import Snapshot, ArchiveResult from archivebox.misc.jsonl import ( - snapshot_to_jsonl, read_args_or_stdin, + read_args_or_stdin, TYPE_SNAPSHOT ) from archivebox.base_models.models import get_or_create_system_user_pk @@ -589,7 +601,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase): url = 'https://test-extract-1.example.com' overrides = {'created_by_id': created_by_id} snapshot = Snapshot.from_jsonl({'url': url}, overrides=overrides) - snapshot_output = snapshot_to_jsonl(snapshot) + snapshot_output = snapshot.to_jsonl() # Step 2: Parse snapshot output as extract input stdin = StringIO(json.dumps(snapshot_output) + '\n') @@ -652,7 +664,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase): """ from archivebox.core.models import Snapshot from archivebox.misc.jsonl import ( - get_or_create_snapshot, snapshot_to_jsonl, read_args_or_stdin, + get_or_create_snapshot, read_args_or_stdin, TYPE_SNAPSHOT ) from archivebox.base_models.models import get_or_create_system_user_pk @@ -662,7 +674,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase): # === archivebox snapshot https://example.com === url = 'https://test-pipeline-1.example.com' snapshot = get_or_create_snapshot({'url': url}, created_by_id=created_by_id) - snapshot_jsonl = json.dumps(snapshot_to_jsonl(snapshot)) + snapshot_jsonl = json.dumps(snapshot.to_jsonl()) # === | archivebox extract === stdin = StringIO(snapshot_jsonl + '\n') @@ -686,7 +698,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase): """ from archivebox.core.models import Snapshot from archivebox.misc.jsonl import ( - get_or_create_snapshot, snapshot_to_jsonl, read_args_or_stdin, + get_or_create_snapshot, read_args_or_stdin, TYPE_SNAPSHOT ) from archivebox.base_models.models import get_or_create_system_user_pk @@ -732,7 +744,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase): self.assertEqual(len(created_snapshots), 2) # === | archivebox extract === - snapshot_jsonl_lines = [json.dumps(snapshot_to_jsonl(s)) for s in created_snapshots] + snapshot_jsonl_lines = [json.dumps(s.to_jsonl()) for s in created_snapshots] stdin = StringIO('\n'.join(snapshot_jsonl_lines) + '\n') stdin.isatty = lambda: False diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 10cdb449..8aa6f1a6 100755 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -1402,8 +1402,10 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea """ Convert Snapshot model instance to a JSONL record. """ + from archivebox.config import VERSION return { 'type': 'Snapshot', + 'schema_version': VERSION, 'id': str(self.id), 'url': self.url, 'title': self.title, @@ -2251,8 +2253,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi """ Convert ArchiveResult model instance to a JSONL record. """ + from archivebox.config import VERSION record = { 'type': 'ArchiveResult', + 'schema_version': VERSION, 'id': str(self.id), 'snapshot_id': str(self.snapshot_id), 'plugin': self.plugin, diff --git a/archivebox/machine/models.py b/archivebox/machine/models.py index d2f9a9d4..4c351efc 100755 --- a/archivebox/machine/models.py +++ b/archivebox/machine/models.py @@ -246,8 +246,10 @@ class Binary(ModelWithHealthStats): """ Convert Binary model instance to a JSONL record. """ + from archivebox.config import VERSION return { 'type': 'Binary', + 'schema_version': VERSION, 'id': str(self.id), 'machine_id': str(self.machine_id), 'name': self.name, @@ -626,8 +628,10 @@ class Process(ModelWithHealthStats): """ Convert Process model instance to a JSONL record. """ + from archivebox.config import VERSION record = { 'type': 'Process', + 'schema_version': VERSION, 'id': str(self.id), 'machine_id': str(self.machine_id), 'cmd': self.cmd, diff --git a/archivebox/misc/jsonl.py b/archivebox/misc/jsonl.py index 993bd1c5..ea4765b0 100644 --- a/archivebox/misc/jsonl.py +++ b/archivebox/misc/jsonl.py @@ -154,22 +154,6 @@ def filter_by_type(records: Iterator[Dict[str, Any]], record_type: str) -> Itera yield record -def snapshot_to_jsonl(snapshot) -> Dict[str, Any]: - """ - Convert a Snapshot model instance to a JSONL record. - Wrapper that calls snapshot.to_jsonl() method. - """ - return snapshot.to_jsonl() - - -def archiveresult_to_jsonl(result) -> Dict[str, Any]: - """ - Convert an ArchiveResult model instance to a JSONL record. - Wrapper that calls result.to_jsonl() method. - """ - return result.to_jsonl() - - def tag_to_jsonl(tag) -> Dict[str, Any]: """ Convert a Tag model instance to a JSONL record. @@ -196,39 +180,6 @@ def crawl_to_jsonl(crawl) -> Dict[str, Any]: } -def binary_to_jsonl(binary) -> Dict[str, Any]: - """ - Convert a Binary model instance to a JSONL record. - Wrapper that calls binary.to_jsonl() method. - """ - return binary.to_jsonl() - - -def process_to_jsonl(process) -> Dict[str, Any]: - """ - Convert a Process model instance to a JSONL record. - Wrapper that calls process.to_jsonl() method. - """ - return process.to_jsonl() - - -def machine_to_jsonl(machine) -> Dict[str, Any]: - """ - Convert a Machine model instance to a JSONL record. - """ - # Machine.to_jsonl() not implemented yet, use inline conversion - return { - 'type': TYPE_MACHINE, - 'id': str(machine.id), - 'guid': machine.guid, - 'hostname': machine.hostname, - 'os_arch': machine.os_arch, - 'os_family': machine.os_family, - 'os_platform': machine.os_platform, - 'os_release': machine.os_release, - } - - def process_records( records: Iterator[Dict[str, Any]], handlers: Dict[str, Callable[[Dict[str, Any]], Optional[Dict[str, Any]]]]