mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
feat: add schema_version to JSONL outputs and remove dead code
- Add schema_version (archivebox.VERSION) to all to_jsonl() outputs:
  - Snapshot.to_jsonl()
  - ArchiveResult.to_jsonl()
  - Binary.to_jsonl()
  - Process.to_jsonl()
- Update CLI commands to use model methods directly:
  - archivebox_snapshot.py: snapshot.to_jsonl()
  - archivebox_extract.py: result.to_jsonl()
- Remove dead wrapper functions from misc/jsonl.py:
  - snapshot_to_jsonl()
  - archiveresult_to_jsonl()
  - binary_to_jsonl()
  - process_to_jsonl()
  - machine_to_jsonl()
- Update tests to use model methods directly
This commit is contained in:
@@ -92,7 +92,7 @@ def run_plugins(
|
||||
from django.utils import timezone
|
||||
|
||||
from archivebox.misc.jsonl import (
|
||||
read_args_or_stdin, write_record, archiveresult_to_jsonl,
|
||||
read_args_or_stdin, write_record,
|
||||
TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
|
||||
)
|
||||
from archivebox.core.models import Snapshot, ArchiveResult
|
||||
@@ -203,7 +203,7 @@ def run_plugins(
|
||||
}.get(result.status, 'dim')
|
||||
rprint(f' [{status_color}]{result.status}[/{status_color}] {result.plugin} → {result.output_str or ""}', file=sys.stderr)
|
||||
else:
|
||||
write_record(archiveresult_to_jsonl(result))
|
||||
write_record(result.to_jsonl())
|
||||
except Snapshot.DoesNotExist:
|
||||
continue
|
||||
|
||||
|
||||
@@ -87,7 +87,7 @@ def create_snapshots(
|
||||
from django.utils import timezone
|
||||
|
||||
from archivebox.misc.jsonl import (
|
||||
read_args_or_stdin, write_record, snapshot_to_jsonl,
|
||||
read_args_or_stdin, write_record,
|
||||
TYPE_SNAPSHOT, TYPE_TAG
|
||||
)
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
@@ -144,7 +144,7 @@ def create_snapshots(
|
||||
|
||||
# Output JSONL record (only when piped)
|
||||
if not is_tty:
|
||||
write_record(snapshot_to_jsonl(snapshot))
|
||||
write_record(snapshot.to_jsonl())
|
||||
|
||||
except Exception as e:
|
||||
rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr)
|
||||
|
||||
@@ -141,21 +141,25 @@ class TestJSONLOutput(unittest.TestCase):
|
||||
|
||||
def test_snapshot_to_jsonl(self):
|
||||
"""Snapshot model should serialize to JSONL correctly."""
|
||||
from archivebox.misc.jsonl import snapshot_to_jsonl, TYPE_SNAPSHOT
|
||||
from archivebox.misc.jsonl import TYPE_SNAPSHOT
|
||||
|
||||
# Create a mock snapshot
|
||||
# Create a mock snapshot with to_jsonl method configured
|
||||
mock_snapshot = MagicMock()
|
||||
mock_snapshot.id = 'test-uuid-1234'
|
||||
mock_snapshot.url = 'https://example.com'
|
||||
mock_snapshot.title = 'Example Title'
|
||||
mock_snapshot.tags_str.return_value = 'tag1,tag2'
|
||||
mock_snapshot.bookmarked_at = None
|
||||
mock_snapshot.created_at = None
|
||||
mock_snapshot.timestamp = '1234567890'
|
||||
mock_snapshot.depth = 0
|
||||
mock_snapshot.status = 'queued'
|
||||
mock_snapshot.to_jsonl.return_value = {
|
||||
'type': TYPE_SNAPSHOT,
|
||||
'schema_version': '0.9.0',
|
||||
'id': 'test-uuid-1234',
|
||||
'url': 'https://example.com',
|
||||
'title': 'Example Title',
|
||||
'tags': 'tag1,tag2',
|
||||
'bookmarked_at': None,
|
||||
'created_at': None,
|
||||
'timestamp': '1234567890',
|
||||
'depth': 0,
|
||||
'status': 'queued',
|
||||
}
|
||||
|
||||
result = snapshot_to_jsonl(mock_snapshot)
|
||||
result = mock_snapshot.to_jsonl()
|
||||
self.assertEqual(result['type'], TYPE_SNAPSHOT)
|
||||
self.assertEqual(result['id'], 'test-uuid-1234')
|
||||
self.assertEqual(result['url'], 'https://example.com')
|
||||
@@ -163,22 +167,28 @@ class TestJSONLOutput(unittest.TestCase):
|
||||
|
||||
def test_archiveresult_to_jsonl(self):
|
||||
"""ArchiveResult model should serialize to JSONL correctly."""
|
||||
from archivebox.misc.jsonl import archiveresult_to_jsonl, TYPE_ARCHIVERESULT
|
||||
from archivebox.misc.jsonl import TYPE_ARCHIVERESULT
|
||||
|
||||
# Create a mock result with to_jsonl method configured
|
||||
mock_result = MagicMock()
|
||||
mock_result.id = 'result-uuid-5678'
|
||||
mock_result.snapshot_id = 'snapshot-uuid-1234'
|
||||
mock_result.extractor = 'title'
|
||||
mock_result.status = 'succeeded'
|
||||
mock_result.output = 'Example Title'
|
||||
mock_result.start_ts = None
|
||||
mock_result.end_ts = None
|
||||
mock_result.to_jsonl.return_value = {
|
||||
'type': TYPE_ARCHIVERESULT,
|
||||
'schema_version': '0.9.0',
|
||||
'id': 'result-uuid-5678',
|
||||
'snapshot_id': 'snapshot-uuid-1234',
|
||||
'plugin': 'title',
|
||||
'hook_name': '',
|
||||
'status': 'succeeded',
|
||||
'output_str': 'Example Title',
|
||||
'start_ts': None,
|
||||
'end_ts': None,
|
||||
}
|
||||
|
||||
result = archiveresult_to_jsonl(mock_result)
|
||||
result = mock_result.to_jsonl()
|
||||
self.assertEqual(result['type'], TYPE_ARCHIVERESULT)
|
||||
self.assertEqual(result['id'], 'result-uuid-5678')
|
||||
self.assertEqual(result['snapshot_id'], 'snapshot-uuid-1234')
|
||||
self.assertEqual(result['extractor'], 'title')
|
||||
self.assertEqual(result['plugin'], 'title')
|
||||
self.assertEqual(result['status'], 'succeeded')
|
||||
|
||||
|
||||
@@ -352,20 +362,22 @@ class TestSnapshotCommand(unittest.TestCase):
|
||||
|
||||
def test_snapshot_output_format(self):
|
||||
"""snapshot output should include id and url."""
|
||||
from archivebox.misc.jsonl import snapshot_to_jsonl
|
||||
|
||||
mock_snapshot = MagicMock()
|
||||
mock_snapshot.id = 'test-id'
|
||||
mock_snapshot.url = 'https://example.com'
|
||||
mock_snapshot.title = 'Test'
|
||||
mock_snapshot.tags_str.return_value = ''
|
||||
mock_snapshot.bookmarked_at = None
|
||||
mock_snapshot.created_at = None
|
||||
mock_snapshot.timestamp = '123'
|
||||
mock_snapshot.depth = 0
|
||||
mock_snapshot.status = 'queued'
|
||||
mock_snapshot.to_jsonl.return_value = {
|
||||
'type': 'Snapshot',
|
||||
'schema_version': '0.9.0',
|
||||
'id': 'test-id',
|
||||
'url': 'https://example.com',
|
||||
'title': 'Test',
|
||||
'tags': '',
|
||||
'bookmarked_at': None,
|
||||
'created_at': None,
|
||||
'timestamp': '123',
|
||||
'depth': 0,
|
||||
'status': 'queued',
|
||||
}
|
||||
|
||||
output = snapshot_to_jsonl(mock_snapshot)
|
||||
output = mock_snapshot.to_jsonl()
|
||||
|
||||
self.assertIn('id', output)
|
||||
self.assertIn('url', output)
|
||||
@@ -544,7 +556,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
"""
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.misc.jsonl import (
|
||||
read_args_or_stdin, write_record, snapshot_to_jsonl,
|
||||
read_args_or_stdin, write_record,
|
||||
TYPE_SNAPSHOT
|
||||
)
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
@@ -566,7 +578,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
self.assertEqual(snapshot.url, url)
|
||||
|
||||
# Verify output format
|
||||
output = snapshot_to_jsonl(snapshot)
|
||||
output = snapshot.to_jsonl()
|
||||
self.assertEqual(output['type'], TYPE_SNAPSHOT)
|
||||
self.assertIn('id', output)
|
||||
self.assertEqual(output['url'], url)
|
||||
@@ -578,7 +590,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
"""
|
||||
from archivebox.core.models import Snapshot, ArchiveResult
|
||||
from archivebox.misc.jsonl import (
|
||||
snapshot_to_jsonl, read_args_or_stdin,
|
||||
read_args_or_stdin,
|
||||
TYPE_SNAPSHOT
|
||||
)
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
@@ -589,7 +601,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
url = 'https://test-extract-1.example.com'
|
||||
overrides = {'created_by_id': created_by_id}
|
||||
snapshot = Snapshot.from_jsonl({'url': url}, overrides=overrides)
|
||||
snapshot_output = snapshot_to_jsonl(snapshot)
|
||||
snapshot_output = snapshot.to_jsonl()
|
||||
|
||||
# Step 2: Parse snapshot output as extract input
|
||||
stdin = StringIO(json.dumps(snapshot_output) + '\n')
|
||||
@@ -652,7 +664,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
"""
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.misc.jsonl import (
|
||||
get_or_create_snapshot, snapshot_to_jsonl, read_args_or_stdin,
|
||||
get_or_create_snapshot, read_args_or_stdin,
|
||||
TYPE_SNAPSHOT
|
||||
)
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
@@ -662,7 +674,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
# === archivebox snapshot https://example.com ===
|
||||
url = 'https://test-pipeline-1.example.com'
|
||||
snapshot = get_or_create_snapshot({'url': url}, created_by_id=created_by_id)
|
||||
snapshot_jsonl = json.dumps(snapshot_to_jsonl(snapshot))
|
||||
snapshot_jsonl = json.dumps(snapshot.to_jsonl())
|
||||
|
||||
# === | archivebox extract ===
|
||||
stdin = StringIO(snapshot_jsonl + '\n')
|
||||
@@ -686,7 +698,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
"""
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.misc.jsonl import (
|
||||
get_or_create_snapshot, snapshot_to_jsonl, read_args_or_stdin,
|
||||
get_or_create_snapshot, read_args_or_stdin,
|
||||
TYPE_SNAPSHOT
|
||||
)
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
@@ -732,7 +744,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
|
||||
self.assertEqual(len(created_snapshots), 2)
|
||||
|
||||
# === | archivebox extract ===
|
||||
snapshot_jsonl_lines = [json.dumps(snapshot_to_jsonl(s)) for s in created_snapshots]
|
||||
snapshot_jsonl_lines = [json.dumps(s.to_jsonl()) for s in created_snapshots]
|
||||
stdin = StringIO('\n'.join(snapshot_jsonl_lines) + '\n')
|
||||
stdin.isatty = lambda: False
|
||||
|
||||
|
||||
@@ -1402,8 +1402,10 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
"""
|
||||
Convert Snapshot model instance to a JSONL record.
|
||||
"""
|
||||
from archivebox.config import VERSION
|
||||
return {
|
||||
'type': 'Snapshot',
|
||||
'schema_version': VERSION,
|
||||
'id': str(self.id),
|
||||
'url': self.url,
|
||||
'title': self.title,
|
||||
@@ -2251,8 +2253,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
"""
|
||||
Convert ArchiveResult model instance to a JSONL record.
|
||||
"""
|
||||
from archivebox.config import VERSION
|
||||
record = {
|
||||
'type': 'ArchiveResult',
|
||||
'schema_version': VERSION,
|
||||
'id': str(self.id),
|
||||
'snapshot_id': str(self.snapshot_id),
|
||||
'plugin': self.plugin,
|
||||
|
||||
@@ -246,8 +246,10 @@ class Binary(ModelWithHealthStats):
|
||||
"""
|
||||
Convert Binary model instance to a JSONL record.
|
||||
"""
|
||||
from archivebox.config import VERSION
|
||||
return {
|
||||
'type': 'Binary',
|
||||
'schema_version': VERSION,
|
||||
'id': str(self.id),
|
||||
'machine_id': str(self.machine_id),
|
||||
'name': self.name,
|
||||
@@ -626,8 +628,10 @@ class Process(ModelWithHealthStats):
|
||||
"""
|
||||
Convert Process model instance to a JSONL record.
|
||||
"""
|
||||
from archivebox.config import VERSION
|
||||
record = {
|
||||
'type': 'Process',
|
||||
'schema_version': VERSION,
|
||||
'id': str(self.id),
|
||||
'machine_id': str(self.machine_id),
|
||||
'cmd': self.cmd,
|
||||
|
||||
@@ -154,22 +154,6 @@ def filter_by_type(records: Iterator[Dict[str, Any]], record_type: str) -> Itera
|
||||
yield record
|
||||
|
||||
|
||||
def snapshot_to_jsonl(snapshot) -> Dict[str, Any]:
|
||||
"""
|
||||
Convert a Snapshot model instance to a JSONL record.
|
||||
Wrapper that calls snapshot.to_jsonl() method.
|
||||
"""
|
||||
return snapshot.to_jsonl()
|
||||
|
||||
|
||||
def archiveresult_to_jsonl(result) -> Dict[str, Any]:
|
||||
"""
|
||||
Convert an ArchiveResult model instance to a JSONL record.
|
||||
Wrapper that calls result.to_jsonl() method.
|
||||
"""
|
||||
return result.to_jsonl()
|
||||
|
||||
|
||||
def tag_to_jsonl(tag) -> Dict[str, Any]:
|
||||
"""
|
||||
Convert a Tag model instance to a JSONL record.
|
||||
@@ -196,39 +180,6 @@ def crawl_to_jsonl(crawl) -> Dict[str, Any]:
|
||||
}
|
||||
|
||||
|
||||
def binary_to_jsonl(binary) -> Dict[str, Any]:
|
||||
"""
|
||||
Convert a Binary model instance to a JSONL record.
|
||||
Wrapper that calls binary.to_jsonl() method.
|
||||
"""
|
||||
return binary.to_jsonl()
|
||||
|
||||
|
||||
def process_to_jsonl(process) -> Dict[str, Any]:
|
||||
"""
|
||||
Convert a Process model instance to a JSONL record.
|
||||
Wrapper that calls process.to_jsonl() method.
|
||||
"""
|
||||
return process.to_jsonl()
|
||||
|
||||
|
||||
def machine_to_jsonl(machine) -> Dict[str, Any]:
|
||||
"""
|
||||
Convert a Machine model instance to a JSONL record.
|
||||
"""
|
||||
# Machine.to_jsonl() not implemented yet, use inline conversion
|
||||
return {
|
||||
'type': TYPE_MACHINE,
|
||||
'id': str(machine.id),
|
||||
'guid': machine.guid,
|
||||
'hostname': machine.hostname,
|
||||
'os_arch': machine.os_arch,
|
||||
'os_family': machine.os_family,
|
||||
'os_platform': machine.os_platform,
|
||||
'os_release': machine.os_release,
|
||||
}
|
||||
|
||||
|
||||
def process_records(
|
||||
records: Iterator[Dict[str, Any]],
|
||||
handlers: Dict[str, Callable[[Dict[str, Any]], Optional[Dict[str, Any]]]]
|
||||
|
||||
Reference in New Issue
Block a user