refactor: move remaining JSONL methods to models, clean up jsonl.py

- Add Tag.to_jsonl() method with schema_version
- Add Crawl.to_jsonl() method with schema_version
- Fix Tag.from_jsonl() to not depend on jsonl.py helper
- Update tests to use Snapshot.from_jsonl() instead of non-existent get_or_create_snapshot

Remove model-specific functions from misc/jsonl.py:
- tag_to_jsonl() - use Tag.to_jsonl() instead
- crawl_to_jsonl() - use Crawl.to_jsonl() instead
- get_or_create_tag() - use Tag.from_jsonl() instead
- process_jsonl_records() - use model from_jsonl() methods directly

jsonl.py now only contains generic I/O utilities:
- Type constants (TYPE_SNAPSHOT, etc.)
- parse_line(), read_stdin(), read_file(), read_args_or_stdin()
- write_record(), write_records()
- filter_by_type(), process_records()
This commit is contained in:
Claude
2025-12-30 19:30:18 +00:00
parent bc273c5a7f
commit ae648c9bc1
4 changed files with 45 additions and 102 deletions

View File

@@ -91,6 +91,19 @@ class Tag(ModelWithSerializers):
def api_url(self) -> str:
    """
    Return the URL of the 'api-1:get_tag' route for this tag.

    The route is resolved by passing this tag's id as the single
    positional argument to reverse_lazy().
    """
    # NOTE(review): reverse_lazy is presumably django.urls.reverse_lazy, which
    # yields a lazy object rather than a plain str -- confirm the annotation.
    route_args = [self.id]
    return reverse_lazy('api-1:get_tag', args=route_args)
def to_jsonl(self) -> dict:
    """
    Serialize this Tag into a JSONL record.

    Returns:
        A dict with the keys 'type' (always 'Tag'), 'schema_version'
        (the value of archivebox.config.VERSION), 'id' (the tag id
        converted with str()), 'name' and 'slug'.
    """
    # Function-scope import; presumably this avoids an import cycle with
    # archivebox.config at module load time -- TODO confirm.
    from archivebox.config import VERSION

    # Build the record step by step, keeping the key order stable.
    record = {'type': 'Tag', 'schema_version': VERSION}
    record['id'] = str(self.id)
    record['name'] = self.name
    record['slug'] = self.slug
    return record
@staticmethod
def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None):
"""
@@ -103,19 +116,18 @@ class Tag(ModelWithSerializers):
Returns:
Tag instance or None
"""
from archivebox.misc.jsonl import get_or_create_tag
try:
tag = get_or_create_tag(record)
# Auto-attach to snapshot if in overrides
if overrides and 'snapshot' in overrides and tag:
overrides['snapshot'].tags.add(tag)
return tag
except ValueError:
name = record.get('name')
if not name:
return None
tag, _ = Tag.objects.get_or_create(name=name)
# Auto-attach to snapshot if in overrides
if overrides and 'snapshot' in overrides and tag:
overrides['snapshot'].tags.add(tag)
return tag
class SnapshotTag(models.Model):
id = models.AutoField(primary_key=True)