type and test fixes

This commit is contained in:
Nick Sweeting
2026-03-15 20:12:27 -07:00
parent 3889eb4efa
commit bc21d4bfdb
52 changed files with 762 additions and 1317 deletions

View File

@@ -8,7 +8,6 @@ from pathlib import Path
from typing import TYPE_CHECKING, Callable
import rich_click as click
from rich import print
from django.db.models import Q, QuerySet
@@ -212,7 +211,11 @@ def search(filter_patterns: list[str] | None=None,
folders: dict[str, Snapshot | None] = {snapshot.output_dir: snapshot for snapshot in snapshots}
output = printable_folders(folders, with_headers)
print(output)
# Structured exports must be written directly to stdout.
# rich.print() reflows long lines to console width, which corrupts JSON/CSV/HTML output.
sys.stdout.write(output)
if not output.endswith('\n'):
sys.stdout.write('\n')
return output

View File

@@ -29,6 +29,7 @@ import tempfile
import unittest
from io import StringIO
from pathlib import Path
from typing import TypeVar
# Test configuration - disable slow extractors
TEST_CONFIG = {
@@ -58,6 +59,14 @@ TEST_CONFIG = {
os.environ.update(TEST_CONFIG)
T = TypeVar('T')
def require(value: T | None) -> T:
if value is None:
raise AssertionError('Expected value to be present')
return value
# =============================================================================
# JSONL Utility Tests
@@ -70,8 +79,7 @@ class TestJSONLParsing(unittest.TestCase):
"""Plain URLs should be parsed as Snapshot records."""
from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
result = parse_line('https://example.com')
self.assertIsNotNone(result)
result = require(parse_line('https://example.com'))
self.assertEqual(result['type'], TYPE_SNAPSHOT)
self.assertEqual(result['url'], 'https://example.com')
@@ -80,8 +88,7 @@ class TestJSONLParsing(unittest.TestCase):
from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
line = '{"type": "Snapshot", "url": "https://example.com", "tags": "test,demo"}'
result = parse_line(line)
self.assertIsNotNone(result)
result = require(parse_line(line))
self.assertEqual(result['type'], TYPE_SNAPSHOT)
self.assertEqual(result['url'], 'https://example.com')
self.assertEqual(result['tags'], 'test,demo')
@@ -91,8 +98,7 @@ class TestJSONLParsing(unittest.TestCase):
from archivebox.misc.jsonl import parse_line, TYPE_CRAWL
line = '{"type": "Crawl", "id": "abc123", "urls": "https://example.com", "max_depth": 1}'
result = parse_line(line)
self.assertIsNotNone(result)
result = require(parse_line(line))
self.assertEqual(result['type'], TYPE_CRAWL)
self.assertEqual(result['id'], 'abc123')
self.assertEqual(result['urls'], 'https://example.com')
@@ -103,8 +109,7 @@ class TestJSONLParsing(unittest.TestCase):
from archivebox.misc.jsonl import parse_line
line = '{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}'
result = parse_line(line)
self.assertIsNotNone(result)
result = require(parse_line(line))
self.assertEqual(result['id'], 'abc123')
self.assertEqual(result['url'], 'https://example.com')
@@ -113,8 +118,7 @@ class TestJSONLParsing(unittest.TestCase):
from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
uuid = '01234567-89ab-cdef-0123-456789abcdef'
result = parse_line(uuid)
self.assertIsNotNone(result)
result = require(parse_line(uuid))
self.assertEqual(result['type'], TYPE_SNAPSHOT)
self.assertEqual(result['id'], uuid)
@@ -144,8 +148,7 @@ class TestJSONLParsing(unittest.TestCase):
"""file:// URLs should be parsed."""
from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
result = parse_line('file:///path/to/file.txt')
self.assertIsNotNone(result)
result = require(parse_line('file:///path/to/file.txt'))
self.assertEqual(result['type'], TYPE_SNAPSHOT)
self.assertEqual(result['url'], 'file:///path/to/file.txt')
@@ -501,9 +504,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
# Create crawl with multiple URLs (as newline-separated string)
urls = 'https://test-crawl-1.example.com\nhttps://test-crawl-2.example.com'
crawl = Crawl.from_json({'urls': urls}, overrides={'created_by_id': created_by_id})
self.assertIsNotNone(crawl)
crawl = require(Crawl.from_json({'urls': urls}, overrides={'created_by_id': created_by_id}))
self.assertIsNotNone(crawl.id)
self.assertEqual(crawl.urls, urls)
self.assertEqual(crawl.status, 'queued')
@@ -538,7 +539,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
# Step 1: Create crawl (simulating 'archivebox crawl')
urls = 'https://crawl-to-snap-1.example.com\nhttps://crawl-to-snap-2.example.com'
crawl = Crawl.from_json({'urls': urls}, overrides={'created_by_id': created_by_id})
crawl = require(Crawl.from_json({'urls': urls}, overrides={'created_by_id': created_by_id}))
crawl_output = crawl.to_json()
# Step 2: Parse crawl output as snapshot input
@@ -590,7 +591,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
# Create snapshot
overrides = {'created_by_id': created_by_id}
snapshot = Snapshot.from_json(records[0], overrides=overrides)
snapshot = require(Snapshot.from_json(records[0], overrides=overrides))
self.assertIsNotNone(snapshot.id)
self.assertEqual(snapshot.url, url)
@@ -618,7 +619,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
# Step 1: Create snapshot (simulating 'archivebox snapshot')
url = 'https://test-extract-1.example.com'
overrides = {'created_by_id': created_by_id}
snapshot = Snapshot.from_json({'url': url}, overrides=overrides)
snapshot = require(Snapshot.from_json({'url': url}, overrides=overrides))
snapshot_output = snapshot.to_json()
# Step 2: Parse snapshot output as extract input
@@ -657,7 +658,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
# === archivebox crawl https://example.com ===
url = 'https://test-pipeline-full.example.com'
crawl = Crawl.from_json({'url': url}, overrides={'created_by_id': created_by_id})
crawl = require(Crawl.from_json({'url': url}, overrides={'created_by_id': created_by_id}))
crawl_jsonl = json.dumps(crawl.to_json())
# === | archivebox snapshot ===
@@ -728,12 +729,12 @@ class TestDepthWorkflows(unittest.TestCase):
# Create crawl with depth 0
url = 'https://depth0-test.example.com'
crawl = Crawl.from_json({'url': url, 'max_depth': 0}, overrides={'created_by_id': created_by_id})
crawl = require(Crawl.from_json({'url': url, 'max_depth': 0}, overrides={'created_by_id': created_by_id}))
self.assertEqual(crawl.max_depth, 0)
# Create snapshot
snapshot = Snapshot.from_json({'url': url}, overrides={'created_by_id': created_by_id})
snapshot = require(Snapshot.from_json({'url': url}, overrides={'created_by_id': created_by_id}))
self.assertEqual(snapshot.url, url)
def test_depth_metadata_in_crawl(self):
@@ -744,10 +745,10 @@ class TestDepthWorkflows(unittest.TestCase):
created_by_id = get_or_create_system_user_pk()
# Create crawl with depth
crawl = Crawl.from_json(
crawl = require(Crawl.from_json(
{'url': 'https://depth-meta-test.example.com', 'max_depth': 2},
overrides={'created_by_id': created_by_id}
)
))
self.assertEqual(crawl.max_depth, 2)