mirror of https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-03 17:35:45 +10:00
wip
@@ -19,7 +19,7 @@ from archivebox.config.permissions import USER, HOSTNAME
 
 
 if TYPE_CHECKING:
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
 
 
 @enforce_types
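Note: the recurring change in this commit is this same one-line substitution, repeated at every call site below: bare Django app imports are replaced with absolute imports rooted at the archivebox package, so the modules resolve from any entrypoint without relying on the app directories being on sys.path. Before/after:

# Bare app import: only resolvable when archivebox's app dirs are on sys.path
# from core.models import Snapshot

# Absolute import: resolvable anywhere the archivebox package is installed
from archivebox.core.models import Snapshot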
@@ -53,8 +53,8 @@ def add(urls: str | list[str],
     assert depth in (0, 1, 2, 3, 4), 'Depth must be 0-4'
 
     # import models once django is set up
-    from core.models import Snapshot
-    from crawls.models import Crawl
+    from archivebox.core.models import Snapshot
+    from archivebox.crawls.models import Crawl
     from archivebox.base_models.models import get_or_create_system_user_pk
     from workers.orchestrator import Orchestrator
 
@@ -66,18 +66,38 @@ def config(*keys,
             raise SystemExit(1)
     else:
         matching_config = FLAT_CONFIG
 
+    # Display core config sections
     for config_section in CONFIGS.values():
         if hasattr(config_section, 'toml_section_header'):
             print(f'[grey53]\\[{config_section.toml_section_header}][/grey53]')
         else:
             print('[grey53]\\[CONSTANTS] # (read-only)[/grey53]')
 
         kv_in_section = {key: val for key, val in dict(config_section).items() if key in matching_config}
         print(benedict(kv_in_section).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n'))
         print('[grey53]################################################################[/grey53]')
 
+    # Display plugin config section
+    from archivebox.hooks import discover_plugin_configs
+
+    plugin_configs = discover_plugin_configs()
+    plugin_keys = {}
+
+    # Collect all plugin config keys
+    for plugin_name, schema in plugin_configs.items():
+        if 'properties' not in schema:
+            continue
+        for key in schema['properties'].keys():
+            if key in matching_config:
+                plugin_keys[key] = matching_config[key]
+
+    # Display all plugin config in single [PLUGINS] section
+    if plugin_keys:
+        print(f'[grey53]\\[PLUGINS][/grey53]')
+        print(benedict(plugin_keys).to_toml(encoder=CustomTOMLEncoder()).strip().replace('\n\n', '\n'))
+        print('[grey53]################################################################[/grey53]')
+
     raise SystemExit(not matching_config)
 
 elif set:
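Note: the new plugin section only depends on discover_plugin_configs() returning a mapping of plugin names to JSON-Schema-style dicts with a top-level 'properties' key. A standalone sketch of that data shape and the key-collection logic (the plugin names, keys, and values below are invented for illustration):

# Hypothetical shape of discover_plugin_configs() output; only the
# {'properties': {...}} structure is what the code above relies on.
plugin_configs = {
    'wget': {'properties': {'WGET_BINARY': {'type': 'string'}}},
    'chrome': {'properties': {'CHROME_TIMEOUT': {'type': 'integer'}}},
}
matching_config = {'WGET_BINARY': 'wget', 'CHROME_TIMEOUT': 60, 'SAVE_TITLE': True}

# Same collection logic as the hunk, condensed into a comprehension:
plugin_keys = {
    key: matching_config[key]
    for schema in plugin_configs.values() if 'properties' in schema
    for key in schema['properties'] if key in matching_config
}
assert plugin_keys == {'WGET_BINARY': 'wget', 'CHROME_TIMEOUT': 60}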
@@ -72,11 +72,11 @@ def discover_outlinks(
 
     from archivebox.misc.jsonl import (
         read_args_or_stdin, write_record,
-        TYPE_SNAPSHOT, get_or_create_snapshot
+        TYPE_SNAPSHOT
     )
     from archivebox.base_models.models import get_or_create_system_user_pk
-    from core.models import Snapshot, ArchiveResult
-    from crawls.models import Crawl
+    from archivebox.core.models import Snapshot, ArchiveResult
+    from archivebox.crawls.models import Crawl
     from archivebox.config import CONSTANTS
     from workers.orchestrator import Orchestrator
 
@@ -130,8 +130,10 @@ def discover_outlinks(
             record['crawl_id'] = str(crawl.id)
             record['depth'] = record.get('depth', 0)
 
-            snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)
-            snapshot_ids.append(str(snapshot.id))
+            overrides = {'created_by_id': created_by_id}
+            snapshot = Snapshot.from_jsonl(record, overrides=overrides)
+            if snapshot:
+                snapshot_ids.append(str(snapshot.id))
 
         except Exception as e:
             rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr)
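Note: Snapshot.from_jsonl() replaces the old get_or_create_snapshot() helper, but its body is not part of this diff. From the call sites alone we can tell it takes a JSONL record dict plus an overrides dict (used here to set created_by_id) and can return None, since every caller now guards with `if snapshot:`. A speculative stand-in inferred only from those call sites, not the actual implementation:

from typing import Optional

def snapshot_from_jsonl(record: dict, overrides: Optional[dict] = None):
    """Speculative analog of Snapshot.from_jsonl(), inferred from call sites only."""
    from archivebox.core.models import Snapshot

    url = record.get('url')
    if not url:
        return None  # invalid record: callers skip it via `if snapshot:`

    defaults = dict(overrides or {})
    if record.get('crawl_id'):
        defaults['crawl_id'] = record['crawl_id']
    snapshot, _created = Snapshot.objects.get_or_create(url=url, defaults=defaults)
    return snapshot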
@@ -162,7 +164,6 @@ def discover_outlinks(
                 defaults={
                     'status': ArchiveResult.StatusChoices.QUEUED,
                     'retry_at': timezone.now(),
-                    'created_by_id': snapshot.created_by_id,
                 }
             )
         else:
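Note: this hunk and the matching one in run_plugins() below both drop 'created_by_id' from the get_or_create(defaults=...) dict. Django only applies defaults when it actually creates the row, so existing rows were never affected; after this change, newly created ArchiveResults simply no longer copy snapshot.created_by_id at this call site (presumably ownership is now attached when the Snapshot itself is created, via the overrides passed to Snapshot.from_jsonl). A plain-dict analog of the defaults semantics, for reference (not ArchiveBox code):

def get_or_create(store: dict, key: str, defaults: dict) -> tuple[dict, bool]:
    """Plain-dict analog of Django's QuerySet.get_or_create(defaults=...)."""
    if key in store:
        return store[key], False            # existing row: defaults are ignored
    store[key] = {'id': key, **defaults}    # new row: defaults applied exactly once
    return store[key], True

rows: dict = {}
row, created = get_or_create(rows, 'abc', {'status': 'queued'})
assert created and row['status'] == 'queued'
row, created = get_or_create(rows, 'abc', {'status': 'failed'})
assert not created and row['status'] == 'queued'  # second call changed nothing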
@@ -229,7 +230,7 @@ def process_crawl_by_id(crawl_id: str) -> int:
     - Transition from started -> sealed (when all snapshots done)
     """
     from rich import print as rprint
-    from crawls.models import Crawl
+    from archivebox.crawls.models import Crawl
 
     try:
         crawl = Crawl.objects.get(id=crawl_id)
@@ -256,7 +257,7 @@ def is_crawl_id(value: str) -> bool:
     if not uuid_pattern.match(value):
         return False
     # Verify it's actually a Crawl (not a Snapshot or other object)
-    from crawls.models import Crawl
+    from archivebox.crawls.models import Crawl
     return Crawl.objects.filter(id=value).exists()
 
 
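Note: is_crawl_id() and its sibling is_archiveresult_id() below both check a module-level uuid_pattern before touching the database, which avoids a DB query for arguments that are obviously URLs or filter patterns rather than IDs. That pattern is not shown in this diff; presumably it is a standard UUID regex along these lines:

import re

# Assumed definition -- the real uuid_pattern is defined elsewhere in the file.
uuid_pattern = re.compile(
    r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$',
    re.IGNORECASE,
)

assert uuid_pattern.match('0191b2a8-1234-7abc-9def-0123456789ab')
assert not uuid_pattern.match('https://example.com')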
@@ -43,7 +43,7 @@ def process_archiveresult_by_id(archiveresult_id: str) -> int:
     Triggers the ArchiveResult's state machine tick() to run the extractor plugin.
     """
     from rich import print as rprint
-    from core.models import ArchiveResult
+    from archivebox.core.models import ArchiveResult
 
     try:
         archiveresult = ArchiveResult.objects.get(id=archiveresult_id)
@@ -95,7 +95,7 @@ def run_plugins(
         read_args_or_stdin, write_record, archiveresult_to_jsonl,
         TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
     )
-    from core.models import Snapshot, ArchiveResult
+    from archivebox.core.models import Snapshot, ArchiveResult
     from workers.orchestrator import Orchestrator
 
     is_tty = sys.stdout.isatty()
@@ -155,7 +155,6 @@ def run_plugins(
                 defaults={
                     'status': ArchiveResult.StatusChoices.QUEUED,
                     'retry_at': timezone.now(),
-                    'created_by_id': snapshot.created_by_id,
                 }
             )
             if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
@@ -218,7 +217,7 @@ def is_archiveresult_id(value: str) -> bool:
     if not uuid_pattern.match(value):
         return False
     # Verify it's actually an ArchiveResult (not a Snapshot or other object)
-    from core.models import ArchiveResult
+    from archivebox.core.models import ArchiveResult
     return ArchiveResult.objects.filter(id=value).exists()
 
 
@@ -95,7 +95,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
     print()
     print('[dodger_blue3][*] Checking links from indexes and archive folders (safe to Ctrl+C)...[/dodger_blue3]')
 
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
 
     all_links = Snapshot.objects.none()
     pending_links: dict[str, SnapshotDict] = {}
@@ -42,7 +42,7 @@ def install(dry_run: bool=False) -> None:
     setup_django()
 
     from django.utils import timezone
-    from crawls.models import Crawl
+    from archivebox.crawls.models import Crawl
     from archivebox.base_models.models import get_or_create_system_user_pk
 
     # Create a crawl for dependency detection
@@ -70,7 +70,7 @@ def install(dry_run: bool=False) -> None:
     print(f'[+] Crawl status: {crawl.status}, retry_at: {crawl.retry_at}')
 
     # Verify the crawl is in the queue
-    from crawls.models import Crawl as CrawlModel
+    from archivebox.crawls.models import Crawl as CrawlModel
     queued_crawls = CrawlModel.objects.filter(
         retry_at__lte=timezone.now()
     ).exclude(
@@ -71,7 +71,7 @@ def remove(filter_patterns: Iterable[str]=(),
     to_remove = snapshots.count()
 
     from archivebox.search import flush_search_index
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
 
     flush_search_index(snapshots=snapshots)
     snapshots.delete()
@@ -36,7 +36,7 @@ def get_snapshots(snapshots: Optional[QuerySet]=None,
                   before: Optional[float]=None,
                   out_dir: Path=DATA_DIR) -> QuerySet:
     """Filter and return Snapshots matching the given criteria."""
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
 
     if snapshots:
         result = snapshots
@@ -68,7 +68,7 @@ def search(filter_patterns: list[str] | None=None,
            csv: str | None=None,
            with_headers: bool=False):
     """List, filter, and export information about archive entries"""
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
 
     if with_headers and not (json or html or csv):
         stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')
@@ -46,7 +46,7 @@ def process_snapshot_by_id(snapshot_id: str) -> int:
     - Transition from started -> sealed (when all ArchiveResults done)
     """
     from rich import print as rprint
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
 
     try:
         snapshot = Snapshot.objects.get(id=snapshot_id)
@@ -88,11 +88,11 @@ def create_snapshots(
 
     from archivebox.misc.jsonl import (
         read_args_or_stdin, write_record, snapshot_to_jsonl,
-        TYPE_SNAPSHOT, TYPE_TAG, get_or_create_snapshot
+        TYPE_SNAPSHOT, TYPE_TAG
     )
     from archivebox.base_models.models import get_or_create_system_user_pk
-    from core.models import Snapshot
-    from crawls.models import Crawl
+    from archivebox.core.models import Snapshot
+    from archivebox.crawls.models import Crawl
     from archivebox.config import CONSTANTS
 
     created_by_id = created_by_id or get_or_create_system_user_pk()
@@ -137,8 +137,10 @@ def create_snapshots(
             record['tags'] = tag
 
         # Get or create the snapshot
-        snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)
-        created_snapshots.append(snapshot)
+        overrides = {'created_by_id': created_by_id}
+        snapshot = Snapshot.from_jsonl(record, overrides=overrides)
+        if snapshot:
+            created_snapshots.append(snapshot)
 
         # Output JSONL record (only when piped)
         if not is_tty:
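Note: after this hunk, each record flowing through create_snapshots() is a plain dict, annotated in-place before being handed to Snapshot.from_jsonl(). A sketch of the shape at that point (field names taken from this diff, values invented):

# Example record as the loop would see it; only the field names come from
# the diff, the values are made up.
record = {
    'url': 'https://example.com/page',
    'tags': 'docs,python',   # set from the tag argument a few lines up
    'crawl_id': None,        # filled in when the snapshot belongs to a crawl
    'depth': 0,
}
overrides = {'created_by_id': created_by_id}  # resolved via get_or_create_system_user_pk()
# snapshot = Snapshot.from_jsonl(record, overrides=overrides)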
@@ -21,7 +21,7 @@ def status(out_dir: Path=DATA_DIR) -> None:
 
     from django.contrib.auth import get_user_model
     from archivebox.misc.db import get_admins
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
     User = get_user_model()
 
     print('[green]\\[*] Scanning archive main index...[/green]')
@@ -36,7 +36,7 @@ def update(filter_patterns: Iterable[str] = (),
     from archivebox.config.django import setup_django
     setup_django()
 
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
     from django.utils import timezone
 
     while True:
@@ -83,7 +83,7 @@ def import_orphans_from_archive(resume_from: str = None, batch_size: int = 100)
     Skip symlinks (already migrated).
     Create DB records and trigger migration on save().
     """
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
     from archivebox.config import CONSTANTS
     from django.db import transaction
 
@@ -151,7 +151,7 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict:
     Process all snapshots in DB.
     Reconcile index.json and queue for archiving.
     """
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
     from django.db import transaction
     from django.utils import timezone
 
@@ -189,7 +189,7 @@ def process_filtered_snapshots(
     batch_size: int
 ) -> dict:
     """Process snapshots matching filters (DB query only)."""
-    from core.models import Snapshot
+    from archivebox.core.models import Snapshot
     from django.db import transaction
     from django.utils import timezone
     from datetime import datetime
@@ -107,7 +107,7 @@ def version(quiet: bool=False,
     from archivebox.config.django import setup_django
     setup_django()
 
-    from machine.models import Machine, Binary
+    from archivebox.machine.models import Machine, Binary
 
     machine = Machine.current()
 
@@ -542,10 +542,10 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
         Test: archivebox snapshot URL
         Should create a Snapshot and output JSONL when piped.
         """
-        from core.models import Snapshot
+        from archivebox.core.models import Snapshot
         from archivebox.misc.jsonl import (
             read_args_or_stdin, write_record, snapshot_to_jsonl,
-            TYPE_SNAPSHOT, get_or_create_snapshot
+            TYPE_SNAPSHOT
         )
         from archivebox.base_models.models import get_or_create_system_user_pk
 
@@ -559,7 +559,8 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
         self.assertEqual(records[0]['url'], url)
 
         # Create snapshot
-        snapshot = get_or_create_snapshot(records[0], created_by_id=created_by_id)
+        overrides = {'created_by_id': created_by_id}
+        snapshot = Snapshot.from_jsonl(records[0], overrides=overrides)
 
         self.assertIsNotNone(snapshot.id)
         self.assertEqual(snapshot.url, url)
@@ -575,9 +576,9 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
         Test: archivebox snapshot URL | archivebox extract
         Extract should accept JSONL output from snapshot command.
         """
-        from core.models import Snapshot, ArchiveResult
+        from archivebox.core.models import Snapshot, ArchiveResult
         from archivebox.misc.jsonl import (
-            snapshot_to_jsonl, read_args_or_stdin, get_or_create_snapshot,
+            snapshot_to_jsonl, read_args_or_stdin,
             TYPE_SNAPSHOT
         )
         from archivebox.base_models.models import get_or_create_system_user_pk
@@ -586,7 +587,8 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
 
         # Step 1: Create snapshot (simulating 'archivebox snapshot')
         url = 'https://test-extract-1.example.com'
-        snapshot = get_or_create_snapshot({'url': url}, created_by_id=created_by_id)
+        overrides = {'created_by_id': created_by_id}
+        snapshot = Snapshot.from_jsonl({'url': url}, overrides=overrides)
         snapshot_output = snapshot_to_jsonl(snapshot)
 
         # Step 2: Parse snapshot output as extract input
@@ -648,7 +650,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
 
         This is equivalent to: archivebox add URL
         """
-        from core.models import Snapshot
+        from archivebox.core.models import Snapshot
         from archivebox.misc.jsonl import (
             get_or_create_snapshot, snapshot_to_jsonl, read_args_or_stdin,
             TYPE_SNAPSHOT
@@ -682,7 +684,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase):
 
         This is equivalent to: archivebox add --depth=1 URL
         """
-        from core.models import Snapshot
+        from archivebox.core.models import Snapshot
         from archivebox.misc.jsonl import (
             get_or_create_snapshot, snapshot_to_jsonl, read_args_or_stdin,
             TYPE_SNAPSHOT
@@ -772,7 +774,7 @@ class TestDepthWorkflows(unittest.TestCase):
 
         Depth 0: Only archive the specified URL, no crawling.
         """
-        from core.models import Snapshot
+        from archivebox.core.models import Snapshot
         from archivebox.misc.jsonl import get_or_create_snapshot
         from archivebox.base_models.models import get_or_create_system_user_pk
 