diff --git a/archivebox/api/migrations/0002_alter_outboundwebhook_options_and_more.py b/archivebox/api/migrations/0002_alter_outboundwebhook_options_and_more.py
new file mode 100644
index 00000000..e2770792
--- /dev/null
+++ b/archivebox/api/migrations/0002_alter_outboundwebhook_options_and_more.py
@@ -0,0 +1,113 @@
+# Generated by Django 6.0 on 2025-12-25 09:34
+
+import django.utils.timezone
+import signal_webhooks.fields
+import signal_webhooks.utils
+import uuid
+from django.conf import settings
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('api', '0001_squashed'),
+ migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+ ]
+
+ operations = [
+ migrations.AlterModelOptions(
+ name='outboundwebhook',
+ options={'verbose_name': 'API Outbound Webhook'},
+ ),
+ migrations.AddField(
+ model_name='outboundwebhook',
+ name='created',
+ field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now, help_text='When the webhook was created.', verbose_name='created'),
+ preserve_default=False,
+ ),
+ migrations.AddField(
+ model_name='outboundwebhook',
+ name='updated',
+ field=models.DateTimeField(auto_now=True, help_text='When the webhook was last updated.', verbose_name='updated'),
+ ),
+ migrations.AlterField(
+ model_name='apitoken',
+ name='created_at',
+ field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
+ ),
+ migrations.AlterField(
+ model_name='apitoken',
+ name='id',
+ field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+ ),
+ migrations.AlterField(
+ model_name='outboundwebhook',
+ name='auth_token',
+ field=signal_webhooks.fields.TokenField(blank=True, default='', help_text='Authentication token to use in an Authorization header.', max_length=8000, validators=[signal_webhooks.utils.decode_cipher_key], verbose_name='authentication token'),
+ ),
+ migrations.AlterField(
+ model_name='outboundwebhook',
+ name='created_at',
+ field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
+ ),
+ migrations.AlterField(
+ model_name='outboundwebhook',
+ name='enabled',
+ field=models.BooleanField(default=True, help_text='Is this webhook enabled?', verbose_name='enabled'),
+ ),
+ migrations.AlterField(
+ model_name='outboundwebhook',
+ name='endpoint',
+ field=models.URLField(help_text='Target endpoint for this webhook.', max_length=2047, verbose_name='endpoint'),
+ ),
+ migrations.AlterField(
+ model_name='outboundwebhook',
+ name='headers',
+ field=models.JSONField(blank=True, default=dict, help_text='Headers to send with the webhook request.', validators=[signal_webhooks.utils.is_dict], verbose_name='headers'),
+ ),
+ migrations.AlterField(
+ model_name='outboundwebhook',
+ name='id',
+ field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+ ),
+ migrations.AlterField(
+ model_name='outboundwebhook',
+ name='keep_last_response',
+ field=models.BooleanField(default=False, help_text='Should the webhook keep a log of the latest response it got?', verbose_name='keep last response'),
+ ),
+ migrations.AlterField(
+ model_name='outboundwebhook',
+ name='last_failure',
+ field=models.DateTimeField(default=None, help_text='When the webhook last failed.', null=True, verbose_name='last failure'),
+ ),
+ migrations.AlterField(
+ model_name='outboundwebhook',
+ name='last_response',
+ field=models.CharField(blank=True, default='', help_text='Latest response to this webhook.', max_length=8000, verbose_name='last response'),
+ ),
+ migrations.AlterField(
+ model_name='outboundwebhook',
+ name='last_success',
+ field=models.DateTimeField(default=None, help_text='When the webhook last succeeded.', null=True, verbose_name='last success'),
+ ),
+ migrations.AlterField(
+ model_name='outboundwebhook',
+ name='name',
+ field=models.CharField(db_index=True, help_text='Webhook name.', max_length=255, unique=True, verbose_name='name'),
+ ),
+ migrations.AlterField(
+ model_name='outboundwebhook',
+ name='ref',
+ field=models.CharField(db_index=True, help_text='Dot import notation to the model the webhook is for.', max_length=1023, validators=[signal_webhooks.utils.model_from_reference], verbose_name='referenced model'),
+ ),
+ migrations.AlterField(
+ model_name='outboundwebhook',
+ name='signal',
+ field=models.CharField(choices=[('CREATE', 'Create'), ('UPDATE', 'Update'), ('DELETE', 'Delete'), ('M2M', 'M2M changed'), ('CREATE_OR_UPDATE', 'Create or Update'), ('CREATE_OR_DELETE', 'Create or Delete'), ('CREATE_OR_M2M', 'Create or M2M changed'), ('UPDATE_OR_DELETE', 'Update or Delete'), ('UPDATE_OR_M2M', 'Update or M2M changed'), ('DELETE_OR_M2M', 'Delete or M2M changed'), ('CREATE_UPDATE_OR_DELETE', 'Create, Update or Delete'), ('CREATE_UPDATE_OR_M2M', 'Create, Update or M2M changed'), ('CREATE_DELETE_OR_M2M', 'Create, Delete or M2M changed'), ('UPDATE_DELETE_OR_M2M', 'Update, Delete or M2M changed'), ('CREATE_UPDATE_DELETE_OR_M2M', 'Create, Update or Delete, or M2M changed')], help_text='Signal the webhook fires to.', max_length=255, verbose_name='signal'),
+ ),
+ migrations.AddConstraint(
+ model_name='outboundwebhook',
+ constraint=models.UniqueConstraint(fields=('ref', 'endpoint'), name='prevent_duplicate_hooks_api_outboundwebhook'),
+ ),
+ ]
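Note on the migration above: the interesting behavioural change is the new UniqueConstraint on (ref, endpoint) for OutboundWebhook. A minimal sketch of its effect, assuming the model is importable as `api.models.OutboundWebhook` (the import path is not shown in this diff) and Django is already set up:

```python
from django.db import IntegrityError, transaction
from api.models import OutboundWebhook  # assumed import path, based on the 'api' app label above

def demo_duplicate_hook_rejection():
    OutboundWebhook.objects.create(
        name='notify-on-snapshot',
        ref='core.models.Snapshot',                       # hypothetical referenced model
        endpoint='https://example.com/hooks/archivebox',  # hypothetical endpoint
        signal='CREATE',
    )
    try:
        with transaction.atomic():
            OutboundWebhook.objects.create(
                name='notify-on-snapshot-copy',
                ref='core.models.Snapshot',
                endpoint='https://example.com/hooks/archivebox',
                signal='CREATE',
            )
    except IntegrityError:
        # rejected by the 'prevent_duplicate_hooks_api_outboundwebhook' constraint added above
        print('duplicate (ref, endpoint) pair rejected')
```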
diff --git a/archivebox/api/v1_core.py b/archivebox/api/v1_core.py
index 4e1c3f25..31235e68 100644
--- a/archivebox/api/v1_core.py
+++ b/archivebox/api/v1_core.py
@@ -15,7 +15,7 @@ from ninja.pagination import paginate, PaginationBase
from ninja.errors import HttpError
from core.models import Snapshot, ArchiveResult, Tag
-from api.v1_crawls import CrawlSchema, SeedSchema
+from api.v1_crawls import CrawlSchema
router = Router(tags=['Core Models'])
@@ -271,9 +271,9 @@ def get_tag(request, tag_id: str, with_snapshots: bool = True):
return Tag.objects.get(slug__icontains=tag_id)
-@router.get("/any/{id}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, SeedSchema, CrawlSchema], url_name="get_any", summary="Get any object by its ID")
+@router.get("/any/{id}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, CrawlSchema], url_name="get_any", summary="Get any object by its ID")
def get_any(request, id: str):
- """Get any object by its ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.)."""
+ """Get any object by its ID (e.g. snapshot, archiveresult, tag, crawl, etc.)."""
request.with_snapshots = False
request.with_archiveresults = False
@@ -285,14 +285,6 @@ def get_any(request, id: str):
except Exception:
pass
- try:
- from api.v1_crawls import get_seed
- response = get_seed(request, id)
- if response:
- return redirect(f"/api/v1/{response._meta.app_label}/{response._meta.model_name}/{response.id}?{request.META['QUERY_STRING']}")
- except Exception:
- pass
-
try:
from api.v1_crawls import get_crawl
response = get_crawl(request, id)
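With the Seed branch removed, `/api/v1/any/{id}` now only resolves Snapshots, ArchiveResults, Tags, and Crawls, answering with a redirect to the canonical per-model endpoint. A rough client-side sketch; the base URL and API-key header name are assumptions, not taken from this diff:

```python
import requests

def resolve_any(object_id: str, base_url: str = 'http://127.0.0.1:8000', api_key: str = '') -> dict:
    """Fetch any object by ID via /api/v1/any/{id} and follow the redirect to its canonical endpoint."""
    resp = requests.get(
        f'{base_url}/api/v1/any/{object_id}',
        headers={'X-ArchiveBox-API-Key': api_key} if api_key else {},  # assumed header name
        allow_redirects=True,   # get_any() responds with a redirect, not a payload
        timeout=30,
    )
    resp.raise_for_status()
    return resp.json()          # one of SnapshotSchema / ArchiveResultSchema / TagSchema / CrawlSchema
```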
diff --git a/archivebox/api/v1_crawls.py b/archivebox/api/v1_crawls.py
index d84f622d..600a0673 100644
--- a/archivebox/api/v1_crawls.py
+++ b/archivebox/api/v1_crawls.py
@@ -10,53 +10,13 @@ from django.contrib.auth import get_user_model
from ninja import Router, Schema
from core.models import Snapshot
-from crawls.models import Seed, Crawl
+from crawls.models import Crawl
from .auth import API_AUTH_METHODS
router = Router(tags=['Crawl Models'], auth=API_AUTH_METHODS)
-class SeedSchema(Schema):
- TYPE: str = 'crawls.models.Seed'
-
- id: UUID
-
- modified_at: datetime
- created_at: datetime
- created_by_id: str
- created_by_username: str
-
- uri: str
- tags_str: str
- config: dict
-
- @staticmethod
- def resolve_created_by_id(obj):
- return str(obj.created_by_id)
-
- @staticmethod
- def resolve_created_by_username(obj):
- User = get_user_model()
- return User.objects.get(id=obj.created_by_id).username
-
-@router.get("/seeds", response=List[SeedSchema], url_name="get_seeds")
-def get_seeds(request):
- return Seed.objects.all().distinct()
-
-@router.get("/seed/{seed_id}", response=SeedSchema, url_name="get_seed")
-def get_seed(request, seed_id: str):
- seed = None
- request.with_snapshots = False
- request.with_archiveresults = False
-
- try:
- seed = Seed.objects.get(Q(id__icontains=seed_id))
- except Exception:
- pass
- return seed
-
-
class CrawlSchema(Schema):
TYPE: str = 'crawls.models.Crawl'
@@ -66,24 +26,27 @@ class CrawlSchema(Schema):
created_at: datetime
created_by_id: str
created_by_username: str
-
+
status: str
retry_at: datetime | None
- seed: SeedSchema
+ urls: str
+ extractor: str
max_depth: int
-
+ tags_str: str
+ config: dict
+
# snapshots: List[SnapshotSchema]
@staticmethod
def resolve_created_by_id(obj):
return str(obj.created_by_id)
-
+
@staticmethod
def resolve_created_by_username(obj):
User = get_user_model()
return User.objects.get(id=obj.created_by_id).username
-
+
@staticmethod
def resolve_snapshots(obj, context):
if context['request'].with_snapshots:
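For reference, the reshaped CrawlSchema means a crawl payload now carries its URLs and settings inline rather than a nested seed object. An illustrative sketch of a `/api/v1/crawls/crawl/{id}` response, limited to the fields visible in this diff (all values are placeholders, not real data):

```python
example_crawl_response = {
    "TYPE": "crawls.models.Crawl",
    "id": "019d3a2e-0000-7000-8000-000000000000",
    "created_at": "2025-12-25T09:34:00Z",
    "created_by_id": "1",
    "created_by_username": "admin",
    "status": "queued",
    "retry_at": None,
    "urls": "https://example.com\nhttps://example.org",  # newline-separated, replaces the old nested seed
    "extractor": "auto",
    "max_depth": 1,
    "tags_str": "docs,reference",
    "config": {"ONLY_NEW": True},
}
```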
diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py
index 4c2737ee..5a33e11a 100644
--- a/archivebox/cli/__init__.py
+++ b/archivebox/cli/__init__.py
@@ -21,6 +21,7 @@ class ArchiveBoxGroup(click.Group):
meta_commands = {
'help': 'archivebox.cli.archivebox_help.main',
'version': 'archivebox.cli.archivebox_version.main',
+ 'mcp': 'archivebox.cli.archivebox_mcp.main',
}
setup_commands = {
'init': 'archivebox.cli.archivebox_init.main',
diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py
index b668d26b..451ed0d3 100644
--- a/archivebox/cli/archivebox_add.py
+++ b/archivebox/cli/archivebox_add.py
@@ -36,15 +36,14 @@ def add(urls: str | list[str],
created_by_id: int | None=None) -> QuerySet['Snapshot']:
"""Add a new URL or list of URLs to your archive.
- The new flow is:
+ The flow is:
1. Save URLs to sources file
- 2. Create Seed pointing to the file
- 3. Create Crawl with max_depth
- 4. Create root Snapshot pointing to file:// URL (depth=0)
- 5. Orchestrator runs parser extractors on root snapshot
- 6. Parser extractors output to urls.jsonl
- 7. URLs are added to Crawl.urls and child Snapshots are created
- 8. Repeat until max_depth is reached
+ 2. Create Crawl with URLs and max_depth
+ 3. Orchestrator creates Snapshots from Crawl URLs (depth=0)
+ 4. Orchestrator runs parser extractors on root snapshots
+ 5. Parser extractors output to urls.jsonl
+ 6. URLs are added to Crawl.urls and child Snapshots are created
+ 7. Repeat until max_depth is reached
"""
from rich import print
@@ -55,7 +54,7 @@ def add(urls: str | list[str],
# import models once django is set up
from core.models import Snapshot
- from crawls.models import Seed, Crawl
+ from crawls.models import Crawl
from archivebox.base_models.models import get_or_create_system_user_pk
from workers.orchestrator import Orchestrator
@@ -66,19 +65,24 @@ def add(urls: str | list[str],
sources_file.parent.mkdir(parents=True, exist_ok=True)
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
- # 2. Create a new Seed pointing to the sources file
+ # 2. Create a new Crawl with inline URLs
cli_args = [*sys.argv]
if cli_args[0].lower().endswith('archivebox'):
cli_args[0] = 'archivebox'
cmd_str = ' '.join(cli_args)
timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
- seed = Seed.from_file(
- sources_file,
+
+ # Read URLs directly into crawl
+ urls_content = sources_file.read_text()
+
+ crawl = Crawl.objects.create(
+ urls=urls_content,
+ extractor=parser,
+ max_depth=depth,
+ tags_str=tag,
label=f'{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]',
- parser=parser,
- tag=tag,
- created_by=created_by_id,
+ created_by_id=created_by_id,
config={
'ONLY_NEW': not update,
'INDEX_ONLY': index_only,
@@ -88,15 +92,13 @@ def add(urls: str | list[str],
}
)
- # 3. Create a new Crawl pointing to the Seed (status=queued)
- crawl = Crawl.from_seed(seed, max_depth=depth)
-
print(f'[green]\\[+] Created Crawl {crawl.id} with max_depth={depth}[/green]')
- print(f' [dim]Seed: {seed.uri}[/dim]')
+ first_url = crawl.get_urls_list()[0] if crawl.get_urls_list() else ''
+ print(f' [dim]First URL: {first_url}[/dim]')
- # 4. The CrawlMachine will create the root Snapshot when started
- # Root snapshot URL = file:///path/to/sources/...txt
- # Parser extractors will run on it and discover URLs
+ # 3. The CrawlMachine will create the root Snapshot when started
+ # If URLs are from a file: first URL = file:///path/to/sources/...txt
+ # Parser extractors will run on it and discover more URLs
# Those URLs become child Snapshots (depth=1)
if index_only:
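Condensed, the new add() flow boils down to a single Crawl.objects.create() call with the URLs stored inline on the Crawl. A minimal sketch using only the fields that appear in this diff, assuming Django is already set up:

```python
from django.utils import timezone

def create_crawl_for_urls(urls: list[str], depth: int = 0, tag: str = '', parser: str = 'auto'):
    from crawls.models import Crawl
    from archivebox.base_models.models import get_or_create_system_user_pk

    return Crawl.objects.create(
        urls='\n'.join(urls),                 # newline-separated URLs live directly on the Crawl now
        extractor=parser,
        max_depth=depth,
        tags_str=tag,
        label=f'sketch [{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}]',  # illustrative label
        created_by_id=get_or_create_system_user_pk(),
        config={'ONLY_NEW': True},
    )
```

The CrawlMachine then takes over: it creates root Snapshots from Crawl.urls, parser extractors emit discovered links to urls.jsonl, and those links feed back into the same crawl until max_depth is reached.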
diff --git a/archivebox/cli/archivebox_crawl.py b/archivebox/cli/archivebox_crawl.py
index 0c7e4d16..4fb5d671 100644
--- a/archivebox/cli/archivebox_crawl.py
+++ b/archivebox/cli/archivebox_crawl.py
@@ -76,7 +76,7 @@ def discover_outlinks(
)
from archivebox.base_models.models import get_or_create_system_user_pk
from core.models import Snapshot, ArchiveResult
- from crawls.models import Seed, Crawl
+ from crawls.models import Crawl
from archivebox.config import CONSTANTS
from workers.orchestrator import Orchestrator
@@ -117,12 +117,12 @@ def discover_outlinks(
sources_file.parent.mkdir(parents=True, exist_ok=True)
sources_file.write_text('\n'.join(r.get('url', '') for r in new_url_records if r.get('url')))
- seed = Seed.from_file(
+ crawl = Crawl.from_file(
sources_file,
+ max_depth=depth,
label=f'crawl --depth={depth}',
created_by=created_by_id,
)
- crawl = Crawl.from_seed(seed, max_depth=depth)
# Create snapshots for new URLs
for record in new_url_records:
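Crawl.from_file() now does in one step what Seed.from_file() + Crawl.from_seed() used to do in two. A sketch of the call under the signature implied by this diff (from_file is assumed to read the sources file into Crawl.urls):

```python
from pathlib import Path

def crawl_from_sources_file(sources_file: Path, depth: int, created_by_id: int):
    from crawls.models import Crawl

    return Crawl.from_file(
        sources_file,                     # file containing one URL per line
        max_depth=depth,
        label=f'crawl --depth={depth}',
        created_by=created_by_id,
    )
```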
diff --git a/archivebox/cli/archivebox_install.py b/archivebox/cli/archivebox_install.py
index b797944d..f7cb4c1a 100755
--- a/archivebox/cli/archivebox_install.py
+++ b/archivebox/cli/archivebox_install.py
@@ -42,27 +42,20 @@ def install(dry_run: bool=False) -> None:
setup_django()
from django.utils import timezone
- from crawls.models import Seed, Crawl
+ from crawls.models import Crawl
from archivebox.base_models.models import get_or_create_system_user_pk
- # Create a seed and crawl for dependency detection
+ # Create a crawl for dependency detection
# Using a minimal crawl that will trigger on_Crawl hooks
created_by_id = get_or_create_system_user_pk()
- seed, _created = Seed.objects.get_or_create(
- uri='archivebox://install',
+ crawl, created = Crawl.objects.get_or_create(
+ urls='archivebox://install',
label='Dependency detection',
created_by_id=created_by_id,
defaults={
'extractor': 'auto',
- }
- )
-
- crawl, created = Crawl.objects.get_or_create(
- seed=seed,
- max_depth=0,
- created_by_id=created_by_id,
- defaults={
+ 'max_depth': 0,
'status': 'queued',
}
)
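Because the dependency-detection crawl is keyed by get_or_create() on the urls/label/created_by_id lookup, repeated `archivebox install` runs reuse one Crawl row and the `defaults` only apply on first creation. A sketch of that behaviour, taken directly from the hunk above:

```python
def get_install_crawl(created_by_id: int):
    from crawls.models import Crawl

    crawl, created = Crawl.objects.get_or_create(
        urls='archivebox://install',
        label='Dependency detection',
        created_by_id=created_by_id,
        defaults={'extractor': 'auto', 'max_depth': 0, 'status': 'queued'},
    )
    return crawl, created   # created is True only on the very first run
```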
diff --git a/archivebox/cli/archivebox_snapshot.py b/archivebox/cli/archivebox_snapshot.py
index bb41af47..eb9a1e40 100644
--- a/archivebox/cli/archivebox_snapshot.py
+++ b/archivebox/cli/archivebox_snapshot.py
@@ -92,7 +92,7 @@ def create_snapshots(
)
from archivebox.base_models.models import get_or_create_system_user_pk
from core.models import Snapshot
- from crawls.models import Seed, Crawl
+ from crawls.models import Crawl
from archivebox.config import CONSTANTS
created_by_id = created_by_id or get_or_create_system_user_pk()
@@ -108,17 +108,17 @@ def create_snapshots(
# If depth > 0, we need a Crawl to manage recursive discovery
crawl = None
if depth > 0:
- # Create a seed for this batch
+ # Create a crawl for this batch
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__snapshot.txt'
sources_file.parent.mkdir(parents=True, exist_ok=True)
sources_file.write_text('\n'.join(r.get('url', '') for r in records if r.get('url')))
- seed = Seed.from_file(
+ crawl = Crawl.from_file(
sources_file,
+ max_depth=depth,
label=f'snapshot --depth={depth}',
created_by=created_by_id,
)
- crawl = Crawl.from_seed(seed, max_depth=depth)
# Process each record
created_snapshots = []
diff --git a/archivebox/cli/archivebox_version.py b/archivebox/cli/archivebox_version.py
index c891b8ea..59902c4b 100755
--- a/archivebox/cli/archivebox_version.py
+++ b/archivebox/cli/archivebox_version.py
@@ -111,53 +111,27 @@ def version(quiet: bool=False,
machine = Machine.current()
- # Get all *_BINARY config values
- binary_config_keys = [key for key in config.keys() if key.endswith('_BINARY')]
+ # Get all installed binaries from the database
+ all_installed = InstalledBinary.objects.filter(
+ machine=machine
+ ).exclude(abspath='').exclude(abspath__isnull=True).order_by('name')
- if not binary_config_keys:
- prnt('', '[grey53]No binary dependencies defined in config.[/grey53]')
+ if not all_installed.exists():
+ prnt('', '[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]')
else:
- for key in sorted(set(binary_config_keys)):
- # Get the actual binary name/path from config value
- # Prioritize Machine.config overrides over base config
- bin_value = machine.config.get(key) or config.get(key, '').strip()
- if not bin_value:
+ for installed in all_installed:
+ # Skip if user specified specific binaries and this isn't one
+ if binaries and installed.name not in binaries:
continue
- # Check if it's a path (has slashes) or just a name
- is_path = '/' in str(bin_value)
-
- if is_path:
- # It's a full path - match against abspath
- bin_name = Path(bin_value).name
- # Skip if user specified specific binaries and this isn't one
- if binaries and bin_name not in binaries:
- continue
- # Find InstalledBinary where abspath ends with this path
- installed = InstalledBinary.objects.filter(
- machine=machine,
- abspath__endswith=bin_value,
- ).exclude(abspath='').exclude(abspath__isnull=True).order_by('-modified_at').first()
- else:
- # It's just a binary name - match against name
- bin_name = bin_value
- # Skip if user specified specific binaries and this isn't one
- if binaries and bin_name not in binaries:
- continue
- # Find InstalledBinary by name
- installed = InstalledBinary.objects.filter(
- machine=machine,
- name__iexact=bin_name,
- ).exclude(abspath='').exclude(abspath__isnull=True).order_by('-modified_at').first()
-
- if installed and installed.is_valid:
+ if installed.is_valid:
display_path = installed.abspath.replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~')
version_str = (installed.version or 'unknown')[:15]
provider = (installed.binprovider or 'env')[:8]
- prnt('', '[green]√[/green]', '', bin_name.ljust(18), version_str.ljust(16), provider.ljust(8), display_path, overflow='ignore', crop=False)
+ prnt('', '[green]√[/green]', '', installed.name.ljust(18), version_str.ljust(16), provider.ljust(8), display_path, overflow='ignore', crop=False)
else:
- prnt('', '[red]X[/red]', '', bin_name.ljust(18), '[grey53]not installed[/grey53]', overflow='ignore', crop=False)
- failures.append(bin_name)
+ prnt('', '[red]X[/red]', '', installed.name.ljust(18), '[grey53]not installed[/grey53]', overflow='ignore', crop=False)
+ failures.append(installed.name)
# Show hint if no binaries are installed yet
has_any_installed = InstalledBinary.objects.filter(machine=machine).exclude(abspath='').exists()
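`archivebox version` now reports whatever InstalledBinary rows exist for the current machine instead of walking *_BINARY config keys. A rough equivalent for ad-hoc inspection from a Django shell; the machine.models import path is an assumption, only the ORM calls above are taken from the diff:

```python
def list_installed_binaries(only: set[str] | None = None) -> None:
    from machine.models import Machine, InstalledBinary   # assumed app path for these models

    machine = Machine.current()
    qs = (InstalledBinary.objects
          .filter(machine=machine)
          .exclude(abspath='')
          .exclude(abspath__isnull=True)
          .order_by('name'))
    for installed in qs:
        if only and installed.name not in only:
            continue   # mirrors the new `binaries` filtering in the version command
        status = 'ok' if installed.is_valid else 'not installed'
        version = (installed.version or 'unknown')[:15]
        provider = (installed.binprovider or 'env')[:8]
        print(f'{installed.name:<18} {version:<16} {provider:<8} {status}')
```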
diff --git a/archivebox/config/constants.py b/archivebox/config/constants.py
index 80894b58..a5c29ff4 100644
--- a/archivebox/config/constants.py
+++ b/archivebox/config/constants.py
@@ -96,10 +96,8 @@ class ConstantsDict(Mapping):
# Data dir files
CONFIG_FILENAME: str = 'ArchiveBox.conf'
SQL_INDEX_FILENAME: str = 'index.sqlite3'
- QUEUE_DATABASE_FILENAME: str = 'queue.sqlite3'
CONFIG_FILE: Path = DATA_DIR / CONFIG_FILENAME
DATABASE_FILE: Path = DATA_DIR / SQL_INDEX_FILENAME
- QUEUE_DATABASE_FILE: Path = DATA_DIR / QUEUE_DATABASE_FILENAME
JSON_INDEX_FILENAME: str = 'index.json'
HTML_INDEX_FILENAME: str = 'index.html'
@@ -184,10 +182,10 @@ class ConstantsDict(Mapping):
SQL_INDEX_FILENAME,
f"{SQL_INDEX_FILENAME}-wal",
f"{SQL_INDEX_FILENAME}-shm",
- QUEUE_DATABASE_FILENAME,
- f"{QUEUE_DATABASE_FILENAME}-wal",
- f"{QUEUE_DATABASE_FILENAME}-shm",
"search.sqlite3",
+ "queue.sqlite3",
+ "queue.sqlite3-wal",
+ "queue.sqlite3-shm",
JSON_INDEX_FILENAME,
HTML_INDEX_FILENAME,
ROBOTS_TXT_FILENAME,
diff --git a/archivebox/config/django.py b/archivebox/config/django.py
index d7910ec0..9b06db7b 100644
--- a/archivebox/config/django.py
+++ b/archivebox/config/django.py
@@ -56,6 +56,14 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}" 2>/dev/null')
os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}"/* 2>/dev/null')
+ # Suppress the "database access during app initialization" warning
+ # This warning can be triggered during django.setup() but is safe to ignore
+ # since we're doing intentional setup operations
+ import warnings
+ warnings.filterwarnings('ignore',
+ message='.*Accessing the database during app initialization.*',
+ category=RuntimeWarning)
+
try:
from django.core.management import call_command
@@ -87,7 +95,8 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
style='bold red',
))
STDERR.print()
- STDERR.print_exception(show_locals=False)
+ import traceback
+ traceback.print_exc()
return
from django.conf import settings
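The added filter matches on the warning text, so only that specific RuntimeWarning is silenced; unrelated RuntimeWarnings still surface. A standalone sketch of the same mechanism:

```python
import warnings

warnings.filterwarnings(
    'ignore',
    message='.*Accessing the database during app initialization.*',
    category=RuntimeWarning,
)

# suppressed: matches both the message regex and the category
warnings.warn('Accessing the database during app initialization is discouraged.', RuntimeWarning)
# still emitted: same category, different message
warnings.warn('Some unrelated runtime problem.', RuntimeWarning)
```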
diff --git a/archivebox/config/paths.py b/archivebox/config/paths.py
index 7c6fcdd7..0eeb84f8 100644
--- a/archivebox/config/paths.py
+++ b/archivebox/config/paths.py
@@ -224,12 +224,6 @@ def get_data_locations():
"is_valid": os.path.isfile(DATABASE_FILE) and os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK),
"is_mount": os.path.ismount(DATABASE_FILE.resolve()),
},
- "QUEUE_DATABASE": {
- "path": CONSTANTS.QUEUE_DATABASE_FILE,
- "enabled": True,
- "is_valid": os.path.isfile(CONSTANTS.QUEUE_DATABASE_FILE) and os.access(CONSTANTS.QUEUE_DATABASE_FILE, os.R_OK) and os.access(CONSTANTS.QUEUE_DATABASE_FILE, os.W_OK),
- "is_mount": os.path.ismount(CONSTANTS.QUEUE_DATABASE_FILE),
- },
"ARCHIVE_DIR": {
"path": ARCHIVE_DIR.resolve(),
"enabled": True,
diff --git a/archivebox/core/admin_snapshots.py b/archivebox/core/admin_snapshots.py
index d25f291c..bd73c363 100644
--- a/archivebox/core/admin_snapshots.py
+++ b/archivebox/core/admin_snapshots.py
@@ -33,15 +33,18 @@ GLOBAL_CONTEXT = {}
class SnapshotActionForm(ActionForm):
- tags = forms.ModelMultipleChoiceField(
- label='Edit tags',
- queryset=Tag.objects.all(),
- required=False,
- widget=FilteredSelectMultiple(
- 'core_tag__name',
- False,
- ),
- )
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ # Define tags field in __init__ to avoid database access during app initialization
+ self.fields['tags'] = forms.ModelMultipleChoiceField(
+ label='Edit tags',
+ queryset=Tag.objects.all(),
+ required=False,
+ widget=FilteredSelectMultiple(
+ 'core_tag__name',
+ False,
+ ),
+ )
# TODO: allow selecting actions for specific extractors? is this useful?
# extractor = forms.ChoiceField(
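The same deferred-field pattern used above works for any form whose choices come from the database: build the field in __init__ so no model queryset is constructed while Django apps are still loading. A generic sketch (LazyTagForm is illustrative only, not part of ArchiveBox):

```python
from django import forms

class LazyTagForm(forms.Form):
    """Illustrative only: defers queryset construction until a form instance exists."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        from core.models import Tag        # deferred import, resolved per instance
        self.fields['tags'] = forms.ModelMultipleChoiceField(
            label='Edit tags',
            queryset=Tag.objects.all(),    # built at instantiation time, evaluated lazily on render
            required=False,
        )
```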
@@ -165,14 +168,69 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
def admin_actions(self, obj):
return format_html(
- # URL Hash: {}
'''
- Summary page ➡️
- Result files 📑
- Admin actions ⚙️
+
+ Tip: Action buttons link to the list view with this snapshot pre-selected. Select it and use the action dropdown to execute.
+
''',
obj.timestamp,
obj.timestamp,
+ obj.url,
+ obj.pk,
+ obj.pk,
+ obj.pk,
obj.pk,
)
diff --git a/archivebox/core/migrations/0026_remove_archiveresult_output_dir_and_more.py b/archivebox/core/migrations/0026_remove_archiveresult_output_dir_and_more.py
new file mode 100644
index 00000000..dfead5b3
--- /dev/null
+++ b/archivebox/core/migrations/0026_remove_archiveresult_output_dir_and_more.py
@@ -0,0 +1,101 @@
+# Generated by Django 6.0 on 2025-12-25 09:34
+
+import archivebox.base_models.models
+import django.db.models.deletion
+import django.utils.timezone
+import uuid
+from django.conf import settings
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+ dependencies = [
+ ('core', '0025_allow_duplicate_urls_per_crawl'),
+ migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+ ]
+
+ operations = [
+ migrations.RemoveField(
+ model_name='archiveresult',
+ name='output_dir',
+ ),
+ migrations.RemoveField(
+ model_name='snapshot',
+ name='output_dir',
+ ),
+ migrations.AlterField(
+ model_name='archiveresult',
+ name='created_at',
+ field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
+ ),
+ migrations.AlterField(
+ model_name='archiveresult',
+ name='created_by',
+ field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresult_set', to=settings.AUTH_USER_MODEL),
+ ),
+ migrations.AlterField(
+ model_name='archiveresult',
+ name='extractor',
+ field=models.CharField(db_index=True, max_length=32),
+ ),
+ migrations.AlterField(
+ model_name='archiveresult',
+ name='id',
+ field=models.AutoField(editable=False, primary_key=True, serialize=False),
+ ),
+ migrations.AlterField(
+ model_name='archiveresult',
+ name='status',
+ field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
+ ),
+ migrations.AlterField(
+ model_name='archiveresult',
+ name='uuid',
+ field=models.UUIDField(blank=True, db_index=True, default=uuid.uuid7, null=True, unique=True),
+ ),
+ migrations.AlterField(
+ model_name='snapshot',
+ name='bookmarked_at',
+ field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
+ ),
+ migrations.AlterField(
+ model_name='snapshot',
+ name='created_at',
+ field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
+ ),
+ migrations.AlterField(
+ model_name='snapshot',
+ name='created_by',
+ field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to=settings.AUTH_USER_MODEL),
+ ),
+ migrations.AlterField(
+ model_name='snapshot',
+ name='downloaded_at',
+ field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
+ ),
+ migrations.AlterField(
+ model_name='snapshot',
+ name='id',
+ field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+ ),
+ # migrations.AlterField(
+ # model_name='snapshot',
+ # name='tags',
+ # field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', through_fields=('snapshot', 'tag'), to='core.tag'),
+ # ),
+ migrations.AlterField(
+ model_name='snapshottag',
+ name='id',
+ field=models.AutoField(primary_key=True, serialize=False),
+ ),
+ migrations.AlterField(
+ model_name='tag',
+ name='created_by',
+ field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
+ ),
+ migrations.AlterUniqueTogether(
+ name='snapshottag',
+ unique_together={('snapshot', 'tag')},
+ ),
+ ]
diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py
index d051229d..295dcfa4 100644
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@@ -59,7 +59,7 @@ INSTALLED_APPS = [
"config", # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
"machine", # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc.
"workers", # handles starting and managing background workers and processes (orchestrators and actors)
- "crawls", # handles Seed, Crawl, and CrawlSchedule models and management
+ "crawls", # handles Crawl and CrawlSchedule models and management
"personas", # handles Persona and session management
"core", # core django model with Snapshot, ArchiveResult, etc.
"api", # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
@@ -194,10 +194,6 @@ DATABASES = {
"NAME": DATABASE_NAME,
**SQLITE_CONNECTION_OPTIONS,
},
- "queue": {
- "NAME": CONSTANTS.QUEUE_DATABASE_FILE,
- **SQLITE_CONNECTION_OPTIONS,
- },
# "filestore": {
# "NAME": CONSTANTS.FILESTORE_DATABASE_FILE,
# **SQLITE_CONNECTION_OPTIONS,
diff --git a/archivebox/core/settings_logging.py b/archivebox/core/settings_logging.py
index 3b4ecd05..85d6404a 100644
--- a/archivebox/core/settings_logging.py
+++ b/archivebox/core/settings_logging.py
@@ -2,8 +2,6 @@ __package__ = 'archivebox.core'
import re
import os
-
-import shutil
import tempfile
import logging
@@ -11,7 +9,6 @@ import pydantic
import django.template
from archivebox.config import CONSTANTS
-from archivebox.misc.logging import IS_TTY
IGNORABLE_URL_PATTERNS = [
@@ -79,7 +76,6 @@ SETTINGS_LOGGING = {
"formatters": {
"rich": {
"datefmt": "[%Y-%m-%d %H:%M:%S]",
- # "format": "{asctime} {levelname} {module} {name} {message} {username}",
"format": "%(name)s %(message)s",
},
"outbound_webhooks": {
@@ -99,26 +95,13 @@ SETTINGS_LOGGING = {
},
},
"handlers": {
- # "console": {
- # "level": "DEBUG",
- # 'formatter': 'simple',
- # "class": "logging.StreamHandler",
- # 'filters': ['noisyrequestsfilter', 'add_extra_logging_attrs'],
- # },
"default": {
"class": "rich.logging.RichHandler",
"formatter": "rich",
"level": "DEBUG",
"markup": False,
- "rich_tracebacks": IS_TTY,
+ "rich_tracebacks": False, # Use standard Python tracebacks (no frame/box)
"filters": ["noisyrequestsfilter"],
- "tracebacks_suppress": [
- django,
- pydantic,
- ],
- "tracebacks_width": shutil.get_terminal_size((100, 10)).columns - 1,
- "tracebacks_word_wrap": False,
- "tracebacks_show_locals": False,
},
"logfile": {
"level": "INFO",
@@ -132,7 +115,7 @@ SETTINGS_LOGGING = {
"outbound_webhooks": {
"class": "rich.logging.RichHandler",
"markup": False,
- "rich_tracebacks": True,
+ "rich_tracebacks": False, # Use standard Python tracebacks (no frame/box)
"formatter": "outbound_webhooks",
},
# "mail_admins": {
diff --git a/archivebox/core/statemachines.py b/archivebox/core/statemachines.py
index fde35403..eccefbbd 100644
--- a/archivebox/core/statemachines.py
+++ b/archivebox/core/statemachines.py
@@ -15,7 +15,7 @@ from statemachine import State, StateMachine
# from workers.actor import ActorType
from core.models import Snapshot, ArchiveResult
-from crawls.models import Crawl, Seed
+from crawls.models import Crawl
class SnapshotMachine(StateMachine, strict_states=True):
@@ -247,17 +247,14 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
)
self.archiveresult.save(write_indexes=True)
- # Increment health stats on ArchiveResult, Snapshot, and optionally Crawl/Seed
+ # Increment health stats on ArchiveResult, Snapshot, and optionally Crawl
ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
Snapshot.objects.filter(pk=self.archiveresult.snapshot_id).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
- # Also update Crawl and Seed health stats if snapshot has a crawl
+ # Also update Crawl health stats if snapshot has a crawl
snapshot = self.archiveresult.snapshot
if snapshot.crawl_id:
Crawl.objects.filter(pk=snapshot.crawl_id).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
- crawl = Crawl.objects.filter(pk=snapshot.crawl_id).values_list('seed_id', flat=True).first()
- if crawl:
- Seed.objects.filter(pk=crawl).update(num_uses_succeeded=F('num_uses_succeeded') + 1)
@failed.enter
def enter_failed(self):
@@ -268,17 +265,14 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
end_ts=timezone.now(),
)
- # Increment health stats on ArchiveResult, Snapshot, and optionally Crawl/Seed
+ # Increment health stats on ArchiveResult, Snapshot, and optionally Crawl
ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(num_uses_failed=F('num_uses_failed') + 1)
Snapshot.objects.filter(pk=self.archiveresult.snapshot_id).update(num_uses_failed=F('num_uses_failed') + 1)
- # Also update Crawl and Seed health stats if snapshot has a crawl
+ # Also update Crawl health stats if snapshot has a crawl
snapshot = self.archiveresult.snapshot
if snapshot.crawl_id:
Crawl.objects.filter(pk=snapshot.crawl_id).update(num_uses_failed=F('num_uses_failed') + 1)
- crawl = Crawl.objects.filter(pk=snapshot.crawl_id).values_list('seed_id', flat=True).first()
- if crawl:
- Seed.objects.filter(pk=crawl).update(num_uses_failed=F('num_uses_failed') + 1)
@skipped.enter
def enter_skipped(self):
diff --git a/archivebox/core/views.py b/archivebox/core/views.py
index 3f9b1794..4c6932df 100644
--- a/archivebox/core/views.py
+++ b/archivebox/core/views.py
@@ -33,7 +33,7 @@ from archivebox.search import query_search_index
from core.models import Snapshot
from core.forms import AddLinkForm
-from crawls.models import Seed, Crawl
+from crawls.models import Crawl
from archivebox.hooks import get_extractors, get_extractor_name
@@ -119,7 +119,11 @@ class SnapshotView(View):
if result_file.name in existing_files or result_file.name == 'index.html':
continue
- file_size = result_file.stat().st_size or 0
+ # Skip circular symlinks and other stat() failures
+ try:
+ file_size = result_file.stat().st_size or 0
+ except OSError:
+ continue
if file_size > min_size_threshold:
archiveresults[result_file.name] = {
@@ -471,14 +475,16 @@ class AddView(UserPassesTestMixin, FormView):
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{self.request.user.pk}.txt'
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
- # 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__web_ui_add_by_user_{}:{}', source_file, contents)
-
- return format_html('See URLs here: {}', obj.uri, obj.uri)
-
-
-
-
-class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
- list_display = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at', 'num_snapshots')
- sort_fields = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at')
- search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'seed_id', 'schedule_id', 'status', 'seed__uri')
-
- readonly_fields = ('created_at', 'modified_at', 'snapshots', 'seed_urls_editor')
-
- fieldsets = (
- ('URLs', {
- 'fields': ('seed_urls_editor',),
- 'classes': ('card', 'wide'),
- }),
- ('Info', {
- 'fields': ('label', 'notes'),
- 'classes': ('card',),
- }),
- ('Settings', {
- 'fields': ('max_depth', 'config'),
+ 'fields': ('max_depth', 'extractor', 'config'),
'classes': ('card',),
}),
('Status', {
@@ -237,7 +161,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
'classes': ('card',),
}),
('Relations', {
- 'fields': ('seed', 'schedule', 'created_by'),
+ 'fields': ('schedule', 'created_by'),
'classes': ('card',),
}),
('Timestamps', {
@@ -250,7 +174,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
}),
)
- list_filter = ('max_depth', 'seed', 'schedule', 'created_by', 'status', 'retry_at')
+ list_filter = ('max_depth', 'extractor', 'schedule', 'created_by', 'status', 'retry_at')
ordering = ['-created_at', '-retry_at']
list_per_page = 100
actions = ["delete_selected"]
@@ -258,23 +182,20 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
@action(label='Recrawl', description='Create a new crawl with the same settings')
def recrawl(self, request, obj):
- """Duplicate this crawl as a new crawl with the same seed and settings."""
+ """Duplicate this crawl as a new crawl with the same URLs and settings."""
from django.utils import timezone
from django.shortcuts import redirect
- # Validate seed has a URI (required for crawl to start)
- if not obj.seed:
- messages.error(request, 'Cannot recrawl: original crawl has no seed.')
- return redirect('admin:crawls_crawl_change', obj.id)
-
- if not obj.seed.uri:
- messages.error(request, 'Cannot recrawl: seed has no URI.')
+ # Validate URLs (required for crawl to start)
+ if not obj.urls:
+ messages.error(request, 'Cannot recrawl: original crawl has no URLs.')
return redirect('admin:crawls_crawl_change', obj.id)
new_crawl = Crawl.objects.create(
- seed=obj.seed,
urls=obj.urls,
+ extractor=obj.extractor,
max_depth=obj.max_depth,
+ tags_str=obj.tags_str,
config=obj.config,
schedule=obj.schedule,
label=f"{obj.label} (recrawl)" if obj.label else "",
@@ -292,43 +213,6 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
return redirect('admin:crawls_crawl_change', new_crawl.id)
- def get_urls(self):
- urls = super().get_urls()
- custom_urls = [
- path('
+ {line_count} URL{'s' if line_count != 1 else ''} · URLs are read-only in admin, edit via API or CLI
+