+ '''
+
+ def _escape(self, s):
+ """Escape HTML special chars in attribute values."""
+ if not s:
+ return ''
+ return str(s).replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;').replace('"', '&quot;')
+
+ def value_from_datadict(self, data, files, name):
+ value = data.get(name, '{}')
+ return value
+
+
class ConfigEditorMixin:
"""
Mixin for admin classes with a config JSON field.
- Provides a readonly field that shows available config options
- from all discovered plugin schemas.
+ Provides a key-value editor widget with autocomplete for available config keys.
"""
- @admin.display(description='Available Config Options')
- def available_config_options(self, obj):
- """Show documentation for available config keys."""
- try:
- from archivebox.hooks import discover_plugin_configs
- plugin_configs = discover_plugin_configs()
- except ImportError:
- return format_html('<i>Plugin config system not available</i>')
-
- html_parts = [
- '<details>',
- '<summary>',
- 'Click to see available config keys ({})'.format(
- sum(len(s.get('properties', {})) for s in plugin_configs.values())
- ),
- '</summary>',
- ]
-
- for plugin_name, schema in sorted(plugin_configs.items()):
- properties = schema.get('properties', {})
- if not properties:
- continue
-
- html_parts.append(f'<h4>{plugin_name}</h4>')
- html_parts.append('<table>')
- html_parts.append('<tr><th>Key</th><th>Type</th><th>Default</th><th>Description</th></tr>')
-
- for key, prop in sorted(properties.items()):
- prop_type = prop.get('type', 'string')
- default = prop.get('default', '')
- description = prop.get('description', '')
-
- # Truncate long defaults
- default_str = str(default)
- if len(default_str) > 30:
- default_str = default_str[:27] + '...'
-
- html_parts.append(
- f'<tr>'
- f'<td><code>{key}</code></td>'
- f'<td>{prop_type}</td>'
- f'<td><code>{default_str}</code></td>'
- f'<td>{description}</td>'
- f'</tr>'
- )
-
- html_parts.append('</table>')
-
- html_parts.append('</details>')
- html_parts.append(
- '<p>'
- 'Usage: Add key-value pairs in JSON format, e.g., '
- '<code>{"SAVE_WGET": false, "WGET_TIMEOUT": 120}</code>'
- '</p>'
- )
-
- return mark_safe(''.join(html_parts))
+ def formfield_for_dbfield(self, db_field, request, **kwargs):
+ """Use KeyValueWidget for the config JSON field."""
+ if db_field.name == 'config':
+ kwargs['widget'] = KeyValueWidget()
+ return super().formfield_for_dbfield(db_field, request, **kwargs)
class BaseModelAdmin(DjangoObjectActions, admin.ModelAdmin):
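For orientation, a minimal sketch (not part of this diff) of how the mixin above is consumed: any ModelAdmin that mixes in ConfigEditorMixin gets the KeyValueWidget on its config JSON field via formfield_for_dbfield(), which is exactly how SeedAdmin, CrawlAdmin, and SnapshotAdmin later in this diff pick it up. The ExampleAdmin name is hypothetical.

    from django.contrib import admin

    class ExampleAdmin(ConfigEditorMixin, admin.ModelAdmin):
        fields = ('label', 'config')   # 'config' is a JSONField on the model
        # No explicit widget declaration needed: formfield_for_dbfield() swaps in
        # KeyValueWidget whenever db_field.name == 'config'.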
diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py
index 051f9f72..e9bcc53e 100644
--- a/archivebox/cli/archivebox_add.py
+++ b/archivebox/cli/archivebox_add.py
@@ -72,9 +72,10 @@ def add(urls: str | list[str],
cli_args[0] = 'archivebox'
cmd_str = ' '.join(cli_args)
+ timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
seed = Seed.from_file(
sources_file,
- label=f'{USER}@{HOSTNAME} $ {cmd_str}',
+ label=f'{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]',
parser=parser,
tag=tag,
created_by=created_by_id,
diff --git a/archivebox/config/configset.py b/archivebox/config/configset.py
index f483d991..aeadbbca 100644
--- a/archivebox/config/configset.py
+++ b/archivebox/config/configset.py
@@ -11,21 +11,53 @@ __package__ = "archivebox.config"
import os
import json
from pathlib import Path
-from typing import Any, Dict, Optional, List, Type, TYPE_CHECKING, cast
+from typing import Any, Dict, Optional, List, Type, Tuple, TYPE_CHECKING, cast
from configparser import ConfigParser
from pydantic import Field
-from pydantic_settings import BaseSettings
+from pydantic_settings import BaseSettings, PydanticBaseSettingsSource
+
+
+class IniConfigSettingsSource(PydanticBaseSettingsSource):
+ """
+ Custom settings source that reads from ArchiveBox.conf (INI format).
+ Flattens all sections into a single namespace.
+ """
+
+ def get_field_value(self, field: Any, field_name: str) -> Tuple[Any, str, bool]:
+ config_vals = self._load_config_file()
+ field_value = config_vals.get(field_name.upper())
+ return field_value, field_name, False
+
+ def __call__(self) -> Dict[str, Any]:
+ return self._load_config_file()
+
+ def _load_config_file(self) -> Dict[str, Any]:
+ try:
+ from archivebox.config.constants import CONSTANTS
+ config_path = CONSTANTS.CONFIG_FILE
+ except ImportError:
+ return {}
+
+ if not config_path.exists():
+ return {}
+
+ parser = ConfigParser()
+ parser.optionxform = lambda x: x # preserve case
+ parser.read(config_path)
+
+ # Flatten all sections into single namespace (ignore section headers)
+ return {key.upper(): value for section in parser.sections() for key, value in parser.items(section)}
class BaseConfigSet(BaseSettings):
"""
Base class for config sections.
- Automatically loads values from:
- 1. Environment variables (highest priority)
- 2. ArchiveBox.conf file (if exists)
- 3. Default values (lowest priority)
+ Automatically loads values from (highest to lowest priority):
+ 1. Environment variables
+ 2. ArchiveBox.conf file (INI format, flattened)
+ 3. Default values
Subclasses define fields with defaults and types:
@@ -35,11 +67,30 @@ class BaseConfigSet(BaseSettings):
"""
class Config:
- # Use env vars with ARCHIVEBOX_ prefix or raw name
env_prefix = ""
extra = "ignore"
validate_default = True
+ @classmethod
+ def settings_customise_sources(
+ cls,
+ settings_cls: Type[BaseSettings],
+ init_settings: PydanticBaseSettingsSource,
+ env_settings: PydanticBaseSettingsSource,
+ dotenv_settings: PydanticBaseSettingsSource,
+ file_secret_settings: PydanticBaseSettingsSource,
+ ) -> Tuple[PydanticBaseSettingsSource, ...]:
+ """
+ Define the order of settings sources (first = highest priority).
+ """
+ return (
+ init_settings, # 1. Passed to __init__
+ env_settings, # 2. Environment variables
+ IniConfigSettingsSource(settings_cls), # 3. ArchiveBox.conf file
+ # dotenv_settings, # Skip .env files
+ # file_secret_settings, # Skip secrets files
+ )
+
@classmethod
def load_from_file(cls, config_path: Path) -> Dict[str, str]:
"""Load config values from INI file."""
@@ -47,7 +98,7 @@ class BaseConfigSet(BaseSettings):
return {}
parser = ConfigParser()
- parser.optionxform = lambda x: x # type: ignore # preserve case
+ parser.optionxform = lambda x: x # preserve case
parser.read(config_path)
# Flatten all sections into single namespace
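A quick sketch of the flattening behaviour of IniConfigSettingsSource above, using the same ConfigParser settings; the section and key names in the sample conf are hypothetical. Because env_settings is listed before the INI source in settings_customise_sources(), an environment variable with the same name takes precedence over the file value.

    import textwrap
    from configparser import ConfigParser

    sample_conf = textwrap.dedent('''
        [SERVER_CONFIG]
        BIND_ADDR = 127.0.0.1:8000
        [ARCHIVING_CONFIG]
        TIMEOUT = 120
    ''')
    parser = ConfigParser()
    parser.optionxform = lambda x: x  # preserve case, as in _load_config_file()
    parser.read_string(sample_conf)
    flat = {key.upper(): value for section in parser.sections() for key, value in parser.items(section)}
    assert flat == {'BIND_ADDR': '127.0.0.1:8000', 'TIMEOUT': '120'}  # section headers are ignored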
diff --git a/archivebox/config/views.py b/archivebox/config/views.py
index 5cfb0190..0f1c33b6 100644
--- a/archivebox/config/views.py
+++ b/archivebox/config/views.py
@@ -256,7 +256,7 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
# Show a helpful message when no plugins found
rows['Name'].append('(no plugins found)')
rows['Source'].append('-')
- rows['Path'].append(format_html('archivebox/plugins/ or data/plugins/'))
+ rows['Path'].append(mark_safe('archivebox/plugins/ or data/plugins/'))
rows['Hooks'].append('-')
return TableContext(
diff --git a/archivebox/core/admin_archiveresults.py b/archivebox/core/admin_archiveresults.py
index 1e3b9be4..5497d2a6 100644
--- a/archivebox/core/admin_archiveresults.py
+++ b/archivebox/core/admin_archiveresults.py
@@ -9,25 +9,17 @@ from django.core.exceptions import ValidationError
from django.urls import reverse, resolve
from django.utils import timezone
-from huey_monitor.admin import TaskModel
-
from archivebox.config import DATA_DIR
from archivebox.config.common import SERVER_CONFIG
from archivebox.misc.paginators import AccelleratedPaginator
from archivebox.base_models.admin import BaseModelAdmin
+from archivebox.hooks import get_extractor_icon
from core.models import ArchiveResult, Snapshot
-
-def result_url(result: TaskModel) -> str:
- url = reverse("admin:huey_monitor_taskmodel_change", args=[str(result.id)])
- return format_html('<a href="{url}">See progress...</a>'.format(url=url))
-
-
-
class ArchiveResultInline(admin.TabularInline):
name = 'Archive Results Log'
model = ArchiveResult
@@ -101,9 +93,9 @@ class ArchiveResultInline(admin.TabularInline):
class ArchiveResultAdmin(BaseModelAdmin):
- list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'extractor', 'cmd_str', 'output_str')
+ list_display = ('id', 'created_by', 'created_at', 'snapshot_info', 'tags_str', 'status', 'extractor_with_icon', 'cmd_str', 'output_str')
sort_fields = ('id', 'created_by', 'created_at', 'extractor', 'status')
- readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary')
+ readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'output_summary', 'extractor_with_icon')
search_fields = ('id', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
fields = ('snapshot', 'extractor', 'status', 'retry_at', 'start_ts', 'end_ts', 'created_by', 'pwd', 'cmd_version', 'cmd', 'output', *readonly_fields)
autocomplete_fields = ['snapshot']
@@ -144,17 +136,29 @@ class ArchiveResultAdmin(BaseModelAdmin):
def tags_str(self, result):
return result.snapshot.tags_str()
+ @admin.display(description='Extractor', ordering='extractor')
+ def extractor_with_icon(self, result):
+ icon = get_extractor_icon(result.extractor)
+ return format_html(
+ '<span title="{}">{} {}</span>',
+ result.extractor,
+ icon,
+ result.extractor,
+ )
+
def cmd_str(self, result):
return format_html(
'<pre>{}</pre>',
' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd),
)
-
+
def output_str(self, result):
+ # Link to the extractor's output when the result succeeded and produced output, otherwise fall back to index.html
+ output_path = result.output if (result.status == 'succeeded' and result.output) else 'index.html'
return format_html(
'<a href="/archive/{}/{}">↗️ <code>{}</code></a>',
result.snapshot.timestamp,
- result.output if (result.status == 'succeeded') and result.extractor not in ('title', 'archive_org') else 'index.html',
+ output_path,
result.output,
)
@@ -185,7 +189,7 @@ class ArchiveResultAdmin(BaseModelAdmin):
is_hidden = filename.startswith('.')
output_str += format_html('<span style="opacity: {}">{}{}</span><br/>', int(not is_hidden), indentation_str, filename.strip())
- return output_str + format_html('</pre>')
+ return output_str + mark_safe('</pre>')
diff --git a/archivebox/core/admin_site.py b/archivebox/core/admin_site.py
index 0159b9bb..67e074ac 100644
--- a/archivebox/core/admin_site.py
+++ b/archivebox/core/admin_site.py
@@ -35,8 +35,19 @@ def register_admin_site():
admin.site = archivebox_admin
sites.site = archivebox_admin
-
- # Plugin admin registration is now handled by individual app admins
- # No longer using archivebox.pm.hook.register_admin()
-
+
+ # Register admin views for each app
+ # (Previously handled by ABX plugin system, now called directly)
+ from core.admin import register_admin as register_core_admin
+ from crawls.admin import register_admin as register_crawls_admin
+ from api.admin import register_admin as register_api_admin
+ from machine.admin import register_admin as register_machine_admin
+ from workers.admin import register_admin as register_workers_admin
+
+ register_core_admin(archivebox_admin)
+ register_crawls_admin(archivebox_admin)
+ register_api_admin(archivebox_admin)
+ register_machine_admin(archivebox_admin)
+ register_workers_admin(archivebox_admin)
+
return archivebox_admin
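The per-app register_admin() imports above assume each app exposes a function that takes the shared AdminSite and registers its models on it; a hedged sketch of that convention (the body is illustrative, using models that appear elsewhere in this diff):

    # e.g. in crawls/admin.py (illustrative shape, not the literal implementation):
    def register_admin(admin_site):
        """Register this app's ModelAdmins on the custom ArchiveBox admin site."""
        admin_site.register(Seed, SeedAdmin)
        admin_site.register(Crawl, CrawlAdmin)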
diff --git a/archivebox/core/admin_snapshots.py b/archivebox/core/admin_snapshots.py
index a50d7b03..d1917e52 100644
--- a/archivebox/core/admin_snapshots.py
+++ b/archivebox/core/admin_snapshots.py
@@ -25,7 +25,7 @@ from archivebox.workers.tasks import bg_archive_snapshots, bg_add
from core.models import Tag
from core.admin_tags import TagInline
-from core.admin_archiveresults import ArchiveResultInline, result_url
+from core.admin_archiveresults import ArchiveResultInline
# GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
@@ -54,10 +54,10 @@ class SnapshotActionForm(ActionForm):
class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
list_display = ('created_at', 'title_str', 'status', 'files', 'size', 'url_str')
sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
- readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'link_dir', 'available_config_options')
+ readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir')
search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name')
- fields = ('url', 'title', 'created_by', 'bookmarked_at', 'status', 'retry_at', 'crawl', 'config', 'available_config_options', *readonly_fields[:-1])
+ fields = ('url', 'title', 'created_by', 'bookmarked_at', 'status', 'retry_at', 'crawl', 'config', *readonly_fields)
ordering = ['-created_at']
actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
inlines = [TagInline, ArchiveResultInline]
@@ -93,12 +93,10 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
# self.request = request
# return super().get_queryset(request).prefetch_related('archiveresult_set').distinct() # .annotate(archiveresult_count=Count('archiveresult'))
- @admin.action(
- description="Imported Timestamp"
- )
+ @admin.display(description="Imported Timestamp")
def imported_timestamp(self, obj):
context = RequestContext(self.request, {
- 'bookmarked_date': obj.bookmarked,
+ 'bookmarked_date': obj.bookmarked_at,
'timestamp': obj.timestamp,
})
@@ -145,22 +143,15 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
def status_info(self, obj):
return format_html(
- # URL Hash: {}
'''
<b>Archived:</b> {} ({} files {})
<b>Favicon:</b> <img src="{}"/>
- <b>Status code:</b> {}
- <b>Server:</b> {}
- <b>Content type:</b> {}
<b>Extension:</b> {}
''',
'✅' if obj.is_archived else '❌',
obj.num_outputs,
self.size(obj) or '0kb',
f'/archive/{obj.timestamp}/favicon.ico',
- obj.status_code or '-',
- obj.headers and obj.headers.get('Server') or '-',
- obj.headers and obj.headers.get('Content-Type') or '-',
obj.extension or '-',
)
@@ -184,8 +175,8 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
obj.archive_path,
obj.archive_path,
obj.archive_path,
- 'fetched' if obj.latest_title or obj.title else 'pending',
- urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
+ 'fetched' if obj.title else 'pending',
+ urldecode(htmldecode(obj.title or ''))[:128] or 'Pending...'
) + mark_safe(f' {tags}')
@admin.display(
@@ -259,14 +250,13 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
description="ℹ️ Get Title"
)
def update_titles(self, request, queryset):
- from core.models import Snapshot
count = queryset.count()
# Queue snapshots for archiving via the state machine system
- result = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR})
+ queued = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR})
messages.success(
request,
- mark_safe(f"Title and favicon are updating in the background for {count} URLs. {result_url(result)}"),
+ f"Queued {queued} snapshots for title/favicon update. The orchestrator will process them in the background.",
)
@admin.action(
@@ -275,11 +265,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
def update_snapshots(self, request, queryset):
count = queryset.count()
- result = bg_archive_snapshots(queryset, kwargs={"overwrite": False, "out_dir": DATA_DIR})
+ queued = bg_archive_snapshots(queryset, kwargs={"overwrite": False, "out_dir": DATA_DIR})
messages.success(
request,
- mark_safe(f"Re-trying any previously failed methods for {count} URLs in the background. {result_url(result)}"),
+ f"Queued {queued} snapshots for re-archiving. The orchestrator will process them in the background.",
)
@@ -291,11 +281,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
timestamp = timezone.now().isoformat('T', 'seconds')
new_url = snapshot.url.split('#')[0] + f'#{timestamp}'
- result = bg_add({'urls': new_url, 'tag': snapshot.tags_str()})
+ bg_add({'urls': new_url, 'tag': snapshot.tags_str()})
messages.success(
request,
- mark_safe(f"Creating new fresh snapshots for {queryset.count()} URLs in the background. {result_url(result)}"),
+ f"Creating {queryset.count()} new fresh snapshots. The orchestrator will process them in the background.",
)
@admin.action(
@@ -304,11 +294,11 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
def overwrite_snapshots(self, request, queryset):
count = queryset.count()
- result = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "out_dir": DATA_DIR})
+ queued = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "out_dir": DATA_DIR})
messages.success(
request,
- mark_safe(f"Clearing all previous results and re-downloading {count} URLs in the background. {result_url(result)}"),
+ f"Queued {queued} snapshots for full re-archive (overwriting existing). The orchestrator will process them in the background.",
)
@admin.action(
diff --git a/archivebox/core/apps.py b/archivebox/core/apps.py
index 981edc52..5193166d 100644
--- a/archivebox/core/apps.py
+++ b/archivebox/core/apps.py
@@ -1,5 +1,7 @@
__package__ = 'archivebox.core'
+import sys
+
from django.apps import AppConfig
@@ -10,6 +12,41 @@ class CoreConfig(AppConfig):
"""Register the archivebox.core.admin_site as the main django admin site"""
from core.admin_site import register_admin_site
register_admin_site()
-
+ # Auto-start the orchestrator when running the web server
+ self._maybe_start_orchestrator()
+ def _maybe_start_orchestrator(self):
+ """Start the orchestrator if we're running a web server."""
+ import os
+
+ # Don't start orchestrator during migrations, shell, tests, etc.
+ # Only start when running: runserver, daphne, gunicorn, uwsgi
+ if not self._is_web_server():
+ return
+
+ # Don't start if RUN_ORCHESTRATOR env var is explicitly set to false
+ if os.environ.get('RUN_ORCHESTRATOR', '').lower() in ('false', '0', 'no'):
+ return
+
+ # Under runserver's autoreloader, only start in the serving child process (RUN_MAIN == 'true') to avoid starting twice
+ if os.environ.get('RUN_MAIN') != 'true' and 'runserver' in sys.argv:
+ return
+
+ try:
+ from workers.orchestrator import Orchestrator
+
+ if not Orchestrator.is_running():
+ # Start orchestrator as daemon (won't exit on idle when started by server)
+ orchestrator = Orchestrator(exit_on_idle=False)
+ orchestrator.start()
+ except Exception as e:
+ # Don't crash the server if orchestrator fails to start
+ import logging
+ logging.getLogger('archivebox').warning(f'Failed to auto-start orchestrator: {e}')
+
+ def _is_web_server(self) -> bool:
+ """Check if we're running a web server command."""
+ # Check for common web server indicators
+ server_commands = ('runserver', 'daphne', 'gunicorn', 'uwsgi', 'server')
+ return any(cmd in ' '.join(sys.argv).lower() for cmd in server_commands)
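Since _maybe_start_orchestrator() checks the RUN_ORCHESTRATOR environment variable, the auto-start can be suppressed per process; a minimal sketch (the placement before Django setup is the only assumption):

    import os
    # e.g. in a wrapper script that boots Django but must not spawn background workers:
    os.environ['RUN_ORCHESTRATOR'] = 'false'   # read by CoreConfig._maybe_start_orchestrator()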
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index e746c221..543435aa 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -23,7 +23,11 @@ from archivebox.config import CONSTANTS
from archivebox.misc.system import get_dir_size, atomic_write
from archivebox.misc.util import parse_date, base_url, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode
from archivebox.misc.hashing import get_dir_info
-from archivebox.hooks import ARCHIVE_METHODS_INDEXING_PRECEDENCE
+from archivebox.hooks import (
+ ARCHIVE_METHODS_INDEXING_PRECEDENCE,
+ get_extractors, get_extractor_name, get_extractor_icon,
+ DEFAULT_EXTRACTOR_ICONS,
+)
from archivebox.base_models.models import (
ModelWithUUID, ModelWithSerializers, ModelWithOutputDir,
ModelWithConfig, ModelWithNotes, ModelWithHealthStats,
@@ -343,45 +347,37 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
def icons(self) -> str:
"""Generate HTML icons showing which extractors have succeeded for this snapshot"""
from django.utils.html import format_html, mark_safe
- from collections import defaultdict
cache_key = f'result_icons:{self.pk}:{(self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at).timestamp()}'
def calc_icons():
if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
- archive_results = [r for r in self.archiveresult_set.all() if r.status == "succeeded" and r.output]
+ archive_results = {r.extractor: r for r in self.archiveresult_set.all() if r.status == "succeeded" and r.output}
else:
- archive_results = self.archiveresult_set.filter(status="succeeded", output__isnull=False)
+ archive_results = {r.extractor: r for r in self.archiveresult_set.filter(status="succeeded", output__isnull=False)}
path = self.archive_path
canon = self.canonical_outputs()
output = ""
output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> &nbsp;'
- icons = {
- "singlefile": "❶", "wget": "🆆", "dom": "🅷", "pdf": "📄",
- "screenshot": "💻", "media": "📼", "git": "🅶", "archive_org": "🏛",
- "readability": "🆁", "mercury": "🅼", "warc": "📦"
- }
- exclude = ["favicon", "title", "headers", "htmltotext", "archive_org"]
- extractor_outputs = defaultdict(lambda: None)
- for extractor, _ in ArchiveResult.EXTRACTOR_CHOICES:
- for result in archive_results:
- if result.extractor == extractor:
- extractor_outputs[extractor] = result
+ # Get all extractors from hooks system (sorted by numeric prefix)
+ all_extractors = [get_extractor_name(e) for e in get_extractors()]
- for extractor, _ in ArchiveResult.EXTRACTOR_CHOICES:
- if extractor not in exclude:
- existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
- output += format_html(output_template, path, canon.get(extractor, ''), str(bool(existing)), extractor, icons.get(extractor, "?"))
- if extractor == "wget":
- exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
- output += format_html(output_template, path, canon.get("warc", "warc/"), str(bool(exists)), "warc", icons.get("warc", "?"))
- if extractor == "archive_org":
- exists = extractor in extractor_outputs and extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
- output += '<a href="{}" class="exists-{}" title="{}">{}</a> &nbsp;'.format(canon.get("archive_org", ""), str(exists), "archive_org", icons.get("archive_org", "?"))
+ for extractor in all_extractors:
+ result = archive_results.get(extractor)
+ existing = result and result.status == 'succeeded' and result.output
+ icon = get_extractor_icon(extractor)
+ output += format_html(
+ output_template,
+ path,
+ canon.get(extractor, extractor + '/'),
+ str(bool(existing)),
+ extractor,
+ icon
+ )
- return format_html('{}', mark_safe(output))
+ return format_html('{}', mark_safe(output))
cache_result = cache.get(cache_key)
if cache_result:
@@ -767,12 +763,11 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
FAILED = 'failed', 'Failed'
SKIPPED = 'skipped', 'Skipped'
- EXTRACTOR_CHOICES = (
- ('htmltotext', 'htmltotext'), ('git', 'git'), ('singlefile', 'singlefile'), ('media', 'media'),
- ('archive_org', 'archive_org'), ('readability', 'readability'), ('mercury', 'mercury'),
- ('favicon', 'favicon'), ('pdf', 'pdf'), ('headers', 'headers'), ('screenshot', 'screenshot'),
- ('dom', 'dom'), ('title', 'title'), ('wget', 'wget'),
- )
+ @classmethod
+ def get_extractor_choices(cls):
+ """Get extractor choices from discovered hooks (for forms/admin)."""
+ extractors = [get_extractor_name(e) for e in get_extractors()]
+ return tuple((e, e) for e in extractors)
# Keep AutoField for backward compatibility with 0.7.x databases
# UUID field is added separately by migration for new records
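For clarity, a sketch of the naming convention the hooks helpers rely on, inferred from the "sorted by numeric prefix" comments elsewhere in this diff; the concrete ids below are hypothetical:

    # Illustrative values only - real ids are whatever the plugin system discovers:
    # get_extractors()                      -> ['10_favicon', '20_wget', '50_screenshot', ...]
    # get_extractor_name('20_wget')         -> 'wget'   (ordering prefix stripped)
    # ArchiveResult.get_extractor_choices() -> (('favicon', 'favicon'), ('wget', 'wget'), ...)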
@@ -783,7 +778,8 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
modified_at = models.DateTimeField(auto_now=True)
snapshot: Snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) # type: ignore
- extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32, blank=False, null=False, db_index=True)
+ # No choices= constraint - extractor names come from plugin system and can be any string
+ extractor = models.CharField(max_length=32, blank=False, null=False, db_index=True)
pwd = models.CharField(max_length=256, default=None, null=True, blank=True)
cmd = models.JSONField(default=None, null=True, blank=True)
cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
@@ -835,6 +831,25 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
def output_exists(self) -> bool:
return os.path.exists(Path(self.snapshot_dir) / self.extractor)
+ def embed_path(self) -> Optional[str]:
+ """
+ Get the relative path to the embeddable output file for this result.
+
+ Returns the output field if set and file exists, otherwise tries to
+ find a reasonable default based on the extractor type.
+ """
+ if self.output:
+ return self.output
+
+ # Try to find output file based on extractor's canonical output path
+ canonical = self.snapshot.canonical_outputs()
+ extractor_key = f'{self.extractor}_path'
+ if extractor_key in canonical:
+ return canonical[extractor_key]
+
+ # Fallback to extractor directory
+ return f'{self.extractor}/'
+
def create_output_dir(self):
output_dir = Path(self.snapshot_dir) / self.extractor
output_dir.mkdir(parents=True, exist_ok=True)
@@ -891,6 +906,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
output_dir=extractor_dir,
config_objects=config_objects,
url=self.snapshot.url,
+ snapshot_id=str(self.snapshot.id),
)
end_ts = timezone.now()
@@ -1000,6 +1016,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
hook,
output_dir=self.output_dir,
config_objects=config_objects,
+ url=self.snapshot.url,
snapshot_id=str(self.snapshot.id),
extractor=self.extractor,
)
diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py
index c3a67d09..d051229d 100644
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@@ -68,9 +68,6 @@ INSTALLED_APPS = [
# 3rd-party apps from PyPI that need to be loaded last
"admin_data_views", # handles rendering some convenient automatic read-only views of data in Django admin
"django_extensions", # provides Django Debug Toolbar (and other non-debug helpers)
- "django_huey", # provides multi-queue support for django huey https://github.com/gaiacoop/django-huey
- "bx_django_utils", # needed for huey_monitor https://github.com/boxine/bx_django_utils
- "huey_monitor", # adds an admin UI for monitoring background huey tasks https://github.com/boxine/django-huey-monitor
]
@@ -215,70 +212,6 @@ MIGRATION_MODULES = {"signal_webhooks": None}
# as much as I'd love this to be a UUID or ULID field, it's not supported yet as of Django 5.0
DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"
-HUEY = {
- "huey_class": "huey.SqliteHuey",
- "filename": CONSTANTS.QUEUE_DATABASE_FILENAME,
- "name": "commands",
- "results": True,
- "store_none": True,
- "immediate": False,
- "utc": True,
- "consumer": {
- "workers": 1,
- "worker_type": "thread",
- "initial_delay": 0.1, # Smallest polling interval, same as -d.
- "backoff": 1.15, # Exponential backoff using this rate, -b.
- "max_delay": 10.0, # Max possible polling interval, -m.
- "scheduler_interval": 1, # Check schedule every second, -s.
- "periodic": True, # Enable crontab feature.
- "check_worker_health": True, # Enable worker health checks.
- "health_check_interval": 1, # Check worker health every second.
- },
-}
-
-# https://huey.readthedocs.io/en/latest/contrib.html#setting-things-up
-# https://github.com/gaiacoop/django-huey
-DJANGO_HUEY = {
- "default": "commands",
- "queues": {
- HUEY["name"]: HUEY.copy(),
- # more registered here at plugin import-time by BaseQueue.register()
- # Additional huey queues configured via settings
- },
-}
-
-
-class HueyDBRouter:
- """
- A router to store all the Huey result k:v / Huey Monitor models in the queue.sqlite3 database.
- We keep the databases separate because the queue database receives many more reads/writes per second
- and we want to avoid single-write lock contention with the main database. Also all the in-progress task
- data is ephemeral/not-important-long-term. This makes it easier to for the user to clear non-critical
- temp data by just deleting queue.sqlite3 and leaving index.sqlite3.
- """
-
- route_app_labels = {"huey_monitor", "django_huey", "djhuey"}
- db_name = "queue"
-
- def db_for_read(self, model, **hints):
- if model._meta.app_label in self.route_app_labels:
- return self.db_name
- return "default"
-
- def db_for_write(self, model, **hints):
- if model._meta.app_label in self.route_app_labels:
- return self.db_name
- return "default"
-
- def allow_relation(self, obj1, obj2, **hints):
- if obj1._meta.app_label in self.route_app_labels or obj2._meta.app_label in self.route_app_labels:
- return obj1._meta.app_label == obj2._meta.app_label
- return None
-
- def allow_migrate(self, db, app_label, model_name=None, **hints):
- if app_label in self.route_app_labels:
- return db == self.db_name
- return db == "default"
# class FilestoreDBRouter:
@@ -311,7 +244,7 @@ class HueyDBRouter:
# return db == self.db_name
# return db == "default"
-DATABASE_ROUTERS = ["core.settings.HueyDBRouter"]
+DATABASE_ROUTERS = []
CACHES = {
"default": {"BACKEND": "django.core.cache.backends.locmem.LocMemCache"},
diff --git a/archivebox/core/templatetags/core_tags.py b/archivebox/core/templatetags/core_tags.py
index 2de610bc..b2c126cd 100644
--- a/archivebox/core/templatetags/core_tags.py
+++ b/archivebox/core/templatetags/core_tags.py
@@ -1,9 +1,13 @@
from django import template
from django.contrib.admin.templatetags.base import InclusionAdminNode
-
+from django.utils.safestring import mark_safe
from typing import Union
+from archivebox.hooks import (
+ get_extractor_icon, get_extractor_template, get_extractor_name,
+)
+
register = template.Library()
@@ -44,3 +48,115 @@ def url_replace(context, **kwargs):
dict_ = context['request'].GET.copy()
dict_.update(**kwargs)
return dict_.urlencode()
+
+
+@register.simple_tag
+def extractor_icon(extractor: str) -> str:
+ """
+ Render the icon for an extractor.
+
+ Usage: {% extractor_icon "screenshot" %}
+ """
+ return mark_safe(get_extractor_icon(extractor))
+
+
+@register.simple_tag(takes_context=True)
+def extractor_thumbnail(context, result) -> str:
+ """
+ Render the thumbnail template for an archive result.
+
+ Usage: {% extractor_thumbnail result %}
+
+ Context variables passed to template:
+ - result: ArchiveResult object
+ - snapshot: Parent Snapshot object
+ - output_path: Path to output relative to snapshot dir (from embed_path())
+ - extractor: Extractor base name
+ """
+ extractor = get_extractor_name(result.extractor)
+ template_str = get_extractor_template(extractor, 'thumbnail')
+
+ if not template_str:
+ return ''
+
+ # Use embed_path() for the display path (includes canonical paths)
+ output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '')
+
+ # Create a mini template and render it with context
+ try:
+ tpl = template.Template(template_str)
+ ctx = template.Context({
+ 'result': result,
+ 'snapshot': result.snapshot,
+ 'output_path': output_path,
+ 'extractor': extractor,
+ })
+ return mark_safe(tpl.render(ctx))
+ except Exception:
+ return ''
+
+
+@register.simple_tag(takes_context=True)
+def extractor_embed(context, result) -> str:
+ """
+ Render the embed iframe template for an archive result.
+
+ Usage: {% extractor_embed result %}
+ """
+ extractor = get_extractor_name(result.extractor)
+ template_str = get_extractor_template(extractor, 'embed')
+
+ if not template_str:
+ return ''
+
+ output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '')
+
+ try:
+ tpl = template.Template(template_str)
+ ctx = template.Context({
+ 'result': result,
+ 'snapshot': result.snapshot,
+ 'output_path': output_path,
+ 'extractor': extractor,
+ })
+ return mark_safe(tpl.render(ctx))
+ except Exception:
+ return ''
+
+
+@register.simple_tag(takes_context=True)
+def extractor_fullscreen(context, result) -> str:
+ """
+ Render the fullscreen template for an archive result.
+
+ Usage: {% extractor_fullscreen result %}
+ """
+ extractor = get_extractor_name(result.extractor)
+ template_str = get_extractor_template(extractor, 'fullscreen')
+
+ if not template_str:
+ return ''
+
+ output_path = result.embed_path() if hasattr(result, 'embed_path') else (result.output or '')
+
+ try:
+ tpl = template.Template(template_str)
+ ctx = template.Context({
+ 'result': result,
+ 'snapshot': result.snapshot,
+ 'output_path': output_path,
+ 'extractor': extractor,
+ })
+ return mark_safe(tpl.render(ctx))
+ except Exception:
+ return ''
+
+
+@register.filter
+def extractor_name(value: str) -> str:
+ """
+ Get the base name of an extractor (strips numeric prefix).
+
+ Usage: {{ result.extractor|extractor_name }}
+ """
+ return get_extractor_name(value)
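To show how the tags above fit together, a hedged example: get_extractor_template() is assumed to return a small Django template string per extractor and view ('thumbnail', 'embed', 'fullscreen'), which the tag renders with the documented context. The snippet below is invented for illustration only.

    # Hypothetical value that get_extractor_template('screenshot', 'thumbnail') might return;
    # extractor_thumbnail() would render it with result/snapshot/output_path/extractor in context:
    EXAMPLE_THUMBNAIL_TEMPLATE = '''
    <a href="/archive/{{ snapshot.timestamp }}/{{ output_path }}" title="{{ extractor }}">
      <img src="/archive/{{ snapshot.timestamp }}/{{ output_path }}" class="thumbnail"/>
    </a>
    '''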
diff --git a/archivebox/core/urls.py b/archivebox/core/urls.py
index c8b3bed9..910d59ee 100644
--- a/archivebox/core/urls.py
+++ b/archivebox/core/urls.py
@@ -8,7 +8,7 @@ from django.views.generic.base import RedirectView
from archivebox.misc.serve_static import serve_static
from core.admin_site import archivebox_admin
-from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView
+from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView, live_progress_view
from workers.views import JobsDashboardView
@@ -43,8 +43,10 @@ urlpatterns = [
path('accounts/', include('django.contrib.auth.urls')),
+
+ path('admin/live-progress/', live_progress_view, name='live_progress'),
path('admin/', archivebox_admin.urls),
-
+
path("api/", include('api.urls'), name='api'),
path('health/', HealthCheckView.as_view(), name='healthcheck'),
diff --git a/archivebox/core/views.py b/archivebox/core/views.py
index 6a662d04..43110364 100644
--- a/archivebox/core/views.py
+++ b/archivebox/core/views.py
@@ -34,6 +34,7 @@ from archivebox.search import query_search_index
from core.models import Snapshot
from core.forms import AddLinkForm
from crawls.models import Seed, Crawl
+from archivebox.hooks import get_extractors, get_extractor_name
@@ -54,8 +55,10 @@ class SnapshotView(View):
@staticmethod
def render_live_index(request, snapshot):
TITLE_LOADING_MSG = 'Not yet archived...'
- HIDDEN_RESULTS = ('favicon', 'headers', 'title', 'htmltotext', 'warc', 'archive_org')
+ # Dict of extractor -> ArchiveResult object
+ archiveresult_objects = {}
+ # Dict of extractor -> result info dict (for template compatibility)
archiveresults = {}
results = snapshot.archiveresult_set.all()
@@ -65,18 +68,21 @@ class SnapshotView(View):
abs_path = result.snapshot_dir / (embed_path or 'None')
if (result.status == 'succeeded'
- and (result.extractor not in HIDDEN_RESULTS)
and embed_path
and os.access(abs_path, os.R_OK)
and abs_path.exists()):
if os.path.isdir(abs_path) and not any(abs_path.glob('*.*')):
continue
+ # Store the full ArchiveResult object for template tags
+ archiveresult_objects[result.extractor] = result
+
result_info = {
'name': result.extractor,
'path': embed_path,
'ts': ts_to_date_str(result.end_ts),
'size': abs_path.stat().st_size or '?',
+ 'result': result, # Include the full object for template tags
}
archiveresults[result.extractor] = result_info
@@ -101,11 +107,11 @@ class SnapshotView(View):
}
- # iterate through all the files in the snapshot dir and add the biggest ones to1 the result list
+ # iterate through all the files in the snapshot dir and add the biggest ones to the result list
snap_dir = Path(snapshot.output_dir)
if not os.path.isdir(snap_dir) and os.access(snap_dir, os.R_OK):
return {}
-
+
for result_file in (*snap_dir.glob('*'), *snap_dir.glob('*/*')):
extension = result_file.suffix.lstrip('.').lower()
if result_file.is_dir() or result_file.name.startswith('.') or extension not in allowed_extensions:
@@ -121,12 +127,16 @@ class SnapshotView(View):
'path': result_file.relative_to(snap_dir),
'ts': ts_to_date_str(result_file.stat().st_mtime or 0),
'size': file_size,
+ 'result': None, # No ArchiveResult object for filesystem-discovered files
}
- preferred_types = ('singlefile', 'screenshot', 'wget', 'dom', 'media', 'pdf', 'readability', 'mercury')
+ # Get available extractors from hooks (sorted by numeric prefix for ordering)
+ # Convert to base names for display ordering
+ all_extractors = [get_extractor_name(e) for e in get_extractors()]
+ preferred_types = tuple(all_extractors)
all_types = preferred_types + tuple(result_type for result_type in archiveresults.keys() if result_type not in preferred_types)
- best_result = {'path': 'None'}
+ best_result = {'path': 'None', 'result': None}
for result_type in preferred_types:
if result_type in archiveresults:
best_result = archiveresults[result_type]
@@ -157,6 +167,7 @@ class SnapshotView(View):
'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']),
'best_result': best_result,
+ 'snapshot': snapshot, # Pass the snapshot object for template tags
}
return render(template_name='core/snapshot_live.html', request=request, context=context)
@@ -436,7 +447,7 @@ class AddView(UserPassesTestMixin, FormView):
def form_valid(self, form):
urls = form.cleaned_data["url"]
print(f'[+] Adding URL: {urls}')
- parser = form.cleaned_data["parser"]
+ parser = form.cleaned_data.get("parser", "auto") # default to auto-detect parser
tag = form.cleaned_data["tag"]
depth = 0 if form.cleaned_data["depth"] == "0" else 1
extractors = ','.join(form.cleaned_data["archive_methods"])
@@ -452,18 +463,19 @@ class AddView(UserPassesTestMixin, FormView):
if extractors:
input_kwargs.update({"extractors": extractors})
-
+
from archivebox.config.permissions import HOSTNAME
-
-
+
+
# 1. save the provided urls to sources/2024-11-05__23-59-59__web_ui_add_by_user_.txt
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{self.request.user.pk}.txt'
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
-
+
# 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__web_ui_add_by_user_.txt
+ timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S")
seed = Seed.from_file(
sources_file,
- label=f'{self.request.user.username}@{HOSTNAME}{self.request.path}',
+ label=f'{self.request.user.username}@{HOSTNAME}{self.request.path} {timestamp}',
parser=parser,
tag=tag,
created_by=self.request.user.pk,
@@ -472,7 +484,7 @@ class AddView(UserPassesTestMixin, FormView):
# 'INDEX_ONLY': index_only,
# 'OVERWRITE': False,
'DEPTH': depth,
- 'EXTRACTORS': parser,
+ 'EXTRACTORS': extractors or '',
# 'DEFAULT_PERSONA': persona or 'Default',
})
# 3. create a new Crawl pointing to the Seed
@@ -490,10 +502,15 @@ class AddView(UserPassesTestMixin, FormView):
self.request,
mark_safe(f"Adding {rough_url_count} URLs in the background. (refresh in a minute start seeing results) {crawl.admin_change_url}"),
)
- # if not bg:
- # from workers.orchestrator import Orchestrator
- # orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4)
- # orchestrator.start()
+
+ # Start orchestrator in background to process the queued crawl
+ try:
+ from archivebox.workers.tasks import ensure_orchestrator_running
+ ensure_orchestrator_running()
+ except Exception as e:
+ # Orchestrator may already be running via supervisord, or fail to start
+ # This is not fatal - the crawl will be processed when the orchestrator runs
+ print(f'[!] Failed to start orchestrator: {e}')
return redirect(crawl.admin_change_url)
@@ -513,6 +530,141 @@ class HealthCheckView(View):
)
+import json
+from django.http import JsonResponse
+
+def live_progress_view(request):
+ """Simple JSON endpoint for live progress status - used by admin progress monitor."""
+ try:
+ from workers.orchestrator import Orchestrator
+ from crawls.models import Crawl
+ from core.models import Snapshot, ArchiveResult
+
+ # Get orchestrator status
+ orchestrator_running = Orchestrator.is_running()
+ total_workers = Orchestrator().get_total_worker_count() if orchestrator_running else 0
+
+ # Get model counts by status
+ crawls_pending = Crawl.objects.filter(status=Crawl.StatusChoices.QUEUED).count()
+ crawls_started = Crawl.objects.filter(status=Crawl.StatusChoices.STARTED).count()
+
+ # Get recent crawls (last 24 hours)
+ from datetime import timedelta
+ one_day_ago = timezone.now() - timedelta(days=1)
+ crawls_recent = Crawl.objects.filter(created_at__gte=one_day_ago).count()
+
+ snapshots_pending = Snapshot.objects.filter(status=Snapshot.StatusChoices.QUEUED).count()
+ snapshots_started = Snapshot.objects.filter(status=Snapshot.StatusChoices.STARTED).count()
+
+ archiveresults_pending = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.QUEUED).count()
+ archiveresults_started = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.STARTED).count()
+ archiveresults_succeeded = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.SUCCEEDED).count()
+ archiveresults_failed = ArchiveResult.objects.filter(status=ArchiveResult.StatusChoices.FAILED).count()
+
+ # Build hierarchical active crawls with nested snapshots and archive results
+ active_crawls = []
+ for crawl in Crawl.objects.filter(
+ status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]
+ ).order_by('-modified_at')[:10]:
+ # Get snapshots for this crawl
+ crawl_snapshots = Snapshot.objects.filter(crawl=crawl)
+ total_snapshots = crawl_snapshots.count()
+ completed_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.SEALED).count()
+ pending_snapshots = crawl_snapshots.filter(status=Snapshot.StatusChoices.QUEUED).count()
+
+ # Calculate crawl progress
+ crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0
+
+ # Get active snapshots for this crawl
+ active_snapshots_for_crawl = []
+ for snapshot in crawl_snapshots.filter(
+ status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
+ ).order_by('-modified_at')[:5]:
+ # Get archive results for this snapshot
+ snapshot_results = ArchiveResult.objects.filter(snapshot=snapshot)
+ total_extractors = snapshot_results.count()
+ completed_extractors = snapshot_results.filter(status=ArchiveResult.StatusChoices.SUCCEEDED).count()
+ failed_extractors = snapshot_results.filter(status=ArchiveResult.StatusChoices.FAILED).count()
+ pending_extractors = snapshot_results.filter(status=ArchiveResult.StatusChoices.QUEUED).count()
+
+ # Calculate snapshot progress
+ snapshot_progress = int(((completed_extractors + failed_extractors) / total_extractors) * 100) if total_extractors > 0 else 0
+
+ # Get active extractors for this snapshot
+ active_extractors = [
+ {
+ 'id': str(ar.id),
+ 'extractor': ar.extractor,
+ 'status': ar.status,
+ 'started': ar.start_ts.isoformat() if ar.start_ts else None,
+ 'progress': 50,
+ }
+ for ar in snapshot_results.filter(status=ArchiveResult.StatusChoices.STARTED).order_by('-start_ts')[:5]
+ ]
+
+ active_snapshots_for_crawl.append({
+ 'id': str(snapshot.id),
+ 'url': snapshot.url[:80],
+ 'status': snapshot.status,
+ 'started': snapshot.modified_at.isoformat() if snapshot.modified_at else None,
+ 'progress': snapshot_progress,
+ 'total_extractors': total_extractors,
+ 'completed_extractors': completed_extractors,
+ 'failed_extractors': failed_extractors,
+ 'pending_extractors': pending_extractors,
+ 'active_extractors': active_extractors,
+ })
+
+ active_crawls.append({
+ 'id': str(crawl.id),
+ 'label': str(crawl)[:60],
+ 'status': crawl.status,
+ 'started': crawl.modified_at.isoformat() if crawl.modified_at else None,
+ 'progress': crawl_progress,
+ 'max_depth': crawl.max_depth,
+ 'total_snapshots': total_snapshots,
+ 'completed_snapshots': completed_snapshots,
+ 'failed_snapshots': 0,
+ 'pending_snapshots': pending_snapshots,
+ 'active_snapshots': active_snapshots_for_crawl,
+ })
+
+ return JsonResponse({
+ 'orchestrator_running': orchestrator_running,
+ 'total_workers': total_workers,
+ 'crawls_pending': crawls_pending,
+ 'crawls_started': crawls_started,
+ 'crawls_recent': crawls_recent,
+ 'snapshots_pending': snapshots_pending,
+ 'snapshots_started': snapshots_started,
+ 'archiveresults_pending': archiveresults_pending,
+ 'archiveresults_started': archiveresults_started,
+ 'archiveresults_succeeded': archiveresults_succeeded,
+ 'archiveresults_failed': archiveresults_failed,
+ 'active_crawls': active_crawls,
+ 'server_time': timezone.now().isoformat(),
+ })
+ except Exception as e:
+ import traceback
+ return JsonResponse({
+ 'error': str(e),
+ 'traceback': traceback.format_exc(),
+ 'orchestrator_running': False,
+ 'total_workers': 0,
+ 'crawls_pending': 0,
+ 'crawls_started': 0,
+ 'crawls_recent': 0,
+ 'snapshots_pending': 0,
+ 'snapshots_started': 0,
+ 'archiveresults_pending': 0,
+ 'archiveresults_started': 0,
+ 'archiveresults_succeeded': 0,
+ 'archiveresults_failed': 0,
+ 'active_crawls': [],
+ 'server_time': timezone.now().isoformat(),
+ }, status=500)
+
+
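The JSON endpoint above is intended to be polled by the admin progress monitor; a minimal client sketch, assuming the default dev server address and the /admin/live-progress/ route added in urls.py earlier in this diff (authentication omitted):

    import time, requests

    while True:
        status = requests.get('http://127.0.0.1:8000/admin/live-progress/').json()
        print(status['crawls_started'], 'crawls and',
              status['archiveresults_started'], 'extractors currently running')
        if not status['orchestrator_running']:
            break
        time.sleep(2)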
def find_config_section(key: str) -> str:
CONFIGS = get_all_configs()
diff --git a/archivebox/crawls/admin.py b/archivebox/crawls/admin.py
index 3b6453c7..611a80bc 100644
--- a/archivebox/crawls/admin.py
+++ b/archivebox/crawls/admin.py
@@ -1,10 +1,18 @@
__package__ = 'archivebox.crawls'
-from django.utils.html import format_html, format_html_join
-from django.contrib import admin
+import json
+from pathlib import Path
+
+from django.utils.html import format_html, format_html_join, mark_safe
+from django.contrib import admin, messages
+from django.urls import path
+from django.http import JsonResponse
+from django.views.decorators.http import require_POST
from archivebox import DATA_DIR
+from django_object_actions import action
+
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
from core.models import Snapshot
@@ -16,8 +24,8 @@ class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str')
search_fields = ('id', 'created_by__username', 'label', 'notes', 'uri', 'extractor', 'tags_str')
- readonly_fields = ('created_at', 'modified_at', 'scheduled_crawls', 'crawls', 'snapshots', 'contents', 'available_config_options')
- fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'available_config_options', 'created_by', *readonly_fields[:-1])
+ readonly_fields = ('created_at', 'modified_at', 'scheduled_crawls', 'crawls', 'snapshots', 'contents')
+ fields = ('label', 'notes', 'uri', 'extractor', 'tags_str', 'config', 'created_by', *readonly_fields)
list_filter = ('extractor', 'created_by')
ordering = ['-created_at']
@@ -34,19 +42,19 @@ class SeedAdmin(ConfigEditorMixin, BaseModelAdmin):
return format_html_join(' ', ' - <a href="{}">{}</a>', (
(scheduledcrawl.admin_change_url, scheduledcrawl)
for scheduledcrawl in obj.scheduled_crawl_set.all().order_by('-created_at')[:20]
- )) or format_html('<i>No Scheduled Crawls yet...</i>')
+ )) or mark_safe('<i>No Scheduled Crawls yet...</i>')
def crawls(self, obj):
return format_html_join(' ', ' - <a href="{}">{}</a>', (
(crawl.admin_change_url, crawl)
for crawl in obj.crawl_set.all().order_by('-created_at')[:20]
- )) or format_html('<i>No Crawls yet...</i>')
+ )) or mark_safe('<i>No Crawls yet...</i>')
def snapshots(self, obj):
return format_html_join(' ', ' - <a href="{}">{}</a>', (
(snapshot.admin_change_url, snapshot)
for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
- )) or format_html('<i>No Snapshots yet...</i>')
+ )) or mark_safe('<i>No Snapshots yet...</i>')
def contents(self, obj):
if obj.uri.startswith('file:///data/'):
@@ -69,14 +77,81 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
sort_fields = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at')
search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'seed_id', 'schedule_id', 'status', 'seed__uri')
- readonly_fields = ('created_at', 'modified_at', 'snapshots', 'seed_contents', 'available_config_options')
- fields = ('label', 'notes', 'urls', 'config', 'available_config_options', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', *readonly_fields[:-1])
+ readonly_fields = ('created_at', 'modified_at', 'snapshots', 'seed_urls_editor')
+ fields = ('label', 'notes', 'seed_urls_editor', 'config', 'status', 'retry_at', 'max_depth', 'seed', 'schedule', 'created_by', 'created_at', 'modified_at', 'snapshots')
list_filter = ('max_depth', 'seed', 'schedule', 'created_by', 'status', 'retry_at')
ordering = ['-created_at', '-retry_at']
list_per_page = 100
actions = ["delete_selected"]
-
+ change_actions = ['recrawl']
+
+ @action(label='Recrawl', description='Create a new crawl with the same settings')
+ def recrawl(self, request, obj):
+ """Duplicate this crawl as a new crawl with the same seed and settings."""
+ from django.utils import timezone
+
+ new_crawl = Crawl.objects.create(
+ seed=obj.seed,
+ urls=obj.urls,
+ max_depth=obj.max_depth,
+ config=obj.config,
+ schedule=obj.schedule,
+ label=f"{obj.label} (recrawl)" if obj.label else "",
+ notes=obj.notes,
+ created_by=request.user,
+ status=Crawl.StatusChoices.QUEUED,
+ retry_at=timezone.now(),
+ )
+
+ messages.success(
+ request,
+ f'Created new crawl {new_crawl.id} with the same settings. '
+ f'It will start processing shortly.'
+ )
+
+ # Redirect to the new crawl's change page
+ from django.shortcuts import redirect
+ return redirect('admin:crawls_crawl_change', new_crawl.id)
+
+ def get_urls(self):
+ urls = super().get_urls()
+ custom_urls = [
+ path('<path:object_id>/save_seed_contents/',
+ self.admin_site.admin_view(self.save_seed_contents_view),
+ name='crawls_crawl_save_seed_contents'),
+ ]
+ return custom_urls + urls
+
+ def save_seed_contents_view(self, request, object_id):
+ """Handle saving seed file contents via AJAX."""
+ if request.method != 'POST':
+ return JsonResponse({'success': False, 'error': 'POST required'}, status=405)
+
+ try:
+ crawl = Crawl.objects.get(pk=object_id)
+ except Crawl.DoesNotExist:
+ return JsonResponse({'success': False, 'error': 'Crawl not found'}, status=404)
+
+ if not (crawl.seed and crawl.seed.uri and crawl.seed.uri.startswith('file:///data/')):
+ return JsonResponse({'success': False, 'error': 'Seed is not a local file'}, status=400)
+
+ try:
+ data = json.loads(request.body)
+ contents = data.get('contents', '')
+ except json.JSONDecodeError:
+ return JsonResponse({'success': False, 'error': 'Invalid JSON'}, status=400)
+
+ source_file = DATA_DIR / crawl.seed.uri.replace('file:///data/', '', 1)
+
+ try:
+ # Ensure parent directory exists
+ source_file.parent.mkdir(parents=True, exist_ok=True)
+ source_file.write_text(contents)
+ return JsonResponse({'success': True, 'message': f'Saved {len(contents)} bytes to {source_file.name}'})
+ except Exception as e:
+ return JsonResponse({'success': False, 'error': str(e)}, status=500)
+
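A hedged example of the AJAX request save_seed_contents_view() expects: a POST with a JSON body containing 'contents', sent to the route registered in get_urls() above. The crawl id and URL list are placeholders, and CSRF/session handling is omitted.

    import json, requests

    resp = requests.post(
        'http://127.0.0.1:8000/admin/crawls/crawl/<crawl-id>/save_seed_contents/',
        data=json.dumps({'contents': 'https://example.com\nhttps://example.org'}),
    )
    print(resp.json())   # {'success': True, 'message': 'Saved ... bytes to ...'} on success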
def num_snapshots(self, obj):
return obj.snapshot_set.count()
@@ -84,35 +159,175 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
return format_html_join(' ', '<a href="{}">{}</a>', (
(snapshot.admin_change_url, snapshot)
for snapshot in obj.snapshot_set.all().order_by('-created_at')[:20]
- )) or format_html('<i>No Snapshots yet...</i>')
-
+ )) or mark_safe('<i>No Snapshots yet...</i>')
+
@admin.display(description='Schedule', ordering='schedule')
def schedule_str(self, obj):
if not obj.schedule:
- return format_html('<i>None</i>')
+ return mark_safe('<i>None</i>')
return format_html('<a href="{}">{}</a>', obj.schedule.admin_change_url, obj.schedule)
-
+
@admin.display(description='Seed', ordering='seed')
def seed_str(self, obj):
if not obj.seed:
- return format_html('<i>None</i>')
+ return mark_safe('<i>None</i>')
return format_html('<a href="{}">{}</a>', obj.seed.admin_change_url, obj.seed)
-
- def seed_contents(self, obj):
- if not (obj.seed and obj.seed.uri):
- return format_html('None')
-
- if obj.seed.uri.startswith('file:///data/'):
- source_file = DATA_DIR / obj.seed.uri.replace('file:///data/', '', 1)
- contents = ""
+
+ @admin.display(description='URLs')
+ def seed_urls_editor(self, obj):
+ """Combined editor showing seed URL and file contents."""
+ widget_id = f'seed_urls_{obj.pk}'
+
+ # Get the seed URI (or use urls field if no seed)
+ seed_uri = ''
+ if obj.seed and obj.seed.uri:
+ seed_uri = obj.seed.uri
+ elif obj.urls:
+ seed_uri = obj.urls
+
+ # Check if it's a local file we can edit
+ is_file = seed_uri.startswith('file:///data/')
+ contents = ""
+ error = None
+ source_file = None
+
+ if is_file:
+ source_file = DATA_DIR / seed_uri.replace('file:///data/', '', 1)
try:
- contents = source_file.read_text().strip()[:14_000]
+ contents = source_file.read_text().strip()
except Exception as e:
- contents = f'Error reading {source_file}: {e}'
-
- return format_html('{}: