From bb53228ebfef090457af5cac85a5ff26b1e937ac Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Thu, 25 Dec 2025 01:52:38 -0800 Subject: [PATCH] remove Seed model in favor of Crawl as template --- ..._alter_outboundwebhook_options_and_more.py | 113 +++++++ archivebox/api/v1_core.py | 14 +- archivebox/api/v1_crawls.py | 55 +--- archivebox/cli/__init__.py | 1 + archivebox/cli/archivebox_add.py | 46 +-- archivebox/cli/archivebox_crawl.py | 6 +- archivebox/cli/archivebox_install.py | 17 +- archivebox/cli/archivebox_snapshot.py | 8 +- archivebox/cli/archivebox_version.py | 52 +-- archivebox/config/constants.py | 8 +- archivebox/config/django.py | 11 +- archivebox/config/paths.py | 6 - archivebox/core/admin_snapshots.py | 84 ++++- ...emove_archiveresult_output_dir_and_more.py | 101 ++++++ archivebox/core/settings.py | 6 +- archivebox/core/settings_logging.py | 21 +- archivebox/core/statemachines.py | 16 +- archivebox/core/views.py | 47 ++- archivebox/crawls/admin.py | 308 +++--------------- .../crawls/migrations/0002_drop_seed_model.py | 61 ++++ archivebox/crawls/models.py | 154 ++++----- archivebox/crawls/statemachines.py | 14 +- archivebox/logs/errors.log | 2 - ...0002_alter_dependency_bin_name_and_more.py | 65 ++++ archivebox/misc/jsonl.py | 6 +- archivebox/misc/logging.py | 8 +- archivebox/misc/logging_util.py | 8 +- archivebox/misc/monkey_patches.py | 14 +- .../templates/admin/progress_monitor.html | 10 +- archivebox/workers/supervisord_util.py | 213 +++++++----- 30 files changed, 785 insertions(+), 690 deletions(-) create mode 100644 archivebox/api/migrations/0002_alter_outboundwebhook_options_and_more.py create mode 100644 archivebox/core/migrations/0026_remove_archiveresult_output_dir_and_more.py create mode 100644 archivebox/crawls/migrations/0002_drop_seed_model.py delete mode 100644 archivebox/logs/errors.log create mode 100644 archivebox/machine/migrations/0002_alter_dependency_bin_name_and_more.py diff --git a/archivebox/api/migrations/0002_alter_outboundwebhook_options_and_more.py b/archivebox/api/migrations/0002_alter_outboundwebhook_options_and_more.py new file mode 100644 index 00000000..e2770792 --- /dev/null +++ b/archivebox/api/migrations/0002_alter_outboundwebhook_options_and_more.py @@ -0,0 +1,113 @@ +# Generated by Django 6.0 on 2025-12-25 09:34 + +import django.utils.timezone +import signal_webhooks.fields +import signal_webhooks.utils +import uuid +from django.conf import settings +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('api', '0001_squashed'), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.AlterModelOptions( + name='outboundwebhook', + options={'verbose_name': 'API Outbound Webhook'}, + ), + migrations.AddField( + model_name='outboundwebhook', + name='created', + field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now, help_text='When the webhook was created.', verbose_name='created'), + preserve_default=False, + ), + migrations.AddField( + model_name='outboundwebhook', + name='updated', + field=models.DateTimeField(auto_now=True, help_text='When the webhook was last updated.', verbose_name='updated'), + ), + migrations.AlterField( + model_name='apitoken', + name='created_at', + field=models.DateTimeField(db_index=True, default=django.utils.timezone.now), + ), + migrations.AlterField( + model_name='apitoken', + name='id', + field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, 
unique=True), + ), + migrations.AlterField( + model_name='outboundwebhook', + name='auth_token', + field=signal_webhooks.fields.TokenField(blank=True, default='', help_text='Authentication token to use in an Authorization header.', max_length=8000, validators=[signal_webhooks.utils.decode_cipher_key], verbose_name='authentication token'), + ), + migrations.AlterField( + model_name='outboundwebhook', + name='created_at', + field=models.DateTimeField(db_index=True, default=django.utils.timezone.now), + ), + migrations.AlterField( + model_name='outboundwebhook', + name='enabled', + field=models.BooleanField(default=True, help_text='Is this webhook enabled?', verbose_name='enabled'), + ), + migrations.AlterField( + model_name='outboundwebhook', + name='endpoint', + field=models.URLField(help_text='Target endpoint for this webhook.', max_length=2047, verbose_name='endpoint'), + ), + migrations.AlterField( + model_name='outboundwebhook', + name='headers', + field=models.JSONField(blank=True, default=dict, help_text='Headers to send with the webhook request.', validators=[signal_webhooks.utils.is_dict], verbose_name='headers'), + ), + migrations.AlterField( + model_name='outboundwebhook', + name='id', + field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True), + ), + migrations.AlterField( + model_name='outboundwebhook', + name='keep_last_response', + field=models.BooleanField(default=False, help_text='Should the webhook keep a log of the latest response it got?', verbose_name='keep last response'), + ), + migrations.AlterField( + model_name='outboundwebhook', + name='last_failure', + field=models.DateTimeField(default=None, help_text='When the webhook last failed.', null=True, verbose_name='last failure'), + ), + migrations.AlterField( + model_name='outboundwebhook', + name='last_response', + field=models.CharField(blank=True, default='', help_text='Latest response to this webhook.', max_length=8000, verbose_name='last response'), + ), + migrations.AlterField( + model_name='outboundwebhook', + name='last_success', + field=models.DateTimeField(default=None, help_text='When the webhook last succeeded.', null=True, verbose_name='last success'), + ), + migrations.AlterField( + model_name='outboundwebhook', + name='name', + field=models.CharField(db_index=True, help_text='Webhook name.', max_length=255, unique=True, verbose_name='name'), + ), + migrations.AlterField( + model_name='outboundwebhook', + name='ref', + field=models.CharField(db_index=True, help_text='Dot import notation to the model the webhook is for.', max_length=1023, validators=[signal_webhooks.utils.model_from_reference], verbose_name='referenced model'), + ), + migrations.AlterField( + model_name='outboundwebhook', + name='signal', + field=models.CharField(choices=[('CREATE', 'Create'), ('UPDATE', 'Update'), ('DELETE', 'Delete'), ('M2M', 'M2M changed'), ('CREATE_OR_UPDATE', 'Create or Update'), ('CREATE_OR_DELETE', 'Create or Delete'), ('CREATE_OR_M2M', 'Create or M2M changed'), ('UPDATE_OR_DELETE', 'Update or Delete'), ('UPDATE_OR_M2M', 'Update or M2M changed'), ('DELETE_OR_M2M', 'Delete or M2M changed'), ('CREATE_UPDATE_OR_DELETE', 'Create, Update or Delete'), ('CREATE_UPDATE_OR_M2M', 'Create, Update or M2M changed'), ('CREATE_DELETE_OR_M2M', 'Create, Delete or M2M changed'), ('UPDATE_DELETE_OR_M2M', 'Update, Delete or M2M changed'), ('CREATE_UPDATE_DELETE_OR_M2M', 'Create, Update or Delete, or M2M changed')], help_text='Signal the webhook fires to.', max_length=255, 
verbose_name='signal'), + ), + migrations.AddConstraint( + model_name='outboundwebhook', + constraint=models.UniqueConstraint(fields=('ref', 'endpoint'), name='prevent_duplicate_hooks_api_outboundwebhook'), + ), + ] diff --git a/archivebox/api/v1_core.py b/archivebox/api/v1_core.py index 4e1c3f25..31235e68 100644 --- a/archivebox/api/v1_core.py +++ b/archivebox/api/v1_core.py @@ -15,7 +15,7 @@ from ninja.pagination import paginate, PaginationBase from ninja.errors import HttpError from core.models import Snapshot, ArchiveResult, Tag -from api.v1_crawls import CrawlSchema, SeedSchema +from api.v1_crawls import CrawlSchema router = Router(tags=['Core Models']) @@ -271,9 +271,9 @@ def get_tag(request, tag_id: str, with_snapshots: bool = True): return Tag.objects.get(slug__icontains=tag_id) -@router.get("/any/{id}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, SeedSchema, CrawlSchema], url_name="get_any", summary="Get any object by its ID") +@router.get("/any/{id}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema, CrawlSchema], url_name="get_any", summary="Get any object by its ID") def get_any(request, id: str): - """Get any object by its ID (e.g. snapshot, archiveresult, tag, seed, crawl, etc.).""" + """Get any object by its ID (e.g. snapshot, archiveresult, tag, crawl, etc.).""" request.with_snapshots = False request.with_archiveresults = False @@ -285,14 +285,6 @@ def get_any(request, id: str): except Exception: pass - try: - from api.v1_crawls import get_seed - response = get_seed(request, id) - if response: - return redirect(f"/api/v1/{response._meta.app_label}/{response._meta.model_name}/{response.id}?{request.META['QUERY_STRING']}") - except Exception: - pass - try: from api.v1_crawls import get_crawl response = get_crawl(request, id) diff --git a/archivebox/api/v1_crawls.py b/archivebox/api/v1_crawls.py index d84f622d..600a0673 100644 --- a/archivebox/api/v1_crawls.py +++ b/archivebox/api/v1_crawls.py @@ -10,53 +10,13 @@ from django.contrib.auth import get_user_model from ninja import Router, Schema from core.models import Snapshot -from crawls.models import Seed, Crawl +from crawls.models import Crawl from .auth import API_AUTH_METHODS router = Router(tags=['Crawl Models'], auth=API_AUTH_METHODS) -class SeedSchema(Schema): - TYPE: str = 'crawls.models.Seed' - - id: UUID - - modified_at: datetime - created_at: datetime - created_by_id: str - created_by_username: str - - uri: str - tags_str: str - config: dict - - @staticmethod - def resolve_created_by_id(obj): - return str(obj.created_by_id) - - @staticmethod - def resolve_created_by_username(obj): - User = get_user_model() - return User.objects.get(id=obj.created_by_id).username - -@router.get("/seeds", response=List[SeedSchema], url_name="get_seeds") -def get_seeds(request): - return Seed.objects.all().distinct() - -@router.get("/seed/{seed_id}", response=SeedSchema, url_name="get_seed") -def get_seed(request, seed_id: str): - seed = None - request.with_snapshots = False - request.with_archiveresults = False - - try: - seed = Seed.objects.get(Q(id__icontains=seed_id)) - except Exception: - pass - return seed - - class CrawlSchema(Schema): TYPE: str = 'crawls.models.Crawl' @@ -66,24 +26,27 @@ class CrawlSchema(Schema): created_at: datetime created_by_id: str created_by_username: str - + status: str retry_at: datetime | None - seed: SeedSchema + urls: str + extractor: str max_depth: int - + tags_str: str + config: dict + # snapshots: List[SnapshotSchema] @staticmethod def resolve_created_by_id(obj): 
return str(obj.created_by_id) - + @staticmethod def resolve_created_by_username(obj): User = get_user_model() return User.objects.get(id=obj.created_by_id).username - + @staticmethod def resolve_snapshots(obj, context): if context['request'].with_snapshots: diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py index 4c2737ee..5a33e11a 100644 --- a/archivebox/cli/__init__.py +++ b/archivebox/cli/__init__.py @@ -21,6 +21,7 @@ class ArchiveBoxGroup(click.Group): meta_commands = { 'help': 'archivebox.cli.archivebox_help.main', 'version': 'archivebox.cli.archivebox_version.main', + 'mcp': 'archivebox.cli.archivebox_mcp.main', } setup_commands = { 'init': 'archivebox.cli.archivebox_init.main', diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index b668d26b..451ed0d3 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -36,15 +36,14 @@ def add(urls: str | list[str], created_by_id: int | None=None) -> QuerySet['Snapshot']: """Add a new URL or list of URLs to your archive. - The new flow is: + The flow is: 1. Save URLs to sources file - 2. Create Seed pointing to the file - 3. Create Crawl with max_depth - 4. Create root Snapshot pointing to file:// URL (depth=0) - 5. Orchestrator runs parser extractors on root snapshot - 6. Parser extractors output to urls.jsonl - 7. URLs are added to Crawl.urls and child Snapshots are created - 8. Repeat until max_depth is reached + 2. Create Crawl with URLs and max_depth + 3. Orchestrator creates Snapshots from Crawl URLs (depth=0) + 4. Orchestrator runs parser extractors on root snapshots + 5. Parser extractors output to urls.jsonl + 6. URLs are added to Crawl.urls and child Snapshots are created + 7. Repeat until max_depth is reached """ from rich import print @@ -55,7 +54,7 @@ def add(urls: str | list[str], # import models once django is set up from core.models import Snapshot - from crawls.models import Seed, Crawl + from crawls.models import Crawl from archivebox.base_models.models import get_or_create_system_user_pk from workers.orchestrator import Orchestrator @@ -66,19 +65,24 @@ def add(urls: str | list[str], sources_file.parent.mkdir(parents=True, exist_ok=True) sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls)) - # 2. Create a new Seed pointing to the sources file + # 2. Create a new Crawl with inline URLs cli_args = [*sys.argv] if cli_args[0].lower().endswith('archivebox'): cli_args[0] = 'archivebox' cmd_str = ' '.join(cli_args) timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S") - seed = Seed.from_file( - sources_file, + + # Read URLs directly into crawl + urls_content = sources_file.read_text() + + crawl = Crawl.objects.create( + urls=urls_content, + extractor=parser, + max_depth=depth, + tags_str=tag, label=f'{USER}@{HOSTNAME} $ {cmd_str} [{timestamp}]', - parser=parser, - tag=tag, - created_by=created_by_id, + created_by_id=created_by_id, config={ 'ONLY_NEW': not update, 'INDEX_ONLY': index_only, @@ -88,15 +92,13 @@ def add(urls: str | list[str], } ) - # 3. Create a new Crawl pointing to the Seed (status=queued) - crawl = Crawl.from_seed(seed, max_depth=depth) - print(f'[green]\\[+] Created Crawl {crawl.id} with max_depth={depth}[/green]') - print(f' [dim]Seed: {seed.uri}[/dim]') + first_url = crawl.get_urls_list()[0] if crawl.get_urls_list() else '' + print(f' [dim]First URL: {first_url}[/dim]') - # 4. 
The CrawlMachine will create the root Snapshot when started - # Root snapshot URL = file:///path/to/sources/...txt - # Parser extractors will run on it and discover URLs + # 3. The CrawlMachine will create the root Snapshot when started + # If URLs are from a file: first URL = file:///path/to/sources/...txt + # Parser extractors will run on it and discover more URLs # Those URLs become child Snapshots (depth=1) if index_only: diff --git a/archivebox/cli/archivebox_crawl.py b/archivebox/cli/archivebox_crawl.py index 0c7e4d16..4fb5d671 100644 --- a/archivebox/cli/archivebox_crawl.py +++ b/archivebox/cli/archivebox_crawl.py @@ -76,7 +76,7 @@ def discover_outlinks( ) from archivebox.base_models.models import get_or_create_system_user_pk from core.models import Snapshot, ArchiveResult - from crawls.models import Seed, Crawl + from crawls.models import Crawl from archivebox.config import CONSTANTS from workers.orchestrator import Orchestrator @@ -117,12 +117,12 @@ def discover_outlinks( sources_file.parent.mkdir(parents=True, exist_ok=True) sources_file.write_text('\n'.join(r.get('url', '') for r in new_url_records if r.get('url'))) - seed = Seed.from_file( + crawl = Crawl.from_file( sources_file, + max_depth=depth, label=f'crawl --depth={depth}', created_by=created_by_id, ) - crawl = Crawl.from_seed(seed, max_depth=depth) # Create snapshots for new URLs for record in new_url_records: diff --git a/archivebox/cli/archivebox_install.py b/archivebox/cli/archivebox_install.py index b797944d..f7cb4c1a 100755 --- a/archivebox/cli/archivebox_install.py +++ b/archivebox/cli/archivebox_install.py @@ -42,27 +42,20 @@ def install(dry_run: bool=False) -> None: setup_django() from django.utils import timezone - from crawls.models import Seed, Crawl + from crawls.models import Crawl from archivebox.base_models.models import get_or_create_system_user_pk - # Create a seed and crawl for dependency detection + # Create a crawl for dependency detection # Using a minimal crawl that will trigger on_Crawl hooks created_by_id = get_or_create_system_user_pk() - seed, _created = Seed.objects.get_or_create( - uri='archivebox://install', + crawl, created = Crawl.objects.get_or_create( + urls='archivebox://install', label='Dependency detection', created_by_id=created_by_id, defaults={ 'extractor': 'auto', - } - ) - - crawl, created = Crawl.objects.get_or_create( - seed=seed, - max_depth=0, - created_by_id=created_by_id, - defaults={ + 'max_depth': 0, 'status': 'queued', } ) diff --git a/archivebox/cli/archivebox_snapshot.py b/archivebox/cli/archivebox_snapshot.py index bb41af47..eb9a1e40 100644 --- a/archivebox/cli/archivebox_snapshot.py +++ b/archivebox/cli/archivebox_snapshot.py @@ -92,7 +92,7 @@ def create_snapshots( ) from archivebox.base_models.models import get_or_create_system_user_pk from core.models import Snapshot - from crawls.models import Seed, Crawl + from crawls.models import Crawl from archivebox.config import CONSTANTS created_by_id = created_by_id or get_or_create_system_user_pk() @@ -108,17 +108,17 @@ def create_snapshots( # If depth > 0, we need a Crawl to manage recursive discovery crawl = None if depth > 0: - # Create a seed for this batch + # Create a crawl for this batch sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__snapshot.txt' sources_file.parent.mkdir(parents=True, exist_ok=True) sources_file.write_text('\n'.join(r.get('url', '') for r in records if r.get('url'))) - seed = Seed.from_file( + crawl = Crawl.from_file( sources_file, + max_depth=depth, 
label=f'snapshot --depth={depth}', created_by=created_by_id, ) - crawl = Crawl.from_seed(seed, max_depth=depth) # Process each record created_snapshots = [] diff --git a/archivebox/cli/archivebox_version.py b/archivebox/cli/archivebox_version.py index c891b8ea..59902c4b 100755 --- a/archivebox/cli/archivebox_version.py +++ b/archivebox/cli/archivebox_version.py @@ -111,53 +111,27 @@ def version(quiet: bool=False, machine = Machine.current() - # Get all *_BINARY config values - binary_config_keys = [key for key in config.keys() if key.endswith('_BINARY')] + # Get all installed binaries from the database + all_installed = InstalledBinary.objects.filter( + machine=machine + ).exclude(abspath='').exclude(abspath__isnull=True).order_by('name') - if not binary_config_keys: - prnt('', '[grey53]No binary dependencies defined in config.[/grey53]') + if not all_installed.exists(): + prnt('', '[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]') else: - for key in sorted(set(binary_config_keys)): - # Get the actual binary name/path from config value - # Prioritize Machine.config overrides over base config - bin_value = machine.config.get(key) or config.get(key, '').strip() - if not bin_value: + for installed in all_installed: + # Skip if user specified specific binaries and this isn't one + if binaries and installed.name not in binaries: continue - # Check if it's a path (has slashes) or just a name - is_path = '/' in str(bin_value) - - if is_path: - # It's a full path - match against abspath - bin_name = Path(bin_value).name - # Skip if user specified specific binaries and this isn't one - if binaries and bin_name not in binaries: - continue - # Find InstalledBinary where abspath ends with this path - installed = InstalledBinary.objects.filter( - machine=machine, - abspath__endswith=bin_value, - ).exclude(abspath='').exclude(abspath__isnull=True).order_by('-modified_at').first() - else: - # It's just a binary name - match against name - bin_name = bin_value - # Skip if user specified specific binaries and this isn't one - if binaries and bin_name not in binaries: - continue - # Find InstalledBinary by name - installed = InstalledBinary.objects.filter( - machine=machine, - name__iexact=bin_name, - ).exclude(abspath='').exclude(abspath__isnull=True).order_by('-modified_at').first() - - if installed and installed.is_valid: + if installed.is_valid: display_path = installed.abspath.replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~') version_str = (installed.version or 'unknown')[:15] provider = (installed.binprovider or 'env')[:8] - prnt('', '[green]√[/green]', '', bin_name.ljust(18), version_str.ljust(16), provider.ljust(8), display_path, overflow='ignore', crop=False) + prnt('', '[green]√[/green]', '', installed.name.ljust(18), version_str.ljust(16), provider.ljust(8), display_path, overflow='ignore', crop=False) else: - prnt('', '[red]X[/red]', '', bin_name.ljust(18), '[grey53]not installed[/grey53]', overflow='ignore', crop=False) - failures.append(bin_name) + prnt('', '[red]X[/red]', '', installed.name.ljust(18), '[grey53]not installed[/grey53]', overflow='ignore', crop=False) + failures.append(installed.name) # Show hint if no binaries are installed yet has_any_installed = InstalledBinary.objects.filter(machine=machine).exclude(abspath='').exists() diff --git a/archivebox/config/constants.py b/archivebox/config/constants.py index 80894b58..a5c29ff4 100644 --- a/archivebox/config/constants.py +++ b/archivebox/config/constants.py @@ 
-96,10 +96,8 @@ class ConstantsDict(Mapping): # Data dir files CONFIG_FILENAME: str = 'ArchiveBox.conf' SQL_INDEX_FILENAME: str = 'index.sqlite3' - QUEUE_DATABASE_FILENAME: str = 'queue.sqlite3' CONFIG_FILE: Path = DATA_DIR / CONFIG_FILENAME DATABASE_FILE: Path = DATA_DIR / SQL_INDEX_FILENAME - QUEUE_DATABASE_FILE: Path = DATA_DIR / QUEUE_DATABASE_FILENAME JSON_INDEX_FILENAME: str = 'index.json' HTML_INDEX_FILENAME: str = 'index.html' @@ -184,10 +182,10 @@ class ConstantsDict(Mapping): SQL_INDEX_FILENAME, f"{SQL_INDEX_FILENAME}-wal", f"{SQL_INDEX_FILENAME}-shm", - QUEUE_DATABASE_FILENAME, - f"{QUEUE_DATABASE_FILENAME}-wal", - f"{QUEUE_DATABASE_FILENAME}-shm", "search.sqlite3", + "queue.sqlite3", + "queue.sqlite3-wal", + "queue.sqlite3-shm", JSON_INDEX_FILENAME, HTML_INDEX_FILENAME, ROBOTS_TXT_FILENAME, diff --git a/archivebox/config/django.py b/archivebox/config/django.py index d7910ec0..9b06db7b 100644 --- a/archivebox/config/django.py +++ b/archivebox/config/django.py @@ -56,6 +56,14 @@ def setup_django(check_db=False, in_memory_db=False) -> None: os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}" 2>/dev/null') os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}"/* 2>/dev/null') + # Suppress the "database access during app initialization" warning + # This warning can be triggered during django.setup() but is safe to ignore + # since we're doing intentional setup operations + import warnings + warnings.filterwarnings('ignore', + message='.*Accessing the database during app initialization.*', + category=RuntimeWarning) + try: from django.core.management import call_command @@ -87,7 +95,8 @@ def setup_django(check_db=False, in_memory_db=False) -> None: style='bold red', )) STDERR.print() - STDERR.print_exception(show_locals=False) + import traceback + traceback.print_exc() return from django.conf import settings diff --git a/archivebox/config/paths.py b/archivebox/config/paths.py index 7c6fcdd7..0eeb84f8 100644 --- a/archivebox/config/paths.py +++ b/archivebox/config/paths.py @@ -224,12 +224,6 @@ def get_data_locations(): "is_valid": os.path.isfile(DATABASE_FILE) and os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK), "is_mount": os.path.ismount(DATABASE_FILE.resolve()), }, - "QUEUE_DATABASE": { - "path": CONSTANTS.QUEUE_DATABASE_FILE, - "enabled": True, - "is_valid": os.path.isfile(CONSTANTS.QUEUE_DATABASE_FILE) and os.access(CONSTANTS.QUEUE_DATABASE_FILE, os.R_OK) and os.access(CONSTANTS.QUEUE_DATABASE_FILE, os.W_OK), - "is_mount": os.path.ismount(CONSTANTS.QUEUE_DATABASE_FILE), - }, "ARCHIVE_DIR": { "path": ARCHIVE_DIR.resolve(), "enabled": True, diff --git a/archivebox/core/admin_snapshots.py b/archivebox/core/admin_snapshots.py index d25f291c..bd73c363 100644 --- a/archivebox/core/admin_snapshots.py +++ b/archivebox/core/admin_snapshots.py @@ -33,15 +33,18 @@ GLOBAL_CONTEXT = {} class SnapshotActionForm(ActionForm): - tags = forms.ModelMultipleChoiceField( - label='Edit tags', - queryset=Tag.objects.all(), - required=False, - widget=FilteredSelectMultiple( - 'core_tag__name', - False, - ), - ) + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # Define tags field in __init__ to avoid database access during app initialization + self.fields['tags'] = forms.ModelMultipleChoiceField( + label='Edit tags', + queryset=Tag.objects.all(), + required=False, + widget=FilteredSelectMultiple( + 'core_tag__name', + False, + ), + ) # TODO: allow selecting actions for specific extractors? is this useful? 
# extractor = forms.ChoiceField( @@ -165,14 +168,69 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin): def admin_actions(self, obj): return format_html( - # URL Hash: {}
''' - Summary page ➑️     - Result files πŸ“‘     - Admin actions βš™οΈ +
+ + πŸ“„ Summary Page + + + πŸ“ Result Files + + + πŸ”— Original URL + + + + + + ⬇️ Get Missing + + + πŸ†• Archive Again + + + πŸ”„ Redo All + + + ☠️ Delete + +
+

+ Tip: Action buttons link to the list view with this snapshot pre-selected. Select it and use the action dropdown to execute. +

''', obj.timestamp, obj.timestamp, + obj.url, + obj.pk, + obj.pk, + obj.pk, obj.pk, ) diff --git a/archivebox/core/migrations/0026_remove_archiveresult_output_dir_and_more.py b/archivebox/core/migrations/0026_remove_archiveresult_output_dir_and_more.py new file mode 100644 index 00000000..dfead5b3 --- /dev/null +++ b/archivebox/core/migrations/0026_remove_archiveresult_output_dir_and_more.py @@ -0,0 +1,101 @@ +# Generated by Django 6.0 on 2025-12-25 09:34 + +import archivebox.base_models.models +import django.db.models.deletion +import django.utils.timezone +import uuid +from django.conf import settings +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0025_allow_duplicate_urls_per_crawl'), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.RemoveField( + model_name='archiveresult', + name='output_dir', + ), + migrations.RemoveField( + model_name='snapshot', + name='output_dir', + ), + migrations.AlterField( + model_name='archiveresult', + name='created_at', + field=models.DateTimeField(db_index=True, default=django.utils.timezone.now), + ), + migrations.AlterField( + model_name='archiveresult', + name='created_by', + field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresult_set', to=settings.AUTH_USER_MODEL), + ), + migrations.AlterField( + model_name='archiveresult', + name='extractor', + field=models.CharField(db_index=True, max_length=32), + ), + migrations.AlterField( + model_name='archiveresult', + name='id', + field=models.AutoField(editable=False, primary_key=True, serialize=False), + ), + migrations.AlterField( + model_name='archiveresult', + name='status', + field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15), + ), + migrations.AlterField( + model_name='archiveresult', + name='uuid', + field=models.UUIDField(blank=True, db_index=True, default=uuid.uuid7, null=True, unique=True), + ), + migrations.AlterField( + model_name='snapshot', + name='bookmarked_at', + field=models.DateTimeField(db_index=True, default=django.utils.timezone.now), + ), + migrations.AlterField( + model_name='snapshot', + name='created_at', + field=models.DateTimeField(db_index=True, default=django.utils.timezone.now), + ), + migrations.AlterField( + model_name='snapshot', + name='created_by', + field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to=settings.AUTH_USER_MODEL), + ), + migrations.AlterField( + model_name='snapshot', + name='downloaded_at', + field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True), + ), + migrations.AlterField( + model_name='snapshot', + name='id', + field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True), + ), + # migrations.AlterField( + # model_name='snapshot', + # name='tags', + # field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', through_fields=('snapshot', 'tag'), to='core.tag'), + # ), + migrations.AlterField( + model_name='snapshottag', + name='id', + field=models.AutoField(primary_key=True, serialize=False), + ), + migrations.AlterField( + model_name='tag', + name='created_by', + 
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL), + ), + migrations.AlterUniqueTogether( + name='snapshottag', + unique_together={('snapshot', 'tag')}, + ), + ] diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index d051229d..295dcfa4 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -59,7 +59,7 @@ INSTALLED_APPS = [ "config", # ArchiveBox config settings (loaded as a plugin, don't need to add it here) "machine", # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc. "workers", # handles starting and managing background workers and processes (orchestrators and actors) - "crawls", # handles Seed, Crawl, and CrawlSchedule models and management + "crawls", # handles Crawl and CrawlSchedule models and management "personas", # handles Persona and session management "core", # core django model with Snapshot, ArchiveResult, etc. "api", # Django-Ninja-based Rest API interfaces, config, APIToken model, etc. @@ -194,10 +194,6 @@ DATABASES = { "NAME": DATABASE_NAME, **SQLITE_CONNECTION_OPTIONS, }, - "queue": { - "NAME": CONSTANTS.QUEUE_DATABASE_FILE, - **SQLITE_CONNECTION_OPTIONS, - }, # "filestore": { # "NAME": CONSTANTS.FILESTORE_DATABASE_FILE, # **SQLITE_CONNECTION_OPTIONS, diff --git a/archivebox/core/settings_logging.py b/archivebox/core/settings_logging.py index 3b4ecd05..85d6404a 100644 --- a/archivebox/core/settings_logging.py +++ b/archivebox/core/settings_logging.py @@ -2,8 +2,6 @@ __package__ = 'archivebox.core' import re import os - -import shutil import tempfile import logging @@ -11,7 +9,6 @@ import pydantic import django.template from archivebox.config import CONSTANTS -from archivebox.misc.logging import IS_TTY IGNORABLE_URL_PATTERNS = [ @@ -79,7 +76,6 @@ SETTINGS_LOGGING = { "formatters": { "rich": { "datefmt": "[%Y-%m-%d %H:%M:%S]", - # "format": "{asctime} {levelname} {module} {name} {message} {username}", "format": "%(name)s %(message)s", }, "outbound_webhooks": { @@ -99,26 +95,13 @@ SETTINGS_LOGGING = { }, }, "handlers": { - # "console": { - # "level": "DEBUG", - # 'formatter': 'simple', - # "class": "logging.StreamHandler", - # 'filters': ['noisyrequestsfilter', 'add_extra_logging_attrs'], - # }, "default": { "class": "rich.logging.RichHandler", "formatter": "rich", "level": "DEBUG", "markup": False, - "rich_tracebacks": IS_TTY, + "rich_tracebacks": False, # Use standard Python tracebacks (no frame/box) "filters": ["noisyrequestsfilter"], - "tracebacks_suppress": [ - django, - pydantic, - ], - "tracebacks_width": shutil.get_terminal_size((100, 10)).columns - 1, - "tracebacks_word_wrap": False, - "tracebacks_show_locals": False, }, "logfile": { "level": "INFO", @@ -132,7 +115,7 @@ SETTINGS_LOGGING = { "outbound_webhooks": { "class": "rich.logging.RichHandler", "markup": False, - "rich_tracebacks": True, + "rich_tracebacks": False, # Use standard Python tracebacks (no frame/box) "formatter": "outbound_webhooks", }, # "mail_admins": { diff --git a/archivebox/core/statemachines.py b/archivebox/core/statemachines.py index fde35403..eccefbbd 100644 --- a/archivebox/core/statemachines.py +++ b/archivebox/core/statemachines.py @@ -15,7 +15,7 @@ from statemachine import State, StateMachine # from workers.actor import ActorType from core.models import Snapshot, ArchiveResult -from crawls.models import Crawl, Seed +from crawls.models import 
Crawl class SnapshotMachine(StateMachine, strict_states=True): @@ -247,17 +247,14 @@ class ArchiveResultMachine(StateMachine, strict_states=True): ) self.archiveresult.save(write_indexes=True) - # Increment health stats on ArchiveResult, Snapshot, and optionally Crawl/Seed + # Increment health stats on ArchiveResult, Snapshot, and optionally Crawl ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(num_uses_succeeded=F('num_uses_succeeded') + 1) Snapshot.objects.filter(pk=self.archiveresult.snapshot_id).update(num_uses_succeeded=F('num_uses_succeeded') + 1) - # Also update Crawl and Seed health stats if snapshot has a crawl + # Also update Crawl health stats if snapshot has a crawl snapshot = self.archiveresult.snapshot if snapshot.crawl_id: Crawl.objects.filter(pk=snapshot.crawl_id).update(num_uses_succeeded=F('num_uses_succeeded') + 1) - crawl = Crawl.objects.filter(pk=snapshot.crawl_id).values_list('seed_id', flat=True).first() - if crawl: - Seed.objects.filter(pk=crawl).update(num_uses_succeeded=F('num_uses_succeeded') + 1) @failed.enter def enter_failed(self): @@ -268,17 +265,14 @@ class ArchiveResultMachine(StateMachine, strict_states=True): end_ts=timezone.now(), ) - # Increment health stats on ArchiveResult, Snapshot, and optionally Crawl/Seed + # Increment health stats on ArchiveResult, Snapshot, and optionally Crawl ArchiveResult.objects.filter(pk=self.archiveresult.pk).update(num_uses_failed=F('num_uses_failed') + 1) Snapshot.objects.filter(pk=self.archiveresult.snapshot_id).update(num_uses_failed=F('num_uses_failed') + 1) - # Also update Crawl and Seed health stats if snapshot has a crawl + # Also update Crawl health stats if snapshot has a crawl snapshot = self.archiveresult.snapshot if snapshot.crawl_id: Crawl.objects.filter(pk=snapshot.crawl_id).update(num_uses_failed=F('num_uses_failed') + 1) - crawl = Crawl.objects.filter(pk=snapshot.crawl_id).values_list('seed_id', flat=True).first() - if crawl: - Seed.objects.filter(pk=crawl).update(num_uses_failed=F('num_uses_failed') + 1) @skipped.enter def enter_skipped(self): diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 3f9b1794..4c6932df 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -33,7 +33,7 @@ from archivebox.search import query_search_index from core.models import Snapshot from core.forms import AddLinkForm -from crawls.models import Seed, Crawl +from crawls.models import Crawl from archivebox.hooks import get_extractors, get_extractor_name @@ -119,7 +119,11 @@ class SnapshotView(View): if result_file.name in existing_files or result_file.name == 'index.html': continue - file_size = result_file.stat().st_size or 0 + # Skip circular symlinks and other stat() failures + try: + file_size = result_file.stat().st_size or 0 + except OSError: + continue if file_size > min_size_threshold: archiveresults[result_file.name] = { @@ -471,14 +475,16 @@ class AddView(UserPassesTestMixin, FormView): sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__web_ui_add_by_user_{self.request.user.pk}.txt' sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls)) - # 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__web_ui_add_by_user_.txt + # 2. 
create a new Crawl with the URLs from the file timestamp = timezone.now().strftime("%Y-%m-%d__%H-%M-%S") - seed = Seed.from_file( - sources_file, + urls_content = sources_file.read_text() + crawl = Crawl.objects.create( + urls=urls_content, + extractor=parser, + max_depth=depth, + tags_str=tag, label=f'{self.request.user.username}@{HOSTNAME}{self.request.path} {timestamp}', - parser=parser, - tag=tag, - created_by=self.request.user.pk, + created_by_id=self.request.user.pk, config={ # 'ONLY_NEW': not update, # 'INDEX_ONLY': index_only, @@ -486,9 +492,8 @@ class AddView(UserPassesTestMixin, FormView): 'DEPTH': depth, 'EXTRACTORS': extractors or '', # 'DEFAULT_PERSONA': persona or 'Default', - }) - # 3. create a new Crawl pointing to the Seed - crawl = Crawl.from_seed(seed, max_depth=depth) + } + ) # 4. start the Orchestrator & wait until it completes # ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ... @@ -569,19 +574,7 @@ def live_progress_view(request): # Count URLs in the crawl (for when snapshots haven't been created yet) urls_count = 0 if crawl.urls: - urls_count = len([u for u in crawl.urls.split('\n') if u.strip()]) - elif crawl.seed and crawl.seed.uri: - # Try to get URL count from seed - if crawl.seed.uri.startswith('file:///'): - try: - from pathlib import Path - seed_file = Path(crawl.seed.uri.replace('file://', '')) - if seed_file.exists(): - urls_count = len([l for l in seed_file.read_text().split('\n') if l.strip() and not l.startswith('#')]) - except: - pass - else: - urls_count = 1 # Single URL seed + urls_count = len([u for u in crawl.urls.split('\n') if u.strip() and not u.startswith('#')]) # Calculate crawl progress crawl_progress = int((completed_snapshots / total_snapshots) * 100) if total_snapshots > 0 else 0 @@ -635,8 +628,8 @@ def live_progress_view(request): }) # Check if crawl can start (for debugging stuck crawls) - can_start = bool(crawl.seed and crawl.seed.uri) - seed_uri = crawl.seed.uri[:60] if crawl.seed and crawl.seed.uri else None + can_start = bool(crawl.urls) + urls_preview = crawl.urls[:60] if crawl.urls else None # Check if retry_at is in the future (would prevent worker from claiming) retry_at_future = crawl.retry_at > timezone.now() if crawl.retry_at else False @@ -657,7 +650,7 @@ def live_progress_view(request): 'pending_snapshots': pending_snapshots, 'active_snapshots': active_snapshots_for_crawl, 'can_start': can_start, - 'seed_uri': seed_uri, + 'urls_preview': urls_preview, 'retry_at_future': retry_at_future, 'seconds_until_retry': seconds_until_retry, }) diff --git a/archivebox/crawls/admin.py b/archivebox/crawls/admin.py index e5e7f2eb..fa41f851 100644 --- a/archivebox/crawls/admin.py +++ b/archivebox/crawls/admin.py @@ -17,7 +17,7 @@ from django_object_actions import action from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin from core.models import Snapshot -from crawls.models import Seed, Crawl, CrawlSchedule +from crawls.models import Crawl, CrawlSchedule def render_snapshots_list(snapshots_qs, limit=20): @@ -136,16 +136,16 @@ def render_snapshots_list(snapshots_qs, limit=20): ''') -class SeedAdmin(ConfigEditorMixin, BaseModelAdmin): - list_display = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str', 'crawls', 'num_crawls', 'num_snapshots') - sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'uri', 'extractor', 'tags_str') - search_fields = ('id', 'created_by__username', 'label', 'notes', 
'uri', 'extractor', 'tags_str') +class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): + list_display = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'urls_preview', 'schedule_str', 'status', 'retry_at', 'num_snapshots') + sort_fields = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'schedule_str', 'status', 'retry_at') + search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'schedule_id', 'status', 'urls') - readonly_fields = ('created_at', 'modified_at', 'scheduled_crawls', 'crawls', 'snapshots', 'contents') + readonly_fields = ('created_at', 'modified_at', 'snapshots', 'urls_editor') fieldsets = ( - ('Source', { - 'fields': ('uri', 'contents'), + ('URLs', { + 'fields': ('urls_editor',), 'classes': ('card', 'wide'), }), ('Info', { @@ -153,83 +153,7 @@ class SeedAdmin(ConfigEditorMixin, BaseModelAdmin): 'classes': ('card',), }), ('Settings', { - 'fields': ('extractor', 'config'), - 'classes': ('card',), - }), - ('Metadata', { - 'fields': ('created_by', 'created_at', 'modified_at'), - 'classes': ('card',), - }), - ('Crawls', { - 'fields': ('scheduled_crawls', 'crawls'), - 'classes': ('card',), - }), - ('Snapshots', { - 'fields': ('snapshots',), - 'classes': ('card',), - }), - ) - - list_filter = ('extractor', 'created_by') - ordering = ['-created_at'] - list_per_page = 100 - actions = ["delete_selected"] - - def num_crawls(self, obj): - return obj.crawl_set.count() - - def num_snapshots(self, obj): - return obj.snapshot_set.count() - - def scheduled_crawls(self, obj): - return format_html_join('
', ' - {}', ( - (scheduledcrawl.admin_change_url, scheduledcrawl) - for scheduledcrawl in obj.scheduled_crawl_set.all().order_by('-created_at')[:20] - )) or mark_safe('No Scheduled Crawls yet...') - - def crawls(self, obj): - return format_html_join('
', ' - {}', ( - (crawl.admin_change_url, crawl) - for crawl in obj.crawl_set.all().order_by('-created_at')[:20] - )) or mark_safe('No Crawls yet...') - - def snapshots(self, obj): - return render_snapshots_list(obj.snapshot_set.all()) - - def contents(self, obj): - source_file = obj.get_file_path() - if source_file: - contents = "" - try: - contents = source_file.read_text().strip()[:14_000] - except Exception as e: - contents = f'Error reading {source_file}: {e}' - - return format_html('{}:
{}
', source_file, contents) - - return format_html('See URLs here: {}', obj.uri, obj.uri) - - - - -class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): - list_display = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at', 'num_snapshots') - sort_fields = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'seed_str', 'schedule_str', 'status', 'retry_at') - search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'seed_id', 'schedule_id', 'status', 'seed__uri') - - readonly_fields = ('created_at', 'modified_at', 'snapshots', 'seed_urls_editor') - - fieldsets = ( - ('URLs', { - 'fields': ('seed_urls_editor',), - 'classes': ('card', 'wide'), - }), - ('Info', { - 'fields': ('label', 'notes'), - 'classes': ('card',), - }), - ('Settings', { - 'fields': ('max_depth', 'config'), + 'fields': ('max_depth', 'extractor', 'config'), 'classes': ('card',), }), ('Status', { @@ -237,7 +161,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): 'classes': ('card',), }), ('Relations', { - 'fields': ('seed', 'schedule', 'created_by'), + 'fields': ('schedule', 'created_by'), 'classes': ('card',), }), ('Timestamps', { @@ -250,7 +174,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): }), ) - list_filter = ('max_depth', 'seed', 'schedule', 'created_by', 'status', 'retry_at') + list_filter = ('max_depth', 'extractor', 'schedule', 'created_by', 'status', 'retry_at') ordering = ['-created_at', '-retry_at'] list_per_page = 100 actions = ["delete_selected"] @@ -258,23 +182,20 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): @action(label='Recrawl', description='Create a new crawl with the same settings') def recrawl(self, request, obj): - """Duplicate this crawl as a new crawl with the same seed and settings.""" + """Duplicate this crawl as a new crawl with the same URLs and settings.""" from django.utils import timezone from django.shortcuts import redirect - # Validate seed has a URI (required for crawl to start) - if not obj.seed: - messages.error(request, 'Cannot recrawl: original crawl has no seed.') - return redirect('admin:crawls_crawl_change', obj.id) - - if not obj.seed.uri: - messages.error(request, 'Cannot recrawl: seed has no URI.') + # Validate URLs (required for crawl to start) + if not obj.urls: + messages.error(request, 'Cannot recrawl: original crawl has no URLs.') return redirect('admin:crawls_crawl_change', obj.id) new_crawl = Crawl.objects.create( - seed=obj.seed, urls=obj.urls, + extractor=obj.extractor, max_depth=obj.max_depth, + tags_str=obj.tags_str, config=obj.config, schedule=obj.schedule, label=f"{obj.label} (recrawl)" if obj.label else "", @@ -292,43 +213,6 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): return redirect('admin:crawls_crawl_change', new_crawl.id) - def get_urls(self): - urls = super().get_urls() - custom_urls = [ - path('/save_seed_contents/', - self.admin_site.admin_view(self.save_seed_contents_view), - name='crawls_crawl_save_seed_contents'), - ] - return custom_urls + urls - - def save_seed_contents_view(self, request, object_id): - """Handle saving seed file contents via AJAX.""" - if request.method != 'POST': - return JsonResponse({'success': False, 'error': 'POST required'}, status=405) - - try: - crawl = Crawl.objects.get(pk=object_id) - except Crawl.DoesNotExist: - return JsonResponse({'success': False, 'error': 'Crawl not found'}, status=404) - - source_file = crawl.seed.get_file_path() if crawl.seed else None - if not source_file: - return 
JsonResponse({'success': False, 'error': 'Seed is not a local file'}, status=400) - - try: - data = json.loads(request.body) - contents = data.get('contents', '') - except json.JSONDecodeError: - return JsonResponse({'success': False, 'error': 'Invalid JSON'}, status=400) - - try: - # Ensure parent directory exists - source_file.parent.mkdir(parents=True, exist_ok=True) - source_file.write_text(contents) - return JsonResponse({'success': True, 'message': f'Saved {len(contents)} bytes to {source_file.name}'}) - except Exception as e: - return JsonResponse({'success': False, 'error': str(e)}, status=500) - def num_snapshots(self, obj): return obj.snapshot_set.count() @@ -341,163 +225,68 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): return mark_safe('None') return format_html('{}', obj.schedule.admin_change_url, obj.schedule) - @admin.display(description='Seed', ordering='seed') - def seed_str(self, obj): - if not obj.seed: - return mark_safe('None') - return format_html('{}', obj.seed.admin_change_url, obj.seed) + @admin.display(description='URLs', ordering='urls') + def urls_preview(self, obj): + first_url = obj.get_urls_list()[0] if obj.get_urls_list() else '' + return first_url[:80] + '...' if len(first_url) > 80 else first_url @admin.display(description='URLs') - def seed_urls_editor(self, obj): - """Combined editor showing seed URL and file contents.""" - widget_id = f'seed_urls_{obj.pk}' - - # Get the seed URI (or use urls field if no seed) - seed_uri = '' - if obj.seed and obj.seed.uri: - seed_uri = obj.seed.uri - elif obj.urls: - seed_uri = obj.urls + def urls_editor(self, obj): + """Editor for crawl URLs.""" + widget_id = f'crawl_urls_{obj.pk}' # Check if it's a local file we can edit - source_file = obj.seed.get_file_path() if obj.seed else None + source_file = obj.get_file_path() is_file = source_file is not None - contents = "" + file_contents = "" error = None if is_file and source_file: try: - contents = source_file.read_text().strip() + file_contents = source_file.read_text().strip() except Exception as e: error = f'Error reading {source_file}: {e}' # Escape for safe HTML embedding - escaped_uri = seed_uri.replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"') - escaped_contents = (contents or '').replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"') + escaped_urls = (obj.urls or '').replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"') + escaped_file_contents = file_contents.replace('&', '&').replace('<', '<').replace('>', '>').replace('"', '"') # Count lines for auto-expand logic - line_count = len(contents.split('\n')) if contents else 0 - uri_rows = min(max(1, seed_uri.count('\n') + 1), 3) + line_count = len((obj.urls or '').split('\n')) + file_line_count = len(file_contents.split('\n')) if file_contents else 0 + uri_rows = min(max(3, line_count), 10) html = f'''
- +
- - + placeholder="https://example.com https://example2.com # Comments start with #" + readonly>{escaped_urls} +

+ {line_count} URL{'s' if line_count != 1 else ''} Β· URLs are read-only in admin, edit via API or CLI +

{"" if not is_file else f''' - +
{"
" + error + "
" if error else ""} - -
- -
- - - +
'''} - {"" if is_file else f''' -
- {seed_uri} -
- '''} - -
''' return mark_safe(html) @@ -507,7 +296,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin): class CrawlScheduleAdmin(BaseModelAdmin): list_display = ('id', 'created_at', 'created_by', 'label', 'notes', 'template_str', 'crawls', 'num_crawls', 'num_snapshots') sort_fields = ('id', 'created_at', 'created_by', 'label', 'notes', 'template_str') - search_fields = ('id', 'created_by__username', 'label', 'notes', 'schedule_id', 'template_id', 'template__seed__uri') + search_fields = ('id', 'created_by__username', 'label', 'notes', 'schedule_id', 'template_id', 'template__urls') readonly_fields = ('created_at', 'modified_at', 'crawls', 'snapshots') @@ -561,6 +350,5 @@ class CrawlScheduleAdmin(BaseModelAdmin): def register_admin(admin_site): - admin_site.register(Seed, SeedAdmin) admin_site.register(Crawl, CrawlAdmin) admin_site.register(CrawlSchedule, CrawlScheduleAdmin) diff --git a/archivebox/crawls/migrations/0002_drop_seed_model.py b/archivebox/crawls/migrations/0002_drop_seed_model.py new file mode 100644 index 00000000..f0a66af5 --- /dev/null +++ b/archivebox/crawls/migrations/0002_drop_seed_model.py @@ -0,0 +1,61 @@ +# Generated by Django 6.0 on 2025-12-25 09:34 + +import archivebox.base_models.models +import django.db.models.deletion +import pathlib +import uuid +from django.conf import settings +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('crawls', '0001_initial'), + migrations.swappable_dependency(settings.AUTH_USER_MODEL), + ] + + operations = [ + migrations.RemoveField( + model_name='crawl', + name='seed', + ), + migrations.AddField( + model_name='crawl', + name='extractor', + field=models.CharField(default='auto', help_text='Parser for reading URLs (auto, html, json, rss, etc)', max_length=32), + ), + migrations.AlterField( + model_name='crawl', + name='created_by', + field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL), + ), + migrations.AlterField( + model_name='crawl', + name='id', + field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True), + ), + migrations.AlterField( + model_name='crawl', + name='output_dir', + field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/Users/squash/Local/Code/archiveboxes/archivebox-nue/data/archive')), + ), + migrations.AlterField( + model_name='crawl', + name='urls', + field=models.TextField(help_text='Newline-separated list of URLs to crawl'), + ), + migrations.AlterField( + model_name='crawlschedule', + name='created_by', + field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL), + ), + migrations.AlterField( + model_name='crawlschedule', + name='id', + field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True), + ), + migrations.DeleteModel( + name='Seed', + ), + ] diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py index 4bd00328..263869fe 100644 --- a/archivebox/crawls/models.py +++ b/archivebox/crawls/models.py @@ -20,91 +20,6 @@ if TYPE_CHECKING: from core.models import Snapshot, ArchiveResult -class Seed(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats): - id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) - created_at = 
models.DateTimeField(default=timezone.now, db_index=True) - created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False) - modified_at = models.DateTimeField(auto_now=True) - - uri = models.URLField(max_length=2048) - extractor = models.CharField(default='auto', max_length=32) - tags_str = models.CharField(max_length=255, null=False, blank=True, default='') - label = models.CharField(max_length=255, null=False, blank=True, default='') - config = models.JSONField(default=dict) - output_dir = models.FilePathField(path=settings.ARCHIVE_DIR, null=False, blank=True, default='') - notes = models.TextField(blank=True, null=False, default='') - - crawl_set: models.Manager['Crawl'] - - class Meta: - verbose_name = 'Seed' - verbose_name_plural = 'Seeds' - unique_together = (('created_by', 'uri', 'extractor'), ('created_by', 'label')) - - def __str__(self): - return f'[{self.id}] {self.uri[:64]}' - - def save(self, *args, **kwargs): - is_new = self._state.adding - super().save(*args, **kwargs) - if is_new: - from archivebox.misc.logging_util import log_worker_event - log_worker_event( - worker_type='DB', - event='Created Seed', - indent_level=0, - metadata={ - 'id': str(self.id), - 'uri': str(self.uri)[:64], - 'extractor': self.extractor, - 'label': self.label or None, - }, - ) - - @classmethod - def from_file(cls, source_file: Path, label: str = '', parser: str = 'auto', tag: str = '', created_by=None, config=None): - # Use absolute path for file:// URLs so extractors can find the files - source_path = str(source_file.resolve()) - seed, _ = cls.objects.get_or_create( - label=label or source_file.name, uri=f'file://{source_path}', - created_by_id=getattr(created_by, 'pk', created_by) or get_or_create_system_user_pk(), - extractor=parser, tags_str=tag, config=config or {}, - ) - return seed - - @property - def source_type(self): - return self.uri.split('://', 1)[0].lower() - - @property - def api_url(self) -> str: - return reverse_lazy('api-1:get_seed', args=[self.id]) - - def get_file_path(self) -> Path | None: - """ - Get the filesystem path for file:// URIs. - Handles both old format (file:///data/...) and new format (file:///absolute/path). - Returns None if URI is not a file:// URI. - """ - if not self.uri.startswith('file://'): - return None - - # Remove file:// prefix - path_str = self.uri.replace('file://', '', 1) - - # Handle old format: file:///data/... -> DATA_DIR/... 
- if path_str.startswith('/data/'): - return CONSTANTS.DATA_DIR / path_str.replace('/data/', '', 1) - - # Handle new format: file:///absolute/path - return Path(path_str) - - @property - def snapshot_set(self) -> QuerySet['Snapshot']: - from core.models import Snapshot - return Snapshot.objects.filter(crawl_id__in=self.crawl_set.values_list('pk', flat=True)) - - class CrawlSchedule(ModelWithSerializers, ModelWithNotes, ModelWithHealthStats): id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) created_at = models.DateTimeField(default=timezone.now, db_index=True) @@ -124,14 +39,15 @@ class CrawlSchedule(ModelWithSerializers, ModelWithNotes, ModelWithHealthStats): verbose_name_plural = 'Scheduled Crawls' def __str__(self) -> str: - return f'[{self.id}] {self.template.seed.uri[:64] if self.template and self.template.seed else ""} @ {self.schedule}' + urls_preview = self.template.urls[:64] if self.template and self.template.urls else "" + return f'[{self.id}] {urls_preview} @ {self.schedule}' @property def api_url(self) -> str: return reverse_lazy('api-1:get_any', args=[self.id]) def save(self, *args, **kwargs): - self.label = self.label or (self.template.seed.label if self.template and self.template.seed else '') + self.label = self.label or (self.template.label if self.template else '') super().save(*args, **kwargs) if self.template: self.template.schedule = self @@ -144,8 +60,8 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False) modified_at = models.DateTimeField(auto_now=True) - seed = models.ForeignKey(Seed, on_delete=models.PROTECT, related_name='crawl_set', null=False, blank=False) - urls = models.TextField(blank=True, null=False, default='') + urls = models.TextField(blank=False, null=False, help_text='Newline-separated list of URLs to crawl') + extractor = models.CharField(default='auto', max_length=32, help_text='Parser for reading URLs (auto, html, json, rss, etc)') config = models.JSONField(default=dict) max_depth = models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)]) tags_str = models.CharField(max_length=1024, blank=True, null=False, default='') @@ -171,31 +87,40 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith verbose_name_plural = 'Crawls' def __str__(self): - return f'[{self.id}] {self.seed.uri[:64] if self.seed else ""}' + first_url = self.get_urls_list()[0] if self.get_urls_list() else '' + return f'[{self.id}] {first_url[:64]}' def save(self, *args, **kwargs): is_new = self._state.adding super().save(*args, **kwargs) if is_new: from archivebox.misc.logging_util import log_worker_event + first_url = self.get_urls_list()[0] if self.get_urls_list() else '' log_worker_event( worker_type='DB', event='Created Crawl', indent_level=1, metadata={ 'id': str(self.id), - 'seed_uri': str(self.seed.uri)[:64] if self.seed else None, + 'first_url': first_url[:64], 'max_depth': self.max_depth, 'status': self.status, }, ) @classmethod - def from_seed(cls, seed: Seed, max_depth: int = 0, persona: str = 'Default', tags_str: str = '', config=None, created_by=None): - crawl, _ = cls.objects.get_or_create( - seed=seed, max_depth=max_depth, tags_str=tags_str or seed.tags_str, - config=seed.config or config or {}, - created_by_id=getattr(created_by, 'pk', created_by) or seed.created_by_id, + def from_file(cls, source_file: 
diff --git a/archivebox/crawls/statemachines.py b/archivebox/crawls/statemachines.py
index a71cd010..45cb62fc 100644
--- a/archivebox/crawls/statemachines.py
+++ b/archivebox/crawls/statemachines.py
@@ -42,11 +42,12 @@ class CrawlMachine(StateMachine, strict_states=True):
         return self.__repr__()
 
     def can_start(self) -> bool:
-        if not self.crawl.seed:
-            print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no seed[/red]')
+        if not self.crawl.urls:
+            print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no URLs[/red]')
             return False
-        if not self.crawl.seed.uri:
-            print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: seed has no URI[/red]')
+        urls_list = self.crawl.get_urls_list()
+        if not urls_list:
+            print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no valid URLs in urls field[/red]')
             return False
         return True
 
@@ -121,13 +122,14 @@ class CrawlMachine(StateMachine, strict_states=True):
         output_dir.mkdir(parents=True, exist_ok=True)
 
         # Run all on_Crawl hooks
+        first_url = self.crawl.get_urls_list()[0] if self.crawl.get_urls_list() else ''
         results = run_hooks(
             event_name='Crawl',
             output_dir=output_dir,
             timeout=60,
-            config_objects=[self.crawl, self.crawl.seed] if self.crawl.seed else [self.crawl],
+            config_objects=[self.crawl],
             crawl_id=str(self.crawl.id),
-            seed_uri=self.crawl.seed.uri if self.crawl.seed else '',
+            seed_uri=first_url,
         )
 
         # Process hook results - parse JSONL output and create DB objects
diff --git a/archivebox/logs/errors.log b/archivebox/logs/errors.log
deleted file mode 100644
index 715cf9d3..00000000
--- a/archivebox/logs/errors.log
+++ /dev/null
@@ -1,2 +0,0 @@
- 
-> /Users/squash/Local/Code/archiveboxes/archivebox-nue/archivebox/cli/archivebox_init.py --force; TS=2025-12-25__08:03:12 VERSION=0.9.0rc1 IN_DOCKER=False IS_TTY=False
diff --git a/archivebox/machine/migrations/0002_alter_dependency_bin_name_and_more.py b/archivebox/machine/migrations/0002_alter_dependency_bin_name_and_more.py
new file mode 100644
index 00000000..6df9a423
--- /dev/null
+++ b/archivebox/machine/migrations/0002_alter_dependency_bin_name_and_more.py
@@ -0,0 +1,65 @@
+# Generated by Django 6.0 on 2025-12-25 09:34
+
+import django.db.models.deletion
+import uuid
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('machine', '0001_squashed'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='dependency',
+            name='bin_name',
+            field=models.CharField(db_index=True, help_text='Binary executable name (e.g., wget, yt-dlp, chromium)', max_length=63, unique=True),
+        ),
+        migrations.AlterField(
+            model_name='dependency',
+            name='bin_providers',
+            field=models.CharField(default='*', help_text='Comma-separated list of allowed providers: apt,brew,pip,npm,gem,nix,custom or * for any', max_length=127),
+        ),
+        migrations.AlterField(
+            model_name='dependency',
+            name='config',
+            field=models.JSONField(blank=True, default=dict, help_text='JSON map of env var config to use during install'),
+        ),
+        migrations.AlterField(
+            model_name='dependency',
+            name='custom_cmds',
+            field=models.JSONField(blank=True, default=dict, help_text="JSON map of provider -> custom install command (e.g., {'apt': 'apt install -y wget'})"),
+        ),
+        migrations.AlterField(
+            model_name='dependency',
+            name='id',
+            field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+        ),
+        migrations.AlterField(
+            model_name='installedbinary',
+            name='dependency',
+            field=models.ForeignKey(blank=True, help_text='The Dependency this binary satisfies', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='installedbinary_set', to='machine.dependency'),
+        ),
+        migrations.AlterField(
+            model_name='installedbinary',
+            name='id',
+            field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+        ),
+        migrations.AlterField(
+            model_name='machine',
+            name='config',
+            field=models.JSONField(blank=True, default=dict, help_text='Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)'),
+        ),
+        migrations.AlterField(
+            model_name='machine',
+            name='id',
+            field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+        ),
+        migrations.AlterField(
+            model_name='networkinterface',
+            name='id',
+            field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+        ),
+    ]
diff --git a/archivebox/misc/jsonl.py b/archivebox/misc/jsonl.py
index 212ecc66..ba295cf5 100644
--- a/archivebox/misc/jsonl.py
+++ b/archivebox/misc/jsonl.py
@@ -27,10 +27,9 @@ TYPE_SNAPSHOT = 'Snapshot'
 TYPE_ARCHIVERESULT = 'ArchiveResult'
 TYPE_TAG = 'Tag'
 TYPE_CRAWL = 'Crawl'
-TYPE_SEED = 'Seed'
 TYPE_INSTALLEDBINARY = 'InstalledBinary'
 
-VALID_TYPES = {TYPE_SNAPSHOT, TYPE_ARCHIVERESULT, TYPE_TAG, TYPE_CRAWL, TYPE_SEED, TYPE_INSTALLEDBINARY}
+VALID_TYPES = {TYPE_SNAPSHOT, TYPE_ARCHIVERESULT, TYPE_TAG, TYPE_CRAWL, TYPE_INSTALLEDBINARY}
 
 
 def parse_line(line: str) -> Optional[Dict[str, Any]]:
@@ -206,7 +205,8 @@ def crawl_to_jsonl(crawl) -> Dict[str, Any]:
     return {
         'type': TYPE_CRAWL,
         'id': str(crawl.id),
-        'seed_id': str(crawl.seed_id),
+        'urls': crawl.urls,
+        'extractor': crawl.extractor,
         'status': crawl.status,
         'max_depth': crawl.max_depth,
         'created_at': crawl.created_at.isoformat() if crawl.created_at else None,
diff --git a/archivebox/misc/logging.py b/archivebox/misc/logging.py
index 341c6c90..c571a903 100644
--- a/archivebox/misc/logging.py
+++ b/archivebox/misc/logging.py
@@ -13,9 +13,11 @@ from rich.console import Console
 from rich.highlighter import Highlighter
 
 # SETUP RICH CONSOLE / TTY detection / COLOR / PROGRESS BARS
-CONSOLE = Console()
-STDERR = Console(stderr=True)
-IS_TTY = CONSOLE.is_interactive
+# Disable wrapping - use soft_wrap=True and large width so text flows naturally
+# Colors are preserved, just no hard line breaks inserted
+CONSOLE = Console(width=32768, soft_wrap=True, force_terminal=True)
+STDERR = Console(stderr=True, width=32768, soft_wrap=True, force_terminal=True)
+IS_TTY = sys.stdout.isatty()
 
 class RainbowHighlighter(Highlighter):
     def highlight(self, text):
diff --git a/archivebox/misc/logging_util.py b/archivebox/misc/logging_util.py
index 766eed98..1016539e 100644
--- a/archivebox/misc/logging_util.py
+++ b/archivebox/misc/logging_util.py
@@ -603,21 +603,17 @@ def log_worker_event(
     # Build final message
     error_str = f' {type(error).__name__}: {error}' if error else ''
 
-    # Build colored message - worker_label needs to be inside color tags
-    # But first we need to format the color tags separately from the worker label
     from archivebox.misc.logging import CONSOLE
     from rich.text import Text
 
     # Create a Rich Text object for proper formatting
     text = Text()
-    text.append(indent)  # Indentation
-    # Append worker label and event with color
+    text.append(indent)
     text.append(f'{worker_label} {event}{error_str}', style=color)
-    # Append metadata without color (add separator if metadata exists)
     if metadata_str:
         text.append(f' | {metadata_str}')
-    CONSOLE.print(text)
+    CONSOLE.print(text, soft_wrap=True)
 
 
 @enforce_types
diff --git a/archivebox/misc/monkey_patches.py b/archivebox/misc/monkey_patches.py
index 12ed05a1..2bfb7924 100644
--- a/archivebox/misc/monkey_patches.py
+++ b/archivebox/misc/monkey_patches.py
@@ -1,7 +1,5 @@
 __package__ = 'archivebox'
 
-import sys
-import shutil
 
 import django
 import pydantic
@@ -20,14 +18,10 @@ timezone.utc = datetime.timezone.utc
 # DjangoSignalWebhooksConfig.verbose_name = 'API'
 
 
-# Install rich for pretty tracebacks in console logs
-# https://rich.readthedocs.io/en/stable/traceback.html#traceback-handler
-
-from rich.traceback import install  # noqa
-
-TERM_WIDTH = (shutil.get_terminal_size((200, 10)).columns - 1) if sys.stdout.isatty() else 200
-# os.environ.setdefault('COLUMNS', str(TERM_WIDTH))
-install(show_locals=True, word_wrap=False, locals_max_length=10, locals_hide_dunder=True, suppress=[django, pydantic], extra_lines=2, width=TERM_WIDTH)
+# Rich traceback handler disabled - it adds frames/boxes that wrap weirdly in log files
+# Standard Python tracebacks are used instead (full width, no frames)
+# from rich.traceback import install
+# install(show_locals=True, word_wrap=False, ...)
 
 
 # Hide site-packages/sonic/client.py:115: SyntaxWarning
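Reviewer note (illustrative, not part of the patch): with seed_id gone, a Crawl JSONL record now carries the urls blob and extractor inline. A rough sketch of what a consumer sees per line; the keys match crawl_to_jsonl() above, the values are invented.

    # Sketch only: round-trips one JSONL line using the new Crawl record shape.
    import json

    line = json.dumps({
        'type': 'Crawl',
        'id': '0193f6a2-0000-7000-8000-000000000000',   # uuid7-style id, made up
        'urls': 'https://example.com\nhttps://example.org',
        'extractor': 'auto',
        'status': 'queued',
        'max_depth': 1,
        'created_at': '2025-12-25T09:34:00+00:00',
    })
    record = json.loads(line)
    assert record['type'] in {'Snapshot', 'ArchiveResult', 'Tag', 'Crawl', 'InstalledBinary'}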
diff --git a/archivebox/templates/admin/progress_monitor.html b/archivebox/templates/admin/progress_monitor.html
index 1b9d9dde..3b5299af 100644
--- a/archivebox/templates/admin/progress_monitor.html
+++ b/archivebox/templates/admin/progress_monitor.html
@@ -552,21 +552,21 @@
         if (crawl.status === 'queued' && !crawl.can_start) {
             warningHtml = `
-                ⚠️ Crawl cannot start: ${crawl.seed_uri ? 'unknown error' : 'no seed URI'}
+                ⚠️ Crawl cannot start: ${crawl.urls_preview ? 'unknown error' : 'no URLs'}
             `;
         } else if (crawl.status === 'queued' && crawl.retry_at_future) {
             // Queued but retry_at is in future (was claimed by worker, will retry)
             warningHtml = `
-                🔄 Retrying in ${crawl.seconds_until_retry}s...${crawl.seed_uri ? ` (${crawl.seed_uri})` : ''}
+                🔄 Retrying in ${crawl.seconds_until_retry}s...${crawl.urls_preview ? ` (${crawl.urls_preview})` : ''}
             `;
         } else if (crawl.status === 'queued' && crawl.total_snapshots === 0) {
             // Queued and waiting to be picked up by worker
             warningHtml = `
-                ⏳ Waiting for worker to pick up...${crawl.seed_uri ? ` (${crawl.seed_uri})` : ''}
+                ⏳ Waiting for worker to pick up...${crawl.urls_preview ? ` (${crawl.urls_preview})` : ''}
             `;
         }
 
@@ -577,8 +577,8 @@
             metaText += ` | ${crawl.total_snapshots} snapshots`;
         } else if (crawl.urls_count > 0) {
             metaText += ` | ${crawl.urls_count} URLs`;
-        } else if (crawl.seed_uri) {
-            metaText += ` | ${crawl.seed_uri.substring(0, 40)}${crawl.seed_uri.length > 40 ? '...' : ''}`;
+        } else if (crawl.urls_preview) {
+            metaText += ` | ${crawl.urls_preview.substring(0, 40)}${crawl.urls_preview.length > 40 ? '...' : ''}`;
         }
 
         return `
diff --git a/archivebox/workers/supervisord_util.py b/archivebox/workers/supervisord_util.py
index 898f87fe..691adae4 100644
--- a/archivebox/workers/supervisord_util.py
+++ b/archivebox/workers/supervisord_util.py
@@ -26,6 +26,9 @@ CONFIG_FILE_NAME = "supervisord.conf"
 PID_FILE_NAME = "supervisord.pid"
 WORKERS_DIR_NAME = "workers"
 
+# Global reference to supervisord process for cleanup
+_supervisord_proc = None
+
 ORCHESTRATOR_WORKER = {
     "name": "worker_orchestrator",
     "command": "archivebox manage orchestrator",  # runs forever by default
@@ -78,7 +81,7 @@ def create_supervisord_config():
     config_content = f"""
 [supervisord]
 nodaemon = true
-environment = IS_SUPERVISORD_PARENT="true"
+environment = IS_SUPERVISORD_PARENT="true",COLUMNS="200"
 pidfile = {PID_FILE}
 logfile = {LOG_FILE}
 childlogdir = {CONSTANTS.LOGS_DIR}
@@ -143,11 +146,27 @@ def get_existing_supervisord_process():
     return None
 
 def stop_existing_supervisord_process():
+    global _supervisord_proc
     SOCK_FILE = get_sock_file()
     PID_FILE = SOCK_FILE.parent / PID_FILE_NAME
-    
+
     try:
-        # if pid file exists, load PID int
+        # First try to stop via the global proc reference
+        if _supervisord_proc and _supervisord_proc.poll() is None:
+            try:
+                print(f"[🦸‍♂️] Stopping supervisord process (pid={_supervisord_proc.pid})...")
+                _supervisord_proc.terminate()
+                try:
+                    _supervisord_proc.wait(timeout=5)
+                except subprocess.TimeoutExpired:
+                    _supervisord_proc.kill()
+                    _supervisord_proc.wait(timeout=2)
+            except (BaseException, BrokenPipeError, IOError, KeyboardInterrupt):
+                pass
+            _supervisord_proc = None
+            return
+
+        # Fallback: if pid file exists, load PID int and kill that process
         try:
             pid = int(PID_FILE.read_text())
         except (FileNotFoundError, ValueError):
@@ -156,8 +175,25 @@ def stop_existing_supervisord_process():
         try:
             print(f"[🦸‍♂️] Stopping supervisord process (pid={pid})...")
             proc = psutil.Process(pid)
+            # Kill the entire process group to ensure all children are stopped
+            children = proc.children(recursive=True)
             proc.terminate()
+            # Also terminate all children
+            for child in children:
+                try:
+                    child.terminate()
+                except psutil.NoSuchProcess:
+                    pass
             proc.wait(timeout=5)
+            # Kill any remaining children
+            for child in children:
+                try:
+                    if child.is_running():
+                        child.kill()
+                except psutil.NoSuchProcess:
+                    pass
+        except psutil.NoSuchProcess:
+            pass
         except (BaseException, BrokenPipeError, IOError, KeyboardInterrupt):
             pass
     finally:
@@ -174,7 +210,7 @@ def start_new_supervisord_process(daemonize=False):
     LOG_FILE = CONSTANTS.LOGS_DIR / LOG_FILE_NAME
     CONFIG_FILE = SOCK_FILE.parent / CONFIG_FILE_NAME
     PID_FILE = SOCK_FILE.parent / PID_FILE_NAME
-    
+
     print(f"[🦸‍♂️] Supervisord starting{' in background' if daemonize else ''}...")
     pretty_log_path = pretty_path(LOG_FILE)
     print(f" > Writing supervisord logs to: {pretty_log_path}")
@@ -182,50 +218,54 @@
     print(f' > Using supervisord config file: {pretty_path(CONFIG_FILE)}')
     print(f" > Using supervisord UNIX socket: {pretty_path(SOCK_FILE)}")
     print()
-    
+
     # clear out existing stale state files
shutil.rmtree(WORKERS_DIR, ignore_errors=True) PID_FILE.unlink(missing_ok=True) get_sock_file().unlink(missing_ok=True) CONFIG_FILE.unlink(missing_ok=True) - + # create the supervisord config file create_supervisord_config() - # Start supervisord - # panel = Panel(f"Starting supervisord with config: {SUPERVISORD_CONFIG_FILE}") - # with Live(panel, refresh_per_second=1) as live: - - subprocess.Popen( - f"supervisord --configuration={CONFIG_FILE}", - stdin=None, - shell=True, - start_new_session=daemonize, - ) + # Open log file for supervisord output + LOG_FILE.parent.mkdir(parents=True, exist_ok=True) + log_handle = open(LOG_FILE, 'a') - def exit_signal_handler(signum, frame): - if signum == 2: - STDERR.print("\n[πŸ›‘] Got Ctrl+C. Terminating child processes...") - elif signum != 13: - STDERR.print(f"\n[πŸ¦Έβ€β™‚οΈ] Supervisord got stop signal ({signal.strsignal(signum)}). Terminating child processes...") - stop_existing_supervisord_process() - raise SystemExit(0) + if daemonize: + # Start supervisord in background (daemon mode) + subprocess.Popen( + f"supervisord --configuration={CONFIG_FILE}", + stdin=None, + stdout=log_handle, + stderr=log_handle, + shell=True, + start_new_session=True, + ) + time.sleep(2) + return get_existing_supervisord_process() + else: + # Start supervisord in FOREGROUND - this will block until supervisord exits + # supervisord with nodaemon=true will run in foreground and handle signals properly + # When supervisord gets SIGINT/SIGTERM, it will stop all child processes before exiting + proc = subprocess.Popen( + f"supervisord --configuration={CONFIG_FILE}", + stdin=None, + stdout=log_handle, + stderr=log_handle, + shell=True, + start_new_session=False, # Keep in same process group so signals propagate + ) - # Monitor for termination signals and cleanup child processes - if not daemonize: - try: - signal.signal(signal.SIGINT, exit_signal_handler) - signal.signal(signal.SIGHUP, exit_signal_handler) - signal.signal(signal.SIGPIPE, exit_signal_handler) - signal.signal(signal.SIGTERM, exit_signal_handler) - except Exception: - # signal handlers only work in main thread - pass - # otherwise supervisord will containue in background even if parent proc is ends (aka daemon mode) + # Store the process so we can wait on it later + global _supervisord_proc + _supervisord_proc = proc - time.sleep(2) + # Wait a bit for supervisord to start up + time.sleep(2) + + return get_existing_supervisord_process() - return get_existing_supervisord_process() def get_or_create_supervisord_process(daemonize=False): SOCK_FILE = get_sock_file() @@ -353,9 +393,15 @@ def tail_worker_logs(log_path: str): pass -def tail_multiple_worker_logs(log_files: list[str], follow=True): - """Tail multiple log files simultaneously, interleaving their output.""" - import select +def tail_multiple_worker_logs(log_files: list[str], follow=True, proc=None): + """Tail multiple log files simultaneously, interleaving their output. 
+ + Args: + log_files: List of log file paths to tail + follow: Whether to keep following (True) or just read existing content (False) + proc: Optional subprocess.Popen object - stop tailing when this process exits + """ + import re from pathlib import Path # Convert relative paths to absolute paths @@ -377,48 +423,53 @@ def tail_multiple_worker_logs(log_files: list[str], follow=True): for log_path in log_paths: try: f = open(log_path, 'r') - # Seek to end of file if following - if follow: - f.seek(0, 2) # Seek to end - file_handles.append((log_path.name, f)) + # Don't seek to end - show recent content so user sees something + # Go to end minus 4KB to show some recent logs + f.seek(0, 2) # Go to end first + file_size = f.tell() + if file_size > 4096: + f.seek(file_size - 4096) + f.readline() # Skip partial line + else: + f.seek(0) # Small file, read from start + + file_handles.append((log_path, f)) + print(f" [tailing {log_path.name}]") except Exception as e: - print(f"[yellow]Warning: Could not open {log_path}: {e}[/yellow]") + sys.stderr.write(f"Warning: Could not open {log_path}: {e}\n") if not file_handles: - print("[red]No log files could be opened[/red]") + sys.stderr.write("No log files could be opened\n") return - # Print which logs we're tailing - log_names = [name for name, _ in file_handles] - print(f"[dim]Tailing: {', '.join(log_names)}[/dim]") print() try: while follow: - # Read available lines from all files - for log_name, f in file_handles: - line = f.readline() - if line: - # Colorize based on log source - if 'orchestrator' in log_name.lower(): - color = 'cyan' - elif 'daphne' in log_name.lower(): - color = 'green' - else: - color = 'white' + # Check if the monitored process has exited + if proc is not None and proc.poll() is not None: + print(f"\n[server process exited with code {proc.returncode}]") + break + had_output = False + # Read ALL available lines from all files (not just one per iteration) + for log_path, f in file_handles: + while True: + line = f.readline() + if not line: + break # No more lines available in this file + had_output = True # Strip ANSI codes if present (supervisord does this but just in case) - import re line_clean = re.sub(r'\x1b\[[0-9;]*m', '', line.rstrip()) - if line_clean: - print(f'[{color}][{log_name}][/{color}] {line_clean}') + print(line_clean) - # Small sleep to avoid busy-waiting - time.sleep(0.1) + # Small sleep to avoid busy-waiting (only when no output) + if not had_output: + time.sleep(0.05) except (KeyboardInterrupt, BrokenPipeError, IOError): - print("\n[yellow][i] Stopped tailing logs[/i][/yellow]") + pass # Let the caller handle the cleanup message except SystemExit: pass finally: @@ -451,6 +502,8 @@ def watch_worker(supervisor, daemon_name, interval=5): def start_server_workers(host='0.0.0.0', port='8000', daemonize=False): + global _supervisord_proc + supervisor = get_or_create_supervisord_process(daemonize=daemonize) bg_workers = [ @@ -466,36 +519,50 @@ def start_server_workers(host='0.0.0.0', port='8000', daemonize=False): if not daemonize: try: - watch_worker(supervisor, "worker_daphne") + # Tail worker logs while supervisord runs + sys.stdout.write('Tailing worker logs (Ctrl+C to stop)...\n\n') + sys.stdout.flush() + tail_multiple_worker_logs( + log_files=['logs/worker_daphne.log', 'logs/worker_orchestrator.log'], + follow=True, + proc=_supervisord_proc, # Stop tailing when supervisord exits + ) except (KeyboardInterrupt, BrokenPipeError, IOError): STDERR.print("\n[πŸ›‘] Got Ctrl+C, stopping gracefully...") except 
SystemExit:
             pass
         except BaseException as e:
-            STDERR.print(f"\n[🛑] Got {e.__class__.__name__} exception, stopping web server gracefully...")
-            raise
+            STDERR.print(f"\n[🛑] Got {e.__class__.__name__} exception, stopping gracefully...")
         finally:
-            stop_worker(supervisor, "worker_daphne")
+            # Ensure supervisord and all children are stopped
+            stop_existing_supervisord_process()
             time.sleep(0.5)
 
 
 def start_cli_workers(watch=False):
+    global _supervisord_proc
+
     supervisor = get_or_create_supervisord_process(daemonize=False)
 
     start_worker(supervisor, ORCHESTRATOR_WORKER)
 
     if watch:
         try:
-            watch_worker(supervisor, ORCHESTRATOR_WORKER['name'])
+            # Block on supervisord process - it will handle signals and stop children
+            if _supervisord_proc:
+                _supervisord_proc.wait()
+            else:
+                # Fallback to watching worker if no proc reference
+                watch_worker(supervisor, ORCHESTRATOR_WORKER['name'])
         except (KeyboardInterrupt, BrokenPipeError, IOError):
             STDERR.print("\n[🛑] Got Ctrl+C, stopping gracefully...")
         except SystemExit:
             pass
         except BaseException as e:
-            STDERR.print(f"\n[🛑] Got {e.__class__.__name__} exception, stopping orchestrator gracefully...")
-            raise
+            STDERR.print(f"\n[🛑] Got {e.__class__.__name__} exception, stopping gracefully...")
         finally:
-            stop_worker(supervisor, ORCHESTRATOR_WORKER['name'])
+            # Ensure supervisord and all children are stopped
+            stop_existing_supervisord_process()
             time.sleep(0.5)
 
     return [ORCHESTRATOR_WORKER]
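Reviewer note (illustrative, not part of the patch): a rough sketch of the new foreground flow that these helpers implement, assuming a shell where supervisord is on PATH; the config path here is an assumption, and error handling is trimmed.

    # Sketch only: start supervisord, tail the worker logs until it exits, then clean up.
    import subprocess

    proc = subprocess.Popen('supervisord --configuration=supervisord.conf', shell=True)  # path is hypothetical
    try:
        tail_multiple_worker_logs(
            log_files=['logs/worker_daphne.log', 'logs/worker_orchestrator.log'],
            follow=True,
            proc=proc,   # tailing returns once this process exits (or on Ctrl+C)
        )
    finally:
        stop_existing_supervisord_process()
        proc.wait(timeout=5)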