actually working migration path from 0.7.2 and 0.8.6 + renames and test coverage

This commit is contained in:
Nick Sweeting
2026-01-01 15:49:56 -08:00
parent 6fadcf5168
commit 876feac522
33 changed files with 825 additions and 333 deletions

View File

@@ -83,15 +83,15 @@ class ConstantsDict(Mapping):
CRONTABS_DIR_NAME: str = 'crontabs'
CACHE_DIR_NAME: str = 'cache'
LOGS_DIR_NAME: str = 'logs'
USER_PLUGINS_DIR_NAME: str = 'user_plugins'
CUSTOM_TEMPLATES_DIR_NAME: str = 'user_templates'
CUSTOM_PLUGINS_DIR_NAME: str = 'custom_plugins'
CUSTOM_TEMPLATES_DIR_NAME: str = 'custom_templates'
ARCHIVE_DIR: Path = DATA_DIR / ARCHIVE_DIR_NAME
SOURCES_DIR: Path = DATA_DIR / SOURCES_DIR_NAME
PERSONAS_DIR: Path = DATA_DIR / PERSONAS_DIR_NAME
LOGS_DIR: Path = DATA_DIR / LOGS_DIR_NAME
CACHE_DIR: Path = DATA_DIR / CACHE_DIR_NAME
CUSTOM_TEMPLATES_DIR: Path = DATA_DIR / CUSTOM_TEMPLATES_DIR_NAME
USER_PLUGINS_DIR: Path = DATA_DIR / USER_PLUGINS_DIR_NAME
USER_PLUGINS_DIR: Path = DATA_DIR / CUSTOM_PLUGINS_DIR_NAME
# Data dir files
CONFIG_FILENAME: str = 'ArchiveBox.conf'
@@ -171,8 +171,11 @@ class ConstantsDict(Mapping):
TMP_DIR_NAME,
PERSONAS_DIR_NAME,
CUSTOM_TEMPLATES_DIR_NAME,
USER_PLUGINS_DIR_NAME,
CUSTOM_PLUGINS_DIR_NAME,
CRONTABS_DIR_NAME,
# Backwards compatibility with old directory names
"user_plugins", # old name for USER_PLUGINS_DIR (now 'custom_plugins')
"user_templates", # old name for CUSTOM_TEMPLATES_DIR (now 'custom_templates')
"static", # created by old static exports <v0.6.0
"sonic", # created by docker bind mount / sonic FTS process
".git",

View File

@@ -117,7 +117,7 @@ class SnapshotAdminForm(forms.ModelForm):
class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
form = SnapshotAdminForm
list_display = ('created_at', 'title_str', 'status_with_progress', 'files', 'size_with_stats', 'url_str')
list_display = ('created_at', 'title_str', 'status_with_progress', 'files', 'size_with_stats', 'health_display', 'url_str')
sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
readonly_fields = ('admin_actions', 'status_info', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir', 'archiveresults_list')
search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
@@ -488,6 +488,12 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
obj.url[:128],
)
@admin.display(description='Health', ordering='health')
def health_display(self, obj):
    """Render the snapshot's health score as a color-coded span (green/orange/red)."""
    score = obj.health
    if score >= 80:
        color = 'green'
    elif score >= 50:
        color = 'orange'
    else:
        color = 'red'
    return format_html('<span style="color: {};">{}</span>', color, score)
def grid_view(self, request, extra_context=None):
# cl = self.get_changelist_instance(request)

View File

@@ -3,6 +3,7 @@
# Handles both fresh installs and upgrades from v0.7.2/v0.8.6rc0
from django.db import migrations, models, connection
import django.utils.timezone
def get_table_columns(table_name):
@@ -95,31 +96,31 @@ def upgrade_core_tables(apps, schema_editor):
# ============================================================================
# PART 2: Upgrade core_snapshot table
# ============================================================================
# Create table with NEW field names for timestamps (bookmarked_at, created_at, modified_at)
# and all other fields needed by later migrations
cursor.execute("""
CREATE TABLE IF NOT EXISTS core_snapshot_new (
id TEXT PRIMARY KEY NOT NULL,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
url TEXT NOT NULL,
timestamp VARCHAR(32) NOT NULL UNIQUE,
bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
title VARCHAR(512),
crawl_id TEXT,
parent_snapshot_id TEXT,
title VARCHAR(512),
bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
downloaded_at DATETIME,
status VARCHAR(15) NOT NULL DEFAULT 'queued',
retry_at DATETIME,
depth INTEGER NOT NULL DEFAULT 0,
fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0',
config TEXT NOT NULL DEFAULT '{}',
notes TEXT NOT NULL DEFAULT '',
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
status VARCHAR(15) NOT NULL DEFAULT 'queued',
retry_at DATETIME,
current_step INTEGER NOT NULL DEFAULT 0,
FOREIGN KEY (crawl_id) REFERENCES crawls_crawl(id) ON DELETE CASCADE,
@@ -141,29 +142,23 @@ def upgrade_core_tables(apps, schema_editor):
has_bookmarked_at = 'bookmarked_at' in snapshot_cols
if has_added and not has_bookmarked_at:
# Migrating from v0.7.2 (has added/updated, no bookmarked_at/created_at/modified_at)
# Migrating from v0.7.2 (has added/updated fields)
print('Migrating Snapshot from v0.7.2 schema...')
# Debug: Check what data we're about to copy
cursor.execute("SELECT id, added, updated FROM core_snapshot LIMIT 3")
sample_data = cursor.fetchall()
print(f'DEBUG 0023: Sample Snapshot data before migration: {sample_data}')
# Transform added→bookmarked_at/created_at and updated→modified_at
cursor.execute("""
INSERT OR IGNORE INTO core_snapshot_new (
id, url, timestamp, title, bookmarked_at, created_at, modified_at
id, url, timestamp, title,
bookmarked_at, created_at, modified_at,
status
)
SELECT
id, url, timestamp, title,
COALESCE(added, CURRENT_TIMESTAMP) as bookmarked_at,
COALESCE(added, CURRENT_TIMESTAMP) as created_at,
COALESCE(updated, added, CURRENT_TIMESTAMP) as modified_at
COALESCE(updated, added, CURRENT_TIMESTAMP) as modified_at,
'queued' as status
FROM core_snapshot;
""")
# Debug: Check what was inserted
cursor.execute("SELECT id, bookmarked_at, modified_at FROM core_snapshot_new LIMIT 3")
inserted_data = cursor.fetchall()
print(f'DEBUG 0023: Sample Snapshot data after INSERT: {inserted_data}')
elif has_bookmarked_at and not has_added:
# Migrating from v0.8.6rc0 (already has bookmarked_at/created_at/modified_at)
print('Migrating Snapshot from v0.8.6rc0 schema...')
@@ -308,14 +303,29 @@ class Migration(migrations.Migration):
),
],
state_operations=[
# NOTE: We do NOT remove extractor/output here for ArchiveResult!
# NOTE: We do NOT remove extractor/output for ArchiveResult!
# They are still in the database and will be removed by migration 0025
# after copying their data to the new field names (plugin, output_str).
# after copying their data to plugin/output_str.
# However, for Snapshot, we DO remove added/updated here because
# the database operations above already renamed them to bookmarked_at/created_at/modified_at.
# However, for Snapshot, we DO remove added/updated and ADD the new timestamp fields
# because the SQL above already transformed them.
migrations.RemoveField(model_name='snapshot', name='added'),
migrations.RemoveField(model_name='snapshot', name='updated'),
migrations.AddField(
model_name='snapshot',
name='bookmarked_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AddField(
model_name='snapshot',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AddField(
model_name='snapshot',
name='modified_at',
field=models.DateTimeField(auto_now=True),
),
# SnapshotTag table already exists from v0.7.2, just declare it in state
migrations.CreateModel(

View File

@@ -103,15 +103,21 @@ class Migration(migrations.Migration):
);
INSERT INTO core_snapshot_final (
id, created_at, modified_at, url, timestamp, bookmarked_at,
crawl_id, parent_snapshot_id, title, downloaded_at, depth, fs_version,
config, notes, num_uses_succeeded, num_uses_failed,
id, url, timestamp, title,
bookmarked_at, created_at, modified_at,
crawl_id, parent_snapshot_id,
downloaded_at, depth, fs_version,
config, notes,
num_uses_succeeded, num_uses_failed,
status, retry_at, current_step
)
SELECT
id, created_at, modified_at, url, timestamp, bookmarked_at,
crawl_id, parent_snapshot_id, title, downloaded_at, depth, fs_version,
COALESCE(config, '{}'), COALESCE(notes, ''), num_uses_succeeded, num_uses_failed,
id, url, timestamp, title,
bookmarked_at, created_at, modified_at,
crawl_id, parent_snapshot_id,
downloaded_at, depth, fs_version,
COALESCE(config, '{}'), COALESCE(notes, ''),
num_uses_succeeded, num_uses_failed,
status, retry_at, current_step
FROM core_snapshot;

View File

@@ -9,23 +9,16 @@ from django.db import migrations, models, connection
def copy_old_fields_to_new(apps, schema_editor):
"""Copy data from old field names to new field names before AddField operations."""
"""Copy data from old field names to new field names after AddField operations."""
cursor = connection.cursor()
# Check if old fields still exist
cursor.execute("PRAGMA table_info(core_archiveresult)")
cols = {row[1] for row in cursor.fetchall()}
print(f'DEBUG 0025: ArchiveResult columns: {sorted(cols)}')
if 'extractor' in cols and 'plugin' in cols:
# Copy extractor -> plugin
print('DEBUG 0025: Copying extractor -> plugin')
cursor.execute("UPDATE core_archiveresult SET plugin = COALESCE(extractor, '') WHERE plugin = '' OR plugin IS NULL")
cursor.execute("SELECT COUNT(*) FROM core_archiveresult WHERE plugin != ''")
count = cursor.fetchone()[0]
print(f'DEBUG 0025: Updated {count} rows with plugin data')
else:
print(f'DEBUG 0025: NOT copying - extractor in cols: {"extractor" in cols}, plugin in cols: {"plugin" in cols}')
if 'output' in cols and 'output_str' in cols:
# Copy output -> output_str
@@ -38,16 +31,13 @@ def copy_old_fields_to_new(apps, schema_editor):
if 'end_ts' in cols and 'modified_at' in cols:
cursor.execute("UPDATE core_archiveresult SET modified_at = COALESCE(end_ts, start_ts, CURRENT_TIMESTAMP) WHERE modified_at IS NULL OR modified_at = ''")
# Same for Snapshot table
cursor.execute("PRAGMA table_info(core_snapshot)")
snap_cols = {row[1] for row in cursor.fetchall()}
# NOTE: Snapshot timestamps (added→bookmarked_at, updated→modified_at) were already
# transformed by migration 0023, so we don't need to copy them here.
if 'added' in snap_cols and 'bookmarked_at' in snap_cols:
cursor.execute("UPDATE core_snapshot SET bookmarked_at = COALESCE(added, CURRENT_TIMESTAMP) WHERE bookmarked_at IS NULL OR bookmarked_at = ''")
cursor.execute("UPDATE core_snapshot SET created_at = COALESCE(added, CURRENT_TIMESTAMP) WHERE created_at IS NULL OR created_at = ''")
if 'updated' in snap_cols and 'modified_at' in snap_cols:
cursor.execute("UPDATE core_snapshot SET modified_at = COALESCE(updated, added, CURRENT_TIMESTAMP) WHERE modified_at IS NULL OR modified_at = ''")
# Debug: Check Snapshot timestamps at end of RunPython
cursor.execute("SELECT id, bookmarked_at, modified_at FROM core_snapshot LIMIT 2")
snap_after = cursor.fetchall()
print(f'DEBUG 0025: Snapshot timestamps at END of RunPython: {snap_after}')
class Migration(migrations.Migration):
@@ -149,21 +139,12 @@ class Migration(migrations.Migration):
name='retry_at',
field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
),
migrations.AddField(
model_name='snapshot',
name='bookmarked_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
# NOTE: bookmarked_at and created_at already added by migration 0023
migrations.AddField(
model_name='snapshot',
name='config',
field=models.JSONField(default=dict),
),
migrations.AddField(
model_name='snapshot',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AddField(
model_name='snapshot',
name='current_step',
@@ -184,11 +165,7 @@ class Migration(migrations.Migration):
name='fs_version',
field=models.CharField(default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', max_length=10),
),
migrations.AddField(
model_name='snapshot',
name='modified_at',
field=models.DateTimeField(auto_now=True),
),
# NOTE: modified_at already added by migration 0023
migrations.AddField(
model_name='snapshot',
name='notes',
@@ -248,7 +225,7 @@ class Migration(migrations.Migration):
model_name='archiveresult',
name='output',
),
# NOTE: Snapshot's added/updated fields were already removed by migration 0023
# NOTE: Snapshot's added/updated were already removed by migration 0023
migrations.AlterField(
model_name='archiveresult',
name='end_ts',

View File

@@ -0,0 +1,28 @@
# Generated by Django 6.0 on 2026-01-01 23:28
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
# Removes ArchiveResult's per-row num_uses_* counters and adds a OneToOne
# link to machine.Process, which records the execution details instead.
dependencies = [
# Must run after the big 0025 rename migration on core...
('core', '0025_alter_archiveresult_options_alter_snapshot_options_and_more'),
# ...and after machine 0003 adds process_type/parent to Process.
('machine', '0003_add_process_type_and_parent'),
]
operations = [
# Drop the num_uses_* counter columns from ArchiveResult.
migrations.RemoveField(
model_name='archiveresult',
name='num_uses_failed',
),
migrations.RemoveField(
model_name='archiveresult',
name='num_uses_succeeded',
),
# Nullable/blank so existing rows with no recorded process stay valid;
# PROTECT prevents deleting a Process an ArchiveResult still references.
migrations.AddField(
model_name='archiveresult',
name='process',
field=models.OneToOneField(blank=True, help_text='Process execution details for this archive result', null=True, on_delete=django.db.models.deletion.PROTECT, related_name='archiveresult', to='machine.process'),
),
]

View File

@@ -2285,13 +2285,14 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
# Process FK - tracks execution details (cmd, pwd, stdout, stderr, etc.)
# Added POST-v0.9.0, will be added in a separate migration
# process = models.OneToOneField(
# 'machine.Process',
# on_delete=models.PROTECT,
# null=False,
# related_name='archiveresult',
# help_text='Process execution details for this archive result'
# )
process = models.OneToOneField(
'machine.Process',
on_delete=models.PROTECT,
null=True,
blank=True,
related_name='archiveresult',
help_text='Process execution details for this archive result'
)
# New output fields (replacing old 'output' field)
output_str = models.TextField(blank=True, default='', help_text='Human-readable output summary')

View File

@@ -154,7 +154,7 @@ class CrawlAdminForm(forms.ModelForm):
class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
form = CrawlAdminForm
list_display = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'urls_preview', 'schedule_str', 'status', 'retry_at', 'num_snapshots')
list_display = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'urls_preview', 'schedule_str', 'status', 'retry_at', 'health_display', 'num_snapshots')
sort_fields = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'schedule_str', 'status', 'retry_at')
search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'schedule_id', 'status', 'urls')
@@ -270,6 +270,12 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
first_url = obj.get_urls_list()[0] if obj.get_urls_list() else ''
return first_url[:80] + '...' if len(first_url) > 80 else first_url
@admin.display(description='Health', ordering='health')
def health_display(self, obj):
    """Show the crawl's health score, colored by severity threshold."""
    score = obj.health
    thresholds = ((80, 'green'), (50, 'orange'))
    color = next((name for cutoff, name in thresholds if score >= cutoff), 'red')
    return format_html('<span style="color: {};">{}</span>', color, score)
@admin.display(description='URLs')
def urls_editor(self, obj):
"""Editor for crawl URLs."""

View File

@@ -0,0 +1,21 @@
# Generated by Django 6.0 on 2026-01-01 23:36
from django.db import migrations
class Migration(migrations.Migration):
# Drops the num_uses_succeeded/num_uses_failed counter columns from
# CrawlSchedule.
dependencies = [
('crawls', '0002_upgrade_from_0_8_6'),
]
operations = [
migrations.RemoveField(
model_name='crawlschedule',
name='num_uses_failed',
),
migrations.RemoveField(
model_name='crawlschedule',
name='num_uses_succeeded',
),
]

View File

@@ -519,12 +519,14 @@ class CrawlMachine(BaseStateMachine, strict_states=True):
def is_finished(self) -> bool:
from archivebox.core.models import Snapshot
# check that at least one snapshot exists for this crawl
# Check if any snapshots exist for this crawl
snapshots = Snapshot.objects.filter(crawl=self.crawl)
if not snapshots.exists():
return False
# check if all snapshots are sealed
# If no snapshots exist, allow finishing (e.g., archivebox://install crawls that only run hooks)
if not snapshots.exists():
return True
# If snapshots exist, check if all are sealed
# Snapshots handle their own background hooks via the step system,
# so we just need to wait for all snapshots to reach sealed state
if snapshots.filter(status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]).exists():

View File

@@ -8,7 +8,7 @@ from archivebox.machine.models import Machine, NetworkInterface, Binary, Process
class MachineAdmin(ConfigEditorMixin, BaseModelAdmin):
list_display = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid', 'health')
list_display = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid', 'health_display')
sort_fields = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid')
readonly_fields = ('guid', 'created_at', 'modified_at', 'ips')
@@ -52,9 +52,15 @@ class MachineAdmin(ConfigEditorMixin, BaseModelAdmin):
machine.id, ', '.join(machine.networkinterface_set.values_list('ip_public', flat=True)),
)
@admin.display(description='Health', ordering='health')
def health_display(self, obj):
    """Color-coded health score for this machine."""
    value = obj.health
    if value >= 80:
        return format_html('<span style="color: {};">{}</span>', 'green', value)
    if value >= 50:
        return format_html('<span style="color: {};">{}</span>', 'orange', value)
    return format_html('<span style="color: {};">{}</span>', 'red', value)
class NetworkInterfaceAdmin(BaseModelAdmin):
list_display = ('id', 'created_at', 'machine_info', 'ip_public', 'dns_server', 'isp', 'country', 'region', 'city', 'iface', 'ip_local', 'mac_address', 'health')
list_display = ('id', 'created_at', 'machine_info', 'ip_public', 'dns_server', 'isp', 'country', 'region', 'city', 'iface', 'ip_local', 'mac_address', 'health_display')
sort_fields = ('id', 'created_at', 'machine_info', 'ip_public', 'dns_server', 'isp', 'country', 'region', 'city', 'iface', 'ip_local', 'mac_address')
search_fields = ('id', 'machine__id', 'iface', 'ip_public', 'ip_local', 'mac_address', 'dns_server', 'hostname', 'isp', 'city', 'region', 'country')
@@ -95,9 +101,15 @@ class NetworkInterfaceAdmin(BaseModelAdmin):
iface.machine.id, str(iface.machine.id)[:8], iface.machine.hostname,
)
@admin.display(description='Health', ordering='health')
def health_display(self, obj):
    """Render the interface's health score with a severity color."""
    rating = obj.health
    # Start at the worst color and upgrade as thresholds are met.
    color = 'red'
    if rating >= 50:
        color = 'orange'
    if rating >= 80:
        color = 'green'
    return format_html('<span style="color: {};">{}</span>', color, rating)
class BinaryAdmin(BaseModelAdmin):
list_display = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'status', 'health')
list_display = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'status', 'health_display')
sort_fields = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'status')
search_fields = ('id', 'machine__id', 'name', 'binprovider', 'version', 'abspath', 'sha256')
@@ -142,6 +154,12 @@ class BinaryAdmin(BaseModelAdmin):
binary.machine.id, str(binary.machine.id)[:8], binary.machine.hostname,
)
@admin.display(description='Health', ordering='health')
def health_display(self, obj):
    """Color-coded health score for this binary."""
    pct = obj.health
    if pct < 50:
        color = 'red'
    elif pct < 80:
        color = 'orange'
    else:
        color = 'green'
    return format_html('<span style="color: {};">{}</span>', color, pct)
class ProcessAdmin(BaseModelAdmin):
list_display = ('id', 'created_at', 'machine_info', 'archiveresult_link', 'cmd_str', 'status', 'exit_code', 'pid', 'binary_info')

View File

@@ -0,0 +1,24 @@
# Generated by Django 6.0 on 2026-01-01 22:55
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
# Adds parent and process_type columns to machine.Process.
dependencies = [
('machine', '0002_process'),
]
operations = [
# Optional self-FK to the spawning process; SET_NULL detaches children
# rather than cascading when the parent row is deleted.
migrations.AddField(
model_name='process',
name='parent',
field=models.ForeignKey(blank=True, help_text='Parent process that spawned this process', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='children', to='machine.process'),
),
# Indexed category column (supervisord/orchestrator/worker/cli/binary),
# defaulting to 'cli'.
migrations.AddField(
model_name='process',
name='process_type',
field=models.CharField(choices=[('supervisord', 'Supervisord'), ('orchestrator', 'Orchestrator'), ('worker', 'Worker'), ('cli', 'CLI'), ('binary', 'Binary')], db_index=True, default='cli', help_text='Type of process (cli, worker, orchestrator, binary, supervisord)', max_length=16),
),
]

View File

@@ -153,8 +153,8 @@ class NetworkInterface(ModelWithHealthStats):
city = models.CharField(max_length=63, default=None, null=False)
region = models.CharField(max_length=63, default=None, null=False)
country = models.CharField(max_length=63, default=None, null=False)
num_uses_failed = models.PositiveIntegerField(default=0)
num_uses_succeeded = models.PositiveIntegerField(default=0)
# num_uses_failed = models.PositiveIntegerField(default=0) # from ModelWithHealthStats
# num_uses_succeeded = models.PositiveIntegerField(default=0) # from ModelWithHealthStats
objects: NetworkInterfaceManager = NetworkInterfaceManager()
@@ -588,6 +588,13 @@ class Process(models.Model):
RUNNING = 'running', 'Running'
EXITED = 'exited', 'Exited'
class TypeChoices(models.TextChoices):
# Role of the recorded process (stored in Process.process_type, default CLI).
# First element is the db value, second the human-readable label.
SUPERVISORD = 'supervisord', 'Supervisord'
ORCHESTRATOR = 'orchestrator', 'Orchestrator'
WORKER = 'worker', 'Worker'
CLI = 'cli', 'CLI'
BINARY = 'binary', 'Binary'
# Primary fields
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
@@ -602,6 +609,24 @@ class Process(models.Model):
help_text='Machine where this process executed'
)
# Parent process (optional)
parent = models.ForeignKey(
'self',
on_delete=models.SET_NULL,
null=True, blank=True,
related_name='children',
help_text='Parent process that spawned this process'
)
# Process type (cli, worker, orchestrator, binary, supervisord)
process_type = models.CharField(
max_length=16,
choices=TypeChoices.choices,
default=TypeChoices.CLI,
db_index=True,
help_text='Type of process (cli, worker, orchestrator, binary, supervisord)'
)
# Execution metadata
pwd = models.CharField(max_length=512, default='', null=False, blank=True,
help_text='Working directory for process execution')

View File

@@ -8,7 +8,7 @@
* - Accessibility snapshot
* - ARIA labels and roles
*
* Usage: on_Snapshot__18_accessibility.js --url=<url> --snapshot-id=<uuid>
* Usage: on_Snapshot__39_accessibility.js --url=<url> --snapshot-id=<uuid>
* Output: Writes accessibility/accessibility.json
*
* Environment variables:
@@ -203,7 +203,7 @@ async function main() {
const snapshotId = args.snapshot_id;
if (!url || !snapshotId) {
console.error('Usage: on_Snapshot__18_accessibility.js --url=<url> --snapshot-id=<uuid>');
console.error('Usage: on_Snapshot__39_accessibility.js --url=<url> --snapshot-id=<uuid>');
process.exit(1);
}

View File

@@ -8,7 +8,7 @@
* NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
* --load-extension and --disable-extensions-except flags.
*
* Usage: on_Crawl__30_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
* Usage: on_Crawl__20_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
* Output: Writes to current directory (executor creates chrome/ dir):
* - cdp_url.txt: WebSocket URL for CDP connection
* - chrome.pid: Chromium process ID (for cleanup)

View File

@@ -2,7 +2,7 @@
/**
* Create a Chrome tab for this snapshot in the shared crawl Chrome session.
*
* If a crawl-level Chrome session exists (from on_Crawl__30_chrome_launch.bg.js),
* If a crawl-level Chrome session exists (from on_Crawl__20_chrome_launch.bg.js),
* this connects to it and creates a new tab. Otherwise, falls back to launching
* its own Chrome instance.
*

View File

@@ -73,8 +73,8 @@ CHROME_PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = CHROME_PLUGIN_DIR.parent
# Hook script locations
CHROME_INSTALL_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__00_install_puppeteer_chromium.py'
CHROME_LAUNCH_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__30_chrome_launch.bg.js'
CHROME_INSTALL_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__01_chrome_install.py'
CHROME_LAUNCH_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__20_chrome_launch.bg.js'
CHROME_TAB_HOOK = CHROME_PLUGIN_DIR / 'on_Snapshot__20_chrome_tab.bg.js'
CHROME_NAVIGATE_HOOK = next(CHROME_PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None)
CHROME_UTILS = CHROME_PLUGIN_DIR / 'chrome_utils.js'

View File

@@ -5,7 +5,7 @@
* If a Chrome session exists (from chrome plugin), connects to it via CDP.
* Otherwise launches a new Chrome instance.
*
* Usage: on_Snapshot__23_dom.js --url=<url> --snapshot-id=<uuid>
* Usage: on_Snapshot__53_dom.js --url=<url> --snapshot-id=<uuid>
* Output: Writes dom/output.html
*
* Environment variables:
@@ -175,7 +175,7 @@ async function main() {
const snapshotId = args.snapshot_id;
if (!url || !snapshotId) {
console.error('Usage: on_Snapshot__23_dom.js --url=<url> --snapshot-id=<uuid>');
console.error('Usage: on_Snapshot__53_dom.js --url=<url> --snapshot-id=<uuid>');
process.exit(1);
}

View File

@@ -6,7 +6,7 @@
* response headers from chrome plugin/response_headers.json.
* Otherwise falls back to making an HTTP HEAD request.
*
* Usage: on_Snapshot__12_headers.js --url=<url> --snapshot-id=<uuid>
* Usage: on_Snapshot__55_headers.js --url=<url> --snapshot-id=<uuid>
* Output: Writes headers/headers.json
*
* Environment variables:
@@ -116,7 +116,7 @@ async function main() {
const snapshotId = args.snapshot_id;
if (!url || !snapshotId) {
console.error('Usage: on_Snapshot__12_headers.js --url=<url> --snapshot-id=<uuid>');
console.error('Usage: on_Snapshot__55_headers.js --url=<url> --snapshot-id=<uuid>');
process.exit(1);
}

View File

@@ -11,7 +11,7 @@
* - iframes: <iframe src>
* - links: <link> tags with rel/href
*
* Usage: on_Snapshot__40_parse_dom_outlinks.js --url=<url> --snapshot-id=<uuid>
* Usage: on_Snapshot__75_parse_dom_outlinks.js --url=<url> --snapshot-id=<uuid>
* Output: Writes parse_dom_outlinks/outlinks.json and parse_dom_outlinks/urls.jsonl
*
* Environment variables:
@@ -216,7 +216,7 @@ async function main() {
const snapshotId = args.snapshot_id;
if (!url || !snapshotId) {
console.error('Usage: on_Snapshot__40_parse_dom_outlinks.js --url=<url> --snapshot-id=<uuid>');
console.error('Usage: on_Snapshot__75_parse_dom_outlinks.js --url=<url> --snapshot-id=<uuid>');
process.exit(1);
}

View File

@@ -5,7 +5,7 @@
* If a Chrome session exists (from chrome plugin), connects to it via CDP.
* Otherwise launches a new Chrome instance.
*
* Usage: on_Snapshot__22_pdf.js --url=<url> --snapshot-id=<uuid>
* Usage: on_Snapshot__52_pdf.js --url=<url> --snapshot-id=<uuid>
* Output: Writes pdf/output.pdf
*
* Environment variables:
@@ -184,7 +184,7 @@ async function main() {
const snapshotId = args.snapshot_id;
if (!url || !snapshotId) {
console.error('Usage: on_Snapshot__22_pdf.js --url=<url> --snapshot-id=<uuid>');
console.error('Usage: on_Snapshot__52_pdf.js --url=<url> --snapshot-id=<uuid>');
process.exit(1);
}

View File

@@ -5,7 +5,7 @@
* If a Chrome session exists (from chrome plugin), connects to it via CDP.
* Otherwise launches a new Chrome instance.
*
* Usage: on_Snapshot__21_screenshot.js --url=<url> --snapshot-id=<uuid>
* Usage: on_Snapshot__51_screenshot.js --url=<url> --snapshot-id=<uuid>
* Output: Writes screenshot/screenshot.png
*
* Environment variables:
@@ -177,7 +177,7 @@ async function main() {
const snapshotId = args.snapshot_id;
if (!url || !snapshotId) {
console.error('Usage: on_Snapshot__21_screenshot.js --url=<url> --snapshot-id=<uuid>');
console.error('Usage: on_Snapshot__51_screenshot.js --url=<url> --snapshot-id=<uuid>');
process.exit(1);
}

View File

@@ -8,7 +8,7 @@
* - description, keywords, author
* - Any other meta tags
*
* Usage: on_Snapshot__17_seo.js --url=<url> --snapshot-id=<uuid>
* Usage: on_Snapshot__38_seo.js --url=<url> --snapshot-id=<uuid>
* Output: Writes seo/seo.json
*
* Environment variables:
@@ -157,7 +157,7 @@ async function main() {
const snapshotId = args.snapshot_id;
if (!url || !snapshotId) {
console.error('Usage: on_Snapshot__17_seo.js --url=<url> --snapshot-id=<uuid>');
console.error('Usage: on_Snapshot__38_seo.js --url=<url> --snapshot-id=<uuid>');
process.exit(1);
}

View File

@@ -6,7 +6,7 @@
* Content-Type from the initial response. If it's a static file (PDF, image, etc.),
* it downloads the content directly using CDP.
*
* Usage: on_Snapshot__31_staticfile.bg.js --url=<url> --snapshot-id=<uuid>
* Usage: on_Snapshot__32_staticfile.bg.js --url=<url> --snapshot-id=<uuid>
* Output: Downloads static file
*/
@@ -288,7 +288,7 @@ async function main() {
const snapshotId = args.snapshot_id;
if (!url || !snapshotId) {
console.error('Usage: on_Snapshot__31_staticfile.bg.js --url=<url> --snapshot-id=<uuid>');
console.error('Usage: on_Snapshot__32_staticfile.bg.js --url=<url> --snapshot-id=<uuid>');
process.exit(1);
}

View File

@@ -28,7 +28,7 @@ const EXTENSION = {
/**
* Main entry point - install extension before archiving
*
* Note: 2captcha configuration is handled by on_Crawl__25_configure_twocaptcha_extension_options.js
* Note: 2captcha configuration is handled by on_Crawl__25_twocaptcha_config.js
* during first-time browser setup to avoid repeated configuration on every snapshot.
* The API key is injected via chrome.storage API once per browser session.
*/

View File

@@ -5,7 +5,7 @@
* Configures the 2captcha extension with API key and settings after Crawl-level Chrome session starts.
* Runs once per crawl to inject configuration into extension storage.
*
* Priority: 25 (after chrome_launch at 30, before snapshots start)
* Priority: 25 (after chrome_launch at 20, before snapshots start)
* Hook: on_Crawl (runs once per crawl, not per snapshot)
*
* Config Options (from config.json / environment):
@@ -346,7 +346,7 @@ async function main() {
const snapshotId = args.snapshot_id;
if (!url || !snapshotId) {
console.error('Usage: on_Crawl__25_configure_twocaptcha_extension_options.js --url=<url> --snapshot-id=<uuid>');
console.error('Usage: on_Crawl__25_twocaptcha_config.js --url=<url> --snapshot-id=<uuid>');
process.exit(1);
}

View File

@@ -26,8 +26,8 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import (
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = PLUGIN_DIR / 'on_Crawl__20_install_twocaptcha_extension.js'
CONFIG_SCRIPT = PLUGIN_DIR / 'on_Crawl__25_configure_twocaptcha_extension_options.js'
INSTALL_SCRIPT = PLUGIN_DIR / 'on_Crawl__05_twocaptcha_install.js'
CONFIG_SCRIPT = PLUGIN_DIR / 'on_Crawl__25_twocaptcha_config.js'
TEST_URL = 'https://2captcha.com/demo/cloudflare-turnstile'

View File

@@ -269,30 +269,44 @@ class Orchestrator:
from archivebox.misc.logging import IS_TTY
import archivebox.misc.logging as logging_module
self.on_startup()
# Enable progress bars only in TTY + foreground mode
show_progress = IS_TTY and self.exit_on_idle
# Save original consoles
original_console = logging_module.CONSOLE
original_stderr = logging_module.STDERR
# Create Progress with the console it will control
progress = Progress(
TextColumn("[cyan]{task.description}"),
BarColumn(bar_width=40),
TaskProgressColumn(),
transient=False,
console=original_console, # Use the original console
) if show_progress else None
task_ids = {} # snapshot_id -> task_id
# Replace global CONSOLE with progress.console when active
original_console = logging_module.CONSOLE
original_stderr = logging_module.STDERR
# Wrapper to convert console.print() to console.log() for Rich Progress
class ConsoleLogWrapper:
    """Proxy around a console that reroutes print() calls to log().

    While a Rich Progress/Live display is active, log() cooperates with the
    live region whereas direct print() would not.
    """

    def __init__(self, wrapped):
        self._console = wrapped

    def print(self, *args, **kwargs):
        # Only positional args reach log(); keyword args are discarded
        # (log() has a different signature than print()).
        self._console.log(*args)

    def __getattr__(self, name):
        # Delegate every other attribute/method to the wrapped console.
        return getattr(self._console, name)
try:
if progress:
progress.start()
# Redirect all logging through progress.console
logging_module.CONSOLE = progress.console
logging_module.STDERR = progress.console
# Wrap progress.console so print() calls become log() calls
wrapped_console = ConsoleLogWrapper(progress.console)
logging_module.CONSOLE = wrapped_console
logging_module.STDERR = wrapped_console
# Call on_startup AFTER redirecting consoles
self.on_startup()
while True:
# Check queues and spawn workers
@@ -302,9 +316,15 @@ class Orchestrator:
if progress:
from archivebox.core.models import Snapshot
active_snapshots = Snapshot.objects.filter(status='started').iterator(chunk_size=100)
# Get all started snapshots
active_snapshots = list(Snapshot.objects.filter(status='started'))
# Track which snapshots are still active
active_ids = set()
for snapshot in active_snapshots:
active_ids.add(snapshot.id)
total = snapshot.archiveresult_set.count()
if total == 0:
continue
@@ -316,9 +336,15 @@ class Orchestrator:
# Create or update task
if snapshot.id not in task_ids:
url = snapshot.url[:60] + '...' if len(snapshot.url) > 60 else snapshot.url
task_ids[snapshot.id] = progress.add_task(url, total=total)
task_ids[snapshot.id] = progress.add_task(url, total=total, completed=completed)
else:
progress.update(task_ids[snapshot.id], completed=completed)
progress.update(task_ids[snapshot.id], completed=completed)
# Remove tasks for snapshots that are no longer active
for snapshot_id in list(task_ids.keys()):
if snapshot_id not in active_ids:
progress.remove_task(task_ids[snapshot_id])
del task_ids[snapshot_id]
# Track idle state
if self.has_pending_work(queue_sizes) or self.has_running_workers():