mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
actually working migration path from 0.7.2 and 0.8.6 + renames and test coverage
This commit is contained in:
@@ -83,15 +83,15 @@ class ConstantsDict(Mapping):
|
||||
CRONTABS_DIR_NAME: str = 'crontabs'
|
||||
CACHE_DIR_NAME: str = 'cache'
|
||||
LOGS_DIR_NAME: str = 'logs'
|
||||
USER_PLUGINS_DIR_NAME: str = 'user_plugins'
|
||||
CUSTOM_TEMPLATES_DIR_NAME: str = 'user_templates'
|
||||
CUSTOM_PLUGINS_DIR_NAME: str = 'custom_plugins'
|
||||
CUSTOM_TEMPLATES_DIR_NAME: str = 'custom_templates'
|
||||
ARCHIVE_DIR: Path = DATA_DIR / ARCHIVE_DIR_NAME
|
||||
SOURCES_DIR: Path = DATA_DIR / SOURCES_DIR_NAME
|
||||
PERSONAS_DIR: Path = DATA_DIR / PERSONAS_DIR_NAME
|
||||
LOGS_DIR: Path = DATA_DIR / LOGS_DIR_NAME
|
||||
CACHE_DIR: Path = DATA_DIR / CACHE_DIR_NAME
|
||||
CUSTOM_TEMPLATES_DIR: Path = DATA_DIR / CUSTOM_TEMPLATES_DIR_NAME
|
||||
USER_PLUGINS_DIR: Path = DATA_DIR / USER_PLUGINS_DIR_NAME
|
||||
USER_PLUGINS_DIR: Path = DATA_DIR / CUSTOM_PLUGINS_DIR_NAME
|
||||
|
||||
# Data dir files
|
||||
CONFIG_FILENAME: str = 'ArchiveBox.conf'
|
||||
@@ -171,8 +171,11 @@ class ConstantsDict(Mapping):
|
||||
TMP_DIR_NAME,
|
||||
PERSONAS_DIR_NAME,
|
||||
CUSTOM_TEMPLATES_DIR_NAME,
|
||||
USER_PLUGINS_DIR_NAME,
|
||||
CUSTOM_PLUGINS_DIR_NAME,
|
||||
CRONTABS_DIR_NAME,
|
||||
# Backwards compatibility with old directory names
|
||||
"user_plugins", # old name for USER_PLUGINS_DIR (now 'custom_plugins')
|
||||
"user_templates", # old name for CUSTOM_TEMPLATES_DIR (now 'custom_templates')
|
||||
"static", # created by old static exports <v0.6.0
|
||||
"sonic", # created by docker bind mount / sonic FTS process
|
||||
".git",
|
||||
|
||||
@@ -117,7 +117,7 @@ class SnapshotAdminForm(forms.ModelForm):
|
||||
|
||||
class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
form = SnapshotAdminForm
|
||||
list_display = ('created_at', 'title_str', 'status_with_progress', 'files', 'size_with_stats', 'url_str')
|
||||
list_display = ('created_at', 'title_str', 'status_with_progress', 'files', 'size_with_stats', 'health_display', 'url_str')
|
||||
sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
|
||||
readonly_fields = ('admin_actions', 'status_info', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir', 'archiveresults_list')
|
||||
search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
|
||||
@@ -488,6 +488,12 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
|
||||
obj.url[:128],
|
||||
)
|
||||
|
||||
@admin.display(description='Health', ordering='health')
def health_display(self, obj):
    """Render the snapshot's ``health`` value as a colour-coded ``<span>``.

    The column is labelled "Health" and sorts on the ``health`` field.
    Colour bands: green when the value is >= 80, orange when it is >= 50,
    red otherwise.
    """
    score = obj.health
    if score >= 80:
        tone = 'green'
    elif score >= 50:
        tone = 'orange'
    else:
        tone = 'red'
    # format_html escapes both interpolated arguments.
    return format_html('<span style="color: {};">{}</span>', tone, score)
|
||||
|
||||
def grid_view(self, request, extra_context=None):
|
||||
|
||||
# cl = self.get_changelist_instance(request)
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
# Handles both fresh installs and upgrades from v0.7.2/v0.8.6rc0
|
||||
|
||||
from django.db import migrations, models, connection
|
||||
import django.utils.timezone
|
||||
|
||||
|
||||
def get_table_columns(table_name):
|
||||
@@ -95,31 +96,31 @@ def upgrade_core_tables(apps, schema_editor):
|
||||
# ============================================================================
|
||||
# PART 2: Upgrade core_snapshot table
|
||||
# ============================================================================
|
||||
# Create table with NEW field names for timestamps (bookmarked_at, created_at, modified_at)
|
||||
# and all other fields needed by later migrations
|
||||
cursor.execute("""
|
||||
CREATE TABLE IF NOT EXISTS core_snapshot_new (
|
||||
id TEXT PRIMARY KEY NOT NULL,
|
||||
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
|
||||
url TEXT NOT NULL,
|
||||
timestamp VARCHAR(32) NOT NULL UNIQUE,
|
||||
bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
|
||||
title VARCHAR(512),
|
||||
crawl_id TEXT,
|
||||
parent_snapshot_id TEXT,
|
||||
|
||||
title VARCHAR(512),
|
||||
bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
|
||||
|
||||
downloaded_at DATETIME,
|
||||
status VARCHAR(15) NOT NULL DEFAULT 'queued',
|
||||
retry_at DATETIME,
|
||||
|
||||
depth INTEGER NOT NULL DEFAULT 0,
|
||||
fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0',
|
||||
|
||||
config TEXT NOT NULL DEFAULT '{}',
|
||||
notes TEXT NOT NULL DEFAULT '',
|
||||
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
|
||||
num_uses_failed INTEGER NOT NULL DEFAULT 0,
|
||||
|
||||
status VARCHAR(15) NOT NULL DEFAULT 'queued',
|
||||
retry_at DATETIME,
|
||||
current_step INTEGER NOT NULL DEFAULT 0,
|
||||
|
||||
FOREIGN KEY (crawl_id) REFERENCES crawls_crawl(id) ON DELETE CASCADE,
|
||||
@@ -141,29 +142,23 @@ def upgrade_core_tables(apps, schema_editor):
|
||||
has_bookmarked_at = 'bookmarked_at' in snapshot_cols
|
||||
|
||||
if has_added and not has_bookmarked_at:
|
||||
# Migrating from v0.7.2 (has added/updated, no bookmarked_at/created_at/modified_at)
|
||||
# Migrating from v0.7.2 (has added/updated fields)
|
||||
print('Migrating Snapshot from v0.7.2 schema...')
|
||||
# Debug: Check what data we're about to copy
|
||||
cursor.execute("SELECT id, added, updated FROM core_snapshot LIMIT 3")
|
||||
sample_data = cursor.fetchall()
|
||||
print(f'DEBUG 0023: Sample Snapshot data before migration: {sample_data}')
|
||||
|
||||
# Transform added→bookmarked_at/created_at and updated→modified_at
|
||||
cursor.execute("""
|
||||
INSERT OR IGNORE INTO core_snapshot_new (
|
||||
id, url, timestamp, title, bookmarked_at, created_at, modified_at
|
||||
id, url, timestamp, title,
|
||||
bookmarked_at, created_at, modified_at,
|
||||
status
|
||||
)
|
||||
SELECT
|
||||
id, url, timestamp, title,
|
||||
COALESCE(added, CURRENT_TIMESTAMP) as bookmarked_at,
|
||||
COALESCE(added, CURRENT_TIMESTAMP) as created_at,
|
||||
COALESCE(updated, added, CURRENT_TIMESTAMP) as modified_at
|
||||
COALESCE(updated, added, CURRENT_TIMESTAMP) as modified_at,
|
||||
'queued' as status
|
||||
FROM core_snapshot;
|
||||
""")
|
||||
|
||||
# Debug: Check what was inserted
|
||||
cursor.execute("SELECT id, bookmarked_at, modified_at FROM core_snapshot_new LIMIT 3")
|
||||
inserted_data = cursor.fetchall()
|
||||
print(f'DEBUG 0023: Sample Snapshot data after INSERT: {inserted_data}')
|
||||
elif has_bookmarked_at and not has_added:
|
||||
# Migrating from v0.8.6rc0 (already has bookmarked_at/created_at/modified_at)
|
||||
print('Migrating Snapshot from v0.8.6rc0 schema...')
|
||||
@@ -308,14 +303,29 @@ class Migration(migrations.Migration):
|
||||
),
|
||||
],
|
||||
state_operations=[
|
||||
# NOTE: We do NOT remove extractor/output here for ArchiveResult!
|
||||
# NOTE: We do NOT remove extractor/output for ArchiveResult!
|
||||
# They are still in the database and will be removed by migration 0025
|
||||
# after copying their data to the new field names (plugin, output_str).
|
||||
# after copying their data to plugin/output_str.
|
||||
|
||||
# However, for Snapshot, we DO remove added/updated here because
|
||||
# the database operations above already renamed them to bookmarked_at/created_at/modified_at.
|
||||
# However, for Snapshot, we DO remove added/updated and ADD the new timestamp fields
|
||||
# because the SQL above already transformed them.
|
||||
migrations.RemoveField(model_name='snapshot', name='added'),
|
||||
migrations.RemoveField(model_name='snapshot', name='updated'),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='bookmarked_at',
|
||||
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='created_at',
|
||||
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='modified_at',
|
||||
field=models.DateTimeField(auto_now=True),
|
||||
),
|
||||
|
||||
# SnapshotTag table already exists from v0.7.2, just declare it in state
|
||||
migrations.CreateModel(
|
||||
|
||||
@@ -103,15 +103,21 @@ class Migration(migrations.Migration):
|
||||
);
|
||||
|
||||
INSERT INTO core_snapshot_final (
|
||||
id, created_at, modified_at, url, timestamp, bookmarked_at,
|
||||
crawl_id, parent_snapshot_id, title, downloaded_at, depth, fs_version,
|
||||
config, notes, num_uses_succeeded, num_uses_failed,
|
||||
id, url, timestamp, title,
|
||||
bookmarked_at, created_at, modified_at,
|
||||
crawl_id, parent_snapshot_id,
|
||||
downloaded_at, depth, fs_version,
|
||||
config, notes,
|
||||
num_uses_succeeded, num_uses_failed,
|
||||
status, retry_at, current_step
|
||||
)
|
||||
SELECT
|
||||
id, created_at, modified_at, url, timestamp, bookmarked_at,
|
||||
crawl_id, parent_snapshot_id, title, downloaded_at, depth, fs_version,
|
||||
COALESCE(config, '{}'), COALESCE(notes, ''), num_uses_succeeded, num_uses_failed,
|
||||
id, url, timestamp, title,
|
||||
bookmarked_at, created_at, modified_at,
|
||||
crawl_id, parent_snapshot_id,
|
||||
downloaded_at, depth, fs_version,
|
||||
COALESCE(config, '{}'), COALESCE(notes, ''),
|
||||
num_uses_succeeded, num_uses_failed,
|
||||
status, retry_at, current_step
|
||||
FROM core_snapshot;
|
||||
|
||||
|
||||
@@ -9,23 +9,16 @@ from django.db import migrations, models, connection
|
||||
|
||||
|
||||
def copy_old_fields_to_new(apps, schema_editor):
|
||||
"""Copy data from old field names to new field names before AddField operations."""
|
||||
"""Copy data from old field names to new field names after AddField operations."""
|
||||
cursor = connection.cursor()
|
||||
|
||||
# Check if old fields still exist
|
||||
cursor.execute("PRAGMA table_info(core_archiveresult)")
|
||||
cols = {row[1] for row in cursor.fetchall()}
|
||||
print(f'DEBUG 0025: ArchiveResult columns: {sorted(cols)}')
|
||||
|
||||
if 'extractor' in cols and 'plugin' in cols:
|
||||
# Copy extractor -> plugin
|
||||
print('DEBUG 0025: Copying extractor -> plugin')
|
||||
cursor.execute("UPDATE core_archiveresult SET plugin = COALESCE(extractor, '') WHERE plugin = '' OR plugin IS NULL")
|
||||
cursor.execute("SELECT COUNT(*) FROM core_archiveresult WHERE plugin != ''")
|
||||
count = cursor.fetchone()[0]
|
||||
print(f'DEBUG 0025: Updated {count} rows with plugin data')
|
||||
else:
|
||||
print(f'DEBUG 0025: NOT copying - extractor in cols: {"extractor" in cols}, plugin in cols: {"plugin" in cols}')
|
||||
|
||||
if 'output' in cols and 'output_str' in cols:
|
||||
# Copy output -> output_str
|
||||
@@ -38,16 +31,13 @@ def copy_old_fields_to_new(apps, schema_editor):
|
||||
if 'end_ts' in cols and 'modified_at' in cols:
|
||||
cursor.execute("UPDATE core_archiveresult SET modified_at = COALESCE(end_ts, start_ts, CURRENT_TIMESTAMP) WHERE modified_at IS NULL OR modified_at = ''")
|
||||
|
||||
# Same for Snapshot table
|
||||
cursor.execute("PRAGMA table_info(core_snapshot)")
|
||||
snap_cols = {row[1] for row in cursor.fetchall()}
|
||||
# NOTE: Snapshot timestamps (added→bookmarked_at, updated→modified_at) were already
|
||||
# transformed by migration 0023, so we don't need to copy them here.
|
||||
|
||||
if 'added' in snap_cols and 'bookmarked_at' in snap_cols:
|
||||
cursor.execute("UPDATE core_snapshot SET bookmarked_at = COALESCE(added, CURRENT_TIMESTAMP) WHERE bookmarked_at IS NULL OR bookmarked_at = ''")
|
||||
cursor.execute("UPDATE core_snapshot SET created_at = COALESCE(added, CURRENT_TIMESTAMP) WHERE created_at IS NULL OR created_at = ''")
|
||||
|
||||
if 'updated' in snap_cols and 'modified_at' in snap_cols:
|
||||
cursor.execute("UPDATE core_snapshot SET modified_at = COALESCE(updated, added, CURRENT_TIMESTAMP) WHERE modified_at IS NULL OR modified_at = ''")
|
||||
# Debug: Check Snapshot timestamps at end of RunPython
|
||||
cursor.execute("SELECT id, bookmarked_at, modified_at FROM core_snapshot LIMIT 2")
|
||||
snap_after = cursor.fetchall()
|
||||
print(f'DEBUG 0025: Snapshot timestamps at END of RunPython: {snap_after}')
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
@@ -149,21 +139,12 @@ class Migration(migrations.Migration):
|
||||
name='retry_at',
|
||||
field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='bookmarked_at',
|
||||
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
|
||||
),
|
||||
# NOTE: bookmarked_at and created_at already added by migration 0023
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='config',
|
||||
field=models.JSONField(default=dict),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='created_at',
|
||||
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='current_step',
|
||||
@@ -184,11 +165,7 @@ class Migration(migrations.Migration):
|
||||
name='fs_version',
|
||||
field=models.CharField(default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', max_length=10),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='modified_at',
|
||||
field=models.DateTimeField(auto_now=True),
|
||||
),
|
||||
# NOTE: modified_at already added by migration 0023
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='notes',
|
||||
@@ -248,7 +225,7 @@ class Migration(migrations.Migration):
|
||||
model_name='archiveresult',
|
||||
name='output',
|
||||
),
|
||||
# NOTE: Snapshot's added/updated fields were already removed by migration 0023
|
||||
# NOTE: Snapshot's added/updated were already removed by migration 0023
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='end_ts',
|
||||
|
||||
@@ -0,0 +1,28 @@
|
||||
# Generated by Django 6.0 on 2026-01-01 23:28
|
||||
|
||||
import django.db.models.deletion
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Drop the ArchiveResult usage counters and add a nullable one-to-one ``process`` link.

    Runs after core 0025 and machine 0003 (see ``dependencies``).
    """

    dependencies = [
        ('core', '0025_alter_archiveresult_options_alter_snapshot_options_and_more'),
        ('machine', '0003_add_process_type_and_parent'),
    ]

    operations = [
        # The two counter fields are removed from ArchiveResult.
        migrations.RemoveField(model_name='archiveresult', name='num_uses_failed'),
        migrations.RemoveField(model_name='archiveresult', name='num_uses_succeeded'),
        # Optional link to machine.Process; PROTECT blocks deleting a Process
        # that is still referenced by an ArchiveResult.
        migrations.AddField(
            model_name='archiveresult',
            name='process',
            field=models.OneToOneField(
                to='machine.process',
                on_delete=django.db.models.deletion.PROTECT,
                related_name='archiveresult',
                null=True,
                blank=True,
                help_text='Process execution details for this archive result',
            ),
        ),
    ]
|
||||
@@ -2285,13 +2285,14 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
|
||||
# Process FK - tracks execution details (cmd, pwd, stdout, stderr, etc.)
|
||||
# Added POST-v0.9.0, will be added in a separate migration
|
||||
# process = models.OneToOneField(
|
||||
# 'machine.Process',
|
||||
# on_delete=models.PROTECT,
|
||||
# null=False,
|
||||
# related_name='archiveresult',
|
||||
# help_text='Process execution details for this archive result'
|
||||
# )
|
||||
process = models.OneToOneField(
|
||||
'machine.Process',
|
||||
on_delete=models.PROTECT,
|
||||
null=True,
|
||||
blank=True,
|
||||
related_name='archiveresult',
|
||||
help_text='Process execution details for this archive result'
|
||||
)
|
||||
|
||||
# New output fields (replacing old 'output' field)
|
||||
output_str = models.TextField(blank=True, default='', help_text='Human-readable output summary')
|
||||
|
||||
@@ -154,7 +154,7 @@ class CrawlAdminForm(forms.ModelForm):
|
||||
|
||||
class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
form = CrawlAdminForm
|
||||
list_display = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'urls_preview', 'schedule_str', 'status', 'retry_at', 'num_snapshots')
|
||||
list_display = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'urls_preview', 'schedule_str', 'status', 'retry_at', 'health_display', 'num_snapshots')
|
||||
sort_fields = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'schedule_str', 'status', 'retry_at')
|
||||
search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'schedule_id', 'status', 'urls')
|
||||
|
||||
@@ -270,6 +270,12 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
first_url = obj.get_urls_list()[0] if obj.get_urls_list() else ''
|
||||
return first_url[:80] + '...' if len(first_url) > 80 else first_url
|
||||
|
||||
@admin.display(description='Health', ordering='health')
def health_display(self, obj):
    """Render the crawl's ``health`` value as a colour-coded ``<span>``.

    The column is labelled "Health" and sorts on the ``health`` field.
    Colour bands: green when the value is >= 80, orange when it is >= 50,
    red otherwise.
    """
    score = obj.health
    if score >= 80:
        tone = 'green'
    elif score >= 50:
        tone = 'orange'
    else:
        tone = 'red'
    # format_html escapes both interpolated arguments.
    return format_html('<span style="color: {};">{}</span>', tone, score)
|
||||
|
||||
@admin.display(description='URLs')
|
||||
def urls_editor(self, obj):
|
||||
"""Editor for crawl URLs."""
|
||||
|
||||
@@ -0,0 +1,21 @@
|
||||
# Generated by Django 6.0 on 2026-01-01 23:36
|
||||
|
||||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Remove the ``num_uses_failed`` and ``num_uses_succeeded`` fields from CrawlSchedule."""

    dependencies = [
        ('crawls', '0002_upgrade_from_0_8_6'),
    ]

    operations = [
        migrations.RemoveField(model_name='crawlschedule', name='num_uses_failed'),
        migrations.RemoveField(model_name='crawlschedule', name='num_uses_succeeded'),
    ]
|
||||
@@ -519,12 +519,14 @@ class CrawlMachine(BaseStateMachine, strict_states=True):
|
||||
def is_finished(self) -> bool:
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
# check that at least one snapshot exists for this crawl
|
||||
# Check if any snapshots exist for this crawl
|
||||
snapshots = Snapshot.objects.filter(crawl=self.crawl)
|
||||
if not snapshots.exists():
|
||||
return False
|
||||
|
||||
# check if all snapshots are sealed
|
||||
# If no snapshots exist, allow finishing (e.g., archivebox://install crawls that only run hooks)
|
||||
if not snapshots.exists():
|
||||
return True
|
||||
|
||||
# If snapshots exist, check if all are sealed
|
||||
# Snapshots handle their own background hooks via the step system,
|
||||
# so we just need to wait for all snapshots to reach sealed state
|
||||
if snapshots.filter(status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]).exists():
|
||||
|
||||
@@ -8,7 +8,7 @@ from archivebox.machine.models import Machine, NetworkInterface, Binary, Process
|
||||
|
||||
|
||||
class MachineAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
list_display = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid', 'health')
|
||||
list_display = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid', 'health_display')
|
||||
sort_fields = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid')
|
||||
|
||||
readonly_fields = ('guid', 'created_at', 'modified_at', 'ips')
|
||||
@@ -52,9 +52,15 @@ class MachineAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
machine.id, ', '.join(machine.networkinterface_set.values_list('ip_public', flat=True)),
|
||||
)
|
||||
|
||||
@admin.display(description='Health', ordering='health')
def health_display(self, obj):
    """Render the machine's ``health`` value as a colour-coded ``<span>``.

    The column is labelled "Health" and sorts on the ``health`` field.
    Colour bands: green when the value is >= 80, orange when it is >= 50,
    red otherwise.
    """
    score = obj.health
    if score >= 80:
        tone = 'green'
    elif score >= 50:
        tone = 'orange'
    else:
        tone = 'red'
    # format_html escapes both interpolated arguments.
    return format_html('<span style="color: {};">{}</span>', tone, score)
|
||||
|
||||
|
||||
class NetworkInterfaceAdmin(BaseModelAdmin):
|
||||
list_display = ('id', 'created_at', 'machine_info', 'ip_public', 'dns_server', 'isp', 'country', 'region', 'city', 'iface', 'ip_local', 'mac_address', 'health')
|
||||
list_display = ('id', 'created_at', 'machine_info', 'ip_public', 'dns_server', 'isp', 'country', 'region', 'city', 'iface', 'ip_local', 'mac_address', 'health_display')
|
||||
sort_fields = ('id', 'created_at', 'machine_info', 'ip_public', 'dns_server', 'isp', 'country', 'region', 'city', 'iface', 'ip_local', 'mac_address')
|
||||
search_fields = ('id', 'machine__id', 'iface', 'ip_public', 'ip_local', 'mac_address', 'dns_server', 'hostname', 'isp', 'city', 'region', 'country')
|
||||
|
||||
@@ -95,9 +101,15 @@ class NetworkInterfaceAdmin(BaseModelAdmin):
|
||||
iface.machine.id, str(iface.machine.id)[:8], iface.machine.hostname,
|
||||
)
|
||||
|
||||
@admin.display(description='Health', ordering='health')
def health_display(self, obj):
    """Render the network interface's ``health`` value as a colour-coded ``<span>``.

    The column is labelled "Health" and sorts on the ``health`` field.
    Colour bands: green when the value is >= 80, orange when it is >= 50,
    red otherwise.
    """
    score = obj.health
    if score >= 80:
        tone = 'green'
    elif score >= 50:
        tone = 'orange'
    else:
        tone = 'red'
    # format_html escapes both interpolated arguments.
    return format_html('<span style="color: {};">{}</span>', tone, score)
|
||||
|
||||
|
||||
class BinaryAdmin(BaseModelAdmin):
|
||||
list_display = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'status', 'health')
|
||||
list_display = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'status', 'health_display')
|
||||
sort_fields = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'status')
|
||||
search_fields = ('id', 'machine__id', 'name', 'binprovider', 'version', 'abspath', 'sha256')
|
||||
|
||||
@@ -142,6 +154,12 @@ class BinaryAdmin(BaseModelAdmin):
|
||||
binary.machine.id, str(binary.machine.id)[:8], binary.machine.hostname,
|
||||
)
|
||||
|
||||
@admin.display(description='Health', ordering='health')
def health_display(self, obj):
    """Render the binary's ``health`` value as a colour-coded ``<span>``.

    The column is labelled "Health" and sorts on the ``health`` field.
    Colour bands: green when the value is >= 80, orange when it is >= 50,
    red otherwise.
    """
    score = obj.health
    if score >= 80:
        tone = 'green'
    elif score >= 50:
        tone = 'orange'
    else:
        tone = 'red'
    # format_html escapes both interpolated arguments.
    return format_html('<span style="color: {};">{}</span>', tone, score)
|
||||
|
||||
|
||||
class ProcessAdmin(BaseModelAdmin):
|
||||
list_display = ('id', 'created_at', 'machine_info', 'archiveresult_link', 'cmd_str', 'status', 'exit_code', 'pid', 'binary_info')
|
||||
|
||||
@@ -0,0 +1,24 @@
|
||||
# Generated by Django 6.0 on 2026-01-01 22:55
|
||||
|
||||
import django.db.models.deletion
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Add a self-referential ``parent`` link and a ``process_type`` field to Process."""

    dependencies = [
        ('machine', '0002_process'),
    ]

    operations = [
        # Optional pointer to the spawning process; SET_NULL clears the link
        # when the parent row is deleted.
        migrations.AddField(
            model_name='process',
            name='parent',
            field=models.ForeignKey(
                to='machine.process',
                on_delete=django.db.models.deletion.SET_NULL,
                related_name='children',
                null=True,
                blank=True,
                help_text='Parent process that spawned this process',
            ),
        ),
        # Indexed discriminator; existing rows receive the 'cli' default.
        migrations.AddField(
            model_name='process',
            name='process_type',
            field=models.CharField(
                max_length=16,
                choices=[
                    ('supervisord', 'Supervisord'),
                    ('orchestrator', 'Orchestrator'),
                    ('worker', 'Worker'),
                    ('cli', 'CLI'),
                    ('binary', 'Binary'),
                ],
                default='cli',
                db_index=True,
                help_text='Type of process (cli, worker, orchestrator, binary, supervisord)',
            ),
        ),
    ]
|
||||
@@ -153,8 +153,8 @@ class NetworkInterface(ModelWithHealthStats):
|
||||
city = models.CharField(max_length=63, default=None, null=False)
|
||||
region = models.CharField(max_length=63, default=None, null=False)
|
||||
country = models.CharField(max_length=63, default=None, null=False)
|
||||
num_uses_failed = models.PositiveIntegerField(default=0)
|
||||
num_uses_succeeded = models.PositiveIntegerField(default=0)
|
||||
# num_uses_failed = models.PositiveIntegerField(default=0) # from ModelWithHealthStats
|
||||
# num_uses_succeeded = models.PositiveIntegerField(default=0) # from ModelWithHealthStats
|
||||
|
||||
objects: NetworkInterfaceManager = NetworkInterfaceManager()
|
||||
|
||||
@@ -588,6 +588,13 @@ class Process(models.Model):
|
||||
RUNNING = 'running', 'Running'
|
||||
EXITED = 'exited', 'Exited'
|
||||
|
||||
class TypeChoices(models.TextChoices):
    """Kinds of process, as (stored value, display label) pairs."""

    SUPERVISORD = 'supervisord', 'Supervisord'
    ORCHESTRATOR = 'orchestrator', 'Orchestrator'
    WORKER = 'worker', 'Worker'
    CLI = 'cli', 'CLI'
    BINARY = 'binary', 'Binary'
|
||||
|
||||
# Primary fields
|
||||
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
|
||||
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
||||
@@ -602,6 +609,24 @@ class Process(models.Model):
|
||||
help_text='Machine where this process executed'
|
||||
)
|
||||
|
||||
# Parent process (optional)
|
||||
parent = models.ForeignKey(
|
||||
'self',
|
||||
on_delete=models.SET_NULL,
|
||||
null=True, blank=True,
|
||||
related_name='children',
|
||||
help_text='Parent process that spawned this process'
|
||||
)
|
||||
|
||||
# Process type (cli, worker, orchestrator, binary, supervisord)
|
||||
process_type = models.CharField(
|
||||
max_length=16,
|
||||
choices=TypeChoices.choices,
|
||||
default=TypeChoices.CLI,
|
||||
db_index=True,
|
||||
help_text='Type of process (cli, worker, orchestrator, binary, supervisord)'
|
||||
)
|
||||
|
||||
# Execution metadata
|
||||
pwd = models.CharField(max_length=512, default='', null=False, blank=True,
|
||||
help_text='Working directory for process execution')
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
* - Accessibility snapshot
|
||||
* - ARIA labels and roles
|
||||
*
|
||||
* Usage: on_Snapshot__18_accessibility.js --url=<url> --snapshot-id=<uuid>
|
||||
* Usage: on_Snapshot__39_accessibility.js --url=<url> --snapshot-id=<uuid>
|
||||
* Output: Writes accessibility/accessibility.json
|
||||
*
|
||||
* Environment variables:
|
||||
@@ -203,7 +203,7 @@ async function main() {
|
||||
const snapshotId = args.snapshot_id;
|
||||
|
||||
if (!url || !snapshotId) {
|
||||
console.error('Usage: on_Snapshot__18_accessibility.js --url=<url> --snapshot-id=<uuid>');
|
||||
console.error('Usage: on_Snapshot__39_accessibility.js --url=<url> --snapshot-id=<uuid>');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
* NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
|
||||
* --load-extension and --disable-extensions-except flags.
|
||||
*
|
||||
* Usage: on_Crawl__30_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
|
||||
* Usage: on_Crawl__20_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
|
||||
* Output: Writes to current directory (executor creates chrome/ dir):
|
||||
* - cdp_url.txt: WebSocket URL for CDP connection
|
||||
* - chrome.pid: Chromium process ID (for cleanup)
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
/**
|
||||
* Create a Chrome tab for this snapshot in the shared crawl Chrome session.
|
||||
*
|
||||
* If a crawl-level Chrome session exists (from on_Crawl__30_chrome_launch.bg.js),
|
||||
* If a crawl-level Chrome session exists (from on_Crawl__20_chrome_launch.bg.js),
|
||||
* this connects to it and creates a new tab. Otherwise, falls back to launching
|
||||
* its own Chrome instance.
|
||||
*
|
||||
|
||||
@@ -73,8 +73,8 @@ CHROME_PLUGIN_DIR = Path(__file__).parent.parent
|
||||
PLUGINS_ROOT = CHROME_PLUGIN_DIR.parent
|
||||
|
||||
# Hook script locations
|
||||
CHROME_INSTALL_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__00_install_puppeteer_chromium.py'
|
||||
CHROME_LAUNCH_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__30_chrome_launch.bg.js'
|
||||
CHROME_INSTALL_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__01_chrome_install.py'
|
||||
CHROME_LAUNCH_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__20_chrome_launch.bg.js'
|
||||
CHROME_TAB_HOOK = CHROME_PLUGIN_DIR / 'on_Snapshot__20_chrome_tab.bg.js'
|
||||
CHROME_NAVIGATE_HOOK = next(CHROME_PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None)
|
||||
CHROME_UTILS = CHROME_PLUGIN_DIR / 'chrome_utils.js'
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
* If a Chrome session exists (from chrome plugin), connects to it via CDP.
|
||||
* Otherwise launches a new Chrome instance.
|
||||
*
|
||||
* Usage: on_Snapshot__23_dom.js --url=<url> --snapshot-id=<uuid>
|
||||
* Usage: on_Snapshot__53_dom.js --url=<url> --snapshot-id=<uuid>
|
||||
* Output: Writes dom/output.html
|
||||
*
|
||||
* Environment variables:
|
||||
@@ -175,7 +175,7 @@ async function main() {
|
||||
const snapshotId = args.snapshot_id;
|
||||
|
||||
if (!url || !snapshotId) {
|
||||
console.error('Usage: on_Snapshot__23_dom.js --url=<url> --snapshot-id=<uuid>');
|
||||
console.error('Usage: on_Snapshot__53_dom.js --url=<url> --snapshot-id=<uuid>');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
* response headers from chrome plugin/response_headers.json.
|
||||
* Otherwise falls back to making an HTTP HEAD request.
|
||||
*
|
||||
* Usage: on_Snapshot__12_headers.js --url=<url> --snapshot-id=<uuid>
|
||||
* Usage: on_Snapshot__55_headers.js --url=<url> --snapshot-id=<uuid>
|
||||
* Output: Writes headers/headers.json
|
||||
*
|
||||
* Environment variables:
|
||||
@@ -116,7 +116,7 @@ async function main() {
|
||||
const snapshotId = args.snapshot_id;
|
||||
|
||||
if (!url || !snapshotId) {
|
||||
console.error('Usage: on_Snapshot__12_headers.js --url=<url> --snapshot-id=<uuid>');
|
||||
console.error('Usage: on_Snapshot__55_headers.js --url=<url> --snapshot-id=<uuid>');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@
|
||||
* - iframes: <iframe src>
|
||||
* - links: <link> tags with rel/href
|
||||
*
|
||||
* Usage: on_Snapshot__40_parse_dom_outlinks.js --url=<url> --snapshot-id=<uuid>
|
||||
* Usage: on_Snapshot__75_parse_dom_outlinks.js --url=<url> --snapshot-id=<uuid>
|
||||
* Output: Writes parse_dom_outlinks/outlinks.json and parse_dom_outlinks/urls.jsonl
|
||||
*
|
||||
* Environment variables:
|
||||
@@ -216,7 +216,7 @@ async function main() {
|
||||
const snapshotId = args.snapshot_id;
|
||||
|
||||
if (!url || !snapshotId) {
|
||||
console.error('Usage: on_Snapshot__40_parse_dom_outlinks.js --url=<url> --snapshot-id=<uuid>');
|
||||
console.error('Usage: on_Snapshot__75_parse_dom_outlinks.js --url=<url> --snapshot-id=<uuid>');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
* If a Chrome session exists (from chrome plugin), connects to it via CDP.
|
||||
* Otherwise launches a new Chrome instance.
|
||||
*
|
||||
* Usage: on_Snapshot__22_pdf.js --url=<url> --snapshot-id=<uuid>
|
||||
* Usage: on_Snapshot__52_pdf.js --url=<url> --snapshot-id=<uuid>
|
||||
* Output: Writes pdf/output.pdf
|
||||
*
|
||||
* Environment variables:
|
||||
@@ -184,7 +184,7 @@ async function main() {
|
||||
const snapshotId = args.snapshot_id;
|
||||
|
||||
if (!url || !snapshotId) {
|
||||
console.error('Usage: on_Snapshot__22_pdf.js --url=<url> --snapshot-id=<uuid>');
|
||||
console.error('Usage: on_Snapshot__52_pdf.js --url=<url> --snapshot-id=<uuid>');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
* If a Chrome session exists (from chrome plugin), connects to it via CDP.
|
||||
* Otherwise launches a new Chrome instance.
|
||||
*
|
||||
* Usage: on_Snapshot__21_screenshot.js --url=<url> --snapshot-id=<uuid>
|
||||
* Usage: on_Snapshot__51_screenshot.js --url=<url> --snapshot-id=<uuid>
|
||||
* Output: Writes screenshot/screenshot.png
|
||||
*
|
||||
* Environment variables:
|
||||
@@ -177,7 +177,7 @@ async function main() {
|
||||
const snapshotId = args.snapshot_id;
|
||||
|
||||
if (!url || !snapshotId) {
|
||||
console.error('Usage: on_Snapshot__21_screenshot.js --url=<url> --snapshot-id=<uuid>');
|
||||
console.error('Usage: on_Snapshot__51_screenshot.js --url=<url> --snapshot-id=<uuid>');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
* - description, keywords, author
|
||||
* - Any other meta tags
|
||||
*
|
||||
* Usage: on_Snapshot__17_seo.js --url=<url> --snapshot-id=<uuid>
|
||||
* Usage: on_Snapshot__38_seo.js --url=<url> --snapshot-id=<uuid>
|
||||
* Output: Writes seo/seo.json
|
||||
*
|
||||
* Environment variables:
|
||||
@@ -157,7 +157,7 @@ async function main() {
|
||||
const snapshotId = args.snapshot_id;
|
||||
|
||||
if (!url || !snapshotId) {
|
||||
console.error('Usage: on_Snapshot__17_seo.js --url=<url> --snapshot-id=<uuid>');
|
||||
console.error('Usage: on_Snapshot__38_seo.js --url=<url> --snapshot-id=<uuid>');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@
|
||||
* Content-Type from the initial response. If it's a static file (PDF, image, etc.),
|
||||
* it downloads the content directly using CDP.
|
||||
*
|
||||
* Usage: on_Snapshot__31_staticfile.bg.js --url=<url> --snapshot-id=<uuid>
|
||||
* Usage: on_Snapshot__32_staticfile.bg.js --url=<url> --snapshot-id=<uuid>
|
||||
* Output: Downloads static file
|
||||
*/
|
||||
|
||||
@@ -288,7 +288,7 @@ async function main() {
|
||||
const snapshotId = args.snapshot_id;
|
||||
|
||||
if (!url || !snapshotId) {
|
||||
console.error('Usage: on_Snapshot__31_staticfile.bg.js --url=<url> --snapshot-id=<uuid>');
|
||||
console.error('Usage: on_Snapshot__32_staticfile.bg.js --url=<url> --snapshot-id=<uuid>');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
|
||||
@@ -28,7 +28,7 @@ const EXTENSION = {
|
||||
/**
|
||||
* Main entry point - install extension before archiving
|
||||
*
|
||||
* Note: 2captcha configuration is handled by on_Crawl__25_configure_twocaptcha_extension_options.js
|
||||
* Note: 2captcha configuration is handled by on_Crawl__25_twocaptcha_config.js
|
||||
* during first-time browser setup to avoid repeated configuration on every snapshot.
|
||||
* The API key is injected via chrome.storage API once per browser session.
|
||||
*/
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
* Configures the 2captcha extension with API key and settings after Crawl-level Chrome session starts.
|
||||
* Runs once per crawl to inject configuration into extension storage.
|
||||
*
|
||||
* Priority: 25 (after chrome_launch at 30, before snapshots start)
|
||||
* Priority: 25 (after chrome_launch at 20, before snapshots start)
|
||||
* Hook: on_Crawl (runs once per crawl, not per snapshot)
|
||||
*
|
||||
* Config Options (from config.json / environment):
|
||||
@@ -346,7 +346,7 @@ async function main() {
|
||||
const snapshotId = args.snapshot_id;
|
||||
|
||||
if (!url || !snapshotId) {
|
||||
console.error('Usage: on_Crawl__25_configure_twocaptcha_extension_options.js --url=<url> --snapshot-id=<uuid>');
|
||||
console.error('Usage: on_Crawl__25_twocaptcha_config.js --url=<url> --snapshot-id=<uuid>');
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
|
||||
@@ -26,8 +26,8 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
|
||||
|
||||
PLUGIN_DIR = Path(__file__).parent.parent
|
||||
INSTALL_SCRIPT = PLUGIN_DIR / 'on_Crawl__20_install_twocaptcha_extension.js'
|
||||
CONFIG_SCRIPT = PLUGIN_DIR / 'on_Crawl__25_configure_twocaptcha_extension_options.js'
|
||||
INSTALL_SCRIPT = PLUGIN_DIR / 'on_Crawl__05_twocaptcha_install.js'
|
||||
CONFIG_SCRIPT = PLUGIN_DIR / 'on_Crawl__25_twocaptcha_config.js'
|
||||
|
||||
TEST_URL = 'https://2captcha.com/demo/cloudflare-turnstile'
|
||||
|
||||
|
||||
@@ -269,30 +269,44 @@ class Orchestrator:
|
||||
from archivebox.misc.logging import IS_TTY
|
||||
import archivebox.misc.logging as logging_module
|
||||
|
||||
self.on_startup()
|
||||
|
||||
# Enable progress bars only in TTY + foreground mode
|
||||
show_progress = IS_TTY and self.exit_on_idle
|
||||
|
||||
# Save original consoles
|
||||
original_console = logging_module.CONSOLE
|
||||
original_stderr = logging_module.STDERR
|
||||
|
||||
# Create Progress with the console it will control
|
||||
progress = Progress(
|
||||
TextColumn("[cyan]{task.description}"),
|
||||
BarColumn(bar_width=40),
|
||||
TaskProgressColumn(),
|
||||
transient=False,
|
||||
console=original_console, # Use the original console
|
||||
) if show_progress else None
|
||||
|
||||
task_ids = {} # snapshot_id -> task_id
|
||||
|
||||
# Replace global CONSOLE with progress.console when active
|
||||
original_console = logging_module.CONSOLE
|
||||
original_stderr = logging_module.STDERR
|
||||
# Wrapper to convert console.print() to console.log() for Rich Progress
|
||||
class ConsoleLogWrapper:
|
||||
def __init__(self, console):
|
||||
self._console = console
|
||||
def print(self, *args, **kwargs):
|
||||
# Use log() instead of print() to work with Live display
|
||||
self._console.log(*args)
|
||||
def __getattr__(self, name):
|
||||
return getattr(self._console, name)
|
||||
|
||||
try:
|
||||
if progress:
|
||||
progress.start()
|
||||
# Redirect all logging through progress.console
|
||||
logging_module.CONSOLE = progress.console
|
||||
logging_module.STDERR = progress.console
|
||||
# Wrap progress.console so print() calls become log() calls
|
||||
wrapped_console = ConsoleLogWrapper(progress.console)
|
||||
logging_module.CONSOLE = wrapped_console
|
||||
logging_module.STDERR = wrapped_console
|
||||
|
||||
# Call on_startup AFTER redirecting consoles
|
||||
self.on_startup()
|
||||
|
||||
while True:
|
||||
# Check queues and spawn workers
|
||||
@@ -302,9 +316,15 @@ class Orchestrator:
|
||||
if progress:
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
active_snapshots = Snapshot.objects.filter(status='started').iterator(chunk_size=100)
|
||||
# Get all started snapshots
|
||||
active_snapshots = list(Snapshot.objects.filter(status='started'))
|
||||
|
||||
# Track which snapshots are still active
|
||||
active_ids = set()
|
||||
|
||||
for snapshot in active_snapshots:
|
||||
active_ids.add(snapshot.id)
|
||||
|
||||
total = snapshot.archiveresult_set.count()
|
||||
if total == 0:
|
||||
continue
|
||||
@@ -316,9 +336,15 @@ class Orchestrator:
|
||||
# Create or update task
|
||||
if snapshot.id not in task_ids:
|
||||
url = snapshot.url[:60] + '...' if len(snapshot.url) > 60 else snapshot.url
|
||||
task_ids[snapshot.id] = progress.add_task(url, total=total)
|
||||
task_ids[snapshot.id] = progress.add_task(url, total=total, completed=completed)
|
||||
else:
|
||||
progress.update(task_ids[snapshot.id], completed=completed)
|
||||
|
||||
progress.update(task_ids[snapshot.id], completed=completed)
|
||||
# Remove tasks for snapshots that are no longer active
|
||||
for snapshot_id in list(task_ids.keys()):
|
||||
if snapshot_id not in active_ids:
|
||||
progress.remove_task(task_ids[snapshot_id])
|
||||
del task_ids[snapshot_id]
|
||||
|
||||
# Track idle state
|
||||
if self.has_pending_work(queue_sizes) or self.has_running_workers():
|
||||
|
||||
Reference in New Issue
Block a user