fix initial migrations

This commit is contained in:
Nick Sweeting
2025-12-29 21:27:31 -08:00
parent 3dd329600e
commit 2e350d317d
6 changed files with 285 additions and 109 deletions

View File

@@ -104,8 +104,17 @@ def add(urls: str | list[str],
if index_only:
# Just create the crawl but don't start processing
print('[yellow]\\[*] Index-only mode - crawl created but not started[/yellow]')
# Create root snapshot manually
crawl.create_root_snapshot()
# Create snapshots for all URLs in the crawl
for url in crawl.get_urls_list():
Snapshot.objects.update_or_create(
crawl=crawl, url=url,
defaults={
'status': Snapshot.INITIAL_STATE,
'retry_at': timezone.now(),
'timestamp': str(timezone.now().timestamp()),
'depth': 0,
},
)
return crawl.snapshot_set.all()
# 5. Start the orchestrator to process the queue

View File

@@ -577,17 +577,20 @@ def live_progress_view(request):
active_crawls = []
for crawl in active_crawls_qs:
# Get active snapshots for this crawl - filter in Python since we prefetched all
crawl_snapshots = [
s for s in crawl.snapshot_set.all()
if s.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
][:5] # Limit to 5 most recent
# Get ALL snapshots for this crawl to count status (already prefetched)
all_crawl_snapshots = list(crawl.snapshot_set.all())
# Count snapshots by status (in memory, not DB)
total_snapshots = Snapshot.objects.filter(crawl=crawl).count() # Full count needs DB
completed_snapshots = sum(1 for s in crawl_snapshots if s.status == Snapshot.StatusChoices.SEALED)
started_snapshots = sum(1 for s in crawl_snapshots if s.status == Snapshot.StatusChoices.STARTED)
pending_snapshots = sum(1 for s in crawl_snapshots if s.status == Snapshot.StatusChoices.QUEUED)
# Count snapshots by status from ALL snapshots
total_snapshots = len(all_crawl_snapshots)
completed_snapshots = sum(1 for s in all_crawl_snapshots if s.status == Snapshot.StatusChoices.SEALED)
started_snapshots = sum(1 for s in all_crawl_snapshots if s.status == Snapshot.StatusChoices.STARTED)
pending_snapshots = sum(1 for s in all_crawl_snapshots if s.status == Snapshot.StatusChoices.QUEUED)
# Get only ACTIVE snapshots to display (limit to 5 most recent)
active_crawl_snapshots = [
s for s in all_crawl_snapshots
if s.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
][:5]
# Count URLs in the crawl (for when snapshots haven't been created yet)
urls_count = 0
@@ -599,7 +602,7 @@ def live_progress_view(request):
# Get active snapshots for this crawl (already prefetched)
active_snapshots_for_crawl = []
for snapshot in crawl_snapshots:
for snapshot in active_crawl_snapshots:
# Get archive results for this snapshot (already prefetched)
snapshot_results = snapshot.archiveresult_set.all()

View File

@@ -1,7 +1,10 @@
# Generated by hand on 2025-12-29
# Creates Machine, Binary, NetworkInterface, and Process tables using raw SQL
from django.db import migrations
from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone
from archivebox.uuid_compat import uuid7
class Migration(migrations.Migration):
@@ -12,9 +15,10 @@ class Migration(migrations.Migration):
]
operations = [
migrations.RunSQL(
# Forward SQL
sql="""
migrations.SeparateDatabaseAndState(
database_operations=[
migrations.RunSQL(
sql="""
-- Create machine_machine table
CREATE TABLE IF NOT EXISTS machine_machine (
id TEXT PRIMARY KEY NOT NULL,
@@ -136,12 +140,133 @@ class Migration(migrations.Migration):
CREATE INDEX IF NOT EXISTS machine_process_binary_id_idx ON machine_process(binary_id);
CREATE INDEX IF NOT EXISTS machine_process_machine_status_retry_idx ON machine_process(machine_id, status, retry_at);
""",
# Reverse SQL
reverse_sql="""
DROP TABLE IF EXISTS machine_process;
DROP TABLE IF EXISTS machine_binary;
DROP TABLE IF EXISTS machine_networkinterface;
DROP TABLE IF EXISTS machine_machine;
"""
reverse_sql="""
DROP TABLE IF EXISTS machine_process;
DROP TABLE IF EXISTS machine_binary;
DROP TABLE IF EXISTS machine_networkinterface;
DROP TABLE IF EXISTS machine_machine;
"""
),
],
state_operations=[
migrations.CreateModel(
name='Machine',
fields=[
('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
('modified_at', models.DateTimeField(auto_now=True)),
('num_uses_succeeded', models.PositiveIntegerField(default=0)),
('num_uses_failed', models.PositiveIntegerField(default=0)),
('guid', models.CharField(default=None, editable=False, max_length=64, unique=True)),
('hostname', models.CharField(default=None, max_length=63)),
('hw_in_docker', models.BooleanField(default=False)),
('hw_in_vm', models.BooleanField(default=False)),
('hw_manufacturer', models.CharField(default=None, max_length=63)),
('hw_product', models.CharField(default=None, max_length=63)),
('hw_uuid', models.CharField(default=None, max_length=255)),
('os_arch', models.CharField(default=None, max_length=15)),
('os_family', models.CharField(default=None, max_length=15)),
('os_platform', models.CharField(default=None, max_length=63)),
('os_release', models.CharField(default=None, max_length=63)),
('os_kernel', models.CharField(default=None, max_length=255)),
('stats', models.JSONField(blank=True, default=dict, null=True)),
('config', models.JSONField(blank=True, default=dict, help_text='Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)', null=True)),
],
options={
'app_label': 'machine',
},
),
migrations.CreateModel(
name='NetworkInterface',
fields=[
('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
('modified_at', models.DateTimeField(auto_now=True)),
('num_uses_succeeded', models.PositiveIntegerField(default=0)),
('num_uses_failed', models.PositiveIntegerField(default=0)),
('mac_address', models.CharField(default=None, editable=False, max_length=17)),
('ip_public', models.GenericIPAddressField(default=None, editable=False)),
('ip_local', models.GenericIPAddressField(default=None, editable=False)),
('dns_server', models.GenericIPAddressField(default=None, editable=False)),
('hostname', models.CharField(default=None, max_length=63)),
('iface', models.CharField(default=None, max_length=15)),
('isp', models.CharField(default=None, max_length=63)),
('city', models.CharField(default=None, max_length=63)),
('region', models.CharField(default=None, max_length=63)),
('country', models.CharField(default=None, max_length=63)),
('machine', models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, to='machine.machine')),
],
options={
'unique_together': {('machine', 'ip_public', 'ip_local', 'mac_address', 'dns_server')},
'app_label': 'machine',
},
),
migrations.CreateModel(
name='Binary',
fields=[
('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
('modified_at', models.DateTimeField(auto_now=True)),
('num_uses_succeeded', models.PositiveIntegerField(default=0)),
('num_uses_failed', models.PositiveIntegerField(default=0)),
('name', models.CharField(blank=True, db_index=True, default='', max_length=63)),
('binproviders', models.CharField(blank=True, default='env', help_text='Comma-separated list of allowed providers: apt,brew,pip,npm,env', max_length=127)),
('overrides', models.JSONField(blank=True, default=dict, help_text="Provider-specific overrides: {'apt': {'packages': ['pkg']}, ...}")),
('binprovider', models.CharField(blank=True, default='', help_text='Provider that successfully installed this binary', max_length=31)),
('abspath', models.CharField(blank=True, default='', max_length=255)),
('version', models.CharField(blank=True, default='', max_length=32)),
('sha256', models.CharField(blank=True, default='', max_length=64)),
('status', models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('succeeded', 'Succeeded'), ('failed', 'Failed')], db_index=True, default='queued', max_length=16)),
('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, help_text='When to retry this binary installation', null=True)),
('output_dir', models.CharField(blank=True, default='', help_text='Directory where installation hook logs are stored', max_length=255)),
('machine', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='machine.machine')),
],
options={
'verbose_name': 'Binary',
'verbose_name_plural': 'Binaries',
'unique_together': {('machine', 'name', 'abspath', 'version', 'sha256')},
'app_label': 'machine',
},
),
migrations.CreateModel(
name='Process',
fields=[
('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
('modified_at', models.DateTimeField(auto_now=True)),
('num_uses_succeeded', models.PositiveIntegerField(default=0)),
('num_uses_failed', models.PositiveIntegerField(default=0)),
('pwd', models.CharField(blank=True, default='', help_text='Working directory for process execution', max_length=512)),
('cmd', models.JSONField(blank=True, default=list, help_text='Command as array of arguments')),
('env', models.JSONField(blank=True, default=dict, help_text='Environment variables for process')),
('timeout', models.IntegerField(default=120, help_text='Timeout in seconds')),
('pid', models.IntegerField(blank=True, default=None, help_text='OS process ID', null=True)),
('exit_code', models.IntegerField(blank=True, default=None, help_text='Process exit code (0 = success)', null=True)),
('stdout', models.TextField(blank=True, default='', help_text='Standard output from process')),
('stderr', models.TextField(blank=True, default='', help_text='Standard error from process')),
('started_at', models.DateTimeField(blank=True, default=None, help_text='When process was launched', null=True)),
('ended_at', models.DateTimeField(blank=True, default=None, help_text='When process completed/terminated', null=True)),
('url', models.URLField(blank=True, default=None, help_text='Connection URL (CDP endpoint, sonic server, etc.)', max_length=2048, null=True)),
('status', models.CharField(choices=[('queued', 'Queued'), ('running', 'Running'), ('exited', 'Exited')], db_index=True, default='queued', max_length=16)),
('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, help_text='When to retry this process', null=True)),
('machine', models.ForeignKey(help_text='Machine where this process executed', on_delete=django.db.models.deletion.CASCADE, related_name='processes', to='machine.machine')),
('binary', models.ForeignKey(blank=True, help_text='Binary used by this process', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='processes', to='machine.binary')),
('iface', models.ForeignKey(blank=True, help_text='Network interface used by this process', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='processes', to='machine.networkinterface')),
],
options={
'verbose_name': 'Process',
'verbose_name_plural': 'Processes',
'app_label': 'machine',
},
),
migrations.AddIndex(
model_name='process',
index=models.Index(fields=['machine', 'status', 'retry_at'], name='machine_pro_machine_c69cf0_idx'),
),
migrations.AddIndex(
model_name='process',
index=models.Index(fields=['binary', 'exit_code'], name='machine_pro_binary__f79cc6_idx'),
),
],
),
]

View File

@@ -363,6 +363,20 @@
background: rgba(248, 81, 73, 0.25);
width: 100%;
}
#progress-monitor .extractor-badge.backoff {
color: #b8860b;
}
#progress-monitor .extractor-badge.backoff .progress-fill {
background: rgba(210, 153, 34, 0.2);
width: 30%;
}
#progress-monitor .extractor-badge.skipped {
color: #6e7681;
}
#progress-monitor .extractor-badge.skipped .progress-fill {
background: rgba(110, 118, 129, 0.15);
width: 100%;
}
#progress-monitor .extractor-badge .badge-icon {
font-size: 10px;
}
@@ -400,6 +414,14 @@
background: rgba(248, 81, 73, 0.2);
color: #f85149;
}
#progress-monitor .status-badge.backoff {
background: rgba(210, 153, 34, 0.15);
color: #b8860b;
}
#progress-monitor .status-badge.unknown {
background: #21262d;
color: #6e7681;
}
</style>
@@ -470,25 +492,28 @@
});
function formatUrl(url) {
if (!url) return '(no URL)';
try {
const u = new URL(url);
return u.hostname + u.pathname.substring(0, 30) + (u.pathname.length > 30 ? '...' : '');
} catch {
return url.substring(0, 50) + (url.length > 50 ? '...' : '');
return String(url).substring(0, 50) + (String(url).length > 50 ? '...' : '');
}
}
function renderExtractor(extractor) {
const icon = extractor.status === 'started' ? '&#8635;' :
extractor.status === 'succeeded' ? '&#10003;' :
extractor.status === 'failed' ? '&#10007;' : '&#9675;';
extractor.status === 'failed' ? '&#10007;' :
extractor.status === 'backoff' ? '&#8987;' :
extractor.status === 'skipped' ? '&#8674;' : '&#9675;';
return `
<span class="extractor-badge ${extractor.status}">
<span class="extractor-badge ${extractor.status || 'queued'}">
<span class="progress-fill"></span>
<span class="badge-content">
<span class="badge-icon">${icon}</span>
<span>${extractor.plugin}</span>
<span>${extractor.plugin || 'unknown'}</span>
</span>
</span>
`;
@@ -496,13 +521,13 @@
function renderSnapshot(snapshot, crawlId) {
const statusIcon = snapshot.status === 'started' ? '&#8635;' : '&#128196;';
const adminUrl = `/admin/core/snapshot/${snapshot.id}/change/`;
const adminUrl = `/admin/core/snapshot/${snapshot.id || 'unknown'}/change/`;
let extractorHtml = '';
if (snapshot.all_plugins && snapshot.all_plugins.length > 0) {
// Sort plugins alphabetically by name to prevent reordering on updates
const sortedExtractors = [...snapshot.all_plugins].sort((a, b) =>
a.plugin.localeCompare(b.plugin)
(a.plugin || '').localeCompare(b.plugin || '')
);
extractorHtml = `
<div class="extractor-list">
@@ -518,16 +543,17 @@
<div class="snapshot-info">
<div class="snapshot-url">${formatUrl(snapshot.url)}</div>
<div class="snapshot-meta">
${snapshot.completed_plugins}/${snapshot.total_plugins} extractors
${snapshot.failed_plugins > 0 ? `<span style="color:#f85149">(${snapshot.failed_plugins} failed)</span>` : ''}
${(snapshot.total_plugins || 0) > 0
? `${snapshot.completed_plugins || 0}/${snapshot.total_plugins || 0} extractors${(snapshot.failed_plugins || 0) > 0 ? ` <span style="color:#f85149">(${snapshot.failed_plugins} failed)</span>` : ''}`
: 'Waiting for extractors...'}
</div>
</div>
<span class="status-badge ${snapshot.status}">${snapshot.status}</span>
<span class="status-badge ${snapshot.status || 'unknown'}">${snapshot.status || 'unknown'}</span>
</a>
<div class="snapshot-progress">
<div class="progress-bar-container">
<div class="progress-bar snapshot ${snapshot.status === 'started' && snapshot.progress === 0 ? 'indeterminate' : ''}"
style="width: ${snapshot.progress}%"></div>
<div class="progress-bar snapshot ${snapshot.status === 'started' && (snapshot.progress || 0) === 0 ? 'indeterminate' : ''}"
style="width: ${snapshot.progress || 0}%"></div>
</div>
</div>
${extractorHtml}
@@ -537,7 +563,7 @@
function renderCrawl(crawl) {
const statusIcon = crawl.status === 'started' ? '&#8635;' : '&#128269;';
const adminUrl = `/admin/crawls/crawl/${crawl.id}/change/`;
const adminUrl = `/admin/crawls/crawl/${crawl.id || 'unknown'}/change/`;
let snapshotsHtml = '';
if (crawl.active_snapshots && crawl.active_snapshots.length > 0) {
@@ -556,7 +582,7 @@
// Queued but retry_at is in future (was claimed by worker, will retry)
warningHtml = `
<div style="padding: 8px 14px; background: rgba(88, 166, 255, 0.1); border-top: 1px solid #58a6ff; color: #58a6ff; font-size: 11px;">
🔄 Retrying in ${crawl.seconds_until_retry}s...${crawl.urls_preview ? ` (${crawl.urls_preview})` : ''}
🔄 Retrying in ${crawl.seconds_until_retry || 0}s...${crawl.urls_preview ? ` (${crawl.urls_preview})` : ''}
</div>
`;
} else if (crawl.status === 'queued' && crawl.total_snapshots === 0) {
@@ -569,34 +595,34 @@
}
// Show snapshot info or URL count if no snapshots yet
let metaText = `depth: ${crawl.max_depth}`;
if (crawl.total_snapshots > 0) {
let metaText = `depth: ${crawl.max_depth || 0}`;
if ((crawl.total_snapshots || 0) > 0) {
metaText += ` | ${crawl.total_snapshots} snapshots`;
} else if (crawl.urls_count > 0) {
} else if ((crawl.urls_count || 0) > 0) {
metaText += ` | ${crawl.urls_count} URLs`;
} else if (crawl.urls_preview) {
metaText += ` | ${crawl.urls_preview.substring(0, 40)}${crawl.urls_preview.length > 40 ? '...' : ''}`;
}
return `
<div class="crawl-item" data-crawl-id="${crawl.id}">
<div class="crawl-item" data-crawl-id="${crawl.id || 'unknown'}">
<a class="crawl-header" href="${adminUrl}">
<span class="crawl-icon">${statusIcon}</span>
<div class="crawl-info">
<div class="crawl-label">${crawl.label}</div>
<div class="crawl-label">${crawl.label || '(no label)'}</div>
<div class="crawl-meta">${metaText}</div>
</div>
<div class="crawl-stats">
<span style="color:#3fb950">${crawl.completed_snapshots} done</span>
<span style="color:#3fb950">${crawl.completed_snapshots || 0} done</span>
<span style="color:#d29922">${crawl.started_snapshots || 0} active</span>
<span style="color:#8b949e">${crawl.pending_snapshots} pending</span>
<span style="color:#8b949e">${crawl.pending_snapshots || 0} pending</span>
</div>
<span class="status-badge ${crawl.status}">${crawl.status}</span>
<span class="status-badge ${crawl.status || 'unknown'}">${crawl.status || 'unknown'}</span>
</a>
<div class="crawl-progress">
<div class="progress-bar-container">
<div class="progress-bar crawl ${crawl.status === 'started' && crawl.progress === 0 ? 'indeterminate' : ''}"
style="width: ${crawl.progress}%"></div>
<div class="progress-bar crawl ${crawl.status === 'started' && (crawl.progress || 0) === 0 ? 'indeterminate' : ''}"
style="width: ${crawl.progress || 0}%"></div>
</div>
</div>
${warningHtml}
@@ -668,7 +694,7 @@
idleMessage.style.display = 'none';
crawlTree.innerHTML = `
<div class="idle-message">
${data.snapshots_started} snapshots processing, ${data.archiveresults_started} extractors running
${data.snapshots_started || 0} snapshots processing, ${data.archiveresults_started || 0} extractors running
</div>
`;
} else {
@@ -676,7 +702,7 @@
// Build the URL for recent crawls (last 24 hours)
var yesterday = new Date(Date.now() - 24*60*60*1000).toISOString().split('T')[0];
var recentUrl = '/admin/crawls/crawl/?created_at__gte=' + yesterday + '&o=-1';
idleMessage.innerHTML = `No active crawls (${data.crawls_pending} pending, ${data.crawls_started} started, <a href="${recentUrl}" style="color: #58a6ff;">${data.crawls_recent} recent</a>)`;
idleMessage.innerHTML = `No active crawls (${data.crawls_pending || 0} pending, ${data.crawls_started || 0} started, <a href="${recentUrl}" style="color: #58a6ff;">${data.crawls_recent || 0} recent</a>)`;
crawlTree.innerHTML = '';
}
}