Mirror of https://github.com/ArchiveBox/ArchiveBox.git, synced 2026-01-02 17:05:38 +10:00

Commit: fix initial migrations
@@ -104,8 +104,17 @@ def add(urls: str | list[str],
if index_only:
# Just create the crawl but don't start processing
print('[yellow]\\[*] Index-only mode - crawl created but not started[/yellow]')
# Create root snapshot manually
crawl.create_root_snapshot()
# Create snapshots for all URLs in the crawl
for url in crawl.get_urls_list():
Snapshot.objects.update_or_create(
crawl=crawl, url=url,
defaults={
'status': Snapshot.INITIAL_STATE,
'retry_at': timezone.now(),
'timestamp': str(timezone.now().timestamp()),
'depth': 0,
},
)
return crawl.snapshot_set.all()

# 5. Start the orchestrator to process the queue
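The index-only branch above boils down to an idempotent upsert keyed on (crawl, url). A minimal sketch of that pattern, assuming a Django Snapshot model with the fields shown in this hunk; queue_snapshots is a hypothetical helper, not the actual ArchiveBox implementation:

    from django.utils import timezone

    def queue_snapshots(crawl, urls):
        # Snapshot is assumed importable from the app's models.
        # One queued Snapshot per URL; re-running only refreshes the defaults.
        for url in urls:
            Snapshot.objects.update_or_create(
                crawl=crawl,
                url=url,
                defaults={
                    'status': Snapshot.INITIAL_STATE,   # typically 'queued'
                    'retry_at': timezone.now(),         # immediately eligible for a worker
                    'timestamp': str(timezone.now().timestamp()),
                    'depth': 0,
                },
            )
        return crawl.snapshot_set.all()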
@@ -577,17 +577,20 @@ def live_progress_view(request):
active_crawls = []
for crawl in active_crawls_qs:
# Get active snapshots for this crawl - filter in Python since we prefetched all
crawl_snapshots = [
s for s in crawl.snapshot_set.all()
if s.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
][:5] # Limit to 5 most recent
# Get ALL snapshots for this crawl to count status (already prefetched)
all_crawl_snapshots = list(crawl.snapshot_set.all())

# Count snapshots by status (in memory, not DB)
total_snapshots = Snapshot.objects.filter(crawl=crawl).count() # Full count needs DB
completed_snapshots = sum(1 for s in crawl_snapshots if s.status == Snapshot.StatusChoices.SEALED)
started_snapshots = sum(1 for s in crawl_snapshots if s.status == Snapshot.StatusChoices.STARTED)
pending_snapshots = sum(1 for s in crawl_snapshots if s.status == Snapshot.StatusChoices.QUEUED)
# Count snapshots by status from ALL snapshots
total_snapshots = len(all_crawl_snapshots)
completed_snapshots = sum(1 for s in all_crawl_snapshots if s.status == Snapshot.StatusChoices.SEALED)
started_snapshots = sum(1 for s in all_crawl_snapshots if s.status == Snapshot.StatusChoices.STARTED)
pending_snapshots = sum(1 for s in all_crawl_snapshots if s.status == Snapshot.StatusChoices.QUEUED)

# Get only ACTIVE snapshots to display (limit to 5 most recent)
active_crawl_snapshots = [
s for s in all_crawl_snapshots
if s.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
][:5]

# Count URLs in the crawl (for when snapshots haven't been created yet)
urls_count = 0
@@ -599,7 +602,7 @@ def live_progress_view(request):

# Get active snapshots for this crawl (already prefetched)
active_snapshots_for_crawl = []
for snapshot in crawl_snapshots:
for snapshot in active_crawl_snapshots:
# Get archive results for this snapshot (already prefetched)
snapshot_results = snapshot.archiveresult_set.all()
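The hunk above replaces per-crawl COUNT queries with in-memory counting over the prefetched snapshot sets. A hedged sketch of the same idea, assuming active_crawls_qs was built with prefetch_related('snapshot_set'); names follow the hunk, this is not the exact view code:

    from collections import Counter

    for crawl in active_crawls_qs:
        # served from the prefetch cache, no extra query per crawl
        all_crawl_snapshots = list(crawl.snapshot_set.all())
        by_status = Counter(s.status for s in all_crawl_snapshots)

        total_snapshots = len(all_crawl_snapshots)
        completed_snapshots = by_status[Snapshot.StatusChoices.SEALED]
        started_snapshots = by_status[Snapshot.StatusChoices.STARTED]
        pending_snapshots = by_status[Snapshot.StatusChoices.QUEUED]

        # only queued/started snapshots are rendered, capped at 5
        active_crawl_snapshots = [
            s for s in all_crawl_snapshots
            if s.status in (Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED)
        ][:5]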
@@ -1,7 +1,10 @@
# Generated by hand on 2025-12-29
# Creates Machine, Binary, NetworkInterface, and Process tables using raw SQL

from django.db import migrations
from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone
from archivebox.uuid_compat import uuid7

class Migration(migrations.Migration):
@@ -12,9 +15,10 @@ class Migration(migrations.Migration):
]

operations = [
migrations.RunSQL(
# Forward SQL
sql="""
migrations.SeparateDatabaseAndState(
database_operations=[
migrations.RunSQL(
sql="""
-- Create machine_machine table
CREATE TABLE IF NOT EXISTS machine_machine (
id TEXT PRIMARY KEY NOT NULL,
@@ -136,12 +140,133 @@ class Migration(migrations.Migration):
CREATE INDEX IF NOT EXISTS machine_process_binary_id_idx ON machine_process(binary_id);
CREATE INDEX IF NOT EXISTS machine_process_machine_status_retry_idx ON machine_process(machine_id, status, retry_at);
""",
# Reverse SQL
reverse_sql="""
DROP TABLE IF EXISTS machine_process;
DROP TABLE IF EXISTS machine_binary;
DROP TABLE IF EXISTS machine_networkinterface;
DROP TABLE IF EXISTS machine_machine;
"""
reverse_sql="""
DROP TABLE IF EXISTS machine_process;
DROP TABLE IF EXISTS machine_binary;
DROP TABLE IF EXISTS machine_networkinterface;
DROP TABLE IF EXISTS machine_machine;
"""
),
],
state_operations=[
migrations.CreateModel(
name='Machine',
fields=[
('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
('modified_at', models.DateTimeField(auto_now=True)),
('num_uses_succeeded', models.PositiveIntegerField(default=0)),
('num_uses_failed', models.PositiveIntegerField(default=0)),
('guid', models.CharField(default=None, editable=False, max_length=64, unique=True)),
('hostname', models.CharField(default=None, max_length=63)),
('hw_in_docker', models.BooleanField(default=False)),
('hw_in_vm', models.BooleanField(default=False)),
('hw_manufacturer', models.CharField(default=None, max_length=63)),
('hw_product', models.CharField(default=None, max_length=63)),
('hw_uuid', models.CharField(default=None, max_length=255)),
('os_arch', models.CharField(default=None, max_length=15)),
('os_family', models.CharField(default=None, max_length=15)),
('os_platform', models.CharField(default=None, max_length=63)),
('os_release', models.CharField(default=None, max_length=63)),
('os_kernel', models.CharField(default=None, max_length=255)),
('stats', models.JSONField(blank=True, default=dict, null=True)),
('config', models.JSONField(blank=True, default=dict, help_text='Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)', null=True)),
],
options={
'app_label': 'machine',
},
),
migrations.CreateModel(
name='NetworkInterface',
fields=[
('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
('modified_at', models.DateTimeField(auto_now=True)),
('num_uses_succeeded', models.PositiveIntegerField(default=0)),
('num_uses_failed', models.PositiveIntegerField(default=0)),
('mac_address', models.CharField(default=None, editable=False, max_length=17)),
('ip_public', models.GenericIPAddressField(default=None, editable=False)),
('ip_local', models.GenericIPAddressField(default=None, editable=False)),
('dns_server', models.GenericIPAddressField(default=None, editable=False)),
('hostname', models.CharField(default=None, max_length=63)),
('iface', models.CharField(default=None, max_length=15)),
('isp', models.CharField(default=None, max_length=63)),
('city', models.CharField(default=None, max_length=63)),
('region', models.CharField(default=None, max_length=63)),
('country', models.CharField(default=None, max_length=63)),
('machine', models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, to='machine.machine')),
],
options={
'unique_together': {('machine', 'ip_public', 'ip_local', 'mac_address', 'dns_server')},
'app_label': 'machine',
},
),
migrations.CreateModel(
name='Binary',
fields=[
('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
('modified_at', models.DateTimeField(auto_now=True)),
('num_uses_succeeded', models.PositiveIntegerField(default=0)),
('num_uses_failed', models.PositiveIntegerField(default=0)),
('name', models.CharField(blank=True, db_index=True, default='', max_length=63)),
('binproviders', models.CharField(blank=True, default='env', help_text='Comma-separated list of allowed providers: apt,brew,pip,npm,env', max_length=127)),
('overrides', models.JSONField(blank=True, default=dict, help_text="Provider-specific overrides: {'apt': {'packages': ['pkg']}, ...}")),
('binprovider', models.CharField(blank=True, default='', help_text='Provider that successfully installed this binary', max_length=31)),
('abspath', models.CharField(blank=True, default='', max_length=255)),
('version', models.CharField(blank=True, default='', max_length=32)),
('sha256', models.CharField(blank=True, default='', max_length=64)),
('status', models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('succeeded', 'Succeeded'), ('failed', 'Failed')], db_index=True, default='queued', max_length=16)),
('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, help_text='When to retry this binary installation', null=True)),
('output_dir', models.CharField(blank=True, default='', help_text='Directory where installation hook logs are stored', max_length=255)),
('machine', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='machine.machine')),
],
options={
'verbose_name': 'Binary',
'verbose_name_plural': 'Binaries',
'unique_together': {('machine', 'name', 'abspath', 'version', 'sha256')},
'app_label': 'machine',
},
),
migrations.CreateModel(
name='Process',
fields=[
('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
('modified_at', models.DateTimeField(auto_now=True)),
('num_uses_succeeded', models.PositiveIntegerField(default=0)),
('num_uses_failed', models.PositiveIntegerField(default=0)),
('pwd', models.CharField(blank=True, default='', help_text='Working directory for process execution', max_length=512)),
('cmd', models.JSONField(blank=True, default=list, help_text='Command as array of arguments')),
('env', models.JSONField(blank=True, default=dict, help_text='Environment variables for process')),
('timeout', models.IntegerField(default=120, help_text='Timeout in seconds')),
('pid', models.IntegerField(blank=True, default=None, help_text='OS process ID', null=True)),
('exit_code', models.IntegerField(blank=True, default=None, help_text='Process exit code (0 = success)', null=True)),
('stdout', models.TextField(blank=True, default='', help_text='Standard output from process')),
('stderr', models.TextField(blank=True, default='', help_text='Standard error from process')),
('started_at', models.DateTimeField(blank=True, default=None, help_text='When process was launched', null=True)),
('ended_at', models.DateTimeField(blank=True, default=None, help_text='When process completed/terminated', null=True)),
('url', models.URLField(blank=True, default=None, help_text='Connection URL (CDP endpoint, sonic server, etc.)', max_length=2048, null=True)),
('status', models.CharField(choices=[('queued', 'Queued'), ('running', 'Running'), ('exited', 'Exited')], db_index=True, default='queued', max_length=16)),
('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, help_text='When to retry this process', null=True)),
('machine', models.ForeignKey(help_text='Machine where this process executed', on_delete=django.db.models.deletion.CASCADE, related_name='processes', to='machine.machine')),
('binary', models.ForeignKey(blank=True, help_text='Binary used by this process', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='processes', to='machine.binary')),
('iface', models.ForeignKey(blank=True, help_text='Network interface used by this process', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='processes', to='machine.networkinterface')),
],
options={
'verbose_name': 'Process',
'verbose_name_plural': 'Processes',
'app_label': 'machine',
},
),
migrations.AddIndex(
model_name='process',
index=models.Index(fields=['machine', 'status', 'retry_at'], name='machine_pro_machine_c69cf0_idx'),
),
migrations.AddIndex(
model_name='process',
index=models.Index(fields=['binary', 'exit_code'], name='machine_pro_binary__f79cc6_idx'),
),
],
),
]
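The structural fix in this migration is wrapping the raw SQL in migrations.SeparateDatabaseAndState: the SQL in database_operations still creates and drops the tables, while the CreateModel/AddIndex entries in state_operations only update Django's migration state so the ORM knows about Machine, Binary, NetworkInterface, and Process without emitting any DDL of its own. A stripped-down sketch of the pattern, with placeholder model and table names rather than the ArchiveBox ones:

    from django.db import migrations, models

    class Migration(migrations.Migration):
        dependencies = []
        operations = [
            migrations.SeparateDatabaseAndState(
                # runs against the database
                database_operations=[
                    migrations.RunSQL(
                        sql="CREATE TABLE IF NOT EXISTS example_thing (id TEXT PRIMARY KEY NOT NULL);",
                        reverse_sql="DROP TABLE IF EXISTS example_thing;",
                    ),
                ],
                # recorded only in Django's model state (no SQL emitted)
                state_operations=[
                    migrations.CreateModel(
                        name='Thing',
                        fields=[('id', models.CharField(max_length=36, primary_key=True, serialize=False))],
                        options={'db_table': 'example_thing'},
                    ),
                ],
            ),
        ]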
@@ -363,6 +363,20 @@
background: rgba(248, 81, 73, 0.25);
width: 100%;
}
#progress-monitor .extractor-badge.backoff {
color: #b8860b;
}
#progress-monitor .extractor-badge.backoff .progress-fill {
background: rgba(210, 153, 34, 0.2);
width: 30%;
}
#progress-monitor .extractor-badge.skipped {
color: #6e7681;
}
#progress-monitor .extractor-badge.skipped .progress-fill {
background: rgba(110, 118, 129, 0.15);
width: 100%;
}
#progress-monitor .extractor-badge .badge-icon {
font-size: 10px;
}
@@ -400,6 +414,14 @@
background: rgba(248, 81, 73, 0.2);
color: #f85149;
}
#progress-monitor .status-badge.backoff {
background: rgba(210, 153, 34, 0.15);
color: #b8860b;
}
#progress-monitor .status-badge.unknown {
background: #21262d;
color: #6e7681;
}

</style>
@@ -470,25 +492,28 @@
});

function formatUrl(url) {
if (!url) return '(no URL)';
try {
const u = new URL(url);
return u.hostname + u.pathname.substring(0, 30) + (u.pathname.length > 30 ? '...' : '');
} catch {
return url.substring(0, 50) + (url.length > 50 ? '...' : '');
return String(url).substring(0, 50) + (String(url).length > 50 ? '...' : '');
}
}

function renderExtractor(extractor) {
const icon = extractor.status === 'started' ? '↻' :
extractor.status === 'succeeded' ? '✓' :
extractor.status === 'failed' ? '✗' : '○';
extractor.status === 'failed' ? '✗' :
extractor.status === 'backoff' ? '⌛' :
extractor.status === 'skipped' ? '⇢' : '○';

return `
<span class="extractor-badge ${extractor.status}">
<span class="extractor-badge ${extractor.status || 'queued'}">
<span class="progress-fill"></span>
<span class="badge-content">
<span class="badge-icon">${icon}</span>
<span>${extractor.plugin}</span>
<span>${extractor.plugin || 'unknown'}</span>
</span>
</span>
`;
@@ -496,13 +521,13 @@

function renderSnapshot(snapshot, crawlId) {
const statusIcon = snapshot.status === 'started' ? '↻' : '📄';
const adminUrl = `/admin/core/snapshot/${snapshot.id}/change/`;
const adminUrl = `/admin/core/snapshot/${snapshot.id || 'unknown'}/change/`;

let extractorHtml = '';
if (snapshot.all_plugins && snapshot.all_plugins.length > 0) {
// Sort plugins alphabetically by name to prevent reordering on updates
const sortedExtractors = [...snapshot.all_plugins].sort((a, b) =>
a.plugin.localeCompare(b.plugin)
(a.plugin || '').localeCompare(b.plugin || '')
);
extractorHtml = `
<div class="extractor-list">
@@ -518,16 +543,17 @@
<div class="snapshot-info">
<div class="snapshot-url">${formatUrl(snapshot.url)}</div>
<div class="snapshot-meta">
${snapshot.completed_plugins}/${snapshot.total_plugins} extractors
${snapshot.failed_plugins > 0 ? `<span style="color:#f85149">(${snapshot.failed_plugins} failed)</span>` : ''}
${(snapshot.total_plugins || 0) > 0
? `${snapshot.completed_plugins || 0}/${snapshot.total_plugins || 0} extractors${(snapshot.failed_plugins || 0) > 0 ? ` <span style="color:#f85149">(${snapshot.failed_plugins} failed)</span>` : ''}`
: 'Waiting for extractors...'}
</div>
</div>
<span class="status-badge ${snapshot.status}">${snapshot.status}</span>
<span class="status-badge ${snapshot.status || 'unknown'}">${snapshot.status || 'unknown'}</span>
</a>
<div class="snapshot-progress">
<div class="progress-bar-container">
<div class="progress-bar snapshot ${snapshot.status === 'started' && snapshot.progress === 0 ? 'indeterminate' : ''}"
style="width: ${snapshot.progress}%"></div>
<div class="progress-bar snapshot ${snapshot.status === 'started' && (snapshot.progress || 0) === 0 ? 'indeterminate' : ''}"
style="width: ${snapshot.progress || 0}%"></div>
</div>
</div>
${extractorHtml}
@@ -537,7 +563,7 @@

function renderCrawl(crawl) {
const statusIcon = crawl.status === 'started' ? '↻' : '🔍';
const adminUrl = `/admin/crawls/crawl/${crawl.id}/change/`;
const adminUrl = `/admin/crawls/crawl/${crawl.id || 'unknown'}/change/`;

let snapshotsHtml = '';
if (crawl.active_snapshots && crawl.active_snapshots.length > 0) {
@@ -556,7 +582,7 @@
// Queued but retry_at is in future (was claimed by worker, will retry)
warningHtml = `
<div style="padding: 8px 14px; background: rgba(88, 166, 255, 0.1); border-top: 1px solid #58a6ff; color: #58a6ff; font-size: 11px;">
🔄 Retrying in ${crawl.seconds_until_retry}s...${crawl.urls_preview ? ` (${crawl.urls_preview})` : ''}
🔄 Retrying in ${crawl.seconds_until_retry || 0}s...${crawl.urls_preview ? ` (${crawl.urls_preview})` : ''}
</div>
`;
} else if (crawl.status === 'queued' && crawl.total_snapshots === 0) {
@@ -569,34 +595,34 @@
}

// Show snapshot info or URL count if no snapshots yet
let metaText = `depth: ${crawl.max_depth}`;
if (crawl.total_snapshots > 0) {
let metaText = `depth: ${crawl.max_depth || 0}`;
if ((crawl.total_snapshots || 0) > 0) {
metaText += ` | ${crawl.total_snapshots} snapshots`;
} else if (crawl.urls_count > 0) {
} else if ((crawl.urls_count || 0) > 0) {
metaText += ` | ${crawl.urls_count} URLs`;
} else if (crawl.urls_preview) {
metaText += ` | ${crawl.urls_preview.substring(0, 40)}${crawl.urls_preview.length > 40 ? '...' : ''}`;
}

return `
<div class="crawl-item" data-crawl-id="${crawl.id}">
<div class="crawl-item" data-crawl-id="${crawl.id || 'unknown'}">
<a class="crawl-header" href="${adminUrl}">
<span class="crawl-icon">${statusIcon}</span>
<div class="crawl-info">
<div class="crawl-label">${crawl.label}</div>
<div class="crawl-label">${crawl.label || '(no label)'}</div>
<div class="crawl-meta">${metaText}</div>
</div>
<div class="crawl-stats">
<span style="color:#3fb950">${crawl.completed_snapshots} done</span>
<span style="color:#3fb950">${crawl.completed_snapshots || 0} done</span>
<span style="color:#d29922">${crawl.started_snapshots || 0} active</span>
<span style="color:#8b949e">${crawl.pending_snapshots} pending</span>
<span style="color:#8b949e">${crawl.pending_snapshots || 0} pending</span>
</div>
<span class="status-badge ${crawl.status}">${crawl.status}</span>
<span class="status-badge ${crawl.status || 'unknown'}">${crawl.status || 'unknown'}</span>
</a>
<div class="crawl-progress">
<div class="progress-bar-container">
<div class="progress-bar crawl ${crawl.status === 'started' && crawl.progress === 0 ? 'indeterminate' : ''}"
style="width: ${crawl.progress}%"></div>
<div class="progress-bar crawl ${crawl.status === 'started' && (crawl.progress || 0) === 0 ? 'indeterminate' : ''}"
style="width: ${crawl.progress || 0}%"></div>
</div>
</div>
${warningHtml}
@@ -668,7 +694,7 @@
idleMessage.style.display = 'none';
crawlTree.innerHTML = `
<div class="idle-message">
${data.snapshots_started} snapshots processing, ${data.archiveresults_started} extractors running
${data.snapshots_started || 0} snapshots processing, ${data.archiveresults_started || 0} extractors running
</div>
`;
} else {
@@ -676,7 +702,7 @@
// Build the URL for recent crawls (last 24 hours)
var yesterday = new Date(Date.now() - 24*60*60*1000).toISOString().split('T')[0];
var recentUrl = '/admin/crawls/crawl/?created_at__gte=' + yesterday + '&o=-1';
idleMessage.innerHTML = `No active crawls (${data.crawls_pending} pending, ${data.crawls_started} started, <a href="${recentUrl}" style="color: #58a6ff;">${data.crawls_recent} recent</a>)`;
idleMessage.innerHTML = `No active crawls (${data.crawls_pending || 0} pending, ${data.crawls_started || 0} started, <a href="${recentUrl}" style="color: #58a6ff;">${data.crawls_recent || 0} recent</a>)`;
crawlTree.innerHTML = '';
}
}
@@ -91,7 +91,11 @@ def test_add_multiple_urls_single_command(tmp_path, process, disable_extractors_
def test_add_from_file(tmp_path, process, disable_extractors_dict):
"""Test adding URLs from a file."""
"""Test adding URLs from a file.

With --index-only, this creates a snapshot for the file itself, not the URLs inside.
To get snapshots for the URLs inside, you need to run without --index-only so parsers run.
"""
os.chdir(tmp_path)

# Create a file with URLs
@@ -108,10 +112,13 @@ def test_add_from_file(tmp_path, process, disable_extractors_dict):

conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
crawl_count = c.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0]
snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
conn.close()

assert snapshot_count == 2
# With --index-only, creates 1 snapshot for the file itself
assert crawl_count == 1
assert snapshot_count == 1

def test_add_with_depth_0_flag(tmp_path, process, disable_extractors_dict):
@@ -141,7 +148,11 @@ def test_add_with_depth_1_flag(tmp_path, process, disable_extractors_dict):

def test_add_with_tags(tmp_path, process, disable_extractors_dict):
"""Test adding URL with tags creates tag records."""
"""Test adding URL with tags stores tags_str in crawl.

With --index-only, Tag objects are not created until archiving happens.
Tags are stored as a string in the Crawl.tags_str field.
"""
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', '--tag=test,example', 'https://example.com'],
@@ -151,15 +162,19 @@ def test_add_with_tags(tmp_path, process, disable_extractors_dict):

conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
tags = c.execute("SELECT name FROM core_tag").fetchall()
tags_str = c.execute("SELECT tags_str FROM crawls_crawl").fetchone()[0]
conn.close()

tag_names = [t[0] for t in tags]
assert 'test' in tag_names or 'example' in tag_names
# Tags are stored as a comma-separated string in crawl
assert 'test' in tags_str or 'example' in tags_str

def test_add_duplicate_url_updates_existing(tmp_path, process, disable_extractors_dict):
"""Test that adding the same URL twice updates rather than duplicates."""
def test_add_duplicate_url_creates_separate_crawls(tmp_path, process, disable_extractors_dict):
"""Test that adding the same URL twice creates separate crawls and snapshots.

Each 'add' command creates a new Crawl. Multiple crawls can archive the same URL.
This allows re-archiving URLs at different times.
"""
os.chdir(tmp_path)

# Add URL first time
@@ -179,10 +194,12 @@ def test_add_duplicate_url_updates_existing(tmp_path, process, disable_extractor
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot WHERE url='https://example.com'").fetchone()[0]
crawl_count = c.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0]
conn.close()

# Should still only have one snapshot for this URL
assert snapshot_count == 1
# Each add creates a new crawl with its own snapshot
assert crawl_count == 2
assert snapshot_count == 2

def test_add_with_overwrite_flag(tmp_path, process, disable_extractors_dict):
@@ -208,7 +225,10 @@ def test_add_with_overwrite_flag(tmp_path, process, disable_extractors_dict):

def test_add_creates_archive_subdirectory(tmp_path, process, disable_extractors_dict):
"""Test that add creates archive subdirectory for the snapshot."""
"""Test that add creates archive subdirectory for the snapshot.

Archive subdirectories are named by timestamp, not by snapshot ID.
"""
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
@@ -216,14 +236,14 @@ def test_add_creates_archive_subdirectory(tmp_path, process, disable_extractors_
env=disable_extractors_dict,
)

# Get the snapshot ID from the database
# Get the snapshot timestamp from the database
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
snapshot_id = c.execute("SELECT id FROM core_snapshot").fetchone()[0]
timestamp = c.execute("SELECT timestamp FROM core_snapshot").fetchone()[0]
conn.close()

# Check that archive subdirectory was created
archive_dir = tmp_path / "archive" / snapshot_id
# Check that archive subdirectory was created using timestamp
archive_dir = tmp_path / "archive" / str(timestamp)
assert archive_dir.exists()
assert archive_dir.is_dir()
@@ -1,7 +1,7 @@
#!/usr/bin/env python3
"""
Comprehensive tests for archivebox update command.
Verify update re-archives snapshots and updates DB status.
Verify update drains old dirs, reconciles DB, and queues snapshots.
"""

import os

@@ -15,7 +15,7 @@ def test_update_runs_successfully_on_empty_archive(tmp_path, process):
"""Test that update runs without error on empty archive."""
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'update', '--index-only'],
['archivebox', 'update'],
capture_output=True,
text=True,
timeout=30,
@@ -25,41 +25,21 @@
assert result.returncode == 0

def test_update_re_archives_existing_snapshots(tmp_path, process, disable_extractors_dict):
"""Test that update command re-archives existing snapshots."""
def test_update_reconciles_existing_snapshots(tmp_path, process, disable_extractors_dict):
"""Test that update command reconciles existing snapshots."""
os.chdir(tmp_path)

# Add a snapshot
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
)

# Run update
result = subprocess.run(
['archivebox', 'update', '--index-only'],
['archivebox', 'add', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
)

assert result.returncode == 0

def test_update_index_only_flag(tmp_path, process, disable_extractors_dict):
"""Test that --index-only flag skips extraction."""
os.chdir(tmp_path)

subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
)

# Update with index-only should be fast
# Run update - should reconcile and queue
result = subprocess.run(
['archivebox', 'update', '--index-only'],
['archivebox', 'update'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
@@ -74,26 +54,28 @@ def test_update_specific_snapshot_by_filter(tmp_path, process, disable_extractor

# Add multiple snapshots
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
['archivebox', 'add', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
)
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.org'],
capture_output=True,
env=disable_extractors_dict,
)

# Update with filter
result = subprocess.run(
['archivebox', 'update', '--index-only', '--filter-type=search', '--filter=example.com'],
['archivebox', 'add', '--depth=0', 'https://example.org'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
)

# Should complete (may succeed or show usage)
assert result.returncode in [0, 1, 2]
# Update with filter pattern (uses filter_patterns argument)
result = subprocess.run(
['archivebox', 'update', '--filter-type=substring', 'example.com'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
)

# Should complete successfully
assert result.returncode == 0

def test_update_preserves_snapshot_count(tmp_path, process, disable_extractors_dict):
@@ -102,9 +84,10 @@ def test_update_preserves_snapshot_count(tmp_path, process, disable_extractors_d

# Add snapshots
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
['archivebox', 'add', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
)

# Count before update
@@ -115,9 +98,9 @@ def test_update_preserves_snapshot_count(tmp_path, process, disable_extractors_d

assert count_before == 1

# Run update
# Run update (should reconcile + queue, not create new snapshots)
subprocess.run(
['archivebox', 'update', '--index-only'],
['archivebox', 'update'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
@@ -133,21 +116,31 @@ def test_update_preserves_snapshot_count(tmp_path, process, disable_extractors_d

assert count_after == count_before

def test_update_with_overwrite_flag(tmp_path, process, disable_extractors_dict):
"""Test update with --overwrite flag forces re-archiving."""
def test_update_queues_snapshots_for_archiving(tmp_path, process, disable_extractors_dict):
"""Test that update queues snapshots for archiving."""
os.chdir(tmp_path)

subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
['archivebox', 'add', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
)

# Run update
result = subprocess.run(
['archivebox', 'update', '--index-only', '--overwrite'],
['archivebox', 'update'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
)

assert result.returncode == 0

# Check that snapshot is queued
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
status = c.execute("SELECT status FROM core_snapshot").fetchone()[0]
conn.close()

assert status == 'queued'