fix initial migrations

Nick Sweeting
2025-12-29 21:27:31 -08:00
parent 3dd329600e
commit 2e350d317d
6 changed files with 285 additions and 109 deletions

View File

@@ -104,8 +104,17 @@ def add(urls: str | list[str],
if index_only:
# Just create the crawl but don't start processing
print('[yellow]\\[*] Index-only mode - crawl created but not started[/yellow]')
# Create root snapshot manually
crawl.create_root_snapshot()
# Create snapshots for all URLs in the crawl
for url in crawl.get_urls_list():
Snapshot.objects.update_or_create(
crawl=crawl, url=url,
defaults={
'status': Snapshot.INITIAL_STATE,
'retry_at': timezone.now(),
'timestamp': str(timezone.now().timestamp()),
'depth': 0,
},
)
return crawl.snapshot_set.all()
# 5. Start the orchestrator to process the queue

View File

@@ -577,17 +577,20 @@ def live_progress_view(request):
active_crawls = []
for crawl in active_crawls_qs:
# Get active snapshots for this crawl - filter in Python since we prefetched all
crawl_snapshots = [
s for s in crawl.snapshot_set.all()
if s.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
][:5] # Limit to 5 most recent
# Get ALL snapshots for this crawl to count status (already prefetched)
all_crawl_snapshots = list(crawl.snapshot_set.all())
# Count snapshots by status (in memory, not DB)
total_snapshots = Snapshot.objects.filter(crawl=crawl).count() # Full count needs DB
completed_snapshots = sum(1 for s in crawl_snapshots if s.status == Snapshot.StatusChoices.SEALED)
started_snapshots = sum(1 for s in crawl_snapshots if s.status == Snapshot.StatusChoices.STARTED)
pending_snapshots = sum(1 for s in crawl_snapshots if s.status == Snapshot.StatusChoices.QUEUED)
# Count snapshots by status from ALL snapshots
total_snapshots = len(all_crawl_snapshots)
completed_snapshots = sum(1 for s in all_crawl_snapshots if s.status == Snapshot.StatusChoices.SEALED)
started_snapshots = sum(1 for s in all_crawl_snapshots if s.status == Snapshot.StatusChoices.STARTED)
pending_snapshots = sum(1 for s in all_crawl_snapshots if s.status == Snapshot.StatusChoices.QUEUED)
# Get only ACTIVE snapshots to display (limit to 5 most recent)
active_crawl_snapshots = [
s for s in all_crawl_snapshots
if s.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
][:5]
# Count URLs in the crawl (for when snapshots haven't been created yet)
urls_count = 0
@@ -599,7 +602,7 @@ def live_progress_view(request):
# Get active snapshots for this crawl (already prefetched)
active_snapshots_for_crawl = []
for snapshot in crawl_snapshots:
for snapshot in active_crawl_snapshots:
# Get archive results for this snapshot (already prefetched)
snapshot_results = snapshot.archiveresult_set.all()
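The comments above say the snapshots are "already prefetched"; the in-memory counting rewrite only pays off if active_crawls_qs really prefetches both relations the loop walks. A sketch of that assumed queryset (Crawl.StatusChoices named by analogy with Snapshot.StatusChoices, not shown in this commit):

    # Assumed shape of active_crawls_qs: three queries total regardless of crawl
    # count, after which crawl.snapshot_set.all() and snapshot.archiveresult_set.all()
    # are served from the prefetch cache with no further DB hits.
    active_crawls_qs = (
        Crawl.objects
        .filter(status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED])
        .prefetch_related('snapshot_set', 'snapshot_set__archiveresult_set')
    )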

View File

@@ -1,7 +1,10 @@
# Generated by hand on 2025-12-29
# Creates Machine, Binary, NetworkInterface, and Process tables using raw SQL
from django.db import migrations
from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone
from archivebox.uuid_compat import uuid7
class Migration(migrations.Migration):
@@ -12,9 +15,10 @@ class Migration(migrations.Migration):
]
operations = [
migrations.RunSQL(
# Forward SQL
sql="""
migrations.SeparateDatabaseAndState(
database_operations=[
migrations.RunSQL(
sql="""
-- Create machine_machine table
CREATE TABLE IF NOT EXISTS machine_machine (
id TEXT PRIMARY KEY NOT NULL,
@@ -136,12 +140,133 @@ class Migration(migrations.Migration):
CREATE INDEX IF NOT EXISTS machine_process_binary_id_idx ON machine_process(binary_id);
CREATE INDEX IF NOT EXISTS machine_process_machine_status_retry_idx ON machine_process(machine_id, status, retry_at);
""",
# Reverse SQL
reverse_sql="""
DROP TABLE IF EXISTS machine_process;
DROP TABLE IF EXISTS machine_binary;
DROP TABLE IF EXISTS machine_networkinterface;
DROP TABLE IF EXISTS machine_machine;
"""
reverse_sql="""
DROP TABLE IF EXISTS machine_process;
DROP TABLE IF EXISTS machine_binary;
DROP TABLE IF EXISTS machine_networkinterface;
DROP TABLE IF EXISTS machine_machine;
"""
),
],
state_operations=[
migrations.CreateModel(
name='Machine',
fields=[
('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
('modified_at', models.DateTimeField(auto_now=True)),
('num_uses_succeeded', models.PositiveIntegerField(default=0)),
('num_uses_failed', models.PositiveIntegerField(default=0)),
('guid', models.CharField(default=None, editable=False, max_length=64, unique=True)),
('hostname', models.CharField(default=None, max_length=63)),
('hw_in_docker', models.BooleanField(default=False)),
('hw_in_vm', models.BooleanField(default=False)),
('hw_manufacturer', models.CharField(default=None, max_length=63)),
('hw_product', models.CharField(default=None, max_length=63)),
('hw_uuid', models.CharField(default=None, max_length=255)),
('os_arch', models.CharField(default=None, max_length=15)),
('os_family', models.CharField(default=None, max_length=15)),
('os_platform', models.CharField(default=None, max_length=63)),
('os_release', models.CharField(default=None, max_length=63)),
('os_kernel', models.CharField(default=None, max_length=255)),
('stats', models.JSONField(blank=True, default=dict, null=True)),
('config', models.JSONField(blank=True, default=dict, help_text='Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)', null=True)),
],
options={
'app_label': 'machine',
},
),
migrations.CreateModel(
name='NetworkInterface',
fields=[
('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
('modified_at', models.DateTimeField(auto_now=True)),
('num_uses_succeeded', models.PositiveIntegerField(default=0)),
('num_uses_failed', models.PositiveIntegerField(default=0)),
('mac_address', models.CharField(default=None, editable=False, max_length=17)),
('ip_public', models.GenericIPAddressField(default=None, editable=False)),
('ip_local', models.GenericIPAddressField(default=None, editable=False)),
('dns_server', models.GenericIPAddressField(default=None, editable=False)),
('hostname', models.CharField(default=None, max_length=63)),
('iface', models.CharField(default=None, max_length=15)),
('isp', models.CharField(default=None, max_length=63)),
('city', models.CharField(default=None, max_length=63)),
('region', models.CharField(default=None, max_length=63)),
('country', models.CharField(default=None, max_length=63)),
('machine', models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, to='machine.machine')),
],
options={
'unique_together': {('machine', 'ip_public', 'ip_local', 'mac_address', 'dns_server')},
'app_label': 'machine',
},
),
migrations.CreateModel(
name='Binary',
fields=[
('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
('modified_at', models.DateTimeField(auto_now=True)),
('num_uses_succeeded', models.PositiveIntegerField(default=0)),
('num_uses_failed', models.PositiveIntegerField(default=0)),
('name', models.CharField(blank=True, db_index=True, default='', max_length=63)),
('binproviders', models.CharField(blank=True, default='env', help_text='Comma-separated list of allowed providers: apt,brew,pip,npm,env', max_length=127)),
('overrides', models.JSONField(blank=True, default=dict, help_text="Provider-specific overrides: {'apt': {'packages': ['pkg']}, ...}")),
('binprovider', models.CharField(blank=True, default='', help_text='Provider that successfully installed this binary', max_length=31)),
('abspath', models.CharField(blank=True, default='', max_length=255)),
('version', models.CharField(blank=True, default='', max_length=32)),
('sha256', models.CharField(blank=True, default='', max_length=64)),
('status', models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('succeeded', 'Succeeded'), ('failed', 'Failed')], db_index=True, default='queued', max_length=16)),
('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, help_text='When to retry this binary installation', null=True)),
('output_dir', models.CharField(blank=True, default='', help_text='Directory where installation hook logs are stored', max_length=255)),
('machine', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='machine.machine')),
],
options={
'verbose_name': 'Binary',
'verbose_name_plural': 'Binaries',
'unique_together': {('machine', 'name', 'abspath', 'version', 'sha256')},
'app_label': 'machine',
},
),
migrations.CreateModel(
name='Process',
fields=[
('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
('modified_at', models.DateTimeField(auto_now=True)),
('num_uses_succeeded', models.PositiveIntegerField(default=0)),
('num_uses_failed', models.PositiveIntegerField(default=0)),
('pwd', models.CharField(blank=True, default='', help_text='Working directory for process execution', max_length=512)),
('cmd', models.JSONField(blank=True, default=list, help_text='Command as array of arguments')),
('env', models.JSONField(blank=True, default=dict, help_text='Environment variables for process')),
('timeout', models.IntegerField(default=120, help_text='Timeout in seconds')),
('pid', models.IntegerField(blank=True, default=None, help_text='OS process ID', null=True)),
('exit_code', models.IntegerField(blank=True, default=None, help_text='Process exit code (0 = success)', null=True)),
('stdout', models.TextField(blank=True, default='', help_text='Standard output from process')),
('stderr', models.TextField(blank=True, default='', help_text='Standard error from process')),
('started_at', models.DateTimeField(blank=True, default=None, help_text='When process was launched', null=True)),
('ended_at', models.DateTimeField(blank=True, default=None, help_text='When process completed/terminated', null=True)),
('url', models.URLField(blank=True, default=None, help_text='Connection URL (CDP endpoint, sonic server, etc.)', max_length=2048, null=True)),
('status', models.CharField(choices=[('queued', 'Queued'), ('running', 'Running'), ('exited', 'Exited')], db_index=True, default='queued', max_length=16)),
('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, help_text='When to retry this process', null=True)),
('machine', models.ForeignKey(help_text='Machine where this process executed', on_delete=django.db.models.deletion.CASCADE, related_name='processes', to='machine.machine')),
('binary', models.ForeignKey(blank=True, help_text='Binary used by this process', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='processes', to='machine.binary')),
('iface', models.ForeignKey(blank=True, help_text='Network interface used by this process', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='processes', to='machine.networkinterface')),
],
options={
'verbose_name': 'Process',
'verbose_name_plural': 'Processes',
'app_label': 'machine',
},
),
migrations.AddIndex(
model_name='process',
index=models.Index(fields=['machine', 'status', 'retry_at'], name='machine_pro_machine_c69cf0_idx'),
),
migrations.AddIndex(
model_name='process',
index=models.Index(fields=['binary', 'exit_code'], name='machine_pro_binary__f79cc6_idx'),
),
],
),
]
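The structural fix here is Django's SeparateDatabaseAndState: database_operations keeps the hand-written SQL as the source of truth for the actual schema, while state_operations records equivalent CreateModel entries so the migration autodetector diffs future changes against real models instead of an empty state. Two stock Django commands to confirm the halves agree:

    python manage.py makemigrations machine --check --dry-run   # exits non-zero if models drifted from migration state
    python manage.py sqlmigrate machine 0001                    # prints the RunSQL forward SQL for review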

View File

@@ -363,6 +363,20 @@
background: rgba(248, 81, 73, 0.25);
width: 100%;
}
#progress-monitor .extractor-badge.backoff {
color: #b8860b;
}
#progress-monitor .extractor-badge.backoff .progress-fill {
background: rgba(210, 153, 34, 0.2);
width: 30%;
}
#progress-monitor .extractor-badge.skipped {
color: #6e7681;
}
#progress-monitor .extractor-badge.skipped .progress-fill {
background: rgba(110, 118, 129, 0.15);
width: 100%;
}
#progress-monitor .extractor-badge .badge-icon {
font-size: 10px;
}
@@ -400,6 +414,14 @@
background: rgba(248, 81, 73, 0.2);
color: #f85149;
}
#progress-monitor .status-badge.backoff {
background: rgba(210, 153, 34, 0.15);
color: #b8860b;
}
#progress-monitor .status-badge.unknown {
background: #21262d;
color: #6e7681;
}
</style>
@@ -470,25 +492,28 @@
});
function formatUrl(url) {
if (!url) return '(no URL)';
try {
const u = new URL(url);
return u.hostname + u.pathname.substring(0, 30) + (u.pathname.length > 30 ? '...' : '');
} catch {
return url.substring(0, 50) + (url.length > 50 ? '...' : '');
return String(url).substring(0, 50) + (String(url).length > 50 ? '...' : '');
}
}
function renderExtractor(extractor) {
const icon = extractor.status === 'started' ? '&#8635;' :
extractor.status === 'succeeded' ? '&#10003;' :
extractor.status === 'failed' ? '&#10007;' : '&#9675;';
extractor.status === 'failed' ? '&#10007;' :
extractor.status === 'backoff' ? '&#8987;' :
extractor.status === 'skipped' ? '&#8674;' : '&#9675;';
return `
<span class="extractor-badge ${extractor.status}">
<span class="extractor-badge ${extractor.status || 'queued'}">
<span class="progress-fill"></span>
<span class="badge-content">
<span class="badge-icon">${icon}</span>
<span>${extractor.plugin}</span>
<span>${extractor.plugin || 'unknown'}</span>
</span>
</span>
`;
@@ -496,13 +521,13 @@
function renderSnapshot(snapshot, crawlId) {
const statusIcon = snapshot.status === 'started' ? '&#8635;' : '&#128196;';
const adminUrl = `/admin/core/snapshot/${snapshot.id}/change/`;
const adminUrl = `/admin/core/snapshot/${snapshot.id || 'unknown'}/change/`;
let extractorHtml = '';
if (snapshot.all_plugins && snapshot.all_plugins.length > 0) {
// Sort plugins alphabetically by name to prevent reordering on updates
const sortedExtractors = [...snapshot.all_plugins].sort((a, b) =>
a.plugin.localeCompare(b.plugin)
(a.plugin || '').localeCompare(b.plugin || '')
);
extractorHtml = `
<div class="extractor-list">
@@ -518,16 +543,17 @@
<div class="snapshot-info">
<div class="snapshot-url">${formatUrl(snapshot.url)}</div>
<div class="snapshot-meta">
${snapshot.completed_plugins}/${snapshot.total_plugins} extractors
${snapshot.failed_plugins > 0 ? `<span style="color:#f85149">(${snapshot.failed_plugins} failed)</span>` : ''}
${(snapshot.total_plugins || 0) > 0
? `${snapshot.completed_plugins || 0}/${snapshot.total_plugins || 0} extractors${(snapshot.failed_plugins || 0) > 0 ? ` <span style="color:#f85149">(${snapshot.failed_plugins} failed)</span>` : ''}`
: 'Waiting for extractors...'}
</div>
</div>
<span class="status-badge ${snapshot.status}">${snapshot.status}</span>
<span class="status-badge ${snapshot.status || 'unknown'}">${snapshot.status || 'unknown'}</span>
</a>
<div class="snapshot-progress">
<div class="progress-bar-container">
<div class="progress-bar snapshot ${snapshot.status === 'started' && snapshot.progress === 0 ? 'indeterminate' : ''}"
style="width: ${snapshot.progress}%"></div>
<div class="progress-bar snapshot ${snapshot.status === 'started' && (snapshot.progress || 0) === 0 ? 'indeterminate' : ''}"
style="width: ${snapshot.progress || 0}%"></div>
</div>
</div>
${extractorHtml}
@@ -537,7 +563,7 @@
function renderCrawl(crawl) {
const statusIcon = crawl.status === 'started' ? '&#8635;' : '&#128269;';
const adminUrl = `/admin/crawls/crawl/${crawl.id}/change/`;
const adminUrl = `/admin/crawls/crawl/${crawl.id || 'unknown'}/change/`;
let snapshotsHtml = '';
if (crawl.active_snapshots && crawl.active_snapshots.length > 0) {
@@ -556,7 +582,7 @@
// Queued but retry_at is in future (was claimed by worker, will retry)
warningHtml = `
<div style="padding: 8px 14px; background: rgba(88, 166, 255, 0.1); border-top: 1px solid #58a6ff; color: #58a6ff; font-size: 11px;">
🔄 Retrying in ${crawl.seconds_until_retry}s...${crawl.urls_preview ? ` (${crawl.urls_preview})` : ''}
🔄 Retrying in ${crawl.seconds_until_retry || 0}s...${crawl.urls_preview ? ` (${crawl.urls_preview})` : ''}
</div>
`;
} else if (crawl.status === 'queued' && crawl.total_snapshots === 0) {
@@ -569,34 +595,34 @@
}
// Show snapshot info or URL count if no snapshots yet
let metaText = `depth: ${crawl.max_depth}`;
if (crawl.total_snapshots > 0) {
let metaText = `depth: ${crawl.max_depth || 0}`;
if ((crawl.total_snapshots || 0) > 0) {
metaText += ` | ${crawl.total_snapshots} snapshots`;
} else if (crawl.urls_count > 0) {
} else if ((crawl.urls_count || 0) > 0) {
metaText += ` | ${crawl.urls_count} URLs`;
} else if (crawl.urls_preview) {
metaText += ` | ${crawl.urls_preview.substring(0, 40)}${crawl.urls_preview.length > 40 ? '...' : ''}`;
}
return `
<div class="crawl-item" data-crawl-id="${crawl.id}">
<div class="crawl-item" data-crawl-id="${crawl.id || 'unknown'}">
<a class="crawl-header" href="${adminUrl}">
<span class="crawl-icon">${statusIcon}</span>
<div class="crawl-info">
<div class="crawl-label">${crawl.label}</div>
<div class="crawl-label">${crawl.label || '(no label)'}</div>
<div class="crawl-meta">${metaText}</div>
</div>
<div class="crawl-stats">
<span style="color:#3fb950">${crawl.completed_snapshots} done</span>
<span style="color:#3fb950">${crawl.completed_snapshots || 0} done</span>
<span style="color:#d29922">${crawl.started_snapshots || 0} active</span>
<span style="color:#8b949e">${crawl.pending_snapshots} pending</span>
<span style="color:#8b949e">${crawl.pending_snapshots || 0} pending</span>
</div>
<span class="status-badge ${crawl.status}">${crawl.status}</span>
<span class="status-badge ${crawl.status || 'unknown'}">${crawl.status || 'unknown'}</span>
</a>
<div class="crawl-progress">
<div class="progress-bar-container">
<div class="progress-bar crawl ${crawl.status === 'started' && crawl.progress === 0 ? 'indeterminate' : ''}"
style="width: ${crawl.progress}%"></div>
<div class="progress-bar crawl ${crawl.status === 'started' && (crawl.progress || 0) === 0 ? 'indeterminate' : ''}"
style="width: ${crawl.progress || 0}%"></div>
</div>
</div>
${warningHtml}
@@ -668,7 +694,7 @@
idleMessage.style.display = 'none';
crawlTree.innerHTML = `
<div class="idle-message">
${data.snapshots_started} snapshots processing, ${data.archiveresults_started} extractors running
${data.snapshots_started || 0} snapshots processing, ${data.archiveresults_started || 0} extractors running
</div>
`;
} else {
@@ -676,7 +702,7 @@
// Build the URL for recent crawls (last 24 hours)
var yesterday = new Date(Date.now() - 24*60*60*1000).toISOString().split('T')[0];
var recentUrl = '/admin/crawls/crawl/?created_at__gte=' + yesterday + '&o=-1';
idleMessage.innerHTML = `No active crawls (${data.crawls_pending} pending, ${data.crawls_started} started, <a href="${recentUrl}" style="color: #58a6ff;">${data.crawls_recent} recent</a>)`;
idleMessage.innerHTML = `No active crawls (${data.crawls_pending || 0} pending, ${data.crawls_started || 0} started, <a href="${recentUrl}" style="color: #58a6ff;">${data.crawls_recent || 0} recent</a>)`;
crawlTree.innerHTML = '';
}
}
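All of the || 0 and || 'unknown' fallbacks above guard against payload keys that can be absent while a crawl is still spinning up. A hedged server-side sketch of one way to make the payload total for renderCrawl (this helper is hypothetical and not part of the commit; key names mirror what the template reads):

    def serialize_crawl(crawl, total, completed, started, pending, progress):
        # Hypothetical helper: always emit every key renderCrawl() touches,
        # with the same defaults the template falls back to.
        return {
            'id': str(crawl.id),
            'label': crawl.label or '(no label)',
            'status': crawl.status or 'unknown',
            'max_depth': crawl.max_depth or 0,
            'total_snapshots': total,
            'completed_snapshots': completed,
            'started_snapshots': started,
            'pending_snapshots': pending,
            'progress': progress or 0,
        }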

View File

@@ -91,7 +91,11 @@ def test_add_multiple_urls_single_command(tmp_path, process, disable_extractors_
def test_add_from_file(tmp_path, process, disable_extractors_dict):
"""Test adding URLs from a file."""
"""Test adding URLs from a file.
With --index-only, this creates a snapshot for the file itself, not the URLs inside.
To get snapshots for the URLs inside, run without --index-only so the parsers execute.
"""
os.chdir(tmp_path)
# Create a file with URLs
@@ -108,10 +112,13 @@ def test_add_from_file(tmp_path, process, disable_extractors_dict):
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
crawl_count = c.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0]
snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
conn.close()
assert snapshot_count == 2
# With --index-only, creates 1 snapshot for the file itself
assert crawl_count == 1
assert snapshot_count == 1
def test_add_with_depth_0_flag(tmp_path, process, disable_extractors_dict):
@@ -141,7 +148,11 @@ def test_add_with_depth_1_flag(tmp_path, process, disable_extractors_dict):
def test_add_with_tags(tmp_path, process, disable_extractors_dict):
"""Test adding URL with tags creates tag records."""
"""Test adding URL with tags stores tags_str in crawl.
With --index-only, Tag objects are not created until archiving happens.
Tags are stored as a string in the Crawl.tags_str field.
"""
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', '--tag=test,example', 'https://example.com'],
@@ -151,15 +162,19 @@ def test_add_with_tags(tmp_path, process, disable_extractors_dict):
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
tags = c.execute("SELECT name FROM core_tag").fetchall()
tags_str = c.execute("SELECT tags_str FROM crawls_crawl").fetchone()[0]
conn.close()
tag_names = [t[0] for t in tags]
assert 'test' in tag_names or 'example' in tag_names
# Tags are stored as a comma-separated string in crawl
assert 'test' in tags_str or 'example' in tags_str
def test_add_duplicate_url_updates_existing(tmp_path, process, disable_extractors_dict):
"""Test that adding the same URL twice updates rather than duplicates."""
def test_add_duplicate_url_creates_separate_crawls(tmp_path, process, disable_extractors_dict):
"""Test that adding the same URL twice creates separate crawls and snapshots.
Each 'add' command creates a new Crawl. Multiple crawls can archive the same URL.
This allows re-archiving URLs at different times.
"""
os.chdir(tmp_path)
# Add URL first time
@@ -179,10 +194,12 @@ def test_add_duplicate_url_updates_existing(tmp_path, process, disable_extractor
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot WHERE url='https://example.com'").fetchone()[0]
crawl_count = c.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0]
conn.close()
# Should still only have one snapshot for this URL
assert snapshot_count == 1
# Each add creates a new crawl with its own snapshot
assert crawl_count == 2
assert snapshot_count == 2
def test_add_with_overwrite_flag(tmp_path, process, disable_extractors_dict):
@@ -208,7 +225,10 @@ def test_add_with_overwrite_flag(tmp_path, process, disable_extractors_dict):
def test_add_creates_archive_subdirectory(tmp_path, process, disable_extractors_dict):
"""Test that add creates archive subdirectory for the snapshot."""
"""Test that add creates archive subdirectory for the snapshot.
Archive subdirectories are named by timestamp, not by snapshot ID.
"""
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
@@ -216,14 +236,14 @@ def test_add_creates_archive_subdirectory(tmp_path, process, disable_extractors_
env=disable_extractors_dict,
)
# Get the snapshot ID from the database
# Get the snapshot timestamp from the database
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
snapshot_id = c.execute("SELECT id FROM core_snapshot").fetchone()[0]
timestamp = c.execute("SELECT timestamp FROM core_snapshot").fetchone()[0]
conn.close()
# Check that archive subdirectory was created
archive_dir = tmp_path / "archive" / snapshot_id
# Check that archive subdirectory was created using timestamp
archive_dir = tmp_path / "archive" / str(timestamp)
assert archive_dir.exists()
assert archive_dir.is_dir()
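Since archive directories are keyed by Snapshot.timestamp (a stringified epoch float, per the add hunk earlier in this commit), the on-disk path is derivable from the row alone. A minimal sketch:

    from pathlib import Path

    def archive_dir_for(data_dir, timestamp):
        # timestamp is stored as str(timezone.now().timestamp()), e.g. '1735534051.123456',
        # and doubles as the snapshot's subdirectory name under archive/.
        return Path(data_dir) / 'archive' / str(timestamp)

    assert archive_dir_for('.', '1735534051.123456') == Path('archive/1735534051.123456')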

View File

@@ -1,7 +1,7 @@
#!/usr/bin/env python3
"""
Comprehensive tests for archivebox update command.
Verify update re-archives snapshots and updates DB status.
Verify update drains old dirs, reconciles DB, and queues snapshots.
"""
import os
@@ -15,7 +15,7 @@ def test_update_runs_successfully_on_empty_archive(tmp_path, process):
"""Test that update runs without error on empty archive."""
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'update', '--index-only'],
['archivebox', 'update'],
capture_output=True,
text=True,
timeout=30,
@@ -25,41 +25,21 @@ def test_update_runs_successfully_on_empty_archive(tmp_path, process):
assert result.returncode == 0
def test_update_re_archives_existing_snapshots(tmp_path, process, disable_extractors_dict):
"""Test that update command re-archives existing snapshots."""
def test_update_reconciles_existing_snapshots(tmp_path, process, disable_extractors_dict):
"""Test that update command reconciles existing snapshots."""
os.chdir(tmp_path)
# Add a snapshot
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
)
# Run update
result = subprocess.run(
['archivebox', 'update', '--index-only'],
['archivebox', 'add', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
)
assert result.returncode == 0
def test_update_index_only_flag(tmp_path, process, disable_extractors_dict):
"""Test that --index-only flag skips extraction."""
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
)
# Update with index-only should be fast
# Run update - should reconcile and queue
result = subprocess.run(
['archivebox', 'update', '--index-only'],
['archivebox', 'update'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
@@ -74,26 +54,28 @@ def test_update_specific_snapshot_by_filter(tmp_path, process, disable_extractor
# Add multiple snapshots
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
['archivebox', 'add', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
)
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.org'],
capture_output=True,
env=disable_extractors_dict,
)
# Update with filter
result = subprocess.run(
['archivebox', 'update', '--index-only', '--filter-type=search', '--filter=example.com'],
['archivebox', 'add', '--depth=0', 'https://example.org'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
)
# Should complete (may succeed or show usage)
assert result.returncode in [0, 1, 2]
# Update with filter pattern (uses filter_patterns argument)
result = subprocess.run(
['archivebox', 'update', '--filter-type=substring', 'example.com'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
)
# Should complete successfully
assert result.returncode == 0
def test_update_preserves_snapshot_count(tmp_path, process, disable_extractors_dict):
@@ -102,9 +84,10 @@ def test_update_preserves_snapshot_count(tmp_path, process, disable_extractors_d
# Add snapshots
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
['archivebox', 'add', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
)
# Count before update
@@ -115,9 +98,9 @@ def test_update_preserves_snapshot_count(tmp_path, process, disable_extractors_d
assert count_before == 1
# Run update
# Run update (should reconcile + queue, not create new snapshots)
subprocess.run(
['archivebox', 'update', '--index-only'],
['archivebox', 'update'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
@@ -133,21 +116,31 @@ def test_update_preserves_snapshot_count(tmp_path, process, disable_extractors_d
assert count_after == count_before
def test_update_with_overwrite_flag(tmp_path, process, disable_extractors_dict):
"""Test update with --overwrite flag forces re-archiving."""
def test_update_queues_snapshots_for_archiving(tmp_path, process, disable_extractors_dict):
"""Test that update queues snapshots for archiving."""
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
['archivebox', 'add', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
)
# Run update
result = subprocess.run(
['archivebox', 'update', '--index-only', '--overwrite'],
['archivebox', 'update'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
)
assert result.returncode == 0
# Check that snapshot is queued
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
status = c.execute("SELECT status FROM core_snapshot").fetchone()[0]
conn.close()
assert status == 'queued'
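The same assertion via the ORM, for anyone checking this from a configured Django shell instead of raw sqlite3 (import path assumed from the core_snapshot table name):

    from core.models import Snapshot

    snap = Snapshot.objects.get(url='https://example.com')
    assert snap.status == 'queued'
    # A retry_at at or before now means the orchestrator may claim it immediately.
    assert snap.retry_at is not None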