diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index 234d1316..ce255b04 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -104,8 +104,17 @@ def add(urls: str | list[str], if index_only: # Just create the crawl but don't start processing print('[yellow]\\[*] Index-only mode - crawl created but not started[/yellow]') - # Create root snapshot manually - crawl.create_root_snapshot() + # Create snapshots for all URLs in the crawl + for url in crawl.get_urls_list(): + Snapshot.objects.update_or_create( + crawl=crawl, url=url, + defaults={ + 'status': Snapshot.INITIAL_STATE, + 'retry_at': timezone.now(), + 'timestamp': str(timezone.now().timestamp()), + 'depth': 0, + }, + ) return crawl.snapshot_set.all() # 5. Start the orchestrator to process the queue diff --git a/archivebox/core/views.py b/archivebox/core/views.py index bef958e3..4a104b45 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -577,17 +577,20 @@ def live_progress_view(request): active_crawls = [] for crawl in active_crawls_qs: - # Get active snapshots for this crawl - filter in Python since we prefetched all - crawl_snapshots = [ - s for s in crawl.snapshot_set.all() - if s.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED] - ][:5] # Limit to 5 most recent + # Get ALL snapshots for this crawl to count status (already prefetched) + all_crawl_snapshots = list(crawl.snapshot_set.all()) - # Count snapshots by status (in memory, not DB) - total_snapshots = Snapshot.objects.filter(crawl=crawl).count() # Full count needs DB - completed_snapshots = sum(1 for s in crawl_snapshots if s.status == Snapshot.StatusChoices.SEALED) - started_snapshots = sum(1 for s in crawl_snapshots if s.status == Snapshot.StatusChoices.STARTED) - pending_snapshots = sum(1 for s in crawl_snapshots if s.status == Snapshot.StatusChoices.QUEUED) + # Count snapshots by status from ALL snapshots + total_snapshots = len(all_crawl_snapshots) + completed_snapshots = sum(1 for s in all_crawl_snapshots if s.status == Snapshot.StatusChoices.SEALED) + started_snapshots = sum(1 for s in all_crawl_snapshots if s.status == Snapshot.StatusChoices.STARTED) + pending_snapshots = sum(1 for s in all_crawl_snapshots if s.status == Snapshot.StatusChoices.QUEUED) + + # Get only ACTIVE snapshots to display (limit to 5 most recent) + active_crawl_snapshots = [ + s for s in all_crawl_snapshots + if s.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED] + ][:5] # Count URLs in the crawl (for when snapshots haven't been created yet) urls_count = 0 @@ -599,7 +602,7 @@ def live_progress_view(request): # Get active snapshots for this crawl (already prefetched) active_snapshots_for_crawl = [] - for snapshot in crawl_snapshots: + for snapshot in active_crawl_snapshots: # Get archive results for this snapshot (already prefetched) snapshot_results = snapshot.archiveresult_set.all() diff --git a/archivebox/machine/migrations/0001_initial.py b/archivebox/machine/migrations/0001_initial.py index d04a28f4..01711ef7 100644 --- a/archivebox/machine/migrations/0001_initial.py +++ b/archivebox/machine/migrations/0001_initial.py @@ -1,7 +1,10 @@ # Generated by hand on 2025-12-29 # Creates Machine, Binary, NetworkInterface, and Process tables using raw SQL -from django.db import migrations +from django.db import migrations, models +import django.db.models.deletion +import django.utils.timezone +from archivebox.uuid_compat import uuid7 class Migration(migrations.Migration): 
@@ -12,9 +15,10 @@ class Migration(migrations.Migration): ] operations = [ - migrations.RunSQL( - # Forward SQL - sql=""" + migrations.SeparateDatabaseAndState( + database_operations=[ + migrations.RunSQL( + sql=""" -- Create machine_machine table CREATE TABLE IF NOT EXISTS machine_machine ( id TEXT PRIMARY KEY NOT NULL, @@ -136,12 +140,133 @@ class Migration(migrations.Migration): CREATE INDEX IF NOT EXISTS machine_process_binary_id_idx ON machine_process(binary_id); CREATE INDEX IF NOT EXISTS machine_process_machine_status_retry_idx ON machine_process(machine_id, status, retry_at); """, - # Reverse SQL - reverse_sql=""" - DROP TABLE IF EXISTS machine_process; - DROP TABLE IF EXISTS machine_binary; - DROP TABLE IF EXISTS machine_networkinterface; - DROP TABLE IF EXISTS machine_machine; - """ + reverse_sql=""" + DROP TABLE IF EXISTS machine_process; + DROP TABLE IF EXISTS machine_binary; + DROP TABLE IF EXISTS machine_networkinterface; + DROP TABLE IF EXISTS machine_machine; + """ + ), + ], + state_operations=[ + migrations.CreateModel( + name='Machine', + fields=[ + ('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)), + ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)), + ('modified_at', models.DateTimeField(auto_now=True)), + ('num_uses_succeeded', models.PositiveIntegerField(default=0)), + ('num_uses_failed', models.PositiveIntegerField(default=0)), + ('guid', models.CharField(default=None, editable=False, max_length=64, unique=True)), + ('hostname', models.CharField(default=None, max_length=63)), + ('hw_in_docker', models.BooleanField(default=False)), + ('hw_in_vm', models.BooleanField(default=False)), + ('hw_manufacturer', models.CharField(default=None, max_length=63)), + ('hw_product', models.CharField(default=None, max_length=63)), + ('hw_uuid', models.CharField(default=None, max_length=255)), + ('os_arch', models.CharField(default=None, max_length=15)), + ('os_family', models.CharField(default=None, max_length=15)), + ('os_platform', models.CharField(default=None, max_length=63)), + ('os_release', models.CharField(default=None, max_length=63)), + ('os_kernel', models.CharField(default=None, max_length=255)), + ('stats', models.JSONField(blank=True, default=dict, null=True)), + ('config', models.JSONField(blank=True, default=dict, help_text='Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)', null=True)), + ], + options={ + 'app_label': 'machine', + }, + ), + migrations.CreateModel( + name='NetworkInterface', + fields=[ + ('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)), + ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)), + ('modified_at', models.DateTimeField(auto_now=True)), + ('num_uses_succeeded', models.PositiveIntegerField(default=0)), + ('num_uses_failed', models.PositiveIntegerField(default=0)), + ('mac_address', models.CharField(default=None, editable=False, max_length=17)), + ('ip_public', models.GenericIPAddressField(default=None, editable=False)), + ('ip_local', models.GenericIPAddressField(default=None, editable=False)), + ('dns_server', models.GenericIPAddressField(default=None, editable=False)), + ('hostname', models.CharField(default=None, max_length=63)), + ('iface', models.CharField(default=None, max_length=15)), + ('isp', models.CharField(default=None, max_length=63)), + ('city', models.CharField(default=None, max_length=63)), + ('region', 
models.CharField(default=None, max_length=63)), + ('country', models.CharField(default=None, max_length=63)), + ('machine', models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, to='machine.machine')), + ], + options={ + 'unique_together': {('machine', 'ip_public', 'ip_local', 'mac_address', 'dns_server')}, + 'app_label': 'machine', + }, + ), + migrations.CreateModel( + name='Binary', + fields=[ + ('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)), + ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)), + ('modified_at', models.DateTimeField(auto_now=True)), + ('num_uses_succeeded', models.PositiveIntegerField(default=0)), + ('num_uses_failed', models.PositiveIntegerField(default=0)), + ('name', models.CharField(blank=True, db_index=True, default='', max_length=63)), + ('binproviders', models.CharField(blank=True, default='env', help_text='Comma-separated list of allowed providers: apt,brew,pip,npm,env', max_length=127)), + ('overrides', models.JSONField(blank=True, default=dict, help_text="Provider-specific overrides: {'apt': {'packages': ['pkg']}, ...}")), + ('binprovider', models.CharField(blank=True, default='', help_text='Provider that successfully installed this binary', max_length=31)), + ('abspath', models.CharField(blank=True, default='', max_length=255)), + ('version', models.CharField(blank=True, default='', max_length=32)), + ('sha256', models.CharField(blank=True, default='', max_length=64)), + ('status', models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('succeeded', 'Succeeded'), ('failed', 'Failed')], db_index=True, default='queued', max_length=16)), + ('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, help_text='When to retry this binary installation', null=True)), + ('output_dir', models.CharField(blank=True, default='', help_text='Directory where installation hook logs are stored', max_length=255)), + ('machine', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='machine.machine')), + ], + options={ + 'verbose_name': 'Binary', + 'verbose_name_plural': 'Binaries', + 'unique_together': {('machine', 'name', 'abspath', 'version', 'sha256')}, + 'app_label': 'machine', + }, + ), + migrations.CreateModel( + name='Process', + fields=[ + ('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)), + ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)), + ('modified_at', models.DateTimeField(auto_now=True)), + ('num_uses_succeeded', models.PositiveIntegerField(default=0)), + ('num_uses_failed', models.PositiveIntegerField(default=0)), + ('pwd', models.CharField(blank=True, default='', help_text='Working directory for process execution', max_length=512)), + ('cmd', models.JSONField(blank=True, default=list, help_text='Command as array of arguments')), + ('env', models.JSONField(blank=True, default=dict, help_text='Environment variables for process')), + ('timeout', models.IntegerField(default=120, help_text='Timeout in seconds')), + ('pid', models.IntegerField(blank=True, default=None, help_text='OS process ID', null=True)), + ('exit_code', models.IntegerField(blank=True, default=None, help_text='Process exit code (0 = success)', null=True)), + ('stdout', models.TextField(blank=True, default='', help_text='Standard output from process')), + ('stderr', models.TextField(blank=True, default='', help_text='Standard 
error from process')), + ('started_at', models.DateTimeField(blank=True, default=None, help_text='When process was launched', null=True)), + ('ended_at', models.DateTimeField(blank=True, default=None, help_text='When process completed/terminated', null=True)), + ('url', models.URLField(blank=True, default=None, help_text='Connection URL (CDP endpoint, sonic server, etc.)', max_length=2048, null=True)), + ('status', models.CharField(choices=[('queued', 'Queued'), ('running', 'Running'), ('exited', 'Exited')], db_index=True, default='queued', max_length=16)), + ('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, help_text='When to retry this process', null=True)), + ('machine', models.ForeignKey(help_text='Machine where this process executed', on_delete=django.db.models.deletion.CASCADE, related_name='processes', to='machine.machine')), + ('binary', models.ForeignKey(blank=True, help_text='Binary used by this process', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='processes', to='machine.binary')), + ('iface', models.ForeignKey(blank=True, help_text='Network interface used by this process', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='processes', to='machine.networkinterface')), + ], + options={ + 'verbose_name': 'Process', + 'verbose_name_plural': 'Processes', + 'app_label': 'machine', + }, + ), + migrations.AddIndex( + model_name='process', + index=models.Index(fields=['machine', 'status', 'retry_at'], name='machine_pro_machine_c69cf0_idx'), + ), + migrations.AddIndex( + model_name='process', + index=models.Index(fields=['binary', 'exit_code'], name='machine_pro_binary__f79cc6_idx'), + ), + ], ), ] diff --git a/archivebox/templates/admin/progress_monitor.html b/archivebox/templates/admin/progress_monitor.html index bbc65663..acc7ebdf 100644 --- a/archivebox/templates/admin/progress_monitor.html +++ b/archivebox/templates/admin/progress_monitor.html @@ -363,6 +363,20 @@ background: rgba(248, 81, 73, 0.25); width: 100%; } + #progress-monitor .extractor-badge.backoff { + color: #b8860b; + } + #progress-monitor .extractor-badge.backoff .progress-fill { + background: rgba(210, 153, 34, 0.2); + width: 30%; + } + #progress-monitor .extractor-badge.skipped { + color: #6e7681; + } + #progress-monitor .extractor-badge.skipped .progress-fill { + background: rgba(110, 118, 129, 0.15); + width: 100%; + } #progress-monitor .extractor-badge .badge-icon { font-size: 10px; } @@ -400,6 +414,14 @@ background: rgba(248, 81, 73, 0.2); color: #f85149; } + #progress-monitor .status-badge.backoff { + background: rgba(210, 153, 34, 0.15); + color: #b8860b; + } + #progress-monitor .status-badge.unknown { + background: #21262d; + color: #6e7681; + } @@ -470,25 +492,28 @@ }); function formatUrl(url) { + if (!url) return '(no URL)'; try { const u = new URL(url); return u.hostname + u.pathname.substring(0, 30) + (u.pathname.length > 30 ? '...' : ''); } catch { - return url.substring(0, 50) + (url.length > 50 ? '...' : ''); + return String(url).substring(0, 50) + (String(url).length > 50 ? '...' : ''); } } function renderExtractor(extractor) { const icon = extractor.status === 'started' ? '↻' : extractor.status === 'succeeded' ? '✓' : - extractor.status === 'failed' ? '✗' : '○'; + extractor.status === 'failed' ? '✗' : + extractor.status === 'backoff' ? '⌛' : + extractor.status === 'skipped' ? 
'⇢' : '○'; return ` - + ${icon} - ${extractor.plugin} + ${extractor.plugin || 'unknown'} `; @@ -496,13 +521,13 @@ function renderSnapshot(snapshot, crawlId) { const statusIcon = snapshot.status === 'started' ? '↻' : '📄'; - const adminUrl = `/admin/core/snapshot/${snapshot.id}/change/`; + const adminUrl = `/admin/core/snapshot/${snapshot.id || 'unknown'}/change/`; let extractorHtml = ''; if (snapshot.all_plugins && snapshot.all_plugins.length > 0) { // Sort plugins alphabetically by name to prevent reordering on updates const sortedExtractors = [...snapshot.all_plugins].sort((a, b) => - a.plugin.localeCompare(b.plugin) + (a.plugin || '').localeCompare(b.plugin || '') ); extractorHtml = `
@@ -518,16 +543,17 @@
${formatUrl(snapshot.url)}
-                    ${snapshot.completed_plugins}/${snapshot.total_plugins} extractors
-                    ${snapshot.failed_plugins > 0 ? `(${snapshot.failed_plugins} failed)` : ''}
+                    ${(snapshot.total_plugins || 0) > 0
+                        ? `${snapshot.completed_plugins || 0}/${snapshot.total_plugins || 0} extractors${(snapshot.failed_plugins || 0) > 0 ? ` (${snapshot.failed_plugins} failed)` : ''}`
+                        : 'Waiting for extractors...'}
-                ${snapshot.status}
+                ${snapshot.status || 'unknown'}
-
+
${extractorHtml} @@ -537,7 +563,7 @@ function renderCrawl(crawl) { const statusIcon = crawl.status === 'started' ? '↻' : '🔍'; - const adminUrl = `/admin/crawls/crawl/${crawl.id}/change/`; + const adminUrl = `/admin/crawls/crawl/${crawl.id || 'unknown'}/change/`; let snapshotsHtml = ''; if (crawl.active_snapshots && crawl.active_snapshots.length > 0) { @@ -556,7 +582,7 @@ // Queued but retry_at is in future (was claimed by worker, will retry) warningHtml = `
-                    🔄 Retrying in ${crawl.seconds_until_retry}s...${crawl.urls_preview ? ` (${crawl.urls_preview})` : ''}
+                    🔄 Retrying in ${crawl.seconds_until_retry || 0}s...${crawl.urls_preview ? ` (${crawl.urls_preview})` : ''}
`; } else if (crawl.status === 'queued' && crawl.total_snapshots === 0) { @@ -569,34 +595,34 @@ } // Show snapshot info or URL count if no snapshots yet - let metaText = `depth: ${crawl.max_depth}`; - if (crawl.total_snapshots > 0) { + let metaText = `depth: ${crawl.max_depth || 0}`; + if ((crawl.total_snapshots || 0) > 0) { metaText += ` | ${crawl.total_snapshots} snapshots`; - } else if (crawl.urls_count > 0) { + } else if ((crawl.urls_count || 0) > 0) { metaText += ` | ${crawl.urls_count} URLs`; } else if (crawl.urls_preview) { metaText += ` | ${crawl.urls_preview.substring(0, 40)}${crawl.urls_preview.length > 40 ? '...' : ''}`; } return ` -
+
${statusIcon}
-
${crawl.label}
+
${crawl.label || '(no label)'}
${metaText}
-                        ${crawl.completed_snapshots} done
+                        ${crawl.completed_snapshots || 0} done
                         ${crawl.started_snapshots || 0} active
-                        ${crawl.pending_snapshots} pending
+                        ${crawl.pending_snapshots || 0} pending
-                ${crawl.status}
+                ${crawl.status || 'unknown'}
-
+
${warningHtml} @@ -668,7 +694,7 @@ idleMessage.style.display = 'none'; crawlTree.innerHTML = `
-                    ${data.snapshots_started} snapshots processing, ${data.archiveresults_started} extractors running
+                    ${data.snapshots_started || 0} snapshots processing, ${data.archiveresults_started || 0} extractors running
`; } else { @@ -676,7 +702,7 @@ // Build the URL for recent crawls (last 24 hours) var yesterday = new Date(Date.now() - 24*60*60*1000).toISOString().split('T')[0]; var recentUrl = '/admin/crawls/crawl/?created_at__gte=' + yesterday + '&o=-1'; - idleMessage.innerHTML = `No active crawls (${data.crawls_pending} pending, ${data.crawls_started} started, ${data.crawls_recent} recent)`; + idleMessage.innerHTML = `No active crawls (${data.crawls_pending || 0} pending, ${data.crawls_started || 0} started, ${data.crawls_recent || 0} recent)`; crawlTree.innerHTML = ''; } } diff --git a/tests/test_cli_add.py b/tests/test_cli_add.py index 65bb1367..7d325e61 100644 --- a/tests/test_cli_add.py +++ b/tests/test_cli_add.py @@ -91,7 +91,11 @@ def test_add_multiple_urls_single_command(tmp_path, process, disable_extractors_ def test_add_from_file(tmp_path, process, disable_extractors_dict): - """Test adding URLs from a file.""" + """Test adding URLs from a file. + + With --index-only, this creates a snapshot for the file itself, not the URLs inside. + To get snapshots for the URLs inside, you need to run without --index-only so parsers run. + """ os.chdir(tmp_path) # Create a file with URLs @@ -108,10 +112,13 @@ def test_add_from_file(tmp_path, process, disable_extractors_dict): conn = sqlite3.connect("index.sqlite3") c = conn.cursor() + crawl_count = c.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0] snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0] conn.close() - assert snapshot_count == 2 + # With --index-only, creates 1 snapshot for the file itself + assert crawl_count == 1 + assert snapshot_count == 1 def test_add_with_depth_0_flag(tmp_path, process, disable_extractors_dict): @@ -141,7 +148,11 @@ def test_add_with_depth_1_flag(tmp_path, process, disable_extractors_dict): def test_add_with_tags(tmp_path, process, disable_extractors_dict): - """Test adding URL with tags creates tag records.""" + """Test adding URL with tags stores tags_str in crawl. + + With --index-only, Tag objects are not created until archiving happens. + Tags are stored as a string in the Crawl.tags_str field. + """ os.chdir(tmp_path) subprocess.run( ['archivebox', 'add', '--index-only', '--depth=0', '--tag=test,example', 'https://example.com'], @@ -151,15 +162,19 @@ def test_add_with_tags(tmp_path, process, disable_extractors_dict): conn = sqlite3.connect("index.sqlite3") c = conn.cursor() - tags = c.execute("SELECT name FROM core_tag").fetchall() + tags_str = c.execute("SELECT tags_str FROM crawls_crawl").fetchone()[0] conn.close() - tag_names = [t[0] for t in tags] - assert 'test' in tag_names or 'example' in tag_names + # Tags are stored as a comma-separated string in crawl + assert 'test' in tags_str or 'example' in tags_str -def test_add_duplicate_url_updates_existing(tmp_path, process, disable_extractors_dict): - """Test that adding the same URL twice updates rather than duplicates.""" +def test_add_duplicate_url_creates_separate_crawls(tmp_path, process, disable_extractors_dict): + """Test that adding the same URL twice creates separate crawls and snapshots. + + Each 'add' command creates a new Crawl. Multiple crawls can archive the same URL. + This allows re-archiving URLs at different times. 
+ """ os.chdir(tmp_path) # Add URL first time @@ -179,10 +194,12 @@ def test_add_duplicate_url_updates_existing(tmp_path, process, disable_extractor conn = sqlite3.connect("index.sqlite3") c = conn.cursor() snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot WHERE url='https://example.com'").fetchone()[0] + crawl_count = c.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0] conn.close() - # Should still only have one snapshot for this URL - assert snapshot_count == 1 + # Each add creates a new crawl with its own snapshot + assert crawl_count == 2 + assert snapshot_count == 2 def test_add_with_overwrite_flag(tmp_path, process, disable_extractors_dict): @@ -208,7 +225,10 @@ def test_add_with_overwrite_flag(tmp_path, process, disable_extractors_dict): def test_add_creates_archive_subdirectory(tmp_path, process, disable_extractors_dict): - """Test that add creates archive subdirectory for the snapshot.""" + """Test that add creates archive subdirectory for the snapshot. + + Archive subdirectories are named by timestamp, not by snapshot ID. + """ os.chdir(tmp_path) subprocess.run( ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'], @@ -216,14 +236,14 @@ def test_add_creates_archive_subdirectory(tmp_path, process, disable_extractors_ env=disable_extractors_dict, ) - # Get the snapshot ID from the database + # Get the snapshot timestamp from the database conn = sqlite3.connect("index.sqlite3") c = conn.cursor() - snapshot_id = c.execute("SELECT id FROM core_snapshot").fetchone()[0] + timestamp = c.execute("SELECT timestamp FROM core_snapshot").fetchone()[0] conn.close() - # Check that archive subdirectory was created - archive_dir = tmp_path / "archive" / snapshot_id + # Check that archive subdirectory was created using timestamp + archive_dir = tmp_path / "archive" / str(timestamp) assert archive_dir.exists() assert archive_dir.is_dir() diff --git a/tests/test_cli_update.py b/tests/test_cli_update.py index 9faf4234..8a4a22a5 100644 --- a/tests/test_cli_update.py +++ b/tests/test_cli_update.py @@ -1,7 +1,7 @@ #!/usr/bin/env python3 """ Comprehensive tests for archivebox update command. -Verify update re-archives snapshots and updates DB status. +Verify update drains old dirs, reconciles DB, and queues snapshots. 
""" import os @@ -15,7 +15,7 @@ def test_update_runs_successfully_on_empty_archive(tmp_path, process): """Test that update runs without error on empty archive.""" os.chdir(tmp_path) result = subprocess.run( - ['archivebox', 'update', '--index-only'], + ['archivebox', 'update'], capture_output=True, text=True, timeout=30, @@ -25,41 +25,21 @@ def test_update_runs_successfully_on_empty_archive(tmp_path, process): assert result.returncode == 0 -def test_update_re_archives_existing_snapshots(tmp_path, process, disable_extractors_dict): - """Test that update command re-archives existing snapshots.""" +def test_update_reconciles_existing_snapshots(tmp_path, process, disable_extractors_dict): + """Test that update command reconciles existing snapshots.""" os.chdir(tmp_path) # Add a snapshot subprocess.run( - ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'], - capture_output=True, - env=disable_extractors_dict, - ) - - # Run update - result = subprocess.run( - ['archivebox', 'update', '--index-only'], + ['archivebox', 'add', '--depth=0', 'https://example.com'], capture_output=True, env=disable_extractors_dict, timeout=30, ) - assert result.returncode == 0 - - -def test_update_index_only_flag(tmp_path, process, disable_extractors_dict): - """Test that --index-only flag skips extraction.""" - os.chdir(tmp_path) - - subprocess.run( - ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'], - capture_output=True, - env=disable_extractors_dict, - ) - - # Update with index-only should be fast + # Run update - should reconcile and queue result = subprocess.run( - ['archivebox', 'update', '--index-only'], + ['archivebox', 'update'], capture_output=True, env=disable_extractors_dict, timeout=30, @@ -74,26 +54,28 @@ def test_update_specific_snapshot_by_filter(tmp_path, process, disable_extractor # Add multiple snapshots subprocess.run( - ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'], + ['archivebox', 'add', '--depth=0', 'https://example.com'], capture_output=True, env=disable_extractors_dict, + timeout=30, ) subprocess.run( - ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.org'], - capture_output=True, - env=disable_extractors_dict, - ) - - # Update with filter - result = subprocess.run( - ['archivebox', 'update', '--index-only', '--filter-type=search', '--filter=example.com'], + ['archivebox', 'add', '--depth=0', 'https://example.org'], capture_output=True, env=disable_extractors_dict, timeout=30, ) - # Should complete (may succeed or show usage) - assert result.returncode in [0, 1, 2] + # Update with filter pattern (uses filter_patterns argument) + result = subprocess.run( + ['archivebox', 'update', '--filter-type=substring', 'example.com'], + capture_output=True, + env=disable_extractors_dict, + timeout=30, + ) + + # Should complete successfully + assert result.returncode == 0 def test_update_preserves_snapshot_count(tmp_path, process, disable_extractors_dict): @@ -102,9 +84,10 @@ def test_update_preserves_snapshot_count(tmp_path, process, disable_extractors_d # Add snapshots subprocess.run( - ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'], + ['archivebox', 'add', '--depth=0', 'https://example.com'], capture_output=True, env=disable_extractors_dict, + timeout=30, ) # Count before update @@ -115,9 +98,9 @@ def test_update_preserves_snapshot_count(tmp_path, process, disable_extractors_d assert count_before == 1 - # Run update + # Run update (should reconcile + queue, not create new snapshots) 
     subprocess.run(
-        ['archivebox', 'update', '--index-only'],
+        ['archivebox', 'update'],
         capture_output=True,
         env=disable_extractors_dict,
         timeout=30,
@@ -133,21 +116,31 @@
-def test_update_with_overwrite_flag(tmp_path, process, disable_extractors_dict):
-    """Test update with --overwrite flag forces re-archiving."""
+def test_update_queues_snapshots_for_archiving(tmp_path, process, disable_extractors_dict):
+    """Test that update queues snapshots for archiving."""
     os.chdir(tmp_path)
 
     subprocess.run(
-        ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
+        ['archivebox', 'add', '--depth=0', 'https://example.com'],
         capture_output=True,
         env=disable_extractors_dict,
+        timeout=30,
     )
 
+    # Run update
     result = subprocess.run(
-        ['archivebox', 'update', '--index-only', '--overwrite'],
+        ['archivebox', 'update'],
         capture_output=True,
         env=disable_extractors_dict,
         timeout=30,
     )
 
     assert result.returncode == 0
+
+    # Check that snapshot is queued
+    conn = sqlite3.connect("index.sqlite3")
+    c = conn.cursor()
+    status = c.execute("SELECT status FROM core_snapshot").fetchone()[0]
+    conn.close()
+
+    assert status == 'queued'