fix initial migrations

Nick Sweeting
2025-12-29 21:27:31 -08:00
parent 3dd329600e
commit 2e350d317d
6 changed files with 285 additions and 109 deletions

View File

@@ -104,8 +104,17 @@ def add(urls: str | list[str],
if index_only:
# Just create the crawl but don't start processing
print('[yellow]\\[*] Index-only mode - crawl created but not started[/yellow]')
# Create root snapshot manually
crawl.create_root_snapshot()
# Create snapshots for all URLs in the crawl
for url in crawl.get_urls_list():
Snapshot.objects.update_or_create(
crawl=crawl, url=url,
defaults={
'status': Snapshot.INITIAL_STATE,
'retry_at': timezone.now(),
'timestamp': str(timezone.now().timestamp()),
'depth': 0,
},
)
return crawl.snapshot_set.all()
# 5. Start the orchestrator to process the queue

View File

@@ -577,17 +577,20 @@ def live_progress_view(request):
active_crawls = []
for crawl in active_crawls_qs:
# Get active snapshots for this crawl - filter in Python since we prefetched all
crawl_snapshots = [
s for s in crawl.snapshot_set.all()
if s.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
][:5] # Limit to 5 most recent
# Get ALL snapshots for this crawl to count status (already prefetched)
all_crawl_snapshots = list(crawl.snapshot_set.all())
# Count snapshots by status (in memory, not DB)
total_snapshots = Snapshot.objects.filter(crawl=crawl).count() # Full count needs DB
completed_snapshots = sum(1 for s in crawl_snapshots if s.status == Snapshot.StatusChoices.SEALED)
started_snapshots = sum(1 for s in crawl_snapshots if s.status == Snapshot.StatusChoices.STARTED)
pending_snapshots = sum(1 for s in crawl_snapshots if s.status == Snapshot.StatusChoices.QUEUED)
# Count snapshots by status from ALL snapshots
total_snapshots = len(all_crawl_snapshots)
completed_snapshots = sum(1 for s in all_crawl_snapshots if s.status == Snapshot.StatusChoices.SEALED)
started_snapshots = sum(1 for s in all_crawl_snapshots if s.status == Snapshot.StatusChoices.STARTED)
pending_snapshots = sum(1 for s in all_crawl_snapshots if s.status == Snapshot.StatusChoices.QUEUED)
# Get only ACTIVE snapshots to display (limit to 5 most recent)
active_crawl_snapshots = [
s for s in all_crawl_snapshots
if s.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
][:5]
# Count URLs in the crawl (for when snapshots haven't been created yet)
urls_count = 0
@@ -599,7 +602,7 @@ def live_progress_view(request):
# Get active snapshots for this crawl (already prefetched)
active_snapshots_for_crawl = []
for snapshot in crawl_snapshots:
for snapshot in active_crawl_snapshots:
# Get archive results for this snapshot (already prefetched)
snapshot_results = snapshot.archiveresult_set.all()
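The comments above say the snapshots are "already prefetched"; the in-memory counting rewrite only pays off if active_crawls_qs really prefetches both relations the loop walks. A sketch of that assumed queryset (Crawl.StatusChoices named by analogy with Snapshot.StatusChoices, not shown in this commit):

    # Assumed shape of active_crawls_qs: three queries total regardless of crawl
    # count, after which crawl.snapshot_set.all() and snapshot.archiveresult_set.all()
    # are served from the prefetch cache with no further DB hits.
    active_crawls_qs = (
        Crawl.objects
        .filter(status__in=[Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED])
        .prefetch_related('snapshot_set', 'snapshot_set__archiveresult_set')
    )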

View File

@@ -1,7 +1,10 @@
# Generated by hand on 2025-12-29
# Creates Machine, Binary, NetworkInterface, and Process tables using raw SQL
from django.db import migrations
from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone
from archivebox.uuid_compat import uuid7
class Migration(migrations.Migration):
@@ -12,9 +15,10 @@ class Migration(migrations.Migration):
]
operations = [
migrations.RunSQL(
# Forward SQL
sql="""
migrations.SeparateDatabaseAndState(
database_operations=[
migrations.RunSQL(
sql="""
-- Create machine_machine table
CREATE TABLE IF NOT EXISTS machine_machine (
id TEXT PRIMARY KEY NOT NULL,
@@ -136,12 +140,133 @@ class Migration(migrations.Migration):
CREATE INDEX IF NOT EXISTS machine_process_binary_id_idx ON machine_process(binary_id);
CREATE INDEX IF NOT EXISTS machine_process_machine_status_retry_idx ON machine_process(machine_id, status, retry_at);
""",
# Reverse SQL
reverse_sql="""
DROP TABLE IF EXISTS machine_process;
DROP TABLE IF EXISTS machine_binary;
DROP TABLE IF EXISTS machine_networkinterface;
DROP TABLE IF EXISTS machine_machine;
"""
reverse_sql="""
DROP TABLE IF EXISTS machine_process;
DROP TABLE IF EXISTS machine_binary;
DROP TABLE IF EXISTS machine_networkinterface;
DROP TABLE IF EXISTS machine_machine;
"""
),
],
state_operations=[
migrations.CreateModel(
name='Machine',
fields=[
('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
('modified_at', models.DateTimeField(auto_now=True)),
('num_uses_succeeded', models.PositiveIntegerField(default=0)),
('num_uses_failed', models.PositiveIntegerField(default=0)),
('guid', models.CharField(default=None, editable=False, max_length=64, unique=True)),
('hostname', models.CharField(default=None, max_length=63)),
('hw_in_docker', models.BooleanField(default=False)),
('hw_in_vm', models.BooleanField(default=False)),
('hw_manufacturer', models.CharField(default=None, max_length=63)),
('hw_product', models.CharField(default=None, max_length=63)),
('hw_uuid', models.CharField(default=None, max_length=255)),
('os_arch', models.CharField(default=None, max_length=15)),
('os_family', models.CharField(default=None, max_length=15)),
('os_platform', models.CharField(default=None, max_length=63)),
('os_release', models.CharField(default=None, max_length=63)),
('os_kernel', models.CharField(default=None, max_length=255)),
('stats', models.JSONField(blank=True, default=dict, null=True)),
('config', models.JSONField(blank=True, default=dict, help_text='Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)', null=True)),
],
options={
'app_label': 'machine',
},
),
migrations.CreateModel(
name='NetworkInterface',
fields=[
('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
('modified_at', models.DateTimeField(auto_now=True)),
('num_uses_succeeded', models.PositiveIntegerField(default=0)),
('num_uses_failed', models.PositiveIntegerField(default=0)),
('mac_address', models.CharField(default=None, editable=False, max_length=17)),
('ip_public', models.GenericIPAddressField(default=None, editable=False)),
('ip_local', models.GenericIPAddressField(default=None, editable=False)),
('dns_server', models.GenericIPAddressField(default=None, editable=False)),
('hostname', models.CharField(default=None, max_length=63)),
('iface', models.CharField(default=None, max_length=15)),
('isp', models.CharField(default=None, max_length=63)),
('city', models.CharField(default=None, max_length=63)),
('region', models.CharField(default=None, max_length=63)),
('country', models.CharField(default=None, max_length=63)),
('machine', models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, to='machine.machine')),
],
options={
'unique_together': {('machine', 'ip_public', 'ip_local', 'mac_address', 'dns_server')},
'app_label': 'machine',
},
),
migrations.CreateModel(
name='Binary',
fields=[
('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
('modified_at', models.DateTimeField(auto_now=True)),
('num_uses_succeeded', models.PositiveIntegerField(default=0)),
('num_uses_failed', models.PositiveIntegerField(default=0)),
('name', models.CharField(blank=True, db_index=True, default='', max_length=63)),
('binproviders', models.CharField(blank=True, default='env', help_text='Comma-separated list of allowed providers: apt,brew,pip,npm,env', max_length=127)),
('overrides', models.JSONField(blank=True, default=dict, help_text="Provider-specific overrides: {'apt': {'packages': ['pkg']}, ...}")),
('binprovider', models.CharField(blank=True, default='', help_text='Provider that successfully installed this binary', max_length=31)),
('abspath', models.CharField(blank=True, default='', max_length=255)),
('version', models.CharField(blank=True, default='', max_length=32)),
('sha256', models.CharField(blank=True, default='', max_length=64)),
('status', models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('succeeded', 'Succeeded'), ('failed', 'Failed')], db_index=True, default='queued', max_length=16)),
('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, help_text='When to retry this binary installation', null=True)),
('output_dir', models.CharField(blank=True, default='', help_text='Directory where installation hook logs are stored', max_length=255)),
('machine', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='machine.machine')),
],
options={
'verbose_name': 'Binary',
'verbose_name_plural': 'Binaries',
'unique_together': {('machine', 'name', 'abspath', 'version', 'sha256')},
'app_label': 'machine',
},
),
migrations.CreateModel(
name='Process',
fields=[
('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
('modified_at', models.DateTimeField(auto_now=True)),
('num_uses_succeeded', models.PositiveIntegerField(default=0)),
('num_uses_failed', models.PositiveIntegerField(default=0)),
('pwd', models.CharField(blank=True, default='', help_text='Working directory for process execution', max_length=512)),
('cmd', models.JSONField(blank=True, default=list, help_text='Command as array of arguments')),
('env', models.JSONField(blank=True, default=dict, help_text='Environment variables for process')),
('timeout', models.IntegerField(default=120, help_text='Timeout in seconds')),
('pid', models.IntegerField(blank=True, default=None, help_text='OS process ID', null=True)),
('exit_code', models.IntegerField(blank=True, default=None, help_text='Process exit code (0 = success)', null=True)),
('stdout', models.TextField(blank=True, default='', help_text='Standard output from process')),
('stderr', models.TextField(blank=True, default='', help_text='Standard error from process')),
('started_at', models.DateTimeField(blank=True, default=None, help_text='When process was launched', null=True)),
('ended_at', models.DateTimeField(blank=True, default=None, help_text='When process completed/terminated', null=True)),
('url', models.URLField(blank=True, default=None, help_text='Connection URL (CDP endpoint, sonic server, etc.)', max_length=2048, null=True)),
('status', models.CharField(choices=[('queued', 'Queued'), ('running', 'Running'), ('exited', 'Exited')], db_index=True, default='queued', max_length=16)),
('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, help_text='When to retry this process', null=True)),
('machine', models.ForeignKey(help_text='Machine where this process executed', on_delete=django.db.models.deletion.CASCADE, related_name='processes', to='machine.machine')),
('binary', models.ForeignKey(blank=True, help_text='Binary used by this process', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='processes', to='machine.binary')),
('iface', models.ForeignKey(blank=True, help_text='Network interface used by this process', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='processes', to='machine.networkinterface')),
],
options={
'verbose_name': 'Process',
'verbose_name_plural': 'Processes',
'app_label': 'machine',
},
),
migrations.AddIndex(
model_name='process',
index=models.Index(fields=['machine', 'status', 'retry_at'], name='machine_pro_machine_c69cf0_idx'),
),
migrations.AddIndex(
model_name='process',
index=models.Index(fields=['binary', 'exit_code'], name='machine_pro_binary__f79cc6_idx'),
),
],
),
]
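The structural fix here is Django's SeparateDatabaseAndState: database_operations keeps the hand-written SQL as the source of truth for the actual schema, while state_operations records equivalent CreateModel entries so the migration autodetector diffs future changes against real models instead of an empty state. Two stock Django commands to confirm the halves agree:

    python manage.py makemigrations machine --check --dry-run   # exits non-zero if models drifted from migration state
    python manage.py sqlmigrate machine 0001                    # prints the RunSQL forward SQL for review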

View File

@@ -363,6 +363,20 @@
background: rgba(248, 81, 73, 0.25);
width: 100%;
}
#progress-monitor .extractor-badge.backoff {
color: #b8860b;
}
#progress-monitor .extractor-badge.backoff .progress-fill {
background: rgba(210, 153, 34, 0.2);
width: 30%;
}
#progress-monitor .extractor-badge.skipped {
color: #6e7681;
}
#progress-monitor .extractor-badge.skipped .progress-fill {
background: rgba(110, 118, 129, 0.15);
width: 100%;
}
#progress-monitor .extractor-badge .badge-icon {
font-size: 10px;
}
@@ -400,6 +414,14 @@
background: rgba(248, 81, 73, 0.2);
color: #f85149;
}
#progress-monitor .status-badge.backoff {
background: rgba(210, 153, 34, 0.15);
color: #b8860b;
}
#progress-monitor .status-badge.unknown {
background: #21262d;
color: #6e7681;
}
</style>
@@ -470,25 +492,28 @@
});
function formatUrl(url) {
if (!url) return '(no URL)';
try {
const u = new URL(url);
return u.hostname + u.pathname.substring(0, 30) + (u.pathname.length > 30 ? '...' : '');
} catch {
return url.substring(0, 50) + (url.length > 50 ? '...' : '');
return String(url).substring(0, 50) + (String(url).length > 50 ? '...' : '');
}
}
function renderExtractor(extractor) {
const icon = extractor.status === 'started' ? '&#8635;' :
extractor.status === 'succeeded' ? '&#10003;' :
extractor.status === 'failed' ? '&#10007;' : '&#9675;';
extractor.status === 'failed' ? '&#10007;' :
extractor.status === 'backoff' ? '&#8987;' :
extractor.status === 'skipped' ? '&#8674;' : '&#9675;';
return `
<span class="extractor-badge ${extractor.status}">
<span class="extractor-badge ${extractor.status || 'queued'}">
<span class="progress-fill"></span>
<span class="badge-content">
<span class="badge-icon">${icon}</span>
<span>${extractor.plugin}</span>
<span>${extractor.plugin || 'unknown'}</span>
</span>
</span>
`;
@@ -496,13 +521,13 @@
function renderSnapshot(snapshot, crawlId) {
const statusIcon = snapshot.status === 'started' ? '&#8635;' : '&#128196;';
const adminUrl = `/admin/core/snapshot/${snapshot.id}/change/`;
const adminUrl = `/admin/core/snapshot/${snapshot.id || 'unknown'}/change/`;
let extractorHtml = '';
if (snapshot.all_plugins && snapshot.all_plugins.length > 0) {
// Sort plugins alphabetically by name to prevent reordering on updates
const sortedExtractors = [...snapshot.all_plugins].sort((a, b) =>
a.plugin.localeCompare(b.plugin)
(a.plugin || '').localeCompare(b.plugin || '')
);
extractorHtml = `
<div class="extractor-list">
@@ -518,16 +543,17 @@
<div class="snapshot-info">
<div class="snapshot-url">${formatUrl(snapshot.url)}</div>
<div class="snapshot-meta">
${snapshot.completed_plugins}/${snapshot.total_plugins} extractors
${snapshot.failed_plugins > 0 ? `<span style="color:#f85149">(${snapshot.failed_plugins} failed)</span>` : ''}
${(snapshot.total_plugins || 0) > 0
? `${snapshot.completed_plugins || 0}/${snapshot.total_plugins || 0} extractors${(snapshot.failed_plugins || 0) > 0 ? ` <span style="color:#f85149">(${snapshot.failed_plugins} failed)</span>` : ''}`
: 'Waiting for extractors...'}
</div>
</div>
<span class="status-badge ${snapshot.status}">${snapshot.status}</span>
<span class="status-badge ${snapshot.status || 'unknown'}">${snapshot.status || 'unknown'}</span>
</a>
<div class="snapshot-progress">
<div class="progress-bar-container">
<div class="progress-bar snapshot ${snapshot.status === 'started' && snapshot.progress === 0 ? 'indeterminate' : ''}"
style="width: ${snapshot.progress}%"></div>
<div class="progress-bar snapshot ${snapshot.status === 'started' && (snapshot.progress || 0) === 0 ? 'indeterminate' : ''}"
style="width: ${snapshot.progress || 0}%"></div>
</div>
</div>
${extractorHtml}
@@ -537,7 +563,7 @@
function renderCrawl(crawl) {
const statusIcon = crawl.status === 'started' ? '&#8635;' : '&#128269;';
const adminUrl = `/admin/crawls/crawl/${crawl.id}/change/`;
const adminUrl = `/admin/crawls/crawl/${crawl.id || 'unknown'}/change/`;
let snapshotsHtml = '';
if (crawl.active_snapshots && crawl.active_snapshots.length > 0) {
@@ -556,7 +582,7 @@
// Queued but retry_at is in future (was claimed by worker, will retry)
warningHtml = `
<div style="padding: 8px 14px; background: rgba(88, 166, 255, 0.1); border-top: 1px solid #58a6ff; color: #58a6ff; font-size: 11px;">
🔄 Retrying in ${crawl.seconds_until_retry}s...${crawl.urls_preview ? ` (${crawl.urls_preview})` : ''}
🔄 Retrying in ${crawl.seconds_until_retry || 0}s...${crawl.urls_preview ? ` (${crawl.urls_preview})` : ''}
</div>
`;
} else if (crawl.status === 'queued' && crawl.total_snapshots === 0) {
@@ -569,34 +595,34 @@
}
// Show snapshot info or URL count if no snapshots yet
let metaText = `depth: ${crawl.max_depth}`;
if (crawl.total_snapshots > 0) {
let metaText = `depth: ${crawl.max_depth || 0}`;
if ((crawl.total_snapshots || 0) > 0) {
metaText += ` | ${crawl.total_snapshots} snapshots`;
} else if (crawl.urls_count > 0) {
} else if ((crawl.urls_count || 0) > 0) {
metaText += ` | ${crawl.urls_count} URLs`;
} else if (crawl.urls_preview) {
metaText += ` | ${crawl.urls_preview.substring(0, 40)}${crawl.urls_preview.length > 40 ? '...' : ''}`;
}
return `
<div class="crawl-item" data-crawl-id="${crawl.id}">
<div class="crawl-item" data-crawl-id="${crawl.id || 'unknown'}">
<a class="crawl-header" href="${adminUrl}">
<span class="crawl-icon">${statusIcon}</span>
<div class="crawl-info">
<div class="crawl-label">${crawl.label}</div>
<div class="crawl-label">${crawl.label || '(no label)'}</div>
<div class="crawl-meta">${metaText}</div>
</div>
<div class="crawl-stats">
<span style="color:#3fb950">${crawl.completed_snapshots} done</span>
<span style="color:#3fb950">${crawl.completed_snapshots || 0} done</span>
<span style="color:#d29922">${crawl.started_snapshots || 0} active</span>
<span style="color:#8b949e">${crawl.pending_snapshots} pending</span>
<span style="color:#8b949e">${crawl.pending_snapshots || 0} pending</span>
</div>
<span class="status-badge ${crawl.status}">${crawl.status}</span>
<span class="status-badge ${crawl.status || 'unknown'}">${crawl.status || 'unknown'}</span>
</a>
<div class="crawl-progress">
<div class="progress-bar-container">
<div class="progress-bar crawl ${crawl.status === 'started' && crawl.progress === 0 ? 'indeterminate' : ''}"
style="width: ${crawl.progress}%"></div>
<div class="progress-bar crawl ${crawl.status === 'started' && (crawl.progress || 0) === 0 ? 'indeterminate' : ''}"
style="width: ${crawl.progress || 0}%"></div>
</div>
</div>
${warningHtml}
@@ -668,7 +694,7 @@
idleMessage.style.display = 'none';
crawlTree.innerHTML = `
<div class="idle-message">
${data.snapshots_started} snapshots processing, ${data.archiveresults_started} extractors running
${data.snapshots_started || 0} snapshots processing, ${data.archiveresults_started || 0} extractors running
</div>
`;
} else {
@@ -676,7 +702,7 @@
// Build the URL for recent crawls (last 24 hours)
var yesterday = new Date(Date.now() - 24*60*60*1000).toISOString().split('T')[0];
var recentUrl = '/admin/crawls/crawl/?created_at__gte=' + yesterday + '&o=-1';
idleMessage.innerHTML = `No active crawls (${data.crawls_pending} pending, ${data.crawls_started} started, <a href="${recentUrl}" style="color: #58a6ff;">${data.crawls_recent} recent</a>)`;
idleMessage.innerHTML = `No active crawls (${data.crawls_pending || 0} pending, ${data.crawls_started || 0} started, <a href="${recentUrl}" style="color: #58a6ff;">${data.crawls_recent || 0} recent</a>)`;
crawlTree.innerHTML = '';
}
}
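All of the || 0 and || 'unknown' fallbacks above guard against payload keys that can be absent while a crawl is still spinning up. A hedged server-side sketch of one way to make the payload total for renderCrawl (this helper is hypothetical and not part of the commit; key names mirror what the template reads):

    def serialize_crawl(crawl, total, completed, started, pending, progress):
        # Hypothetical helper: always emit every key renderCrawl() touches,
        # with the same defaults the template falls back to.
        return {
            'id': str(crawl.id),
            'label': crawl.label or '(no label)',
            'status': crawl.status or 'unknown',
            'max_depth': crawl.max_depth or 0,
            'total_snapshots': total,
            'completed_snapshots': completed,
            'started_snapshots': started,
            'pending_snapshots': pending,
            'progress': progress or 0,
        }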

View File

@@ -91,7 +91,11 @@ def test_add_multiple_urls_single_command(tmp_path, process, disable_extractors_
def test_add_from_file(tmp_path, process, disable_extractors_dict):
"""Test adding URLs from a file."""
"""Test adding URLs from a file.
With --index-only, this creates a snapshot for the file itself, not the URLs inside.
To get snapshots for the URLs inside, run without --index-only so the parsers execute.
"""
os.chdir(tmp_path)
# Create a file with URLs
@@ -108,10 +112,13 @@ def test_add_from_file(tmp_path, process, disable_extractors_dict):
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
crawl_count = c.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0]
snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
conn.close()
assert snapshot_count == 2
# With --index-only, creates 1 snapshot for the file itself
assert crawl_count == 1
assert snapshot_count == 1
def test_add_with_depth_0_flag(tmp_path, process, disable_extractors_dict):
@@ -141,7 +148,11 @@ def test_add_with_depth_1_flag(tmp_path, process, disable_extractors_dict):
def test_add_with_tags(tmp_path, process, disable_extractors_dict):
"""Test adding URL with tags creates tag records."""
"""Test adding URL with tags stores tags_str in crawl.
With --index-only, Tag objects are not created until archiving happens.
Tags are stored as a string in the Crawl.tags_str field.
"""
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', '--tag=test,example', 'https://example.com'],
@@ -151,15 +162,19 @@ def test_add_with_tags(tmp_path, process, disable_extractors_dict):
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
tags = c.execute("SELECT name FROM core_tag").fetchall()
tags_str = c.execute("SELECT tags_str FROM crawls_crawl").fetchone()[0]
conn.close()
tag_names = [t[0] for t in tags]
assert 'test' in tag_names or 'example' in tag_names
# Tags are stored as a comma-separated string in crawl
assert 'test' in tags_str or 'example' in tags_str
def test_add_duplicate_url_updates_existing(tmp_path, process, disable_extractors_dict):
"""Test that adding the same URL twice updates rather than duplicates."""
def test_add_duplicate_url_creates_separate_crawls(tmp_path, process, disable_extractors_dict):
"""Test that adding the same URL twice creates separate crawls and snapshots.
Each 'add' command creates a new Crawl. Multiple crawls can archive the same URL.
This allows re-archiving URLs at different times.
"""
os.chdir(tmp_path)
# Add URL first time
@@ -179,10 +194,12 @@ def test_add_duplicate_url_updates_existing(tmp_path, process, disable_extractor
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
snapshot_count = c.execute("SELECT COUNT(*) FROM core_snapshot WHERE url='https://example.com'").fetchone()[0]
crawl_count = c.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0]
conn.close()
# Should still only have one snapshot for this URL
assert snapshot_count == 1
# Each add creates a new crawl with its own snapshot
assert crawl_count == 2
assert snapshot_count == 2
def test_add_with_overwrite_flag(tmp_path, process, disable_extractors_dict):
@@ -208,7 +225,10 @@ def test_add_with_overwrite_flag(tmp_path, process, disable_extractors_dict):
def test_add_creates_archive_subdirectory(tmp_path, process, disable_extractors_dict):
"""Test that add creates archive subdirectory for the snapshot."""
"""Test that add creates archive subdirectory for the snapshot.
Archive subdirectories are named by timestamp, not by snapshot ID.
"""
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
@@ -216,14 +236,14 @@ def test_add_creates_archive_subdirectory(tmp_path, process, disable_extractors_
env=disable_extractors_dict,
)
# Get the snapshot ID from the database
# Get the snapshot timestamp from the database
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
snapshot_id = c.execute("SELECT id FROM core_snapshot").fetchone()[0]
timestamp = c.execute("SELECT timestamp FROM core_snapshot").fetchone()[0]
conn.close()
# Check that archive subdirectory was created
archive_dir = tmp_path / "archive" / snapshot_id
# Check that archive subdirectory was created using timestamp
archive_dir = tmp_path / "archive" / str(timestamp)
assert archive_dir.exists()
assert archive_dir.is_dir()
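Since archive directories are keyed by Snapshot.timestamp (a stringified epoch float, per the add hunk earlier in this commit), the on-disk path is derivable from the row alone. A minimal sketch:

    from pathlib import Path

    def archive_dir_for(data_dir, timestamp):
        # timestamp is stored as str(timezone.now().timestamp()), e.g. '1735534051.123456',
        # and doubles as the snapshot's subdirectory name under archive/.
        return Path(data_dir) / 'archive' / str(timestamp)

    assert archive_dir_for('.', '1735534051.123456') == Path('archive/1735534051.123456')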

View File

@@ -1,7 +1,7 @@
#!/usr/bin/env python3
"""
Comprehensive tests for archivebox update command.
Verify update re-archives snapshots and updates DB status.
Verify update drains old dirs, reconciles DB, and queues snapshots.
"""
import os
@@ -15,7 +15,7 @@ def test_update_runs_successfully_on_empty_archive(tmp_path, process):
"""Test that update runs without error on empty archive."""
os.chdir(tmp_path)
result = subprocess.run(
['archivebox', 'update', '--index-only'],
['archivebox', 'update'],
capture_output=True,
text=True,
timeout=30,
@@ -25,41 +25,21 @@ def test_update_runs_successfully_on_empty_archive(tmp_path, process):
assert result.returncode == 0
def test_update_re_archives_existing_snapshots(tmp_path, process, disable_extractors_dict):
"""Test that update command re-archives existing snapshots."""
def test_update_reconciles_existing_snapshots(tmp_path, process, disable_extractors_dict):
"""Test that update command reconciles existing snapshots."""
os.chdir(tmp_path)
# Add a snapshot
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
)
# Run update
result = subprocess.run(
['archivebox', 'update', '--index-only'],
['archivebox', 'add', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
)
assert result.returncode == 0
def test_update_index_only_flag(tmp_path, process, disable_extractors_dict):
"""Test that --index-only flag skips extraction."""
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
)
# Update with index-only should be fast
# Run update - should reconcile and queue
result = subprocess.run(
['archivebox', 'update', '--index-only'],
['archivebox', 'update'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
@@ -74,26 +54,28 @@ def test_update_specific_snapshot_by_filter(tmp_path, process, disable_extractor
# Add multiple snapshots
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
['archivebox', 'add', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
)
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.org'],
capture_output=True,
env=disable_extractors_dict,
)
# Update with filter
result = subprocess.run(
['archivebox', 'update', '--index-only', '--filter-type=search', '--filter=example.com'],
['archivebox', 'add', '--depth=0', 'https://example.org'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
)
# Should complete (may succeed or show usage)
assert result.returncode in [0, 1, 2]
# Update with filter pattern (uses filter_patterns argument)
result = subprocess.run(
['archivebox', 'update', '--filter-type=substring', 'example.com'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
)
# Should complete successfully
assert result.returncode == 0
def test_update_preserves_snapshot_count(tmp_path, process, disable_extractors_dict):
@@ -102,9 +84,10 @@ def test_update_preserves_snapshot_count(tmp_path, process, disable_extractors_d
# Add snapshots
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
['archivebox', 'add', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
)
# Count before update
@@ -115,9 +98,9 @@ def test_update_preserves_snapshot_count(tmp_path, process, disable_extractors_d
assert count_before == 1
# Run update
# Run update (should reconcile + queue, not create new snapshots)
subprocess.run(
['archivebox', 'update', '--index-only'],
['archivebox', 'update'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
@@ -133,21 +116,31 @@ def test_update_preserves_snapshot_count(tmp_path, process, disable_extractors_d
assert count_after == count_before
def test_update_with_overwrite_flag(tmp_path, process, disable_extractors_dict):
"""Test update with --overwrite flag forces re-archiving."""
def test_update_queues_snapshots_for_archiving(tmp_path, process, disable_extractors_dict):
"""Test that update queues snapshots for archiving."""
os.chdir(tmp_path)
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
['archivebox', 'add', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
)
# Run update
result = subprocess.run(
['archivebox', 'update', '--index-only', '--overwrite'],
['archivebox', 'update'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
)
assert result.returncode == 0
# Check that snapshot is queued
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
status = c.execute("SELECT status FROM core_snapshot").fetchone()[0]
conn.close()
assert status == 'queued'
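The same assertion via the ORM, for anyone checking this from a configured Django shell instead of raw sqlite3 (import path assumed from the core_snapshot table name):

    from core.models import Snapshot

    snap = Snapshot.objects.get(url='https://example.com')
    assert snap.status == 'queued'
    # A retry_at at or before now means the orchestrator may claim it immediately.
    assert snap.retry_at is not None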