diff --git a/.claude/settings.local.json b/.claude/settings.local.json
index cd9c657a..fede3847 100644
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@@ -25,7 +25,8 @@
       "Bash(echo:*)",
       "Bash(grep:*)",
       "WebFetch(domain:python-statemachine.readthedocs.io)",
-      "Bash(./bin/run_plugin_tests.sh:*)"
+      "Bash(./bin/run_plugin_tests.sh:*)",
+      "Bash(done)"
     ]
   }
 }
diff --git a/.gitignore b/.gitignore
index 066d722a..5f6ffcae 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,6 +6,12 @@
 __pycache__/
 .eggs/
 tests/out/
+# Coverage
+.coverage
+.coverage.*
+coverage.json
+htmlcov/
+
 # Python and Node dependencies
 venv/
 .venv/
diff --git a/CLAUDE.md b/CLAUDE.md
index 35a58346..e0446e65 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -27,135 +27,17 @@ uv sync --dev --all-extras # Always use uv, never pip directly
 source .venv/bin/activate
 ```
 
-### Generate and Apply Migrations
-```bash
-# Generate migrations (run from archivebox subdirectory)
-cd archivebox
-./manage.py makemigrations
+### Common Gotchas
 
-# Apply migrations to test database
-cd data/
-archivebox init
-```
-
-## Running Tests
-
-### CRITICAL: Never Run as Root
-ArchiveBox has a root check that prevents running as root user. All ArchiveBox commands (including tests) must run as non-root user inside a data directory:
-
-```bash
-# Run all migration tests
-sudo -u testuser bash -c 'source /path/to/.venv/bin/activate && python -m pytest archivebox/tests/test_migrations_*.py -v'
-
-# Run specific test file
-sudo -u testuser bash -c 'source .venv/bin/activate && python -m pytest archivebox/tests/test_migrations_08_to_09.py -v'
-
-# Run single test
-sudo -u testuser bash -c 'source .venv/bin/activate && python -m pytest archivebox/tests/test_migrations_fresh.py::TestFreshInstall::test_init_creates_database -xvs'
-```
-
-### Test File Structure
-```
-archivebox/tests/
-├── test_migrations_helpers.py # Schemas, seeding functions, verification helpers
-├── test_migrations_fresh.py # Fresh install tests
-├── test_migrations_04_to_09.py # 0.4.x → 0.9.x migration tests
-├── test_migrations_07_to_09.py # 0.7.x → 0.9.x migration tests
-└── test_migrations_08_to_09.py # 0.8.x → 0.9.x migration tests
-```
-
-## Test Writing Standards
-
-### NO MOCKS - Real Tests Only
-Tests must exercise real code paths:
-- Create real SQLite databases with version-specific schemas
-- Seed with realistic test data
-- Run actual `python -m archivebox` commands via subprocess
-- Query SQLite directly to verify results
-
-**If something is hard to test**: Modify the implementation to make it easier to test, or fix the underlying issue. Never mock, skip, simulate, or exit early from a test because you can't get something working inside the test.
-
-### NO SKIPS
-Never use `@skip`, `skipTest`, or `pytest.mark.skip`. Every test must run. If a test is difficult, fix the code or test environment - don't disable the test.
-
-### Strict Assertions
-- `init` command must return exit code 0 (not `[0, 1]`)
-- Verify ALL data is preserved, not just "at least one"
-- Use exact counts (`==`) not loose bounds (`>=`)
-
-### Example Test Pattern
-```python
-def test_migration_preserves_snapshots(self):
-    """Migration should preserve all snapshots."""
-    result = run_archivebox(self.work_dir, ['init'], timeout=45)
-    self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
-
-    ok, msg = verify_snapshot_count(self.db_path, expected_count)
-    self.assertTrue(ok, msg)
-```
-
-## Migration Testing
-
-### Schema Versions
-- **0.4.x**: First Django version. Tags as comma-separated string, no ArchiveResult model
-- **0.7.x**: Tag model with M2M, ArchiveResult model, AutoField PKs
-- **0.8.x**: Crawl/Seed models, UUID PKs, status fields, depth/retry_at
-- **0.9.x**: Seed model removed, seed_id FK removed from Crawl
-
-### Testing a Migration Path
-1. Create SQLite DB with source version schema (from `test_migrations_helpers.py`)
-2. Seed with realistic test data using `seed_0_X_data()`
-3. Run `archivebox init` to trigger migrations
-4. Verify data preservation with `verify_*` functions
-5. Test CLI commands work post-migration (`status`, `list`, `add`, etc.)
-
-### Squashed Migrations
-When testing 0.8.x (dev branch), you must record ALL replaced migrations:
-```python
-# The squashed migration replaces these - all must be recorded
-('core', '0023_alter_archiveresult_options_archiveresult_abid_and_more'),
-('core', '0024_auto_20240513_1143'),
-# ... all 52 migrations from 0023-0074 ...
-('core', '0023_new_schema'), # Also record the squashed migration itself
-```
-
-## Common Gotchas
-
-### 1. File Permissions
+#### File Permissions
 New files created by root need permissions fixed for testuser:
 ```bash
 chmod 644 archivebox/tests/test_*.py
 ```
 
-### 2. DATA_DIR Environment Variable
+#### DATA_DIR Environment Variable
 ArchiveBox commands must run inside a data directory. Tests use temp directories - the `run_archivebox()` helper sets `DATA_DIR` automatically.
 
-### 3. Extractors Disabled for Speed
-Tests disable all extractors via environment variables for faster execution:
-```python
-env['SAVE_TITLE'] = 'False'
-env['SAVE_FAVICON'] = 'False'
-# ... etc
-```
-
-### 4. Timeout Settings
-Use appropriate timeouts for migration tests (45s for init, 60s default).
-
-### 5. Circular FK References in Schemas
-SQLite handles circular references with `IF NOT EXISTS`. Order matters less than in other DBs.
-
-## Architecture Notes
-
-### Crawl Model (0.9.x)
-- Crawl groups multiple Snapshots from a single `add` command
-- Each `add` creates one Crawl with one or more Snapshots
-- Seed model was removed - crawls now store URLs directly
-
-### Migration Strategy
-- Squashed migrations for clean installs
-- Individual migrations recorded for upgrades from dev branch
-- `replaces` attribute in squashed migrations lists what they replace
-
 ## Code Style Guidelines
 
 ### Naming Conventions for Grep-ability
@@ -207,6 +89,334 @@ class Binary(models.Model):
 **Principle**: If you're storing the same conceptual data (e.g., `overrides`), use the same field name across all models and keep the internal structure identical. This makes the codebase predictable and reduces cognitive load.
 
+## Testing
+
+### CRITICAL: Never Run as Root
+ArchiveBox has a root check that prevents running as root user. All ArchiveBox commands (including tests) must run as non-root user inside a data directory:
+
+```bash
+# Run all migration tests
+sudo -u testuser bash -c 'source /path/to/.venv/bin/activate && python -m pytest archivebox/tests/test_migrations_*.py -v'
+
+# Run specific test file
+sudo -u testuser bash -c 'source .venv/bin/activate && python -m pytest archivebox/tests/test_migrations_08_to_09.py -v'
+
+# Run single test
+sudo -u testuser bash -c 'source .venv/bin/activate && python -m pytest archivebox/tests/test_migrations_fresh.py::TestFreshInstall::test_init_creates_database -xvs'
+```
+
+### Test File Structure
+```
+archivebox/tests/
+├── test_migrations_helpers.py # Schemas, seeding functions, verification helpers
+├── test_migrations_fresh.py # Fresh install tests
+├── test_migrations_04_to_09.py # 0.4.x → 0.9.x migration tests
+├── test_migrations_07_to_09.py # 0.7.x → 0.9.x migration tests
+└── test_migrations_08_to_09.py # 0.8.x → 0.9.x migration tests
+```
+
+### Test Writing Standards
+
+#### NO MOCKS - Real Tests Only
+Tests must exercise real code paths:
+- Create real SQLite databases with version-specific schemas
+- Seed with realistic test data
+- Run actual `python -m archivebox` commands via subprocess
+- Query SQLite directly to verify results
+
+**If something is hard to test**: Modify the implementation to make it easier to test, or fix the underlying issue. Never mock, skip, simulate, or exit early from a test because you can't get something working inside the test.
+
+#### NO SKIPS
+Never use `@skip`, `skipTest`, or `pytest.mark.skip`. Every test must run. If a test is difficult, fix the code or test environment - don't disable the test.
+
+#### Strict Assertions
+- `init` command must return exit code 0 (not `[0, 1]`)
+- Verify ALL data is preserved, not just "at least one"
+- Use exact counts (`==`) not loose bounds (`>=`)
+
+### Example Test Pattern
+```python
+def test_migration_preserves_snapshots(self):
+    """Migration should preserve all snapshots."""
+    result = run_archivebox(self.work_dir, ['init'], timeout=45)
+    self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
+
+    ok, msg = verify_snapshot_count(self.db_path, expected_count)
+    self.assertTrue(ok, msg)
+```
+
+### Testing Gotchas
+
+#### Extractors Disabled for Speed
+Tests disable all extractors via environment variables for faster execution:
+```python
+env['SAVE_TITLE'] = 'False'
+env['SAVE_FAVICON'] = 'False'
+# ... etc
+```
+
+#### Timeout Settings
+Use appropriate timeouts for migration tests (45s for init, 60s default).
+
+## Database Migrations
+
+### Generate and Apply Migrations
+```bash
+# Generate migrations (run from archivebox subdirectory)
+cd archivebox
+./manage.py makemigrations
+
+# Apply migrations to test database
+cd data/
+archivebox init
+```
+
+### Schema Versions
+- **0.4.x**: First Django version. Tags as comma-separated string, no ArchiveResult model
+- **0.7.x**: Tag model with M2M, ArchiveResult model, AutoField PKs
+- **0.8.x**: Crawl/Seed models, UUID PKs, status fields, depth/retry_at
+- **0.9.x**: Seed model removed, seed_id FK removed from Crawl
+
+### Testing a Migration Path
+1. Create SQLite DB with source version schema (from `test_migrations_helpers.py`)
+2. Seed with realistic test data using `seed_0_X_data()`
+3. Run `archivebox init` to trigger migrations
+4. Verify data preservation with `verify_*` functions
+5. Test CLI commands work post-migration (`status`, `list`, `add`, etc.)
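+
+A minimal sketch of this flow, following the example pattern above. Hedged: `create_0_7_schema` and `seed_0_7_data` are hypothetical stand-ins for whatever schema/seeding helpers `test_migrations_helpers.py` actually exposes - adapt the names to that file.
+
+```python
+def test_0_7_to_0_9_migration_path(self):
+    """Sketch: build a 0.7.x DB, migrate via init, verify data, then exercise the CLI."""
+    # 1-2. Create the source-version schema and seed realistic data (hypothetical helper names)
+    create_0_7_schema(self.db_path)
+    expected_count = seed_0_7_data(self.db_path)
+
+    # 3. Run `archivebox init` to trigger migrations (must exit 0, not "0 or 1")
+    result = run_archivebox(self.work_dir, ['init'], timeout=45)
+    self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
+
+    # 4. Verify data preservation with exact counts
+    ok, msg = verify_snapshot_count(self.db_path, expected_count)
+    self.assertTrue(ok, msg)
+
+    # 5. CLI commands should still work post-migration
+    result = run_archivebox(self.work_dir, ['status'], timeout=60)
+    self.assertEqual(result.returncode, 0, f"Status failed: {result.stderr}")
+```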
+
+### Squashed Migrations
+When testing 0.8.x (dev branch), you must record ALL replaced migrations:
+```python
+# The squashed migration replaces these - all must be recorded
+('core', '0023_alter_archiveresult_options_archiveresult_abid_and_more'),
+('core', '0024_auto_20240513_1143'),
+# ... all 52 migrations from 0023-0074 ...
+('core', '0023_new_schema'), # Also record the squashed migration itself
+```
+
+### Migration Strategy
+- Squashed migrations for clean installs
+- Individual migrations recorded for upgrades from dev branch
+- `replaces` attribute in squashed migrations lists what they replace
+
+### Migration Gotchas
+
+#### Circular FK References in Schemas
+SQLite handles circular references with `IF NOT EXISTS`. Order matters less than in other DBs.
+
+## Plugin System Architecture
+
+### Plugin Dependency Rules
+
+Like other plugins, chrome plugins **ARE NOT ALLOWED TO DEPEND ON ARCHIVEBOX OR DJANGO**.
+However, they are allowed to depend on two shared files ONLY:
+- `archivebox/plugins/chrome/chrome_utils.js` ← source of truth API for all basic chrome ops
+- `archivebox/plugins/chrome/tests/chrome_test_utils.py` ← use for your tests; do not re-implement launching/killing/pid files/CDP/etc. in Python, just extend this file as needed.
+
+### Chrome-Dependent Plugins
+
+Many plugins depend on Chrome/Chromium via CDP (Chrome DevTools Protocol). When checking for script name references or debugging Chrome-related issues, check these plugins:
+
+**Main puppeteer-based chrome installer + launcher plugin**:
+- `chrome` - Core Chrome integration (CDP, launch, navigation)
+
+**Metadata extraction using chrome/chrome_utils.js / CDP**:
+- `dns` - DNS resolution info
+- `ssl` - SSL certificate info
+- `headers` - HTTP response headers
+- `redirects` - Capture redirect chains
+- `staticfile` - Direct file downloads (e.g. if the url itself is a .png, .exe, .zip, etc.)
+- `responses` - Capture network responses
+- `consolelog` - Capture console.log output
+- `title` - Extract page title
+- `accessibility` - Extract accessibility tree
+- `seo` - Extract SEO metadata
+
+**Extensions installed using chrome/chrome_utils.js / controlled using CDP**:
+- `ublock` - uBlock Origin ad blocking
+- `istilldontcareaboutcookies` - Cookie banner dismissal
+- `twocaptcha` - 2captcha CAPTCHA solver integration
+
+**Page-alteration plugins to prepare the content for archiving**:
+- `modalcloser` - Modal dialog dismissal
+- `infiniscroll` - Infinite scroll handler
+
+**Main Extractor Outputs**:
+- `dom` - DOM snapshot extraction
+- `pdf` - Generate PDF snapshots
+- `screenshot` - Generate screenshots
+- `singlefile` - SingleFile archival; either single-file-cli that launches chrome, or the singlefile extension running inside chrome
+
+**Crawl URL parsers** (post-process dom.html, singlefile.html, staticfile, responses, headers, etc. for URLs to re-emit as new queued Snapshots during recursive crawling):
+- `parse_dom_outlinks` - Extract outlinks from DOM (special, uses CDP to directly query browser)
+- `parse_html_urls` - Parse URLs from HTML (doesn't use chrome directly, just reads dom.html)
+- `parse_jsonl_urls` - Parse URLs from JSONL (doesn't use chrome directly)
+- `parse_netscape_urls` - Parse Netscape bookmark format (doesn't use chrome directly)
+
+### Finding Chrome-Dependent Plugins
+
+```bash
+# Find all files containing "chrom" (case-insensitive)
+grep -ri "chrom" archivebox/plugins/*/on_*.* --include="*.*" 2>/dev/null | cut -d: -f1 | sort -u
+
+# Or get just the plugin names
+grep -ri "chrom" archivebox/plugins/*/on_*.* --include="*.*" 2>/dev/null | cut -d/ -f3 | sort -u
+```
+
+**Note**: This list may not be complete. Always run the grep command above when checking for Chrome-related script references or debugging Chrome integration issues.
+
+## Architecture Notes
+
+### Crawl Model (0.9.x)
+- Crawl groups multiple Snapshots from a single `add` command
+- Each `add` creates one Crawl with one or more Snapshots
+- Seed model was removed - crawls now store URLs directly
+
+## Code Coverage
+
+### Overview
+
+Coverage tracking is enabled for passive collection across all contexts:
+- Unit tests (pytest)
+- Integration tests
+- Dev server (manual testing)
+- CLI usage
+
+Coverage data accumulates in the `.coverage` file and can be viewed/analyzed to find dead code.
+
+### Install Coverage Tools
+
+```bash
+uv sync --dev # Installs pytest-cov and coverage
+```
+
+### Running with Coverage
+
+#### Unit Tests
+```bash
+# Run tests with coverage
+pytest --cov=archivebox --cov-report=term archivebox/tests/
+
+# Or run specific test file
+pytest --cov=archivebox --cov-report=term archivebox/tests/test_migrations_08_to_09.py
+```
+
+#### Dev Server with Coverage
+```bash
+# Start dev server with coverage tracking
+coverage run --parallel-mode -m archivebox server
+
+# Or CLI commands
+coverage run --parallel-mode -m archivebox init
+coverage run --parallel-mode -m archivebox add https://example.com
+```
+
+#### Manual Testing (Always-On)
+To enable coverage during ALL Python executions (passive tracking):
+
+```bash
+# Option 1: Use coverage run wrapper
+coverage run --parallel-mode -m archivebox [command]
+
+# Option 2: Set environment variable (tracks everything)
+export COVERAGE_PROCESS_START=pyproject.toml
+# Now all Python processes will track coverage
+archivebox server
+archivebox add https://example.com
+```
+
+### Viewing Coverage
+
+#### Text Report (Quick View)
+```bash
+# Combine all parallel coverage data
+coverage combine
+
+# View summary
+coverage report
+
+# View detailed report with missing lines
+coverage report --show-missing
+
+# View specific file
+coverage report --include="archivebox/core/models.py" --show-missing
+```
+
+#### JSON Report (LLM-Friendly)
+```bash
+# Generate JSON report
+coverage json
+
+# View the JSON
+cat coverage.json | jq '.files | keys' # List all files
+
+# Find files with low coverage
+cat coverage.json | jq -r '.files | to_entries[] | select(.value.summary.percent_covered < 50) | "\(.key): \(.value.summary.percent_covered)%"'
+
+# Find completely uncovered files (dead code candidates)
+cat coverage.json | jq -r '.files | to_entries[] | select(.value.summary.percent_covered == 0) | .key'
+
+# Get missing lines for a specific file
+cat coverage.json | jq '.files["archivebox/core/models.py"].missing_lines'
+```
+
+#### HTML Report (Visual)
+```bash
+# Generate interactive HTML report
+coverage html
+
+# Open in browser
+open htmlcov/index.html
+```
+
+### Isolated Runs
+
+To measure coverage for specific scenarios:
+
+```bash
+# 1. Reset coverage data
+coverage erase
+
+# 2. Run your isolated test/scenario
+pytest --cov=archivebox archivebox/tests/test_migrations_fresh.py
+# OR
+coverage run --parallel-mode -m archivebox add https://example.com
+
+# 3. View results
+coverage combine
+coverage report --show-missing
+
+# 4. Optionally export for analysis
+coverage json
+```
+
+### Finding Dead Code
+
+```bash
+# 1. Run comprehensive tests + manual testing to build coverage
+pytest --cov=archivebox archivebox/tests/
+coverage run --parallel-mode -m archivebox server # Use the app manually
+coverage combine
+
+# 2. Find files with 0% coverage (strong dead code candidates)
+coverage json
+cat coverage.json | jq -r '.files | to_entries[] | select(.value.summary.percent_covered == 0) | .key'
+
+# 3. Find files with <10% coverage (likely dead code)
+cat coverage.json | jq -r '.files | to_entries[] | select(.value.summary.percent_covered < 10) | "\(.key): \(.value.summary.percent_covered)%"' | sort -t: -k2 -n
+
+# 4. Generate detailed report for analysis
+coverage report --show-missing > coverage_report.txt
+```
+
+### Tips
+
+- **Parallel mode** (`--parallel-mode`): Allows multiple processes to track coverage simultaneously without conflicts
+- **Combine**: Always run `coverage combine` before viewing reports to merge parallel data
+- **Reset**: Use `coverage erase` to start fresh for isolated measurements
+- **Branch coverage**: Enabled by default - tracks if both branches of if/else are executed
+- **Exclude patterns**: Config in `pyproject.toml` excludes tests, migrations, type stubs
+
 ## Debugging Tips
 
 ### Check Migration State
diff --git a/archivebox/config/constants.py b/archivebox/config/constants.py
index 59e64aeb..ec11dff9 100644
--- a/archivebox/config/constants.py
+++ b/archivebox/config/constants.py
@@ -83,15 +83,15 @@ class ConstantsDict(Mapping):
     CRONTABS_DIR_NAME: str = 'crontabs'
     CACHE_DIR_NAME: str = 'cache'
     LOGS_DIR_NAME: str = 'logs'
-    USER_PLUGINS_DIR_NAME: str = 'user_plugins'
-    CUSTOM_TEMPLATES_DIR_NAME: str = 'user_templates'
+    CUSTOM_PLUGINS_DIR_NAME: str = 'custom_plugins'
+    CUSTOM_TEMPLATES_DIR_NAME: str = 'custom_templates'
     ARCHIVE_DIR: Path = DATA_DIR / ARCHIVE_DIR_NAME
     SOURCES_DIR: Path = DATA_DIR / SOURCES_DIR_NAME
     PERSONAS_DIR: Path = DATA_DIR / PERSONAS_DIR_NAME
     LOGS_DIR: Path = DATA_DIR / LOGS_DIR_NAME
     CACHE_DIR: Path = DATA_DIR / CACHE_DIR_NAME
     CUSTOM_TEMPLATES_DIR: Path = DATA_DIR / CUSTOM_TEMPLATES_DIR_NAME
-    USER_PLUGINS_DIR: Path = DATA_DIR / USER_PLUGINS_DIR_NAME
+    USER_PLUGINS_DIR: Path = DATA_DIR / CUSTOM_PLUGINS_DIR_NAME
 
     # Data dir files
     CONFIG_FILENAME: str = 'ArchiveBox.conf'
@@ -171,8 +171,11 @@ class ConstantsDict(Mapping):
         TMP_DIR_NAME,
         PERSONAS_DIR_NAME,
         CUSTOM_TEMPLATES_DIR_NAME,
-        USER_PLUGINS_DIR_NAME,
+        CUSTOM_PLUGINS_DIR_NAME,
         CRONTABS_DIR_NAME,
+        # Backwards compatibility with old directory names
+        "user_plugins", # old name for USER_PLUGINS_DIR (now 'plugins')
+        "user_templates", # old name for CUSTOM_TEMPLATES_DIR (now 'templates')
         "static", # created by old static exports
+        color = 'green' if h >= 80 else 'orange' if h >= 50 else 'red'
+        return format_html('{}', color, h)
+
     def grid_view(self, request, extra_context=None):
         # cl = self.get_changelist_instance(request)
diff --git a/archivebox/core/migrations/0023_upgrade_to_0_9_0.py b/archivebox/core/migrations/0023_upgrade_to_0_9_0.py
index 59dcd9e4..8ad24966 100644
--- a/archivebox/core/migrations/0023_upgrade_to_0_9_0.py
+++ b/archivebox/core/migrations/0023_upgrade_to_0_9_0.py
@@ -3,6 +3,7 @@
 # Handles both fresh installs and upgrades from v0.7.2/v0.8.6rc0
 
 from django.db import migrations, models, connection
+import django.utils.timezone
 
 
 def get_table_columns(table_name):
@@ -95,31 +96,31 @@ def upgrade_core_tables(apps, schema_editor):
     # ============================================================================
     # PART 2: Upgrade core_snapshot table
     # ============================================================================
+    # Create table with NEW field names for timestamps (bookmarked_at, created_at, modified_at)
+    # and all other fields needed by later migrations
     cursor.execute("""
         CREATE TABLE IF NOT EXISTS core_snapshot_new (
             id TEXT PRIMARY KEY NOT NULL,
-            created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
-            modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
-
             url TEXT NOT NULL,
             timestamp VARCHAR(32) NOT NULL UNIQUE,
-            bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
-
+            title VARCHAR(512),
             crawl_id TEXT,
             parent_snapshot_id TEXT,
-            title VARCHAR(512),
+            bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
+            created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
+            modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
+            downloaded_at DATETIME,
+            status VARCHAR(15) NOT NULL DEFAULT 'queued',
+            retry_at DATETIME,
+            depth INTEGER NOT NULL DEFAULT 0,
             fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0',
-
             config TEXT NOT NULL DEFAULT '{}',
             notes TEXT NOT NULL DEFAULT '',
             num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
             num_uses_failed INTEGER NOT NULL DEFAULT 0,
-
-            status VARCHAR(15) NOT NULL DEFAULT 'queued',
-            retry_at DATETIME,
             current_step INTEGER NOT NULL DEFAULT 0,
 
             FOREIGN KEY (crawl_id) REFERENCES crawls_crawl(id) ON DELETE CASCADE,
@@ -141,29 +142,23 @@ def upgrade_core_tables(apps, schema_editor):
     has_bookmarked_at = 'bookmarked_at' in snapshot_cols
 
     if has_added and not has_bookmarked_at:
-        # Migrating from v0.7.2 (has added/updated, no bookmarked_at/created_at/modified_at)
+        # Migrating from v0.7.2 (has added/updated fields)
         print('Migrating Snapshot from v0.7.2 schema...')
 
-        # Debug: Check what data we're about to copy
-        cursor.execute("SELECT id, added, updated FROM core_snapshot LIMIT 3")
-        sample_data = cursor.fetchall()
-        print(f'DEBUG 0023: Sample Snapshot data before migration: {sample_data}')
-
+        # Transform added→bookmarked_at/created_at and updated→modified_at
         cursor.execute("""
             INSERT OR IGNORE INTO core_snapshot_new (
-                id, url, timestamp, title, bookmarked_at, created_at, modified_at
+                id, url, timestamp, title,
+                bookmarked_at, created_at, modified_at,
+                status
             )
             SELECT
                 id, url, timestamp, title,
                 COALESCE(added, CURRENT_TIMESTAMP) as bookmarked_at,
                 COALESCE(added, CURRENT_TIMESTAMP) as created_at,
-                COALESCE(updated, added, CURRENT_TIMESTAMP) as modified_at
+                COALESCE(updated, added, CURRENT_TIMESTAMP) as modified_at,
+                'queued' as status
             FROM core_snapshot;
         """)
-
-        # Debug: Check what was inserted
-        cursor.execute("SELECT id, bookmarked_at, modified_at FROM core_snapshot_new LIMIT 3")
-        inserted_data = cursor.fetchall()
-        print(f'DEBUG 0023: Sample Snapshot data after INSERT: {inserted_data}')
     elif has_bookmarked_at and not has_added:
         # Migrating from v0.8.6rc0 (already has bookmarked_at/created_at/modified_at)
         print('Migrating Snapshot from v0.8.6rc0 schema...')
@@ -308,14 +303,29 @@ class Migration(migrations.Migration):
             ),
         ],
         state_operations=[
-            # NOTE: We do NOT remove extractor/output here for ArchiveResult!
+            # NOTE: We do NOT remove extractor/output for ArchiveResult!
             # They are still in the database and will be removed by migration 0025
-            # after copying their data to the new field names (plugin, output_str).
+            # after copying their data to plugin/output_str.
 
-            # However, for Snapshot, we DO remove added/updated here because
-            # the database operations above already renamed them to bookmarked_at/created_at/modified_at.
+            # However, for Snapshot, we DO remove added/updated and ADD the new timestamp fields
+            # because the SQL above already transformed them.
             migrations.RemoveField(model_name='snapshot', name='added'),
             migrations.RemoveField(model_name='snapshot', name='updated'),
+            migrations.AddField(
+                model_name='snapshot',
+                name='bookmarked_at',
+                field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
+            ),
+            migrations.AddField(
+                model_name='snapshot',
+                name='created_at',
+                field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
+            ),
+            migrations.AddField(
+                model_name='snapshot',
+                name='modified_at',
+                field=models.DateTimeField(auto_now=True),
+            ),
 
             # SnapshotTag table already exists from v0.7.2, just declare it in state
             migrations.CreateModel(
diff --git a/archivebox/core/migrations/0024_assign_default_crawl.py b/archivebox/core/migrations/0024_assign_default_crawl.py
index b6890b8c..ddd3c87b 100644
--- a/archivebox/core/migrations/0024_assign_default_crawl.py
+++ b/archivebox/core/migrations/0024_assign_default_crawl.py
@@ -103,15 +103,21 @@ class Migration(migrations.Migration):
             );
 
             INSERT INTO core_snapshot_final (
-                id, created_at, modified_at, url, timestamp, bookmarked_at,
-                crawl_id, parent_snapshot_id, title, downloaded_at, depth, fs_version,
-                config, notes, num_uses_succeeded, num_uses_failed,
+                id, url, timestamp, title,
+                bookmarked_at, created_at, modified_at,
+                crawl_id, parent_snapshot_id,
+                downloaded_at, depth, fs_version,
+                config, notes,
+                num_uses_succeeded, num_uses_failed,
                 status, retry_at, current_step
             )
             SELECT
-                id, created_at, modified_at, url, timestamp, bookmarked_at,
-                crawl_id, parent_snapshot_id, title, downloaded_at, depth, fs_version,
-                COALESCE(config, '{}'), COALESCE(notes, ''), num_uses_succeeded, num_uses_failed,
+                id, url, timestamp, title,
+                bookmarked_at, created_at, modified_at,
+                crawl_id, parent_snapshot_id,
+                downloaded_at, depth, fs_version,
+                COALESCE(config, '{}'), COALESCE(notes, ''),
+                num_uses_succeeded, num_uses_failed,
                 status, retry_at, current_step
             FROM core_snapshot;
diff --git a/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py b/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py
index 1a68ab06..676639c7 100644
--- a/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py
+++ b/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py
@@ -9,23 +9,16 @@ from django.db import migrations, models, connection
 
 
 def copy_old_fields_to_new(apps, schema_editor):
-    """Copy data from old field names to new field names before AddField operations."""
+    """Copy data from old field names to new field names after AddField operations."""
     cursor = connection.cursor()
 
     # Check if old fields still exist
     cursor.execute("PRAGMA table_info(core_archiveresult)")
     cols = {row[1] for row in cursor.fetchall()}
-    print(f'DEBUG 0025: ArchiveResult columns: {sorted(cols)}')
 
     if 'extractor' in cols and 'plugin' in cols:
         # Copy extractor -> plugin
-        print('DEBUG 0025: Copying extractor -> plugin')
         cursor.execute("UPDATE core_archiveresult SET plugin = COALESCE(extractor, '') WHERE plugin = '' OR plugin IS NULL")
-        cursor.execute("SELECT COUNT(*) FROM core_archiveresult WHERE plugin != ''")
-        count = cursor.fetchone()[0]
-        print(f'DEBUG 0025: Updated {count} rows with plugin data')
-    else:
-        print(f'DEBUG 0025: NOT copying - extractor in cols: {"extractor" in cols}, plugin in cols: {"plugin" in cols}')
 
     if 'output' in cols and 'output_str' in cols:
         # Copy output -> output_str
@@ -38,16 +31,13 @@ def copy_old_fields_to_new(apps, schema_editor):
     if 'end_ts' in cols and 'modified_at' in cols:
         cursor.execute("UPDATE core_archiveresult SET modified_at = COALESCE(end_ts, start_ts, CURRENT_TIMESTAMP) WHERE modified_at IS NULL OR modified_at = ''")
 
-    # Same for Snapshot table
-    cursor.execute("PRAGMA table_info(core_snapshot)")
-    snap_cols = {row[1] for row in cursor.fetchall()}
+    # NOTE: Snapshot timestamps (added→bookmarked_at, updated→modified_at) were already
+    # transformed by migration 0023, so we don't need to copy them here.
 
-    if 'added' in snap_cols and 'bookmarked_at' in snap_cols:
-        cursor.execute("UPDATE core_snapshot SET bookmarked_at = COALESCE(added, CURRENT_TIMESTAMP) WHERE bookmarked_at IS NULL OR bookmarked_at = ''")
-        cursor.execute("UPDATE core_snapshot SET created_at = COALESCE(added, CURRENT_TIMESTAMP) WHERE created_at IS NULL OR created_at = ''")
-
-    if 'updated' in snap_cols and 'modified_at' in snap_cols:
-        cursor.execute("UPDATE core_snapshot SET modified_at = COALESCE(updated, added, CURRENT_TIMESTAMP) WHERE modified_at IS NULL OR modified_at = ''")
+    # Debug: Check Snapshot timestamps at end of RunPython
+    cursor.execute("SELECT id, bookmarked_at, modified_at FROM core_snapshot LIMIT 2")
+    snap_after = cursor.fetchall()
+    print(f'DEBUG 0025: Snapshot timestamps at END of RunPython: {snap_after}')
 
 
 class Migration(migrations.Migration):
@@ -149,21 +139,12 @@ class Migration(migrations.Migration):
             name='retry_at',
             field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
         ),
-        migrations.AddField(
-            model_name='snapshot',
-            name='bookmarked_at',
-            field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
-        ),
+        # NOTE: bookmarked_at and created_at already added by migration 0023
         migrations.AddField(
            model_name='snapshot',
            name='config',
            field=models.JSONField(default=dict),
        ),
-        migrations.AddField(
-            model_name='snapshot',
-            name='created_at',
-            field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
-        ),
         migrations.AddField(
             model_name='snapshot',
             name='current_step',
@@ -184,11 +165,7 @@ class Migration(migrations.Migration):
             name='fs_version',
             field=models.CharField(default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', max_length=10),
         ),
-        migrations.AddField(
-            model_name='snapshot',
-            name='modified_at',
-            field=models.DateTimeField(auto_now=True),
-        ),
+        # NOTE: modified_at already added by migration 0023
         migrations.AddField(
             model_name='snapshot',
             name='notes',
@@ -248,7 +225,7 @@ class Migration(migrations.Migration):
             model_name='archiveresult',
             name='output',
         ),
-        # NOTE: Snapshot's added/updated fields were already removed by migration 0023
+        # NOTE: Snapshot's added/updated were already removed by migration 0023
         migrations.AlterField(
             model_name='archiveresult',
             name='end_ts',
diff --git a/archivebox/core/migrations/0026_add_process_to_archiveresult.py b/archivebox/core/migrations/0026_add_process_to_archiveresult.py
new file mode 100644
index 00000000..eef7b265
--- /dev/null
+++ b/archivebox/core/migrations/0026_add_process_to_archiveresult.py
@@ -0,0 +1,28 @@
+# Generated by Django 6.0 on 2026-01-01 23:28
+
+import django.db.models.deletion
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0025_alter_archiveresult_options_alter_snapshot_options_and_more'),
+        ('machine', '0003_add_process_type_and_parent'),
+    ]
+
+    operations = [
+        migrations.RemoveField(
+            model_name='archiveresult',
+            name='num_uses_failed',
+        ),
+        migrations.RemoveField(
+            model_name='archiveresult',
+            name='num_uses_succeeded',
+        ),
+        migrations.AddField(
+            model_name='archiveresult',
+            name='process',
+            field=models.OneToOneField(blank=True, help_text='Process execution details for this archive result', null=True, on_delete=django.db.models.deletion.PROTECT, related_name='archiveresult', to='machine.process'),
+        ),
+    ]
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 85d5cee0..c3731354 100755
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -2285,13 +2285,14 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
 
     # Process FK - tracks execution details (cmd, pwd, stdout, stderr, etc.)
     # Added POST-v0.9.0, will be added in a separate migration
-    # process = models.OneToOneField(
-    #     'machine.Process',
-    #     on_delete=models.PROTECT,
-    #     null=False,
-    #     related_name='archiveresult',
-    #     help_text='Process execution details for this archive result'
-    # )
+    process = models.OneToOneField(
+        'machine.Process',
+        on_delete=models.PROTECT,
+        null=True,
+        blank=True,
+        related_name='archiveresult',
+        help_text='Process execution details for this archive result'
+    )
 
     # New output fields (replacing old 'output' field)
     output_str = models.TextField(blank=True, default='', help_text='Human-readable output summary')
diff --git a/archivebox/crawls/admin.py b/archivebox/crawls/admin.py
index 016559a7..da08b0ac 100644
--- a/archivebox/crawls/admin.py
+++ b/archivebox/crawls/admin.py
@@ -154,7 +154,7 @@ class CrawlAdminForm(forms.ModelForm):
 class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
     form = CrawlAdminForm
 
-    list_display = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'urls_preview', 'schedule_str', 'status', 'retry_at', 'num_snapshots')
+    list_display = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'urls_preview', 'schedule_str', 'status', 'retry_at', 'health_display', 'num_snapshots')
     sort_fields = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'schedule_str', 'status', 'retry_at')
     search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'schedule_id', 'status', 'urls')
@@ -270,6 +270,12 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
         first_url = obj.get_urls_list()[0] if obj.get_urls_list() else ''
         return first_url[:80] + '...' if len(first_url) > 80 else first_url
 
+    @admin.display(description='Health', ordering='health')
+    def health_display(self, obj):
+        h = obj.health
+        color = 'green' if h >= 80 else 'orange' if h >= 50 else 'red'
+        return format_html('{}', color, h)
+
     @admin.display(description='URLs')
     def urls_editor(self, obj):
         """Editor for crawl URLs."""
diff --git a/archivebox/crawls/migrations/0003_remove_crawlschedule_num_uses_failed_and_more.py b/archivebox/crawls/migrations/0003_remove_crawlschedule_num_uses_failed_and_more.py
new file mode 100644
index 00000000..e3740a3b
--- /dev/null
+++ b/archivebox/crawls/migrations/0003_remove_crawlschedule_num_uses_failed_and_more.py
@@ -0,0 +1,21 @@
+# Generated by Django 6.0 on 2026-01-01 23:36
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('crawls', '0002_upgrade_from_0_8_6'),
+    ]
+
+    operations = [
+        migrations.RemoveField(
+            model_name='crawlschedule',
+            name='num_uses_failed',
+        ),
+        migrations.RemoveField(
+            model_name='crawlschedule',
+            name='num_uses_succeeded',
+        ),
+    ]
diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py
index e39526b5..dd849d2a 100755
--- a/archivebox/crawls/models.py
+++ b/archivebox/crawls/models.py
@@ -519,12 +519,14 @@ class CrawlMachine(BaseStateMachine, strict_states=True):
     def is_finished(self) -> bool:
         from archivebox.core.models import Snapshot
 
-        # check that at least one snapshot exists for this crawl
+        # Check if any snapshots exist for this crawl
         snapshots = Snapshot.objects.filter(crawl=self.crawl)
-        if not snapshots.exists():
-            return False
 
-        # check if all snapshots are sealed
+        # If no snapshots exist, allow finishing (e.g., archivebox://install crawls that only run hooks)
+        if not snapshots.exists():
+            return True
+
+        # If snapshots exist, check if all are sealed
         # Snapshots handle their own background hooks via the step system,
         # so we just need to wait for all snapshots to reach sealed state
         if snapshots.filter(status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]).exists():
diff --git a/archivebox/machine/admin.py b/archivebox/machine/admin.py
index 13834ced..27bdf060 100644
--- a/archivebox/machine/admin.py
+++ b/archivebox/machine/admin.py
@@ -8,7 +8,7 @@ from archivebox.machine.models import Machine, NetworkInterface, Binary, Process
 
 
 class MachineAdmin(ConfigEditorMixin, BaseModelAdmin):
-    list_display = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid', 'health')
+    list_display = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid', 'health_display')
     sort_fields = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid')
     readonly_fields = ('guid', 'created_at', 'modified_at', 'ips')
@@ -52,9 +52,15 @@ class MachineAdmin(ConfigEditorMixin, BaseModelAdmin):
             machine.id,
             ', '.join(machine.networkinterface_set.values_list('ip_public', flat=True)),
         )
 
+    @admin.display(description='Health', ordering='health')
+    def health_display(self, obj):
+        h = obj.health
+        color = 'green' if h >= 80 else 'orange' if h >= 50 else 'red'
+        return format_html('{}', color, h)
+
 
 class NetworkInterfaceAdmin(BaseModelAdmin):
-    list_display = ('id', 'created_at', 'machine_info', 'ip_public', 'dns_server', 'isp', 'country', 'region', 'city', 'iface', 'ip_local', 'mac_address', 'health')
+    list_display = ('id', 'created_at', 'machine_info', 'ip_public', 'dns_server', 'isp', 'country', 'region', 'city', 'iface', 'ip_local', 'mac_address', 'health_display')
     sort_fields = ('id', 'created_at', 'machine_info', 'ip_public', 'dns_server', 'isp', 'country', 'region', 'city', 'iface', 'ip_local', 'mac_address')
     search_fields = ('id', 'machine__id', 'iface', 'ip_public', 'ip_local', 'mac_address', 'dns_server', 'hostname', 'isp', 'city', 'region', 'country')
@@ -95,9 +101,15 @@ class NetworkInterfaceAdmin(BaseModelAdmin):
             iface.machine.id,
             str(iface.machine.id)[:8],
             iface.machine.hostname,
         )
 
+    @admin.display(description='Health', ordering='health')
+    def health_display(self, obj):
+        h = obj.health
+        color = 'green' if h >= 80 else 'orange' if h >= 50 else 'red'
+        return format_html('{}', color, h)
+
 
 class BinaryAdmin(BaseModelAdmin):
-    list_display = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'status', 'health')
+    list_display = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'status', 'health_display')
     sort_fields = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'status')
     search_fields = ('id', 'machine__id', 'name', 'binprovider', 'version', 'abspath', 'sha256')
@@ -142,6 +154,12 @@ class BinaryAdmin(BaseModelAdmin):
             binary.machine.id,
             str(binary.machine.id)[:8],
             binary.machine.hostname,
         )
 
+    @admin.display(description='Health', ordering='health')
+    def health_display(self, obj):
+        h = obj.health
+        color = 'green' if h >= 80 else 'orange' if h >= 50 else 'red'
+        return format_html('{}', color, h)
+
 
 class ProcessAdmin(BaseModelAdmin):
     list_display = ('id', 'created_at', 'machine_info', 'archiveresult_link', 'cmd_str', 'status', 'exit_code', 'pid', 'binary_info')
diff --git a/archivebox/machine/migrations/0003_add_process_type_and_parent.py b/archivebox/machine/migrations/0003_add_process_type_and_parent.py
new file mode 100644
index 00000000..ae97725c
--- /dev/null
+++ b/archivebox/machine/migrations/0003_add_process_type_and_parent.py
@@ -0,0 +1,24 @@
+# Generated by Django 6.0 on 2026-01-01 22:55
+
+import django.db.models.deletion
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('machine', '0002_process'),
+    ]
+
+    operations = [
+        migrations.AddField(
+            model_name='process',
+            name='parent',
+            field=models.ForeignKey(blank=True, help_text='Parent process that spawned this process', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='children', to='machine.process'),
+        ),
+        migrations.AddField(
+            model_name='process',
+            name='process_type',
+            field=models.CharField(choices=[('supervisord', 'Supervisord'), ('orchestrator', 'Orchestrator'), ('worker', 'Worker'), ('cli', 'CLI'), ('binary', 'Binary')], db_index=True, default='cli', help_text='Type of process (cli, worker, orchestrator, binary, supervisord)', max_length=16),
+        ),
+    ]
diff --git a/archivebox/machine/models.py b/archivebox/machine/models.py
index 814b5c1a..7c1068b9 100755
--- a/archivebox/machine/models.py
+++ b/archivebox/machine/models.py
@@ -153,8 +153,8 @@ class NetworkInterface(ModelWithHealthStats):
     city = models.CharField(max_length=63, default=None, null=False)
     region = models.CharField(max_length=63, default=None, null=False)
     country = models.CharField(max_length=63, default=None, null=False)
-    num_uses_failed = models.PositiveIntegerField(default=0)
-    num_uses_succeeded = models.PositiveIntegerField(default=0)
+    # num_uses_failed = models.PositiveIntegerField(default=0) # from ModelWithHealthStats
+    # num_uses_succeeded = models.PositiveIntegerField(default=0) # from ModelWithHealthStats
 
     objects: NetworkInterfaceManager = NetworkInterfaceManager()
@@ -588,6 +588,13 @@ class Process(models.Model):
         RUNNING = 'running', 'Running'
         EXITED = 'exited', 'Exited'
 
+    class TypeChoices(models.TextChoices):
+        SUPERVISORD = 'supervisord', 'Supervisord'
+        ORCHESTRATOR = 'orchestrator', 'Orchestrator'
+        WORKER = 'worker', 'Worker'
+        CLI = 'cli', 'CLI'
+        BINARY = 'binary', 'Binary'
+
     # Primary fields
     id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
     created_at = models.DateTimeField(default=timezone.now, db_index=True)
@@ -602,6 +609,24 @@ class Process(models.Model):
         help_text='Machine where this process executed'
     )
 
+    # Parent process (optional)
+    parent = models.ForeignKey(
+        'self',
+        on_delete=models.SET_NULL,
+        null=True, blank=True,
+        related_name='children',
+        help_text='Parent process that spawned this process'
+    )
+
+    # Process type (cli, worker, orchestrator, binary, supervisord)
+    process_type = models.CharField(
+        max_length=16,
+        choices=TypeChoices.choices,
+        default=TypeChoices.CLI,
+        db_index=True,
+        help_text='Type of process (cli, worker, orchestrator, binary, supervisord)'
+    )
+
     # Execution metadata
     pwd = models.CharField(max_length=512, default='', null=False, blank=True, help_text='Working directory for process execution')
diff --git a/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js b/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js
index e25136e0..4a99028a 100755
--- a/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js
+++ b/archivebox/plugins/accessibility/on_Snapshot__39_accessibility.js
@@ -8,7 +8,7 @@
  * - Accessibility snapshot
  * - ARIA labels and roles
  *
- * Usage: on_Snapshot__18_accessibility.js --url= --snapshot-id=
+ * Usage: on_Snapshot__39_accessibility.js --url= --snapshot-id=
  * Output: Writes accessibility/accessibility.json
  *
  * Environment variables:
@@ -203,7 +203,7 @@ async function main() {
   const snapshotId = args.snapshot_id;
 
   if (!url || !snapshotId) {
-    console.error('Usage: on_Snapshot__18_accessibility.js --url= --snapshot-id=');
+    console.error('Usage: on_Snapshot__39_accessibility.js --url= --snapshot-id=');
     process.exit(1);
   }
 
diff --git a/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js b/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js
index 0799f3ad..f4d659e1 100644
--- a/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js
+++ b/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js
@@ -8,7 +8,7 @@
 * NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
 * --load-extension and --disable-extensions-except flags.
 *
- * Usage: on_Crawl__30_chrome_launch.bg.js --crawl-id= --source-url=
+ * Usage: on_Crawl__20_chrome_launch.bg.js --crawl-id= --source-url=
 * Output: Writes to current directory (executor creates chrome/ dir):
 *   - cdp_url.txt: WebSocket URL for CDP connection
 *   - chrome.pid: Chromium process ID (for cleanup)
diff --git a/archivebox/plugins/chrome/on_Snapshot__20_chrome_tab.bg.js b/archivebox/plugins/chrome/on_Snapshot__20_chrome_tab.bg.js
index 592381cf..db953ef0 100755
--- a/archivebox/plugins/chrome/on_Snapshot__20_chrome_tab.bg.js
+++ b/archivebox/plugins/chrome/on_Snapshot__20_chrome_tab.bg.js
@@ -2,7 +2,7 @@
 /**
  * Create a Chrome tab for this snapshot in the shared crawl Chrome session.
  *
- * If a crawl-level Chrome session exists (from on_Crawl__30_chrome_launch.bg.js),
+ * If a crawl-level Chrome session exists (from on_Crawl__20_chrome_launch.bg.js),
  * this connects to it and creates a new tab. Otherwise, falls back to launching
 * its own Chrome instance.
 *
diff --git a/archivebox/plugins/chrome/tests/chrome_test_helpers.py b/archivebox/plugins/chrome/tests/chrome_test_helpers.py
index 17c27ff2..89301f5f 100644
--- a/archivebox/plugins/chrome/tests/chrome_test_helpers.py
+++ b/archivebox/plugins/chrome/tests/chrome_test_helpers.py
@@ -73,8 +73,8 @@ CHROME_PLUGIN_DIR = Path(__file__).parent.parent
 PLUGINS_ROOT = CHROME_PLUGIN_DIR.parent
 
 # Hook script locations
-CHROME_INSTALL_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__00_install_puppeteer_chromium.py'
-CHROME_LAUNCH_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__30_chrome_launch.bg.js'
+CHROME_INSTALL_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__01_chrome_install.py'
+CHROME_LAUNCH_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__20_chrome_launch.bg.js'
 CHROME_TAB_HOOK = CHROME_PLUGIN_DIR / 'on_Snapshot__20_chrome_tab.bg.js'
 CHROME_NAVIGATE_HOOK = next(CHROME_PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None)
 CHROME_UTILS = CHROME_PLUGIN_DIR / 'chrome_utils.js'
diff --git a/archivebox/plugins/dom/on_Snapshot__53_dom.js b/archivebox/plugins/dom/on_Snapshot__53_dom.js
index 56d8ccc2..cc35645e 100644
--- a/archivebox/plugins/dom/on_Snapshot__53_dom.js
+++ b/archivebox/plugins/dom/on_Snapshot__53_dom.js
@@ -5,7 +5,7 @@
 * If a Chrome session exists (from chrome plugin), connects to it via CDP.
 * Otherwise launches a new Chrome instance.
 *
- * Usage: on_Snapshot__23_dom.js --url= --snapshot-id=
+ * Usage: on_Snapshot__53_dom.js --url= --snapshot-id=
 * Output: Writes dom/output.html
 *
 * Environment variables:
@@ -175,7 +175,7 @@ async function main() {
   const snapshotId = args.snapshot_id;
 
   if (!url || !snapshotId) {
-    console.error('Usage: on_Snapshot__23_dom.js --url= --snapshot-id=');
+    console.error('Usage: on_Snapshot__53_dom.js --url= --snapshot-id=');
     process.exit(1);
   }
 
diff --git a/archivebox/plugins/headers/on_Snapshot__55_headers.js b/archivebox/plugins/headers/on_Snapshot__55_headers.js
index 533beeeb..098b95e7 100644
--- a/archivebox/plugins/headers/on_Snapshot__55_headers.js
+++ b/archivebox/plugins/headers/on_Snapshot__55_headers.js
@@ -6,7 +6,7 @@
 * response headers from chrome plugin/response_headers.json.
 * Otherwise falls back to making an HTTP HEAD request.
 *
- * Usage: on_Snapshot__12_headers.js --url= --snapshot-id=
+ * Usage: on_Snapshot__55_headers.js --url= --snapshot-id=
 * Output: Writes headers/headers.json
 *
 * Environment variables:
@@ -116,7 +116,7 @@ async function main() {
   const snapshotId = args.snapshot_id;
 
   if (!url || !snapshotId) {
-    console.error('Usage: on_Snapshot__12_headers.js --url= --snapshot-id=');
+    console.error('Usage: on_Snapshot__55_headers.js --url= --snapshot-id=');
     process.exit(1);
   }
 
diff --git a/archivebox/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js b/archivebox/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js
index 9eb86c26..d3eafb0b 100755
--- a/archivebox/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js
+++ b/archivebox/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js
@@ -11,7 +11,7 @@
 *   - iframes: