make Claude.md stricter and improve migration tests

This commit is contained in:
Nick Sweeting
2025-12-27 00:33:51 -08:00
parent 35dd9acafe
commit cffbef84ed
3 changed files with 63 additions and 50 deletions

View File

@@ -3,10 +3,10 @@
## Quick Start ## Quick Start
```bash ```bash
# Set up dev environment # Set up dev environment (always use uv, never pip directly)
uv sync --dev uv sync --dev --all-extras
# Run tests as non-root user (required - ArchiveBox refuses to run as root) # Run tests as non-root user (required - ArchiveBox always refuses to run as root)
sudo -u testuser bash -c 'source .venv/bin/activate && python -m pytest archivebox/tests/ -v' sudo -u testuser bash -c 'source .venv/bin/activate && python -m pytest archivebox/tests/ -v'
``` ```
@@ -19,7 +19,7 @@ sudo -u testuser bash -c 'source .venv/bin/activate && python -m pytest archiveb
### Install Dependencies ### Install Dependencies
```bash ```bash
uv sync --dev uv sync --dev --all-extras # Always use uv, never pip directly
``` ```
### Activate Virtual Environment ### Activate Virtual Environment
@@ -30,7 +30,7 @@ source .venv/bin/activate
## Running Tests ## Running Tests
### CRITICAL: Never Run as Root ### CRITICAL: Never Run as Root
ArchiveBox has a root check that prevents running as root user. Always run tests as a non-root user: ArchiveBox has a root check that prevents running as root user. All ArchiveBox commands (including tests) must run as a non-root user inside a data directory:
```bash ```bash
# Run all migration tests # Run all migration tests
@@ -62,8 +62,10 @@ Tests must exercise real code paths:
- Run actual `python -m archivebox` commands via subprocess - Run actual `python -m archivebox` commands via subprocess
- Query SQLite directly to verify results - Query SQLite directly to verify results
**If something is hard to test**: Modify the implementation to make it easier to test, or fix the underlying issue. Never mock, skip, simulate, or exit early from a test because you can't get something working inside the test.
### NO SKIPS ### NO SKIPS
Never use `@skip`, `skipTest`, or `pytest.mark.skip`. Every test must run. Never use `@skip`, `skipTest`, or `pytest.mark.skip`. Every test must run. If a test is difficult, fix the code or test environment - don't disable the test.
### Strict Assertions ### Strict Assertions
- `init` command must return exit code 0 (not `[0, 1]`) - `init` command must return exit code 0 (not `[0, 1]`)
@@ -115,7 +117,7 @@ chmod 644 archivebox/tests/test_*.py
``` ```
### 2. DATA_DIR Environment Variable ### 2. DATA_DIR Environment Variable
Tests use temp directories. The `run_archivebox()` helper sets `DATA_DIR` automatically. ArchiveBox commands must run inside a data directory. Tests use temp directories - the `run_archivebox()` helper sets `DATA_DIR` automatically.
### 3. Extractors Disabled for Speed ### 3. Extractors Disabled for Speed
Tests disable all extractors via environment variables for faster execution: Tests disable all extractors via environment variables for faster execution:

View File

@@ -12,6 +12,7 @@ Migration tests from 0.8.x to 0.9.x.
import shutil import shutil
import sqlite3 import sqlite3
import subprocess
import tempfile import tempfile
import unittest import unittest
from pathlib import Path from pathlib import Path
@@ -440,28 +441,34 @@ class TestFilesystemMigration08to09(unittest.TestCase):
result = run_archivebox(self.work_dir, ['init'], timeout=45) result = run_archivebox(self.work_dir, ['init'], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}") self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
# Step 2: Archive example.com with some extractors enabled # Step 2: Archive example.com with ALL extractors enabled
# Enable a subset of fast extractors for testing # This ensures we test migration with all file types
try:
result = run_archivebox( result = run_archivebox(
self.work_dir, self.work_dir,
['add', '--depth=0', 'https://example.com'], ['add', '--depth=0', 'https://example.com'],
timeout=120, timeout=300, # 5 minutes for all extractors
env={ env={
'SAVE_TITLE': 'True', 'SAVE_TITLE': 'True',
'SAVE_FAVICON': 'True', 'SAVE_FAVICON': 'True',
'SAVE_WGET': 'True', 'SAVE_WGET': 'True',
'SAVE_SCREENSHOT': 'False', # Disable slow extractors 'SAVE_SCREENSHOT': 'True',
'SAVE_DOM': 'False', 'SAVE_DOM': 'True',
'SAVE_SINGLEFILE': 'False', 'SAVE_SINGLEFILE': 'True',
'SAVE_READABILITY': 'False', 'SAVE_READABILITY': 'True',
'SAVE_MERCURY': 'False', 'SAVE_MERCURY': 'True',
'SAVE_PDF': 'False', 'SAVE_PDF': 'True',
'SAVE_MEDIA': 'False', 'SAVE_MEDIA': 'True',
'SAVE_ARCHIVE_DOT_ORG': 'False', 'SAVE_ARCHIVE_DOT_ORG': 'True',
'SAVE_HEADERS': 'True',
'SAVE_HTMLTOTEXT': 'True',
'SAVE_GIT': 'True',
} }
) )
# Note: Add may fail if network is down or extractors fail, but we still want to test except subprocess.TimeoutExpired as e:
# the filesystem migration logic even with partial failures # If timeout, still continue - we want to test with whatever files were created
print(f"\n[!] Add command timed out after {e.timeout}s, continuing with partial results...")
# Note: Snapshot may still have been created even if command timed out
# Step 3: Get the snapshot and verify files were created # Step 3: Get the snapshot and verify files were created
conn = sqlite3.connect(str(self.db_path)) conn = sqlite3.connect(str(self.db_path))

View File

@@ -986,27 +986,31 @@ def seed_0_8_data(db_path: Path) -> Dict[str, List[Dict]]:
# Helper Functions # Helper Functions
# ============================================================================= # =============================================================================
def run_archivebox(data_dir: Path, args: list, timeout: int = 60) -> subprocess.CompletedProcess: def run_archivebox(data_dir: Path, args: list, timeout: int = 60, env: dict = None) -> subprocess.CompletedProcess:
"""Run archivebox command in subprocess with given data directory.""" """Run archivebox command in subprocess with given data directory."""
env = os.environ.copy() base_env = os.environ.copy()
env['DATA_DIR'] = str(data_dir) base_env['DATA_DIR'] = str(data_dir)
env['USE_COLOR'] = 'False' base_env['USE_COLOR'] = 'False'
env['SHOW_PROGRESS'] = 'False' base_env['SHOW_PROGRESS'] = 'False'
# Disable ALL extractors for faster tests # Disable ALL extractors for faster tests (can be overridden by env parameter)
env['SAVE_ARCHIVE_DOT_ORG'] = 'False' base_env['SAVE_ARCHIVE_DOT_ORG'] = 'False'
env['SAVE_TITLE'] = 'False' base_env['SAVE_TITLE'] = 'False'
env['SAVE_FAVICON'] = 'False' base_env['SAVE_FAVICON'] = 'False'
env['SAVE_WGET'] = 'False' base_env['SAVE_WGET'] = 'False'
env['SAVE_SINGLEFILE'] = 'False' base_env['SAVE_SINGLEFILE'] = 'False'
env['SAVE_SCREENSHOT'] = 'False' base_env['SAVE_SCREENSHOT'] = 'False'
env['SAVE_PDF'] = 'False' base_env['SAVE_PDF'] = 'False'
env['SAVE_DOM'] = 'False' base_env['SAVE_DOM'] = 'False'
env['SAVE_READABILITY'] = 'False' base_env['SAVE_READABILITY'] = 'False'
env['SAVE_MERCURY'] = 'False' base_env['SAVE_MERCURY'] = 'False'
env['SAVE_GIT'] = 'False' base_env['SAVE_GIT'] = 'False'
env['SAVE_MEDIA'] = 'False' base_env['SAVE_MEDIA'] = 'False'
env['SAVE_HEADERS'] = 'False' base_env['SAVE_HEADERS'] = 'False'
env['SAVE_HTMLTOTEXT'] = 'False' base_env['SAVE_HTMLTOTEXT'] = 'False'
# Override with any custom env vars
if env:
base_env.update(env)
cmd = [sys.executable, '-m', 'archivebox'] + args cmd = [sys.executable, '-m', 'archivebox'] + args
@@ -1014,7 +1018,7 @@ def run_archivebox(data_dir: Path, args: list, timeout: int = 60) -> subprocess.
cmd, cmd,
capture_output=True, capture_output=True,
text=True, text=True,
env=env, env=base_env,
cwd=str(data_dir), cwd=str(data_dir),
timeout=timeout, timeout=timeout,
) )