#!/usr/bin/env python3
"""
Helper functions and schema definitions for migration tests.
This module provides:
- Schema definitions for each major ArchiveBox version (0.4.x, 0.7.x, 0.8.x)
- Data seeding functions to populate test databases
- Helper functions to run archivebox commands and verify results
"""
import os
import sys
import json
import sqlite3
import subprocess
from pathlib import Path
from datetime import datetime, timezone
from typing import Dict, List, Optional, Tuple
from uuid import uuid4
# =============================================================================
# Schema Definitions for Each Version
# =============================================================================
SCHEMA_0_4 = """
-- Django system tables (minimal)
CREATE TABLE IF NOT EXISTS django_migrations (
id INTEGER PRIMARY KEY AUTOINCREMENT,
app VARCHAR(255) NOT NULL,
name VARCHAR(255) NOT NULL,
applied DATETIME NOT NULL
);
-- Core tables for 0.4.x
CREATE TABLE IF NOT EXISTS core_snapshot (
id CHAR(32) PRIMARY KEY,
url VARCHAR(2000) NOT NULL UNIQUE,
timestamp VARCHAR(32) NOT NULL UNIQUE,
title VARCHAR(128),
tags VARCHAR(256),
added DATETIME NOT NULL,
updated DATETIME
);
CREATE INDEX IF NOT EXISTS core_snapshot_url ON core_snapshot(url);
CREATE INDEX IF NOT EXISTS core_snapshot_timestamp ON core_snapshot(timestamp);
CREATE INDEX IF NOT EXISTS core_snapshot_added ON core_snapshot(added);
"""
SCHEMA_0_7 = """
-- Django system tables (complete for 0.7.x)
CREATE TABLE IF NOT EXISTS django_migrations (
id INTEGER PRIMARY KEY AUTOINCREMENT,
app VARCHAR(255) NOT NULL,
name VARCHAR(255) NOT NULL,
applied DATETIME NOT NULL
);
CREATE TABLE IF NOT EXISTS django_content_type (
id INTEGER PRIMARY KEY AUTOINCREMENT,
app_label VARCHAR(100) NOT NULL,
model VARCHAR(100) NOT NULL,
UNIQUE(app_label, model)
);
CREATE TABLE IF NOT EXISTS auth_permission (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name VARCHAR(255) NOT NULL,
content_type_id INTEGER NOT NULL REFERENCES django_content_type(id),
codename VARCHAR(100) NOT NULL,
UNIQUE(content_type_id, codename)
);
CREATE TABLE IF NOT EXISTS auth_group (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name VARCHAR(150) NOT NULL UNIQUE
);
CREATE TABLE IF NOT EXISTS auth_group_permissions (
id INTEGER PRIMARY KEY AUTOINCREMENT,
group_id INTEGER NOT NULL REFERENCES auth_group(id),
permission_id INTEGER NOT NULL REFERENCES auth_permission(id),
UNIQUE(group_id, permission_id)
);
CREATE TABLE IF NOT EXISTS auth_user (
id INTEGER PRIMARY KEY AUTOINCREMENT,
password VARCHAR(128) NOT NULL,
last_login DATETIME,
is_superuser BOOL NOT NULL,
username VARCHAR(150) NOT NULL UNIQUE,
first_name VARCHAR(150) NOT NULL,
last_name VARCHAR(150) NOT NULL,
email VARCHAR(254) NOT NULL,
is_staff BOOL NOT NULL,
is_active BOOL NOT NULL,
date_joined DATETIME NOT NULL
);
CREATE TABLE IF NOT EXISTS auth_user_groups (
id INTEGER PRIMARY KEY AUTOINCREMENT,
user_id INTEGER NOT NULL REFERENCES auth_user(id),
group_id INTEGER NOT NULL REFERENCES auth_group(id),
UNIQUE(user_id, group_id)
);
CREATE TABLE IF NOT EXISTS auth_user_user_permissions (
id INTEGER PRIMARY KEY AUTOINCREMENT,
user_id INTEGER NOT NULL REFERENCES auth_user(id),
permission_id INTEGER NOT NULL REFERENCES auth_permission(id),
UNIQUE(user_id, permission_id)
);
CREATE TABLE IF NOT EXISTS django_admin_log (
id INTEGER PRIMARY KEY AUTOINCREMENT,
action_time DATETIME NOT NULL,
object_id TEXT,
object_repr VARCHAR(200) NOT NULL,
action_flag SMALLINT UNSIGNED NOT NULL,
change_message TEXT NOT NULL,
content_type_id INTEGER REFERENCES django_content_type(id),
user_id INTEGER NOT NULL REFERENCES auth_user(id)
);
CREATE TABLE IF NOT EXISTS django_session (
session_key VARCHAR(40) NOT NULL PRIMARY KEY,
session_data TEXT NOT NULL,
expire_date DATETIME NOT NULL
);
-- Core tables for 0.7.x
CREATE TABLE IF NOT EXISTS core_tag (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name VARCHAR(100) NOT NULL UNIQUE,
slug VARCHAR(100) NOT NULL UNIQUE
);
CREATE TABLE IF NOT EXISTS core_snapshot (
id CHAR(32) PRIMARY KEY,
url VARCHAR(2000) NOT NULL UNIQUE,
timestamp VARCHAR(32) NOT NULL UNIQUE,
title VARCHAR(512),
added DATETIME NOT NULL,
updated DATETIME
);
CREATE INDEX IF NOT EXISTS core_snapshot_url ON core_snapshot(url);
CREATE INDEX IF NOT EXISTS core_snapshot_timestamp ON core_snapshot(timestamp);
CREATE INDEX IF NOT EXISTS core_snapshot_added ON core_snapshot(added);
-- Many-to-many for snapshot tags
CREATE TABLE IF NOT EXISTS core_snapshot_tags (
id INTEGER PRIMARY KEY AUTOINCREMENT,
snapshot_id CHAR(32) NOT NULL REFERENCES core_snapshot(id),
tag_id INTEGER NOT NULL REFERENCES core_tag(id),
UNIQUE(snapshot_id, tag_id)
);
CREATE TABLE IF NOT EXISTS core_archiveresult (
id INTEGER PRIMARY KEY AUTOINCREMENT,
snapshot_id CHAR(32) NOT NULL REFERENCES core_snapshot(id),
extractor VARCHAR(32) NOT NULL,
cmd TEXT,
pwd VARCHAR(256),
cmd_version VARCHAR(128),
output VARCHAR(1024),
start_ts DATETIME,
end_ts DATETIME,
status VARCHAR(16) NOT NULL
);
CREATE INDEX IF NOT EXISTS core_archiveresult_snapshot ON core_archiveresult(snapshot_id);
CREATE INDEX IF NOT EXISTS core_archiveresult_extractor ON core_archiveresult(extractor);
-- Insert required content types
INSERT INTO django_content_type (app_label, model) VALUES
('contenttypes', 'contenttype'),
('auth', 'permission'),
('auth', 'group'),
('auth', 'user'),
('admin', 'logentry'),
('sessions', 'session'),
('core', 'snapshot'),
('core', 'archiveresult'),
('core', 'tag');
"""
SCHEMA_0_8 = """
-- Django system tables (complete for 0.8.x)
CREATE TABLE IF NOT EXISTS django_migrations (
id INTEGER PRIMARY KEY AUTOINCREMENT,
app VARCHAR(255) NOT NULL,
name VARCHAR(255) NOT NULL,
applied DATETIME NOT NULL
);
CREATE TABLE IF NOT EXISTS django_content_type (
id INTEGER PRIMARY KEY AUTOINCREMENT,
app_label VARCHAR(100) NOT NULL,
model VARCHAR(100) NOT NULL,
UNIQUE(app_label, model)
);
CREATE TABLE IF NOT EXISTS auth_permission (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name VARCHAR(255) NOT NULL,
content_type_id INTEGER NOT NULL REFERENCES django_content_type(id),
codename VARCHAR(100) NOT NULL,
UNIQUE(content_type_id, codename)
);
CREATE TABLE IF NOT EXISTS auth_group (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name VARCHAR(150) NOT NULL UNIQUE
);
CREATE TABLE IF NOT EXISTS auth_group_permissions (
id INTEGER PRIMARY KEY AUTOINCREMENT,
group_id INTEGER NOT NULL REFERENCES auth_group(id),
permission_id INTEGER NOT NULL REFERENCES auth_permission(id),
UNIQUE(group_id, permission_id)
);
CREATE TABLE IF NOT EXISTS auth_user (
id INTEGER PRIMARY KEY AUTOINCREMENT,
password VARCHAR(128) NOT NULL,
last_login DATETIME,
is_superuser BOOL NOT NULL,
username VARCHAR(150) NOT NULL UNIQUE,
first_name VARCHAR(150) NOT NULL,
last_name VARCHAR(150) NOT NULL,
email VARCHAR(254) NOT NULL,
is_staff BOOL NOT NULL,
is_active BOOL NOT NULL,
date_joined DATETIME NOT NULL
);
CREATE TABLE IF NOT EXISTS auth_user_groups (
id INTEGER PRIMARY KEY AUTOINCREMENT,
user_id INTEGER NOT NULL REFERENCES auth_user(id),
group_id INTEGER NOT NULL REFERENCES auth_group(id),
UNIQUE(user_id, group_id)
);
CREATE TABLE IF NOT EXISTS auth_user_user_permissions (
id INTEGER PRIMARY KEY AUTOINCREMENT,
user_id INTEGER NOT NULL REFERENCES auth_user(id),
permission_id INTEGER NOT NULL REFERENCES auth_permission(id),
UNIQUE(user_id, permission_id)
);
CREATE TABLE IF NOT EXISTS django_admin_log (
id INTEGER PRIMARY KEY AUTOINCREMENT,
action_time DATETIME NOT NULL,
object_id TEXT,
object_repr VARCHAR(200) NOT NULL,
action_flag SMALLINT UNSIGNED NOT NULL,
change_message TEXT NOT NULL,
content_type_id INTEGER REFERENCES django_content_type(id),
user_id INTEGER NOT NULL REFERENCES auth_user(id)
);
CREATE TABLE IF NOT EXISTS django_session (
session_key VARCHAR(40) NOT NULL PRIMARY KEY,
session_data TEXT NOT NULL,
expire_date DATETIME NOT NULL
);
-- Machine app tables (added in 0.8.x)
CREATE TABLE IF NOT EXISTS machine_machine (
id CHAR(36) PRIMARY KEY,
created_at DATETIME NOT NULL,
modified_at DATETIME,
guid VARCHAR(64) NOT NULL UNIQUE,
hostname VARCHAR(63),
hw_in_docker BOOLEAN NOT NULL DEFAULT 0,
hw_in_vm BOOLEAN NOT NULL DEFAULT 0,
hw_manufacturer VARCHAR(63),
hw_product VARCHAR(63),
hw_uuid VARCHAR(255),
os_arch VARCHAR(15),
os_family VARCHAR(15),
os_platform VARCHAR(63),
os_release VARCHAR(63),
os_kernel VARCHAR(255),
stats TEXT DEFAULT '{}',
config TEXT DEFAULT '{}',
num_uses_failed INTEGER NOT NULL DEFAULT 0,
num_uses_succeeded INTEGER NOT NULL DEFAULT 0
);
CREATE TABLE IF NOT EXISTS machine_networkinterface (
id CHAR(36) PRIMARY KEY,
created_at DATETIME NOT NULL,
modified_at DATETIME,
machine_id CHAR(36) NOT NULL REFERENCES machine_machine(id),
mac_address VARCHAR(17),
ip_public VARCHAR(45),
ip_local VARCHAR(45),
dns_server VARCHAR(45),
hostname VARCHAR(63),
iface VARCHAR(15),
isp VARCHAR(63),
city VARCHAR(63),
region VARCHAR(63),
country VARCHAR(63),
num_uses_failed INTEGER NOT NULL DEFAULT 0,
num_uses_succeeded INTEGER NOT NULL DEFAULT 0
);
CREATE TABLE IF NOT EXISTS machine_dependency (
id CHAR(36) PRIMARY KEY,
created_at DATETIME NOT NULL,
modified_at DATETIME,
bin_name VARCHAR(63) NOT NULL UNIQUE,
bin_providers VARCHAR(127) NOT NULL DEFAULT '*',
custom_cmds TEXT DEFAULT '{}',
config TEXT DEFAULT '{}'
);
CREATE TABLE IF NOT EXISTS machine_binary (
id CHAR(36) PRIMARY KEY,
created_at DATETIME NOT NULL,
modified_at DATETIME,
machine_id CHAR(36) REFERENCES machine_machine(id),
dependency_id CHAR(36) REFERENCES machine_dependency(id),
name VARCHAR(63),
binprovider VARCHAR(31),
abspath VARCHAR(255),
version VARCHAR(32),
sha256 VARCHAR(64),
num_uses_failed INTEGER NOT NULL DEFAULT 0,
num_uses_succeeded INTEGER NOT NULL DEFAULT 0
);
-- API app tables (added in 0.8.x)
CREATE TABLE IF NOT EXISTS api_apitoken (
id CHAR(36) PRIMARY KEY,
created_by_id INTEGER NOT NULL REFERENCES auth_user(id),
created_at DATETIME NOT NULL,
modified_at DATETIME,
token VARCHAR(32) NOT NULL UNIQUE,
expires DATETIME
);
CREATE TABLE IF NOT EXISTS api_outboundwebhook (
id CHAR(36) PRIMARY KEY,
created_by_id INTEGER NOT NULL REFERENCES auth_user(id),
created_at DATETIME NOT NULL,
modified_at DATETIME,
name VARCHAR(255) NOT NULL DEFAULT '',
signal VARCHAR(255) NOT NULL,
ref VARCHAR(255) NOT NULL,
endpoint VARCHAR(2083) NOT NULL,
headers TEXT DEFAULT '{}',
auth_token VARCHAR(4000) NOT NULL DEFAULT '',
enabled BOOLEAN NOT NULL DEFAULT 1,
keep_last_response BOOLEAN NOT NULL DEFAULT 0,
last_response TEXT NOT NULL DEFAULT '',
last_success DATETIME,
last_failure DATETIME,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
num_uses_succeeded INTEGER NOT NULL DEFAULT 0
);
-- Core Tag table (AutoField PK in 0.8.x)
CREATE TABLE IF NOT EXISTS core_tag (
id INTEGER PRIMARY KEY AUTOINCREMENT,
name VARCHAR(100) NOT NULL UNIQUE,
slug VARCHAR(100) NOT NULL UNIQUE,
created_at DATETIME,
modified_at DATETIME,
created_by_id INTEGER REFERENCES auth_user(id)
);
-- Crawls tables (new in 0.8.x)
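-- Note: crawls_crawlschedule below references crawls_crawl before that table
-- is created; SQLite permits forward FK references (they are only resolved at
-- DML time, and only when PRAGMA foreign_keys enforcement is enabled).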
CREATE TABLE IF NOT EXISTS crawls_crawlschedule (
id CHAR(36) PRIMARY KEY,
created_at DATETIME NOT NULL,
created_by_id INTEGER NOT NULL REFERENCES auth_user(id),
modified_at DATETIME,
schedule VARCHAR(64) NOT NULL,
is_enabled BOOLEAN NOT NULL DEFAULT 1,
label VARCHAR(64) NOT NULL DEFAULT '',
notes TEXT NOT NULL DEFAULT '',
template_id CHAR(36) REFERENCES crawls_crawl(id),
num_uses_failed INTEGER NOT NULL DEFAULT 0,
num_uses_succeeded INTEGER NOT NULL DEFAULT 0
);
CREATE TABLE IF NOT EXISTS crawls_crawl (
id CHAR(36) PRIMARY KEY,
created_at DATETIME NOT NULL,
created_by_id INTEGER NOT NULL REFERENCES auth_user(id),
modified_at DATETIME,
urls TEXT NOT NULL,
config TEXT DEFAULT '{}',
max_depth SMALLINT UNSIGNED NOT NULL DEFAULT 0,
tags_str VARCHAR(1024) NOT NULL DEFAULT '',
persona_id CHAR(36),
label VARCHAR(64) NOT NULL DEFAULT '',
notes TEXT NOT NULL DEFAULT '',
schedule_id CHAR(36),
output_dir VARCHAR(256) NOT NULL DEFAULT '',
status VARCHAR(16) NOT NULL DEFAULT 'queued',
retry_at DATETIME,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
num_uses_succeeded INTEGER NOT NULL DEFAULT 0
);
-- Core Snapshot table (0.8.x with UUID PK, status, crawl FK)
CREATE TABLE IF NOT EXISTS core_snapshot (
id CHAR(36) PRIMARY KEY,
created_by_id INTEGER NOT NULL REFERENCES auth_user(id),
created_at DATETIME NOT NULL,
modified_at DATETIME,
url VARCHAR(2000) NOT NULL,
timestamp VARCHAR(32) NOT NULL UNIQUE,
bookmarked_at DATETIME NOT NULL,
crawl_id CHAR(36) REFERENCES crawls_crawl(id),
title VARCHAR(512),
downloaded_at DATETIME,
depth SMALLINT UNSIGNED NOT NULL DEFAULT 0,
retry_at DATETIME,
status VARCHAR(16) NOT NULL DEFAULT 'queued',
config TEXT DEFAULT '{}',
notes TEXT NOT NULL DEFAULT '',
output_dir VARCHAR(256),
num_uses_failed INTEGER NOT NULL DEFAULT 0,
num_uses_succeeded INTEGER NOT NULL DEFAULT 0
);
CREATE INDEX IF NOT EXISTS core_snapshot_url ON core_snapshot(url);
CREATE INDEX IF NOT EXISTS core_snapshot_timestamp ON core_snapshot(timestamp);
CREATE INDEX IF NOT EXISTS core_snapshot_created_at ON core_snapshot(created_at);
-- Many-to-many for snapshot tags
CREATE TABLE IF NOT EXISTS core_snapshot_tags (
id INTEGER PRIMARY KEY AUTOINCREMENT,
snapshot_id CHAR(36) NOT NULL REFERENCES core_snapshot(id),
tag_id INTEGER NOT NULL REFERENCES core_tag(id),
UNIQUE(snapshot_id, tag_id)
);
-- Core ArchiveResult table (0.8.x with AutoField PK + UUID, status)
CREATE TABLE IF NOT EXISTS core_archiveresult (
id INTEGER PRIMARY KEY AUTOINCREMENT,
uuid CHAR(36) UNIQUE,
created_by_id INTEGER NOT NULL REFERENCES auth_user(id),
created_at DATETIME NOT NULL,
modified_at DATETIME,
snapshot_id CHAR(36) NOT NULL REFERENCES core_snapshot(id),
extractor VARCHAR(32) NOT NULL,
pwd VARCHAR(256),
cmd TEXT,
cmd_version VARCHAR(128),
output VARCHAR(1024),
start_ts DATETIME,
end_ts DATETIME,
status VARCHAR(16) NOT NULL DEFAULT 'queued',
retry_at DATETIME,
notes TEXT NOT NULL DEFAULT '',
output_dir VARCHAR(256),
iface_id INTEGER,
config TEXT DEFAULT '{}',
num_uses_failed INTEGER NOT NULL DEFAULT 0,
num_uses_succeeded INTEGER NOT NULL DEFAULT 0
);
CREATE INDEX IF NOT EXISTS core_archiveresult_snapshot ON core_archiveresult(snapshot_id);
CREATE INDEX IF NOT EXISTS core_archiveresult_extractor ON core_archiveresult(extractor);
-- Insert required content types
INSERT INTO django_content_type (app_label, model) VALUES
('contenttypes', 'contenttype'),
('auth', 'permission'),
('auth', 'group'),
('auth', 'user'),
('admin', 'logentry'),
('sessions', 'session'),
('core', 'snapshot'),
('core', 'archiveresult'),
('core', 'tag'),
('machine', 'machine'),
('machine', 'networkinterface'),
('machine', 'dependency'),
('machine', 'binary'),
('crawls', 'crawl'),
('crawls', 'crawlschedule'),
('crawls', 'seed'),
('api', 'apitoken'),
('api', 'outboundwebhook');
"""
# =============================================================================
# Test Data Generators
# =============================================================================
def generate_uuid() -> str:
"""Generate a UUID string without dashes for SQLite."""
return uuid4().hex
def generate_timestamp() -> str:
"""Generate a timestamp string like ArchiveBox uses."""
return datetime.now(timezone.utc).strftime('%Y%m%d%H%M%S') + '.000000'
def seed_0_4_data(db_path: Path) -> Dict[str, List[Dict]]:
"""Seed a 0.4.x database with realistic test data."""
conn = sqlite3.connect(str(db_path))
cursor = conn.cursor()
created_data = {
'snapshots': [],
'tags_str': [],
}
test_urls = [
('https://example.com/page1', 'Example Page 1', 'news,tech'),
('https://example.org/article', 'Article Title', 'blog,reading'),
('https://github.com/user/repo', 'GitHub Repository', 'code,github'),
('https://news.ycombinator.com/item?id=12345', 'HN Discussion', 'news,discussion'),
('https://en.wikipedia.org/wiki/Test', 'Wikipedia Test', 'reference,wiki'),
]
for i, (url, title, tags) in enumerate(test_urls):
snapshot_id = generate_uuid()
timestamp = f'2024010{i+1}120000.000000'
added = f'2024-01-0{i+1} 12:00:00'
cursor.execute("""
INSERT INTO core_snapshot (id, url, timestamp, title, tags, added, updated)
VALUES (?, ?, ?, ?, ?, ?, ?)
""", (snapshot_id, url, timestamp, title, tags, added, added))
created_data['snapshots'].append({
'id': snapshot_id,
'url': url,
'timestamp': timestamp,
'title': title,
'tags': tags,
})
created_data['tags_str'].append(tags)
cursor.execute("""
INSERT INTO django_migrations (app, name, applied)
VALUES ('core', '0001_initial', datetime('now'))
""")
conn.commit()
conn.close()
return created_data
def seed_0_7_data(db_path: Path) -> Dict[str, List[Dict]]:
"""Seed a 0.7.x database with realistic test data."""
conn = sqlite3.connect(str(db_path))
cursor = conn.cursor()
created_data = {
'users': [],
'snapshots': [],
'tags': [],
'archiveresults': [],
}
# Create a user
cursor.execute("""
INSERT INTO auth_user (password, is_superuser, username, first_name, last_name,
email, is_staff, is_active, date_joined)
VALUES ('pbkdf2_sha256$test', 1, 'admin', 'Admin', 'User',
'admin@example.com', 1, 1, datetime('now'))
""")
user_id = cursor.lastrowid
created_data['users'].append({'id': user_id, 'username': 'admin'})
# Create 5 tags
tag_names = ['news', 'tech', 'blog', 'reference', 'code']
for name in tag_names:
cursor.execute("""
INSERT INTO core_tag (name, slug) VALUES (?, ?)
""", (name, name.lower()))
tag_id = cursor.lastrowid
created_data['tags'].append({'id': tag_id, 'name': name, 'slug': name.lower()})
# Create 5 snapshots
test_urls = [
('https://example.com/page1', 'Example Page 1'),
('https://example.org/article', 'Article Title'),
('https://github.com/user/repo', 'GitHub Repository'),
('https://news.ycombinator.com/item?id=12345', 'HN Discussion'),
('https://en.wikipedia.org/wiki/Test', 'Wikipedia Test'),
]
for i, (url, title) in enumerate(test_urls):
snapshot_id = generate_uuid()
timestamp = f'2024010{i+1}120000.000000'
added = f'2024-01-0{i+1} 12:00:00'
cursor.execute("""
INSERT INTO core_snapshot (id, url, timestamp, title, added, updated)
VALUES (?, ?, ?, ?, ?, ?)
""", (snapshot_id, url, timestamp, title, added, added))
created_data['snapshots'].append({
'id': snapshot_id,
'url': url,
'timestamp': timestamp,
'title': title,
})
# Assign 2 tags to each snapshot
tag_ids = [created_data['tags'][i % 5]['id'], created_data['tags'][(i + 1) % 5]['id']]
for tag_id in tag_ids:
cursor.execute("""
INSERT INTO core_snapshot_tags (snapshot_id, tag_id) VALUES (?, ?)
""", (snapshot_id, tag_id))
# Create 5 archive results for each snapshot
extractors = ['title', 'favicon', 'screenshot', 'singlefile', 'wget']
statuses = ['succeeded', 'succeeded', 'failed', 'succeeded', 'skipped']
for j, (extractor, status) in enumerate(zip(extractors, statuses)):
cursor.execute("""
INSERT INTO core_archiveresult
(snapshot_id, extractor, cmd, pwd, cmd_version, output, start_ts, end_ts, status)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
""", (
snapshot_id, extractor,
json.dumps([extractor, '--version']),
f'/data/archive/{timestamp}',
'1.0.0',
f'{extractor}/index.html' if status == 'succeeded' else '',
f'2024-01-0{i+1} 12:00:0{j}',
f'2024-01-0{i+1} 12:00:1{j}',
status
))
created_data['archiveresults'].append({
'snapshot_id': snapshot_id,
'extractor': extractor,
'status': status,
})
# Record migrations as applied (0.7.x migrations up to 0022)
migrations = [
('contenttypes', '0001_initial'),
('contenttypes', '0002_remove_content_type_name'),
('auth', '0001_initial'),
('auth', '0002_alter_permission_name_max_length'),
('auth', '0003_alter_user_email_max_length'),
('auth', '0004_alter_user_username_opts'),
('auth', '0005_alter_user_last_login_null'),
('auth', '0006_require_contenttypes_0002'),
('auth', '0007_alter_validators_add_error_messages'),
('auth', '0008_alter_user_username_max_length'),
('auth', '0009_alter_user_last_name_max_length'),
('auth', '0010_alter_group_name_max_length'),
('auth', '0011_update_proxy_permissions'),
('auth', '0012_alter_user_first_name_max_length'),
('admin', '0001_initial'),
('admin', '0002_logentry_remove_auto_add'),
('admin', '0003_logentry_add_action_flag_choices'),
('sessions', '0001_initial'),
('core', '0001_initial'),
('core', '0002_auto_20200625_1521'),
('core', '0003_auto_20200630_1034'),
('core', '0004_auto_20200713_1552'),
('core', '0005_auto_20200728_0326'),
('core', '0006_auto_20201012_1520'),
('core', '0007_archiveresult'),
('core', '0008_auto_20210105_1421'),
('core', '0009_auto_20210216_1038'),
('core', '0010_auto_20210216_1055'),
('core', '0011_auto_20210216_1331'),
('core', '0012_auto_20210216_1425'),
('core', '0013_auto_20210218_0729'),
('core', '0014_auto_20210218_0729'),
('core', '0015_auto_20210218_0730'),
('core', '0016_auto_20210218_1204'),
('core', '0017_auto_20210219_0211'),
('core', '0018_auto_20210327_0952'),
('core', '0019_auto_20210401_0654'),
('core', '0020_auto_20210410_1031'),
('core', '0021_auto_20220914_0934'),
('core', '0022_auto_20231023_2008'),
]
for app, name in migrations:
cursor.execute("""
INSERT INTO django_migrations (app, name, applied)
VALUES (?, ?, datetime('now'))
""", (app, name))
conn.commit()
conn.close()
return created_data
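# Example usage (a sketch; assumes SCHEMA_0_7 was already applied to db_path,
# e.g. via the illustrative init_db_from_schema() above):
#
#     data = seed_0_7_data(db_path)
#     assert verify_snapshot_count(db_path, len(data['snapshots']))[0]
#     assert verify_tag_count(db_path, len(data['tags']))[0]
#     assert verify_archiveresult_count(db_path, len(data['archiveresults']))[0]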
def seed_0_8_data(db_path: Path) -> Dict[str, List[Dict]]:
"""Seed a 0.8.x database with realistic test data including Crawls."""
conn = sqlite3.connect(str(db_path))
cursor = conn.cursor()
created_data = {
'users': [],
'crawls': [],
'snapshots': [],
'tags': [],
'archiveresults': [],
}
# Create a user
cursor.execute("""
INSERT INTO auth_user (password, is_superuser, username, first_name, last_name,
email, is_staff, is_active, date_joined)
VALUES ('pbkdf2_sha256$test', 1, 'admin', 'Admin', 'User',
'admin@example.com', 1, 1, datetime('now'))
""")
user_id = cursor.lastrowid
created_data['users'].append({'id': user_id, 'username': 'admin'})
# Create 5 tags
tag_names = ['news', 'tech', 'blog', 'reference', 'code']
for name in tag_names:
cursor.execute("""
INSERT INTO core_tag (name, slug, created_at, modified_at, created_by_id)
VALUES (?, ?, datetime('now'), datetime('now'), ?)
""", (name, name.lower(), user_id))
tag_id = cursor.lastrowid
created_data['tags'].append({'id': tag_id, 'name': name, 'slug': name.lower()})
    # Create 2 Crawls (0.9.0-style schema: URLs stored directly on the Crawl, no Seed rows)
test_crawls = [
('https://example.com\nhttps://example.org', 0, 'Example Crawl'),
('https://github.com/ArchiveBox', 1, 'GitHub Crawl'),
]
for i, (urls, max_depth, label) in enumerate(test_crawls):
crawl_id = generate_uuid()
cursor.execute("""
INSERT INTO crawls_crawl (id, created_at, created_by_id, modified_at, urls,
config, max_depth, tags_str, label, status, retry_at,
num_uses_failed, num_uses_succeeded)
VALUES (?, datetime('now'), ?, datetime('now'), ?, '{}', ?, '', ?, 'queued', datetime('now'), 0, 0)
""", (crawl_id, user_id, urls, max_depth, label))
created_data['crawls'].append({
'id': crawl_id,
'urls': urls,
'max_depth': max_depth,
'label': label,
})
# Create 5 snapshots linked to crawls
test_urls = [
('https://example.com/page1', 'Example Page 1', created_data['crawls'][0]['id']),
('https://example.org/article', 'Article Title', created_data['crawls'][0]['id']),
('https://github.com/user/repo', 'GitHub Repository', created_data['crawls'][1]['id']),
('https://news.ycombinator.com/item?id=12345', 'HN Discussion', None),
('https://en.wikipedia.org/wiki/Test', 'Wikipedia Test', None),
]
for i, (url, title, crawl_id) in enumerate(test_urls):
snapshot_id = generate_uuid()
timestamp = f'2024010{i+1}120000.000000'
created_at = f'2024-01-0{i+1} 12:00:00'
cursor.execute("""
INSERT INTO core_snapshot (id, created_by_id, created_at, modified_at, url, timestamp,
bookmarked_at, crawl_id, title, depth, status, config, notes)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, 0, 'queued', '{}', '')
""", (snapshot_id, user_id, created_at, created_at, url, timestamp, created_at, crawl_id, title))
created_data['snapshots'].append({
'id': snapshot_id,
'url': url,
'timestamp': timestamp,
'title': title,
'crawl_id': crawl_id,
})
# Assign 2 tags to each snapshot
tag_ids = [created_data['tags'][i % 5]['id'], created_data['tags'][(i + 1) % 5]['id']]
for tag_id in tag_ids:
cursor.execute("""
INSERT INTO core_snapshot_tags (snapshot_id, tag_id) VALUES (?, ?)
""", (snapshot_id, tag_id))
# Create 5 archive results for each snapshot
extractors = ['title', 'favicon', 'screenshot', 'singlefile', 'wget']
statuses = ['succeeded', 'succeeded', 'failed', 'succeeded', 'skipped']
for j, (extractor, status) in enumerate(zip(extractors, statuses)):
result_uuid = generate_uuid()
cursor.execute("""
INSERT INTO core_archiveresult
(uuid, created_by_id, created_at, modified_at, snapshot_id, extractor, pwd,
cmd, cmd_version, output, start_ts, end_ts, status, retry_at, notes, output_dir)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, datetime('now'), '', ?)
""", (
result_uuid, user_id, f'2024-01-0{i+1} 12:00:0{j}', f'2024-01-0{i+1} 12:00:1{j}',
snapshot_id, extractor,
f'/data/archive/{timestamp}',
json.dumps([extractor, '--version']),
'1.0.0',
f'{extractor}/index.html' if status == 'succeeded' else '',
f'2024-01-0{i+1} 12:00:0{j}',
f'2024-01-0{i+1} 12:00:1{j}',
status,
f'{extractor}',
))
created_data['archiveresults'].append({
'uuid': result_uuid,
'snapshot_id': snapshot_id,
'extractor': extractor,
'status': status,
})
# Record migrations as applied (0.8.x migrations)
migrations = [
('contenttypes', '0001_initial'),
('contenttypes', '0002_remove_content_type_name'),
('auth', '0001_initial'),
('auth', '0002_alter_permission_name_max_length'),
('auth', '0003_alter_user_email_max_length'),
('auth', '0004_alter_user_username_opts'),
('auth', '0005_alter_user_last_login_null'),
('auth', '0006_require_contenttypes_0002'),
('auth', '0007_alter_validators_add_error_messages'),
('auth', '0008_alter_user_username_max_length'),
('auth', '0009_alter_user_last_name_max_length'),
('auth', '0010_alter_group_name_max_length'),
('auth', '0011_update_proxy_permissions'),
('auth', '0012_alter_user_first_name_max_length'),
('admin', '0001_initial'),
('admin', '0002_logentry_remove_auto_add'),
('admin', '0003_logentry_add_action_flag_choices'),
('sessions', '0001_initial'),
('core', '0001_initial'),
('core', '0002_auto_20200625_1521'),
('core', '0003_auto_20200630_1034'),
('core', '0004_auto_20200713_1552'),
('core', '0005_auto_20200728_0326'),
('core', '0006_auto_20201012_1520'),
('core', '0007_archiveresult'),
('core', '0008_auto_20210105_1421'),
('core', '0009_auto_20210216_1038'),
('core', '0010_auto_20210216_1055'),
('core', '0011_auto_20210216_1331'),
('core', '0012_auto_20210216_1425'),
('core', '0013_auto_20210218_0729'),
('core', '0014_auto_20210218_0729'),
('core', '0015_auto_20210218_0730'),
('core', '0016_auto_20210218_1204'),
('core', '0017_auto_20210219_0211'),
('core', '0018_auto_20210327_0952'),
('core', '0019_auto_20210401_0654'),
('core', '0020_auto_20210410_1031'),
('core', '0021_auto_20220914_0934'),
('core', '0022_auto_20231023_2008'),
# For 0.8.x (dev branch), record the migrations that 0023_new_schema replaces
('core', '0023_alter_archiveresult_options_archiveresult_abid_and_more'),
('core', '0024_auto_20240513_1143'),
('core', '0025_alter_archiveresult_uuid'),
('core', '0026_archiveresult_created_archiveresult_created_by_and_more'),
('core', '0027_update_snapshot_ids'),
('core', '0028_alter_archiveresult_uuid'),
('core', '0029_alter_archiveresult_id'),
('core', '0030_alter_archiveresult_uuid'),
('core', '0031_alter_archiveresult_id_alter_archiveresult_uuid_and_more'),
('core', '0032_alter_archiveresult_id'),
('core', '0033_rename_id_archiveresult_old_id'),
('core', '0034_alter_archiveresult_old_id_alter_archiveresult_uuid'),
('core', '0035_remove_archiveresult_uuid_archiveresult_id'),
('core', '0036_alter_archiveresult_id_alter_archiveresult_old_id'),
('core', '0037_rename_id_snapshot_old_id'),
('core', '0038_rename_uuid_snapshot_id'),
('core', '0039_rename_snapshot_archiveresult_snapshot_old'),
('core', '0040_archiveresult_snapshot'),
('core', '0041_alter_archiveresult_snapshot_and_more'),
('core', '0042_remove_archiveresult_snapshot_old'),
('core', '0043_alter_archiveresult_snapshot_alter_snapshot_id_and_more'),
('core', '0044_alter_archiveresult_snapshot_alter_tag_uuid_and_more'),
('core', '0045_alter_snapshot_old_id'),
('core', '0046_alter_archiveresult_snapshot_alter_snapshot_id_and_more'),
('core', '0047_alter_snapshottag_unique_together_and_more'),
('core', '0048_alter_archiveresult_snapshot_and_more'),
('core', '0049_rename_snapshot_snapshottag_snapshot_old_and_more'),
('core', '0050_alter_snapshottag_snapshot_old'),
('core', '0051_snapshottag_snapshot_alter_snapshottag_snapshot_old'),
('core', '0052_alter_snapshottag_unique_together_and_more'),
('core', '0053_remove_snapshottag_snapshot_old'),
('core', '0054_alter_snapshot_timestamp'),
('core', '0055_alter_tag_slug'),
('core', '0056_remove_tag_uuid'),
('core', '0057_rename_id_tag_old_id'),
('core', '0058_alter_tag_old_id'),
('core', '0059_tag_id'),
('core', '0060_alter_tag_id'),
('core', '0061_rename_tag_snapshottag_old_tag_and_more'),
('core', '0062_alter_snapshottag_old_tag'),
('core', '0063_snapshottag_tag_alter_snapshottag_old_tag'),
('core', '0064_alter_snapshottag_unique_together_and_more'),
('core', '0065_remove_snapshottag_old_tag'),
('core', '0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id'),
('core', '0067_alter_snapshottag_tag'),
('core', '0068_alter_archiveresult_options'),
('core', '0069_alter_archiveresult_created_alter_snapshot_added_and_more'),
('core', '0070_alter_archiveresult_created_by_alter_snapshot_added_and_more'),
('core', '0071_remove_archiveresult_old_id_remove_snapshot_old_id_and_more'),
('core', '0072_rename_added_snapshot_bookmarked_at_and_more'),
('core', '0073_rename_created_archiveresult_created_at_and_more'),
('core', '0074_alter_snapshot_downloaded_at'),
# For 0.8.x: DO NOT record 0023_new_schema - it replaces 0023-0074 for fresh installs
# We already recorded 0023-0074 above, so Django will know the state
# For 0.8.x: Record original machine migrations (before squashing)
# DO NOT record 0001_squashed here - it replaces 0001-0004 for fresh installs
('machine', '0001_initial'),
('machine', '0002_alter_machine_stats_installedbinary'),
('machine', '0003_alter_installedbinary_options_and_more'),
('machine', '0004_alter_installedbinary_abspath_and_more'),
# Then the new migrations after squashing
('machine', '0002_rename_custom_cmds_to_overrides'),
('machine', '0003_alter_dependency_id_alter_installedbinary_dependency_and_more'),
('machine', '0004_drop_dependency_table'),
# Crawls must come before core.0024 because 0024_b depends on it
('crawls', '0001_initial'),
# Core 0024 migrations chain (in dependency order)
('core', '0024_b_clear_config_fields'),
('core', '0024_c_disable_fk_checks'),
('core', '0024_d_fix_crawls_config'),
('core', '0024_snapshot_crawl'),
('core', '0024_f_add_snapshot_config'),
('core', '0025_allow_duplicate_urls_per_crawl'),
# For 0.8.x: Record original api migration (before squashing)
# DO NOT record 0001_squashed here - it replaces 0001 for fresh installs
('api', '0001_initial'),
('api', '0002_alter_apitoken_options'),
('api', '0003_rename_user_apitoken_created_by_apitoken_abid_and_more'),
('api', '0004_alter_apitoken_id_alter_apitoken_uuid'),
('api', '0005_remove_apitoken_uuid_remove_outboundwebhook_uuid_and_more'),
('api', '0006_remove_outboundwebhook_uuid_apitoken_id_and_more'),
('api', '0007_alter_apitoken_created_by'),
('api', '0008_alter_apitoken_created_alter_apitoken_created_by_and_more'),
('api', '0009_rename_created_apitoken_created_at_and_more'),
# Note: crawls.0001_initial moved earlier (before core.0024) due to dependencies
        # Stop here: 0.8.x ends at core.0025 and crawls.0001; the migrations after
        # this point are exactly what these tests need to exercise.
        # Do NOT record 0026+ - they must be applied (and verified) during the test run.
]
for app, name in migrations:
cursor.execute("""
INSERT INTO django_migrations (app, name, applied)
VALUES (?, ?, datetime('now'))
""", (app, name))
conn.commit()
conn.close()
return created_data
# =============================================================================
# Helper Functions
# =============================================================================
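def get_applied_migrations(db_path: Path) -> set:
    """
    Illustrative helper (not in the original module): read back the
    django_migrations rows seeded above as a set of (app, name) pairs,
    so a test can diff the recorded state against an expected list.
    """
    conn = sqlite3.connect(str(db_path))
    try:
        cursor = conn.execute("SELECT app, name FROM django_migrations")
        return {(app, name) for app, name in cursor.fetchall()}
    finally:
        conn.close()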
def run_archivebox(data_dir: Path, args: list, timeout: int = 60, env: Optional[dict] = None) -> subprocess.CompletedProcess:
"""Run archivebox command in subprocess with given data directory."""
base_env = os.environ.copy()
base_env['DATA_DIR'] = str(data_dir)
base_env['USE_COLOR'] = 'False'
base_env['SHOW_PROGRESS'] = 'False'
# Disable ALL extractors for faster tests (can be overridden by env parameter)
base_env['SAVE_ARCHIVEDOTORG'] = 'False'
base_env['SAVE_TITLE'] = 'False'
base_env['SAVE_FAVICON'] = 'False'
base_env['SAVE_WGET'] = 'False'
base_env['SAVE_SINGLEFILE'] = 'False'
base_env['SAVE_SCREENSHOT'] = 'False'
base_env['SAVE_PDF'] = 'False'
base_env['SAVE_DOM'] = 'False'
base_env['SAVE_READABILITY'] = 'False'
base_env['SAVE_MERCURY'] = 'False'
base_env['SAVE_GIT'] = 'False'
base_env['SAVE_YTDLP'] = 'False'
base_env['SAVE_HEADERS'] = 'False'
base_env['SAVE_HTMLTOTEXT'] = 'False'
# Override with any custom env vars
if env:
base_env.update(env)
cmd = [sys.executable, '-m', 'archivebox'] + args
return subprocess.run(
cmd,
capture_output=True,
text=True,
env=base_env,
cwd=str(data_dir),
timeout=timeout,
)
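# Example usage (a sketch): run a cheap command against a data dir and assert a
# clean exit. Assumes the archivebox package is importable by sys.executable so
# that `python -m archivebox` resolves.
#
#     result = run_archivebox(data_dir, ['version'], timeout=120)
#     assert result.returncode == 0, result.stderr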
def create_data_dir_structure(data_dir: Path):
"""Create the basic ArchiveBox data directory structure."""
(data_dir / 'archive').mkdir(parents=True, exist_ok=True)
(data_dir / 'sources').mkdir(parents=True, exist_ok=True)
(data_dir / 'logs').mkdir(parents=True, exist_ok=True)
def verify_snapshot_count(db_path: Path, expected: int) -> Tuple[bool, str]:
"""Verify the number of snapshots in the database."""
conn = sqlite3.connect(str(db_path))
cursor = conn.cursor()
cursor.execute("SELECT COUNT(*) FROM core_snapshot")
count = cursor.fetchone()[0]
conn.close()
if count == expected:
return True, f"Snapshot count OK: {count}"
return False, f"Snapshot count mismatch: expected {expected}, got {count}"
def verify_tag_count(db_path: Path, expected: int) -> Tuple[bool, str]:
"""Verify the number of tags in the database (exact match)."""
conn = sqlite3.connect(str(db_path))
cursor = conn.cursor()
cursor.execute("SELECT COUNT(*) FROM core_tag")
count = cursor.fetchone()[0]
conn.close()
if count == expected:
return True, f"Tag count OK: {count}"
return False, f"Tag count mismatch: expected {expected}, got {count}"
def verify_archiveresult_count(db_path: Path, expected: int) -> Tuple[bool, str]:
"""Verify the number of archive results in the database."""
conn = sqlite3.connect(str(db_path))
cursor = conn.cursor()
cursor.execute("SELECT COUNT(*) FROM core_archiveresult")
count = cursor.fetchone()[0]
conn.close()
if count == expected:
return True, f"ArchiveResult count OK: {count}"
return False, f"ArchiveResult count mismatch: expected {expected}, got {count}"
def verify_snapshot_urls(db_path: Path, expected_urls: List[str]) -> Tuple[bool, str]:
"""Verify ALL expected URLs exist in snapshots."""
conn = sqlite3.connect(str(db_path))
cursor = conn.cursor()
cursor.execute("SELECT url FROM core_snapshot")
actual_urls = {row[0] for row in cursor.fetchall()}
conn.close()
missing = set(expected_urls) - actual_urls
if not missing:
return True, "All URLs preserved"
return False, f"Missing URLs: {missing}"
def verify_snapshot_titles(db_path: Path, expected_titles: Dict[str, str]) -> Tuple[bool, str]:
"""Verify ALL snapshot titles are preserved."""
conn = sqlite3.connect(str(db_path))
cursor = conn.cursor()
cursor.execute("SELECT url, title FROM core_snapshot")
actual = {row[0]: row[1] for row in cursor.fetchall()}
conn.close()
mismatches = []
for url, expected_title in expected_titles.items():
if url not in actual:
mismatches.append(f"{url}: missing from database")
elif actual[url] != expected_title:
mismatches.append(f"{url}: expected '{expected_title}', got '{actual[url]}'")
if not mismatches:
return True, "All titles preserved"
return False, f"Title mismatches: {mismatches}"
def verify_foreign_keys(db_path: Path) -> Tuple[bool, str]:
"""Verify foreign key relationships are intact."""
conn = sqlite3.connect(str(db_path))
cursor = conn.cursor()
# Check ArchiveResult -> Snapshot FK
cursor.execute("""
SELECT COUNT(*) FROM core_archiveresult ar
WHERE NOT EXISTS (SELECT 1 FROM core_snapshot s WHERE s.id = ar.snapshot_id)
""")
orphaned_results = cursor.fetchone()[0]
conn.close()
if orphaned_results == 0:
return True, "Foreign keys intact"
return False, f"Found {orphaned_results} orphaned ArchiveResults"
def verify_all_snapshots_in_output(output: str, snapshots: List[Dict]) -> Tuple[bool, str]:
"""Verify ALL snapshots appear in command output (not just one)."""
missing = []
for snapshot in snapshots:
url_fragment = snapshot['url'][:30]
title = snapshot.get('title', '')
if url_fragment not in output and (not title or title not in output):
missing.append(snapshot['url'])
if not missing:
return True, "All snapshots found in output"
return False, f"Missing snapshots in output: {missing}"
def verify_crawl_count(db_path: Path, expected: int) -> Tuple[bool, str]:
"""Verify the number of crawls in the database."""
conn = sqlite3.connect(str(db_path))
cursor = conn.cursor()
cursor.execute("SELECT COUNT(*) FROM crawls_crawl")
count = cursor.fetchone()[0]
conn.close()
if count == expected:
return True, f"Crawl count OK: {count}"
return False, f"Crawl count mismatch: expected {expected}, got {count}"
def verify_process_migration(db_path: Path, expected_archiveresult_count: int) -> Tuple[bool, str]:
"""
Verify that ArchiveResults were properly migrated to Process records.
Checks:
1. All ArchiveResults have process_id set
2. Process count matches ArchiveResult count
3. Binary records created for unique cmd_version values
4. Status mapping is correct
"""
conn = sqlite3.connect(str(db_path))
cursor = conn.cursor()
# Check all ArchiveResults have process_id
cursor.execute("SELECT COUNT(*) FROM core_archiveresult WHERE process_id IS NULL")
null_count = cursor.fetchone()[0]
if null_count > 0:
conn.close()
return False, f"Found {null_count} ArchiveResults without process_id"
# Check Process count
cursor.execute("SELECT COUNT(*) FROM machine_process")
process_count = cursor.fetchone()[0]
if process_count != expected_archiveresult_count:
conn.close()
return False, f"Expected {expected_archiveresult_count} Processes, got {process_count}"
# Check status mapping
cursor.execute("""
SELECT ar.status, p.status, p.exit_code
FROM core_archiveresult ar
JOIN machine_process p ON ar.process_id = p.id
""")
status_errors = []
for ar_status, p_status, p_exit_code in cursor.fetchall():
expected_p_status, expected_exit_code = {
'queued': ('queued', None),
'started': ('running', None),
'backoff': ('queued', None),
'succeeded': ('exited', 0),
'failed': ('exited', 1),
'skipped': ('exited', None),
}.get(ar_status, ('queued', None))
if p_status != expected_p_status:
status_errors.append(f"AR status {ar_status} → Process {p_status}, expected {expected_p_status}")
if p_exit_code != expected_exit_code:
status_errors.append(f"AR status {ar_status} → exit_code {p_exit_code}, expected {expected_exit_code}")
if status_errors:
conn.close()
return False, f"Status mapping errors: {'; '.join(status_errors[:5])}"
conn.close()
return True, f"Process migration verified: {process_count} Processes created"