use full dotted paths for all archivebox imports, add migrations and more fixes

This commit is contained in:
Nick Sweeting
2025-12-29 00:47:08 -08:00
parent 1e4d3ffd11
commit f4e7820533
61 changed files with 1082 additions and 2985 deletions

View File

@@ -763,7 +763,7 @@ The configuration is documented here: **[Configuration Wiki](https://github.com/
<br/> <br/>
TIMEOUT=240 # default: 60 add more seconds on slower networks TIMEOUT=240 # default: 60 add more seconds on slower networks
CHECK_SSL_VALIDITY=False # default: True False = allow saving URLs w/ bad SSL CHECK_SSL_VALIDITY=False # default: True False = allow saving URLs w/ bad SSL
SAVE_ARCHIVE_DOT_ORG=False # default: True False = disable Archive.org saving SAVE_ARCHIVEDOTORG=False # default: True False = disable Archive.org saving
MAX_MEDIA_SIZE=1500m # default: 750m raise/lower youtubedl output size MAX_MEDIA_SIZE=1500m # default: 750m raise/lower youtubedl output size
<br/> <br/>
PUBLIC_INDEX=True # default: True whether anon users can view index PUBLIC_INDEX=True # default: True whether anon users can view index
@@ -959,7 +959,7 @@ archivebox add 'https://docs.google.com/document/d/12345somePrivateDocument'
archivebox add 'https://vimeo.com/somePrivateVideo' archivebox add 'https://vimeo.com/somePrivateVideo'
# without first disabling saving to Archive.org: # without first disabling saving to Archive.org:
archivebox config --set SAVE_ARCHIVE_DOT_ORG=False # disable saving all URLs in Archive.org archivebox config --set SAVE_ARCHIVEDOTORG=False # disable saving all URLs in Archive.org
# restrict the main index, Snapshot content, and Add Page to authenticated users as-needed: # restrict the main index, Snapshot content, and Add Page to authenticated users as-needed:
archivebox config --set PUBLIC_INDEX=False archivebox config --set PUBLIC_INDEX=False

View File

@@ -26,10 +26,10 @@ ASCII_LOGO = """
PACKAGE_DIR = Path(__file__).resolve().parent PACKAGE_DIR = Path(__file__).resolve().parent
# Add PACKAGE_DIR to sys.path - required for Django migrations to import models # # Add PACKAGE_DIR to sys.path - required for Django migrations to import models
# Migrations reference models like 'machine.Binary' which need to be importable # # Migrations reference models like 'machine.Binary' which need to be importable
if str(PACKAGE_DIR) not in sys.path: # if str(PACKAGE_DIR) not in sys.path:
sys.path.append(str(PACKAGE_DIR)) # sys.path.append(str(PACKAGE_DIR))
os.environ['DJANGO_SETTINGS_MODULE'] = 'archivebox.core.settings' os.environ['DJANGO_SETTINGS_MODULE'] = 'archivebox.core.settings'
os.environ['TZ'] = 'UTC' os.environ['TZ'] = 'UTC'

View File

@@ -5,6 +5,7 @@ from django.apps import AppConfig
class APIConfig(AppConfig): class APIConfig(AppConfig):
name = 'archivebox.api' name = 'archivebox.api'
label = 'api'
def register_admin(admin_site): def register_admin(admin_site):

View File

@@ -94,7 +94,7 @@ class OrchestratorSchema(Schema):
@router.get("/orchestrator", response=OrchestratorSchema, url_name="get_orchestrator") @router.get("/orchestrator", response=OrchestratorSchema, url_name="get_orchestrator")
def get_orchestrator(request): def get_orchestrator(request):
"""Get the orchestrator status and all worker queues.""" """Get the orchestrator status and all worker queues."""
from workers.orchestrator import Orchestrator from archivebox.workers.orchestrator import Orchestrator
from workers.worker import CrawlWorker, SnapshotWorker, ArchiveResultWorker from workers.worker import CrawlWorker, SnapshotWorker, ArchiveResultWorker
orchestrator = Orchestrator() orchestrator = Orchestrator()

View File

@@ -73,7 +73,7 @@ class ModelWithUUID(models.Model):
return f'/api/v1/docs#/{self._meta.app_label.title()}%20Models/api_v1_{self._meta.app_label}_get_{self._meta.db_table}' return f'/api/v1/docs#/{self._meta.app_label.title()}%20Models/api_v1_{self._meta.app_label}_get_{self._meta.db_table}'
def as_json(self, keys: Iterable[str] = ()) -> dict: def as_json(self, keys: Iterable[str] = ()) -> dict:
default_keys = ('id', 'created_at', 'modified_at', 'created_by_id') default_keys = ('id', 'created_at', 'modified_at')
return {key: getattr(self, key) for key in (keys or default_keys) if hasattr(self, key)} return {key: getattr(self, key) for key in (keys or default_keys) if hasattr(self, key)}
@@ -119,7 +119,7 @@ class ModelWithHealthStats(models.Model):
class ModelWithConfig(models.Model): class ModelWithConfig(models.Model):
"""Mixin for models with a JSON config field.""" """Mixin for models with a JSON config field."""
config = models.JSONField(default=dict, null=False, blank=False, editable=True) config = models.JSONField(default=dict, null=True, blank=True, editable=True)
class Meta: class Meta:
abstract = True abstract = True

View File

@@ -56,7 +56,7 @@ def add(urls: str | list[str],
from archivebox.core.models import Snapshot from archivebox.core.models import Snapshot
from archivebox.crawls.models import Crawl from archivebox.crawls.models import Crawl
from archivebox.base_models.models import get_or_create_system_user_pk from archivebox.base_models.models import get_or_create_system_user_pk
from workers.orchestrator import Orchestrator from archivebox.workers.orchestrator import Orchestrator
created_by_id = created_by_id or get_or_create_system_user_pk() created_by_id = created_by_id or get_or_create_system_user_pk()

View File

@@ -78,7 +78,7 @@ def discover_outlinks(
from archivebox.core.models import Snapshot, ArchiveResult from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.crawls.models import Crawl from archivebox.crawls.models import Crawl
from archivebox.config import CONSTANTS from archivebox.config import CONSTANTS
from workers.orchestrator import Orchestrator from archivebox.workers.orchestrator import Orchestrator
created_by_id = get_or_create_system_user_pk() created_by_id = get_or_create_system_user_pk()
is_tty = sys.stdout.isatty() is_tty = sys.stdout.isatty()

View File

@@ -96,7 +96,7 @@ def run_plugins(
TYPE_SNAPSHOT, TYPE_ARCHIVERESULT TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
) )
from archivebox.core.models import Snapshot, ArchiveResult from archivebox.core.models import Snapshot, ArchiveResult
from workers.orchestrator import Orchestrator from archivebox.workers.orchestrator import Orchestrator
is_tty = sys.stdout.isatty() is_tty = sys.stdout.isatty()

View File

@@ -13,11 +13,9 @@ from archivebox.misc.util import docstring, enforce_types
@enforce_types @enforce_types
def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=False) -> None: def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
"""Initialize a new ArchiveBox collection in the current directory""" """Initialize a new ArchiveBox collection in the current directory"""
install = install or setup
from archivebox.config import CONSTANTS, VERSION, DATA_DIR from archivebox.config import CONSTANTS, VERSION, DATA_DIR
from archivebox.config.common import SERVER_CONFIG from archivebox.config.common import SERVER_CONFIG
from archivebox.config.collection import write_config_file from archivebox.config.collection import write_config_file
@@ -128,7 +126,8 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
print(f' [yellow]√ Added {len(orphaned_data_dir_links)} orphaned links from existing archive directories.[/yellow]') print(f' [yellow]√ Added {len(orphaned_data_dir_links)} orphaned links from existing archive directories.[/yellow]')
if pending_links: if pending_links:
Snapshot.objects.create_from_dicts(list(pending_links.values())) for link_dict in pending_links.values():
Snapshot.from_jsonl(link_dict)
# Hint for orphaned snapshot directories # Hint for orphaned snapshot directories
print() print()
@@ -187,7 +186,6 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
@click.option('--force', '-f', is_flag=True, help='Ignore unrecognized files in current directory and initialize anyway') @click.option('--force', '-f', is_flag=True, help='Ignore unrecognized files in current directory and initialize anyway')
@click.option('--quick', '-q', is_flag=True, help='Run any updates or migrations without rechecking all snapshot dirs') @click.option('--quick', '-q', is_flag=True, help='Run any updates or migrations without rechecking all snapshot dirs')
@click.option('--install', '-s', is_flag=True, help='Automatically install dependencies and extras used for archiving') @click.option('--install', '-s', is_flag=True, help='Automatically install dependencies and extras used for archiving')
@click.option('--setup', '-s', is_flag=True, help='DEPRECATED: equivalent to --install')
@docstring(init.__doc__) @docstring(init.__doc__)
def main(**kwargs) -> None: def main(**kwargs) -> None:
init(**kwargs) init(**kwargs)

View File

@@ -85,7 +85,7 @@ def install(dry_run: bool=False) -> None:
print() print()
# Run the crawl synchronously (this triggers on_Crawl hooks) # Run the crawl synchronously (this triggers on_Crawl hooks)
from workers.orchestrator import Orchestrator from archivebox.workers.orchestrator import Orchestrator
orchestrator = Orchestrator(exit_on_idle=True) orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.runloop() orchestrator.runloop()

View File

@@ -37,7 +37,7 @@ def orchestrator(daemon: bool = False, watch: bool = False) -> int:
0: All work completed successfully 0: All work completed successfully
1: Error occurred 1: Error occurred
""" """
from workers.orchestrator import Orchestrator from archivebox.workers.orchestrator import Orchestrator
if Orchestrator.is_running(): if Orchestrator.is_running():
print('[yellow]Orchestrator is already running[/yellow]') print('[yellow]Orchestrator is already running[/yellow]')

View File

@@ -74,7 +74,7 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
tail_multiple_worker_logs, tail_multiple_worker_logs,
is_port_in_use, is_port_in_use,
) )
from workers.orchestrator import Orchestrator from archivebox.workers.orchestrator import Orchestrator
import sys import sys
# Check if port is already in use # Check if port is already in use

View File

@@ -163,7 +163,7 @@ def create_snapshots(
# If --plugins is passed, run the orchestrator for those plugins # If --plugins is passed, run the orchestrator for those plugins
if plugins: if plugins:
from workers.orchestrator import Orchestrator from archivebox.workers.orchestrator import Orchestrator
rprint(f'[blue]Running plugins: {plugins or "all"}...[/blue]', file=sys.stderr) rprint(f'[blue]Running plugins: {plugins or "all"}...[/blue]', file=sys.stderr)
orchestrator = Orchestrator(exit_on_idle=True) orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.runloop() orchestrator.runloop()

View File

@@ -160,7 +160,7 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict:
total = Snapshot.objects.count() total = Snapshot.objects.count()
print(f'[*] Processing {total} snapshots from database...') print(f'[*] Processing {total} snapshots from database...')
for snapshot in Snapshot.objects.iterator(): for snapshot in Snapshot.objects.iterator(chunk_size=batch_size):
# Reconcile index.json with DB # Reconcile index.json with DB
snapshot.reconcile_with_index_json() snapshot.reconcile_with_index_json()
@@ -209,7 +209,7 @@ def process_filtered_snapshots(
total = snapshots.count() total = snapshots.count()
print(f'[*] Found {total} matching snapshots') print(f'[*] Found {total} matching snapshots')
for snapshot in snapshots.iterator(): for snapshot in snapshots.iterator(chunk_size=batch_size):
# Reconcile index.json with DB # Reconcile index.json with DB
snapshot.reconcile_with_index_json() snapshot.reconcile_with_index_json()

View File

@@ -17,7 +17,7 @@ TEST_CONFIG = {
'DATA_DIR': 'data.tests', 'DATA_DIR': 'data.tests',
'SAVE_ARCHIVE_DOT_ORG': 'False', 'SAVE_ARCHIVEDOTORG': 'False',
'SAVE_TITLE': 'False', 'SAVE_TITLE': 'False',
'USE_CURL': 'False', 'USE_CURL': 'False',

View File

@@ -32,7 +32,7 @@ from unittest.mock import patch, MagicMock
TEST_CONFIG = { TEST_CONFIG = {
'USE_COLOR': 'False', 'USE_COLOR': 'False',
'SHOW_PROGRESS': 'False', 'SHOW_PROGRESS': 'False',
'SAVE_ARCHIVE_DOT_ORG': 'False', 'SAVE_ARCHIVEDOTORG': 'False',
'SAVE_TITLE': 'True', # Fast extractor 'SAVE_TITLE': 'True', # Fast extractor
'SAVE_FAVICON': 'False', 'SAVE_FAVICON': 'False',
'SAVE_WGET': 'False', 'SAVE_WGET': 'False',

View File

@@ -216,6 +216,29 @@ def get_config(
if snapshot and hasattr(snapshot, "config") and snapshot.config: if snapshot and hasattr(snapshot, "config") and snapshot.config:
config.update(snapshot.config) config.update(snapshot.config)
# Normalize all aliases to canonical names (after all sources merged)
# This handles aliases that came from user/crawl/snapshot configs, not just env
try:
from archivebox.hooks import discover_plugin_configs
plugin_configs = discover_plugin_configs()
aliases_to_normalize = {} # {alias_key: canonical_key}
# Build alias mapping from all plugin schemas
for plugin_name, schema in plugin_configs.items():
for canonical_key, prop_schema in schema.get('properties', {}).items():
for alias in prop_schema.get('x-aliases', []):
aliases_to_normalize[alias] = canonical_key
# Normalize: copy alias values to canonical keys (aliases take precedence)
for alias_key, canonical_key in aliases_to_normalize.items():
if alias_key in config:
# Alias exists - copy to canonical key (overwriting any default)
config[canonical_key] = config[alias_key]
# Remove alias from config to keep it clean
del config[alias_key]
except ImportError:
pass
return config return config

View File

@@ -5,8 +5,12 @@ from django.apps import AppConfig
class CoreConfig(AppConfig): class CoreConfig(AppConfig):
name = 'archivebox.core' name = 'archivebox.core'
label = 'core'
def ready(self): def ready(self):
"""Register the archivebox.core.admin_site as the main django admin site""" """Register the archivebox.core.admin_site as the main django admin site"""
from archivebox.core.admin_site import register_admin_site from archivebox.core.admin_site import register_admin_site
register_admin_site() register_admin_site()
# Import models to register state machines with the registry
from archivebox.core import models # noqa: F401

View File

@@ -0,0 +1,57 @@
# Data migration to clear config fields that may contain invalid JSON
# This runs before 0025 to prevent CHECK constraint failures
from django.db import migrations


def clear_config_fields(apps, schema_editor):
    """Reset JSON config/stats columns to '{}' in legacy tables.

    Databases upgraded from 0.8.x may hold invalid JSON in these columns,
    which would trip the CHECK constraints added when later migrations
    rebuild the tables. Tables that do not exist (fresh installs, partial
    schemas) are skipped, and any per-table failure is logged and ignored
    so the migration is best-effort and never blocks an upgrade.
    """
    # Disable foreign key checks temporarily to allow updates.
    # NOTE(review): SQLite ignores this PRAGMA while a transaction is open,
    # so it only takes effect if this migration runs non-atomically — confirm.
    with schema_editor.connection.cursor() as cursor:
        cursor.execute("PRAGMA foreign_keys=OFF")

    # (table, JSON column) pairs that may contain invalid JSON.
    # Table existence is checked per-entry below, so entries for tables that
    # may be absent (e.g. crawls_crawlschedule) are safe to list directly.
    tables_to_clear = [
        ('crawls_seed', 'config'),
        ('crawls_crawl', 'config'),
        ('crawls_crawlschedule', 'config'),
        ('machine_machine', 'stats'),
        ('machine_machine', 'config'),
    ]

    for table_name, field_name in tables_to_clear:
        try:
            with schema_editor.connection.cursor() as cursor:
                # Check if table exists first. Names come from the hardcoded
                # list above, so f-string interpolation is injection-safe.
                cursor.execute(f"SELECT name FROM sqlite_master WHERE type='table' AND name='{table_name}'")
                if not cursor.fetchone():
                    print(f"  Skipping {table_name}.{field_name}: table does not exist")
                    continue
                # Set all non-NULL values to an empty JSON object
                cursor.execute(f"UPDATE {table_name} SET {field_name} = '{{}}' WHERE {field_name} IS NOT NULL")
                print(f"  Cleared {field_name} in {table_name}: {cursor.rowcount} rows")
        except Exception as e:
            # Best-effort: report and keep going with the remaining tables
            print(f"  Skipping {table_name}.{field_name}: {e}")

    # Re-enable foreign key checks
    with schema_editor.connection.cursor() as cursor:
        cursor.execute("PRAGMA foreign_keys=ON")


class Migration(migrations.Migration):

    dependencies = [
        ('core', '0023_new_schema'),
        ('crawls', '0001_initial'),
        ('machine', '0001_squashed'),
    ]

    operations = [
        migrations.RunPython(clear_config_fields, reverse_code=migrations.RunPython.noop),
    ]

View File

@@ -0,0 +1,28 @@
# Disable foreign key checks before 0025 to prevent CHECK constraint validation errors
from django.db import migrations
def disable_fk_checks(apps, schema_editor):
"""Temporarily disable foreign key checks."""
with schema_editor.connection.cursor() as cursor:
cursor.execute("PRAGMA foreign_keys=OFF")
print(" Disabled foreign key checks")
def enable_fk_checks(apps, schema_editor):
"""Re-enable foreign key checks."""
with schema_editor.connection.cursor() as cursor:
cursor.execute("PRAGMA foreign_keys=ON")
print(" Enabled foreign key checks")
class Migration(migrations.Migration):
dependencies = [
('core', '0024_b_clear_config_fields'),
]
operations = [
migrations.RunPython(disable_fk_checks, reverse_code=enable_fk_checks),
]

View File

@@ -0,0 +1,93 @@
# Fix crawls_crawl config field to avoid CHECK constraint errors during table rebuilds
from django.db import migrations
def fix_crawls_config(apps, schema_editor):
"""
Rebuild crawls_crawl table to fix CHECK constraints and make seed_id nullable.
Only runs for UPGRADES from 0.8.x (when crawls.0001_initial didn't exist yet).
For fresh installs, crawls.0001_initial creates the correct schema.
"""
with schema_editor.connection.cursor() as cursor:
# Check if this is an upgrade from old 0.8.x or a fresh install
# In fresh installs, crawls.0001_initial was applied, creating seed FK
# In upgrades, the table was created by old migrations before 0001_initial existed
cursor.execute("""
SELECT COUNT(*) FROM django_migrations
WHERE app='crawls' AND name='0001_initial'
""")
has_crawls_0001 = cursor.fetchone()[0] > 0
if has_crawls_0001:
# Fresh install - crawls.0001_initial already created the correct schema
# Just clear config to avoid CHECK constraint issues
print(" Fresh install detected - clearing config field only")
try:
cursor.execute('UPDATE "crawls_crawl" SET "config" = NULL')
except Exception as e:
print(f" Skipping config clear: {e}")
return
# Upgrade from 0.8.x - rebuild table to make seed_id nullable and remove CHECK constraint
print(" Upgrading from 0.8.x - rebuilding crawls_crawl table")
cursor.execute("PRAGMA foreign_keys=OFF")
# Backup
cursor.execute("CREATE TABLE crawls_crawl_backup AS SELECT * FROM crawls_crawl")
# Recreate without config CHECK constraint, with nullable seed_id
cursor.execute("DROP TABLE crawls_crawl")
cursor.execute("""
CREATE TABLE "crawls_crawl" (
"num_uses_failed" integer unsigned NOT NULL CHECK ("num_uses_failed" >= 0),
"num_uses_succeeded" integer unsigned NOT NULL CHECK ("num_uses_succeeded" >= 0),
"id" char(32) NOT NULL PRIMARY KEY,
"created_at" datetime NOT NULL,
"modified_at" datetime NOT NULL,
"urls" text NOT NULL,
"config" text,
"max_depth" smallint unsigned NOT NULL CHECK ("max_depth" >= 0),
"tags_str" varchar(1024) NOT NULL,
"persona_id" char(32) NULL,
"label" varchar(64) NOT NULL,
"notes" text NOT NULL,
"output_dir" varchar(512) NOT NULL,
"status" varchar(15) NOT NULL,
"retry_at" datetime NULL,
"created_by_id" integer NOT NULL REFERENCES "auth_user" ("id") DEFERRABLE INITIALLY DEFERRED,
"seed_id" char(32) NULL DEFAULT NULL,
"schedule_id" char(32) NULL REFERENCES "crawls_crawlschedule" ("id") DEFERRABLE INITIALLY DEFERRED
)
""")
# Restore data
cursor.execute("""
INSERT INTO "crawls_crawl" (
"num_uses_failed", "num_uses_succeeded", "id", "created_at", "modified_at",
"urls", "config", "max_depth", "tags_str", "persona_id", "label", "notes",
"output_dir", "status", "retry_at", "created_by_id", "seed_id", "schedule_id"
)
SELECT
"num_uses_failed", "num_uses_succeeded", "id", "created_at", "modified_at",
"urls", "config", "max_depth", "tags_str", "persona_id", "label", "notes",
"output_dir", "status", "retry_at", "created_by_id", "seed_id", "schedule_id"
FROM crawls_crawl_backup
""")
cursor.execute("DROP TABLE crawls_crawl_backup")
# NULL out config to avoid any invalid JSON
cursor.execute('UPDATE "crawls_crawl" SET "config" = NULL')
class Migration(migrations.Migration):
dependencies = [
('core', '0024_c_disable_fk_checks'),
('crawls', '0001_initial'),
]
operations = [
migrations.RunPython(fix_crawls_config, reverse_code=migrations.RunPython.noop),
]

View File

@@ -8,9 +8,7 @@ import django.db.models.deletion
class Migration(migrations.Migration): class Migration(migrations.Migration):
dependencies = [ dependencies = [
('core', '0023_new_schema'), ('core', '0024_d_fix_crawls_config'),
('crawls', '0001_initial'),
('machine', '0001_squashed'),
] ]
operations = [ operations = [

View File

@@ -10,6 +10,13 @@ from django.db import migrations, models
def populate_archiveresult_uuids(apps, schema_editor): def populate_archiveresult_uuids(apps, schema_editor):
"""Generate unique UUIDs for ArchiveResults that don't have one.""" """Generate unique UUIDs for ArchiveResults that don't have one."""
# Check if uuid column exists before trying to populate it
with schema_editor.connection.cursor() as cursor:
cursor.execute("PRAGMA table_info(core_archiveresult)")
columns = [row[1] for row in cursor.fetchall()]
if 'uuid' not in columns:
return # uuid column doesn't exist, skip this data migration
ArchiveResult = apps.get_model('core', 'ArchiveResult') ArchiveResult = apps.get_model('core', 'ArchiveResult')
for result in ArchiveResult.objects.filter(uuid__isnull=True): for result in ArchiveResult.objects.filter(uuid__isnull=True):
result.uuid = uuid_compat.uuid7() result.uuid = uuid_compat.uuid7()
@@ -21,6 +28,22 @@ def reverse_populate_uuids(apps, schema_editor):
pass pass
def remove_output_dir_if_exists(apps, schema_editor):
    """Drop the legacy output_dir column from both tables, if present.

    Raw PRAGMA inspection is used instead of migrations.RemoveField so the
    operation is a no-op on databases where the column was never created.
    """
    with schema_editor.connection.cursor() as cursor:
        for table in ("core_archiveresult", "core_snapshot"):
            # PRAGMA table_info rows are (cid, name, type, notnull, dflt, pk);
            # column name is at index 1.
            cursor.execute(f"PRAGMA table_info({table})")
            column_names = {row[1] for row in cursor.fetchall()}
            if 'output_dir' in column_names:
                cursor.execute(f"ALTER TABLE {table} DROP COLUMN output_dir")
class Migration(migrations.Migration): class Migration(migrations.Migration):
dependencies = [ dependencies = [
@@ -33,82 +56,90 @@ class Migration(migrations.Migration):
migrations.RunPython(populate_archiveresult_uuids, reverse_populate_uuids), migrations.RunPython(populate_archiveresult_uuids, reverse_populate_uuids),
# Remove output_dir fields (not needed, computed from snapshot) # Remove output_dir fields (not needed, computed from snapshot)
migrations.RemoveField( migrations.RunPython(remove_output_dir_if_exists, reverse_code=migrations.RunPython.noop),
model_name='archiveresult',
name='output_dir', # Update Django's migration state to match 0.9.x schema
), # Database already has correct types from 0.8.x, just update state
migrations.RemoveField( migrations.SeparateDatabaseAndState(
model_name='snapshot', state_operations=[
name='output_dir', # Archiveresult field alterations
migrations.AlterField(
model_name='archiveresult',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AlterField(
model_name='archiveresult',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresult_set', to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='archiveresult',
name='extractor',
field=models.CharField(db_index=True, max_length=32),
),
# Convert id from AutoField to UUIDField (database already has UUID CHAR(32))
migrations.AlterField(
model_name='archiveresult',
name='id',
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='archiveresult',
name='status',
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
),
# Snapshot field alterations
migrations.AlterField(
model_name='snapshot',
name='bookmarked_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AlterField(
model_name='snapshot',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AlterField(
model_name='snapshot',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='snapshot',
name='downloaded_at',
field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
),
migrations.AlterField(
model_name='snapshot',
name='id',
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
],
database_operations=[
# No actual database changes needed - schema is already correct from 0.8.x
],
), ),
# Archiveresult field alterations # SnapshotTag and Tag alterations - state only, DB already correct
migrations.AlterField( migrations.SeparateDatabaseAndState(
model_name='archiveresult', state_operations=[
name='created_at', migrations.AlterField(
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now), model_name='snapshottag',
), name='id',
migrations.AlterField( field=models.AutoField(primary_key=True, serialize=False),
model_name='archiveresult', ),
name='created_by', migrations.AlterField(
field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresult_set', to=settings.AUTH_USER_MODEL), model_name='tag',
), name='created_by',
migrations.AlterField( field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
model_name='archiveresult', ),
name='extractor', migrations.AlterUniqueTogether(
field=models.CharField(db_index=True, max_length=32), name='snapshottag',
), unique_together={('snapshot', 'tag')},
migrations.AlterField( ),
model_name='archiveresult', ],
name='id', database_operations=[],
field=models.AutoField(editable=False, primary_key=True, serialize=False),
),
migrations.AlterField(
model_name='archiveresult',
name='status',
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
),
# Snapshot field alterations
migrations.AlterField(
model_name='snapshot',
name='bookmarked_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AlterField(
model_name='snapshot',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AlterField(
model_name='snapshot',
name='created_by',
field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='snapshot',
name='downloaded_at',
field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
),
migrations.AlterField(
model_name='snapshot',
name='id',
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
# SnapshotTag and Tag alterations
migrations.AlterField(
model_name='snapshottag',
name='id',
field=models.AutoField(primary_key=True, serialize=False),
),
migrations.AlterField(
model_name='tag',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
),
migrations.AlterUniqueTogether(
name='snapshottag',
unique_together={('snapshot', 'tag')},
), ),
] ]

View File

@@ -13,68 +13,79 @@ class Migration(migrations.Migration):
] ]
operations = [ operations = [
# Add new output fields (keep old 'output' temporarily for migration) # Add new output fields using SeparateDatabaseAndState to avoid table rebuilds
migrations.AddField( migrations.SeparateDatabaseAndState(
model_name='archiveresult', state_operations=[
name='output_str', migrations.AddField(
field=models.TextField( model_name='archiveresult',
blank=True, name='output_str',
default='', field=models.TextField(
help_text='Human-readable output summary (e.g., "Downloaded 5 files")' blank=True,
), default='',
), help_text='Human-readable output summary (e.g., "Downloaded 5 files")'
),
migrations.AddField( ),
model_name='archiveresult', migrations.AddField(
name='output_json', model_name='archiveresult',
field=models.JSONField( name='output_json',
null=True, field=models.JSONField(
blank=True, null=True,
default=None, blank=True,
help_text='Structured metadata (headers, redirects, etc.) - should NOT duplicate ArchiveResult fields' default=None,
), help_text='Structured metadata (headers, redirects, etc.) - should NOT duplicate ArchiveResult fields'
), ),
),
migrations.AddField( migrations.AddField(
model_name='archiveresult', model_name='archiveresult',
name='output_files', name='output_files',
field=models.JSONField( field=models.JSONField(
default=dict, default=dict,
help_text='Dict of {relative_path: {metadata}} - values are empty dicts for now, extensible for future metadata' help_text='Dict of {relative_path: {metadata}} - values are empty dicts for now, extensible for future metadata'
), ),
), ),
migrations.AddField(
migrations.AddField( model_name='archiveresult',
model_name='archiveresult', name='output_size',
name='output_size', field=models.BigIntegerField(
field=models.BigIntegerField( default=0,
default=0, help_text='Total recursive size in bytes of all output files'
help_text='Total recursive size in bytes of all output files' ),
), ),
), migrations.AddField(
model_name='archiveresult',
migrations.AddField( name='output_mimetypes',
model_name='archiveresult', field=models.CharField(
name='output_mimetypes', max_length=512,
field=models.CharField( blank=True,
max_length=512, default='',
blank=True, help_text='CSV of mimetypes sorted by size descending'
default='', ),
help_text='CSV of mimetypes sorted by size descending' ),
), migrations.AddField(
), model_name='archiveresult',
name='binary',
# Add binary FK (optional) field=models.ForeignKey(
migrations.AddField( 'machine.Binary',
model_name='archiveresult', on_delete=models.SET_NULL,
name='binary', null=True,
field=models.ForeignKey( blank=True,
'machine.Binary', related_name='archiveresults',
on_delete=models.SET_NULL, help_text='Primary binary used by this hook (optional)'
null=True, ),
blank=True, ),
related_name='archiveresults', ],
help_text='Primary binary used by this hook (optional)' database_operations=[
), migrations.RunSQL(
sql="""
ALTER TABLE core_archiveresult ADD COLUMN output_str TEXT DEFAULT '';
ALTER TABLE core_archiveresult ADD COLUMN output_json TEXT;
ALTER TABLE core_archiveresult ADD COLUMN output_files TEXT DEFAULT '{}';
ALTER TABLE core_archiveresult ADD COLUMN output_size BIGINT DEFAULT 0;
ALTER TABLE core_archiveresult ADD COLUMN output_mimetypes VARCHAR(512) DEFAULT '';
ALTER TABLE core_archiveresult ADD COLUMN binary_id CHAR(32) REFERENCES machine_binary(id);
""",
reverse_sql=migrations.RunSQL.noop,
),
],
), ),
] ]

View File

@@ -12,27 +12,46 @@ def migrate_output_field(apps, schema_editor):
Logic: Logic:
- If output contains JSON {...}, move to output_json - If output contains JSON {...}, move to output_json
- Otherwise, move to output_str - Otherwise, move to output_str
Use raw SQL to avoid CHECK constraint issues during migration.
""" """
ArchiveResult = apps.get_model('core', 'ArchiveResult') # Use raw SQL to migrate data without triggering CHECK constraints
with schema_editor.connection.cursor() as cursor:
# Get all archive results
cursor.execute("""
SELECT id, output FROM core_archiveresult
""")
for ar in ArchiveResult.objects.all().iterator(): for row in cursor.fetchall():
old_output = ar.output or '' ar_id, old_output = row
old_output = old_output or ''
# Case 1: JSON output # Case 1: JSON output
if old_output.strip().startswith('{'): if old_output.strip().startswith('{'):
try: try:
parsed = json.loads(old_output) # Validate it's actual JSON
ar.output_json = parsed parsed = json.loads(old_output)
ar.output_str = '' # Update with JSON - cast to JSON to satisfy CHECK constraint
except json.JSONDecodeError: json_str = json.dumps(parsed)
# Not valid JSON, treat as string cursor.execute("""
ar.output_str = old_output UPDATE core_archiveresult
SET output_str = '', output_json = json(?)
# Case 2: File path or plain string WHERE id = ?
else: """, (json_str, ar_id))
ar.output_str = old_output except json.JSONDecodeError:
# Not valid JSON, treat as string
ar.save(update_fields=['output_str', 'output_json']) cursor.execute("""
UPDATE core_archiveresult
SET output_str = ?, output_json = NULL
WHERE id = ?
""", (old_output, ar_id))
# Case 2: File path or plain string
else:
cursor.execute("""
UPDATE core_archiveresult
SET output_str = ?, output_json = NULL
WHERE id = ?
""", (old_output, ar_id))
def reverse_migrate(apps, schema_editor): def reverse_migrate(apps, schema_editor):

View File

@@ -16,43 +16,62 @@ class Migration(migrations.Migration):
] ]
operations = [ operations = [
migrations.AlterField( # Update Django's state only - database already has correct schema from 0029
model_name='archiveresult', migrations.SeparateDatabaseAndState(
name='binary', state_operations=[
field=models.ForeignKey(blank=True, help_text='Primary binary used by this hook', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='archiveresults', to='machine.binary'), migrations.AlterField(
model_name='archiveresult',
name='binary',
field=models.ForeignKey(blank=True, help_text='Primary binary used by this hook', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='archiveresults', to='machine.binary'),
),
migrations.AlterField(
model_name='archiveresult',
name='output_files',
field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'),
),
migrations.AlterField(
model_name='archiveresult',
name='output_json',
field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True),
),
migrations.AlterField(
model_name='archiveresult',
name='output_mimetypes',
field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512),
),
migrations.AlterField(
model_name='archiveresult',
name='output_size',
field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'),
),
migrations.AlterField(
model_name='archiveresult',
name='output_str',
field=models.TextField(blank=True, default='', help_text='Human-readable output summary'),
),
migrations.AlterField(
model_name='archiveresult',
name='uuid',
field=models.UUIDField(blank=True, db_index=True, default=uuid_compat.uuid7, null=True),
),
],
database_operations=[
# No database changes needed - columns already exist with correct types
],
), ),
migrations.AlterField( # Add unique constraint without table rebuild
model_name='archiveresult', migrations.SeparateDatabaseAndState(
name='output_files', state_operations=[
field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'), migrations.AddConstraint(
), model_name='snapshot',
migrations.AlterField( constraint=models.UniqueConstraint(fields=('timestamp',), name='unique_timestamp'),
model_name='archiveresult', ),
name='output_json', ],
field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True), database_operations=[
), migrations.RunSQL(
migrations.AlterField( sql="CREATE UNIQUE INDEX IF NOT EXISTS unique_timestamp ON core_snapshot (timestamp);",
model_name='archiveresult', reverse_sql="DROP INDEX IF EXISTS unique_timestamp;",
name='output_mimetypes', ),
field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512), ],
),
migrations.AlterField(
model_name='archiveresult',
name='output_size',
field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'),
),
migrations.AlterField(
model_name='archiveresult',
name='output_str',
field=models.TextField(blank=True, default='', help_text='Human-readable output summary'),
),
migrations.AlterField(
model_name='archiveresult',
name='uuid',
field=models.UUIDField(blank=True, db_index=True, default=uuid_compat.uuid7, null=True),
),
migrations.AddConstraint(
model_name='snapshot',
constraint=models.UniqueConstraint(fields=('timestamp',), name='unique_timestamp'),
), ),
] ]

View File

@@ -10,20 +10,35 @@ class Migration(migrations.Migration):
] ]
operations = [ operations = [
migrations.RenameField( # Use SeparateDatabaseAndState to avoid table rebuilds that would re-add CHECK constraints
model_name='archiveresult', migrations.SeparateDatabaseAndState(
old_name='extractor', state_operations=[
new_name='plugin', migrations.RenameField(
), model_name='archiveresult',
migrations.AddField( old_name='extractor',
model_name='archiveresult', new_name='plugin',
name='hook_name', ),
field=models.CharField( migrations.AddField(
blank=True, model_name='archiveresult',
default='', name='hook_name',
max_length=255, field=models.CharField(
db_index=True, blank=True,
help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)' default='',
), max_length=255,
db_index=True,
help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)'
),
),
],
database_operations=[
migrations.RunSQL(
sql="""
ALTER TABLE core_archiveresult RENAME COLUMN extractor TO plugin;
ALTER TABLE core_archiveresult ADD COLUMN hook_name VARCHAR(255) DEFAULT '' NOT NULL;
CREATE INDEX IF NOT EXISTS core_archiveresult_hook_name_idx ON core_archiveresult (hook_name);
""",
reverse_sql=migrations.RunSQL.noop,
),
],
), ),
] ]

View File

@@ -11,13 +11,27 @@ class Migration(migrations.Migration):
] ]
operations = [ operations = [
migrations.AddField( # Use SeparateDatabaseAndState to avoid table rebuild that would fail on config NOT NULL constraint
model_name='snapshot', migrations.SeparateDatabaseAndState(
name='current_step', state_operations=[
field=models.PositiveSmallIntegerField( migrations.AddField(
default=0, model_name='snapshot',
db_index=True, name='current_step',
help_text='Current hook step being executed (0-9). Used for sequential hook execution.' field=models.PositiveSmallIntegerField(
), default=0,
db_index=True,
help_text='Current hook step being executed (0-9). Used for sequential hook execution.'
),
),
],
database_operations=[
migrations.RunSQL(
sql="""
ALTER TABLE core_snapshot ADD COLUMN current_step SMALLINT UNSIGNED DEFAULT 0 NOT NULL;
CREATE INDEX IF NOT EXISTS core_snapshot_current_step_idx ON core_snapshot (current_step);
""",
reverse_sql=migrations.RunSQL.noop,
),
],
), ),
] ]

View File

@@ -54,7 +54,7 @@ class Migration(migrations.Migration):
dependencies = [ dependencies = [
('core', '0034_snapshot_current_step'), ('core', '0034_snapshot_current_step'),
('crawls', '0004_alter_crawl_output_dir'), ('crawls', '0005_drop_seed_id_column'),
] ]
operations = [ operations = [
@@ -64,16 +64,24 @@ class Migration(migrations.Migration):
reverse_code=migrations.RunPython.noop, reverse_code=migrations.RunPython.noop,
), ),
# Step 2: Make crawl non-nullable # Step 2 & 3: Update Django's state only - leave created_by_id column in database (unused but harmless)
migrations.AlterField( migrations.SeparateDatabaseAndState(
model_name='snapshot', state_operations=[
name='crawl', # Make crawl non-nullable
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'), migrations.AlterField(
), model_name='snapshot',
name='crawl',
# Step 3: Remove created_by field field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
migrations.RemoveField( ),
model_name='snapshot', # Remove created_by field from Django's state
name='created_by', migrations.RemoveField(
model_name='snapshot',
name='created_by',
),
],
database_operations=[
# No database changes - crawl_id already exists and NOT NULL constraint will be enforced by model
# created_by_id column remains in database but is unused
],
), ),
] ]

View File

@@ -10,10 +10,18 @@ class Migration(migrations.Migration):
] ]
operations = [ operations = [
# Remove created_by field from ArchiveResult # Remove created_by field from ArchiveResult (state only)
# No data migration needed - created_by can be accessed via snapshot.crawl.created_by # No data migration needed - created_by can be accessed via snapshot.crawl.created_by
migrations.RemoveField( # Leave created_by_id column in database (unused but harmless, avoids table rebuild)
model_name='archiveresult', migrations.SeparateDatabaseAndState(
name='created_by', state_operations=[
migrations.RemoveField(
model_name='archiveresult',
name='created_by',
),
],
database_operations=[
# No database changes - leave created_by_id column in place to avoid table rebuild
],
), ),
] ]

View File

@@ -0,0 +1,44 @@
# Generated by Django 6.0 on 2025-12-29 06:45
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0036_remove_archiveresult_created_by'),
]
operations = [
# Update Django's state only - database columns remain for backwards compat
migrations.SeparateDatabaseAndState(
state_operations=[
migrations.RemoveField(
model_name='archiveresult',
name='output_dir',
),
migrations.RemoveField(
model_name='snapshot',
name='output_dir',
),
migrations.AlterField(
model_name='archiveresult',
name='config',
field=models.JSONField(blank=True, default=dict, null=True),
),
migrations.AlterField(
model_name='snapshot',
name='config',
field=models.JSONField(blank=True, default=dict, null=True),
),
migrations.AlterField(
model_name='snapshot',
name='tags',
field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', through_fields=('snapshot', 'tag'), to='core.tag'),
),
],
database_operations=[
# No database changes - columns remain in place to avoid table rebuilds
],
),
]

View File

@@ -0,0 +1,84 @@
# Add missing columns to ArchiveResult and remove created_by_id from Snapshot
from django.db import migrations, models, connection
import django.utils.timezone
def add_columns_if_not_exist(apps, schema_editor):
"""Add columns to ArchiveResult only if they don't already exist."""
with connection.cursor() as cursor:
# Get existing columns
cursor.execute("PRAGMA table_info(core_archiveresult)")
existing_columns = {row[1] for row in cursor.fetchall()}
# Add num_uses_failed if it doesn't exist
if 'num_uses_failed' not in existing_columns:
cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN num_uses_failed integer unsigned NOT NULL DEFAULT 0 CHECK (num_uses_failed >= 0)")
# Add num_uses_succeeded if it doesn't exist
if 'num_uses_succeeded' not in existing_columns:
cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN num_uses_succeeded integer unsigned NOT NULL DEFAULT 0 CHECK (num_uses_succeeded >= 0)")
# Add config if it doesn't exist
if 'config' not in existing_columns:
cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN config text NULL")
# Add retry_at if it doesn't exist
if 'retry_at' not in existing_columns:
cursor.execute("ALTER TABLE core_archiveresult ADD COLUMN retry_at datetime NULL")
cursor.execute("CREATE INDEX IF NOT EXISTS core_archiveresult_retry_at_idx ON core_archiveresult(retry_at)")
class Migration(migrations.Migration):
dependencies = [
('core', '0037_remove_archiveresult_output_dir_and_more'),
]
operations = [
# Add missing columns to ArchiveResult
migrations.SeparateDatabaseAndState(
state_operations=[
migrations.AddField(
model_name='archiveresult',
name='num_uses_failed',
field=models.PositiveIntegerField(default=0),
),
migrations.AddField(
model_name='archiveresult',
name='num_uses_succeeded',
field=models.PositiveIntegerField(default=0),
),
migrations.AddField(
model_name='archiveresult',
name='config',
field=models.JSONField(blank=True, default=dict, null=True),
),
migrations.AddField(
model_name='archiveresult',
name='retry_at',
field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
),
],
database_operations=[
migrations.RunPython(add_columns_if_not_exist, reverse_code=migrations.RunPython.noop),
],
),
# Drop created_by_id from Snapshot (database only, already removed from model in 0035)
migrations.SeparateDatabaseAndState(
state_operations=[
# No state changes - field already removed in 0035
],
database_operations=[
migrations.RunSQL(
sql="""
-- Drop index first, then column
DROP INDEX IF EXISTS core_snapshot_created_by_id_6dbd6149;
ALTER TABLE core_snapshot DROP COLUMN created_by_id;
""",
reverse_sql=migrations.RunSQL.noop,
),
],
),
]

View File

@@ -0,0 +1,30 @@
# Fix num_uses_failed and num_uses_succeeded string values to integers
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('core', '0038_fix_missing_columns'),
]
operations = [
# Fix string values that got inserted as literals instead of integers
migrations.RunSQL(
sql="""
UPDATE core_snapshot
SET num_uses_failed = 0
WHERE typeof(num_uses_failed) = 'text' OR num_uses_failed = 'num_uses_failed';
UPDATE core_snapshot
SET num_uses_succeeded = 0
WHERE typeof(num_uses_succeeded) = 'text' OR num_uses_succeeded = 'num_uses_succeeded';
UPDATE core_snapshot
SET depth = 0
WHERE typeof(depth) = 'text' OR depth = 'depth';
""",
reverse_sql=migrations.RunSQL.noop,
),
]

View File

@@ -911,7 +911,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
) )
merged = 0 merged = 0
for dup in duplicates.iterator(): for dup in duplicates.iterator(chunk_size=500):
snapshots = list( snapshots = list(
cls.objects cls.objects
.filter(url=dup['url'], timestamp=dup['timestamp']) .filter(url=dup['url'], timestamp=dup['timestamp'])

File diff suppressed because it is too large Load Diff

View File

@@ -91,7 +91,11 @@ def plugin_thumbnail(context, result) -> str:
'output_path': output_path, 'output_path': output_path,
'plugin': plugin, 'plugin': plugin,
}) })
return mark_safe(tpl.render(ctx)) rendered = tpl.render(ctx)
# Only return non-empty content (strip whitespace to check)
if rendered.strip():
return mark_safe(rendered)
return ''
except Exception: except Exception:
return '' return ''
@@ -119,7 +123,11 @@ def plugin_embed(context, result) -> str:
'output_path': output_path, 'output_path': output_path,
'plugin': plugin, 'plugin': plugin,
}) })
return mark_safe(tpl.render(ctx)) rendered = tpl.render(ctx)
# Only return non-empty content (strip whitespace to check)
if rendered.strip():
return mark_safe(rendered)
return ''
except Exception: except Exception:
return '' return ''
@@ -147,7 +155,11 @@ def plugin_fullscreen(context, result) -> str:
'output_path': output_path, 'output_path': output_path,
'plugin': plugin, 'plugin': plugin,
}) })
return mark_safe(tpl.render(ctx)) rendered = tpl.render(ctx)
# Only return non-empty content (strip whitespace to check)
if rendered.strip():
return mark_safe(rendered)
return ''
except Exception: except Exception:
return '' return ''

View File

@@ -539,7 +539,7 @@ from django.http import JsonResponse
def live_progress_view(request): def live_progress_view(request):
"""Simple JSON endpoint for live progress status - used by admin progress monitor.""" """Simple JSON endpoint for live progress status - used by admin progress monitor."""
try: try:
from workers.orchestrator import Orchestrator from archivebox.workers.orchestrator import Orchestrator
from archivebox.crawls.models import Crawl from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot, ArchiveResult from archivebox.core.models import Snapshot, ArchiveResult
from django.db.models import Case, When, Value, IntegerField from django.db.models import Case, When, Value, IntegerField

View File

@@ -4,3 +4,8 @@ from django.apps import AppConfig
class CrawlsConfig(AppConfig): class CrawlsConfig(AppConfig):
default_auto_field = "django.db.models.BigAutoField" default_auto_field = "django.db.models.BigAutoField"
name = "archivebox.crawls" name = "archivebox.crawls"
label = "crawls"
def ready(self):
"""Import models to register state machines with the registry"""
from archivebox.crawls.models import CrawlMachine # noqa: F401

View File

@@ -17,39 +17,62 @@ class Migration(migrations.Migration):
] ]
operations = [ operations = [
# Remove the seed foreign key from Crawl # Remove the seed foreign key from Crawl (no-op if already removed by core/0024_d)
migrations.RemoveField( migrations.RunPython(
model_name='crawl', code=lambda apps, schema_editor: None,
name='seed', reverse_code=migrations.RunPython.noop,
), ),
# Delete the Seed model entirely # Delete the Seed model entirely (already done)
migrations.DeleteModel( migrations.RunPython(
name='Seed', code=lambda apps, schema_editor: None,
reverse_code=migrations.RunPython.noop,
), ),
# Update fields to new schema # Drop seed_id column if it exists, then update Django's migration state
migrations.AlterField( migrations.SeparateDatabaseAndState(
model_name='crawl', state_operations=[
name='created_by', # Update fields to new schema
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL), migrations.AlterField(
), model_name='crawl',
migrations.AlterField( name='created_by',
model_name='crawl', field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
name='id', ),
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True), migrations.AlterField(
), model_name='crawl',
migrations.AlterField( name='id',
model_name='crawl', field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
name='urls', ),
field=models.TextField(help_text='Newline-separated list of URLs to crawl'), migrations.AlterField(
), model_name='crawl',
migrations.AlterField( name='urls',
model_name='crawlschedule', field=models.TextField(help_text='Newline-separated list of URLs to crawl'),
name='created_by', ),
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL), migrations.AlterField(
), model_name='crawlschedule',
migrations.AlterField( name='created_by',
model_name='crawlschedule', field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
name='id', ),
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True), migrations.AlterField(
model_name='crawlschedule',
name='id',
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
],
database_operations=[
# Drop seed table and NULL out seed_id FK values
migrations.RunSQL(
sql="""
PRAGMA foreign_keys=OFF;
-- NULL out seed_id values in crawls_crawl
UPDATE crawls_crawl SET seed_id = NULL;
-- Drop seed table if it exists
DROP TABLE IF EXISTS crawls_seed;
PRAGMA foreign_keys=ON;
""",
reverse_sql=migrations.RunSQL.noop,
),
],
), ),
] ]

View File

@@ -8,12 +8,21 @@ class Migration(migrations.Migration):
dependencies = [ dependencies = [
('crawls', '0002_drop_seed_model'), ('crawls', '0002_drop_seed_model'),
('core', '0024_d_fix_crawls_config'), # Depends on config fix
] ]
operations = [ operations = [
migrations.AlterField( # Update Django's state only to avoid table rebuild that would re-apply old constraints
model_name='crawl', migrations.SeparateDatabaseAndState(
name='output_dir', state_operations=[
field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/private/tmp/test_archivebox_migrations/archive')), migrations.AlterField(
model_name='crawl',
name='output_dir',
field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/private/tmp/test_archivebox_migrations/archive')),
),
],
database_operations=[
# No database changes - output_dir type change is cosmetic for Django admin
],
), ),
] ]

View File

@@ -11,9 +11,17 @@ class Migration(migrations.Migration):
] ]
operations = [ operations = [
migrations.AlterField( # Update Django's state only to avoid table rebuild that would re-apply old constraints
model_name='crawl', migrations.SeparateDatabaseAndState(
name='output_dir', state_operations=[
field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/private/tmp/archivebox-makemigrations/archive')), migrations.AlterField(
model_name='crawl',
name='output_dir',
field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/private/tmp/archivebox-makemigrations/archive')),
),
],
database_operations=[
# No database changes - output_dir type change is cosmetic for Django admin
],
), ),
] ]

View File

@@ -0,0 +1,28 @@
# Drop seed_id column from Django's state (leave in database to avoid FK issues)
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('crawls', '0004_alter_crawl_output_dir'),
]
operations = [
# Update Django's state only - leave seed_id column in database (unused but harmless)
# This avoids FK mismatch errors with crawls_crawlschedule
migrations.SeparateDatabaseAndState(
state_operations=[
# Remove seed field from Django's migration state
migrations.RemoveField(
model_name='crawl',
name='seed',
),
],
database_operations=[
# No database changes - seed_id column remains to avoid FK rebuild issues
# crawls_seed table can be manually dropped by DBA if needed
],
),
]

View File

@@ -0,0 +1,35 @@
# Generated by Django 6.0 on 2025-12-29 06:45
import pathlib
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('crawls', '0005_drop_seed_id_column'),
]
operations = [
# Update Django's state only - database already correct
migrations.SeparateDatabaseAndState(
state_operations=[
migrations.AlterField(
model_name='crawl',
name='config',
field=models.JSONField(blank=True, default=dict, null=True),
),
migrations.AlterField(
model_name='crawl',
name='output_dir',
field=models.FilePathField(blank=True, default='', path=pathlib.PurePosixPath('/Users/squash/Local/Code/archiveboxes/archivebox-nue/data/archive')),
),
migrations.DeleteModel(
name='Seed',
),
],
database_operations=[
# No database changes - Seed table already dropped in 0005
],
),
]

View File

@@ -65,7 +65,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
modified_at = models.DateTimeField(auto_now=True) modified_at = models.DateTimeField(auto_now=True)
urls = models.TextField(blank=False, null=False, help_text='Newline-separated list of URLs to crawl') urls = models.TextField(blank=False, null=False, help_text='Newline-separated list of URLs to crawl')
config = models.JSONField(default=dict) config = models.JSONField(default=dict, null=True, blank=True)
max_depth = models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)]) max_depth = models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)])
tags_str = models.CharField(max_length=1024, blank=True, null=False, default='') tags_str = models.CharField(max_length=1024, blank=True, null=False, default='')
persona_id = models.UUIDField(null=True, blank=True) persona_id = models.UUIDField(null=True, blank=True)
@@ -77,7 +77,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED) status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now) retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
state_machine_name = 'crawls.models.CrawlMachine' state_machine_name = 'archivebox.crawls.models.CrawlMachine'
retry_at_field_name = 'retry_at' retry_at_field_name = 'retry_at'
state_field_name = 'status' state_field_name = 'status'
StatusChoices = ModelWithStateMachine.StatusChoices StatusChoices = ModelWithStateMachine.StatusChoices
@@ -190,7 +190,6 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
'status': Snapshot.INITIAL_STATE, 'status': Snapshot.INITIAL_STATE,
'retry_at': timezone.now(), 'retry_at': timezone.now(),
'timestamp': str(timezone.now().timestamp()), 'timestamp': str(timezone.now().timestamp()),
'created_by_id': self.created_by_id,
'depth': 0, 'depth': 0,
}, },
) )
@@ -290,7 +289,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
'timestamp': timestamp or str(timezone.now().timestamp()), 'timestamp': timestamp or str(timezone.now().timestamp()),
'status': Snapshot.INITIAL_STATE, 'status': Snapshot.INITIAL_STATE,
'retry_at': timezone.now(), 'retry_at': timezone.now(),
'created_by_id': self.created_by_id, # Note: created_by removed in 0.9.0 - Snapshot inherits from Crawl
} }
) )

View File

@@ -7,8 +7,13 @@ class MachineConfig(AppConfig):
default_auto_field = 'django.db.models.BigAutoField' default_auto_field = 'django.db.models.BigAutoField'
name = 'archivebox.machine' name = 'archivebox.machine'
label = 'machine' # Explicit label for migrations
verbose_name = 'Machine Info' verbose_name = 'Machine Info'
def ready(self):
"""Import models to register state machines with the registry"""
from archivebox.machine import models # noqa: F401
def register_admin(admin_site): def register_admin(admin_site):
from archivebox.machine.admin import register_admin from archivebox.machine.admin import register_admin

View File

@@ -85,6 +85,12 @@ class Migration(migrations.Migration):
('version', models.CharField(blank=True, default=None, max_length=32)), ('version', models.CharField(blank=True, default=None, max_length=32)),
('sha256', models.CharField(blank=True, default=None, max_length=64)), ('sha256', models.CharField(blank=True, default=None, max_length=64)),
('machine', models.ForeignKey(blank=True, default=None, on_delete=django.db.models.deletion.CASCADE, to='machine.machine')), ('machine', models.ForeignKey(blank=True, default=None, on_delete=django.db.models.deletion.CASCADE, to='machine.machine')),
# Fields added in migration 0005 (included here for fresh installs)
('binproviders', models.CharField(blank=True, default='env', max_length=127)),
('output_dir', models.CharField(blank=True, default='', max_length=255)),
('overrides', models.JSONField(blank=True, default=dict)),
('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True)),
('status', models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('succeeded', 'Succeeded'), ('failed', 'Failed')], db_index=True, default='queued', max_length=16)),
# dependency FK removed - Dependency model deleted # dependency FK removed - Dependency model deleted
], ],
options={ options={

View File

@@ -0,0 +1,104 @@
# Generated by Django 6.0 on 2025-12-29 06:45
import django.db.models.deletion
import django.utils.timezone
from archivebox.uuid_compat import uuid7
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('machine', '0004_drop_dependency_table'),
]
operations = [
# Update Django's state only - database already has correct schema
migrations.SeparateDatabaseAndState(
state_operations=[
migrations.AddField(
model_name='binary',
name='binproviders',
field=models.CharField(blank=True, default='env', help_text='Comma-separated list of allowed providers: apt,brew,pip,npm,env', max_length=127),
),
migrations.AddField(
model_name='binary',
name='output_dir',
field=models.CharField(blank=True, default='', help_text='Directory where installation hook logs are stored', max_length=255),
),
migrations.AddField(
model_name='binary',
name='overrides',
field=models.JSONField(blank=True, default=dict, help_text="Provider-specific overrides: {'apt': {'packages': ['pkg']}, ...}"),
),
migrations.AddField(
model_name='binary',
name='retry_at',
field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, help_text='When to retry this binary installation', null=True),
),
migrations.AddField(
model_name='binary',
name='status',
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('succeeded', 'Succeeded'), ('failed', 'Failed')], db_index=True, default='queued', max_length=16),
),
migrations.AlterField(
model_name='binary',
name='abspath',
field=models.CharField(blank=True, default='', max_length=255),
),
migrations.AlterField(
model_name='binary',
name='binprovider',
field=models.CharField(blank=True, default='', help_text='Provider that successfully installed this binary', max_length=31),
),
migrations.AlterField(
model_name='binary',
name='id',
field=models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='binary',
name='machine',
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='machine.machine'),
),
migrations.AlterField(
model_name='binary',
name='name',
field=models.CharField(blank=True, db_index=True, default='', max_length=63),
),
migrations.AlterField(
model_name='binary',
name='sha256',
field=models.CharField(blank=True, default='', max_length=64),
),
migrations.AlterField(
model_name='binary',
name='version',
field=models.CharField(blank=True, default='', max_length=32),
),
migrations.AlterField(
model_name='machine',
name='config',
field=models.JSONField(blank=True, default=dict, help_text='Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)', null=True),
),
migrations.AlterField(
model_name='machine',
name='id',
field=models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='machine',
name='stats',
field=models.JSONField(blank=True, default=dict, null=True),
),
migrations.AlterField(
model_name='networkinterface',
name='id',
field=models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
],
database_operations=[
# No database changes - schema already correct from previous migrations
],
),
]

View File

@@ -44,8 +44,8 @@ class Machine(ModelWithHealthStats):
os_platform = models.CharField(max_length=63, default=None, null=False) os_platform = models.CharField(max_length=63, default=None, null=False)
os_release = models.CharField(max_length=63, default=None, null=False) os_release = models.CharField(max_length=63, default=None, null=False)
os_kernel = models.CharField(max_length=255, default=None, null=False) os_kernel = models.CharField(max_length=255, default=None, null=False)
stats = models.JSONField(default=dict, null=False) stats = models.JSONField(default=dict, null=True, blank=True)
config = models.JSONField(default=dict, null=False, blank=True, config = models.JSONField(default=dict, null=True, blank=True,
help_text="Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)") help_text="Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)")
num_uses_failed = models.PositiveIntegerField(default=0) num_uses_failed = models.PositiveIntegerField(default=0)
num_uses_succeeded = models.PositiveIntegerField(default=0) num_uses_succeeded = models.PositiveIntegerField(default=0)
@@ -213,7 +213,7 @@ class Binary(ModelWithHealthStats):
num_uses_failed = models.PositiveIntegerField(default=0) num_uses_failed = models.PositiveIntegerField(default=0)
num_uses_succeeded = models.PositiveIntegerField(default=0) num_uses_succeeded = models.PositiveIntegerField(default=0)
state_machine_name: str = 'machine.models.BinaryMachine' state_machine_name: str = 'archivebox.machine.models.BinaryMachine'
objects: BinaryManager = BinaryManager() objects: BinaryManager = BinaryManager()

View File

@@ -4,3 +4,4 @@ from django.apps import AppConfig
class SessionsConfig(AppConfig): class SessionsConfig(AppConfig):
default_auto_field = "django.db.models.BigAutoField" default_auto_field = "django.db.models.BigAutoField"
name = "archivebox.personas" name = "archivebox.personas"
label = "personas"

View File

@@ -21,7 +21,7 @@
# # COOKIES_TXT_FILE: '/path/to/cookies.txt', # # COOKIES_TXT_FILE: '/path/to/cookies.txt',
# # CHROME_USER_DATA_DIR: '/path/to/chrome/user/data/dir', # # CHROME_USER_DATA_DIR: '/path/to/chrome/user/data/dir',
# # CHECK_SSL_VALIDITY: False, # # CHECK_SSL_VALIDITY: False,
# # SAVE_ARCHIVE_DOT_ORG: True, # # SAVE_ARCHIVEDOTORG: True,
# # CHROME_BINARY: 'chromium' # # CHROME_BINARY: 'chromium'
# # ... # # ...
# # } # # }

View File

@@ -63,7 +63,7 @@ def test_ripgrep_hook_detects_binary_from_path():
def test_ripgrep_hook_skips_when_backend_not_ripgrep(): def test_ripgrep_hook_skips_when_backend_not_ripgrep():
"""Test that ripgrep hook exits silently when search backend is not ripgrep.""" """Test that ripgrep hook exits silently when search backend is not ripgrep."""
hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py' hook_path = Path(__file__).parent.parent / 'on_Crawl__00_install_ripgrep.py'
env = os.environ.copy() env = os.environ.copy()
env['SEARCH_BACKEND_ENGINE'] = 'sqlite' # Different backend env['SEARCH_BACKEND_ENGINE'] = 'sqlite' # Different backend
@@ -82,7 +82,7 @@ def test_ripgrep_hook_skips_when_backend_not_ripgrep():
def test_ripgrep_hook_handles_absolute_path(): def test_ripgrep_hook_handles_absolute_path():
"""Test that ripgrep hook works when RIPGREP_BINARY is an absolute path.""" """Test that ripgrep hook works when RIPGREP_BINARY is an absolute path."""
hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py' hook_path = Path(__file__).parent.parent / 'on_Crawl__00_install_ripgrep.py'
rg_path = shutil.which('rg') rg_path = shutil.which('rg')
if not rg_path: if not rg_path:
@@ -222,7 +222,7 @@ def test_ripgrep_only_detected_when_backend_enabled():
if not shutil.which('rg'): if not shutil.which('rg'):
pytest.skip("ripgrep not installed") pytest.skip("ripgrep not installed")
hook_path = Path(__file__).parent.parent / 'on_Crawl__00_validate_ripgrep.py' hook_path = Path(__file__).parent.parent / 'on_Crawl__00_install_ripgrep.py'
# Test 1: With ripgrep backend - should output Binary record # Test 1: With ripgrep backend - should output Binary record
env1 = os.environ.copy() env1 = os.environ.copy()

View File

@@ -360,9 +360,11 @@
<div class="row header-bottom-frames"> <div class="row header-bottom-frames">
{% for result_info in archiveresults %} {% for result_info in archiveresults %}
{% if result_info.result %} {% if result_info.result %}
{% plugin_thumbnail result_info.result as thumbnail_html %}
{% if thumbnail_html %}
<div class="col-lg-2"> <div class="col-lg-2">
<div class="card{% if forloop.first %} selected-card{% endif %}"> <div class="card{% if forloop.first %} selected-card{% endif %}">
{% plugin_thumbnail result_info.result %} {{ thumbnail_html }}
<div class="card-body"> <div class="card-body">
<a href="{{ result_info.path }}" title="Open in new tab..." target="_blank" rel="noopener"> <a href="{{ result_info.path }}" title="Open in new tab..." target="_blank" rel="noopener">
<p class="card-text"><code>{{ result_info.path }}</code></p> <p class="card-text"><code>{{ result_info.path }}</code></p>
@@ -373,6 +375,7 @@
</div> </div>
</div> </div>
</div> </div>
{% endif %}
{% endif %} {% endif %}
{% endfor %} {% endfor %}
@@ -395,7 +398,7 @@
</div> </div>
</div> </div>
</header> </header>
<iframe sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" class="full-page-iframe" src="{{singlefile_path}}" name="preview"></iframe> <iframe sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" class="full-page-iframe" src="{{best_preview_path}}" name="preview"></iframe>
<script> <script>
/*! jQuery v3.2.1 -ajax,-ajax/jsonp,-ajax/load,-ajax/parseXML,-ajax/script,-ajax/var/location,-ajax/var/nonce,-ajax/var/rquery,-ajax/xhr,-manipulation/_evalUrl,-event/ajax,-effects,-effects/Tween,-effects/animatedSelector | (c) JS Foundation and other contributors | jquery.org/license */ /*! jQuery v3.2.1 -ajax,-ajax/jsonp,-ajax/load,-ajax/parseXML,-ajax/script,-ajax/var/location,-ajax/var/nonce,-ajax/var/rquery,-ajax/xhr,-manipulation/_evalUrl,-event/ajax,-effects,-effects/Tween,-effects/animatedSelector | (c) JS Foundation and other contributors | jquery.org/license */

View File

@@ -429,19 +429,6 @@ class TestInstallHookOutput(unittest.TestCase):
self.assertEqual(data['name'], 'wget') self.assertEqual(data['name'], 'wget')
self.assertTrue(data['abspath'].startswith('/')) self.assertTrue(data['abspath'].startswith('/'))
def test_install_hook_outputs_dependency(self):
"""Install hook should output Dependency JSONL when binary not found."""
hook_output = json.dumps({
'type': 'Dependency',
'bin_name': 'wget',
'bin_providers': 'apt,brew,env',
})
data = json.loads(hook_output)
self.assertEqual(data['type'], 'Dependency')
self.assertEqual(data['bin_name'], 'wget')
self.assertIn('apt', data['bin_providers'])
def test_install_hook_outputs_machine_config(self): def test_install_hook_outputs_machine_config(self):
"""Install hook should output Machine config update JSONL.""" """Install hook should output Machine config update JSONL."""
hook_output = json.dumps({ hook_output = json.dumps({

View File

@@ -459,7 +459,7 @@ class TestFilesystemMigration08to09(unittest.TestCase):
'SAVE_MERCURY': 'True', 'SAVE_MERCURY': 'True',
'SAVE_PDF': 'True', 'SAVE_PDF': 'True',
'SAVE_MEDIA': 'True', 'SAVE_MEDIA': 'True',
'SAVE_ARCHIVE_DOT_ORG': 'True', 'SAVE_ARCHIVEDOTORG': 'True',
'SAVE_HEADERS': 'True', 'SAVE_HEADERS': 'True',
'SAVE_HTMLTOTEXT': 'True', 'SAVE_HTMLTOTEXT': 'True',
'SAVE_GIT': 'True', 'SAVE_GIT': 'True',

View File

@@ -949,19 +949,30 @@ def seed_0_8_data(db_path: Path) -> Dict[str, List[Dict]]:
('core', '0072_rename_added_snapshot_bookmarked_at_and_more'), ('core', '0072_rename_added_snapshot_bookmarked_at_and_more'),
('core', '0073_rename_created_archiveresult_created_at_and_more'), ('core', '0073_rename_created_archiveresult_created_at_and_more'),
('core', '0074_alter_snapshot_downloaded_at'), ('core', '0074_alter_snapshot_downloaded_at'),
('core', '0023_new_schema'), # For 0.8.x: DO NOT record 0023_new_schema - it replaces 0023-0074 for fresh installs
# We already recorded 0023-0074 above, so Django will know the state
# For 0.8.x: Record original machine migrations (before squashing)
# DO NOT record 0001_squashed here - it replaces 0001-0004 for fresh installs
('machine', '0001_initial'), ('machine', '0001_initial'),
('machine', '0002_alter_machine_stats_installedbinary'), ('machine', '0002_alter_machine_stats_installedbinary'),
('machine', '0003_alter_installedbinary_options_and_more'), ('machine', '0003_alter_installedbinary_options_and_more'),
('machine', '0004_alter_installedbinary_abspath_and_more'), ('machine', '0004_alter_installedbinary_abspath_and_more'),
('machine', '0001_squashed'), # Then the new migrations after squashing
('machine', '0002_rename_custom_cmds_to_overrides'), ('machine', '0002_rename_custom_cmds_to_overrides'),
('machine', '0003_alter_dependency_id_alter_installedbinary_dependency_and_more'), ('machine', '0003_alter_dependency_id_alter_installedbinary_dependency_and_more'),
('machine', '0004_drop_dependency_table'), ('machine', '0004_drop_dependency_table'),
# Crawls must come before core.0024 because 0024_b depends on it
('crawls', '0001_initial'),
# Core 0024 migrations chain (in dependency order)
('core', '0024_b_clear_config_fields'),
('core', '0024_c_disable_fk_checks'),
('core', '0024_d_fix_crawls_config'),
('core', '0024_snapshot_crawl'), ('core', '0024_snapshot_crawl'),
('core', '0024_f_add_snapshot_config'),
('core', '0025_allow_duplicate_urls_per_crawl'), ('core', '0025_allow_duplicate_urls_per_crawl'),
# For 0.8.x: Record original api migration (before squashing)
# DO NOT record 0001_squashed here - it replaces 0001 for fresh installs
('api', '0001_initial'), ('api', '0001_initial'),
('api', '0001_squashed'),
('api', '0002_alter_apitoken_options'), ('api', '0002_alter_apitoken_options'),
('api', '0003_rename_user_apitoken_created_by_apitoken_abid_and_more'), ('api', '0003_rename_user_apitoken_created_by_apitoken_abid_and_more'),
('api', '0004_alter_apitoken_id_alter_apitoken_uuid'), ('api', '0004_alter_apitoken_id_alter_apitoken_uuid'),
@@ -970,11 +981,9 @@ def seed_0_8_data(db_path: Path) -> Dict[str, List[Dict]]:
('api', '0007_alter_apitoken_created_by'), ('api', '0007_alter_apitoken_created_by'),
('api', '0008_alter_apitoken_created_alter_apitoken_created_by_and_more'), ('api', '0008_alter_apitoken_created_alter_apitoken_created_by_and_more'),
('api', '0009_rename_created_apitoken_created_at_and_more'), ('api', '0009_rename_created_apitoken_created_at_and_more'),
('crawls', '0001_initial'), # Note: crawls.0001_initial moved earlier (before core.0024) due to dependencies
('crawls', '0002_drop_seed_model'), # Stop here - 0.8.x ends at core.0025, crawls.0001, and we want to TEST the later migrations
('crawls', '0003_alter_crawl_output_dir'), # Do NOT record 0026+ as they need to be tested during migration
('crawls', '0004_alter_crawl_output_dir'),
('core', '0035_snapshot_crawl_non_nullable_remove_created_by'),
] ]
for app, name in migrations: for app, name in migrations:
@@ -1000,7 +1009,7 @@ def run_archivebox(data_dir: Path, args: list, timeout: int = 60, env: dict = No
base_env['USE_COLOR'] = 'False' base_env['USE_COLOR'] = 'False'
base_env['SHOW_PROGRESS'] = 'False' base_env['SHOW_PROGRESS'] = 'False'
# Disable ALL extractors for faster tests (can be overridden by env parameter) # Disable ALL extractors for faster tests (can be overridden by env parameter)
base_env['SAVE_ARCHIVE_DOT_ORG'] = 'False' base_env['SAVE_ARCHIVEDOTORG'] = 'False'
base_env['SAVE_TITLE'] = 'False' base_env['SAVE_TITLE'] = 'False'
base_env['SAVE_FAVICON'] = 'False' base_env['SAVE_FAVICON'] = 'False'
base_env['SAVE_WGET'] = 'False' base_env['SAVE_WGET'] = 'False'

View File

@@ -4,4 +4,5 @@ from django.apps import AppConfig
class WorkersConfig(AppConfig): class WorkersConfig(AppConfig):
default_auto_field = 'django.db.models.BigAutoField' default_auto_field = 'django.db.models.BigAutoField'
name = 'archivebox.workers' name = 'archivebox.workers'
label = 'workers'

View File

@@ -2,7 +2,7 @@
# mkdir -p ~/archivebox/data && cd ~/archivebox # mkdir -p ~/archivebox/data && cd ~/archivebox
# curl -fsSL 'https://docker-compose.archivebox.io' > docker-compose.yml # curl -fsSL 'https://docker-compose.archivebox.io' > docker-compose.yml
# docker compose run archivebox version # docker compose run archivebox version
# docker compose run archivebox config --set SAVE_ARCHIVE_DOT_ORG=False # docker compose run archivebox config --set SAVE_ARCHIVEDOTORG=False
# docker compose run archivebox add --depth=1 'https://news.ycombinator.com' # docker compose run archivebox add --depth=1 'https://news.ycombinator.com'
# docker compose run -T archivebox add < bookmarks.txt # docker compose run -T archivebox add < bookmarks.txt
# docker compose up -d && open 'https://localhost:8000' # docker compose up -d && open 'https://localhost:8000'
@@ -35,7 +35,7 @@ services:
# - MEDIA_MAX_SIZE=750m # increase this filesize limit to allow archiving larger audio/video files # - MEDIA_MAX_SIZE=750m # increase this filesize limit to allow archiving larger audio/video files
# - TIMEOUT=60 # increase this number to 120+ seconds if you see many slow downloads timing out # - TIMEOUT=60 # increase this number to 120+ seconds if you see many slow downloads timing out
# - CHECK_SSL_VALIDITY=True # set to False to disable strict SSL checking (allows saving URLs w/ broken certs) # - CHECK_SSL_VALIDITY=True # set to False to disable strict SSL checking (allows saving URLs w/ broken certs)
# - SAVE_ARCHIVE_DOT_ORG=True # set to False to disable submitting all URLs to Archive.org when archiving # - SAVE_ARCHIVEDOTORG=True # set to False to disable submitting all URLs to Archive.org when archiving
# - USER_AGENT="..." # set a custom USER_AGENT to avoid being blocked as a bot # - USER_AGENT="..." # set a custom USER_AGENT to avoid being blocked as a bot
# ... # ...
# For more info, see: https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#configuration # For more info, see: https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#configuration

View File

@@ -85,9 +85,9 @@ dependencies = [
### Binary/Package Management ### Binary/Package Management
"abx-pkg>=0.1.0", # for: detecting, versioning, and installing binaries via apt/brew/pip/npm "abx-pkg>=0.1.0", # for: detecting, versioning, and installing binaries via apt/brew/pip/npm
"gallery-dl>=1.31.1", "gallery-dl>=1.31.1",
### UUID7 backport for Python <3.14 ### UUID7 backport for Python <3.14
"uuid7>=0.1.0; python_version < '3.14'", # for: uuid7 support on Python 3.13 (provides uuid_extensions module) "uuid7>=0.1.0; python_version < '3.14'", # for: uuid7 support on Python 3.13 (provides uuid_extensions module)
"pytest-django>=4.11.1",
] ]
[project.optional-dependencies] [project.optional-dependencies]
@@ -183,6 +183,7 @@ ignore = ["E731", "E303", "E266", "E241", "E222"]
[tool.pytest.ini_options] [tool.pytest.ini_options]
testpaths = [ "tests" ] testpaths = [ "tests" ]
DJANGO_SETTINGS_MODULE = "archivebox.core.settings"
[tool.mypy] [tool.mypy]
mypy_path = "archivebox,archivebox/typings" mypy_path = "archivebox,archivebox/typings"

View File

@@ -24,7 +24,7 @@ def disable_extractors_dict():
"SAVE_HEADERS": "false", "SAVE_HEADERS": "false",
"USE_GIT": "false", "USE_GIT": "false",
"SAVE_MEDIA": "false", "SAVE_MEDIA": "false",
"SAVE_ARCHIVE_DOT_ORG": "false", "SAVE_ARCHIVEDOTORG": "false",
"SAVE_TITLE": "false", "SAVE_TITLE": "false",
"SAVE_FAVICON": "false", "SAVE_FAVICON": "false",
}) })

View File

@@ -33,7 +33,7 @@ def test_background_hooks_dont_block_parser_extractors(tmp_path, process):
"SAVE_HEADERS": "false", "SAVE_HEADERS": "false",
"USE_GIT": "false", "USE_GIT": "false",
"SAVE_MEDIA": "false", "SAVE_MEDIA": "false",
"SAVE_ARCHIVE_DOT_ORG": "false", "SAVE_ARCHIVEDOTORG": "false",
"SAVE_TITLE": "false", "SAVE_TITLE": "false",
"SAVE_FAVICON": "false", "SAVE_FAVICON": "false",
# Enable chrome session (required for background hooks to start) # Enable chrome session (required for background hooks to start)
@@ -133,7 +133,7 @@ def test_parser_extractors_emit_snapshot_jsonl(tmp_path, process):
"SAVE_HEADERS": "false", "SAVE_HEADERS": "false",
"USE_GIT": "false", "USE_GIT": "false",
"SAVE_MEDIA": "false", "SAVE_MEDIA": "false",
"SAVE_ARCHIVE_DOT_ORG": "false", "SAVE_ARCHIVEDOTORG": "false",
"SAVE_TITLE": "false", "SAVE_TITLE": "false",
"SAVE_FAVICON": "false", "SAVE_FAVICON": "false",
"USE_CHROME": "false", "USE_CHROME": "false",

14
uv.lock generated
View File

@@ -88,6 +88,7 @@ dependencies = [
{ name = "py-machineid", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "py-machineid", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "pydantic", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "pydantic", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "pydantic-settings", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "pydantic-settings", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "pytest-django", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "python-benedict", extra = ["io", "parse"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "python-benedict", extra = ["io", "parse"], marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "python-crontab", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "python-crontab", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
{ name = "python-statemachine", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "python-statemachine", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
@@ -186,6 +187,7 @@ requires-dist = [
{ name = "py-machineid", specifier = ">=0.6.0" }, { name = "py-machineid", specifier = ">=0.6.0" },
{ name = "pydantic", specifier = ">=2.8.0" }, { name = "pydantic", specifier = ">=2.8.0" },
{ name = "pydantic-settings", specifier = ">=2.5.2" }, { name = "pydantic-settings", specifier = ">=2.5.2" },
{ name = "pytest-django", specifier = ">=4.11.1" },
{ name = "python-benedict", extras = ["io", "parse"], specifier = ">=0.33.2" }, { name = "python-benedict", extras = ["io", "parse"], specifier = ">=0.33.2" },
{ name = "python-crontab", specifier = ">=3.2.0" }, { name = "python-crontab", specifier = ">=3.2.0" },
{ name = "python-ldap", marker = "extra == 'ldap'", specifier = ">=3.4.3" }, { name = "python-ldap", marker = "extra == 'ldap'", specifier = ">=3.4.3" },
@@ -1848,6 +1850,18 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" }, { url = "https://files.pythonhosted.org/packages/3b/ab/b3226f0bd7cdcf710fbede2b3548584366da3b19b5021e74f5bde2a8fa3f/pytest-9.0.2-py3-none-any.whl", hash = "sha256:711ffd45bf766d5264d487b917733b453d917afd2b0ad65223959f59089f875b", size = 374801, upload-time = "2025-12-06T21:30:49.154Z" },
] ]
[[package]]
name = "pytest-django"
version = "4.11.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "pytest", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },
]
sdist = { url = "https://files.pythonhosted.org/packages/b1/fb/55d580352db26eb3d59ad50c64321ddfe228d3d8ac107db05387a2fadf3a/pytest_django-4.11.1.tar.gz", hash = "sha256:a949141a1ee103cb0e7a20f1451d355f83f5e4a5d07bdd4dcfdd1fd0ff227991", size = 86202, upload-time = "2025-04-03T18:56:09.338Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/be/ac/bd0608d229ec808e51a21044f3f2f27b9a37e7a0ebaca7247882e67876af/pytest_django-4.11.1-py3-none-any.whl", hash = "sha256:1b63773f648aa3d8541000c26929c1ea63934be1cfa674c76436966d73fe6a10", size = 25281, upload-time = "2025-04-03T18:56:07.678Z" },
]
[[package]] [[package]]
name = "python-benedict" name = "python-benedict"
version = "0.35.0" version = "0.35.0"