wip major changes

This commit is contained in:
Nick Sweeting
2025-12-24 20:09:51 -08:00
parent c1335fed37
commit 1915333b81
450 changed files with 35814 additions and 19015 deletions

View File

@@ -1,16 +1,13 @@
__package__ = 'archivebox.core'
__order__ = 100
import abx
@abx.hookimpl
def register_admin(admin_site):
"""Register the core.models views (Snapshot, ArchiveResult, Tag, etc.) with the admin site"""
from core.admin import register_admin
register_admin(admin_site)
from core.admin import register_admin as do_register
do_register(admin_site)
@abx.hookimpl
def get_CONFIG():
from archivebox.config.common import (
SHELL_CONFIG,
@@ -28,4 +25,3 @@ def get_CONFIG():
'ARCHIVING_CONFIG': ARCHIVING_CONFIG,
'SEARCHBACKEND_CONFIG': SEARCH_BACKEND_CONFIG,
}
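The @abx.hookimpl functions in this file are plugin hook implementations; the host side collects their return values through the plugin manager (archivebox.pm.hook.*, as seen in the admin_site and apps changes below). A rough sketch of how the per-plugin get_CONFIG() dicts could be aggregated, assuming the usual pluggy-style hook call that returns one result per plugin (the merge helper here is illustrative, not ArchiveBox's actual code):

import archivebox

def collect_plugin_configs():
    merged = {}
    # a pluggy-style hook call returns a list with one dict per registered plugin
    for plugin_config in archivebox.pm.hook.get_CONFIG():
        merged.update(plugin_config)  # e.g. {'ARCHIVING_CONFIG': ..., 'SEARCHBACKEND_CONFIG': ...}
    return merged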

View File

@@ -9,10 +9,7 @@ from core.admin_snapshots import SnapshotAdmin
from core.admin_archiveresults import ArchiveResultAdmin
from core.admin_users import UserAdmin
import abx
@abx.hookimpl
def register_admin(admin_site):
admin_site.register(get_user_model(), UserAdmin)
admin_site.register(ArchiveResult, ArchiveResultAdmin)

View File

@@ -11,8 +11,6 @@ from django.utils import timezone
from huey_monitor.admin import TaskModel
import abx
from archivebox.config import DATA_DIR
from archivebox.config.common import SERVER_CONFIG
from archivebox.misc.paginators import AccelleratedPaginator
@@ -43,7 +41,6 @@ class ArchiveResultInline(admin.TabularInline):
ordering = ('end_ts',)
show_change_link = True
# # classes = ['collapse']
# # list_display_links = ['abid']
def get_parent_object_from_request(self, request):
resolved = resolve(request.path_info)
@@ -80,7 +77,7 @@ class ArchiveResultInline(admin.TabularInline):
formset.form.base_fields['start_ts'].initial = timezone.now()
formset.form.base_fields['end_ts'].initial = timezone.now()
formset.form.base_fields['cmd_version'].initial = '-'
formset.form.base_fields['pwd'].initial = str(snapshot.link_dir)
formset.form.base_fields['pwd'].initial = str(snapshot.output_dir)
formset.form.base_fields['created_by'].initial = request.user
formset.form.base_fields['cmd'].initial = '["-"]'
formset.form.base_fields['output'].initial = 'Manually recorded cmd output...'
@@ -193,6 +190,5 @@ class ArchiveResultAdmin(BaseModelAdmin):
@abx.hookimpl
def register_admin(admin_site):
admin_site.register(ArchiveResult, ArchiveResultAdmin)

View File

@@ -36,7 +36,7 @@ def register_admin_site():
admin.site = archivebox_admin
sites.site = archivebox_admin
# register all plugins admin classes
archivebox.pm.hook.register_admin(admin_site=archivebox_admin)
# Plugin admin registration is now handled by individual app admins
# No longer using archivebox.pm.hook.register_admin()
return archivebox_admin
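With the pm.hook.register_admin() call removed here, registration is wired up by the apps themselves: CoreConfig.ready() (changed below) builds the custom site via register_admin_site(), and core.admin exposes a register_admin(admin_site) function that registers each ModelAdmin directly. A minimal sketch of that flow, pieced together from the files changed in this commit (the wrapper function name is illustrative):

from core.admin_site import register_admin_site

def wire_up_admin():
    archivebox_admin = register_admin_site()   # installs archivebox_admin as admin.site / sites.site
    from core.admin import register_admin      # per-app registration (User, ArchiveResult, etc. admins)
    register_admin(archivebox_admin)
    return archivebox_admin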

View File

@@ -19,11 +19,9 @@ from archivebox.misc.util import htmldecode, urldecode
from archivebox.misc.paginators import AccelleratedPaginator
from archivebox.misc.logging_util import printable_filesize
from archivebox.search.admin import SearchResultsAdminMixin
from archivebox.index.html import snapshot_icons
from archivebox.extractors import archive_links
from archivebox.base_models.admin import BaseModelAdmin
from archivebox.workers.tasks import bg_archive_links, bg_add
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
from archivebox.workers.tasks import bg_archive_snapshots, bg_add
from core.models import Tag
from core.admin_tags import TagInline
@@ -53,13 +51,13 @@ class SnapshotActionForm(ActionForm):
# )
class SnapshotAdmin(SearchResultsAdminMixin, BaseModelAdmin):
class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
list_display = ('created_at', 'title_str', 'status', 'files', 'size', 'url_str')
sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'link_dir')
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'link_dir', 'available_config_options')
search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name')
fields = ('url', 'title', 'created_by', 'bookmarked_at', 'status', 'retry_at', 'crawl', *readonly_fields)
fields = ('url', 'title', 'created_by', 'bookmarked_at', 'status', 'retry_at', 'crawl', 'config', 'available_config_options', *readonly_fields[:-1])
ordering = ['-created_at']
actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
inlines = [TagInline, ArchiveResultInline]
@@ -196,14 +194,14 @@ class SnapshotAdmin(SearchResultsAdminMixin, BaseModelAdmin):
)
def files(self, obj):
# return '-'
return snapshot_icons(obj)
return obj.icons()
@admin.display(
# ordering='archiveresult_count'
)
def size(self, obj):
archive_size = os.access(Path(obj.link_dir) / 'index.html', os.F_OK) and obj.archive_size
archive_size = os.access(Path(obj.output_dir) / 'index.html', os.F_OK) and obj.archive_size
if archive_size:
size_txt = printable_filesize(archive_size)
if archive_size > 52428800:
@@ -261,30 +259,27 @@ class SnapshotAdmin(SearchResultsAdminMixin, BaseModelAdmin):
description=" Get Title"
)
def update_titles(self, request, queryset):
links = [snapshot.as_link() for snapshot in queryset]
if len(links) < 3:
# run synchronously if there are only 1 or 2 links
archive_links(links, overwrite=True, methods=('title','favicon'), out_dir=DATA_DIR)
messages.success(request, f"Title and favicon have been fetched and saved for {len(links)} URLs.")
else:
# otherwise run in a background worker
result = bg_archive_links((links,), kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR})
messages.success(
request,
mark_safe(f"Title and favicon are updating in the background for {len(links)} URLs. {result_url(result)}"),
)
from core.models import Snapshot
count = queryset.count()
# Queue snapshots for archiving via the state machine system
result = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR})
messages.success(
request,
mark_safe(f"Title and favicon are updating in the background for {count} URLs. {result_url(result)}"),
)
@admin.action(
description="⬇️ Get Missing"
)
def update_snapshots(self, request, queryset):
links = [snapshot.as_link() for snapshot in queryset]
count = queryset.count()
result = bg_archive_links((links,), kwargs={"overwrite": False, "out_dir": DATA_DIR})
result = bg_archive_snapshots(queryset, kwargs={"overwrite": False, "out_dir": DATA_DIR})
messages.success(
request,
mark_safe(f"Re-trying any previously failed methods for {len(links)} URLs in the background. {result_url(result)}"),
mark_safe(f"Re-trying any previously failed methods for {count} URLs in the background. {result_url(result)}"),
)
@@ -307,13 +302,13 @@ class SnapshotAdmin(SearchResultsAdminMixin, BaseModelAdmin):
description="🔄 Redo"
)
def overwrite_snapshots(self, request, queryset):
links = [snapshot.as_link() for snapshot in queryset]
count = queryset.count()
result = bg_archive_links((links,), kwargs={"overwrite": True, "out_dir": DATA_DIR})
result = bg_archive_snapshots(queryset, kwargs={"overwrite": True, "out_dir": DATA_DIR})
messages.success(
request,
mark_safe(f"Clearing all previous results and re-downloading {len(links)} URLs in the background. {result_url(result)}"),
mark_safe(f"Clearing all previous results and re-downloading {count} URLs in the background. {result_url(result)}"),
)
@admin.action(

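The actions above now pass the queryset straight to bg_archive_snapshots() instead of materializing Link objects for bg_archive_links(), and queryset.count() issues a single COUNT query rather than loading every row just to report how many were queued. A minimal sketch of the same pattern for a hypothetical extra action (imports shown for context; result_url is omitted):

from django.contrib import admin, messages
from archivebox.config import DATA_DIR
from archivebox.workers.tasks import bg_archive_snapshots

class ExampleSnapshotAdmin(admin.ModelAdmin):
    @admin.action(description="Queue for re-archiving")
    def requeue_snapshots(self, request, queryset):
        count = queryset.count()  # COUNT(*) only; no Snapshot rows are loaded here
        bg_archive_snapshots(queryset, kwargs={"overwrite": False, "out_dir": DATA_DIR})
        messages.success(request, f"Queued {count} snapshots for re-archiving in the background.")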
View File

@@ -3,8 +3,6 @@ __package__ = 'archivebox.core'
from django.contrib import admin
from django.utils.html import format_html, mark_safe
import abx
from archivebox.misc.paginators import AccelleratedPaginator
from archivebox.base_models.admin import BaseModelAdmin
@@ -150,7 +148,7 @@ class TagAdmin(BaseModelAdmin):
# @admin.register(SnapshotTag, site=archivebox_admin)
# class SnapshotTagAdmin(ABIDModelAdmin):
# class SnapshotTagAdmin(BaseModelAdmin):
# list_display = ('id', 'snapshot', 'tag')
# sort_fields = ('id', 'snapshot', 'tag')
# search_fields = ('id', 'snapshot_id', 'tag_id')
@@ -159,7 +157,6 @@ class TagAdmin(BaseModelAdmin):
# ordering = ['-id']
@abx.hookimpl
def register_admin(admin_site):
admin_site.register(Tag, TagAdmin)

View File

@@ -5,8 +5,6 @@ from django.contrib.auth.admin import UserAdmin
from django.utils.html import format_html, mark_safe
from django.contrib.auth import get_user_model
import abx
class CustomUserAdmin(UserAdmin):
sort_fields = ['id', 'email', 'username', 'is_superuser', 'last_login', 'date_joined']
@@ -86,6 +84,5 @@ class CustomUserAdmin(UserAdmin):
@abx.hookimpl
def register_admin(admin_site):
admin_site.register(get_user_model(), CustomUserAdmin)

View File

@@ -2,17 +2,12 @@ __package__ = 'archivebox.core'
from django.apps import AppConfig
import archivebox
class CoreConfig(AppConfig):
name = 'core'
def ready(self):
"""Register the archivebox.core.admin_site as the main django admin site"""
from django.conf import settings
archivebox.pm.hook.ready(settings=settings)
from core.admin_site import register_admin_site
register_admin_site()

View File

@@ -3,37 +3,34 @@ __package__ = 'archivebox.core'
from django import forms
from archivebox.misc.util import URL_REGEX
from ..parsers import PARSERS
from taggit.utils import edit_string_for_tags, parse_tags
PARSER_CHOICES = [
(parser_key, parser[0])
for parser_key, parser in PARSERS.items()
]
DEPTH_CHOICES = (
('0', 'depth = 0 (archive just these URLs)'),
('1', 'depth = 1 (archive these URLs and all URLs one hop away)'),
)
from ..extractors import get_default_archive_methods
from archivebox.hooks import get_extractors
ARCHIVE_METHODS = [
(name, name)
for name, _, _ in get_default_archive_methods()
]
def get_archive_methods():
"""Get available archive methods from discovered hooks."""
return [(name, name) for name in get_extractors()]
class AddLinkForm(forms.Form):
url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True)
parser = forms.ChoiceField(label="URLs format", choices=[('auto', 'Auto-detect parser'), *PARSER_CHOICES], initial='auto')
tag = forms.CharField(label="Tags (comma separated tag1,tag2,tag3)", strip=True, required=False)
depth = forms.ChoiceField(label="Archive depth", choices=DEPTH_CHOICES, initial='0', widget=forms.RadioSelect(attrs={"class": "depth-selection"}))
archive_methods = forms.MultipleChoiceField(
label="Archive methods (select at least 1, otherwise all will be used by default)",
required=False,
widget=forms.SelectMultiple,
choices=ARCHIVE_METHODS,
choices=[], # populated dynamically in __init__
)
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.fields['archive_methods'].choices = get_archive_methods()
# TODO: hook these up to the view and put them
# in a collapsible UI section labeled "Advanced"
#
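Because the choices are bound in __init__ rather than at class definition time, every AddLinkForm instance re-runs get_archive_methods() (and therefore get_extractors()) when it is constructed, so the form reflects the extractor hooks discovered at that moment instead of whatever was available when the module first loaded. A small usage sketch (printed values are illustrative):

from core.forms import AddLinkForm

form = AddLinkForm()
print(form.fields['archive_methods'].choices)
# e.g. [('title', 'title'), ('favicon', 'favicon'), ('wget', 'wget'), ...]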

View File

@@ -1,18 +1,14 @@
# Generated by Django 3.0.8 on 2020-11-04 12:25
import os
import json
from pathlib import Path
from django.db import migrations, models
import django.db.models.deletion
from config import CONFIG
from index.json import to_json
DATA_DIR = Path(os.getcwd()).resolve() # archivebox user data dir
ARCHIVE_DIR = DATA_DIR / 'archive' # archivebox snapshot data dir
try:
JSONField = models.JSONField
except AttributeError:
@@ -21,12 +17,14 @@ except AttributeError:
def forwards_func(apps, schema_editor):
from core.models import EXTRACTORS
Snapshot = apps.get_model("core", "Snapshot")
ArchiveResult = apps.get_model("core", "ArchiveResult")
snapshots = Snapshot.objects.all()
for snapshot in snapshots:
out_dir = ARCHIVE_DIR / snapshot.timestamp
out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp
try:
with open(out_dir / "index.json", "r") as f:
@@ -61,7 +59,7 @@ def forwards_func(apps, schema_editor):
def verify_json_index_integrity(snapshot):
results = snapshot.archiveresult_set.all()
out_dir = ARCHIVE_DIR / snapshot.timestamp
out_dir = Path(CONFIG['ARCHIVE_DIR']) / snapshot.timestamp
with open(out_dir / "index.json", "r") as f:
index = json.load(f)

View File

@@ -1,58 +0,0 @@
# Generated by Django 5.0.6 on 2024-05-13 10:56
import charidfield.fields
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0022_auto_20231023_2008'),
]
operations = [
migrations.AlterModelOptions(
name='archiveresult',
options={'verbose_name': 'Result'},
),
migrations.AddField(
model_name='archiveresult',
name='abid',
field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='res_', unique=True),
),
migrations.AddField(
model_name='snapshot',
name='abid',
field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='snp_', unique=True),
),
migrations.AddField(
model_name='snapshot',
name='uuid',
field=models.UUIDField(blank=True, null=True, unique=True),
),
migrations.AddField(
model_name='tag',
name='abid',
field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='tag_', unique=True),
),
migrations.AlterField(
model_name='archiveresult',
name='extractor',
field=models.CharField(choices=(
('htmltotext', 'htmltotext'),
('git', 'git'),
('singlefile', 'singlefile'),
('media', 'media'),
('archive_org', 'archive_org'),
('readability', 'readability'),
('mercury', 'mercury'),
('favicon', 'favicon'),
('pdf', 'pdf'),
('headers', 'headers'),
('screenshot', 'screenshot'),
('dom', 'dom'),
('title', 'title'),
('wget', 'wget'),
), max_length=32),
),
]

View File

@@ -0,0 +1,466 @@
# Generated by Django 5.0.6 on 2024-12-25
# Transforms schema from 0022 to new simplified schema (ABID system removed)
from uuid import uuid4
from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone
def get_or_create_system_user_pk(apps, schema_editor):
"""Get or create system user for migrations."""
User = apps.get_model('auth', 'User')
user, _ = User.objects.get_or_create(
username='system',
defaults={'is_active': False, 'password': '!'}
)
return user.pk
def populate_created_by_snapshot(apps, schema_editor):
"""Populate created_by for existing snapshots."""
User = apps.get_model('auth', 'User')
Snapshot = apps.get_model('core', 'Snapshot')
system_user, _ = User.objects.get_or_create(
username='system',
defaults={'is_active': False, 'password': '!'}
)
Snapshot.objects.filter(created_by__isnull=True).update(created_by=system_user)
def populate_created_by_archiveresult(apps, schema_editor):
"""Populate created_by for existing archive results."""
User = apps.get_model('auth', 'User')
ArchiveResult = apps.get_model('core', 'ArchiveResult')
system_user, _ = User.objects.get_or_create(
username='system',
defaults={'is_active': False, 'password': '!'}
)
ArchiveResult.objects.filter(created_by__isnull=True).update(created_by=system_user)
def populate_created_by_tag(apps, schema_editor):
"""Populate created_by for existing tags."""
User = apps.get_model('auth', 'User')
Tag = apps.get_model('core', 'Tag')
system_user, _ = User.objects.get_or_create(
username='system',
defaults={'is_active': False, 'password': '!'}
)
Tag.objects.filter(created_by__isnull=True).update(created_by=system_user)
def generate_uuid_for_archiveresults(apps, schema_editor):
"""Generate UUIDs for archive results that don't have them."""
ArchiveResult = apps.get_model('core', 'ArchiveResult')
for ar in ArchiveResult.objects.filter(uuid__isnull=True).iterator(chunk_size=500):
ar.uuid = uuid4()
ar.save(update_fields=['uuid'])
def generate_uuid_for_tags(apps, schema_editor):
"""Generate UUIDs for tags that don't have them."""
Tag = apps.get_model('core', 'Tag')
for tag in Tag.objects.filter(uuid__isnull=True).iterator(chunk_size=500):
tag.uuid = uuid4()
tag.save(update_fields=['uuid'])
def copy_bookmarked_at_from_added(apps, schema_editor):
"""Copy added timestamp to bookmarked_at."""
Snapshot = apps.get_model('core', 'Snapshot')
Snapshot.objects.filter(bookmarked_at__isnull=True).update(
bookmarked_at=models.F('added')
)
def copy_created_at_from_added(apps, schema_editor):
"""Copy added timestamp to created_at for snapshots."""
Snapshot = apps.get_model('core', 'Snapshot')
Snapshot.objects.filter(created_at__isnull=True).update(
created_at=models.F('added')
)
def copy_created_at_from_start_ts(apps, schema_editor):
"""Copy start_ts to created_at for archive results."""
ArchiveResult = apps.get_model('core', 'ArchiveResult')
ArchiveResult.objects.filter(created_at__isnull=True).update(
created_at=models.F('start_ts')
)
class Migration(migrations.Migration):
"""
This migration transforms the schema from the main branch (0022) to the new
simplified schema without the ABID system.
For dev branch users who had ABID migrations (0023-0074), this replaces them
with a clean transformation.
"""
replaces = [
('core', '0023_alter_archiveresult_options_archiveresult_abid_and_more'),
('core', '0024_auto_20240513_1143'),
('core', '0025_alter_archiveresult_uuid'),
('core', '0026_archiveresult_created_archiveresult_created_by_and_more'),
('core', '0027_update_snapshot_ids'),
('core', '0028_alter_archiveresult_uuid'),
('core', '0029_alter_archiveresult_id'),
('core', '0030_alter_archiveresult_uuid'),
('core', '0031_alter_archiveresult_id_alter_archiveresult_uuid_and_more'),
('core', '0032_alter_archiveresult_id'),
('core', '0033_rename_id_archiveresult_old_id'),
('core', '0034_alter_archiveresult_old_id_alter_archiveresult_uuid'),
('core', '0035_remove_archiveresult_uuid_archiveresult_id'),
('core', '0036_alter_archiveresult_id_alter_archiveresult_old_id'),
('core', '0037_rename_id_snapshot_old_id'),
('core', '0038_rename_uuid_snapshot_id'),
('core', '0039_rename_snapshot_archiveresult_snapshot_old'),
('core', '0040_archiveresult_snapshot'),
('core', '0041_alter_archiveresult_snapshot_and_more'),
('core', '0042_remove_archiveresult_snapshot_old'),
('core', '0043_alter_archiveresult_snapshot_alter_snapshot_id_and_more'),
('core', '0044_alter_archiveresult_snapshot_alter_tag_uuid_and_more'),
('core', '0045_alter_snapshot_old_id'),
('core', '0046_alter_archiveresult_snapshot_alter_snapshot_id_and_more'),
('core', '0047_alter_snapshottag_unique_together_and_more'),
('core', '0048_alter_archiveresult_snapshot_and_more'),
('core', '0049_rename_snapshot_snapshottag_snapshot_old_and_more'),
('core', '0050_alter_snapshottag_snapshot_old'),
('core', '0051_snapshottag_snapshot_alter_snapshottag_snapshot_old'),
('core', '0052_alter_snapshottag_unique_together_and_more'),
('core', '0053_remove_snapshottag_snapshot_old'),
('core', '0054_alter_snapshot_timestamp'),
('core', '0055_alter_tag_slug'),
('core', '0056_remove_tag_uuid'),
('core', '0057_rename_id_tag_old_id'),
('core', '0058_alter_tag_old_id'),
('core', '0059_tag_id'),
('core', '0060_alter_tag_id'),
('core', '0061_rename_tag_snapshottag_old_tag_and_more'),
('core', '0062_alter_snapshottag_old_tag'),
('core', '0063_snapshottag_tag_alter_snapshottag_old_tag'),
('core', '0064_alter_snapshottag_unique_together_and_more'),
('core', '0065_remove_snapshottag_old_tag'),
('core', '0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id'),
('core', '0067_alter_snapshottag_tag'),
('core', '0068_alter_archiveresult_options'),
('core', '0069_alter_archiveresult_created_alter_snapshot_added_and_more'),
('core', '0070_alter_archiveresult_created_by_alter_snapshot_added_and_more'),
('core', '0071_remove_archiveresult_old_id_remove_snapshot_old_id_and_more'),
('core', '0072_rename_added_snapshot_bookmarked_at_and_more'),
('core', '0073_rename_created_archiveresult_created_at_and_more'),
('core', '0074_alter_snapshot_downloaded_at'),
]
dependencies = [
('core', '0022_auto_20231023_2008'),
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
operations = [
# === SNAPSHOT CHANGES ===
# Add new fields to Snapshot
migrations.AddField(
model_name='snapshot',
name='created_by',
field=models.ForeignKey(
default=None, null=True, blank=True,
on_delete=django.db.models.deletion.CASCADE,
related_name='snapshot_set',
to=settings.AUTH_USER_MODEL,
),
),
migrations.AddField(
model_name='snapshot',
name='created_at',
field=models.DateTimeField(default=django.utils.timezone.now, db_index=True, null=True),
),
migrations.AddField(
model_name='snapshot',
name='modified_at',
field=models.DateTimeField(auto_now=True),
),
migrations.AddField(
model_name='snapshot',
name='bookmarked_at',
field=models.DateTimeField(default=django.utils.timezone.now, db_index=True, null=True),
),
migrations.AddField(
model_name='snapshot',
name='downloaded_at',
field=models.DateTimeField(default=None, null=True, blank=True, db_index=True),
),
migrations.AddField(
model_name='snapshot',
name='depth',
field=models.PositiveSmallIntegerField(default=0, db_index=True),
),
migrations.AddField(
model_name='snapshot',
name='status',
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('sealed', 'Sealed')], default='queued', max_length=15, db_index=True),
),
migrations.AddField(
model_name='snapshot',
name='retry_at',
field=models.DateTimeField(default=django.utils.timezone.now, null=True, blank=True, db_index=True),
),
migrations.AddField(
model_name='snapshot',
name='config',
field=models.JSONField(default=dict, blank=False),
),
migrations.AddField(
model_name='snapshot',
name='notes',
field=models.TextField(blank=True, default=''),
),
migrations.AddField(
model_name='snapshot',
name='output_dir',
field=models.CharField(max_length=256, default=None, null=True, blank=True),
),
# Copy data from old fields to new
migrations.RunPython(copy_bookmarked_at_from_added, migrations.RunPython.noop),
migrations.RunPython(copy_created_at_from_added, migrations.RunPython.noop),
migrations.RunPython(populate_created_by_snapshot, migrations.RunPython.noop),
# Make created_by non-nullable after population
migrations.AlterField(
model_name='snapshot',
name='created_by',
field=models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name='snapshot_set',
to=settings.AUTH_USER_MODEL,
db_index=True,
),
),
# Update timestamp field constraints
migrations.AlterField(
model_name='snapshot',
name='timestamp',
field=models.CharField(max_length=32, unique=True, db_index=True, editable=False),
),
# Update title field size
migrations.AlterField(
model_name='snapshot',
name='title',
field=models.CharField(max_length=512, null=True, blank=True, db_index=True),
),
# Remove old 'added' and 'updated' fields
migrations.RemoveField(model_name='snapshot', name='added'),
migrations.RemoveField(model_name='snapshot', name='updated'),
# Remove old 'tags' CharField (now M2M via Tag model)
migrations.RemoveField(model_name='snapshot', name='tags'),
# === TAG CHANGES ===
# Add uuid field to Tag temporarily for ID migration
migrations.AddField(
model_name='tag',
name='uuid',
field=models.UUIDField(default=uuid4, null=True, blank=True),
),
migrations.AddField(
model_name='tag',
name='created_by',
field=models.ForeignKey(
default=None, null=True, blank=True,
on_delete=django.db.models.deletion.CASCADE,
related_name='tag_set',
to=settings.AUTH_USER_MODEL,
),
),
migrations.AddField(
model_name='tag',
name='created_at',
field=models.DateTimeField(default=django.utils.timezone.now, db_index=True, null=True),
),
migrations.AddField(
model_name='tag',
name='modified_at',
field=models.DateTimeField(auto_now=True),
),
# Populate UUIDs for tags
migrations.RunPython(generate_uuid_for_tags, migrations.RunPython.noop),
migrations.RunPython(populate_created_by_tag, migrations.RunPython.noop),
# Make created_by non-nullable
migrations.AlterField(
model_name='tag',
name='created_by',
field=models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name='tag_set',
to=settings.AUTH_USER_MODEL,
),
),
# Update slug field
migrations.AlterField(
model_name='tag',
name='slug',
field=models.SlugField(unique=True, max_length=100, editable=False),
),
# === ARCHIVERESULT CHANGES ===
# Add uuid field for new ID
migrations.AddField(
model_name='archiveresult',
name='uuid',
field=models.UUIDField(default=uuid4, null=True, blank=True),
),
migrations.AddField(
model_name='archiveresult',
name='created_by',
field=models.ForeignKey(
default=None, null=True, blank=True,
on_delete=django.db.models.deletion.CASCADE,
related_name='archiveresult_set',
to=settings.AUTH_USER_MODEL,
),
),
migrations.AddField(
model_name='archiveresult',
name='created_at',
field=models.DateTimeField(default=django.utils.timezone.now, db_index=True, null=True),
),
migrations.AddField(
model_name='archiveresult',
name='modified_at',
field=models.DateTimeField(auto_now=True),
),
migrations.AddField(
model_name='archiveresult',
name='retry_at',
field=models.DateTimeField(default=django.utils.timezone.now, null=True, blank=True, db_index=True),
),
migrations.AddField(
model_name='archiveresult',
name='notes',
field=models.TextField(blank=True, default=''),
),
migrations.AddField(
model_name='archiveresult',
name='output_dir',
field=models.CharField(max_length=256, default=None, null=True, blank=True),
),
# Populate UUIDs and data for archive results
migrations.RunPython(generate_uuid_for_archiveresults, migrations.RunPython.noop),
migrations.RunPython(copy_created_at_from_start_ts, migrations.RunPython.noop),
migrations.RunPython(populate_created_by_archiveresult, migrations.RunPython.noop),
# Make created_by non-nullable
migrations.AlterField(
model_name='archiveresult',
name='created_by',
field=models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name='archiveresult_set',
to=settings.AUTH_USER_MODEL,
db_index=True,
),
),
# Update extractor choices
migrations.AlterField(
model_name='archiveresult',
name='extractor',
field=models.CharField(
choices=[
('htmltotext', 'htmltotext'), ('git', 'git'), ('singlefile', 'singlefile'),
('media', 'media'), ('archive_org', 'archive_org'), ('readability', 'readability'),
('mercury', 'mercury'), ('favicon', 'favicon'), ('pdf', 'pdf'),
('headers', 'headers'), ('screenshot', 'screenshot'), ('dom', 'dom'),
('title', 'title'), ('wget', 'wget'),
],
max_length=32, db_index=True,
),
),
# Update status field
migrations.AlterField(
model_name='archiveresult',
name='status',
field=models.CharField(
choices=[
('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'),
('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped'),
],
max_length=16, default='queued', db_index=True,
),
),
# Update output field size
migrations.AlterField(
model_name='archiveresult',
name='output',
field=models.CharField(max_length=1024, default=None, null=True, blank=True),
),
# Update cmd_version field size
migrations.AlterField(
model_name='archiveresult',
name='cmd_version',
field=models.CharField(max_length=128, default=None, null=True, blank=True),
),
# Make start_ts and end_ts nullable
migrations.AlterField(
model_name='archiveresult',
name='start_ts',
field=models.DateTimeField(default=None, null=True, blank=True),
),
migrations.AlterField(
model_name='archiveresult',
name='end_ts',
field=models.DateTimeField(default=None, null=True, blank=True),
),
# Make pwd nullable
migrations.AlterField(
model_name='archiveresult',
name='pwd',
field=models.CharField(max_length=256, default=None, null=True, blank=True),
),
# Make cmd nullable
migrations.AlterField(
model_name='archiveresult',
name='cmd',
field=models.JSONField(default=None, null=True, blank=True),
),
# Update model options
migrations.AlterModelOptions(
name='archiveresult',
options={'verbose_name': 'Archive Result', 'verbose_name_plural': 'Archive Results Log'},
),
migrations.AlterModelOptions(
name='snapshot',
options={'verbose_name': 'Snapshot', 'verbose_name_plural': 'Snapshots'},
),
migrations.AlterModelOptions(
name='tag',
options={'verbose_name': 'Tag', 'verbose_name_plural': 'Tags'},
),
]
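The replaces list makes this behave like a Django squashed migration: on databases where the dev-branch migrations 0023-0074 are already recorded as applied, Django marks this migration as applied without executing it, while databases coming straight from main's 0022 run it in their place. A bare skeleton of that mechanism for reference (migration names here are placeholders, not ArchiveBox's):

from django.db import migrations

class Migration(migrations.Migration):
    # Django treats this as already applied if every replaced migration is applied,
    # and runs it instead of them on databases that have none of them.
    replaces = [
        ('core', '0023_old_step_one'),
        ('core', '0024_old_step_two'),
    ]
    dependencies = [('core', '0022_auto_20231023_2008')]
    operations = [
        # combined effect of the replaced migrations goes here
    ]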

View File

@@ -1,101 +0,0 @@
# Generated by Django 5.0.6 on 2024-05-13 11:43
from django.db import migrations
from datetime import datetime
from archivebox.base_models.abid import abid_from_values, DEFAULT_ABID_URI_SALT
def calculate_abid(self):
"""
Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src).
"""
prefix = self.abid_prefix
ts = eval(self.abid_ts_src)
uri = eval(self.abid_uri_src)
subtype = eval(self.abid_subtype_src)
rand = eval(self.abid_rand_src)
if (not prefix) or prefix == 'obj_':
suggested_abid = self.__class__.__name__[:3].lower()
raise Exception(f'{self.__class__.__name__}.abid_prefix must be defined to calculate ABIDs (suggested: {suggested_abid})')
if not ts:
ts = datetime.utcfromtimestamp(0)
print(f'[!] WARNING: Generating ABID with ts=0000000000 placeholder because {self.__class__.__name__}.abid_ts_src={self.abid_ts_src} is unset!', ts.isoformat())
if not uri:
uri = str(self)
print(f'[!] WARNING: Generating ABID with uri=str(self) placeholder because {self.__class__.__name__}.abid_uri_src={self.abid_uri_src} is unset!', uri)
if not subtype:
subtype = self.__class__.__name__
print(f'[!] WARNING: Generating ABID with subtype={subtype} placeholder because {self.__class__.__name__}.abid_subtype_src={self.abid_subtype_src} is unset!', subtype)
if not rand:
rand = getattr(self, 'uuid', None) or getattr(self, 'id', None) or getattr(self, 'pk')
print(f'[!] WARNING: Generating ABID with rand=self.id placeholder because {self.__class__.__name__}.abid_rand_src={self.abid_rand_src} is unset!', rand)
abid = abid_from_values(
prefix=prefix,
ts=ts,
uri=uri,
subtype=subtype,
rand=rand,
salt=DEFAULT_ABID_URI_SALT,
)
assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}'
return abid
def copy_snapshot_uuids(apps, schema_editor):
print(' Copying snapshot.id -> snapshot.uuid...')
Snapshot = apps.get_model("core", "Snapshot")
for snapshot in Snapshot.objects.all():
snapshot.uuid = snapshot.id
snapshot.save(update_fields=["uuid"])
def generate_snapshot_abids(apps, schema_editor):
print(' Generating snapshot.abid values...')
Snapshot = apps.get_model("core", "Snapshot")
for snapshot in Snapshot.objects.all():
snapshot.abid_prefix = 'snp_'
snapshot.abid_ts_src = 'self.added'
snapshot.abid_uri_src = 'self.url'
snapshot.abid_subtype_src = '"01"'
snapshot.abid_rand_src = 'self.uuid'
snapshot.abid = calculate_abid(snapshot)
snapshot.uuid = snapshot.abid.uuid
snapshot.save(update_fields=["abid", "uuid"])
def generate_archiveresult_abids(apps, schema_editor):
print(' Generating ArchiveResult.abid values... (may take an hour or longer for large collections...)')
ArchiveResult = apps.get_model("core", "ArchiveResult")
Snapshot = apps.get_model("core", "Snapshot")
for result in ArchiveResult.objects.all():
result.abid_prefix = 'res_'
result.snapshot = Snapshot.objects.get(pk=result.snapshot_id)
result.snapshot_added = result.snapshot.added
result.snapshot_url = result.snapshot.url
result.abid_ts_src = 'self.snapshot_added'
result.abid_uri_src = 'self.snapshot_url'
result.abid_subtype_src = 'self.extractor'
result.abid_rand_src = 'self.id'
result.abid = calculate_abid(result)
result.uuid = result.abid.uuid
result.save(update_fields=["abid", "uuid"])
class Migration(migrations.Migration):
dependencies = [
('core', '0023_alter_archiveresult_options_archiveresult_abid_and_more'),
]
operations = [
migrations.RunPython(copy_snapshot_uuids, reverse_code=migrations.RunPython.noop),
migrations.RunPython(generate_snapshot_abids, reverse_code=migrations.RunPython.noop),
migrations.RunPython(generate_archiveresult_abids, reverse_code=migrations.RunPython.noop),
]

View File

@@ -1,19 +0,0 @@
# Generated by Django 5.0.6 on 2024-05-13 12:08
import uuid
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0024_auto_20240513_1143'),
]
operations = [
migrations.AlterField(
model_name='archiveresult',
name='uuid',
field=models.UUIDField(default=uuid.uuid4, editable=False, unique=True),
),
]

View File

@@ -1,117 +0,0 @@
# Generated by Django 5.0.6 on 2024-05-13 13:01
import django.db.models.deletion
import django.utils.timezone
from django.conf import settings
from django.db import migrations, models
import archivebox.base_models.models
def updated_created_by_ids(apps, schema_editor):
"""Get or create a system user with is_superuser=True to be the default owner for new DB rows"""
User = apps.get_model("auth", "User")
ArchiveResult = apps.get_model("core", "ArchiveResult")
Snapshot = apps.get_model("core", "Snapshot")
Tag = apps.get_model("core", "Tag")
# if only one user exists total, return that user
if User.objects.filter(is_superuser=True).count() == 1:
user_id = User.objects.filter(is_superuser=True).values_list('pk', flat=True)[0]
# otherwise, create a dedicated "system" user
user_id = User.objects.get_or_create(username='system', is_staff=True, is_superuser=True, defaults={'email': '', 'password': ''})[0].pk
ArchiveResult.objects.all().update(created_by_id=user_id)
Snapshot.objects.all().update(created_by_id=user_id)
Tag.objects.all().update(created_by_id=user_id)
class Migration(migrations.Migration):
dependencies = [
('core', '0025_alter_archiveresult_uuid'),
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
operations = [
migrations.AddField(
model_name='archiveresult',
name='created',
field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now),
preserve_default=False,
),
migrations.AddField(
model_name='archiveresult',
name='created_by',
field=models.ForeignKey(null=True, default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
),
migrations.AddField(
model_name='archiveresult',
name='modified',
field=models.DateTimeField(auto_now=True),
),
migrations.AddField(
model_name='snapshot',
name='created',
field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now),
preserve_default=False,
),
migrations.AddField(
model_name='snapshot',
name='created_by',
field=models.ForeignKey(null=True, default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
),
migrations.AddField(
model_name='snapshot',
name='modified',
field=models.DateTimeField(auto_now=True),
),
migrations.AddField(
model_name='tag',
name='created',
field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now),
preserve_default=False,
),
migrations.AddField(
model_name='tag',
name='created_by',
field=models.ForeignKey(null=True, default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
),
migrations.AddField(
model_name='tag',
name='modified',
field=models.DateTimeField(auto_now=True),
),
migrations.AddField(
model_name='tag',
name='uuid',
field=models.UUIDField(blank=True, null=True, unique=True),
),
migrations.AlterField(
model_name='archiveresult',
name='uuid',
field=models.UUIDField(blank=True, null=True, unique=True),
),
migrations.RunPython(updated_created_by_ids, reverse_code=migrations.RunPython.noop),
migrations.AddField(
model_name='snapshot',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='archiveresult',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
),
migrations.AddField(
model_name='tag',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
),
]

View File

@@ -1,105 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-18 02:48
from django.db import migrations
from datetime import datetime
from archivebox.base_models.abid import ABID, abid_from_values, DEFAULT_ABID_URI_SALT
def calculate_abid(self):
"""
Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src).
"""
prefix = self.abid_prefix
ts = eval(self.abid_ts_src)
uri = eval(self.abid_uri_src)
subtype = eval(self.abid_subtype_src)
rand = eval(self.abid_rand_src)
if (not prefix) or prefix == 'obj_':
suggested_abid = self.__class__.__name__[:3].lower()
raise Exception(f'{self.__class__.__name__}.abid_prefix must be defined to calculate ABIDs (suggested: {suggested_abid})')
if not ts:
ts = datetime.utcfromtimestamp(0)
print(f'[!] WARNING: Generating ABID with ts=0000000000 placeholder because {self.__class__.__name__}.abid_ts_src={self.abid_ts_src} is unset!', ts.isoformat())
if not uri:
uri = str(self)
print(f'[!] WARNING: Generating ABID with uri=str(self) placeholder because {self.__class__.__name__}.abid_uri_src={self.abid_uri_src} is unset!', uri)
if not subtype:
subtype = self.__class__.__name__
print(f'[!] WARNING: Generating ABID with subtype={subtype} placeholder because {self.__class__.__name__}.abid_subtype_src={self.abid_subtype_src} is unset!', subtype)
if not rand:
rand = getattr(self, 'uuid', None) or getattr(self, 'id', None) or getattr(self, 'pk')
print(f'[!] WARNING: Generating ABID with rand=self.id placeholder because {self.__class__.__name__}.abid_rand_src={self.abid_rand_src} is unset!', rand)
abid = abid_from_values(
prefix=prefix,
ts=ts,
uri=uri,
subtype=subtype,
rand=rand,
salt=DEFAULT_ABID_URI_SALT,
)
assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}'
return abid
def update_snapshot_ids(apps, schema_editor):
Snapshot = apps.get_model("core", "Snapshot")
num_total = Snapshot.objects.all().count()
print(f' Updating {num_total} Snapshot.id, Snapshot.uuid values in place...')
for idx, snapshot in enumerate(Snapshot.objects.all().only('abid').iterator(chunk_size=500)):
assert snapshot.abid
snapshot.abid_prefix = 'snp_'
snapshot.abid_ts_src = 'self.added'
snapshot.abid_uri_src = 'self.url'
snapshot.abid_subtype_src = '"01"'
snapshot.abid_rand_src = 'self.uuid'
snapshot.abid = calculate_abid(snapshot)
snapshot.uuid = snapshot.abid.uuid
snapshot.save(update_fields=["abid", "uuid"])
assert str(ABID.parse(snapshot.abid).uuid) == str(snapshot.uuid)
if idx % 1000 == 0:
print(f'Migrated {idx}/{num_total} Snapshot objects...')
def update_archiveresult_ids(apps, schema_editor):
Snapshot = apps.get_model("core", "Snapshot")
ArchiveResult = apps.get_model("core", "ArchiveResult")
num_total = ArchiveResult.objects.all().count()
print(f' Updating {num_total} ArchiveResult.id, ArchiveResult.uuid values in place... (may take an hour or longer for large collections...)')
for idx, result in enumerate(ArchiveResult.objects.all().only('abid', 'snapshot_id').iterator(chunk_size=500)):
assert result.abid
result.abid_prefix = 'res_'
result.snapshot = Snapshot.objects.get(pk=result.snapshot_id)
result.snapshot_added = result.snapshot.added
result.snapshot_url = result.snapshot.url
result.abid_ts_src = 'self.snapshot_added'
result.abid_uri_src = 'self.snapshot_url'
result.abid_subtype_src = 'self.extractor'
result.abid_rand_src = 'self.id'
result.abid = calculate_abid(result)
result.uuid = result.abid.uuid
result.uuid = ABID.parse(result.abid).uuid
result.save(update_fields=["abid", "uuid"])
assert str(ABID.parse(result.abid).uuid) == str(result.uuid)
if idx % 5000 == 0:
print(f'Migrated {idx}/{num_total} ArchiveResult objects...')
class Migration(migrations.Migration):
dependencies = [
('core', '0026_archiveresult_created_archiveresult_created_by_and_more'),
]
operations = [
migrations.RunPython(update_snapshot_ids, reverse_code=migrations.RunPython.noop),
migrations.RunPython(update_archiveresult_ids, reverse_code=migrations.RunPython.noop),
]

View File

@@ -1,19 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-18 04:28
import uuid
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0027_update_snapshot_ids'),
]
operations = [
migrations.AlterField(
model_name='archiveresult',
name='uuid',
field=models.UUIDField(default=uuid.uuid4),
),
]

View File

@@ -1,18 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-18 04:28
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0028_alter_archiveresult_uuid'),
]
operations = [
migrations.AlterField(
model_name='archiveresult',
name='id',
field=models.BigIntegerField(primary_key=True, serialize=False, verbose_name='ID'),
),
]

View File

@@ -1,18 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-18 05:00
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0029_alter_archiveresult_id'),
]
operations = [
migrations.AlterField(
model_name='archiveresult',
name='uuid',
field=models.UUIDField(unique=True),
),
]

View File

@@ -1,34 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-18 05:09
import uuid
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0030_alter_archiveresult_uuid'),
]
operations = [
migrations.AlterField(
model_name='archiveresult',
name='id',
field=models.IntegerField(default=uuid.uuid4, primary_key=True, serialize=False, verbose_name='ID'),
),
migrations.AlterField(
model_name='archiveresult',
name='uuid',
field=models.UUIDField(default=uuid.uuid4, unique=True),
),
migrations.AlterField(
model_name='snapshot',
name='uuid',
field=models.UUIDField(default=uuid.uuid4, unique=True),
),
migrations.AlterField(
model_name='tag',
name='uuid',
field=models.UUIDField(default=uuid.uuid4, null=True, unique=True),
),
]

View File

@@ -1,23 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-18 05:20
import core.models
import random
from django.db import migrations, models
def rand_int_id():
return random.getrandbits(32)
class Migration(migrations.Migration):
dependencies = [
('core', '0031_alter_archiveresult_id_alter_archiveresult_uuid_and_more'),
]
operations = [
migrations.AlterField(
model_name='archiveresult',
name='id',
field=models.BigIntegerField(default=rand_int_id, primary_key=True, serialize=False, verbose_name='ID'),
),
]

View File

@@ -1,18 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-18 05:34
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('core', '0032_alter_archiveresult_id'),
]
operations = [
migrations.RenameField(
model_name='archiveresult',
old_name='id',
new_name='old_id',
),
]

View File

@@ -1,45 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-18 05:37
import uuid
import random
from django.db import migrations, models
from archivebox.base_models.abid import ABID
def rand_int_id():
return random.getrandbits(32)
def update_archiveresult_ids(apps, schema_editor):
ArchiveResult = apps.get_model("core", "ArchiveResult")
num_total = ArchiveResult.objects.all().count()
print(f' Updating {num_total} ArchiveResult.id, ArchiveResult.uuid values in place... (may take an hour or longer for large collections...)')
for idx, result in enumerate(ArchiveResult.objects.all().only('abid').iterator(chunk_size=500)):
assert result.abid
result.uuid = ABID.parse(result.abid).uuid
result.save(update_fields=["uuid"])
assert str(ABID.parse(result.abid).uuid) == str(result.uuid)
if idx % 2500 == 0:
print(f'Migrated {idx}/{num_total} ArchiveResult objects...')
class Migration(migrations.Migration):
dependencies = [
('core', '0033_rename_id_archiveresult_old_id'),
]
operations = [
migrations.AlterField(
model_name='archiveresult',
name='old_id',
field=models.BigIntegerField(default=rand_int_id, serialize=False, verbose_name='ID'),
),
migrations.RunPython(update_archiveresult_ids, reverse_code=migrations.RunPython.noop),
migrations.AlterField(
model_name='archiveresult',
name='uuid',
field=models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False, unique=True),
),
]

View File

@@ -1,19 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-18 05:49
import uuid
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0034_alter_archiveresult_old_id_alter_archiveresult_uuid'),
]
operations = [
migrations.RenameField(
model_name='archiveresult',
old_name='uuid',
new_name='id',
),
]

View File

@@ -1,29 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-18 05:59
import core.models
import uuid
import random
from django.db import migrations, models
def rand_int_id():
return random.getrandbits(32)
class Migration(migrations.Migration):
dependencies = [
('core', '0035_remove_archiveresult_uuid_archiveresult_id'),
]
operations = [
migrations.AlterField(
model_name='archiveresult',
name='id',
field=models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False, unique=True, verbose_name='ID'),
),
migrations.AlterField(
model_name='archiveresult',
name='old_id',
field=models.BigIntegerField(default=rand_int_id, serialize=False, verbose_name='Old ID'),
),
]

View File

@@ -1,18 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-18 06:08
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('core', '0036_alter_archiveresult_id_alter_archiveresult_old_id'),
]
operations = [
migrations.RenameField(
model_name='snapshot',
old_name='id',
new_name='old_id',
),
]

View File

@@ -1,18 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-18 06:09
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('core', '0037_rename_id_snapshot_old_id'),
]
operations = [
migrations.RenameField(
model_name='snapshot',
old_name='uuid',
new_name='id',
),
]

View File

@@ -1,18 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-18 06:25
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('core', '0038_rename_uuid_snapshot_id'),
]
operations = [
migrations.RenameField(
model_name='archiveresult',
old_name='snapshot',
new_name='snapshot_old',
),
]

View File

@@ -1,34 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-18 06:46
import django.db.models.deletion
from django.db import migrations, models
def update_archiveresult_snapshot_ids(apps, schema_editor):
ArchiveResult = apps.get_model("core", "ArchiveResult")
Snapshot = apps.get_model("core", "Snapshot")
num_total = ArchiveResult.objects.all().count()
print(f' Updating {num_total} ArchiveResult.snapshot_id values in place... (may take an hour or longer for large collections...)')
for idx, result in enumerate(ArchiveResult.objects.all().only('snapshot_old_id').iterator(chunk_size=5000)):
assert result.snapshot_old_id
snapshot = Snapshot.objects.only('id').get(old_id=result.snapshot_old_id)
result.snapshot_id = snapshot.id
result.save(update_fields=["snapshot_id"])
assert str(result.snapshot_id) == str(snapshot.id)
if idx % 5000 == 0:
print(f'Migrated {idx}/{num_total} ArchiveResult objects...')
class Migration(migrations.Migration):
dependencies = [
('core', '0039_rename_snapshot_archiveresult_snapshot_old'),
]
operations = [
migrations.AddField(
model_name='archiveresult',
name='snapshot',
field=models.ForeignKey(null=True, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresults', to='core.snapshot', to_field='id'),
),
migrations.RunPython(update_archiveresult_snapshot_ids, reverse_code=migrations.RunPython.noop),
]

View File

@@ -1,24 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-18 06:50
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0040_archiveresult_snapshot'),
]
operations = [
migrations.AlterField(
model_name='archiveresult',
name='snapshot',
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='id'),
),
migrations.AlterField(
model_name='archiveresult',
name='snapshot_old',
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='archiveresults_old', to='core.snapshot'),
),
]

View File

@@ -1,17 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-18 06:51
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('core', '0041_alter_archiveresult_snapshot_and_more'),
]
operations = [
migrations.RemoveField(
model_name='archiveresult',
name='snapshot_old',
),
]

View File

@@ -1,20 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-18 06:52
import django.db.models.deletion
import uuid
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0042_remove_archiveresult_snapshot_old'),
]
operations = [
migrations.AlterField(
model_name='archiveresult',
name='snapshot',
field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='id'),
),
]

View File

@@ -1,40 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-19 23:01
import django.db.models.deletion
import uuid
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0043_alter_archiveresult_snapshot_alter_snapshot_id_and_more'),
]
operations = [
migrations.SeparateDatabaseAndState(
database_operations=[
# No-op, SnapshotTag model already exists in DB
],
state_operations=[
migrations.CreateModel(
name='SnapshotTag',
fields=[
('id', models.AutoField(primary_key=True, serialize=False)),
('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.snapshot')),
('tag', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.tag')),
],
options={
'db_table': 'core_snapshot_tags',
'unique_together': {('snapshot', 'tag')},
},
),
migrations.AlterField(
model_name='snapshot',
name='tags',
field=models.ManyToManyField(blank=True, related_name='snapshot_set', through='core.SnapshotTag', to='core.tag'),
),
],
),
]

View File

@@ -1,19 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-20 01:54
import uuid
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0044_alter_archiveresult_snapshot_alter_tag_uuid_and_more'),
]
operations = [
migrations.AlterField(
model_name='snapshot',
name='old_id',
field=models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False, unique=True),
),
]

View File

@@ -1,30 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-20 01:55
import django.db.models.deletion
import uuid
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0045_alter_snapshot_old_id'),
]
operations = [
migrations.AlterField(
model_name='archiveresult',
name='snapshot',
field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='id'),
),
migrations.AlterField(
model_name='snapshot',
name='id',
field=models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='snapshot',
name='old_id',
field=models.UUIDField(default=uuid.uuid4, editable=False, unique=True),
),
]

View File

@@ -1,24 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-20 02:16
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0046_alter_archiveresult_snapshot_alter_snapshot_id_and_more'),
]
operations = [
migrations.AlterField(
model_name='archiveresult',
name='snapshot',
field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='id'),
),
migrations.AlterField(
model_name='snapshottag',
name='tag',
field=models.ForeignKey(db_column='tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag'),
),
]

View File

@@ -1,24 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-20 02:17
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0047_alter_snapshottag_unique_together_and_more'),
]
operations = [
migrations.AlterField(
model_name='archiveresult',
name='snapshot',
field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot'),
),
migrations.AlterField(
model_name='snapshottag',
name='snapshot',
field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='old_id'),
),
]

View File

@@ -1,22 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-20 02:26
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('core', '0048_alter_archiveresult_snapshot_and_more'),
]
operations = [
migrations.RenameField(
model_name='snapshottag',
old_name='snapshot',
new_name='snapshot_old',
),
migrations.AlterUniqueTogether(
name='snapshottag',
unique_together={('snapshot_old', 'tag')},
),
]

View File

@@ -1,19 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-20 02:30
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0049_rename_snapshot_snapshottag_snapshot_old_and_more'),
]
operations = [
migrations.AlterField(
model_name='snapshottag',
name='snapshot_old',
field=models.ForeignKey(db_column='snapshot_old_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot', to_field='old_id'),
),
]

View File

@@ -1,40 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-20 02:31
import django.db.models.deletion
from django.db import migrations, models
def update_snapshottag_ids(apps, schema_editor):
Snapshot = apps.get_model("core", "Snapshot")
SnapshotTag = apps.get_model("core", "SnapshotTag")
num_total = SnapshotTag.objects.all().count()
print(f' Updating {num_total} SnapshotTag.snapshot_id values in place... (may take an hour or longer for large collections...)')
for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('snapshot_old_id').iterator(chunk_size=500)):
assert snapshottag.snapshot_old_id
snapshot = Snapshot.objects.get(old_id=snapshottag.snapshot_old_id)
snapshottag.snapshot_id = snapshot.id
snapshottag.save(update_fields=["snapshot_id"])
assert str(snapshottag.snapshot_id) == str(snapshot.id)
if idx % 100 == 0:
print(f'Migrated {idx}/{num_total} SnapshotTag objects...')
class Migration(migrations.Migration):
dependencies = [
('core', '0050_alter_snapshottag_snapshot_old'),
]
operations = [
migrations.AddField(
model_name='snapshottag',
name='snapshot',
field=models.ForeignKey(blank=True, db_column='snapshot_id', null=True, on_delete=django.db.models.deletion.CASCADE, to='core.snapshot'),
),
migrations.AlterField(
model_name='snapshottag',
name='snapshot_old',
field=models.ForeignKey(db_column='snapshot_old_id', on_delete=django.db.models.deletion.CASCADE, related_name='snapshottag_old_set', to='core.snapshot', to_field='old_id'),
),
migrations.RunPython(update_snapshottag_ids, reverse_code=migrations.RunPython.noop),
]

View File

@@ -1,27 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-20 02:37
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0051_snapshottag_snapshot_alter_snapshottag_snapshot_old'),
]
operations = [
migrations.AlterUniqueTogether(
name='snapshottag',
unique_together=set(),
),
migrations.AlterField(
model_name='snapshottag',
name='snapshot',
field=models.ForeignKey(db_column='snapshot_id', on_delete=django.db.models.deletion.CASCADE, to='core.snapshot'),
),
migrations.AlterUniqueTogether(
name='snapshottag',
unique_together={('snapshot', 'tag')},
),
]

View File

@@ -1,17 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-20 02:38
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('core', '0052_alter_snapshottag_unique_together_and_more'),
]
operations = [
migrations.RemoveField(
model_name='snapshottag',
name='snapshot_old',
),
]

View File

@@ -1,18 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-20 02:40
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0053_remove_snapshottag_snapshot_old'),
]
operations = [
migrations.AlterField(
model_name='snapshot',
name='timestamp',
field=models.CharField(db_index=True, editable=False, max_length=32, unique=True),
),
]

View File

@@ -1,18 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-20 03:24
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0054_alter_snapshot_timestamp'),
]
operations = [
migrations.AlterField(
model_name='tag',
name='slug',
field=models.SlugField(editable=False, max_length=100, unique=True),
),
]

View File

@@ -1,17 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-20 03:25
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('core', '0055_alter_tag_slug'),
]
operations = [
migrations.RemoveField(
model_name='tag',
name='uuid',
),
]

View File

@@ -1,18 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-20 03:29
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('core', '0056_remove_tag_uuid'),
]
operations = [
migrations.RenameField(
model_name='tag',
old_name='id',
new_name='old_id',
),
]

View File

@@ -1,22 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-20 03:30
import random
from django.db import migrations, models
def rand_int_id():
return random.getrandbits(32)
class Migration(migrations.Migration):
dependencies = [
('core', '0057_rename_id_tag_old_id'),
]
operations = [
migrations.AlterField(
model_name='tag',
name='old_id',
field=models.BigIntegerField(default=rand_int_id, primary_key=True, serialize=False, verbose_name='Old ID'),
),
]

View File

@@ -1,90 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-20 03:33
from datetime import datetime
from django.db import migrations, models
from archivebox.base_models.abid import abid_from_values
from archivebox.base_models.models import ABID
def calculate_abid(self):
"""
Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src).
"""
prefix = self.abid_prefix
ts = eval(self.abid_ts_src)
uri = eval(self.abid_uri_src)
subtype = eval(self.abid_subtype_src)
rand = eval(self.abid_rand_src)
if (not prefix) or prefix == 'obj_':
suggested_abid = self.__class__.__name__[:3].lower()
raise Exception(f'{self.__class__.__name__}.abid_prefix must be defined to calculate ABIDs (suggested: {suggested_abid})')
if not ts:
ts = datetime.utcfromtimestamp(0)
print(f'[!] WARNING: Generating ABID with ts=0000000000 placeholder because {self.__class__.__name__}.abid_ts_src={self.abid_ts_src} is unset!', ts.isoformat())
if not uri:
uri = str(self)
print(f'[!] WARNING: Generating ABID with uri=str(self) placeholder because {self.__class__.__name__}.abid_uri_src={self.abid_uri_src} is unset!', uri)
if not subtype:
subtype = self.__class__.__name__
print(f'[!] WARNING: Generating ABID with subtype={subtype} placeholder because {self.__class__.__name__}.abid_subtype_src={self.abid_subtype_src} is unset!', subtype)
if not rand:
rand = getattr(self, 'uuid', None) or getattr(self, 'id', None) or getattr(self, 'pk')
print(f'[!] WARNING: Generating ABID with rand=self.id placeholder because {self.__class__.__name__}.abid_rand_src={self.abid_rand_src} is unset!', rand)
abid = abid_from_values(
prefix=prefix,
ts=ts,
uri=uri,
subtype=subtype,
rand=rand,
)
assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}'
return abid
def update_archiveresult_ids(apps, schema_editor):
Tag = apps.get_model("core", "Tag")
num_total = Tag.objects.all().count()
print(f' Updating {num_total} Tag.id, ArchiveResult.uuid values in place...')
for idx, tag in enumerate(Tag.objects.all().iterator(chunk_size=500)):
if not tag.slug:
tag.slug = tag.name.lower().replace(' ', '_')
if not tag.name:
tag.name = tag.slug
if not (tag.name or tag.slug):
tag.delete()
continue
assert tag.slug or tag.name, f'Tag.slug must be defined! You have a Tag(id={tag.pk}) missing a slug!'
tag.abid_prefix = 'tag_'
tag.abid_ts_src = 'self.created'
tag.abid_uri_src = 'self.slug'
tag.abid_subtype_src = '"03"'
tag.abid_rand_src = 'self.old_id'
tag.abid = calculate_abid(tag)
tag.id = tag.abid.uuid
tag.save(update_fields=["abid", "id", "name", "slug"])
assert str(ABID.parse(tag.abid).uuid) == str(tag.id)
if idx % 10 == 0:
print(f'Migrated {idx}/{num_total} Tag objects...')
class Migration(migrations.Migration):
dependencies = [
('core', '0058_alter_tag_old_id'),
]
operations = [
migrations.AddField(
model_name='tag',
name='id',
field=models.UUIDField(blank=True, null=True),
),
migrations.RunPython(update_archiveresult_ids, reverse_code=migrations.RunPython.noop),
]

View File

@@ -1,19 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-20 03:42
import uuid
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0059_tag_id'),
]
operations = [
migrations.AlterField(
model_name='tag',
name='id',
field=models.UUIDField(default=uuid.uuid4, editable=False, unique=True),
),
]

View File

@@ -1,22 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-20 03:43
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('core', '0060_alter_tag_id'),
]
operations = [
migrations.RenameField(
model_name='snapshottag',
old_name='tag',
new_name='old_tag',
),
migrations.AlterUniqueTogether(
name='snapshottag',
unique_together={('snapshot', 'old_tag')},
),
]

View File

@@ -1,19 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-20 03:44
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0061_rename_tag_snapshottag_old_tag_and_more'),
]
operations = [
migrations.AlterField(
model_name='snapshottag',
name='old_tag',
field=models.ForeignKey(db_column='old_tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag'),
),
]

View File

@@ -1,40 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-20 03:45
import django.db.models.deletion
from django.db import migrations, models
def update_snapshottag_ids(apps, schema_editor):
Tag = apps.get_model("core", "Tag")
SnapshotTag = apps.get_model("core", "SnapshotTag")
num_total = SnapshotTag.objects.all().count()
print(f' Updating {num_total} SnapshotTag.tag_id values in place... (may take an hour or longer for large collections...)')
for idx, snapshottag in enumerate(SnapshotTag.objects.all().only('old_tag_id').iterator(chunk_size=500)):
assert snapshottag.old_tag_id
tag = Tag.objects.get(old_id=snapshottag.old_tag_id)
snapshottag.tag_id = tag.id
snapshottag.save(update_fields=["tag_id"])
assert str(snapshottag.tag_id) == str(tag.id)
if idx % 100 == 0:
print(f'Migrated {idx}/{num_total} SnapshotTag objects...')
class Migration(migrations.Migration):
dependencies = [
('core', '0062_alter_snapshottag_old_tag'),
]
operations = [
migrations.AddField(
model_name='snapshottag',
name='tag',
field=models.ForeignKey(blank=True, db_column='tag_id', null=True, on_delete=django.db.models.deletion.CASCADE, to='core.tag', to_field='id'),
),
migrations.AlterField(
model_name='snapshottag',
name='old_tag',
field=models.ForeignKey(db_column='old_tag_id', on_delete=django.db.models.deletion.CASCADE, related_name='snapshottags_old', to='core.tag'),
),
migrations.RunPython(update_snapshottag_ids, reverse_code=migrations.RunPython.noop),
]

View File

@@ -1,27 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-20 03:50
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0063_snapshottag_tag_alter_snapshottag_old_tag'),
]
operations = [
migrations.AlterUniqueTogether(
name='snapshottag',
unique_together=set(),
),
migrations.AlterField(
model_name='snapshottag',
name='tag',
field=models.ForeignKey(db_column='tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag', to_field='id'),
),
migrations.AlterUniqueTogether(
name='snapshottag',
unique_together={('snapshot', 'tag')},
),
]

View File

@@ -1,17 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-20 03:51
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('core', '0064_alter_snapshottag_unique_together_and_more'),
]
operations = [
migrations.RemoveField(
model_name='snapshottag',
name='old_tag',
),
]

View File

@@ -1,34 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-20 03:52
import core.models
import django.db.models.deletion
import uuid
import random
from django.db import migrations, models
def rand_int_id():
return random.getrandbits(32)
class Migration(migrations.Migration):
dependencies = [
('core', '0065_remove_snapshottag_old_tag'),
]
operations = [
migrations.AlterField(
model_name='snapshottag',
name='tag',
field=models.ForeignKey(db_column='tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag', to_field='id'),
),
migrations.AlterField(
model_name='tag',
name='id',
field=models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='tag',
name='old_id',
field=models.BigIntegerField(default=rand_int_id, serialize=False, unique=True, verbose_name='Old ID'),
),
]

View File

@@ -1,19 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-20 03:53
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id'),
]
operations = [
migrations.AlterField(
model_name='snapshottag',
name='tag',
field=models.ForeignKey(db_column='tag_id', on_delete=django.db.models.deletion.CASCADE, to='core.tag'),
),
]

View File

@@ -1,17 +0,0 @@
# Generated by Django 5.0.6 on 2024-08-20 07:26
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('core', '0067_alter_snapshottag_tag'),
]
operations = [
migrations.AlterModelOptions(
name='archiveresult',
options={'verbose_name': 'Archive Result', 'verbose_name_plural': 'Archive Results Log'},
),
]

View File

@@ -1,36 +0,0 @@
# Generated by Django 5.1 on 2024-08-28 09:40
import django.utils.timezone
from django.db import migrations
import archivebox.base_models.models
class Migration(migrations.Migration):
dependencies = [
('core', '0068_alter_archiveresult_options'),
]
operations = [
migrations.AlterField(
model_name='archiveresult',
name='created',
field=archivebox.base_models.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AlterField(
model_name='snapshot',
name='added',
field=archivebox.base_models.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AlterField(
model_name='snapshot',
name='created',
field=archivebox.base_models.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AlterField(
model_name='tag',
name='created',
field=archivebox.base_models.models.AutoDateTimeField(db_index=True, default=django.utils.timezone.now),
),
]

View File

@@ -1,53 +0,0 @@
# Generated by Django 5.1 on 2024-09-04 09:00
import django.db.models.deletion
from django.conf import settings
from django.db import migrations, models
import archivebox.base_models.models
class Migration(migrations.Migration):
dependencies = [
('core', '0069_alter_archiveresult_created_alter_snapshot_added_and_more'),
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
operations = [
migrations.AlterField(
model_name='archiveresult',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresult_set', to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='snapshot',
name='added',
field=archivebox.base_models.models.AutoDateTimeField(db_index=True, default=None),
),
migrations.AlterField(
model_name='snapshot',
name='created',
field=archivebox.base_models.models.AutoDateTimeField(db_index=True, default=None),
),
migrations.AlterField(
model_name='snapshot',
name='created_by',
field=models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='snapshot',
name='id',
field=models.UUIDField(default=None, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='snapshot',
name='old_id',
field=models.UUIDField(default=None, editable=False, unique=True),
),
migrations.AlterField(
model_name='tag',
name='created',
field=archivebox.base_models.models.AutoDateTimeField(db_index=True, default=None),
),
]

View File

@@ -1,66 +0,0 @@
# Generated by Django 5.1 on 2024-09-04 23:23
import django.db.models.deletion
from django.conf import settings
from django.db import migrations, models
import archivebox.base_models.models
class Migration(migrations.Migration):
dependencies = [
('core', '0070_alter_archiveresult_created_by_alter_snapshot_added_and_more'),
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
operations = [
migrations.RemoveField(
model_name='archiveresult',
name='old_id',
),
migrations.RemoveField(
model_name='snapshot',
name='old_id',
),
migrations.RemoveField(
model_name='tag',
name='old_id',
),
migrations.AlterField(
model_name='archiveresult',
name='created',
field=archivebox.base_models.models.AutoDateTimeField(db_index=True, default=None),
),
migrations.AlterField(
model_name='archiveresult',
name='created_by',
field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='archiveresult_set', to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='archiveresult',
name='id',
field=models.UUIDField(default=None, editable=False, primary_key=True, serialize=False, unique=True, verbose_name='ID'),
),
migrations.AlterField(
model_name='snapshot',
name='created_by',
field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='snapshot',
name='id',
field=models.UUIDField(default=None, editable=False, primary_key=True, serialize=False, unique=True, verbose_name='ID'),
),
migrations.AlterField(
model_name='tag',
name='created_by',
field=models.ForeignKey(default=None, on_delete=django.db.models.deletion.CASCADE, related_name='tag_set', to=settings.AUTH_USER_MODEL),
),
migrations.AlterField(
model_name='tag',
name='id',
field=models.UUIDField(default=None, editable=False, primary_key=True, serialize=False, unique=True, verbose_name='ID'),
),
]

View File

@@ -1,23 +0,0 @@
# Generated by Django 5.1 on 2024-09-05 00:05
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('core', '0071_remove_archiveresult_old_id_remove_snapshot_old_id_and_more'),
]
operations = [
migrations.RenameField(
model_name='snapshot',
old_name='added',
new_name='bookmarked_at',
),
migrations.RenameField(
model_name='snapshot',
old_name='updated',
new_name='downloaded_at',
),
]

View File

@@ -1,43 +0,0 @@
# Generated by Django 5.1 on 2024-09-05 00:25
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('core', '0072_rename_added_snapshot_bookmarked_at_and_more'),
]
operations = [
migrations.RenameField(
model_name='archiveresult',
old_name='created',
new_name='created_at',
),
migrations.RenameField(
model_name='archiveresult',
old_name='modified',
new_name='modified_at',
),
migrations.RenameField(
model_name='snapshot',
old_name='created',
new_name='created_at',
),
migrations.RenameField(
model_name='snapshot',
old_name='modified',
new_name='modified_at',
),
migrations.RenameField(
model_name='tag',
old_name='created',
new_name='created_at',
),
migrations.RenameField(
model_name='tag',
old_name='modified',
new_name='modified_at',
),
]

View File

@@ -1,18 +0,0 @@
# Generated by Django 5.1 on 2024-09-05 01:24
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0073_rename_created_archiveresult_created_at_and_more'),
]
operations = [
migrations.AlterField(
model_name='snapshot',
name='downloaded_at',
field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
),
]

View File

@@ -1,7 +1,8 @@
__package__ = 'archivebox.core'
from typing import Optional, Dict, Iterable, Any
from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING
from uuid import uuid7
from datetime import datetime, timedelta
from django_stubs_ext.db.models import TypedModelMeta
import os
@@ -18,15 +19,11 @@ from django.urls import reverse, reverse_lazy
from django.contrib import admin
from django.conf import settings
import abx
from archivebox.config import CONSTANTS
from archivebox.misc.system import get_dir_size
from archivebox.misc.util import parse_date, base_url, domain as url_domain
from archivebox.misc.system import get_dir_size, atomic_write
from archivebox.misc.util import parse_date, base_url, domain as url_domain, to_json, ts_to_date_str, urlencode, htmlencode, urldecode
from archivebox.misc.hashing import get_dir_info
from archivebox.index.schema import Link
from archivebox.index.html import snapshot_icons
from archivebox.extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE
from archivebox.hooks import ARCHIVE_METHODS_INDEXING_PRECEDENCE
from archivebox.base_models.models import (
ModelWithUUID, ModelWithSerializers, ModelWithOutputDir,
ModelWithConfig, ModelWithNotes, ModelWithHealthStats,
@@ -38,6 +35,7 @@ from crawls.models import Crawl
from machine.models import NetworkInterface
class Tag(ModelWithSerializers):
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False, related_name='tag_set')
@@ -94,8 +92,181 @@ class SnapshotManager(models.Manager):
def get_queryset(self):
return super().get_queryset().prefetch_related('tags', 'archiveresult_set')
# =========================================================================
# Filtering Methods
# =========================================================================
class Snapshot(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
FILTER_TYPES = {
'exact': lambda pattern: models.Q(url=pattern),
'substring': lambda pattern: models.Q(url__icontains=pattern),
'regex': lambda pattern: models.Q(url__iregex=pattern),
'domain': lambda pattern: models.Q(url__istartswith=f"http://{pattern}") | models.Q(url__istartswith=f"https://{pattern}") | models.Q(url__istartswith=f"ftp://{pattern}"),
'tag': lambda pattern: models.Q(tags__name=pattern),
'timestamp': lambda pattern: models.Q(timestamp=pattern),
}
def filter_by_patterns(self, patterns: List[str], filter_type: str = 'exact') -> QuerySet:
"""Filter snapshots by URL patterns using specified filter type"""
from archivebox.misc.logging import stderr
q_filter = models.Q()
for pattern in patterns:
try:
q_filter = q_filter | self.FILTER_TYPES[filter_type](pattern)
except KeyError:
stderr()
stderr(f'[X] Got invalid pattern for --filter-type={filter_type}:', color='red')
stderr(f' {pattern}')
raise SystemExit(2)
return self.filter(q_filter)
def search(self, patterns: List[str]) -> QuerySet:
"""Search snapshots using the configured search backend"""
from archivebox.config.common import SEARCH_BACKEND_CONFIG
from archivebox.search import query_search_index
from archivebox.misc.logging import stderr
if not SEARCH_BACKEND_CONFIG.USE_SEARCHING_BACKEND:
stderr()
stderr('[X] The search backend is not enabled, set config.USE_SEARCHING_BACKEND = True', color='red')
raise SystemExit(2)
qsearch = self.none()
for pattern in patterns:
try:
qsearch |= query_search_index(pattern)
except Exception:
raise SystemExit(2)
return self.all() & qsearch
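# Illustrative usage sketch for the filtering/search helpers above (assumes a configured
# Django environment; call sites shown here are examples, not taken from the codebase):
#   Snapshot.objects.filter_by_patterns(['example.com'], filter_type='domain')
#   Snapshot.objects.filter_by_patterns([r'^https://docs\.'], filter_type='regex')
#   Snapshot.objects.search(['full text query'])   # requires USE_SEARCHING_BACKEND=True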
# =========================================================================
# Export Methods
# =========================================================================
def to_json(self, with_headers: bool = False) -> str:
"""Generate JSON index from snapshots"""
import sys
from datetime import datetime, timezone as tz
from archivebox.config import VERSION
from archivebox.config.common import SERVER_CONFIG
MAIN_INDEX_HEADER = {
'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
'schema': 'archivebox.index.json',
'copyright_info': SERVER_CONFIG.FOOTER_INFO,
'meta': {
'project': 'ArchiveBox',
'version': VERSION,
'git_sha': VERSION,
'website': 'https://ArchiveBox.io',
'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
'source': 'https://github.com/ArchiveBox/ArchiveBox',
'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
'dependencies': {},
},
} if with_headers else {}
snapshot_dicts = [s.to_dict(extended=True) for s in self.iterator(chunk_size=500)]
if with_headers:
output = {
**MAIN_INDEX_HEADER,
'num_links': len(snapshot_dicts),
'updated': datetime.now(tz.utc),
'last_run_cmd': sys.argv,
'links': snapshot_dicts,
}
else:
output = snapshot_dicts
return to_json(output, indent=4, sort_keys=True)
def to_csv(self, cols: Optional[List[str]] = None, header: bool = True, separator: str = ',', ljust: int = 0) -> str:
"""Generate CSV output from snapshots"""
cols = cols or ['timestamp', 'is_archived', 'url']
header_str = separator.join(col.ljust(ljust) for col in cols) if header else ''
row_strs = (s.to_csv(cols=cols, ljust=ljust, separator=separator) for s in self.iterator(chunk_size=500))
return '\n'.join((header_str, *row_strs))
def to_html(self, with_headers: bool = True) -> str:
"""Generate main index HTML from snapshots"""
from datetime import datetime, timezone as tz
from django.template.loader import render_to_string
from archivebox.config import VERSION
from archivebox.config.common import SERVER_CONFIG
from archivebox.config.version import get_COMMIT_HASH
template = 'static_index.html' if with_headers else 'minimal_index.html'
snapshot_list = list(self.iterator(chunk_size=500))
return render_to_string(template, {
'version': VERSION,
'git_sha': get_COMMIT_HASH() or VERSION,
'num_links': str(len(snapshot_list)),
'date_updated': datetime.now(tz.utc).strftime('%Y-%m-%d'),
'time_updated': datetime.now(tz.utc).strftime('%Y-%m-%d %H:%M'),
'links': snapshot_list,
'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO,
})
# =========================================================================
# Import Methods
# =========================================================================
def create_or_update_from_dict(self, link_dict: Dict[str, Any], created_by_id: Optional[int] = None) -> 'Snapshot':
"""Create or update a Snapshot from a SnapshotDict (parser output)"""
import re
from archivebox.config.common import GENERAL_CONFIG
url = link_dict['url']
timestamp = link_dict.get('timestamp')
title = link_dict.get('title')
tags_str = link_dict.get('tags')
tag_list = []
if tags_str:
tag_list = list(dict.fromkeys(
tag.strip() for tag in re.split(GENERAL_CONFIG.TAG_SEPARATOR_PATTERN, tags_str)
if tag.strip()
))
try:
snapshot = self.get(url=url)
if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
snapshot.title = title
snapshot.save(update_fields=['title', 'modified_at'])
except self.model.DoesNotExist:
if timestamp:
while self.filter(timestamp=timestamp).exists():
timestamp = str(float(timestamp) + 1.0)
snapshot = self.create(
url=url,
timestamp=timestamp,
title=title,
created_by_id=created_by_id or get_or_create_system_user_pk(),
)
if tag_list:
existing_tags = set(snapshot.tags.values_list('name', flat=True))
new_tags = set(tag_list) | existing_tags
snapshot.save_tags(new_tags)
return snapshot
def create_from_dicts(self, link_dicts: List[Dict[str, Any]], created_by_id: Optional[int] = None) -> List['Snapshot']:
"""Create or update multiple Snapshots from a list of SnapshotDicts"""
return [self.create_or_update_from_dict(d, created_by_id=created_by_id) for d in link_dicts]
def remove(self, atomic: bool = False) -> tuple:
"""Remove snapshots from the database"""
from django.db import transaction
if atomic:
with transaction.atomic():
return self.delete()
return self.delete()
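# Illustrative (assumed) call sites for the export/import helpers above:
#   Snapshot.objects.to_json(with_headers=True)                      # JSON index with header metadata
#   Snapshot.objects.to_csv(cols=['timestamp', 'is_archived', 'url'])
#   Snapshot.objects.to_html(with_headers=False)                     # renders minimal_index.html
#   Snapshot.objects.create_from_dicts(parsed_links, created_by_id=user.id)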
class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='snapshot_set', db_index=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
@@ -108,6 +279,7 @@ class Snapshot(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelW
title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True)
depth = models.PositiveSmallIntegerField(default=0, db_index=True) # 0 for root snapshot, 1+ for discovered URLs
retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
@@ -152,9 +324,6 @@ class Snapshot(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelW
def archive(self, overwrite=False, methods=None):
return bg_archive_snapshot(self, overwrite=overwrite, methods=methods)
def as_link(self) -> Link:
return Link.from_json(self.as_json())
@admin.display(description='Tags')
def tags_str(self, nocache=True) -> str | None:
calc_tags_str = lambda: ','.join(sorted(tag.name for tag in self.tags.all()))
@@ -164,7 +333,55 @@ class Snapshot(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelW
return cache.get_or_set(cache_key, calc_tags_str) if not nocache else calc_tags_str()
def icons(self) -> str:
return snapshot_icons(self)
"""Generate HTML icons showing which extractors have succeeded for this snapshot"""
from django.utils.html import format_html, mark_safe
from collections import defaultdict
cache_key = f'result_icons:{self.pk}:{(self.downloaded_at or self.modified_at or self.created_at or self.bookmarked_at).timestamp()}'
def calc_icons():
if hasattr(self, '_prefetched_objects_cache') and 'archiveresult_set' in self._prefetched_objects_cache:
archive_results = [r for r in self.archiveresult_set.all() if r.status == "succeeded" and r.output]
else:
archive_results = self.archiveresult_set.filter(status="succeeded", output__isnull=False)
path = self.archive_path
canon = self.canonical_outputs()
output = ""
output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> &nbsp;'
icons = {
"singlefile": "", "wget": "🆆", "dom": "🅷", "pdf": "📄",
"screenshot": "💻", "media": "📼", "git": "🅶", "archive_org": "🏛",
"readability": "🆁", "mercury": "🅼", "warc": "📦"
}
exclude = ["favicon", "title", "headers", "htmltotext", "archive_org"]
extractor_outputs = defaultdict(lambda: None)
for extractor, _ in ArchiveResult.EXTRACTOR_CHOICES:
for result in archive_results:
if result.extractor == extractor:
extractor_outputs[extractor] = result
for extractor, _ in ArchiveResult.EXTRACTOR_CHOICES:
if extractor not in exclude:
existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
output += format_html(output_template, path, canon.get(extractor, ''), str(bool(existing)), extractor, icons.get(extractor, "?"))
if extractor == "wget":
exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
output += format_html(output_template, path, canon.get("warc", "warc/"), str(bool(exists)), "warc", icons.get("warc", "?"))
if extractor == "archive_org":
exists = extractor in extractor_outputs and extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon.get("archive_org", ""), str(exists), "archive_org", icons.get("archive_org", "?"))
return format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}</span>', mark_safe(output))
cache_result = cache.get(cache_key)
if cache_result:
return cache_result
fresh_result = calc_icons()
cache.set(cache_key, fresh_result, timeout=60 * 60 * 24)
return fresh_result
@property
def api_url(self) -> str:
@@ -178,7 +395,8 @@ class Snapshot(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelW
return url_domain(self.url)
@cached_property
def link_dir(self):
def output_dir(self):
"""The filesystem path to the snapshot's output directory."""
return str(CONSTANTS.ARCHIVE_DIR / self.timestamp)
@cached_property
@@ -188,7 +406,7 @@ class Snapshot(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelW
@cached_property
def archive_size(self):
try:
return get_dir_size(self.link_dir)[0]
return get_dir_size(self.output_dir)[0]
except Exception:
return 0
@@ -200,20 +418,327 @@ class Snapshot(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelW
def pending_archiveresults(self) -> QuerySet['ArchiveResult']:
return self.archiveresult_set.exclude(status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES)
def run(self) -> list['ArchiveResult']:
"""
Execute this Snapshot by creating ArchiveResults for all enabled extractors.
Called by the state machine when entering the 'started' state.
"""
return self.create_pending_archiveresults()
def create_pending_archiveresults(self) -> list['ArchiveResult']:
ALL_EXTRACTORS = ['favicon', 'title', 'screenshot', 'headers', 'singlefile', 'dom', 'git', 'archive_org', 'readability', 'mercury', 'pdf', 'wget']
"""
Create ArchiveResult records for all enabled extractors.
Uses the hooks system to discover available extractors from:
- archivebox/plugins/*/on_Snapshot__*.{py,sh,js}
- data/plugins/*/on_Snapshot__*.{py,sh,js}
"""
from archivebox.hooks import get_enabled_extractors
extractors = get_enabled_extractors()
archiveresults = []
for extractor in ALL_EXTRACTORS:
for extractor in extractors:
if ArchiveResult.objects.filter(snapshot=self, extractor=extractor).exists():
continue
archiveresult, _ = ArchiveResult.objects.get_or_create(
snapshot=self, extractor=extractor,
defaults={'status': ArchiveResult.INITIAL_STATE, 'retry_at': timezone.now()},
defaults={
'status': ArchiveResult.INITIAL_STATE,
'retry_at': timezone.now(),
'created_by_id': self.created_by_id,
},
)
if archiveresult.status == ArchiveResult.INITIAL_STATE:
archiveresults.append(archiveresult)
return archiveresults
def retry_failed_archiveresults(self, retry_at: Optional['timezone.datetime'] = None) -> int:
"""
Reset failed/skipped ArchiveResults to queued for retry.
This enables seamless retry of the entire extraction pipeline:
- Resets FAILED and SKIPPED results to QUEUED
- Sets retry_at so workers pick them up
- Extractors run in order (numeric prefix)
- Each extractor checks its dependencies at runtime
Dependency handling (e.g., chrome_session → screenshot):
- Extractors check if required outputs exist before running
- If dependency output missing → extractor returns 'skipped'
- On retry, if dependency now succeeds → dependent can run
Returns count of ArchiveResults reset.
"""
retry_at = retry_at or timezone.now()
count = self.archiveresult_set.filter(
status__in=[
ArchiveResult.StatusChoices.FAILED,
ArchiveResult.StatusChoices.SKIPPED,
]
).update(
status=ArchiveResult.StatusChoices.QUEUED,
retry_at=retry_at,
output=None,
start_ts=None,
end_ts=None,
)
# Also reset the snapshot so it gets re-checked
if count > 0:
self.status = self.StatusChoices.STARTED
self.retry_at = retry_at
self.save(update_fields=['status', 'retry_at', 'modified_at'])
return count
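# Sketch of the retry flow enabled by the two methods above: snapshot.run() (called when the
# state machine enters 'started') creates one queued ArchiveResult per enabled extractor, and
# snapshot.retry_failed_archiveresults() later re-queues only the FAILED/SKIPPED rows and bumps
# the Snapshot back to STARTED so workers pick the pipeline up again:
#   pending = snapshot.run()
#   ...extractors execute, some fail or skip...
#   num_requeued = snapshot.retry_failed_archiveresults()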
# =========================================================================
# URL Helper Properties (migrated from Link schema)
# =========================================================================
@cached_property
def url_hash(self) -> str:
from hashlib import sha256
return sha256(self.url.encode()).hexdigest()[:8]
@cached_property
def scheme(self) -> str:
return self.url.split('://')[0]
@cached_property
def path(self) -> str:
parts = self.url.split('://', 1)
return '/' + parts[1].split('/', 1)[1] if len(parts) > 1 and '/' in parts[1] else '/'
@cached_property
def basename(self) -> str:
return self.path.split('/')[-1]
@cached_property
def extension(self) -> str:
basename = self.basename
return basename.split('.')[-1] if '.' in basename else ''
@cached_property
def base_url(self) -> str:
return f'{self.scheme}://{self.domain}'
@cached_property
def is_static(self) -> bool:
static_extensions = {'.pdf', '.jpg', '.jpeg', '.png', '.gif', '.webp', '.svg', '.mp4', '.mp3', '.wav', '.webm'}
return any(self.url.lower().endswith(ext) for ext in static_extensions)
@cached_property
def is_archived(self) -> bool:
output_paths = (
self.domain,
'output.html',
'output.pdf',
'screenshot.png',
'singlefile.html',
'readability/content.html',
'mercury/content.html',
'htmltotext.txt',
'media',
'git',
)
return any((Path(self.output_dir) / path).exists() for path in output_paths)
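# Worked example for the URL helpers above, given url='https://example.com/docs/guide.pdf':
#   scheme='https', base_url='https://example.com', path='/docs/guide.pdf',
#   basename='guide.pdf', extension='pdf', is_static=True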
# =========================================================================
# Date/Time Properties (migrated from Link schema)
# =========================================================================
@cached_property
def bookmarked_date(self) -> Optional[str]:
max_ts = (timezone.now() + timedelta(days=30)).timestamp()
if self.timestamp and self.timestamp.replace('.', '').isdigit():
if 0 < float(self.timestamp) < max_ts:
return self._ts_to_date_str(datetime.fromtimestamp(float(self.timestamp)))
return str(self.timestamp)
return None
@cached_property
def downloaded_datestr(self) -> Optional[str]:
return self._ts_to_date_str(self.downloaded_at) if self.downloaded_at else None
@cached_property
def archive_dates(self) -> List[datetime]:
return [
result.start_ts
for result in self.archiveresult_set.all()
if result.start_ts
]
@cached_property
def oldest_archive_date(self) -> Optional[datetime]:
dates = self.archive_dates
return min(dates) if dates else None
@cached_property
def newest_archive_date(self) -> Optional[datetime]:
dates = self.archive_dates
return max(dates) if dates else None
@cached_property
def num_outputs(self) -> int:
return self.archiveresult_set.filter(status='succeeded').count()
@cached_property
def num_failures(self) -> int:
return self.archiveresult_set.filter(status='failed').count()
# =========================================================================
# Output Path Methods (migrated from Link schema)
# =========================================================================
def canonical_outputs(self) -> Dict[str, Optional[str]]:
"""Predict the expected output paths that should be present after archiving"""
FAVICON_PROVIDER = 'https://www.google.com/s2/favicons?domain={}'
canonical = {
'index_path': 'index.html',
'favicon_path': 'favicon.ico',
'google_favicon_path': FAVICON_PROVIDER.format(self.domain),
'wget_path': f'warc/{self.timestamp}',
'warc_path': 'warc/',
'singlefile_path': 'singlefile.html',
'readability_path': 'readability/content.html',
'mercury_path': 'mercury/content.html',
'htmltotext_path': 'htmltotext.txt',
'pdf_path': 'output.pdf',
'screenshot_path': 'screenshot.png',
'dom_path': 'output.html',
'archive_org_path': f'https://web.archive.org/web/{self.base_url}',
'git_path': 'git/',
'media_path': 'media/',
'headers_path': 'headers.json',
}
if self.is_static:
static_path = f'warc/{self.timestamp}'
canonical.update({
'title': self.basename,
'wget_path': static_path,
'pdf_path': static_path,
'screenshot_path': static_path,
'dom_path': static_path,
'singlefile_path': static_path,
'readability_path': static_path,
'mercury_path': static_path,
'htmltotext_path': static_path,
})
return canonical
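# Sketch of resolving a predicted output against the snapshot's output dir (keys mirror the
# canonical dict defined in this method):
#   canon = snapshot.canonical_outputs()
#   screenshot_file = Path(snapshot.output_dir) / canon['screenshot_path']   # .../screenshot.png
#   wayback_url = canon['archive_org_path']                                  # remote URL, not a local path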
def latest_outputs(self, status: Optional[str] = None) -> Dict[str, Any]:
"""Get the latest output that each archive method produced"""
from archivebox.hooks import get_extractors
latest: Dict[str, Any] = {}
for archive_method in get_extractors():
results = self.archiveresult_set.filter(extractor=archive_method)
if status is not None:
results = results.filter(status=status)
results = results.filter(output__isnull=False).order_by('-start_ts')
latest[archive_method] = results.first().output if results.exists() else None
return latest
# =========================================================================
# Serialization Methods
# =========================================================================
def to_dict(self, extended: bool = False) -> Dict[str, Any]:
"""Convert Snapshot to a dictionary (replacement for Link._asdict())"""
from archivebox.misc.util import ts_to_date_str
result = {
'TYPE': 'core.models.Snapshot',
'id': str(self.id),
'url': self.url,
'timestamp': self.timestamp,
'title': self.title,
'tags': self.tags_str(),
'downloaded_at': self.downloaded_at.isoformat() if self.downloaded_at else None,
'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None,
'created_at': self.created_at.isoformat() if self.created_at else None,
# Computed properties
'domain': self.domain,
'scheme': self.scheme,
'base_url': self.base_url,
'path': self.path,
'basename': self.basename,
'extension': self.extension,
'is_static': self.is_static,
'is_archived': self.is_archived,
'archive_path': self.archive_path,
'output_dir': self.output_dir,
'link_dir': self.output_dir, # backwards compatibility alias
'archive_size': self.archive_size,
'bookmarked_date': self.bookmarked_date,
'downloaded_datestr': self.downloaded_datestr,
'num_outputs': self.num_outputs,
'num_failures': self.num_failures,
}
if extended:
result['canonical'] = self.canonical_outputs()
return result
def to_json(self, indent: int = 4) -> str:
"""Convert to JSON string"""
return to_json(self.to_dict(extended=True), indent=indent)
def to_csv(self, cols: Optional[List[str]] = None, separator: str = ',', ljust: int = 0) -> str:
"""Convert to CSV string"""
data = self.to_dict()
cols = cols or ['timestamp', 'is_archived', 'url']
return separator.join(to_json(data.get(col, ''), indent=None).ljust(ljust) for col in cols)
def write_json_details(self, out_dir: Optional[str] = None) -> None:
"""Write JSON index file for this snapshot to its output directory"""
out_dir = out_dir or self.output_dir
path = Path(out_dir) / CONSTANTS.JSON_INDEX_FILENAME
atomic_write(str(path), self.to_dict(extended=True))
def write_html_details(self, out_dir: Optional[str] = None) -> None:
"""Write HTML detail page for this snapshot to its output directory"""
from django.template.loader import render_to_string
from archivebox.config.common import SERVER_CONFIG
from archivebox.config.configset import get_config
from archivebox.misc.logging_util import printable_filesize
out_dir = out_dir or self.output_dir
config = get_config()
SAVE_ARCHIVE_DOT_ORG = config.get('SAVE_ARCHIVE_DOT_ORG', True)
TITLE_LOADING_MSG = 'Not yet archived...'
canonical = self.canonical_outputs()
context = {
**self.to_dict(extended=True),
**{f'{k}_path': v for k, v in canonical.items()},
'canonical': {f'{k}_path': v for k, v in canonical.items()},
'title': htmlencode(self.title or (self.base_url if self.is_archived else TITLE_LOADING_MSG)),
'url_str': htmlencode(urldecode(self.base_url)),
'archive_url': urlencode(f'warc/{self.timestamp}' or (self.domain if self.is_archived else '')) or 'about:blank',
'extension': self.extension or 'html',
'tags': self.tags_str() or 'untagged',
'size': printable_filesize(self.archive_size) if self.archive_size else 'pending',
'status': 'archived' if self.is_archived else 'not yet archived',
'status_color': 'success' if self.is_archived else 'danger',
'oldest_archive_date': ts_to_date_str(self.oldest_archive_date),
'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
}
rendered_html = render_to_string('snapshot.html', context)
atomic_write(str(Path(out_dir) / CONSTANTS.HTML_INDEX_FILENAME), rendered_html)
# =========================================================================
# Helper Methods
# =========================================================================
@staticmethod
def _ts_to_date_str(dt: Optional[datetime]) -> Optional[str]:
return dt.strftime('%Y-%m-%d %H:%M:%S') if dt else None
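# Sketch: regenerating a snapshot's on-disk index files directly from the model
# (these replace the old Link-schema index writers):
#   snapshot.write_json_details()   # writes CONSTANTS.JSON_INDEX_FILENAME into snapshot.output_dir
#   snapshot.write_html_details()   # renders the snapshot.html template next to it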
class ArchiveResultManager(models.Manager):
def indexable(self, sorted: bool = True):
@@ -225,7 +750,7 @@ class ArchiveResultManager(models.Manager):
return qs
class ArchiveResult(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
class StatusChoices(models.TextChoices):
QUEUED = 'queued', 'Queued'
STARTED = 'started', 'Started'
@@ -277,7 +802,7 @@ class ArchiveResult(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, M
@cached_property
def snapshot_dir(self):
return Path(self.snapshot.link_dir)
return Path(self.snapshot.output_dir)
@cached_property
def url(self):
@@ -292,7 +817,9 @@ class ArchiveResult(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, M
@property
def extractor_module(self) -> Any | None:
return abx.as_dict(abx.pm.hook.get_EXTRACTORS()).get(self.extractor, None)
# Hook scripts are now used instead of Python extractor modules
# The extractor name maps to hooks in archivebox/plugins/{extractor}/
return None
def output_exists(self) -> bool:
return os.path.exists(Path(self.snapshot_dir) / self.extractor)
@@ -315,3 +842,150 @@ class ArchiveResult(ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, M
def save_search_index(self):
pass
def run(self):
"""
Execute this ArchiveResult's extractor and update status.
Discovers and runs the hook script for self.extractor,
updates status/output fields, queues discovered URLs, and triggers indexing.
"""
from django.utils import timezone
from archivebox.hooks import discover_hooks, run_hook
extractor_dir = Path(self.snapshot.output_dir) / self.extractor
config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]
# Discover hook for this extractor
hooks = discover_hooks(f'Snapshot__{self.extractor}')
if not hooks:
self.status = self.StatusChoices.FAILED
self.output = f'No hook found for: {self.extractor}'
self.retry_at = None
self.save()
return
# Run the hook
start_ts = timezone.now()
result = run_hook(
hooks[0],
output_dir=extractor_dir,
config_objects=config_objects,
url=self.snapshot.url,
)
end_ts = timezone.now()
# Determine status from return code and JSON output
output_json = result.get('output_json') or {}
json_status = output_json.get('status')
if json_status == 'skipped':
status = 'skipped'
elif json_status == 'failed':
status = 'failed'
elif result['returncode'] == 0:
status = 'succeeded'
else:
status = 'failed'
# Update self from result
status_map = {
'succeeded': self.StatusChoices.SUCCEEDED,
'failed': self.StatusChoices.FAILED,
'skipped': self.StatusChoices.SKIPPED,
}
self.status = status_map.get(status, self.StatusChoices.FAILED)
self.output = output_json.get('output') or result['stdout'][:1024] or result['stderr'][:1024] or None
self.start_ts = start_ts
self.end_ts = end_ts
self.retry_at = None
self.save()
# Queue any discovered URLs for crawling (parser extractors write urls.jsonl)
self._queue_urls_for_crawl(extractor_dir)
# Trigger search indexing if succeeded
if self.status == self.StatusChoices.SUCCEEDED:
self.trigger_search_indexing()
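# Assumed shape of the run_hook() result consumed above, inferred only from the keys read here:
#   {'returncode': 0, 'stdout': '...', 'stderr': '',
#    'output_json': {'status': 'succeeded', 'output': 'singlefile.html'}}
# A hook that reports {"status": "skipped"} is recorded as SKIPPED even if its returncode is 0.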
def _queue_urls_for_crawl(self, extractor_dir: Path):
"""
Read urls.jsonl and queue discovered URLs for crawling.
Parser extractors output urls.jsonl with discovered URLs and Tags.
- Tag records: {"type": "Tag", "name": "..."}
- Snapshot records: {"type": "Snapshot", "url": "...", ...}
Tags are created in the database.
URLs get added to the parent Crawl's queue with metadata
(depth, via_snapshot, via_extractor) for recursive crawling.
Used at all depths:
- depth=0: Initial source file (e.g., bookmarks.html) parsed for URLs
- depth>0: Crawled pages parsed for outbound links
"""
import json
if not self.snapshot.crawl:
return
urls_file = extractor_dir / 'urls.jsonl'
if not urls_file.exists():
return
urls_added = 0
tags_created = 0
with open(urls_file, 'r') as f:
for line in f:
line = line.strip()
if not line:
continue
try:
entry = json.loads(line)
record_type = entry.get('type', 'Snapshot')
# Handle Tag records
if record_type == 'Tag':
tag_name = entry.get('name')
if tag_name:
Tag.objects.get_or_create(name=tag_name)
tags_created += 1
continue
# Handle Snapshot records (or records without type)
if not entry.get('url'):
continue
# Add crawl metadata
entry['depth'] = self.snapshot.depth + 1
entry['via_snapshot'] = str(self.snapshot.id)
entry['via_extractor'] = self.extractor
if self.snapshot.crawl.add_url(entry):
urls_added += 1
except json.JSONDecodeError:
continue
if urls_added > 0:
self.snapshot.crawl.create_snapshots_from_urls()
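# Example urls.jsonl content a parser hook might emit (illustrative records matching the two
# record types handled above; Snapshot records are enriched with depth/via_snapshot/via_extractor):
#   {"type": "Tag", "name": "news"}
#   {"type": "Snapshot", "url": "https://example.com/article", "title": "Example Article"}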
def trigger_search_indexing(self):
"""Run any ArchiveResult__index hooks to update search indexes."""
from archivebox.hooks import discover_hooks, run_hook
# Pass config objects in priority order (later overrides earlier)
config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]
for hook in discover_hooks('ArchiveResult__index'):
run_hook(
hook,
output_dir=self.output_dir,
config_objects=config_objects,
snapshot_id=str(self.snapshot.id),
extractor=self.extractor,
)
@property
def output_dir(self) -> Path:
"""Get the output directory for this extractor's results."""
return Path(self.snapshot.output_dir) / self.extractor
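# Debugging sketch (assumed shell usage): run a single queued extractor by hand and inspect it:
#   ar = ArchiveResult.objects.filter(status='queued').first()
#   ar.run()                                   # executes the on_Snapshot__<extractor> hook
#   print(ar.status, ar.output, ar.output_dir)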

View File

@@ -1,4 +1,4 @@
__package__ = 'archivebox.core'
__package__ = "archivebox.core"
import os
import sys
@@ -8,17 +8,16 @@ from pathlib import Path
from django.utils.crypto import get_random_string
import abx
import archivebox
from archivebox.config import DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS # noqa
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG # noqa
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG # noqa
IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3]
IS_TESTING = 'test' in sys.argv[:3] or 'PYTEST_CURRENT_TEST' in os.environ
IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3]
IS_GETTING_VERSION_OR_HELP = 'version' in sys.argv or 'help' in sys.argv or '--version' in sys.argv or '--help' in sys.argv
IS_MIGRATING = "makemigrations" in sys.argv[:3] or "migrate" in sys.argv[:3]
IS_TESTING = "test" in sys.argv[:3] or "PYTEST_CURRENT_TEST" in os.environ
IS_SHELL = "shell" in sys.argv[:3] or "shell_plus" in sys.argv[:3]
IS_GETTING_VERSION_OR_HELP = "version" in sys.argv or "help" in sys.argv or "--version" in sys.argv or "--help" in sys.argv
################################################################################
### ArchiveBox Plugin Settings
@@ -31,71 +30,61 @@ LOADED_PLUGINS = archivebox.LOADED_PLUGINS
### Django Core Settings
################################################################################
WSGI_APPLICATION = 'core.wsgi.application'
WSGI_APPLICATION = "core.wsgi.application"
ASGI_APPLICATION = "core.asgi.application"
ROOT_URLCONF = 'core.urls'
ROOT_URLCONF = "core.urls"
LOGIN_URL = '/accounts/login/'
LOGOUT_REDIRECT_URL = os.environ.get('LOGOUT_REDIRECT_URL', '/')
LOGIN_URL = "/accounts/login/"
LOGOUT_REDIRECT_URL = os.environ.get("LOGOUT_REDIRECT_URL", "/")
PASSWORD_RESET_URL = '/accounts/password_reset/'
PASSWORD_RESET_URL = "/accounts/password_reset/"
APPEND_SLASH = True
DEBUG = SHELL_CONFIG.DEBUG or ('--debug' in sys.argv)
DEBUG = SHELL_CONFIG.DEBUG or ("--debug" in sys.argv)
INSTALLED_APPS = [
'daphne',
"daphne",
# Django default apps
'django.contrib.auth',
'django.contrib.contenttypes',
'django.contrib.sessions',
'django.contrib.messages',
'django.contrib.staticfiles',
'django.contrib.admin',
"django.contrib.auth",
"django.contrib.contenttypes",
"django.contrib.sessions",
"django.contrib.messages",
"django.contrib.staticfiles",
"django.contrib.admin",
# 3rd-party apps from PyPI
'signal_webhooks', # handles REST API outbound webhooks https://github.com/MrThearMan/django-signal-webhooks
'django_object_actions', # provides easy Django Admin action buttons on change views https://github.com/crccheck/django-object-actions
"signal_webhooks", # handles REST API outbound webhooks https://github.com/MrThearMan/django-signal-webhooks
"django_object_actions", # provides easy Django Admin action buttons on change views https://github.com/crccheck/django-object-actions
# Our ArchiveBox-provided apps
'config', # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
'machine', # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc.
'workers', # handles starting and managing background workers and processes (orchestrators and actors)
'crawls', # handles Seed, Crawl, and CrawlSchedule models and management
'personas', # handles Persona and session management
'core', # core django model with Snapshot, ArchiveResult, etc.
'api', # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
# ArchiveBox plugins
*abx.as_list(abx.pm.hook.get_INSTALLED_APPS()), # all plugin django-apps found in archivebox/plugins_* and data/user_plugins,
"config", # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
"machine", # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc.
"workers", # handles starting and managing background workers and processes (orchestrators and actors)
"crawls", # handles Seed, Crawl, and CrawlSchedule models and management
"personas", # handles Persona and session management
"core", # core django model with Snapshot, ArchiveResult, etc.
"api", # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
# ArchiveBox plugins (hook-based plugins no longer add Django apps)
# Use hooks.py discover_hooks() for plugin functionality
# 3rd-party apps from PyPI that need to be loaded last
'admin_data_views', # handles rendering some convenient automatic read-only views of data in Django admin
'django_extensions', # provides Django Debug Toolbar (and other non-debug helpers)
'django_huey', # provides multi-queue support for django huey https://github.com/gaiacoop/django-huey
'bx_django_utils', # needed for huey_monitor https://github.com/boxine/bx_django_utils
'huey_monitor', # adds an admin UI for monitoring background huey tasks https://github.com/boxine/django-huey-monitor
# load plugins last so all other apps are already .ready() when we call plugins.ready()
'abx',
"admin_data_views", # handles rendering some convenient automatic read-only views of data in Django admin
"django_extensions", # provides Django Debug Toolbar (and other non-debug helpers)
"django_huey", # provides multi-queue support for django huey https://github.com/gaiacoop/django-huey
"bx_django_utils", # needed for huey_monitor https://github.com/boxine/bx_django_utils
"huey_monitor", # adds an admin UI for monitoring background huey tasks https://github.com/boxine/django-huey-monitor
]
MIDDLEWARE = [
'core.middleware.TimezoneMiddleware',
'django.middleware.security.SecurityMiddleware',
'django.contrib.sessions.middleware.SessionMiddleware',
'django.middleware.common.CommonMiddleware',
'django.middleware.csrf.CsrfViewMiddleware',
'django.contrib.auth.middleware.AuthenticationMiddleware',
'core.middleware.ReverseProxyAuthMiddleware',
'django.contrib.messages.middleware.MessageMiddleware',
'core.middleware.CacheControlMiddleware',
*abx.as_list(abx.pm.hook.get_MIDDLEWARES()),
"core.middleware.TimezoneMiddleware",
"django.middleware.security.SecurityMiddleware",
"django.contrib.sessions.middleware.SessionMiddleware",
"django.middleware.common.CommonMiddleware",
"django.middleware.csrf.CsrfViewMiddleware",
"django.contrib.auth.middleware.AuthenticationMiddleware",
"core.middleware.ReverseProxyAuthMiddleware",
"django.contrib.messages.middleware.MessageMiddleware",
"core.middleware.CacheControlMiddleware",
# Additional middlewares from plugins (if any)
]
@@ -106,9 +95,9 @@ MIDDLEWARE = [
# AUTH_USER_MODEL = 'auth.User' # cannot be easily changed unfortunately
AUTHENTICATION_BACKENDS = [
'django.contrib.auth.backends.RemoteUserBackend',
'django.contrib.auth.backends.ModelBackend',
*abx.as_list(abx.pm.hook.get_AUTHENTICATION_BACKENDS()),
"django.contrib.auth.backends.RemoteUserBackend",
"django.contrib.auth.backends.ModelBackend",
# Additional auth backends (e.g., LDAP) configured via settings
]
@@ -120,25 +109,25 @@ AUTHENTICATION_BACKENDS = [
# AUTH_LDAP_BIND_PASSWORD = LDAP_CONFIG.LDAP_BIND_PASSWORD
# AUTH_LDAP_USER_ATTR_MAP = LDAP_CONFIG.LDAP_USER_ATTR_MAP
# AUTH_LDAP_USER_SEARCH = LDAP_CONFIG.AUTH_LDAP_USER_SEARCH
# AUTHENTICATION_BACKENDS = LDAP_CONFIG.AUTHENTICATION_BACKENDS
################################################################################
### Staticfile and Template Settings
################################################################################
STATIC_URL = '/static/'
TEMPLATES_DIR_NAME = 'templates'
STATIC_URL = "/static/"
TEMPLATES_DIR_NAME = "templates"
CUSTOM_TEMPLATES_ENABLED = os.path.isdir(CONSTANTS.CUSTOM_TEMPLATES_DIR) and os.access(CONSTANTS.CUSTOM_TEMPLATES_DIR, os.R_OK)
STATICFILES_DIRS = [
*([str(CONSTANTS.CUSTOM_TEMPLATES_DIR / 'static')] if CUSTOM_TEMPLATES_ENABLED else []),
*([str(CONSTANTS.CUSTOM_TEMPLATES_DIR / "static")] if CUSTOM_TEMPLATES_ENABLED else []),
# *[
# str(plugin_dir / 'static')
# for plugin_dir in PLUGIN_DIRS.values()
# if (plugin_dir / 'static').is_dir()
# ],
*abx.as_list(abx.pm.hook.get_STATICFILES_DIRS()),
str(PACKAGE_DIR / TEMPLATES_DIR_NAME / 'static'),
# Additional static file dirs from plugins
str(PACKAGE_DIR / TEMPLATES_DIR_NAME / "static"),
]
TEMPLATE_DIRS = [
@@ -148,23 +137,23 @@ TEMPLATE_DIRS = [
# for plugin_dir in PLUGIN_DIRS.values()
# if (plugin_dir / 'templates').is_dir()
# ],
*abx.as_list(abx.pm.hook.get_TEMPLATE_DIRS()),
str(PACKAGE_DIR / TEMPLATES_DIR_NAME / 'core'),
str(PACKAGE_DIR / TEMPLATES_DIR_NAME / 'admin'),
# Additional template dirs from plugins
str(PACKAGE_DIR / TEMPLATES_DIR_NAME / "core"),
str(PACKAGE_DIR / TEMPLATES_DIR_NAME / "admin"),
str(PACKAGE_DIR / TEMPLATES_DIR_NAME),
]
TEMPLATES = [
{
'BACKEND': 'django.template.backends.django.DjangoTemplates',
'DIRS': TEMPLATE_DIRS,
'APP_DIRS': True,
'OPTIONS': {
'context_processors': [
'django.template.context_processors.debug',
'django.template.context_processors.request',
'django.contrib.auth.context_processors.auth',
'django.contrib.messages.context_processors.messages',
"BACKEND": "django.template.backends.django.DjangoTemplates",
"DIRS": TEMPLATE_DIRS,
"APP_DIRS": True,
"OPTIONS": {
"context_processors": [
"django.template.context_processors.debug",
"django.template.context_processors.request",
"django.contrib.auth.context_processors.auth",
"django.contrib.messages.context_processors.messages",
],
},
},
@@ -221,10 +210,10 @@ DATABASES = {
# **SQLITE_CONNECTION_OPTIONS,
# },
}
MIGRATION_MODULES = {'signal_webhooks': None}
MIGRATION_MODULES = {"signal_webhooks": None}
# as much as I'd love this to be a UUID or ULID field, it's not supported yet as of Django 5.0
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"
HUEY = {
"huey_class": "huey.SqliteHuey",
@@ -254,7 +243,7 @@ DJANGO_HUEY = {
"queues": {
HUEY["name"]: HUEY.copy(),
# more registered here at plugin import-time by BaseQueue.register()
**abx.as_dict(abx.pm.hook.get_DJANGO_HUEY_QUEUES(QUEUE_DATABASE_NAME=CONSTANTS.QUEUE_DATABASE_FILENAME)),
# Additional huey queues configured via settings
},
}
@@ -274,12 +263,12 @@ class HueyDBRouter:
def db_for_read(self, model, **hints):
if model._meta.app_label in self.route_app_labels:
return self.db_name
return 'default'
return "default"
def db_for_write(self, model, **hints):
if model._meta.app_label in self.route_app_labels:
return self.db_name
return 'default'
return "default"
def allow_relation(self, obj1, obj2, **hints):
if obj1._meta.app_label in self.route_app_labels or obj2._meta.app_label in self.route_app_labels:
@@ -291,6 +280,7 @@ class HueyDBRouter:
return db == self.db_name
return db == "default"
# class FilestoreDBRouter:
# """
# A router to store all the File models in the filestore.sqlite3 database.
@@ -321,16 +311,16 @@ class HueyDBRouter:
# return db == self.db_name
# return db == "default"
DATABASE_ROUTERS = ['core.settings.HueyDBRouter']
DATABASE_ROUTERS = ["core.settings.HueyDBRouter"]
CACHES = {
'default': {'BACKEND': 'django.core.cache.backends.locmem.LocMemCache'},
"default": {"BACKEND": "django.core.cache.backends.locmem.LocMemCache"},
# 'sqlite': {'BACKEND': 'django.core.cache.backends.db.DatabaseCache', 'LOCATION': 'cache'},
# 'dummy': {'BACKEND': 'django.core.cache.backends.dummy.DummyCache'},
# 'filebased': {"BACKEND": "django.core.cache.backends.filebased.FileBasedCache", "LOCATION": CACHE_DIR / 'cache_filebased'},
}
EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend'
EMAIL_BACKEND = "django.core.mail.backends.console.EmailBackend"
STORAGES = {
@@ -363,32 +353,28 @@ STORAGES = {
# },
}
CHANNEL_LAYERS = {
"default": {
"BACKEND": "channels.layers.InMemoryChannelLayer"
}
}
CHANNEL_LAYERS = {"default": {"BACKEND": "channels.layers.InMemoryChannelLayer"}}
################################################################################
### Security Settings
################################################################################
SECRET_KEY = SERVER_CONFIG.SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_')
SECRET_KEY = SERVER_CONFIG.SECRET_KEY or get_random_string(50, "abcdefghijklmnopqrstuvwxyz0123456789_")
ALLOWED_HOSTS = SERVER_CONFIG.ALLOWED_HOSTS.split(',')
CSRF_TRUSTED_ORIGINS = list(set(SERVER_CONFIG.CSRF_TRUSTED_ORIGINS.split(',')))
ALLOWED_HOSTS = SERVER_CONFIG.ALLOWED_HOSTS.split(",")
CSRF_TRUSTED_ORIGINS = list(set(SERVER_CONFIG.CSRF_TRUSTED_ORIGINS.split(",")))
# automatically fix the case where the user sets ALLOWED_HOSTS (e.g. to archivebox.example.com)
# but forgets to add https://archivebox.example.com to CSRF_TRUSTED_ORIGINS
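# e.g. ALLOWED_HOSTS="archivebox.example.com" gets https://archivebox.example.com
# appended to CSRF_TRUSTED_ORIGINS automatically if it isn't already listed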
for hostname in ALLOWED_HOSTS:
https_endpoint = f'https://{hostname}'
if hostname != '*' and https_endpoint not in CSRF_TRUSTED_ORIGINS:
print(f'[!] WARNING: {https_endpoint} from ALLOWED_HOSTS should be added to CSRF_TRUSTED_ORIGINS')
https_endpoint = f"https://{hostname}"
if hostname != "*" and https_endpoint not in CSRF_TRUSTED_ORIGINS:
print(f"[!] WARNING: {https_endpoint} from ALLOWED_HOSTS should be added to CSRF_TRUSTED_ORIGINS")
CSRF_TRUSTED_ORIGINS.append(https_endpoint)
SECURE_BROWSER_XSS_FILTER = True
SECURE_CONTENT_TYPE_NOSNIFF = True
SECURE_REFERRER_POLICY = 'strict-origin-when-cross-origin'
SECURE_REFERRER_POLICY = "strict-origin-when-cross-origin"
CSRF_COOKIE_SECURE = False
SESSION_COOKIE_SECURE = False
@@ -401,10 +387,10 @@ SESSION_SAVE_EVERY_REQUEST = False
SESSION_ENGINE = "django.contrib.sessions.backends.db"
AUTH_PASSWORD_VALIDATORS = [
{'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator'},
{'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator'},
{'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator'},
{'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator'},
{"NAME": "django.contrib.auth.password_validation.UserAttributeSimilarityValidator"},
{"NAME": "django.contrib.auth.password_validation.MinimumLengthValidator"},
{"NAME": "django.contrib.auth.password_validation.CommonPasswordValidator"},
{"NAME": "django.contrib.auth.password_validation.NumericPasswordValidator"},
]
DATA_UPLOAD_MAX_NUMBER_FIELDS = None
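# None disables Django's default limit of 1000 form fields per request
# (presumably so large bulk submissions, e.g. admin actions over many snapshots, aren't rejected)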
@@ -414,29 +400,29 @@ DATA_UPLOAD_MAX_MEMORY_SIZE = 26_214_400 # 25MB
### Shell Settings
################################################################################
SHELL_PLUS = 'ipython'
SHELL_PLUS = "ipython"
SHELL_PLUS_PRINT_SQL = False
IPYTHON_ARGUMENTS = ['--no-confirm-exit', '--no-banner']
IPYTHON_KERNEL_DISPLAY_NAME = 'ArchiveBox Django Shell'
IPYTHON_ARGUMENTS = ["--no-confirm-exit", "--no-banner"]
IPYTHON_KERNEL_DISPLAY_NAME = "ArchiveBox Django Shell"
if IS_SHELL:
os.environ['PYTHONSTARTUP'] = str(PACKAGE_DIR / 'misc' / 'shell_welcome_message.py')
os.environ["PYTHONSTARTUP"] = str(PACKAGE_DIR / "misc" / "shell_welcome_message.py")
################################################################################
### Internationalization & Localization Settings
################################################################################
LANGUAGE_CODE = 'en-us'
LANGUAGE_CODE = "en-us"
USE_I18N = True
USE_TZ = True
DATETIME_FORMAT = 'Y-m-d h:i:s A'
SHORT_DATETIME_FORMAT = 'Y-m-d h:i:s A'
TIME_ZONE = CONSTANTS.TIMEZONE # django convention is TIME_ZONE, archivebox config uses TIMEZONE, they are equivalent
DATETIME_FORMAT = "Y-m-d h:i:s A"
SHORT_DATETIME_FORMAT = "Y-m-d h:i:s A"
TIME_ZONE = CONSTANTS.TIMEZONE # django convention is TIME_ZONE, archivebox config uses TIMEZONE, they are equivalent
from django.conf.locale.en import formats as en_formats # type: ignore
from django.conf.locale.en import formats as en_formats # type: ignore
en_formats.DATETIME_FORMAT = DATETIME_FORMAT # monkey patch en_format default with our preferred format
en_formats.DATETIME_FORMAT = DATETIME_FORMAT # monkey patch en_format default with our preferred format
en_formats.SHORT_DATETIME_FORMAT = SHORT_DATETIME_FORMAT
@@ -455,7 +441,7 @@ LOGGING = SETTINGS_LOGGING
################################################################################
# Add default webhook configuration to the User model
SIGNAL_WEBHOOKS_CUSTOM_MODEL = 'api.models.OutboundWebhook'
SIGNAL_WEBHOOKS_CUSTOM_MODEL = "api.models.OutboundWebhook"
SIGNAL_WEBHOOKS = {
"HOOKS": {
# ... is a special sigil value that means "use the default autogenerated hooks"
@@ -524,7 +510,7 @@ ADMIN_DATA_VIEWS = {
"name": "log",
},
},
*abx.as_list(abx.pm.hook.get_ADMIN_DATA_VIEWS_URLS()),
# Additional admin data views from plugins
],
}
@@ -535,44 +521,45 @@ ADMIN_DATA_VIEWS = {
# only enable debug toolbar when in DEBUG mode with --nothreading (it doesn't work in multithreaded mode)
DEBUG_TOOLBAR = False
DEBUG_TOOLBAR = DEBUG_TOOLBAR and DEBUG and ('--nothreading' in sys.argv) and ('--reload' not in sys.argv)
DEBUG_TOOLBAR = DEBUG_TOOLBAR and DEBUG and ("--nothreading" in sys.argv) and ("--reload" not in sys.argv)
if DEBUG_TOOLBAR:
try:
import debug_toolbar # noqa
import debug_toolbar # noqa
DEBUG_TOOLBAR = True
except ImportError:
DEBUG_TOOLBAR = False
if DEBUG_TOOLBAR:
INSTALLED_APPS = [*INSTALLED_APPS, 'debug_toolbar']
INTERNAL_IPS = ['0.0.0.0', '127.0.0.1', '*']
INSTALLED_APPS = [*INSTALLED_APPS, "debug_toolbar"]
INTERNAL_IPS = ["0.0.0.0", "127.0.0.1", "*"]
DEBUG_TOOLBAR_CONFIG = {
"SHOW_TOOLBAR_CALLBACK": lambda request: True,
"RENDER_PANELS": True,
}
DEBUG_TOOLBAR_PANELS = [
'debug_toolbar.panels.history.HistoryPanel',
'debug_toolbar.panels.versions.VersionsPanel',
'debug_toolbar.panels.timer.TimerPanel',
'debug_toolbar.panels.settings.SettingsPanel',
'debug_toolbar.panels.headers.HeadersPanel',
'debug_toolbar.panels.request.RequestPanel',
'debug_toolbar.panels.sql.SQLPanel',
'debug_toolbar.panels.staticfiles.StaticFilesPanel',
"debug_toolbar.panels.history.HistoryPanel",
"debug_toolbar.panels.versions.VersionsPanel",
"debug_toolbar.panels.timer.TimerPanel",
"debug_toolbar.panels.settings.SettingsPanel",
"debug_toolbar.panels.headers.HeadersPanel",
"debug_toolbar.panels.request.RequestPanel",
"debug_toolbar.panels.sql.SQLPanel",
"debug_toolbar.panels.staticfiles.StaticFilesPanel",
# 'debug_toolbar.panels.templates.TemplatesPanel',
'debug_toolbar.panels.cache.CachePanel',
'debug_toolbar.panels.signals.SignalsPanel',
'debug_toolbar.panels.logging.LoggingPanel',
'debug_toolbar.panels.redirects.RedirectsPanel',
'debug_toolbar.panels.profiling.ProfilingPanel',
'djdt_flamegraph.FlamegraphPanel',
"debug_toolbar.panels.cache.CachePanel",
"debug_toolbar.panels.signals.SignalsPanel",
"debug_toolbar.panels.logging.LoggingPanel",
"debug_toolbar.panels.redirects.RedirectsPanel",
"debug_toolbar.panels.profiling.ProfilingPanel",
"djdt_flamegraph.FlamegraphPanel",
]
MIDDLEWARE = [*MIDDLEWARE, 'debug_toolbar.middleware.DebugToolbarMiddleware']
MIDDLEWARE = [*MIDDLEWARE, "debug_toolbar.middleware.DebugToolbarMiddleware"]
if DEBUG:
from django_autotyping.typing import AutotypingSettingsDict
INSTALLED_APPS += ['django_autotyping']
INSTALLED_APPS += ["django_autotyping"]
AUTOTYPING: AutotypingSettingsDict = {
"STUBS_GENERATION": {
"LOCAL_STUBS_DIR": PACKAGE_DIR / "typings",

View File

@@ -79,15 +79,16 @@ class SnapshotMachine(StateMachine, strict_states=True):
@started.enter
def enter_started(self):
print(f'{self}.on_started() ↳ snapshot.create_pending_archiveresults() + snapshot.bump_retry_at(+60s)')
print(f'{self}.on_started() ↳ snapshot.run()')
# lock the snapshot while we create the pending archiveresults
self.snapshot.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=30), # if failed, wait 30s before retrying
)
# create the pending archiveresults
self.snapshot.create_pending_archiveresults()
# unlock the snapshot after we're done creating the pending archiveresults + set status = started
# Run the snapshot - creates pending archiveresults for all enabled extractors
self.snapshot.run()
# unlock the snapshot after we're done + set status = started
self.snapshot.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=5), # wait 5s before checking it again
status=Snapshot.StatusChoices.STARTED,
@@ -135,19 +136,22 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
backoff = State(value=ArchiveResult.StatusChoices.BACKOFF)
succeeded = State(value=ArchiveResult.StatusChoices.SUCCEEDED, final=True)
failed = State(value=ArchiveResult.StatusChoices.FAILED, final=True)
skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True)
# Tick Event
# Tick Event - transitions based on conditions
tick = (
queued.to.itself(unless='can_start') |
queued.to(started, cond='can_start') |
started.to.itself(unless='is_finished') |
started.to(succeeded, cond='is_succeeded') |
started.to(failed, cond='is_failed') |
started.to(skipped, cond='is_skipped') |
started.to(backoff, cond='is_backoff') |
backoff.to.itself(unless='can_start') |
backoff.to(started, cond='can_start') |
backoff.to(succeeded, cond='is_succeeded') |
backoff.to(failed, cond='is_failed')
backoff.to(failed, cond='is_failed') |
backoff.to(skipped, cond='is_skipped')
)
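# Typical path: queued → started → succeeded / failed / skipped,
# with backoff acting as a retry loop back into started while the extractor hasn't completed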
def __init__(self, archiveresult, *args, **kwargs):
@@ -167,22 +171,32 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
return can_start
def is_succeeded(self) -> bool:
if self.archiveresult.output and 'err' not in self.archiveresult.output.lower():
return True
return False
"""Check if extraction succeeded (status was set by run_extractor())."""
return self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED
def is_failed(self) -> bool:
if self.archiveresult.output and 'err' in self.archiveresult.output.lower():
return True
return False
"""Check if extraction failed (status was set by run_extractor())."""
return self.archiveresult.status == ArchiveResult.StatusChoices.FAILED
def is_skipped(self) -> bool:
"""Check if extraction was skipped (status was set by run_extractor())."""
return self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED
def is_backoff(self) -> bool:
if self.archiveresult.output is None:
return True
return False
"""Check if we should backoff and retry later."""
# Backoff if status is still started (extractor didn't complete) and output is None
return (
self.archiveresult.status == ArchiveResult.StatusChoices.STARTED and
self.archiveresult.output is None
)
def is_finished(self) -> bool:
return self.is_failed() or self.is_succeeded()
"""Check if extraction has completed (success, failure, or skipped)."""
return self.archiveresult.status in (
ArchiveResult.StatusChoices.SUCCEEDED,
ArchiveResult.StatusChoices.FAILED,
ArchiveResult.StatusChoices.SKIPPED,
)
@queued.enter
def enter_queued(self):
@@ -195,27 +209,28 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
@started.enter
def enter_started(self):
print(f'{self}.on_started() ↳ archiveresult.start_ts + create_output_dir() + bump_retry_at(+60s)')
# lock the object for the next 30sec
self.archiveresult.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=30),
status=ArchiveResult.StatusChoices.QUEUED,
start_ts=timezone.now(),
) # lock the obj for the next ~30s to limit racing with other workers
# create the output directory and fork the new extractor job subprocess
self.archiveresult.create_output_dir()
# self.archiveresult.extract(background=True)
print(f'{self}.on_started() ↳ archiveresult.start_ts + run_extractor()')
# mark the object as started
# Lock the object and mark start time
self.archiveresult.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=30), # retry it again in 30s if it fails
retry_at=timezone.now() + timedelta(seconds=120), # 2 min timeout for extractor
status=ArchiveResult.StatusChoices.STARTED,
start_ts=timezone.now(),
)
# simulate slow running extractor that completes after 2 seconds
time.sleep(2)
self.archiveresult.update_for_workers(output='completed')
# Run the extractor - this updates status, output, timestamps, etc.
self.archiveresult.run()
# Save the updated result
self.archiveresult.save()
# Log the result
if self.archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
print(f'{self} ✅ extractor succeeded: {self.archiveresult.output[:50] if self.archiveresult.output else ""}...')
elif self.archiveresult.status == ArchiveResult.StatusChoices.FAILED:
print(f'{self} ❌ extractor failed: {self.archiveresult.output[:100] if self.archiveresult.output else ""}...')
elif self.archiveresult.status == ArchiveResult.StatusChoices.SKIPPED:
print(f'{self} ⏭️ extractor skipped: {self.archiveresult.output[:50] if self.archiveresult.output else ""}')
@backoff.enter
def enter_backoff(self):
@@ -246,7 +261,15 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
retry_at=None,
status=ArchiveResult.StatusChoices.FAILED,
end_ts=timezone.now(),
# **self.archiveresult.get_output_dict(), # {output, output_json, stderr, stdout, returncode, errors, cmd_version, pwd, cmd, machine}
)
@skipped.enter
def enter_skipped(self):
print(f'{self}.on_skipped() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
self.archiveresult.update_for_workers(
retry_at=None,
status=ArchiveResult.StatusChoices.SKIPPED,
end_ts=timezone.now(),
)
def after_transition(self, event: str, source: State, target: State):

View File

@@ -23,8 +23,9 @@ from admin_data_views.typing import TableContext, ItemContext
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
import archivebox
from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG
from archivebox.config import CONSTANTS, CONSTANTS_CONFIG, DATA_DIR, VERSION, SAVE_ARCHIVE_DOT_ORG
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG
from archivebox.config.configset import get_flat_config, get_config, get_all_configs
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
from archivebox.misc.serve_static import serve_static_with_byterange_support
from archivebox.misc.logging_util import printable_filesize
@@ -101,7 +102,7 @@ class SnapshotView(View):
# iterate through all the files in the snapshot dir and add the biggest ones to the result list
snap_dir = Path(snapshot.link_dir)
snap_dir = Path(snapshot.output_dir)
if not os.path.isdir(snap_dir) and os.access(snap_dir, os.R_OK):
return {}
@@ -131,9 +132,7 @@ class SnapshotView(View):
best_result = archiveresults[result_type]
break
link = snapshot.as_link()
link_info = link._asdict(extended=True)
snapshot_info = snapshot.to_dict(extended=True)
try:
warc_path = 'warc/' + list(Path(snap_dir).glob('warc/*.warc.*'))[0].name
@@ -141,24 +140,23 @@ class SnapshotView(View):
warc_path = 'warc/'
context = {
**link_info,
**link_info['canonical'],
**snapshot_info,
**snapshot_info.get('canonical', {}),
'title': htmlencode(
link.title
or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
snapshot.title
or (snapshot.base_url if snapshot.is_archived else TITLE_LOADING_MSG)
),
'extension': link.extension or 'html',
'tags': link.tags or 'untagged',
'size': printable_filesize(link.archive_size) if link.archive_size else 'pending',
'status': 'archived' if link.is_archived else 'not yet archived',
'status_color': 'success' if link.is_archived else 'danger',
'oldest_archive_date': ts_to_date_str(link.oldest_archive_date),
'extension': snapshot.extension or 'html',
'tags': snapshot.tags_str() or 'untagged',
'size': printable_filesize(snapshot.archive_size) if snapshot.archive_size else 'pending',
'status': 'archived' if snapshot.is_archived else 'not yet archived',
'status_color': 'success' if snapshot.is_archived else 'danger',
'oldest_archive_date': ts_to_date_str(snapshot.oldest_archive_date),
'warc_path': warc_path,
'SAVE_ARCHIVE_DOT_ORG': archivebox.pm.hook.get_FLAT_CONFIG().SAVE_ARCHIVE_DOT_ORG,
'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']),
'best_result': best_result,
# 'tags_str': 'somealskejrewlkrjwer,werlmwrwlekrjewlkrjwer324m532l,4m32,23m324234',
}
return render(template_name='core/snapshot_live.html', request=request, context=context)
@@ -190,7 +188,7 @@ class SnapshotView(View):
response = self.render_live_index(request, snapshot)
else:
response = serve_static_with_byterange_support(
request, archivefile, document_root=snapshot.link_dir, show_indexes=True,
request, archivefile, document_root=snapshot.output_dir, show_indexes=True,
)
response["Link"] = f'<{snapshot.url}>; rel="canonical"'
return response
@@ -516,7 +514,7 @@ class HealthCheckView(View):
def find_config_section(key: str) -> str:
CONFIGS = archivebox.pm.hook.get_CONFIGS()
CONFIGS = get_all_configs()
if key in CONSTANTS_CONFIG:
return 'CONSTANT'
@@ -527,7 +525,7 @@ def find_config_section(key: str) -> str:
return section
def find_config_default(key: str) -> str:
CONFIGS = archivebox.pm.hook.get_CONFIGS()
CONFIGS = get_all_configs()
if key in CONSTANTS_CONFIG:
return str(CONSTANTS_CONFIG[key])
@@ -550,7 +548,7 @@ def find_config_default(key: str) -> str:
return default_val
def find_config_type(key: str) -> str:
CONFIGS = archivebox.pm.hook.get_CONFIGS()
CONFIGS = get_all_configs()
for config in CONFIGS.values():
if hasattr(config, key):
@@ -569,7 +567,7 @@ def key_is_safe(key: str) -> bool:
@render_with_table_view
def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
CONFIGS = archivebox.pm.hook.get_CONFIGS()
CONFIGS = get_all_configs()
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
@@ -611,8 +609,8 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
@render_with_item_view
def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
CONFIGS = archivebox.pm.hook.get_CONFIGS()
FLAT_CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
CONFIGS = get_all_configs()
FLAT_CONFIG = get_flat_config()
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'